From 1a2ed98a4dfe77ca7a5a47d8bbf1285b960e6050 Mon Sep 17 00:00:00 2001
From: kithumma <kiran.thumma@amd.com>
Date: Sat, 24 Jan 2026 06:14:17 +0000
Subject: [PATCH 1/2] enroot-tests: run on push, add RCCL test, testbeds from secrets

---
 .github/workflows/enroot-tests.yml | 142 +++++++++++++++++++++++++----
 tests/enroot/run_test.py           |   7 ++
 2 files changed, 132 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/enroot-tests.yml b/.github/workflows/enroot-tests.yml
index 4ee2c96..33c4489 100644
--- a/.github/workflows/enroot-tests.yml
+++ b/.github/workflows/enroot-tests.yml
@@ -1,15 +1,26 @@
 name: Enroot Tests
 
 on:
+  push:
+    branches:
+      - main
   workflow_dispatch:
     inputs:
-      test_name:
-        description: 'Select test to run'
-        required: true
-        type: choice
-        options:
-          - test_single_node_pytorch
-          - test_multi_node_distributed_pytorch
+      run_single_node_pytorch:
+        description: 'Run single-node PyTorch test'
+        required: false
+        type: boolean
+        default: true
+      run_multi_node_pytorch:
+        description: 'Run multi-node distributed PyTorch test'
+        required: false
+        type: boolean
+        default: true
+      run_multi_node_rccl:
+        description: 'Run multi-node RCCL test'
+        required: false
+        type: boolean
+        default: true
       no_install:
         description: 'Skip installation (--no-install)'
         required: false
@@ -20,47 +31,144 @@ on:
         required: false
         type: boolean
         default: false
-      docker_image:
-        description: 'Docker image to use (default: rocm/pytorch:latest for single-node, docker://rocm/pytorch:rocm7.0.2_ubuntu22.04_py3.10_pytorch_release_2.7.1 for multi-node)'
+      docker_image_single_node:
+        description: 'Docker image for single-node test (default: rocm/pytorch:latest)'
         required: false
         type: string
         default: ''
-      testbed_file:
-        description: 'Path to testbed file (e.g. tests/enroot/testbeds/mi325.yaml)'
+      docker_image_multi_node:
+        description: 'Docker image for multi-node PyTorch test (default: docker://rocm/pytorch:rocm6.2.4_ubuntu22.04_py3.10_pytorch_release_2.3.0)'
         required: false
         type: string
-        default: 'testbed/enroot_tb.yml'
+        default: ''
+      docker_image_rccl:
+        description: 'Docker image for RCCL test (default: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56)'
+        required: false
+        type: string
+        default: ''
 
 
 jobs:
   run-enroot-tests:
     runs-on: enroot-runners
-    timeout-minutes: 120
+    timeout-minutes: 360
+    strategy:
+      matrix:
+        test_name:
+          - test_single_node_pytorch
+          - test_multi_node_distributed_pytorch
+          - test_multi_node_rccl
+      max-parallel: 1
+      fail-fast: false
     
     steps:
+      - name: Check if test should run
+        id: check
+        run: |
+          if [ "${{ github.event_name }}" = "push" ]; then
+            echo "should_run=true" >> $GITHUB_OUTPUT
+          elif [ "${{ matrix.test_name }}" = "test_single_node_pytorch" ] && [ "${{ inputs.run_single_node_pytorch }}" = "true" ]; then
+            echo "should_run=true" >> $GITHUB_OUTPUT
+          elif [ "${{ matrix.test_name }}" = "test_multi_node_distributed_pytorch" ] && [ "${{ inputs.run_multi_node_pytorch }}" = "true" ]; then
+            echo "should_run=true" >> $GITHUB_OUTPUT
+          elif [ "${{ matrix.test_name }}" = "test_multi_node_rccl" ] && [ "${{ inputs.run_multi_node_rccl }}" = "true" ]; then
+            echo "should_run=true" >> $GITHUB_OUTPUT
+          else
+            echo "should_run=false" >> $GITHUB_OUTPUT
+          fi
+
       - name: Checkout repository
+        if: steps.check.outputs.should_run == 'true'
         uses: actions/checkout@v4
       
       - name: Set up Python
+        if: steps.check.outputs.should_run == 'true'
         uses: actions/setup-python@v5
         with:
           python-version: '3.8'
       
       - name: Install dependencies
+        if: steps.check.outputs.should_run == 'true'
         run: |
           python3 -m pip install --upgrade pip
           pip install -r tests/enroot/requirements.txt
       
-      - name: Run enroot tests
+      - name: Create testbed files from secrets
+        if: steps.check.outputs.should_run == 'true'
+        working-directory: tests/enroot
+        env:
+          SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }}
+          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
+        run: |
+          mkdir -p testbed
+          
+          # Write single-node testbed
+          if [ -n "$SINGLE_NODE_TESTBED" ]; then
+            printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml
+            echo "Created testbed/single_node_tb.yml from secret"
+          else
+            echo "[WARNING] SINGLE_NODE_TESTBED_FILE secret is not set"
+          fi
+          
+          # Write multi-node testbed
+          if [ -n "$MULTI_NODE_TESTBED" ]; then
+            printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
+            echo "Created testbed/multi_node_tb.yml from secret"
+          else
+            echo "[WARNING] MULTI_NODE_TESTBED_FILE secret is not set"
+          fi
+          
+          echo "Testbed files:"
+          ls -la testbed/
+      
+      - name: Run ${{ matrix.test_name }}
+        if: steps.check.outputs.should_run == 'true'
         working-directory: tests/enroot
         run: |
-          python3 run_test.py "${{ inputs.test_name }}" "${{ inputs.docker_image }}" "${{ inputs.no_install }}" "${{ inputs.no_uninstall }}" "${{ inputs.testbed_file }}"
+          # Determine testbed file and docker image based on test type
+          if [ "${{ matrix.test_name }}" = "test_single_node_pytorch" ]; then
+            TESTBED_FILE="testbed/single_node_tb.yml"
+            DOCKER_IMAGE="${{ inputs.docker_image_single_node }}"
+          else
+            # Multi-node tests use multi_node testbed
+            TESTBED_FILE="testbed/multi_node_tb.yml"
+            if [ "${{ matrix.test_name }}" = "test_multi_node_distributed_pytorch" ]; then
+              DOCKER_IMAGE="${{ inputs.docker_image_multi_node }}"
+            else
+              DOCKER_IMAGE="${{ inputs.docker_image_rccl }}"
+            fi
+          fi
+          
+          # Set flags based on event type
+          if [ "${{ github.event_name }}" = "push" ]; then
+            NO_INSTALL="false"
+            NO_UNINSTALL="false"
+            DOCKER_IMAGE=""
+          else
+            NO_INSTALL="${{ inputs.no_install }}"
+            NO_UNINSTALL="${{ inputs.no_uninstall }}"
+          fi
+          
+          # Validate testbed file exists
+          if [ ! -f "$TESTBED_FILE" ]; then
+            echo "[ERROR] Testbed file not found: $TESTBED_FILE"
+            echo "Please ensure the appropriate secret is set:"
+            echo "  - SINGLE_NODE_TESTBED_FILE for single-node tests"
+            echo "  - MULTI_NODE_TESTBED_FILE for multi-node tests"
+            exit 1
+          fi
+          
+          echo "Running test: ${{ matrix.test_name }}"
+          echo "Testbed file: $TESTBED_FILE"
+          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
+          
+          python3 run_test.py "${{ matrix.test_name }}" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "$TESTBED_FILE"
       
       - name: Upload test results
-        if: always()
+        if: always() && steps.check.outputs.should_run == 'true'
         uses: actions/upload-artifact@v4
         with:
-          name: test-results-${{ inputs.test_name }}-${{ github.run_number }}
+          name: test-results-${{ matrix.test_name }}-${{ github.run_number }}
           path: tests/enroot/results/
           if-no-files-found: warn
           retention-days: 30
diff --git a/tests/enroot/run_test.py b/tests/enroot/run_test.py
index 864dc04..52b1bc5 100644
--- a/tests/enroot/run_test.py
+++ b/tests/enroot/run_test.py
@@ -18,6 +18,13 @@ def update_docker_image(test_name, docker_image):
         pattern = r'export DOCKER_IMAGE=.*'
         replacement = f'export DOCKER_IMAGE={docker_image}'
         print(f"Updating distributed_pytorch_sbatch.sh with image: {docker_image}")
+    elif test_name == "test_multi_node_rccl":
+        script_path = Path("batch_scripts/rccl_tests_sbatch.sh")
+        # Extract version tag from docker image (e.g., docker://rocm/roce-workload:version -> version)
+        version = docker_image.split(':')[-1] if ':' in docker_image else docker_image
+        pattern = r'DOCKER_IMAGE_VERSION=\${DOCKER_IMAGE_VERSION:-"[^"]*"}'
+        replacement = f'DOCKER_IMAGE_VERSION="${{DOCKER_IMAGE_VERSION:-"{version}"}}"'
+        print(f"Updating rccl_tests_sbatch.sh with image version: {version}")
     else:
         print(f"Unknown test name: {test_name}")
         return

From 89c8ec84ca1037deb5e21a1ac0c84143b304162e Mon Sep 17 00:00:00 2001
From: kithumma <kiran.thumma@amd.com>
Date: Sun, 25 Jan 2026 03:17:11 +0000
Subject: [PATCH 2/2] enroot-tests: split test matrix into three chained jobs

---
 .github/workflows/enroot-tests.yml | 198 +++++++++++++++++++----------
 1 file changed, 131 insertions(+), 67 deletions(-)

diff --git a/.github/workflows/enroot-tests.yml b/.github/workflows/enroot-tests.yml
index 33c4489..7534f1c 100644
--- a/.github/workflows/enroot-tests.yml
+++ b/.github/workflows/enroot-tests.yml
@@ -49,126 +49,190 @@ on:
 
 
 jobs:
-  run-enroot-tests:
+  # Single-node PyTorch test: runs on push events, or on manual dispatch when run_single_node_pytorch is true
+  test-single-node-pytorch:
+    if: github.event_name == 'push' || inputs.run_single_node_pytorch == true
     runs-on: enroot-runners
-    timeout-minutes: 360
-    strategy:
-      matrix:
-        test_name:
-          - test_single_node_pytorch
-          - test_multi_node_distributed_pytorch
-          - test_multi_node_rccl
-      max-parallel: 1
-      fail-fast: false
+    timeout-minutes: 120
     
     steps:
-      - name: Check if test should run
-        id: check
-        run: |
-          if [ "${{ github.event_name }}" = "push" ]; then
-            echo "should_run=true" >> $GITHUB_OUTPUT
-          elif [ "${{ matrix.test_name }}" = "test_single_node_pytorch" ] && [ "${{ inputs.run_single_node_pytorch }}" = "true" ]; then
-            echo "should_run=true" >> $GITHUB_OUTPUT
-          elif [ "${{ matrix.test_name }}" = "test_multi_node_distributed_pytorch" ] && [ "${{ inputs.run_multi_node_pytorch }}" = "true" ]; then
-            echo "should_run=true" >> $GITHUB_OUTPUT
-          elif [ "${{ matrix.test_name }}" = "test_multi_node_rccl" ] && [ "${{ inputs.run_multi_node_rccl }}" = "true" ]; then
-            echo "should_run=true" >> $GITHUB_OUTPUT
-          else
-            echo "should_run=false" >> $GITHUB_OUTPUT
-          fi
-
       - name: Checkout repository
-        if: steps.check.outputs.should_run == 'true'
         uses: actions/checkout@v4
       
       - name: Set up Python
-        if: steps.check.outputs.should_run == 'true'
         uses: actions/setup-python@v5
         with:
           python-version: '3.8'
       
       - name: Install dependencies
-        if: steps.check.outputs.should_run == 'true'
         run: |
           python3 -m pip install --upgrade pip
           pip install -r tests/enroot/requirements.txt
       
       - name: Create testbed files from secrets
-        if: steps.check.outputs.should_run == 'true'
         working-directory: tests/enroot
         env:
           SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }}
-          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
         run: |
           mkdir -p testbed
-          
-          # Write single-node testbed
           if [ -n "$SINGLE_NODE_TESTBED" ]; then
             printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml
             echo "Created testbed/single_node_tb.yml from secret"
           else
-            echo "[WARNING] SINGLE_NODE_TESTBED_FILE secret is not set"
+            echo "[ERROR] SINGLE_NODE_TESTBED_FILE secret is not set"
+            exit 1
+          fi
+      
+      - name: Run test_single_node_pytorch
+        working-directory: tests/enroot
+        env: {DOCKER_IMAGE: "${{ inputs.docker_image_single_node }}"}  # via env so the input stays data, never shell code
+        run: |
+          if [ "${{ github.event_name }}" = "push" ]; then
+            NO_INSTALL="false"
+            NO_UNINSTALL="false"
+          else
+            NO_INSTALL="${{ inputs.no_install }}"
+            NO_UNINSTALL="${{ inputs.no_uninstall }}"
           fi
           
-          # Write multi-node testbed
+          echo "Running test: test_single_node_pytorch"
+          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
+          
+          python3 run_test.py "test_single_node_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/single_node_tb.yml"
+      
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-results-single-node-pytorch-${{ github.run_number }}
+          path: tests/enroot/results/
+          if-no-files-found: warn
+          retention-days: 30
+
+  # Multi-node distributed PyTorch test: runs only if the single-node job succeeded or was skipped
+  test-multi-node-pytorch:
+    needs: test-single-node-pytorch  # wait for the single-node job before starting
+    if: |
+      always() && 
+      (github.event_name == 'push' || inputs.run_multi_node_pytorch == true) &&
+      (needs.test-single-node-pytorch.result == 'success' || needs.test-single-node-pytorch.result == 'skipped')
+    runs-on: enroot-runners
+    timeout-minutes: 120
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+      
+      - name: Install dependencies
+        run: |
+          python3 -m pip install --upgrade pip
+          pip install -r tests/enroot/requirements.txt
+      
+      - name: Create testbed files from secrets
+        working-directory: tests/enroot
+        env:
+          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
+        run: |
+          mkdir -p testbed
           if [ -n "$MULTI_NODE_TESTBED" ]; then
             printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
             echo "Created testbed/multi_node_tb.yml from secret"
           else
-            echo "[WARNING] MULTI_NODE_TESTBED_FILE secret is not set"
+            echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
+            exit 1
           fi
-          
-          echo "Testbed files:"
-          ls -la testbed/
       
-      - name: Run ${{ matrix.test_name }}
-        if: steps.check.outputs.should_run == 'true'
+      - name: Run test_multi_node_distributed_pytorch
         working-directory: tests/enroot
+        env: {DOCKER_IMAGE: "${{ inputs.docker_image_multi_node }}"}  # via env so the input stays data, never shell code
         run: |
-          # Determine testbed file and docker image based on test type
-          if [ "${{ matrix.test_name }}" = "test_single_node_pytorch" ]; then
-            TESTBED_FILE="testbed/single_node_tb.yml"
-            DOCKER_IMAGE="${{ inputs.docker_image_single_node }}"
-          else
-            # Multi-node tests use multi_node testbed
-            TESTBED_FILE="testbed/multi_node_tb.yml"
-            if [ "${{ matrix.test_name }}" = "test_multi_node_distributed_pytorch" ]; then
-              DOCKER_IMAGE="${{ inputs.docker_image_multi_node }}"
-            else
-              DOCKER_IMAGE="${{ inputs.docker_image_rccl }}"
-            fi
-          fi
-          
-          # Set flags based on event type
           if [ "${{ github.event_name }}" = "push" ]; then
             NO_INSTALL="false"
             NO_UNINSTALL="false"
-            DOCKER_IMAGE=""
           else
             NO_INSTALL="${{ inputs.no_install }}"
             NO_UNINSTALL="${{ inputs.no_uninstall }}"
           fi
           
-          # Validate testbed file exists
-          if [ ! -f "$TESTBED_FILE" ]; then
-            echo "[ERROR] Testbed file not found: $TESTBED_FILE"
-            echo "Please ensure the appropriate secret is set:"
-            echo "  - SINGLE_NODE_TESTBED_FILE for single-node tests"
-            echo "  - MULTI_NODE_TESTBED_FILE for multi-node tests"
+          echo "Running test: test_multi_node_distributed_pytorch"
+          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
+          
+          python3 run_test.py "test_multi_node_distributed_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"
+      
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-results-multi-node-pytorch-${{ github.run_number }}
+          path: tests/enroot/results/
+          if-no-files-found: warn
+          retention-days: 30
+
+  # Multi-node RCCL test: runs only if the multi-node PyTorch job succeeded or was skipped
+  test-multi-node-rccl:
+    needs: test-multi-node-pytorch  # wait for the multi-node PyTorch job before starting
+    if: |
+      always() && 
+      (github.event_name == 'push' || inputs.run_multi_node_rccl == true) &&
+      (needs.test-multi-node-pytorch.result == 'success' || needs.test-multi-node-pytorch.result == 'skipped')
+    runs-on: enroot-runners
+    timeout-minutes: 120
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+      
+      - name: Install dependencies
+        run: |
+          python3 -m pip install --upgrade pip
+          pip install -r tests/enroot/requirements.txt
+      
+      - name: Create testbed files from secrets
+        working-directory: tests/enroot
+        env:
+          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
+        run: |
+          mkdir -p testbed
+          if [ -n "$MULTI_NODE_TESTBED" ]; then
+            printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
+            echo "Created testbed/multi_node_tb.yml from secret"
+          else
+            echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
             exit 1
           fi
+      
+      - name: Run test_multi_node_rccl
+        working-directory: tests/enroot
+        env: {DOCKER_IMAGE: "${{ inputs.docker_image_rccl }}"}  # via env so the input stays data, never shell code
+        run: |
+          if [ "${{ github.event_name }}" = "push" ]; then
+            NO_INSTALL="false"
+            NO_UNINSTALL="false"
+          else
+            NO_INSTALL="${{ inputs.no_install }}"
+            NO_UNINSTALL="${{ inputs.no_uninstall }}"
+          fi
           
-          echo "Running test: ${{ matrix.test_name }}"
-          echo "Testbed file: $TESTBED_FILE"
+          echo "Running test: test_multi_node_rccl"
           echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
           
-          python3 run_test.py "${{ matrix.test_name }}" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "$TESTBED_FILE"
+          python3 run_test.py "test_multi_node_rccl" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"
       
       - name: Upload test results
-        if: always() && steps.check.outputs.should_run == 'true'
+        if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: test-results-${{ matrix.test_name }}-${{ github.run_number }}
+          name: test-results-multi-node-rccl-${{ github.run_number }}
           path: tests/enroot/results/
           if-no-files-found: warn
           retention-days: 30