From a83095f5549f493c9a9a2c98d1f22304f65d857d Mon Sep 17 00:00:00 2001 From: Charles Hofer Date: Wed, 11 Feb 2026 21:54:38 +0000 Subject: [PATCH] Make the nightly job use TheRock instead of ROCm --- .github/workflows/build-base-docker.yml | 9 +++++++- .github/workflows/build-docker.yml | 4 ++++ .github/workflows/build-wheels.yml | 4 ++++ .github/workflows/nightly.yml | 22 ++++++++------------ docker/Dockerfile.base-ubu24 | 3 ++- jax_rocm_plugin/build/rocm/tools/get_rocm.py | 16 +++++++++----- 6 files changed, 38 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build-base-docker.yml b/.github/workflows/build-base-docker.yml index 24f3fb2388..0f11ecf916 100644 --- a/.github/workflows/build-base-docker.yml +++ b/.github/workflows/build-base-docker.yml @@ -17,13 +17,16 @@ jobs: strategy: fail-fast: false matrix: - rocm-version: ["7.1.1", "7.2.0"] + rocm-version: ["7.1.1", "7.2.0", "7.12.0"] install-llvm: [true, false] include: - rocm-version: "7.1.1" runner-label: "linux-x86-64-1gpu-amd" - rocm-version: "7.2.0" runner-label: "linux-x86-64-1gpu-amd" + - rocm-version: "7.12.0" + therock-path: "https://rocm.nightlies.amd.com/tarball/therock-dist-linux-gfx94X-dcgpu-7.12.0a20260210.tar.gz" + runner-label: "linux-x86-64-1gpu-amd" steps: - name: Clean up old runs run: | @@ -51,6 +54,7 @@ jobs: env: ROCM_BUILD_JOB: ${{ matrix.rocm-build-job }} ROCM_BUILD_NUM: ${{ matrix.rocm-build-num }} + THEROCK_PATH: ${{ matrix.therock-path }} run: | BUILD_ARGS="" if [ -n "$ROCM_BUILD_JOB" ]; then @@ -59,6 +63,9 @@ jobs: if [ -n "$ROCM_BUILD_NUM" ]; then BUILD_ARGS="$BUILD_ARGS --rocm-build-num=$ROCM_BUILD_NUM" fi + if [ -n "$THEROCK_PATH" ]; then + BUILD_ARGS="$BUILD_ARGS --therock-path=$THEROCK_PATH" + fi python3 build/ci_build \ --rocm-version="${{ matrix.rocm-version }}" \ $BUILD_ARGS \ diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-docker.yml index bad6f3f122..4fe02dff64 100644 --- a/.github/workflows/build-docker.yml +++ 
b/.github/workflows/build-docker.yml @@ -12,6 +12,9 @@ on: rocm-build-num: required: false type: string + therock-path: + required: false + type: string runner-label: required: false type: string @@ -67,6 +70,7 @@ jobs: --rocm-version="${{ inputs.rocm-version }}" \ --rocm-build-job="${{ inputs.rocm-build-job }}" \ --rocm-build-num="${{ inputs.rocm-build-num }}" \ + --therock-path="${{ inputs.therock-path }}" \ build_dockers - name: Push docker images env: diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 9300e4019b..545099dab1 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -15,6 +15,9 @@ on: rocm-build-num: required: false type: string + therock-path: + required: false + type: string runner-label: required: false type: string @@ -86,6 +89,7 @@ jobs: --rocm-version="${{ inputs.rocm-version }}" \ --rocm-build-job="${{ inputs.rocm-build-job }}" \ --rocm-build-num="${{ inputs.rocm-build-num }}" \ + --therock-path="${{ inputs.therock-path }}" \ --jax-source-dir="./jax" \ dist_wheels \ --rbe diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index cd292963e3..528f520145 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -21,18 +21,16 @@ jobs: strategy: fail-fast: false matrix: - rocm-version: ["7.1.1", "7.2.0"] + rocm-version: ["7.12.0"] include: - - rocm-version: "7.1.1" - runner-label: '["linux-x86-64-1gpu-amd"]' - - rocm-version: "7.2.0" + - rocm-version: "7.12.0" + therock-path: "https://rocm.nightlies.amd.com/tarball/therock-dist-linux-gfx94X-dcgpu-7.12.0a20260210.tar.gz" runner-label: '["linux-x86-64-1gpu-amd"]' uses: ./.github/workflows/build-wheels.yml with: python-versions: "3.11,3.12,3.13,3.14" rocm-version: ${{ matrix.rocm-version }} - rocm-build-job: ${{ matrix.rocm-build-job }} - rocm-build-num: ${{ matrix.rocm-build-num }} + therock-path: ${{ matrix.therock-path }} runner-label: ${{ matrix.runner-label }} secrets: 
rbe_ci_cert: ${{ secrets.RBE_CI_CERT }} @@ -42,17 +40,15 @@ jobs: strategy: fail-fast: false matrix: - rocm-version: ["7.1.1", "7.2.0"] + rocm-version: ["7.12.0"] include: - - rocm-version: "7.1.1" - runner-label: '["linux-x86-64-1gpu-amd"]' - - rocm-version: "7.2.0" + - rocm-version: "7.12.0" + therock-path: "https://rocm.nightlies.amd.com/tarball/therock-dist-linux-gfx94X-dcgpu-7.12.0a20260210.tar.gz" runner-label: '["linux-x86-64-1gpu-amd"]' uses: ./.github/workflows/build-docker.yml with: rocm-version: ${{ matrix.rocm-version }} - rocm-build-job: ${{ matrix.rocm-build-job }} - rocm-build-num: ${{ matrix.rocm-build-num }} + therock-path: ${{ matrix.therock-path }} runner-label: ${{ matrix.runner-label }} extra-cr-tag: "nightly" call-test-and-upload: @@ -63,7 +59,7 @@ jobs: test-command: - "python jax_rocm_plugin/build/rocm/run_single_gpu.py -c -s" - "python jax_rocm_plugin/build/rocm/run_multi_gpu.py -c -s" - rocm-version: ["7.1.1", "7.2.0"] + rocm-version: ["7.12.0"] include: - test-command: "python jax_rocm_plugin/build/rocm/run_single_gpu.py -c -s" runner-label: '["linux-x86-64-1gpu-amd"]' diff --git a/docker/Dockerfile.base-ubu24 b/docker/Dockerfile.base-ubu24 index d35df292f9..2d5cf44418 100644 --- a/docker/Dockerfile.base-ubu24 +++ b/docker/Dockerfile.base-ubu24 @@ -76,7 +76,8 @@ RUN --mount=type=cache,target=/var/cache/apt \ apt-transport-https \ ca-certificates \ gnupg \ - curl && \ + curl \ + wget && \ curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \ | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && \ echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \ diff --git a/jax_rocm_plugin/build/rocm/tools/get_rocm.py b/jax_rocm_plugin/build/rocm/tools/get_rocm.py index d4bc3a4bc8..b886407360 100644 --- a/jax_rocm_plugin/build/rocm/tools/get_rocm.py +++ b/jax_rocm_plugin/build/rocm/tools/get_rocm.py @@ -233,11 +233,17 @@ def _install_therock(rocm_version, therock_path): 
os.symlink(rocm_real_path, rocm_sym_path, target_is_directory=True) # Make a symlink to amdgcn to fix LLVM not being able to find binaries - os.symlink( - rocm_real_path + "/lib/llvm/amdgcn/", - rocm_real_path + "/amdgcn", - target_is_directory=True, - ) + try: + os.symlink( + rocm_real_path + "/lib/llvm/amdgcn/", + rocm_real_path + "/amdgcn", + target_is_directory=True, + ) + except FileExistsError: + LOG.info( + "%s/amdgcn already exists. Not creating symlink", + rocm_real_path + ) def _setup_internal_repo(system, rocm_version, job_name, build_num):