From f869d83857d865a5067faddd3b813ffd6f31ac60 Mon Sep 17 00:00:00 2001 From: Mitch Zhu Date: Tue, 24 Feb 2026 22:49:59 +0000 Subject: [PATCH] feat: draft grid gpu driver installation for azurelinux draft grid gpu driver selection logic for azurelinux signed-off-by: --- .../artifacts/mariner/cse_install_mariner.sh | 34 +++++++- .../artifacts/cse_install_mariner_spec.sh | 77 +++++++++++++++++++ 2 files changed, 107 insertions(+), 4 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh b/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh index c82cc88a796..fc4af29e155 100755 --- a/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh +++ b/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh @@ -119,6 +119,22 @@ should_use_nvidia_open_drivers() { return 0 } +downloadGridDrivers() { + # Converged GPU sizes (NVads_A10_v5, NCads_A10_v4) require NVIDIA GRID (vGPU guest) + # drivers instead of CUDA drivers. These sizes use a "converged" driver to support + # both CUDA and GRID workloads — installing vanilla CUDA drivers will fail. + # + # TODO(mitchzhu): GRID driver RPM is not yet available on PMC (packages.microsoft.com). + # Once published, replace with: + # GRID_PACKAGE=$(dnf repoquery -y --available "nvidia-vgpu-guest-driver*" | \ + # grep -E "nvidia-vgpu-guest-driver-[0-9]+.*_${KERNEL_VERSION}" | sort -V | tail -n 1) + # dnf_install 30 1 600 ${GRID_PACKAGE} + local grid_rpm="nvidia-vgpu-guest-driver-570.195.03-1_${KERNEL_VERSION}.x86_64.rpm" + local grid_url="https://github.com/miz060/AgentBaker/releases/download/grid-driver-v570.195.03/nvidia-vgpu-guest-driver-570.195.03-1_6.6.121.1.1.azl3.x86_64.rpm" + echo "Installing GRID driver: ${grid_rpm}" + dnf_install 5 10 600 "${grid_url}" || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT +} + downloadGPUDrivers() { # Mariner CUDA rpm name comes in the following format: # @@ -128,15 +144,25 @@ downloadGPUDrivers() { # 2. 
NVIDIA OpenRM driver: # cuda-open-%{nvidia gpu driver version}_%{kernel source version}.%{kernel release version}.{mariner rpm postfix} # - # Legacy GPUs (T4, V100) require proprietary drivers; A100+ use NVIDIA open drivers. - # VM SKU is retrieved from IMDS to determine which driver to use. + # 3. NVIDIA GRID (vGPU guest) driver for converged GPU sizes: + # nvidia-vgpu-guest-driver-%{version}_%{kernel version}.{mariner rpm postfix} + # + # NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on ConvergedGPUDriverSizes map + # in gpu_components.go. Converged sizes get "grid"; all others get "cuda". + # Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers. KERNEL_VERSION=$(uname -r | sed 's/-/./g') + VM_SKU=$(get_compute_sku) + + # Converged GPU sizes use GRID drivers instead of CUDA drivers + if [ "$NVIDIA_GPU_DRIVER_TYPE" = "grid" ]; then + echo "VM SKU ${VM_SKU} uses NVIDIA GRID driver (converged)" + downloadGridDrivers + return + fi local driver_ret should_use_nvidia_open_drivers driver_ret=$? 
- # Get VM SKU for logging (export already done by should_use_nvidia_open_drivers) - VM_SKU=$(get_compute_sku) if [ "$driver_ret" -eq 2 ]; then echo "Failed to determine GPU driver type" exit $ERR_MISSING_CUDA_PACKAGE diff --git a/spec/parts/linux/cloud-init/artifacts/cse_install_mariner_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_install_mariner_spec.sh index 9d6100cda8c..d6ec8e45c99 100644 --- a/spec/parts/linux/cloud-init/artifacts/cse_install_mariner_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_install_mariner_spec.sh @@ -186,4 +186,81 @@ Describe 'cse_install_mariner.sh' The status should equal 0 End End + + Describe 'downloadGPUDrivers grid vs cuda selection' + # Tests the routing logic in downloadGPUDrivers(): + # NVIDIA_GPU_DRIVER_TYPE="grid" → downloadGridDrivers (converged A10 sizes) + # NVIDIA_GPU_DRIVER_TYPE="cuda" → cuda/cuda-open path (all other GPU sizes) + # + # We mock downloadGridDrivers and the cuda download path to isolate + # the selection logic without triggering actual downloads or exits. 
+ + MOCK_VM_SKU="" + get_compute_sku() { echo "$MOCK_VM_SKU"; } + + # Track which path was taken + GRID_CALLED="" + downloadGridDrivers() { GRID_CALLED="true"; } + + # Mock should_use_nvidia_open_drivers to avoid IMDS dependency + MOCK_OPEN_RET=0 + should_use_nvidia_open_drivers() { return "$MOCK_OPEN_RET"; } + + # Mock uname to return a kernel version matching our fake package + uname() { echo "6.6.121.1-1.azl3"; } + + # Mock dnf repoquery to return fake packages matching both cuda and cuda-open patterns + dnf() { + echo "cuda-open-570.195.03-1_6.6.121.1.1.azl3.x86_64" + echo "cuda-570.195.03-1_6.6.121.1.1.azl3.x86_64" + } + + It 'selects GRID driver path when NVIDIA_GPU_DRIVER_TYPE is grid' + NVIDIA_GPU_DRIVER_TYPE="grid" + MOCK_VM_SKU="Standard_NV36ads_A10_v5" + GRID_CALLED="" + When call downloadGPUDrivers + The output should include "NVIDIA GRID driver (converged)" + The variable GRID_CALLED should equal "true" + End + + It 'selects GRID driver path for NCads_A10_v4 converged size' + NVIDIA_GPU_DRIVER_TYPE="grid" + MOCK_VM_SKU="Standard_NC8ads_A10_v4" + GRID_CALLED="" + When call downloadGPUDrivers + The output should include "NVIDIA GRID driver (converged)" + The variable GRID_CALLED should equal "true" + End + + It 'selects cuda-open path for A100 when NVIDIA_GPU_DRIVER_TYPE is cuda' + NVIDIA_GPU_DRIVER_TYPE="cuda" + MOCK_VM_SKU="Standard_ND96asr_v4" + MOCK_OPEN_RET=0 + GRID_CALLED="" + When call downloadGPUDrivers + The output should include "NVIDIA OpenRM driver (cuda-open)" + The variable GRID_CALLED should not equal "true" + End + + It 'selects proprietary cuda path for T4 when NVIDIA_GPU_DRIVER_TYPE is cuda' + NVIDIA_GPU_DRIVER_TYPE="cuda" + MOCK_VM_SKU="Standard_NC4as_T4_v3" + MOCK_OPEN_RET=1 + GRID_CALLED="" + When call downloadGPUDrivers + The output should include "NVIDIA proprietary driver (cuda)" + The variable GRID_CALLED should not equal "true" + End + + It 'does not select GRID path when NVIDIA_GPU_DRIVER_TYPE is empty' + 
NVIDIA_GPU_DRIVER_TYPE="" + MOCK_VM_SKU="Standard_ND96asr_v4" + MOCK_OPEN_RET=0 + GRID_CALLED="" + When call downloadGPUDrivers + The output should not include "NVIDIA GRID driver" + The variable GRID_CALLED should not equal "true" + End + End End