From a6b1711097363d8d1aee842632eb1087a5f22b40 Mon Sep 17 00:00:00 2001 From: sulixu Date: Thu, 26 Feb 2026 09:49:58 -0800 Subject: [PATCH] add validator that nvidia-cdi-refresh is enabled for all AKS GPU SKUs --- e2e/scenario_test.go | 20 ++++++++++++++------ e2e/test_helpers.go | 1 + e2e/validators.go | 31 +++++++++++++++++++++++++------ 3 files changed, 40 insertions(+), 12 deletions(-) diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index 7fc8c2b35b7..d2e06e8fd73 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -1001,12 +1001,13 @@ func runScenarioUbuntu2204GPU(t *testing.T, vmSize string) { VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.SKU.Name = to.Ptr(vmSize) }, - Validator: func(ctx context.Context, s *Scenario) { - // Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable - ValidateNvidiaModProbeInstalled(ctx, s) - ValidateKubeletHasNotStopped(ctx, s) - ValidateServicesDoNotRestartKubelet(ctx, s) - }, + Validator: func(ctx context.Context, s *Scenario) { + // Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable + ValidateNvidiaModProbeInstalled(ctx, s) + ValidateKubeletHasNotStopped(ctx, s) + ValidateServicesDoNotRestartKubelet(ctx, s) + ValidateNvidiaCdiRefreshServiceRunning(ctx, s) + }, }, }) } @@ -1036,6 +1037,7 @@ func runScenarioUbuntuGRID(t *testing.T, vmSize string) { ValidateKubeletHasNotStopped(ctx, s) ValidateServicesDoNotRestartKubelet(ctx, s) ValidateNvidiaPersistencedRunning(ctx, s) + ValidateNvidiaCdiRefreshServiceRunning(ctx, s) }, }, }) @@ -1058,6 +1060,7 @@ func Test_Ubuntu2204_GPUA10_Scriptless(t *testing.T) { ValidateNvidiaModProbeInstalled(ctx, s) ValidateKubeletHasNotStopped(ctx, s) ValidateServicesDoNotRestartKubelet(ctx, s) + ValidateNvidiaCdiRefreshServiceRunning(ctx, s) }, AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { config.VmSize = "Standard_NV6ads_A10_v5" @@ -1091,6 
+1094,7 @@ func Test_Ubuntu2204_GPUGridDriver(t *testing.T) { ValidateNvidiaModProbeInstalled(ctx, s) ValidateKubeletHasNotStopped(ctx, s) ValidateNvidiaSMIInstalled(ctx, s) + ValidateNvidiaCdiRefreshServiceRunning(ctx, s) }, }, }) @@ -1622,6 +1626,7 @@ func Test_AzureLinuxV3_GPU(t *testing.T) { vmss.SKU.Name = to.Ptr("Standard_NC6s_v3") }, Validator: func(ctx context.Context, s *Scenario) { + ValidateNvidiaCdiRefreshServiceRunning(ctx, s) }, }, }) @@ -1648,6 +1653,7 @@ func Test_AzureLinuxV3_GPUAzureCNI(t *testing.T) { vmss.SKU.Name = to.Ptr("Standard_NC6s_v3") }, Validator: func(ctx context.Context, s *Scenario) { + ValidateNvidiaCdiRefreshServiceRunning(ctx, s) }, }, }) @@ -1673,6 +1679,7 @@ func Test_AzureLinuxV3_GPUAzureCNI_Scriptless(t *testing.T) { vmss.SKU.Name = to.Ptr("Standard_NC6s_v3") }, Validator: func(ctx context.Context, s *Scenario) { + ValidateNvidiaCdiRefreshServiceRunning(ctx, s) }, }, }) @@ -1937,6 +1944,7 @@ func runScenarioUbuntu2404GRID(t *testing.T, vmSize string) { ValidateKubeletHasNotStopped(ctx, s) ValidateServicesDoNotRestartKubelet(ctx, s) ValidateNvidiaPersistencedRunning(ctx, s) + ValidateNvidiaCdiRefreshServiceRunning(ctx, s) }, }, }) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 1c14ceac4e4..a027d194c55 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -782,6 +782,7 @@ func runScenarioGPUNPD(t *testing.T, vmSize, location, k8sSystemPoolSKU string) ValidateNvidiaModProbeInstalled(ctx, s) ValidateKubeletHasNotStopped(ctx, s) ValidateServicesDoNotRestartKubelet(ctx, s) + ValidateNvidiaCdiRefreshServiceRunning(ctx, s) // Then validate NPD configuration and GPU monitoring ValidateNPDGPUCountPlugin(ctx, s) diff --git a/e2e/validators.go b/e2e/validators.go index 2647fccb699..fced5256764 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1450,15 +1450,34 @@ func ValidateEnableNvidiaResource(ctx context.Context, s *Scenario) { } func ValidateNvidiaDevicePluginServiceRunning(ctx context.Context, s 
*Scenario) { + s.T.Helper() + s.T.Logf("validating that NVIDIA device plugin systemd service is running") + + command := []string{ + "set -ex", + "systemctl is-active nvidia-device-plugin.service", + "systemctl is-enabled nvidia-device-plugin.service", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "NVIDIA device plugin systemd service should be active and enabled") +} + +func ValidateNvidiaCdiRefreshServiceRunning(ctx context.Context, s *Scenario) { s.T.Helper() - s.T.Logf("validating that NVIDIA device plugin systemd service is running") command := []string{ - "set -ex", - "systemctl is-active nvidia-device-plugin.service", - "systemctl is-enabled nvidia-device-plugin.service", - } - execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "NVIDIA device plugin systemd service should be active and enabled") + "set -euo pipefail", + "systemctl is-enabled nvidia-cdi-refresh.path", + "systemctl is-enabled nvidia-cdi-refresh.service", + "RESULT=$(systemctl show nvidia-cdi-refresh.service --property=Result --value)", + "SUBSTATE=$(systemctl show nvidia-cdi-refresh.service --property=SubState --value)", + "if [ \"$RESULT\" != \"success\" ]; then", + " echo \"Expected nvidia-cdi-refresh.service Result=success, got $RESULT\"", + " exit 1", + "fi", + "# SubState is typically 'dead' for a oneshot service that completed successfully", + "echo \"nvidia-cdi-refresh.service Result=$RESULT SubState=$SUBSTATE\"", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "nvidia-cdi-refresh units should be enabled and last run must succeed") } func ValidateNodeAdvertisesGPUResources(ctx context.Context, s *Scenario, gpuCountExpected int64, resourceName string) {