Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions e2e/scenario_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1001,12 +1001,13 @@ func runScenarioUbuntu2204GPU(t *testing.T, vmSize string) {
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr(vmSize)
},
Validator: func(ctx context.Context, s *Scenario) {
// Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable
ValidateNvidiaModProbeInstalled(ctx, s)
ValidateKubeletHasNotStopped(ctx, s)
ValidateServicesDoNotRestartKubelet(ctx, s)
},
Validator: func(ctx context.Context, s *Scenario) {
// Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable
ValidateNvidiaModProbeInstalled(ctx, s)
ValidateKubeletHasNotStopped(ctx, s)
ValidateServicesDoNotRestartKubelet(ctx, s)
ValidateNvidiaCdiRefreshServiceRunning(ctx, s)
},
},
})
}
Expand Down Expand Up @@ -1036,6 +1037,7 @@ func runScenarioUbuntuGRID(t *testing.T, vmSize string) {
ValidateKubeletHasNotStopped(ctx, s)
ValidateServicesDoNotRestartKubelet(ctx, s)
ValidateNvidiaPersistencedRunning(ctx, s)
ValidateNvidiaCdiRefreshServiceRunning(ctx, s)
},
},
})
Expand All @@ -1058,6 +1060,7 @@ func Test_Ubuntu2204_GPUA10_Scriptless(t *testing.T) {
ValidateNvidiaModProbeInstalled(ctx, s)
ValidateKubeletHasNotStopped(ctx, s)
ValidateServicesDoNotRestartKubelet(ctx, s)
ValidateNvidiaCdiRefreshServiceRunning(ctx, s)
},
AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) {
config.VmSize = "Standard_NV6ads_A10_v5"
Expand Down Expand Up @@ -1091,6 +1094,7 @@ func Test_Ubuntu2204_GPUGridDriver(t *testing.T) {
ValidateNvidiaModProbeInstalled(ctx, s)
ValidateKubeletHasNotStopped(ctx, s)
ValidateNvidiaSMIInstalled(ctx, s)
ValidateNvidiaCdiRefreshServiceRunning(ctx, s)
},
},
})
Expand Down Expand Up @@ -1622,6 +1626,7 @@ func Test_AzureLinuxV3_GPU(t *testing.T) {
vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateNvidiaCdiRefreshServiceRunning(ctx, s)
},
},
})
Expand All @@ -1648,6 +1653,7 @@ func Test_AzureLinuxV3_GPUAzureCNI(t *testing.T) {
vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateNvidiaCdiRefreshServiceRunning(ctx, s)
},
},
})
Expand All @@ -1673,6 +1679,7 @@ func Test_AzureLinuxV3_GPUAzureCNI_Scriptless(t *testing.T) {
vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateNvidiaCdiRefreshServiceRunning(ctx, s)
},
},
})
Expand Down Expand Up @@ -1937,6 +1944,7 @@ func runScenarioUbuntu2404GRID(t *testing.T, vmSize string) {
ValidateKubeletHasNotStopped(ctx, s)
ValidateServicesDoNotRestartKubelet(ctx, s)
ValidateNvidiaPersistencedRunning(ctx, s)
ValidateNvidiaCdiRefreshServiceRunning(ctx, s)
},
},
})
Expand Down
1 change: 1 addition & 0 deletions e2e/test_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,7 @@ func runScenarioGPUNPD(t *testing.T, vmSize, location, k8sSystemPoolSKU string)
ValidateNvidiaModProbeInstalled(ctx, s)
ValidateKubeletHasNotStopped(ctx, s)
ValidateServicesDoNotRestartKubelet(ctx, s)
ValidateNvidiaCdiRefreshServiceRunning(ctx, s)

// Then validate NPD configuration and GPU monitoring
ValidateNPDGPUCountPlugin(ctx, s)
Expand Down
31 changes: 25 additions & 6 deletions e2e/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -1450,15 +1450,34 @@ func ValidateEnableNvidiaResource(ctx context.Context, s *Scenario) {
}

// ValidateNvidiaDevicePluginServiceRunning runs a script on the scenario VM
// that checks nvidia-device-plugin.service with `systemctl is-active` and
// `systemctl is-enabled`, and requires the script to exit with code 0.
func ValidateNvidiaDevicePluginServiceRunning(ctx context.Context, s *Scenario) {
	s.T.Helper()
	s.T.Logf("validating that NVIDIA device plugin systemd service is running")

	// `set -ex` makes the script stop at the first failing systemctl check
	// and traces each command as it runs.
	script := strings.Join([]string{
		"set -ex",
		"systemctl is-active nvidia-device-plugin.service",
		"systemctl is-enabled nvidia-device-plugin.service",
	}, "\n")

	execScriptOnVMForScenarioValidateExitCode(
		ctx,
		s,
		script,
		0,
		"NVIDIA device plugin systemd service should be active and enabled",
	)
}

func ValidateNvidiaCdiRefreshServiceRunning(ctx context.Context, s *Scenario) {
s.T.Helper()
s.T.Logf("validating that NVIDIA device plugin systemd service is running")

command := []string{
"set -ex",
"systemctl is-active nvidia-device-plugin.service",
"systemctl is-enabled nvidia-device-plugin.service",
}
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "NVIDIA device plugin systemd service should be active and enabled")
"set -euo pipefail",
"systemctl is-enabled nvidia-cdi-refresh.path",
"systemctl is-enabled nvidia-cdi-refresh.service",
"RESULT=$(systemctl show nvidia-cdi-refresh.service --property=Result --value)",
"SUBSTATE=$(systemctl show nvidia-cdi-refresh.service --property=SubState --value)",
"if [ \"$RESULT\" != \"success\" ]; then",
" echo \"Expected nvidia-cdi-refresh.service Result=success, got $RESULT\"",
" exit 1",
"fi",
"# SubState is typically 'dead' for a oneshot service that completed successfully",
"echo \"nvidia-cdi-refresh.service Result=$RESULT SubState=$SUBSTATE\"",
}
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "nvidia-cdi-refresh units should be enabled and last run must succeed")
}

func ValidateNodeAdvertisesGPUResources(ctx context.Context, s *Scenario, gpuCountExpected int64, resourceName string) {
Expand Down
Loading