From 6a750a9c870bca2a33d668059180128ff8043f10 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Thu, 26 Feb 2026 09:54:18 -0800 Subject: [PATCH 1/5] test: add e2e test for NVIDIA device plugin as DaemonSet Add a new e2e test that validates GPU nodes work correctly when the NVIDIA device plugin is deployed as a Kubernetes DaemonSet instead of a systemd service. This tests the upstream deployment model commonly used by customers who manage their own device plugin deployment. The test: - Provisions a GPU node with drivers but without systemd device plugin - Deploys nvidia-device-plugin v0.18.2 as a DaemonSet from MCR - Validates GPU resources are advertised and workloads can be scheduled --- e2e/scenario_gpu_daemonset_test.go | 197 +++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 e2e/scenario_gpu_daemonset_test.go diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go new file mode 100644 index 00000000000..294bb25c4a3 --- /dev/null +++ b/e2e/scenario_gpu_daemonset_test.go @@ -0,0 +1,197 @@ +package e2e + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/Azure/agentbaker/e2e/config" + "github.com/Azure/agentbaker/pkg/agent/datamodel" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" +) + +// Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly +// with the NVIDIA device plugin deployed as a Kubernetes DaemonSet instead of a systemd service. +// This is the "upstream" deployment model commonly used by customers who manage their own +// NVIDIA device plugin deployment. 
+func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that NVIDIA device plugin works when deployed as a DaemonSet (not systemd service)", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5" + nbc.ConfigGPUDriverIfNeeded = true + // Disable the systemd-based device plugin - we'll deploy it as a DaemonSet instead + nbc.EnableGPUDevicePluginIfNeeded = false + nbc.EnableNvidia = true + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5") + }, + Validator: func(ctx context.Context, s *Scenario) { + // First, validate that GPU drivers are installed + ValidateNvidiaModProbeInstalled(ctx, s) + + // Deploy the NVIDIA device plugin as a DaemonSet + deployNvidiaDevicePluginDaemonset(ctx, s) + + // Wait for the DaemonSet pod to be running on our node + waitForNvidiaDevicePluginDaemonsetReady(ctx, s) + + // Validate that GPU resources are advertised by the device plugin + ValidateNodeAdvertisesGPUResources(ctx, s, 1) + + // Validate that GPU workloads can be scheduled + ValidateGPUWorkloadSchedulable(ctx, s, 1) + + s.T.Logf("NVIDIA device plugin DaemonSet is functioning correctly") + }, + }, + }) +} + +// nvidiaDevicePluginDaemonset returns the NVIDIA device plugin DaemonSet spec +// based on the official upstream deployment from: +// https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml +func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet { + return &appsv1.DaemonSet{ + TypeMeta: metav1.TypeMeta{ + Kind: "DaemonSet", + APIVersion: "apps/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "nvidia-device-plugin-daemonset", + Namespace: "kube-system", + }, + Spec: appsv1.DaemonSetSpec{ + 
Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "name": "nvidia-device-plugin-ds", + }, + }, + UpdateStrategy: appsv1.DaemonSetUpdateStrategy{ + Type: appsv1.RollingUpdateDaemonSetStrategyType, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "name": "nvidia-device-plugin-ds", + }, + }, + Spec: corev1.PodSpec{ + // Target only our specific test node + NodeSelector: map[string]string{ + "kubernetes.io/hostname": nodeName, + }, + Tolerations: []corev1.Toleration{ + { + Key: "nvidia.com/gpu", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }, + }, + PriorityClassName: "system-node-critical", + Containers: []corev1.Container{ + { + Name: "nvidia-device-plugin-ctr", + Image: "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2", + Env: []corev1.EnvVar{ + { + Name: "FAIL_ON_INIT_ERROR", + Value: "false", + }, + }, + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: to.Ptr(false), + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "device-plugin", + MountPath: "/var/lib/kubelet/device-plugins", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "device-plugin", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/var/lib/kubelet/device-plugins", + }, + }, + }, + }, + }, + }, + }, + } +} + +// deployNvidiaDevicePluginDaemonset creates the NVIDIA device plugin DaemonSet in the cluster +func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) { + s.T.Helper() + s.T.Logf("Deploying NVIDIA device plugin as DaemonSet...") + + ds := nvidiaDevicePluginDaemonset(s.Runtime.VM.KubeName) + err := s.Runtime.Cluster.Kube.CreateDaemonset(ctx, ds) + require.NoError(s.T, err, "failed to create NVIDIA device plugin DaemonSet") + + s.T.Logf("NVIDIA device plugin DaemonSet created successfully") +} + +// 
waitForNvidiaDevicePluginDaemonsetReady waits for the NVIDIA device plugin pod to be running on the test node +func waitForNvidiaDevicePluginDaemonsetReady(ctx context.Context, s *Scenario) { + s.T.Helper() + s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", s.Runtime.VM.KubeName) + + // Wait for the pod to be running + err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) { + pods, err := s.Runtime.Cluster.Kube.Typed.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{ + LabelSelector: "name=nvidia-device-plugin-ds", + FieldSelector: fmt.Sprintf("spec.nodeName=%s", s.Runtime.VM.KubeName), + }) + if err != nil { + return false, err + } + + if len(pods.Items) == 0 { + s.T.Logf("No NVIDIA device plugin pod found yet on node %s", s.Runtime.VM.KubeName) + return false, nil + } + + pod := &pods.Items[0] + s.T.Logf("NVIDIA device plugin pod %s is in phase %s", pod.Name, pod.Status.Phase) + + if pod.Status.Phase == corev1.PodRunning { + // Check if all containers are ready + for _, containerStatus := range pod.Status.ContainerStatuses { + if !containerStatus.Ready { + s.T.Logf("Container %s is not ready yet", containerStatus.Name) + return false, nil + } + } + return true, nil + } + + return false, nil + }) + + require.NoError(s.T, err, "timed out waiting for NVIDIA device plugin DaemonSet pod to be ready") + s.T.Logf("NVIDIA device plugin DaemonSet pod is ready") +} From 978ca6d8f77cfca2d09cb076c153e18ad578a1df Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan <35557827+ganeshkumarashok@users.noreply.github.com> Date: Thu, 26 Feb 2026 11:22:02 -0800 Subject: [PATCH 2/5] Update e2e/scenario_gpu_daemonset_test.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- e2e/scenario_gpu_daemonset_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go index 
294bb25c4a3..852a98c7e49 100644 --- a/e2e/scenario_gpu_daemonset_test.go +++ b/e2e/scenario_gpu_daemonset_test.go @@ -51,7 +51,7 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { waitForNvidiaDevicePluginDaemonsetReady(ctx, s) // Validate that GPU resources are advertised by the device plugin - ValidateNodeAdvertisesGPUResources(ctx, s, 1) + ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu") // Validate that GPU workloads can be scheduled ValidateGPUWorkloadSchedulable(ctx, s, 1) From f965b50a9058e2fc044302829eb6c016a97249e8 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Thu, 26 Feb 2026 14:00:13 -0800 Subject: [PATCH 3/5] address PR review comments - Use unique DaemonSet name per node to avoid collisions in shared cluster - Add cleanup to delete DaemonSet when test finishes - Use Privileged mode matching upstream NVIDIA device plugin spec - Use existing WaitUntilPodRunning helper instead of custom wait loop - Add comments explaining image version choice --- e2e/scenario_gpu_daemonset_test.go | 88 +++++++++++++++--------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go index 852a98c7e49..92dc3506d7f 100644 --- a/e2e/scenario_gpu_daemonset_test.go +++ b/e2e/scenario_gpu_daemonset_test.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "testing" - "time" "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" @@ -14,7 +13,6 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/wait" ) // Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly @@ -65,20 +63,26 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { // nvidiaDevicePluginDaemonset returns the NVIDIA device plugin DaemonSet spec // based on the official upstream deployment from: // 
https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml +// +// The DaemonSet name includes the node name to avoid collisions when multiple +// GPU tests run against the same shared cluster. func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet { + // Use node name in DaemonSet name to avoid collisions in shared cluster + dsName := fmt.Sprintf("nvidia-device-plugin-%s", nodeName) + return &appsv1.DaemonSet{ TypeMeta: metav1.TypeMeta{ Kind: "DaemonSet", APIVersion: "apps/v1", }, ObjectMeta: metav1.ObjectMeta{ - Name: "nvidia-device-plugin-daemonset", + Name: dsName, Namespace: "kube-system", }, Spec: appsv1.DaemonSetSpec{ Selector: &metav1.LabelSelector{ MatchLabels: map[string]string{ - "name": "nvidia-device-plugin-ds", + "name": dsName, }, }, UpdateStrategy: appsv1.DaemonSetUpdateStrategy{ @@ -87,7 +91,7 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet { Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ - "name": "nvidia-device-plugin-ds", + "name": dsName, }, }, Spec: corev1.PodSpec{ @@ -105,7 +109,11 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet { PriorityClassName: "system-node-critical", Containers: []corev1.Container{ { - Name: "nvidia-device-plugin-ctr", + Name: "nvidia-device-plugin-ctr", + // Using upstream NVIDIA device plugin image from MCR. + // This is intentionally different from components.json which tracks + // the systemd-packaged version. This test validates the upstream + // container-based deployment model. 
Image: "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2", Env: []corev1.EnvVar{ { @@ -114,10 +122,10 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet { }, }, SecurityContext: &corev1.SecurityContext{ - AllowPrivilegeEscalation: to.Ptr(false), - Capabilities: &corev1.Capabilities{ - Drop: []corev1.Capability{"ALL"}, - }, + // Privileged mode is required for the device plugin to access + // GPU devices and register with kubelet's device plugin framework. + // This matches the upstream NVIDIA device plugin deployment spec. + Privileged: to.Ptr(true), }, VolumeMounts: []corev1.VolumeMount{ { @@ -144,6 +152,7 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet { } // deployNvidiaDevicePluginDaemonset creates the NVIDIA device plugin DaemonSet in the cluster +// and registers cleanup to delete it when the test finishes. func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) { s.T.Helper() s.T.Logf("Deploying NVIDIA device plugin as DaemonSet...") @@ -152,46 +161,37 @@ func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) { err := s.Runtime.Cluster.Kube.CreateDaemonset(ctx, ds) require.NoError(s.T, err, "failed to create NVIDIA device plugin DaemonSet") - s.T.Logf("NVIDIA device plugin DaemonSet created successfully") + s.T.Logf("NVIDIA device plugin DaemonSet %s/%s created successfully", ds.Namespace, ds.Name) + + // Register cleanup to delete the DaemonSet when the test finishes + s.T.Cleanup(func() { + s.T.Logf("Cleaning up NVIDIA device plugin DaemonSet %s/%s...", ds.Namespace, ds.Name) + deleteErr := s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete( + context.Background(), + ds.Name, + metav1.DeleteOptions{}, + ) + if deleteErr != nil { + s.T.Logf("Failed to delete NVIDIA device plugin DaemonSet %s/%s: %v", ds.Namespace, ds.Name, deleteErr) + } + }) } -// waitForNvidiaDevicePluginDaemonsetReady waits for the NVIDIA device plugin pod to be running on the test 
node +// waitForNvidiaDevicePluginDaemonsetReady waits for the NVIDIA device plugin pod to be running on the test node. +// Uses the existing WaitUntilPodRunning helper which handles CrashLoopBackOff and other failure states. func waitForNvidiaDevicePluginDaemonsetReady(ctx context.Context, s *Scenario) { s.T.Helper() - s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", s.Runtime.VM.KubeName) - - // Wait for the pod to be running - err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) { - pods, err := s.Runtime.Cluster.Kube.Typed.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{ - LabelSelector: "name=nvidia-device-plugin-ds", - FieldSelector: fmt.Sprintf("spec.nodeName=%s", s.Runtime.VM.KubeName), - }) - if err != nil { - return false, err - } - if len(pods.Items) == 0 { - s.T.Logf("No NVIDIA device plugin pod found yet on node %s", s.Runtime.VM.KubeName) - return false, nil - } - - pod := &pods.Items[0] - s.T.Logf("NVIDIA device plugin pod %s is in phase %s", pod.Name, pod.Status.Phase) - - if pod.Status.Phase == corev1.PodRunning { - // Check if all containers are ready - for _, containerStatus := range pod.Status.ContainerStatuses { - if !containerStatus.Ready { - s.T.Logf("Container %s is not ready yet", containerStatus.Name) - return false, nil - } - } - return true, nil - } - - return false, nil - }) + dsName := fmt.Sprintf("nvidia-device-plugin-%s", s.Runtime.VM.KubeName) + s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", s.Runtime.VM.KubeName) + _, err := s.Runtime.Cluster.Kube.WaitUntilPodRunning( + ctx, + "kube-system", + fmt.Sprintf("name=%s", dsName), + fmt.Sprintf("spec.nodeName=%s", s.Runtime.VM.KubeName), + ) require.NoError(s.T, err, "timed out waiting for NVIDIA device plugin DaemonSet pod to be ready") + s.T.Logf("NVIDIA device plugin DaemonSet pod is ready") } From 460b260f6a45bc8130c2a14b117b43a950d45820 
Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Thu, 26 Feb 2026 15:13:16 -0800 Subject: [PATCH 4/5] improve test robustness and cleanup - Extract image version to constant for easier updates - Add validation that systemd device plugin is not running - Truncate DaemonSet name to 63 chars (K8s limit) - Add timeout contexts to cleanup operations - Delete existing DaemonSet before create for idempotency --- e2e/scenario_gpu_daemonset_test.go | 72 +++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go index 92dc3506d7f..eb590ff0ad6 100644 --- a/e2e/scenario_gpu_daemonset_test.go +++ b/e2e/scenario_gpu_daemonset_test.go @@ -3,7 +3,9 @@ package e2e import ( "context" "fmt" + "strings" "testing" + "time" "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" @@ -15,6 +17,14 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +const ( + // nvidiaDevicePluginImage is the upstream NVIDIA device plugin image from MCR. + // This is intentionally different from components.json which tracks the systemd-packaged version. + // This test validates the upstream container-based deployment model. + // Update this when a new version is available in MCR. + nvidiaDevicePluginImage = "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2" +) + // Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly // with the NVIDIA device plugin deployed as a Kubernetes DaemonSet instead of a systemd service. 
// This is the "upstream" deployment model commonly used by customers who manage their own @@ -42,6 +52,10 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { // First, validate that GPU drivers are installed ValidateNvidiaModProbeInstalled(ctx, s) + // Verify that the systemd-based device plugin is NOT running + // (we disabled it via EnableGPUDevicePluginIfNeeded = false) + validateNvidiaDevicePluginServiceNotRunning(ctx, s) + // Deploy the NVIDIA device plugin as a DaemonSet deployNvidiaDevicePluginDaemonset(ctx, s) @@ -60,6 +74,36 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { }) } +// validateNvidiaDevicePluginServiceNotRunning verifies that the systemd-based +// NVIDIA device plugin service is not running (since we're testing the DaemonSet model). +func validateNvidiaDevicePluginServiceNotRunning(ctx context.Context, s *Scenario) { + s.T.Helper() + s.T.Logf("Verifying that nvidia-device-plugin.service is not running...") + + // Check if the service exists and is inactive + // Using "is-active" which returns non-zero if not active + result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null || echo 'not-running'") + output := strings.TrimSpace(result.stdout) + + // The service should either not exist or be inactive + if output == "active" { + s.T.Fatalf("nvidia-device-plugin.service is unexpectedly running - this test requires the systemd service to be disabled") + } + s.T.Logf("Confirmed nvidia-device-plugin.service is not active (status: %s)", output) +} + +// nvidiaDevicePluginDaemonsetName returns a unique DaemonSet name for the given node. +// The name is truncated to 63 characters because it is also used as the "name" label value, and Kubernetes limits label values to 63 characters. 
+func nvidiaDevicePluginDaemonsetName(nodeName string) string { + prefix := "nvdp-" // Short prefix to leave room for node name + maxLen := 63 + name := prefix + nodeName + if len(name) > maxLen { + name = name[:maxLen] + } + return name +} + // nvidiaDevicePluginDaemonset returns the NVIDIA device plugin DaemonSet spec // based on the official upstream deployment from: // https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml @@ -67,8 +111,7 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { // The DaemonSet name includes the node name to avoid collisions when multiple // GPU tests run against the same shared cluster. func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet { - // Use node name in DaemonSet name to avoid collisions in shared cluster - dsName := fmt.Sprintf("nvidia-device-plugin-%s", nodeName) + dsName := nvidiaDevicePluginDaemonsetName(nodeName) return &appsv1.DaemonSet{ TypeMeta: metav1.TypeMeta{ @@ -109,12 +152,8 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet { PriorityClassName: "system-node-critical", Containers: []corev1.Container{ { - Name: "nvidia-device-plugin-ctr", - // Using upstream NVIDIA device plugin image from MCR. - // This is intentionally different from components.json which tracks - // the systemd-packaged version. This test validates the upstream - // container-based deployment model. 
- Image: "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2", + Name: "nvidia-device-plugin-ctr", + Image: nvidiaDevicePluginImage, Env: []corev1.EnvVar{ { Name: "FAIL_ON_INIT_ERROR", @@ -158,6 +197,17 @@ func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) { s.T.Logf("Deploying NVIDIA device plugin as DaemonSet...") ds := nvidiaDevicePluginDaemonset(s.Runtime.VM.KubeName) + + // Delete any existing DaemonSet from a previous failed run + deleteCtx, deleteCancel := context.WithTimeout(ctx, 30*time.Second) + defer deleteCancel() + _ = s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete( + deleteCtx, + ds.Name, + metav1.DeleteOptions{}, + ) + + // Create the DaemonSet err := s.Runtime.Cluster.Kube.CreateDaemonset(ctx, ds) require.NoError(s.T, err, "failed to create NVIDIA device plugin DaemonSet") @@ -166,8 +216,10 @@ func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) { // Register cleanup to delete the DaemonSet when the test finishes s.T.Cleanup(func() { s.T.Logf("Cleaning up NVIDIA device plugin DaemonSet %s/%s...", ds.Namespace, ds.Name) + cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cleanupCancel() deleteErr := s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete( - context.Background(), + cleanupCtx, ds.Name, metav1.DeleteOptions{}, ) @@ -182,7 +234,7 @@ func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) { func waitForNvidiaDevicePluginDaemonsetReady(ctx context.Context, s *Scenario) { s.T.Helper() - dsName := fmt.Sprintf("nvidia-device-plugin-%s", s.Runtime.VM.KubeName) + dsName := nvidiaDevicePluginDaemonsetName(s.Runtime.VM.KubeName) s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", s.Runtime.VM.KubeName) _, err := s.Runtime.Cluster.Kube.WaitUntilPodRunning( From 2eae10eb6f8348f4eaa665883a90d07d43dc780f Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: 
Thu, 26 Feb 2026 15:47:06 -0800 Subject: [PATCH 5/5] fix comments to accurately describe GPU device plugin behavior --- e2e/scenario_gpu_daemonset_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go index eb590ff0ad6..2b98e3d0384 100644 --- a/e2e/scenario_gpu_daemonset_test.go +++ b/e2e/scenario_gpu_daemonset_test.go @@ -41,7 +41,8 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5" nbc.ConfigGPUDriverIfNeeded = true - // Disable the systemd-based device plugin - we'll deploy it as a DaemonSet instead + // Don't enable the managed GPU experience - we'll deploy the device plugin as a DaemonSet instead. + // By not setting EnableManagedGPU=true or the VMSS tag, the systemd-based device plugin won't start. nbc.EnableGPUDevicePluginIfNeeded = false nbc.EnableNvidia = true }, @@ -53,7 +54,7 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { ValidateNvidiaModProbeInstalled(ctx, s) // Verify that the systemd-based device plugin is NOT running - // (we disabled it via EnableGPUDevicePluginIfNeeded = false) + // (managed GPU experience is not enabled, so the service should not be active) validateNvidiaDevicePluginServiceNotRunning(ctx, s) // Deploy the NVIDIA device plugin as a DaemonSet