From 6a750a9c870bca2a33d668059180128ff8043f10 Mon Sep 17 00:00:00 2001
From: Ganeshkumar Ashokavardhanan <aganeshkumar@microsoft.com>
Date: Thu, 26 Feb 2026 09:54:18 -0800
Subject: [PATCH 1/5] test: add e2e test for NVIDIA device plugin as DaemonSet

Add a new e2e test that validates GPU nodes work correctly when the
NVIDIA device plugin is deployed as a Kubernetes DaemonSet instead of
a systemd service. This tests the upstream deployment model commonly
used by customers who manage their own device plugin deployment.

The test:
- Provisions a GPU node with drivers but without systemd device plugin
- Deploys nvidia-device-plugin v0.18.2 as a DaemonSet from MCR
- Validates GPU resources are advertised and workloads can be scheduled
---
 e2e/scenario_gpu_daemonset_test.go | 197 +++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 e2e/scenario_gpu_daemonset_test.go

diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go
new file mode 100644
index 00000000000..294bb25c4a3
--- /dev/null
+++ b/e2e/scenario_gpu_daemonset_test.go
@@ -0,0 +1,197 @@
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/Azure/agentbaker/e2e/config"
+	"github.com/Azure/agentbaker/pkg/agent/datamodel"
+	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
+	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
+	"github.com/stretchr/testify/require"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
+)
+
+// Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly
+// with the NVIDIA device plugin deployed as a Kubernetes DaemonSet instead of a systemd service.
+// This is the "upstream" deployment model commonly used by customers who manage their own
+// NVIDIA device plugin deployment.
+func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) {
+	RunScenario(t, &Scenario{
+		Description: "Tests that NVIDIA device plugin works when deployed as a DaemonSet (not systemd service)",
+		Tags: Tags{
+			GPU: true,
+		},
+		Config: Config{
+			Cluster: ClusterKubenet,
+			VHD:     config.VHDUbuntu2204Gen2Containerd,
+			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
+				nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5"
+				nbc.ConfigGPUDriverIfNeeded = true
+				// Disable the systemd-based device plugin - we'll deploy it as a DaemonSet instead
+				nbc.EnableGPUDevicePluginIfNeeded = false
+				nbc.EnableNvidia = true
+			},
+			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
+				vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5")
+			},
+			Validator: func(ctx context.Context, s *Scenario) {
+				// First, validate that GPU drivers are installed
+				ValidateNvidiaModProbeInstalled(ctx, s)
+
+				// Deploy the NVIDIA device plugin as a DaemonSet
+				deployNvidiaDevicePluginDaemonset(ctx, s)
+
+				// Wait for the DaemonSet pod to be running on our node
+				waitForNvidiaDevicePluginDaemonsetReady(ctx, s)
+
+				// Validate that GPU resources are advertised by the device plugin
+				ValidateNodeAdvertisesGPUResources(ctx, s, 1)
+
+				// Validate that GPU workloads can be scheduled
+				ValidateGPUWorkloadSchedulable(ctx, s, 1)
+
+				s.T.Logf("NVIDIA device plugin DaemonSet is functioning correctly")
+			},
+		},
+	})
+}
+
+// nvidiaDevicePluginDaemonset returns the NVIDIA device plugin DaemonSet spec
+// based on the official upstream deployment from:
+// https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml
+func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
+	return &appsv1.DaemonSet{
+		TypeMeta: metav1.TypeMeta{
+			Kind:       "DaemonSet",
+			APIVersion: "apps/v1",
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "nvidia-device-plugin-daemonset",
+			Namespace: "kube-system",
+		},
+		Spec: appsv1.DaemonSetSpec{
+			Selector: &metav1.LabelSelector{
+				MatchLabels: map[string]string{
+					"name": "nvidia-device-plugin-ds",
+				},
+			},
+			UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
+				Type: appsv1.RollingUpdateDaemonSetStrategyType,
+			},
+			Template: corev1.PodTemplateSpec{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"name": "nvidia-device-plugin-ds",
+					},
+				},
+				Spec: corev1.PodSpec{
+					// Target only our specific test node
+					NodeSelector: map[string]string{
+						"kubernetes.io/hostname": nodeName,
+					},
+					Tolerations: []corev1.Toleration{
+						{
+							Key:      "nvidia.com/gpu",
+							Operator: corev1.TolerationOpExists,
+							Effect:   corev1.TaintEffectNoSchedule,
+						},
+					},
+					PriorityClassName: "system-node-critical",
+					Containers: []corev1.Container{
+						{
+							Name:  "nvidia-device-plugin-ctr",
+							Image: "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2",
+							Env: []corev1.EnvVar{
+								{
+									Name:  "FAIL_ON_INIT_ERROR",
+									Value: "false",
+								},
+							},
+							SecurityContext: &corev1.SecurityContext{
+								AllowPrivilegeEscalation: to.Ptr(false),
+								Capabilities: &corev1.Capabilities{
+									Drop: []corev1.Capability{"ALL"},
+								},
+							},
+							VolumeMounts: []corev1.VolumeMount{
+								{
+									Name:      "device-plugin",
+									MountPath: "/var/lib/kubelet/device-plugins",
+								},
+							},
+						},
+					},
+					Volumes: []corev1.Volume{
+						{
+							Name: "device-plugin",
+							VolumeSource: corev1.VolumeSource{
+								HostPath: &corev1.HostPathVolumeSource{
+									Path: "/var/lib/kubelet/device-plugins",
+								},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+}
+
+// deployNvidiaDevicePluginDaemonset creates the NVIDIA device plugin DaemonSet in the cluster
+func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+	s.T.Logf("Deploying NVIDIA device plugin as DaemonSet...")
+
+	ds := nvidiaDevicePluginDaemonset(s.Runtime.VM.KubeName)
+	err := s.Runtime.Cluster.Kube.CreateDaemonset(ctx, ds)
+	require.NoError(s.T, err, "failed to create NVIDIA device plugin DaemonSet")
+
+	s.T.Logf("NVIDIA device plugin DaemonSet created successfully")
+}
+
+// waitForNvidiaDevicePluginDaemonsetReady waits for the NVIDIA device plugin pod to be running on the test node
+func waitForNvidiaDevicePluginDaemonsetReady(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+	s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", s.Runtime.VM.KubeName)
+
+	// Wait for the pod to be running
+	err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) {
+		pods, err := s.Runtime.Cluster.Kube.Typed.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{
+			LabelSelector: "name=nvidia-device-plugin-ds",
+			FieldSelector: fmt.Sprintf("spec.nodeName=%s", s.Runtime.VM.KubeName),
+		})
+		if err != nil {
+			return false, err
+		}
+
+		if len(pods.Items) == 0 {
+			s.T.Logf("No NVIDIA device plugin pod found yet on node %s", s.Runtime.VM.KubeName)
+			return false, nil
+		}
+
+		pod := &pods.Items[0]
+		s.T.Logf("NVIDIA device plugin pod %s is in phase %s", pod.Name, pod.Status.Phase)
+
+		if pod.Status.Phase == corev1.PodRunning {
+			// Check if all containers are ready
+			for _, containerStatus := range pod.Status.ContainerStatuses {
+				if !containerStatus.Ready {
+					s.T.Logf("Container %s is not ready yet", containerStatus.Name)
+					return false, nil
+				}
+			}
+			return true, nil
+		}
+
+		return false, nil
+	})
+
+	require.NoError(s.T, err, "timed out waiting for NVIDIA device plugin DaemonSet pod to be ready")
+	s.T.Logf("NVIDIA device plugin DaemonSet pod is ready")
+}

From 978ca6d8f77cfca2d09cb076c153e18ad578a1df Mon Sep 17 00:00:00 2001
From: Ganeshkumar Ashokavardhanan
 <35557827+ganeshkumarashok@users.noreply.github.com>
Date: Thu, 26 Feb 2026 11:22:02 -0800
Subject: [PATCH 2/5] Update e2e/scenario_gpu_daemonset_test.go

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 e2e/scenario_gpu_daemonset_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go
index 294bb25c4a3..852a98c7e49 100644
--- a/e2e/scenario_gpu_daemonset_test.go
+++ b/e2e/scenario_gpu_daemonset_test.go
@@ -51,7 +51,7 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) {
 				waitForNvidiaDevicePluginDaemonsetReady(ctx, s)
 
 				// Validate that GPU resources are advertised by the device plugin
-				ValidateNodeAdvertisesGPUResources(ctx, s, 1)
+				ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")
 
 				// Validate that GPU workloads can be scheduled
 				ValidateGPUWorkloadSchedulable(ctx, s, 1)

From f965b50a9058e2fc044302829eb6c016a97249e8 Mon Sep 17 00:00:00 2001
From: Ganeshkumar Ashokavardhanan <aganeshkumar@microsoft.com>
Date: Thu, 26 Feb 2026 14:00:13 -0800
Subject: [PATCH 3/5] address PR review comments

- Use unique DaemonSet name per node to avoid collisions in shared cluster
- Add cleanup to delete DaemonSet when test finishes
- Use Privileged mode matching upstream NVIDIA device plugin spec
- Use existing WaitUntilPodRunning helper instead of custom wait loop
- Add comments explaining image version choice
---
 e2e/scenario_gpu_daemonset_test.go | 88 +++++++++++++++---------------
 1 file changed, 44 insertions(+), 44 deletions(-)

diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go
index 852a98c7e49..92dc3506d7f 100644
--- a/e2e/scenario_gpu_daemonset_test.go
+++ b/e2e/scenario_gpu_daemonset_test.go
@@ -4,7 +4,6 @@ import (
 	"context"
 	"fmt"
 	"testing"
-	"time"
 
 	"github.com/Azure/agentbaker/e2e/config"
 	"github.com/Azure/agentbaker/pkg/agent/datamodel"
@@ -14,7 +13,6 @@ import (
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/util/wait"
 )
 
 // Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly
@@ -65,20 +63,26 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) {
 // nvidiaDevicePluginDaemonset returns the NVIDIA device plugin DaemonSet spec
 // based on the official upstream deployment from:
 // https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml
+//
+// The DaemonSet name includes the node name to avoid collisions when multiple
+// GPU tests run against the same shared cluster.
 func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
+	// Use node name in DaemonSet name to avoid collisions in shared cluster
+	dsName := fmt.Sprintf("nvidia-device-plugin-%s", nodeName)
+
 	return &appsv1.DaemonSet{
 		TypeMeta: metav1.TypeMeta{
 			Kind:       "DaemonSet",
 			APIVersion: "apps/v1",
 		},
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      "nvidia-device-plugin-daemonset",
+			Name:      dsName,
 			Namespace: "kube-system",
 		},
 		Spec: appsv1.DaemonSetSpec{
 			Selector: &metav1.LabelSelector{
 				MatchLabels: map[string]string{
-					"name": "nvidia-device-plugin-ds",
+					"name": dsName,
 				},
 			},
 			UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
@@ -87,7 +91,7 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
 			Template: corev1.PodTemplateSpec{
 				ObjectMeta: metav1.ObjectMeta{
 					Labels: map[string]string{
-						"name": "nvidia-device-plugin-ds",
+						"name": dsName,
 					},
 				},
 				Spec: corev1.PodSpec{
@@ -105,7 +109,11 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
 					PriorityClassName: "system-node-critical",
 					Containers: []corev1.Container{
 						{
-							Name:  "nvidia-device-plugin-ctr",
+							Name: "nvidia-device-plugin-ctr",
+							// Using upstream NVIDIA device plugin image from MCR.
+							// This is intentionally different from components.json which tracks
+							// the systemd-packaged version. This test validates the upstream
+							// container-based deployment model.
 							Image: "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2",
 							Env: []corev1.EnvVar{
 								{
@@ -114,10 +122,10 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
 								},
 							},
 							SecurityContext: &corev1.SecurityContext{
-								AllowPrivilegeEscalation: to.Ptr(false),
-								Capabilities: &corev1.Capabilities{
-									Drop: []corev1.Capability{"ALL"},
-								},
+								// Privileged mode is required for the device plugin to access
+								// GPU devices and register with kubelet's device plugin framework.
+								// This matches the upstream NVIDIA device plugin deployment spec.
+								Privileged: to.Ptr(true),
 							},
 							VolumeMounts: []corev1.VolumeMount{
 								{
@@ -144,6 +152,7 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
 }
 
 // deployNvidiaDevicePluginDaemonset creates the NVIDIA device plugin DaemonSet in the cluster
+// and registers cleanup to delete it when the test finishes.
 func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) {
 	s.T.Helper()
 	s.T.Logf("Deploying NVIDIA device plugin as DaemonSet...")
@@ -152,46 +161,37 @@ func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) {
 	err := s.Runtime.Cluster.Kube.CreateDaemonset(ctx, ds)
 	require.NoError(s.T, err, "failed to create NVIDIA device plugin DaemonSet")
 
-	s.T.Logf("NVIDIA device plugin DaemonSet created successfully")
+	s.T.Logf("NVIDIA device plugin DaemonSet %s/%s created successfully", ds.Namespace, ds.Name)
+
+	// Register cleanup to delete the DaemonSet when the test finishes
+	s.T.Cleanup(func() {
+		s.T.Logf("Cleaning up NVIDIA device plugin DaemonSet %s/%s...", ds.Namespace, ds.Name)
+		deleteErr := s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete(
+			context.Background(),
+			ds.Name,
+			metav1.DeleteOptions{},
+		)
+		if deleteErr != nil {
+			s.T.Logf("Failed to delete NVIDIA device plugin DaemonSet %s/%s: %v", ds.Namespace, ds.Name, deleteErr)
+		}
+	})
 }
 
-// waitForNvidiaDevicePluginDaemonsetReady waits for the NVIDIA device plugin pod to be running on the test node
+// waitForNvidiaDevicePluginDaemonsetReady waits for the NVIDIA device plugin pod to be running on the test node.
+// Uses the existing WaitUntilPodRunning helper which handles CrashLoopBackOff and other failure states.
 func waitForNvidiaDevicePluginDaemonsetReady(ctx context.Context, s *Scenario) {
 	s.T.Helper()
-	s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", s.Runtime.VM.KubeName)
-
-	// Wait for the pod to be running
-	err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) {
-		pods, err := s.Runtime.Cluster.Kube.Typed.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{
-			LabelSelector: "name=nvidia-device-plugin-ds",
-			FieldSelector: fmt.Sprintf("spec.nodeName=%s", s.Runtime.VM.KubeName),
-		})
-		if err != nil {
-			return false, err
-		}
 
-		if len(pods.Items) == 0 {
-			s.T.Logf("No NVIDIA device plugin pod found yet on node %s", s.Runtime.VM.KubeName)
-			return false, nil
-		}
-
-		pod := &pods.Items[0]
-		s.T.Logf("NVIDIA device plugin pod %s is in phase %s", pod.Name, pod.Status.Phase)
-
-		if pod.Status.Phase == corev1.PodRunning {
-			// Check if all containers are ready
-			for _, containerStatus := range pod.Status.ContainerStatuses {
-				if !containerStatus.Ready {
-					s.T.Logf("Container %s is not ready yet", containerStatus.Name)
-					return false, nil
-				}
-			}
-			return true, nil
-		}
-
-		return false, nil
-	})
+	dsName := fmt.Sprintf("nvidia-device-plugin-%s", s.Runtime.VM.KubeName)
+	s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", s.Runtime.VM.KubeName)
 
+	_, err := s.Runtime.Cluster.Kube.WaitUntilPodRunning(
+		ctx,
+		"kube-system",
+		fmt.Sprintf("name=%s", dsName),
+		fmt.Sprintf("spec.nodeName=%s", s.Runtime.VM.KubeName),
+	)
 	require.NoError(s.T, err, "timed out waiting for NVIDIA device plugin DaemonSet pod to be ready")
+
 	s.T.Logf("NVIDIA device plugin DaemonSet pod is ready")
 }

From 460b260f6a45bc8130c2a14b117b43a950d45820 Mon Sep 17 00:00:00 2001
From: Ganeshkumar Ashokavardhanan <aganeshkumar@microsoft.com>
Date: Thu, 26 Feb 2026 15:13:16 -0800
Subject: [PATCH 4/5] improve test robustness and cleanup

- Extract image version to constant for easier updates
- Add validation that systemd device plugin is not running
- Truncate DaemonSet name to 63 chars (it doubles as a label value, which K8s caps at 63)
- Add timeout contexts to cleanup operations
- Delete existing DaemonSet before create for idempotency
---
 e2e/scenario_gpu_daemonset_test.go | 72 +++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 10 deletions(-)

diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go
index 92dc3506d7f..eb590ff0ad6 100644
--- a/e2e/scenario_gpu_daemonset_test.go
+++ b/e2e/scenario_gpu_daemonset_test.go
@@ -3,7 +3,9 @@ package e2e
 import (
 	"context"
 	"fmt"
+	"strings"
 	"testing"
+	"time"
 
 	"github.com/Azure/agentbaker/e2e/config"
 	"github.com/Azure/agentbaker/pkg/agent/datamodel"
@@ -15,6 +17,14 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
+const (
+	// nvidiaDevicePluginImage is the upstream NVIDIA device plugin image from MCR.
+	// This is intentionally different from components.json which tracks the systemd-packaged version.
+	// This test validates the upstream container-based deployment model.
+	// Update this when a new version is available in MCR.
+	nvidiaDevicePluginImage = "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2"
+)
+
 // Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly
 // with the NVIDIA device plugin deployed as a Kubernetes DaemonSet instead of a systemd service.
 // This is the "upstream" deployment model commonly used by customers who manage their own
@@ -42,6 +52,10 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) {
 				// First, validate that GPU drivers are installed
 				ValidateNvidiaModProbeInstalled(ctx, s)
 
+				// Verify that the systemd-based device plugin is NOT running
+				// (we disabled it via EnableGPUDevicePluginIfNeeded = false)
+				validateNvidiaDevicePluginServiceNotRunning(ctx, s)
+
 				// Deploy the NVIDIA device plugin as a DaemonSet
 				deployNvidiaDevicePluginDaemonset(ctx, s)
 
@@ -60,6 +74,36 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) {
 	})
 }
 
+// validateNvidiaDevicePluginServiceNotRunning verifies that the systemd-based NVIDIA
+// device plugin service is neither running nor starting (we're testing the DaemonSet model).
+func validateNvidiaDevicePluginServiceNotRunning(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+	s.T.Logf("Verifying that nvidia-device-plugin.service is not running...")
+
+	// "is-active" prints the unit state and exits non-zero unless the unit is active; the
+	// "|| echo" fallback then appends a second line, so only the first line is the state.
+	result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null || echo 'not-running'")
+	output, _, _ := strings.Cut(strings.TrimSpace(result.stdout), "\n")
+
+	switch output {
+	case "active", "activating", "reloading":
+		s.T.Fatalf("nvidia-device-plugin.service is unexpectedly running - this test requires the systemd service to be disabled")
+	}
+	s.T.Logf("Confirmed nvidia-device-plugin.service is not active (status: %s)", output)
+}
+
+// nvidiaDevicePluginDaemonsetName returns a unique DaemonSet name for the given node.
+// The name is also used as a pod label value, so it is capped at 63 characters and any
+// trailing '-' or '.' exposed by the cut is dropped (values must end with an alphanumeric).
+func nvidiaDevicePluginDaemonsetName(nodeName string) string {
+	const maxLen = 63
+	name := "nvdp-" + nodeName // short prefix leaves room for the node name
+	if len(name) > maxLen {
+		name = strings.TrimRight(name[:maxLen], "-.")
+	}
+	return name
+}
+
 // nvidiaDevicePluginDaemonset returns the NVIDIA device plugin DaemonSet spec
 // based on the official upstream deployment from:
 // https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml
@@ -67,8 +111,7 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) {
 // The DaemonSet name includes the node name to avoid collisions when multiple
 // GPU tests run against the same shared cluster.
 func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
-	// Use node name in DaemonSet name to avoid collisions in shared cluster
-	dsName := fmt.Sprintf("nvidia-device-plugin-%s", nodeName)
+	dsName := nvidiaDevicePluginDaemonsetName(nodeName)
 
 	return &appsv1.DaemonSet{
 		TypeMeta: metav1.TypeMeta{
@@ -109,12 +152,8 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
 					PriorityClassName: "system-node-critical",
 					Containers: []corev1.Container{
 						{
-							Name: "nvidia-device-plugin-ctr",
-							// Using upstream NVIDIA device plugin image from MCR.
-							// This is intentionally different from components.json which tracks
-							// the systemd-packaged version. This test validates the upstream
-							// container-based deployment model.
-							Image: "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2",
+							Name:  "nvidia-device-plugin-ctr",
+							Image: nvidiaDevicePluginImage,
 							Env: []corev1.EnvVar{
 								{
 									Name:  "FAIL_ON_INIT_ERROR",
@@ -158,6 +197,17 @@ func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) {
 	s.T.Logf("Deploying NVIDIA device plugin as DaemonSet...")
 
 	ds := nvidiaDevicePluginDaemonset(s.Runtime.VM.KubeName)
+
+	// Delete any existing DaemonSet from a previous failed run
+	deleteCtx, deleteCancel := context.WithTimeout(ctx, 30*time.Second)
+	defer deleteCancel()
+	_ = s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete(
+		deleteCtx,
+		ds.Name,
+		metav1.DeleteOptions{},
+	)
+
+	// Create the DaemonSet
 	err := s.Runtime.Cluster.Kube.CreateDaemonset(ctx, ds)
 	require.NoError(s.T, err, "failed to create NVIDIA device plugin DaemonSet")
 
@@ -166,8 +216,10 @@ func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) {
 	// Register cleanup to delete the DaemonSet when the test finishes
 	s.T.Cleanup(func() {
 		s.T.Logf("Cleaning up NVIDIA device plugin DaemonSet %s/%s...", ds.Namespace, ds.Name)
+		cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer cleanupCancel()
 		deleteErr := s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete(
-			context.Background(),
+			cleanupCtx,
 			ds.Name,
 			metav1.DeleteOptions{},
 		)
@@ -182,7 +234,7 @@ func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) {
 func waitForNvidiaDevicePluginDaemonsetReady(ctx context.Context, s *Scenario) {
 	s.T.Helper()
 
-	dsName := fmt.Sprintf("nvidia-device-plugin-%s", s.Runtime.VM.KubeName)
+	dsName := nvidiaDevicePluginDaemonsetName(s.Runtime.VM.KubeName)
 	s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", s.Runtime.VM.KubeName)
 
 	_, err := s.Runtime.Cluster.Kube.WaitUntilPodRunning(

From 2eae10eb6f8348f4eaa665883a90d07d43dc780f Mon Sep 17 00:00:00 2001
From: Ganeshkumar Ashokavardhanan <aganeshkumar@microsoft.com>
Date: Thu, 26 Feb 2026 15:47:06 -0800
Subject: [PATCH 5/5] fix comments to accurately describe GPU device plugin
 behavior

---
 e2e/scenario_gpu_daemonset_test.go | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go
index eb590ff0ad6..2b98e3d0384 100644
--- a/e2e/scenario_gpu_daemonset_test.go
+++ b/e2e/scenario_gpu_daemonset_test.go
@@ -41,7 +41,8 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) {
 			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
 				nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5"
 				nbc.ConfigGPUDriverIfNeeded = true
-				// Disable the systemd-based device plugin - we'll deploy it as a DaemonSet instead
+				// Don't enable the managed GPU experience - we'll deploy the device plugin as a DaemonSet instead.
+				// By not setting EnableManagedGPU=true or the VMSS tag, the systemd-based device plugin won't start.
 				nbc.EnableGPUDevicePluginIfNeeded = false
 				nbc.EnableNvidia = true
 			},
@@ -53,7 +54,7 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) {
 				ValidateNvidiaModProbeInstalled(ctx, s)
 
 				// Verify that the systemd-based device plugin is NOT running
-				// (we disabled it via EnableGPUDevicePluginIfNeeded = false)
+				// (managed GPU experience is not enabled, so the service should not be active)
 				validateNvidiaDevicePluginServiceNotRunning(ctx, s)
 
 				// Deploy the NVIDIA device plugin as a DaemonSet