Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/renovate.json
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,8 @@
},
{
"matchPackageNames": [
"nvidia-device-plugin"
"nvidia-device-plugin",
"oss/v2/nvidia/k8s-device-plugin"
],
Comment on lines 382 to 385
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description says a new Renovate custom manager was added to auto-update container image versions in e2e Go test files (via // renovate: comments), but this renovate.json change only updates package grouping. There is still no customManagers entry that targets e2e/**/*.go or parses // renovate: lines, so Renovate will not update Go test image strings as described. Either add the intended custom manager configuration or update the PR description to match the actual approach (version coming from components.json).

Copilot uses AI. Check for mistakes.
"groupName": "nvidia-device-plugin",
"assignees": [
Expand Down
55 changes: 55 additions & 0 deletions e2e/components/components.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,58 @@ func RemoveLeadingV(version string) string {
}
return version
}

// GetGPUContainerImage returns the full container image URL for a GPU container image
// by looking up the downloadURL pattern and gpuVersion.latestVersion from components.json.
// The downloadURL pattern contains a wildcard (*) that gets replaced with the version.
// An empty string is returned if components.json cannot be read or no entry matches.
func GetGPUContainerImage(downloadURLPattern string) string {
	// Resolve components.json relative to this source file: e2e/components/ -> repo root.
	_, thisFile, _, _ := runtime.Caller(0)
	repoRoot := filepath.Dir(filepath.Dir(filepath.Dir(thisFile)))
	componentsJSON := filepath.Join(repoRoot, "parts", "common", "components.json")

	data, err := os.ReadFile(componentsJSON)
	if err != nil {
		// Missing or unreadable components.json: signal "not found" with an empty string.
		return ""
	}

	for _, entry := range gjson.GetBytes(data, "GPUContainerImages").Array() {
		pattern := entry.Get("downloadURL").String()
		if !strings.EqualFold(pattern, downloadURLPattern) {
			continue
		}
		// Substitute the pinned version into the wildcard slot of the matched entry.
		if version := entry.Get("gpuVersion.latestVersion").String(); version != "" {
			return strings.Replace(pattern, "*", version, 1)
		}
	}
	return ""
}

// GetE2EContainerImage returns the full container image URL for an e2e test container image
// by looking up the name and version.latestVersion from components.json E2EContainerImages section.
// The downloadURL pattern contains a wildcard (*) that gets replaced with the version.
func GetE2EContainerImage(name string) string {
// Get the project root dynamically
_, filename, _, _ := runtime.Caller(0)
projectRoot := filepath.Dir(filepath.Dir(filepath.Dir(filename))) // Go up 3 levels from e2e/components/
componentsPath := filepath.Join(projectRoot, "parts", "common", "components.json")

jsonBytes, err := os.ReadFile(componentsPath)
if err != nil {
return ""
}
Comment on lines +174 to +177
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GetE2EContainerImage returns an empty string if reading/parsing components.json fails or the image entry is missing. In practice this causes later Kubernetes errors like “invalid image name” and hides the real root cause. Consider returning (string, error) (or failing fast) and have callers require.NoError/require.NotEmpty so test failures point to the missing/misconfigured components.json entry.

Copilot uses AI. Check for mistakes.

e2eImages := gjson.GetBytes(jsonBytes, "E2EContainerImages")
for _, e2eImage := range e2eImages.Array() {
imageName := e2eImage.Get("name").String()
if strings.EqualFold(imageName, name) {
downloadURL := e2eImage.Get("downloadURL").String()
version := e2eImage.Get("version.latestVersion").String()
if version != "" {
return strings.Replace(downloadURL, "*", version, 1)
}
}
}
return ""
}
201 changes: 201 additions & 0 deletions e2e/scenario_gpu_daemonset_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
package e2e

import (
"context"
"fmt"
"strings"
"time"

"github.com/Azure/agentbaker/e2e/components"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
// nvidiaDevicePluginImageName is the name of the NVIDIA device plugin container image
// in components.json E2EContainerImages section. The version is managed via Renovate.
nvidiaDevicePluginImageName = "nvidia-k8s-device-plugin"
)

// getNvidiaDevicePluginImage returns the full container image URL for the NVIDIA device plugin
// by reading the version from components.json E2EContainerImages section.
// NOTE(review): GetE2EContainerImage returns "" when the entry is missing or
// components.json is unreadable — callers should validate the result is non-empty.
func getNvidiaDevicePluginImage() string {
	return components.GetE2EContainerImage(nvidiaDevicePluginImageName)
}

// validateNvidiaDevicePluginServiceNotRunning verifies that the systemd-based
// NVIDIA device plugin service is not running (since we're testing the DaemonSet model).
func validateNvidiaDevicePluginServiceNotRunning(ctx context.Context, s *Scenario) {
s.T.Helper()
s.T.Logf("Verifying that nvidia-device-plugin.service is not running...")

// Check if the service exists and is inactive
// Using "is-active" which returns non-zero if not active
result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null || echo 'not-running'")
output := strings.TrimSpace(result.stdout)

// The service should either not exist or be inactive
if output == "active" {
s.T.Fatalf("nvidia-device-plugin.service is unexpectedly running - this test requires the systemd service to be disabled")
Comment on lines +35 to +42
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The systemctl is-active ... || echo 'not-running' pattern discards both the real service state (e.g., "inactive", "failed", "activating") and the exit code, so the test can incorrectly pass even if the service is transitioning/running (e.g., "activating"). Capture the actual systemctl is-active output/exit status and fail on any running-like state (active/activating), or use execScriptOnVMForScenarioValidateExitCode with the expected non-active exit codes.

Suggested change
// Check if the service exists and is inactive
// Using "is-active" which returns non-zero if not active
result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null || echo 'not-running'")
output := strings.TrimSpace(result.stdout)
// The service should either not exist or be inactive
if output == "active" {
s.T.Fatalf("nvidia-device-plugin.service is unexpectedly running - this test requires the systemd service to be disabled")
// Check the current service state using "is-active".
// This will return "active", "inactive", "failed", "activating", "unknown", etc.
result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null")
output := strings.TrimSpace(result.stdout)
// The service should either not exist or be in a non-running state.
// Treat both "active" and "activating" as failures, since the service
// must not be running when validating the DaemonSet-based deployment.
if output == "active" || output == "activating" {
s.T.Fatalf("nvidia-device-plugin.service is unexpectedly %s - this test requires the systemd service to be disabled", output)

Copilot uses AI. Check for mistakes.
}
s.T.Logf("Confirmed nvidia-device-plugin.service is not active (status: %s)", output)
}

// nvidiaDevicePluginDaemonsetName returns a unique DaemonSet name for the given node.
// The name is truncated to fit within Kubernetes' 63-character limit for resource
// names and label values, and any trailing '-' left over from truncation is trimmed
// so the result remains a valid DNS-1123 label (must end with an alphanumeric).
func nvidiaDevicePluginDaemonsetName(nodeName string) string {
	prefix := "nvdp-" // Short prefix to leave room for node name
	maxLen := 63
	name := prefix + nodeName
	if len(name) > maxLen {
		name = name[:maxLen]
	}
	// Truncation can cut in the middle of the node name and leave a trailing
	// '-', which Kubernetes rejects for object names and label values.
	return strings.TrimRight(name, "-")
}

// nvidiaDevicePluginDaemonset returns the NVIDIA device plugin DaemonSet spec
// based on the official upstream deployment from:
// https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml
//
// The DaemonSet name includes the node name to avoid collisions when multiple
// GPU tests run against the same shared cluster.
func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
dsName := nvidiaDevicePluginDaemonsetName(nodeName)

return &appsv1.DaemonSet{
TypeMeta: metav1.TypeMeta{
Kind: "DaemonSet",
APIVersion: "apps/v1",
},
ObjectMeta: metav1.ObjectMeta{
Name: dsName,
Namespace: "kube-system",
},
Spec: appsv1.DaemonSetSpec{
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"name": dsName,
},
},
UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
Type: appsv1.RollingUpdateDaemonSetStrategyType,
},
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"name": dsName,
},
},
Spec: corev1.PodSpec{
// Target only our specific test node
NodeSelector: map[string]string{
"kubernetes.io/hostname": nodeName,
},
Tolerations: []corev1.Toleration{
{
Key: "nvidia.com/gpu",
Operator: corev1.TolerationOpExists,
Effect: corev1.TaintEffectNoSchedule,
},
},
PriorityClassName: "system-node-critical",
Containers: []corev1.Container{
{
Name: "nvidia-device-plugin-ctr",
Image: getNvidiaDevicePluginImage(),
Comment on lines +107 to +108
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The DaemonSet uses Image: getNvidiaDevicePluginImage() without asserting it’s non-empty/valid. If components.json lookup fails, the pod will be created with an empty image and the test will fail with a confusing scheduling/pull error. Add an early require.NotEmpty (or propagate an error from GetE2EContainerImage) so failures clearly indicate a missing/misconfigured E2EContainerImages entry.

Suggested change
Name: "nvidia-device-plugin-ctr",
Image: getNvidiaDevicePluginImage(),
Name: "nvidia-device-plugin-ctr",
Image: func() string {
img := getNvidiaDevicePluginImage()
require.NotEmpty(t, img, "nvidia device plugin image must be configured in E2EContainerImages (components.json entry %q)", nvidiaDevicePluginImageName)
return img
}(),

Copilot uses AI. Check for mistakes.
Env: []corev1.EnvVar{
{
Name: "FAIL_ON_INIT_ERROR",
Value: "false",
},
},
SecurityContext: &corev1.SecurityContext{
// Privileged mode is required for the device plugin to access
// GPU devices and register with kubelet's device plugin framework.
// This matches the upstream NVIDIA device plugin deployment spec.
Privileged: to.Ptr(true),
},
VolumeMounts: []corev1.VolumeMount{
{
Name: "device-plugin",
MountPath: "/var/lib/kubelet/device-plugins",
},
},
},
},
Volumes: []corev1.Volume{
{
Name: "device-plugin",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/var/lib/kubelet/device-plugins",
},
},
},
},
},
},
},
}
}

// deployNvidiaDevicePluginDaemonset creates the NVIDIA device plugin DaemonSet in the cluster
// and registers cleanup to delete it when the test finishes.
func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) {
s.T.Helper()
s.T.Logf("Deploying NVIDIA device plugin as DaemonSet...")

ds := nvidiaDevicePluginDaemonset(s.Runtime.VM.KubeName)

// Delete any existing DaemonSet from a previous failed run
deleteCtx, deleteCancel := context.WithTimeout(ctx, 30*time.Second)
defer deleteCancel()
_ = s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete(
deleteCtx,
ds.Name,
metav1.DeleteOptions{},
)
Comment on lines +156 to +160
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test deletes an existing DaemonSet and then recreates it without waiting for deletion to finish. If a prior run left it terminating, the create can fail with AlreadyExists / "object is being deleted" and make the test flaky. Consider polling until a GET returns NotFound (or using an update-based approach) before creating.

Copilot uses AI. Check for mistakes.

// Create the DaemonSet
err := s.Runtime.Cluster.Kube.CreateDaemonset(ctx, ds)
require.NoError(s.T, err, "failed to create NVIDIA device plugin DaemonSet")

s.T.Logf("NVIDIA device plugin DaemonSet %s/%s created successfully", ds.Namespace, ds.Name)

// Register cleanup to delete the DaemonSet when the test finishes
s.T.Cleanup(func() {
s.T.Logf("Cleaning up NVIDIA device plugin DaemonSet %s/%s...", ds.Namespace, ds.Name)
cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cleanupCancel()
deleteErr := s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete(
cleanupCtx,
ds.Name,
metav1.DeleteOptions{},
)
if deleteErr != nil {
s.T.Logf("Failed to delete NVIDIA device plugin DaemonSet %s/%s: %v", ds.Namespace, ds.Name, deleteErr)
}
})
}

// waitForNvidiaDevicePluginDaemonsetReady blocks until the NVIDIA device plugin pod
// scheduled on the test node is running. It delegates to the existing
// WaitUntilPodRunning helper, which handles CrashLoopBackOff and other failure states.
func waitForNvidiaDevicePluginDaemonsetReady(ctx context.Context, s *Scenario) {
	s.T.Helper()

	node := s.Runtime.VM.KubeName
	labelSelector := fmt.Sprintf("name=%s", nvidiaDevicePluginDaemonsetName(node))
	fieldSelector := fmt.Sprintf("spec.nodeName=%s", node)

	s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", node)
	_, err := s.Runtime.Cluster.Kube.WaitUntilPodRunning(ctx, "kube-system", labelSelector, fieldSelector)
	require.NoError(s.T, err, "timed out waiting for NVIDIA device plugin DaemonSet pod to be ready")

	s.T.Logf("NVIDIA device plugin DaemonSet pod is ready")
}
60 changes: 46 additions & 14 deletions e2e/scenario_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -982,49 +982,83 @@ func Test_Ubuntu2204_CustomSysctls_Scriptless(t *testing.T) {
}

// Test_Ubuntu2204_GPUNC validates that a Standard_NC6s_v3 GPU node bootstraps
// correctly and that the NVIDIA device plugin works when deployed as a
// DaemonSet (the upstream deployment model) rather than via the systemd service.
func Test_Ubuntu2204_GPUNC(t *testing.T) {
	RunScenario(t, &Scenario{
		Description: "Tests that a GPU-enabled node with Standard_NC6s_v3 can bootstrap and run NVIDIA device plugin as a DaemonSet",
		Tags: Tags{
			GPU: true,
		},
		Config: Config{
			Cluster: ClusterKubenet,
			VHD:     config.VHDUbuntu2204Gen2Containerd,
			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
				nbc.AgentPoolProfile.VMSize = "Standard_NC6s_v3"
				nbc.ConfigGPUDriverIfNeeded = true
				// Keep the systemd-managed device plugin disabled: this scenario
				// exercises the DaemonSet deployment model instead.
				nbc.EnableGPUDevicePluginIfNeeded = false
				nbc.EnableNvidia = true
			},
			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
				vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")
			},
			Validator: func(ctx context.Context, s *Scenario) {
				ValidateNvidiaModProbeInstalled(ctx, s)
				ValidateKubeletHasNotStopped(ctx, s)
				ValidateServicesDoNotRestartKubelet(ctx, s)

				// Validate that the NVIDIA device plugin can be deployed as a DaemonSet (upstream model)
				validateNvidiaDevicePluginServiceNotRunning(ctx, s)
				deployNvidiaDevicePluginDaemonset(ctx, s)
				waitForNvidiaDevicePluginDaemonsetReady(ctx, s)
				ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")
				ValidateGPUWorkloadSchedulable(ctx, s, 1)
			},
		},
	})
}

// Test_Ubuntu2204_GPUA100 runs the generic Ubuntu 22.04 GPU bootstrap scenario
// on an A100 VM size (Standard_NC24ads_A100_v4).
func Test_Ubuntu2204_GPUA100(t *testing.T) {
	runScenarioUbuntu2204GPU(t, "Standard_NC24ads_A100_v4")
}

// Test_Ubuntu2204_GPUA10 runs the Ubuntu 22.04 GRID GPU scenario on an A10
// VM size (Standard_NV6ads_A10_v5).
func Test_Ubuntu2204_GPUA10(t *testing.T) {
	runScenarioUbuntuGRID(t, "Standard_NV6ads_A10_v5")
}

// Returns config for the 'gpu' E2E scenario
func runScenarioUbuntu2204GPU(t *testing.T, vmSize string) {
RunScenario(t, &Scenario{
Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an Ubuntu 2204 VHD can be properly bootstrapped", vmSize),
Description: "Tests that a GPU-enabled node with Standard_NV6ads_A10_v5 can bootstrap with GRID license and run NVIDIA device plugin as a DaemonSet",
Tags: Tags{
GPU: true,
},
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDUbuntu2204Gen2Containerd,
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.AgentPoolProfile.VMSize = vmSize
nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5"
nbc.ConfigGPUDriverIfNeeded = true
nbc.EnableGPUDevicePluginIfNeeded = false
nbc.EnableNvidia = true
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr(vmSize)
vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5")
},
Validator: func(ctx context.Context, s *Scenario) {
// Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable
ValidateNvidiaModProbeInstalled(ctx, s)
ValidateNvidiaGRIDLicenseValid(ctx, s)
ValidateKubeletHasNotStopped(ctx, s)
ValidateServicesDoNotRestartKubelet(ctx, s)
ValidateNvidiaPersistencedRunning(ctx, s)

// Validate that the NVIDIA device plugin can be deployed as a DaemonSet (upstream model)
validateNvidiaDevicePluginServiceNotRunning(ctx, s)
deployNvidiaDevicePluginDaemonset(ctx, s)
waitForNvidiaDevicePluginDaemonsetReady(ctx, s)
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")
ValidateGPUWorkloadSchedulable(ctx, s, 1)
},
},
})
}

func runScenarioUbuntuGRID(t *testing.T, vmSize string) {
// Returns config for the 'gpu' E2E scenario
func runScenarioUbuntu2204GPU(t *testing.T, vmSize string) {
RunScenario(t, &Scenario{
Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an Ubuntu 2204 VHD can be properly bootstrapped, and that the GRID license is valid", vmSize),
Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an Ubuntu 2204 VHD can be properly bootstrapped", vmSize),
Tags: Tags{
GPU: true,
},
Expand All @@ -1043,10 +1077,8 @@ func runScenarioUbuntuGRID(t *testing.T, vmSize string) {
Validator: func(ctx context.Context, s *Scenario) {
// Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable
ValidateNvidiaModProbeInstalled(ctx, s)
ValidateNvidiaGRIDLicenseValid(ctx, s)
ValidateKubeletHasNotStopped(ctx, s)
ValidateServicesDoNotRestartKubelet(ctx, s)
ValidateNvidiaPersistencedRunning(ctx, s)
},
},
})
Expand Down
10 changes: 10 additions & 0 deletions parts/common/components.json
Original file line number Diff line number Diff line change
Expand Up @@ -2070,5 +2070,15 @@
}
]
}
],
"E2EContainerImages": [
{
"name": "nvidia-k8s-device-plugin",
"downloadURL": "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:*",
"version": {
"renovateTag": "registry=https://mcr.microsoft.com, name=oss/v2/nvidia/k8s-device-plugin",
"latestVersion": "v0.18.2-1"
}
Comment on lines +2078 to +2081
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description says the test uses mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2, but components.json pins latestVersion to v0.18.2-1. Please reconcile the version/tag format (either update the description or adjust latestVersion) to avoid confusion and to ensure the referenced tag is the one actually being tested.

Copilot uses AI. Check for mistakes.
}
]
}
Loading
Loading