From 33fbea55d1cf0ed9af684a2d0fb1ef3d074ae4fe Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Fri, 27 Feb 2026 10:51:55 -0800 Subject: [PATCH 1/4] add e2e test for NVIDIA device plugin DaemonSet deployment Add a new e2e test that validates the NVIDIA device plugin works when deployed as a Kubernetes DaemonSet instead of the systemd service. This tests the "upstream" deployment model used by customers who manage their own device plugin deployment. Also add Renovate configuration to auto-update the container image version in e2e test files: - Add custom manager regex pattern for e2e Go files - Add package name for MCR device plugin image to nvidia-device-plugin group --- .github/renovate.json | 15 +- e2e/scenario_gpu_daemonset_test.go | 250 +++++++++++++++++++++++++++++ 2 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 e2e/scenario_gpu_daemonset_test.go diff --git a/.github/renovate.json b/.github/renovate.json index 339c586ad4d..0ab08ee9c0f 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -380,7 +380,8 @@ }, { "matchPackageNames": [ - "nvidia-device-plugin" + "nvidia-device-plugin", + "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin" ], "groupName": "nvidia-device-plugin", "assignees": [ @@ -682,6 +683,18 @@ "matchStrings": [ "#\\s*renovate:\\s*(datasource=(?.*?) )?depName=(?.*?)( versioning=(?.*?))?\\s*.*?version.*\\\"(?.*)\\\"" ] + }, + { + "customType": "regex", + "description": "update container image versions in e2e Go test files", + "managerFilePatterns": [ + "/e2e/.*\\.go/" + ], + "matchStringsStrategy": "any", + "matchStrings": [ + "//\\s*renovate:\\s*datasource=(?\\S+)\\s+depName=(?\\S+)\\s*\\n\\s*\\S+\\s*=\\s*\"(?[^:]+):(?[^\"]+)\"" + ], + "datasourceTemplate": "docker" } ], "customDatasources": { diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go new file mode 100644 index 00000000000..eda5cddbd1c --- /dev/null +++ b/e2e/scenario_gpu_daemonset_test.go @@ -0,0 +1,250 @@ +package e2e + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + "github.com/Azure/agentbaker/e2e/config" + "github.com/Azure/agentbaker/pkg/agent/datamodel" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + // nvidiaDevicePluginImage is the upstream NVIDIA device plugin image from MCR. + // This is intentionally different from components.json which tracks the systemd-packaged version. + // This test validates the upstream container-based deployment model. + // renovate: datasource=docker depName=mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin + nvidiaDevicePluginImage = "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2" +) + +// Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly +// with the NVIDIA device plugin deployed as a Kubernetes DaemonSet instead of a systemd service. +// This is the "upstream" deployment model commonly used by customers who manage their own +// NVIDIA device plugin deployment. +func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that NVIDIA device plugin works when deployed as a DaemonSet (not systemd service)", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5" + nbc.ConfigGPUDriverIfNeeded = true + // Don't enable the managed GPU experience - we'll deploy the device plugin as a DaemonSet instead. + // By not setting EnableManagedGPU=true or the VMSS tag, the systemd-based device plugin won't start. + nbc.EnableGPUDevicePluginIfNeeded = false + nbc.EnableNvidia = true + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5") + }, + Validator: func(ctx context.Context, s *Scenario) { + // First, validate that GPU drivers are installed + ValidateNvidiaModProbeInstalled(ctx, s) + + // Verify that the systemd-based device plugin is NOT running + // (managed GPU experience is not enabled, so the service should not be active) + validateNvidiaDevicePluginServiceNotRunning(ctx, s) + + // Deploy the NVIDIA device plugin as a DaemonSet + deployNvidiaDevicePluginDaemonset(ctx, s) + + // Wait for the DaemonSet pod to be running on our node + waitForNvidiaDevicePluginDaemonsetReady(ctx, s) + + // Validate that GPU resources are advertised by the device plugin + ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu") + + // Validate that GPU workloads can be scheduled + ValidateGPUWorkloadSchedulable(ctx, s, 1) + + s.T.Logf("NVIDIA device plugin DaemonSet is functioning correctly") + }, + }, + }) +} + +// validateNvidiaDevicePluginServiceNotRunning verifies that the systemd-based +// NVIDIA device plugin service is not running (since we're testing the DaemonSet model). +func validateNvidiaDevicePluginServiceNotRunning(ctx context.Context, s *Scenario) { + s.T.Helper() + s.T.Logf("Verifying that nvidia-device-plugin.service is not running...") + + // Check if the service exists and is inactive + // Using "is-active" which returns non-zero if not active + result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null || echo 'not-running'") + output := strings.TrimSpace(result.stdout) + + // The service should either not exist or be inactive + if output == "active" { + s.T.Fatalf("nvidia-device-plugin.service is unexpectedly running - this test requires the systemd service to be disabled") + } + s.T.Logf("Confirmed nvidia-device-plugin.service is not active (status: %s)", output) +} + +// nvidiaDevicePluginDaemonsetName returns a unique DaemonSet name for the given node. +// The name is truncated to fit within Kubernetes' 63-character limit for resource names. +func nvidiaDevicePluginDaemonsetName(nodeName string) string { + prefix := "nvdp-" // Short prefix to leave room for node name + maxLen := 63 + name := prefix + nodeName + if len(name) > maxLen { + name = name[:maxLen] + } + return name +} + +// nvidiaDevicePluginDaemonset returns the NVIDIA device plugin DaemonSet spec +// based on the official upstream deployment from: +// https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml +// +// The DaemonSet name includes the node name to avoid collisions when multiple +// GPU tests run against the same shared cluster. +func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet { + dsName := nvidiaDevicePluginDaemonsetName(nodeName) + + return &appsv1.DaemonSet{ + TypeMeta: metav1.TypeMeta{ + Kind: "DaemonSet", + APIVersion: "apps/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: dsName, + Namespace: "kube-system", + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "name": dsName, + }, + }, + UpdateStrategy: appsv1.DaemonSetUpdateStrategy{ + Type: appsv1.RollingUpdateDaemonSetStrategyType, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "name": dsName, + }, + }, + Spec: corev1.PodSpec{ + // Target only our specific test node + NodeSelector: map[string]string{ + "kubernetes.io/hostname": nodeName, + }, + Tolerations: []corev1.Toleration{ + { + Key: "nvidia.com/gpu", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }, + }, + PriorityClassName: "system-node-critical", + Containers: []corev1.Container{ + { + Name: "nvidia-device-plugin-ctr", + Image: nvidiaDevicePluginImage, + Env: []corev1.EnvVar{ + { + Name: "FAIL_ON_INIT_ERROR", + Value: "false", + }, + }, + SecurityContext: &corev1.SecurityContext{ + // Privileged mode is required for the device plugin to access + // GPU devices and register with kubelet's device plugin framework. + // This matches the upstream NVIDIA device plugin deployment spec. + Privileged: to.Ptr(true), + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "device-plugin", + MountPath: "/var/lib/kubelet/device-plugins", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "device-plugin", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/var/lib/kubelet/device-plugins", + }, + }, + }, + }, + }, + }, + }, + } +} + +// deployNvidiaDevicePluginDaemonset creates the NVIDIA device plugin DaemonSet in the cluster +// and registers cleanup to delete it when the test finishes. +func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) { + s.T.Helper() + s.T.Logf("Deploying NVIDIA device plugin as DaemonSet...") + + ds := nvidiaDevicePluginDaemonset(s.Runtime.VM.KubeName) + + // Delete any existing DaemonSet from a previous failed run + deleteCtx, deleteCancel := context.WithTimeout(ctx, 30*time.Second) + defer deleteCancel() + _ = s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete( + deleteCtx, + ds.Name, + metav1.DeleteOptions{}, + ) + + // Create the DaemonSet + err := s.Runtime.Cluster.Kube.CreateDaemonset(ctx, ds) + require.NoError(s.T, err, "failed to create NVIDIA device plugin DaemonSet") + + s.T.Logf("NVIDIA device plugin DaemonSet %s/%s created successfully", ds.Namespace, ds.Name) + + // Register cleanup to delete the DaemonSet when the test finishes + s.T.Cleanup(func() { + s.T.Logf("Cleaning up NVIDIA device plugin DaemonSet %s/%s...", ds.Namespace, ds.Name) + cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cleanupCancel() + deleteErr := s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete( + cleanupCtx, + ds.Name, + metav1.DeleteOptions{}, + ) + if deleteErr != nil { + s.T.Logf("Failed to delete NVIDIA device plugin DaemonSet %s/%s: %v", ds.Namespace, ds.Name, deleteErr) + } + }) +} + +// waitForNvidiaDevicePluginDaemonsetReady waits for the NVIDIA device plugin pod to be running on the test node. +// Uses the existing WaitUntilPodRunning helper which handles CrashLoopBackOff and other failure states. +func waitForNvidiaDevicePluginDaemonsetReady(ctx context.Context, s *Scenario) { + s.T.Helper() + + dsName := nvidiaDevicePluginDaemonsetName(s.Runtime.VM.KubeName) + s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", s.Runtime.VM.KubeName) + + _, err := s.Runtime.Cluster.Kube.WaitUntilPodRunning( + ctx, + "kube-system", + fmt.Sprintf("name=%s", dsName), + fmt.Sprintf("spec.nodeName=%s", s.Runtime.VM.KubeName), + ) + require.NoError(s.T, err, "timed out waiting for NVIDIA device plugin DaemonSet pod to be ready") + + s.T.Logf("NVIDIA device plugin DaemonSet pod is ready") +} From 44195e4419f608b06b0c5d08ffbfdd62c7d56aa0 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Fri, 27 Feb 2026 11:00:11 -0800 Subject: [PATCH 2/4] move nvidia device plugin image version to components.json - Add nvidia k8s-device-plugin container image to GPUContainerImages section in components.json for Renovate auto-updates - Add GetGPUContainerImage helper in e2e/components to read GPU container image versions from components.json - Update e2e test to read version from components.json instead of hardcoding - Update Renovate package rule to match the new package name pattern --- .github/renovate.json | 14 +------------- e2e/components/components.go | 27 +++++++++++++++++++++++++++ e2e/scenario_gpu_daemonset_test.go | 17 +++++++++++------ parts/common/components.json | 7 +++++++ 4 files changed, 46 insertions(+), 19 deletions(-) diff --git a/.github/renovate.json b/.github/renovate.json index 0ab08ee9c0f..2087b0fc42a 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -381,7 +381,7 @@ { "matchPackageNames": [ "nvidia-device-plugin", - "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin" + "oss/v2/nvidia/k8s-device-plugin" ], "groupName": "nvidia-device-plugin", "assignees": [ @@ -683,18 +683,6 @@ "matchStrings": [ "#\\s*renovate:\\s*(datasource=(?.*?) )?depName=(?.*?)( versioning=(?.*?))?\\s*.*?version.*\\\"(?.*)\\\"" ] - }, - { - "customType": "regex", - "description": "update container image versions in e2e Go test files", - "managerFilePatterns": [ - "/e2e/.*\\.go/" - ], - "matchStringsStrategy": "any", - "matchStrings": [ - "//\\s*renovate:\\s*datasource=(?\\S+)\\s+depName=(?\\S+)\\s*\\n\\s*\\S+\\s*=\\s*\"(?[^:]+):(?[^\"]+)\"" - ], - "datasourceTemplate": "docker" } ], "customDatasources": { diff --git a/e2e/components/components.go b/e2e/components/components.go index fefeb136566..1096dc603b4 100644 --- a/e2e/components/components.go +++ b/e2e/components/components.go @@ -134,3 +134,30 @@ func RemoveLeadingV(version string) string { } return version } + +// GetGPUContainerImage returns the full container image URL for a GPU container image +// by looking up the downloadURL pattern and gpuVersion.latestVersion from components.json. +// The downloadURL pattern contains a wildcard (*) that gets replaced with the version. +func GetGPUContainerImage(downloadURLPattern string) string { + // Get the project root dynamically + _, filename, _, _ := runtime.Caller(0) + projectRoot := filepath.Dir(filepath.Dir(filepath.Dir(filename))) // Go up 3 levels from e2e/components/ + componentsPath := filepath.Join(projectRoot, "parts", "common", "components.json") + + jsonBytes, err := os.ReadFile(componentsPath) + if err != nil { + return "" + } + + gpuImages := gjson.GetBytes(jsonBytes, "GPUContainerImages") + for _, gpuImage := range gpuImages.Array() { + downloadURL := gpuImage.Get("downloadURL").String() + if strings.EqualFold(downloadURL, downloadURLPattern) { + version := gpuImage.Get("gpuVersion.latestVersion").String() + if version != "" { + return strings.Replace(downloadURL, "*", version, 1) + } + } + } + return "" +} diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go index eda5cddbd1c..defec7330f6 100644 --- a/e2e/scenario_gpu_daemonset_test.go +++ b/e2e/scenario_gpu_daemonset_test.go @@ -7,6 +7,7 @@ import ( "testing" "time" + "github.com/Azure/agentbaker/e2e/components" "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" @@ -18,13 +19,17 @@ import ( ) const ( - // nvidiaDevicePluginImage is the upstream NVIDIA device plugin image from MCR. - // This is intentionally different from components.json which tracks the systemd-packaged version. - // This test validates the upstream container-based deployment model. - // renovate: datasource=docker depName=mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin - nvidiaDevicePluginImage = "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2" + // nvidiaDevicePluginDownloadURL is the download URL pattern for the NVIDIA device plugin + // container image in components.json. The version is managed via Renovate. + nvidiaDevicePluginDownloadURL = "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:*" ) +// getNvidiaDevicePluginImage returns the full container image URL for the NVIDIA device plugin +// by reading the version from components.json GPUContainerImages section. +func getNvidiaDevicePluginImage() string { + return components.GetGPUContainerImage(nvidiaDevicePluginDownloadURL) +} + // Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly // with the NVIDIA device plugin deployed as a Kubernetes DaemonSet instead of a systemd service. // This is the "upstream" deployment model commonly used by customers who manage their own @@ -154,7 +159,7 @@ func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet { Containers: []corev1.Container{ { Name: "nvidia-device-plugin-ctr", - Image: nvidiaDevicePluginImage, + Image: getNvidiaDevicePluginImage(), Env: []corev1.EnvVar{ { Name: "FAIL_ON_INIT_ERROR", diff --git a/parts/common/components.json b/parts/common/components.json index d289a396b85..1e7522f9a41 100644 --- a/parts/common/components.json +++ b/parts/common/components.json @@ -816,6 +816,13 @@ "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-grid", "latestVersion": "550.144.06-20260126030228" } + }, + { + "downloadURL": "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:*", + "gpuVersion": { + "renovateTag": "registry=https://mcr.microsoft.com, name=oss/v2/nvidia/k8s-device-plugin", + "latestVersion": "v0.18.2-1" + } } ], "Packages": [ From d7f1abf1bf23aacf86f2e08d585c9a1cf46160d8 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Fri, 27 Feb 2026 11:10:45 -0800 Subject: [PATCH 3/4] use E2EContainerImages section to avoid VHD caching - Add new E2EContainerImages section in components.json for container images used only in e2e tests (not cached on VHD) - Add schema definition for E2EContainerImages in components.cue - Add GetE2EContainerImage helper to read from E2EContainerImages - Move nvidia k8s-device-plugin from GPUContainerImages to E2EContainerImages to prevent VHD caching while enabling Renovate auto-updates --- e2e/components/components.go | 28 ++++++++++++++++++++++++++++ e2e/scenario_gpu_daemonset_test.go | 10 +++++----- parts/common/components.json | 17 ++++++++++------- schemas/components.cue | 8 ++++++++ 4 files changed, 51 insertions(+), 12 deletions(-) diff --git a/e2e/components/components.go b/e2e/components/components.go index 1096dc603b4..23580c52acf 100644 --- a/e2e/components/components.go +++ b/e2e/components/components.go @@ -161,3 +161,31 @@ func GetGPUContainerImage(downloadURLPattern string) string { } return "" } + +// GetE2EContainerImage returns the full container image URL for an e2e test container image +// by looking up the name and version.latestVersion from components.json E2EContainerImages section. +// The downloadURL pattern contains a wildcard (*) that gets replaced with the version. +func GetE2EContainerImage(name string) string { + // Get the project root dynamically + _, filename, _, _ := runtime.Caller(0) + projectRoot := filepath.Dir(filepath.Dir(filepath.Dir(filename))) // Go up 3 levels from e2e/components/ + componentsPath := filepath.Join(projectRoot, "parts", "common", "components.json") + + jsonBytes, err := os.ReadFile(componentsPath) + if err != nil { + return "" + } + + e2eImages := gjson.GetBytes(jsonBytes, "E2EContainerImages") + for _, e2eImage := range e2eImages.Array() { + imageName := e2eImage.Get("name").String() + if strings.EqualFold(imageName, name) { + downloadURL := e2eImage.Get("downloadURL").String() + version := e2eImage.Get("version.latestVersion").String() + if version != "" { + return strings.Replace(downloadURL, "*", version, 1) + } + } + } + return "" +} diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go index defec7330f6..2cc50f6ebaf 100644 --- a/e2e/scenario_gpu_daemonset_test.go +++ b/e2e/scenario_gpu_daemonset_test.go @@ -19,15 +19,15 @@ import ( ) const ( - // nvidiaDevicePluginDownloadURL is the download URL pattern for the NVIDIA device plugin - // container image in components.json. The version is managed via Renovate. - nvidiaDevicePluginDownloadURL = "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:*" + // nvidiaDevicePluginImageName is the name of the NVIDIA device plugin container image + // in components.json E2EContainerImages section. The version is managed via Renovate. + nvidiaDevicePluginImageName = "nvidia-k8s-device-plugin" ) // getNvidiaDevicePluginImage returns the full container image URL for the NVIDIA device plugin -// by reading the version from components.json GPUContainerImages section. +// by reading the version from components.json E2EContainerImages section. func getNvidiaDevicePluginImage() string { - return components.GetGPUContainerImage(nvidiaDevicePluginDownloadURL) + return components.GetE2EContainerImage(nvidiaDevicePluginImageName) } // Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly diff --git a/parts/common/components.json b/parts/common/components.json index 1e7522f9a41..be7245b98f0 100644 --- a/parts/common/components.json +++ b/parts/common/components.json @@ -816,13 +816,6 @@ "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-grid", "latestVersion": "550.144.06-20260126030228" } - }, - { - "downloadURL": "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:*", - "gpuVersion": { - "renovateTag": "registry=https://mcr.microsoft.com, name=oss/v2/nvidia/k8s-device-plugin", - "latestVersion": "v0.18.2-1" - } } ], "Packages": [ @@ -2077,5 +2070,15 @@ } ] } + ], + "E2EContainerImages": [ + { + "name": "nvidia-k8s-device-plugin", + "downloadURL": "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:*", + "version": { + "renovateTag": "registry=https://mcr.microsoft.com, name=oss/v2/nvidia/k8s-device-plugin", + "latestVersion": "v0.18.2-1" + } + } ] } diff --git a/schemas/components.cue b/schemas/components.cue index 4ee03baa32d..a4ae96f80ac 100644 --- a/schemas/components.cue +++ b/schemas/components.cue @@ -22,6 +22,12 @@ package components gpuVersion: #VersionV2 } +#E2EContainerImage: { + name: string + downloadURL: string + version: #VersionV2 +} + #WindowsVersion: { comment?: string k8sVersion?: string @@ -33,6 +39,7 @@ package components #Images: [...#ContainerImage] #GPUImages: [...#GPUContainerImage] +#E2EImages: [...#E2EContainerImage] #Packages: [...#Package] #OCIArtifacts: [...#OCIArtifact] #VersionV2: { @@ -113,6 +120,7 @@ package components ContainerImages: #Images Packages: #Packages GPUContainerImages?: #GPUImages + E2EContainerImages?: #E2EImages OCIArtifacts?: #OCIArtifacts } From a365ad13c88666af3e3879fb9f19337bc73499f6 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Mon, 2 Mar 2026 13:27:53 -0800 Subject: [PATCH 4/4] move DaemonSet validation into Test_Ubuntu2204_GPUNC and Test_Ubuntu2204_GPUA10 Instead of a separate e2e test, inline the NVIDIA device plugin DaemonSet deployment and validation into two existing regular managed GPU scenarios. This avoids spinning up an additional GPU VM while still covering the upstream DaemonSet deployment model on two different GPU SKUs (NC6s_v3 and NV6ads_A10_v5). --- e2e/scenario_gpu_daemonset_test.go | 54 --------------------------- e2e/scenario_test.go | 60 +++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 68 deletions(-) diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go index 2cc50f6ebaf..8c38274d373 100644 --- a/e2e/scenario_gpu_daemonset_test.go +++ b/e2e/scenario_gpu_daemonset_test.go @@ -4,14 +4,10 @@ import ( "context" "fmt" "strings" - "testing" "time" "github.com/Azure/agentbaker/e2e/components" - "github.com/Azure/agentbaker/e2e/config" - "github.com/Azure/agentbaker/pkg/agent/datamodel" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" - "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" "github.com/stretchr/testify/require" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -30,56 +26,6 @@ func getNvidiaDevicePluginImage() string { return components.GetE2EContainerImage(nvidiaDevicePluginImageName) } -// Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly -// with the NVIDIA device plugin deployed as a Kubernetes DaemonSet instead of a systemd service. -// This is the "upstream" deployment model commonly used by customers who manage their own -// NVIDIA device plugin deployment. -func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { - RunScenario(t, &Scenario{ - Description: "Tests that NVIDIA device plugin works when deployed as a DaemonSet (not systemd service)", - Tags: Tags{ - GPU: true, - }, - Config: Config{ - Cluster: ClusterKubenet, - VHD: config.VHDUbuntu2204Gen2Containerd, - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { - nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5" - nbc.ConfigGPUDriverIfNeeded = true - // Don't enable the managed GPU experience - we'll deploy the device plugin as a DaemonSet instead. - // By not setting EnableManagedGPU=true or the VMSS tag, the systemd-based device plugin won't start. - nbc.EnableGPUDevicePluginIfNeeded = false - nbc.EnableNvidia = true - }, - VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { - vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5") - }, - Validator: func(ctx context.Context, s *Scenario) { - // First, validate that GPU drivers are installed - ValidateNvidiaModProbeInstalled(ctx, s) - - // Verify that the systemd-based device plugin is NOT running - // (managed GPU experience is not enabled, so the service should not be active) - validateNvidiaDevicePluginServiceNotRunning(ctx, s) - - // Deploy the NVIDIA device plugin as a DaemonSet - deployNvidiaDevicePluginDaemonset(ctx, s) - - // Wait for the DaemonSet pod to be running on our node - waitForNvidiaDevicePluginDaemonsetReady(ctx, s) - - // Validate that GPU resources are advertised by the device plugin - ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu") - - // Validate that GPU workloads can be scheduled - ValidateGPUWorkloadSchedulable(ctx, s, 1) - - s.T.Logf("NVIDIA device plugin DaemonSet is functioning correctly") - }, - }, - }) -} - // validateNvidiaDevicePluginServiceNotRunning verifies that the systemd-based // NVIDIA device plugin service is not running (since we're testing the DaemonSet model). func validateNvidiaDevicePluginServiceNotRunning(ctx context.Context, s *Scenario) { diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index 1d70a0742f9..39cdf59ac6e 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -982,7 +982,37 @@ func Test_Ubuntu2204_CustomSysctls_Scriptless(t *testing.T) { } func Test_Ubuntu2204_GPUNC(t *testing.T) { - runScenarioUbuntu2204GPU(t, "Standard_NC6s_v3") + RunScenario(t, &Scenario{ + Description: "Tests that a GPU-enabled node with Standard_NC6s_v3 can bootstrap and run NVIDIA device plugin as a DaemonSet", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.AgentPoolProfile.VMSize = "Standard_NC6s_v3" + nbc.ConfigGPUDriverIfNeeded = true + nbc.EnableGPUDevicePluginIfNeeded = false + nbc.EnableNvidia = true + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NC6s_v3") + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateNvidiaModProbeInstalled(ctx, s) + ValidateKubeletHasNotStopped(ctx, s) + ValidateServicesDoNotRestartKubelet(ctx, s) + + // Validate that the NVIDIA device plugin can be deployed as a DaemonSet (upstream model) + validateNvidiaDevicePluginServiceNotRunning(ctx, s) + deployNvidiaDevicePluginDaemonset(ctx, s) + waitForNvidiaDevicePluginDaemonsetReady(ctx, s) + ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu") + ValidateGPUWorkloadSchedulable(ctx, s, 1) + }, + }, + }) } func Test_Ubuntu2204_GPUA100(t *testing.T) { @@ -990,13 +1020,8 @@ func Test_Ubuntu2204_GPUA100(t *testing.T) { } func Test_Ubuntu2204_GPUA10(t *testing.T) { - runScenarioUbuntuGRID(t, "Standard_NV6ads_A10_v5") -} - -// Returns config for the 'gpu' E2E scenario -func runScenarioUbuntu2204GPU(t *testing.T, vmSize string) { RunScenario(t, &Scenario{ - Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an Ubuntu 2204 VHD can be properly bootstrapped", vmSize), + Description: "Tests that a GPU-enabled node with Standard_NV6ads_A10_v5 can bootstrap with GRID license and run NVIDIA device plugin as a DaemonSet", Tags: Tags{ GPU: true, }, @@ -1004,27 +1029,36 @@ func runScenarioUbuntu2204GPU(t *testing.T, vmSize string) { Cluster: ClusterKubenet, VHD: config.VHDUbuntu2204Gen2Containerd, BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { - nbc.AgentPoolProfile.VMSize = vmSize + nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5" nbc.ConfigGPUDriverIfNeeded = true nbc.EnableGPUDevicePluginIfNeeded = false nbc.EnableNvidia = true }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { - vmss.SKU.Name = to.Ptr(vmSize) + vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5") }, Validator: func(ctx context.Context, s *Scenario) { - // Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable ValidateNvidiaModProbeInstalled(ctx, s) + ValidateNvidiaGRIDLicenseValid(ctx, s) ValidateKubeletHasNotStopped(ctx, s) ValidateServicesDoNotRestartKubelet(ctx, s) + ValidateNvidiaPersistencedRunning(ctx, s) + + // Validate that the NVIDIA device plugin can be deployed as a DaemonSet (upstream model) + validateNvidiaDevicePluginServiceNotRunning(ctx, s) + deployNvidiaDevicePluginDaemonset(ctx, s) + waitForNvidiaDevicePluginDaemonsetReady(ctx, s) + ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu") + ValidateGPUWorkloadSchedulable(ctx, s, 1) }, }, }) } -func runScenarioUbuntuGRID(t *testing.T, vmSize string) { +// Returns config for the 'gpu' E2E scenario +func runScenarioUbuntu2204GPU(t *testing.T, vmSize string) { RunScenario(t, &Scenario{ - Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an Ubuntu 2204 VHD can be properly bootstrapped, and that the GRID license is valid", vmSize), + Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an Ubuntu 2204 VHD can be properly bootstrapped", vmSize), Tags: Tags{ GPU: true, }, @@ -1043,10 +1077,8 @@ func runScenarioUbuntuGRID(t *testing.T, vmSize string) { Validator: func(ctx context.Context, s *Scenario) { // Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable ValidateNvidiaModProbeInstalled(ctx, s) - ValidateNvidiaGRIDLicenseValid(ctx, s) ValidateKubeletHasNotStopped(ctx, s) ValidateServicesDoNotRestartKubelet(ctx, s) - ValidateNvidiaPersistencedRunning(ctx, s) }, }, })