Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/renovate.json
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,8 @@
},
{
"matchPackageNames": [
"nvidia-device-plugin"
"nvidia-device-plugin",
"oss/v2/nvidia/k8s-device-plugin"
],
Comment on lines 382 to 385
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description says a new Renovate custom manager was added to auto-update container image versions in e2e Go test files (via // renovate: comments), but this renovate.json change only updates package grouping. There is still no customManagers entry that targets e2e/**/*.go or parses // renovate: lines, so Renovate will not update Go test image strings as described. Either add the intended custom manager configuration or update the PR description to match the actual approach (version coming from components.json).

Copilot uses AI. Check for mistakes.
"groupName": "nvidia-device-plugin",
"assignees": [
Expand Down
55 changes: 55 additions & 0 deletions e2e/components/components.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,58 @@ func RemoveLeadingV(version string) string {
}
return version
}

// GetGPUContainerImage returns the full container image URL for a GPU container image
// by looking up the downloadURL pattern and gpuVersion.latestVersion from components.json.
// The downloadURL pattern contains a wildcard (*) that gets replaced with the version.
// An empty string is returned if components.json cannot be read or no entry matches.
func GetGPUContainerImage(downloadURLPattern string) string {
	// Resolve components.json relative to this source file: e2e/components/ -> repo root.
	_, thisFile, _, _ := runtime.Caller(0)
	repoRoot := filepath.Dir(filepath.Dir(filepath.Dir(thisFile)))
	componentsJSON := filepath.Join(repoRoot, "parts", "common", "components.json")

	data, err := os.ReadFile(componentsJSON)
	if err != nil {
		// Missing or unreadable components.json: signal "not found" with an empty string.
		return ""
	}

	for _, entry := range gjson.GetBytes(data, "GPUContainerImages").Array() {
		pattern := entry.Get("downloadURL").String()
		if !strings.EqualFold(pattern, downloadURLPattern) {
			continue
		}
		// Substitute the pinned version into the wildcard slot of the matched entry.
		if version := entry.Get("gpuVersion.latestVersion").String(); version != "" {
			return strings.Replace(pattern, "*", version, 1)
		}
	}
	return ""
}

// GetE2EContainerImage returns the full container image URL for an e2e test container image
// by looking up the name and version.latestVersion from components.json E2EContainerImages section.
// The downloadURL pattern contains a wildcard (*) that gets replaced with the version.
func GetE2EContainerImage(name string) string {
// Get the project root dynamically
_, filename, _, _ := runtime.Caller(0)
projectRoot := filepath.Dir(filepath.Dir(filepath.Dir(filename))) // Go up 3 levels from e2e/components/
componentsPath := filepath.Join(projectRoot, "parts", "common", "components.json")

jsonBytes, err := os.ReadFile(componentsPath)
if err != nil {
return ""
}
Comment on lines +174 to +177
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GetE2EContainerImage returns an empty string if reading/parsing components.json fails or the image entry is missing. In practice this causes later Kubernetes errors like “invalid image name” and hides the real root cause. Consider returning (string, error) (or failing fast) and have callers require.NoError/require.NotEmpty so test failures point to the missing/misconfigured components.json entry.

Copilot uses AI. Check for mistakes.

e2eImages := gjson.GetBytes(jsonBytes, "E2EContainerImages")
for _, e2eImage := range e2eImages.Array() {
imageName := e2eImage.Get("name").String()
if strings.EqualFold(imageName, name) {
downloadURL := e2eImage.Get("downloadURL").String()
version := e2eImage.Get("version.latestVersion").String()
if version != "" {
return strings.Replace(downloadURL, "*", version, 1)
}
}
}
return ""
}
201 changes: 201 additions & 0 deletions e2e/scenario_gpu_daemonset_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
package e2e

import (
"context"
"fmt"
"strings"
"time"

"github.com/Azure/agentbaker/e2e/components"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
// nvidiaDevicePluginImageName is the name of the NVIDIA device plugin container image
// in components.json E2EContainerImages section. The version is managed via Renovate.
nvidiaDevicePluginImageName = "nvidia-k8s-device-plugin"
)

// getNvidiaDevicePluginImage returns the full container image URL for the NVIDIA device plugin
// by reading the version from components.json E2EContainerImages section.
// NOTE(review): GetE2EContainerImage returns "" when the entry is missing or
// components.json is unreadable — callers should validate the result is non-empty.
func getNvidiaDevicePluginImage() string {
	return components.GetE2EContainerImage(nvidiaDevicePluginImageName)
}

// validateNvidiaDevicePluginServiceNotRunning verifies that the systemd-based
// NVIDIA device plugin service is not running (since we're testing the DaemonSet model).
func validateNvidiaDevicePluginServiceNotRunning(ctx context.Context, s *Scenario) {
s.T.Helper()
s.T.Logf("Verifying that nvidia-device-plugin.service is not running...")

// Check if the service exists and is inactive
// Using "is-active" which returns non-zero if not active
result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null || echo 'not-running'")
output := strings.TrimSpace(result.stdout)

// The service should either not exist or be inactive
if output == "active" {
s.T.Fatalf("nvidia-device-plugin.service is unexpectedly running - this test requires the systemd service to be disabled")
Comment on lines +35 to +42
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The systemctl is-active ... || echo 'not-running' pattern discards both the real service state (e.g., "inactive", "failed", "activating") and the exit code, so the test can incorrectly pass even if the service is transitioning/running (e.g., "activating"). Capture the actual systemctl is-active output/exit status and fail on any running-like state (active/activating), or use execScriptOnVMForScenarioValidateExitCode with the expected non-active exit codes.

Suggested change
// Check if the service exists and is inactive
// Using "is-active" which returns non-zero if not active
result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null || echo 'not-running'")
output := strings.TrimSpace(result.stdout)
// The service should either not exist or be inactive
if output == "active" {
s.T.Fatalf("nvidia-device-plugin.service is unexpectedly running - this test requires the systemd service to be disabled")
// Check the current service state using "is-active".
// This will return "active", "inactive", "failed", "activating", "unknown", etc.
result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null")
output := strings.TrimSpace(result.stdout)
// The service should either not exist or be in a non-running state.
// Treat both "active" and "activating" as failures, since the service
// must not be running when validating the DaemonSet-based deployment.
if output == "active" || output == "activating" {
s.T.Fatalf("nvidia-device-plugin.service is unexpectedly %s - this test requires the systemd service to be disabled", output)

Copilot uses AI. Check for mistakes.
}
s.T.Logf("Confirmed nvidia-device-plugin.service is not active (status: %s)", output)
}

// nvidiaDevicePluginDaemonsetName returns a unique DaemonSet name for the given node.
// The name is truncated to fit within Kubernetes' 63-character limit for resource
// names and label values, and any trailing '-' left over from truncation is trimmed
// so the result remains a valid DNS-1123 label (must end with an alphanumeric).
func nvidiaDevicePluginDaemonsetName(nodeName string) string {
	prefix := "nvdp-" // Short prefix to leave room for node name
	maxLen := 63
	name := prefix + nodeName
	if len(name) > maxLen {
		name = name[:maxLen]
	}
	// Truncation can cut in the middle of the node name and leave a trailing
	// '-', which Kubernetes rejects for object names and label values.
	return strings.TrimRight(name, "-")
}

// nvidiaDevicePluginDaemonset returns the NVIDIA device plugin DaemonSet spec
// based on the official upstream deployment from:
// https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml
//
// The DaemonSet name includes the node name to avoid collisions when multiple
// GPU tests run against the same shared cluster.
func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
dsName := nvidiaDevicePluginDaemonsetName(nodeName)

return &appsv1.DaemonSet{
TypeMeta: metav1.TypeMeta{
Kind: "DaemonSet",
APIVersion: "apps/v1",
},
ObjectMeta: metav1.ObjectMeta{
Name: dsName,
Namespace: "kube-system",
},
Spec: appsv1.DaemonSetSpec{
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"name": dsName,
},
},
UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
Type: appsv1.RollingUpdateDaemonSetStrategyType,
},
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"name": dsName,
},
},
Spec: corev1.PodSpec{
// Target only our specific test node
NodeSelector: map[string]string{
"kubernetes.io/hostname": nodeName,
},
Tolerations: []corev1.Toleration{
{
Key: "nvidia.com/gpu",
Operator: corev1.TolerationOpExists,
Effect: corev1.TaintEffectNoSchedule,
},
},
PriorityClassName: "system-node-critical",
Containers: []corev1.Container{
{
Name: "nvidia-device-plugin-ctr",
Image: getNvidiaDevicePluginImage(),
Comment on lines +107 to +108
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The DaemonSet uses Image: getNvidiaDevicePluginImage() without asserting it’s non-empty/valid. If components.json lookup fails, the pod will be created with an empty image and the test will fail with a confusing scheduling/pull error. Add an early require.NotEmpty (or propagate an error from GetE2EContainerImage) so failures clearly indicate a missing/misconfigured E2EContainerImages entry.

Suggested change
Name: "nvidia-device-plugin-ctr",
Image: getNvidiaDevicePluginImage(),
Name: "nvidia-device-plugin-ctr",
Image: func() string {
img := getNvidiaDevicePluginImage()
require.NotEmpty(t, img, "nvidia device plugin image must be configured in E2EContainerImages (components.json entry %q)", nvidiaDevicePluginImageName)
return img
}(),

Copilot uses AI. Check for mistakes.
Env: []corev1.EnvVar{
{
Name: "FAIL_ON_INIT_ERROR",
Value: "false",
},
},
SecurityContext: &corev1.SecurityContext{
// Privileged mode is required for the device plugin to access
// GPU devices and register with kubelet's device plugin framework.
// This matches the upstream NVIDIA device plugin deployment spec.
Privileged: to.Ptr(true),
},
VolumeMounts: []corev1.VolumeMount{
{
Name: "device-plugin",
MountPath: "/var/lib/kubelet/device-plugins",
},
},
},
},
Volumes: []corev1.Volume{
{
Name: "device-plugin",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/var/lib/kubelet/device-plugins",
},
},
},
},
},
},
},
}
}

// deployNvidiaDevicePluginDaemonset creates the NVIDIA device plugin DaemonSet in the cluster
// and registers cleanup to delete it when the test finishes.
func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) {
s.T.Helper()
s.T.Logf("Deploying NVIDIA device plugin as DaemonSet...")

ds := nvidiaDevicePluginDaemonset(s.Runtime.VM.KubeName)

// Delete any existing DaemonSet from a previous failed run
deleteCtx, deleteCancel := context.WithTimeout(ctx, 30*time.Second)
defer deleteCancel()
_ = s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete(
deleteCtx,
ds.Name,
metav1.DeleteOptions{},
)
Comment on lines +156 to +160
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test deletes an existing DaemonSet and then recreates it without waiting for deletion to finish. If a prior run left it terminating, the create can fail with AlreadyExists / "object is being deleted" and make the test flaky. Consider polling until a GET returns NotFound (or using an update-based approach) before creating.

Copilot uses AI. Check for mistakes.

// Create the DaemonSet
err := s.Runtime.Cluster.Kube.CreateDaemonset(ctx, ds)
require.NoError(s.T, err, "failed to create NVIDIA device plugin DaemonSet")

s.T.Logf("NVIDIA device plugin DaemonSet %s/%s created successfully", ds.Namespace, ds.Name)

// Register cleanup to delete the DaemonSet when the test finishes
s.T.Cleanup(func() {
s.T.Logf("Cleaning up NVIDIA device plugin DaemonSet %s/%s...", ds.Namespace, ds.Name)
cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cleanupCancel()
deleteErr := s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete(
cleanupCtx,
ds.Name,
metav1.DeleteOptions{},
)
if deleteErr != nil {
s.T.Logf("Failed to delete NVIDIA device plugin DaemonSet %s/%s: %v", ds.Namespace, ds.Name, deleteErr)
}
})
}

// waitForNvidiaDevicePluginDaemonsetReady blocks until the NVIDIA device plugin pod
// scheduled on the test node is running. It delegates to the existing
// WaitUntilPodRunning helper, which handles CrashLoopBackOff and other failure states.
func waitForNvidiaDevicePluginDaemonsetReady(ctx context.Context, s *Scenario) {
	s.T.Helper()

	node := s.Runtime.VM.KubeName
	labelSelector := fmt.Sprintf("name=%s", nvidiaDevicePluginDaemonsetName(node))
	fieldSelector := fmt.Sprintf("spec.nodeName=%s", node)

	s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", node)
	_, err := s.Runtime.Cluster.Kube.WaitUntilPodRunning(ctx, "kube-system", labelSelector, fieldSelector)
	require.NoError(s.T, err, "timed out waiting for NVIDIA device plugin DaemonSet pod to be ready")

	s.T.Logf("NVIDIA device plugin DaemonSet pod is ready")
}
60 changes: 46 additions & 14 deletions e2e/scenario_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -982,49 +982,83 @@ func Test_Ubuntu2204_CustomSysctls_Scriptless(t *testing.T) {
}

// Test_Ubuntu2204_GPUNC validates that a Standard_NC6s_v3 GPU node bootstraps
// correctly and that the NVIDIA device plugin works when deployed as a
// DaemonSet (the upstream deployment model) rather than via the systemd service.
func Test_Ubuntu2204_GPUNC(t *testing.T) {
	RunScenario(t, &Scenario{
		Description: "Tests that a GPU-enabled node with Standard_NC6s_v3 can bootstrap and run NVIDIA device plugin as a DaemonSet",
		Tags: Tags{
			GPU: true,
		},
		Config: Config{
			Cluster: ClusterKubenet,
			VHD:     config.VHDUbuntu2204Gen2Containerd,
			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
				nbc.AgentPoolProfile.VMSize = "Standard_NC6s_v3"
				nbc.ConfigGPUDriverIfNeeded = true
				// Keep the systemd-managed device plugin disabled: this scenario
				// exercises the DaemonSet deployment model instead.
				nbc.EnableGPUDevicePluginIfNeeded = false
				nbc.EnableNvidia = true
			},
			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
				vmss.SKU.Name = to.Ptr("Standard_NC6s_v3")
			},
			Validator: func(ctx context.Context, s *Scenario) {
				ValidateNvidiaModProbeInstalled(ctx, s)
				ValidateKubeletHasNotStopped(ctx, s)
				ValidateServicesDoNotRestartKubelet(ctx, s)

				// Validate that the NVIDIA device plugin can be deployed as a DaemonSet (upstream model)
				validateNvidiaDevicePluginServiceNotRunning(ctx, s)
				deployNvidiaDevicePluginDaemonset(ctx, s)
				waitForNvidiaDevicePluginDaemonsetReady(ctx, s)
				ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")
				ValidateGPUWorkloadSchedulable(ctx, s, 1)
			},
		},
	})
}

// Test_Ubuntu2204_GPUA100 runs the generic Ubuntu 22.04 GPU bootstrap scenario
// on an A100 VM size (Standard_NC24ads_A100_v4).
func Test_Ubuntu2204_GPUA100(t *testing.T) {
	runScenarioUbuntu2204GPU(t, "Standard_NC24ads_A100_v4")
}

// Test_Ubuntu2204_GPUA10 runs the Ubuntu 22.04 GRID GPU scenario on an A10
// VM size (Standard_NV6ads_A10_v5).
func Test_Ubuntu2204_GPUA10(t *testing.T) {
	runScenarioUbuntuGRID(t, "Standard_NV6ads_A10_v5")
}

// Returns config for the 'gpu' E2E scenario
func runScenarioUbuntu2204GPU(t *testing.T, vmSize string) {
RunScenario(t, &Scenario{
Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an Ubuntu 2204 VHD can be properly bootstrapped", vmSize),
Description: "Tests that a GPU-enabled node with Standard_NV6ads_A10_v5 can bootstrap with GRID license and run NVIDIA device plugin as a DaemonSet",
Tags: Tags{
GPU: true,
},
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDUbuntu2204Gen2Containerd,
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.AgentPoolProfile.VMSize = vmSize
nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5"
nbc.ConfigGPUDriverIfNeeded = true
nbc.EnableGPUDevicePluginIfNeeded = false
nbc.EnableNvidia = true
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr(vmSize)
vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5")
},
Validator: func(ctx context.Context, s *Scenario) {
// Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable
ValidateNvidiaModProbeInstalled(ctx, s)
ValidateNvidiaGRIDLicenseValid(ctx, s)
ValidateKubeletHasNotStopped(ctx, s)
ValidateServicesDoNotRestartKubelet(ctx, s)
ValidateNvidiaPersistencedRunning(ctx, s)

// Validate that the NVIDIA device plugin can be deployed as a DaemonSet (upstream model)
validateNvidiaDevicePluginServiceNotRunning(ctx, s)
deployNvidiaDevicePluginDaemonset(ctx, s)
waitForNvidiaDevicePluginDaemonsetReady(ctx, s)
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")
ValidateGPUWorkloadSchedulable(ctx, s, 1)
},
},
})
}

func runScenarioUbuntuGRID(t *testing.T, vmSize string) {
// Returns config for the 'gpu' E2E scenario
func runScenarioUbuntu2204GPU(t *testing.T, vmSize string) {
RunScenario(t, &Scenario{
Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an Ubuntu 2204 VHD can be properly bootstrapped, and that the GRID license is valid", vmSize),
Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an Ubuntu 2204 VHD can be properly bootstrapped", vmSize),
Tags: Tags{
GPU: true,
},
Expand All @@ -1043,10 +1077,8 @@ func runScenarioUbuntuGRID(t *testing.T, vmSize string) {
Validator: func(ctx context.Context, s *Scenario) {
// Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable
ValidateNvidiaModProbeInstalled(ctx, s)
ValidateNvidiaGRIDLicenseValid(ctx, s)
ValidateKubeletHasNotStopped(ctx, s)
ValidateServicesDoNotRestartKubelet(ctx, s)
ValidateNvidiaPersistencedRunning(ctx, s)
},
},
})
Expand Down
10 changes: 10 additions & 0 deletions parts/common/components.json
Original file line number Diff line number Diff line change
Expand Up @@ -2070,5 +2070,15 @@
}
]
}
],
"E2EContainerImages": [
{
"name": "nvidia-k8s-device-plugin",
"downloadURL": "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:*",
"version": {
"renovateTag": "registry=https://mcr.microsoft.com, name=oss/v2/nvidia/k8s-device-plugin",
"latestVersion": "v0.18.2-1"
}
Comment on lines +2078 to +2081
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description says the test uses mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2, but components.json pins latestVersion to v0.18.2-1. Please reconcile the version/tag format (either update the description or adjust latestVersion) to avoid confusion and to ensure the referenced tag is the one actually being tested.

Copilot uses AI. Check for mistakes.
}
]
}
Loading
Loading