Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
250 changes: 250 additions & 0 deletions e2e/scenario_gpu_daemonset_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
package e2e

import (
"context"
"fmt"
"strings"
"testing"
"time"

"github.com/Azure/agentbaker/e2e/config"
"github.com/Azure/agentbaker/pkg/agent/datamodel"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
// nvidiaDevicePluginImage is the upstream NVIDIA device plugin image from MCR.
// This is intentionally different from components.json which tracks the systemd-packaged version.
// This test validates the upstream container-based deployment model.
// Update this when a new version is available in MCR.
nvidiaDevicePluginImage = "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:v0.18.2"
)

// Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset tests that a GPU node can function correctly
// with the NVIDIA device plugin deployed as a Kubernetes DaemonSet instead of a systemd service.
// This is the "upstream" deployment model commonly used by customers who manage their own
// NVIDIA device plugin deployment.
func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Tests that NVIDIA device plugin works when deployed as a DaemonSet (not systemd service)",
Tags: Tags{
GPU: true,
},
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDUbuntu2204Gen2Containerd,
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5"
nbc.ConfigGPUDriverIfNeeded = true
// Don't enable the managed GPU experience - we'll deploy the device plugin as a DaemonSet instead.
// By not setting EnableManagedGPU=true or the VMSS tag, the systemd-based device plugin won't start.
nbc.EnableGPUDevicePluginIfNeeded = false
nbc.EnableNvidia = true
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5")
},
Validator: func(ctx context.Context, s *Scenario) {
// First, validate that GPU drivers are installed
ValidateNvidiaModProbeInstalled(ctx, s)

// Verify that the systemd-based device plugin is NOT running
// (managed GPU experience is not enabled, so the service should not be active)
validateNvidiaDevicePluginServiceNotRunning(ctx, s)

// Deploy the NVIDIA device plugin as a DaemonSet
deployNvidiaDevicePluginDaemonset(ctx, s)

// Wait for the DaemonSet pod to be running on our node
waitForNvidiaDevicePluginDaemonsetReady(ctx, s)

// Validate that GPU resources are advertised by the device plugin
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")

// Validate that GPU workloads can be scheduled
ValidateGPUWorkloadSchedulable(ctx, s, 1)
Comment on lines +64 to +70
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After waiting for the device plugin pod to be ready, consider adding validation to check the pod logs for successful GPU discovery and device plugin registration. This would catch issues where the pod starts but the device plugin fails to function correctly (e.g., driver compatibility issues, incorrect configuration).

The logs should contain messages about discovering GPUs and registering with kubelet's device plugin framework. This would make the test more comprehensive and catch edge cases where the pod runs but doesn't actually register GPU resources properly.

Copilot uses AI. Check for mistakes.

s.T.Logf("NVIDIA device plugin DaemonSet is functioning correctly")
},
},
})
}

// validateNvidiaDevicePluginServiceNotRunning verifies that the systemd-based
// NVIDIA device plugin service is not running (since we're testing the DaemonSet model).
func validateNvidiaDevicePluginServiceNotRunning(ctx context.Context, s *Scenario) {
s.T.Helper()
s.T.Logf("Verifying that nvidia-device-plugin.service is not running...")

// Check if the service exists and is inactive
// Using "is-active" which returns non-zero if not active
result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null || echo 'not-running'")
output := strings.TrimSpace(result.stdout)

// The service should either not exist or be inactive
if output == "active" {
s.T.Fatalf("nvidia-device-plugin.service is unexpectedly running - this test requires the systemd service to be disabled")
}
s.T.Logf("Confirmed nvidia-device-plugin.service is not active (status: %s)", output)
}

// nvidiaDevicePluginDaemonsetName returns a unique DaemonSet name for the given node.
// The name is truncated to fit within Kubernetes' 63-character limit for resource names.
func nvidiaDevicePluginDaemonsetName(nodeName string) string {
prefix := "nvdp-" // Short prefix to leave room for node name
maxLen := 63
name := prefix + nodeName
if len(name) > maxLen {
name = name[:maxLen]
}
return name
}

// nvidiaDevicePluginDaemonset returns the NVIDIA device plugin DaemonSet spec
// based on the official upstream deployment from:
// https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml
//
// The DaemonSet name includes the node name to avoid collisions when multiple
// GPU tests run against the same shared cluster.
func nvidiaDevicePluginDaemonset(nodeName string) *appsv1.DaemonSet {
dsName := nvidiaDevicePluginDaemonsetName(nodeName)

return &appsv1.DaemonSet{
TypeMeta: metav1.TypeMeta{
Kind: "DaemonSet",
APIVersion: "apps/v1",
},
ObjectMeta: metav1.ObjectMeta{
Name: dsName,
Namespace: "kube-system",
},
Spec: appsv1.DaemonSetSpec{
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"name": dsName,
},
},
UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
Type: appsv1.RollingUpdateDaemonSetStrategyType,
},
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"name": dsName,
},
},
Spec: corev1.PodSpec{
// Target only our specific test node
NodeSelector: map[string]string{
"kubernetes.io/hostname": nodeName,
},
Tolerations: []corev1.Toleration{
{
Key: "nvidia.com/gpu",
Operator: corev1.TolerationOpExists,
Effect: corev1.TaintEffectNoSchedule,
},
},
PriorityClassName: "system-node-critical",
Containers: []corev1.Container{
{
Name: "nvidia-device-plugin-ctr",
Image: nvidiaDevicePluginImage,
Env: []corev1.EnvVar{
{
Name: "FAIL_ON_INIT_ERROR",
Value: "false",
},
},
SecurityContext: &corev1.SecurityContext{
// Privileged mode is required for the device plugin to access
// GPU devices and register with kubelet's device plugin framework.
// This matches the upstream NVIDIA device plugin deployment spec.
Privileged: to.Ptr(true),
},
Comment on lines +154 to +169
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The DaemonSet container spec is missing several configuration options present in the upstream NVIDIA device plugin deployment (https://github.com/NVIDIA/k8s-device-plugin/blob/main/deployments/static/nvidia-device-plugin.yml). Specifically:

  1. Missing the /dev/null device volume mount which the device plugin may need
  2. The upstream spec typically includes additional environment variables like NVIDIA_MIG_MONITOR_DEVICES, NVIDIA_DRIVER_CAPABILITIES, etc.
  3. The upstream spec sets Privileged: true in SecurityContext rather than just dropping capabilities

While the minimal spec may work for basic testing, using a configuration closer to the upstream deployment would make this test more valuable as it validates the real-world customer scenario. Consider either aligning more closely with the upstream spec or adding a comment explaining why this simplified configuration is sufficient for the test's purpose.

Copilot uses AI. Check for mistakes.
VolumeMounts: []corev1.VolumeMount{
{
Name: "device-plugin",
MountPath: "/var/lib/kubelet/device-plugins",
},
},
},
},
Volumes: []corev1.Volume{
{
Name: "device-plugin",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/var/lib/kubelet/device-plugins",
},
},
},
},
},
},
},
}
}

// deployNvidiaDevicePluginDaemonset creates the NVIDIA device plugin DaemonSet in the cluster
// and registers cleanup to delete it when the test finishes.
func deployNvidiaDevicePluginDaemonset(ctx context.Context, s *Scenario) {
s.T.Helper()
s.T.Logf("Deploying NVIDIA device plugin as DaemonSet...")

ds := nvidiaDevicePluginDaemonset(s.Runtime.VM.KubeName)

// Delete any existing DaemonSet from a previous failed run
deleteCtx, deleteCancel := context.WithTimeout(ctx, 30*time.Second)
defer deleteCancel()
_ = s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete(
deleteCtx,
ds.Name,
metav1.DeleteOptions{},
)

// Create the DaemonSet
err := s.Runtime.Cluster.Kube.CreateDaemonset(ctx, ds)
require.NoError(s.T, err, "failed to create NVIDIA device plugin DaemonSet")

Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The DaemonSet is created but never explicitly cleaned up. While Kubernetes may garbage collect when the test cluster is torn down, it's better practice to add explicit cleanup to prevent resource leaks if the test fails before cluster cleanup. E2E tests should be self-contained and clean up resources they create.

Consider using s.T.Cleanup() or defer to delete the DaemonSet at the end of the validator function, similar to how other e2e tests handle resource cleanup.

Suggested change
// Ensure the DaemonSet is cleaned up after the test finishes
s.T.Cleanup(func() {
s.T.Logf("Cleaning up NVIDIA device plugin DaemonSet %s/%s...", ds.Namespace, ds.Name)
if err := s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete(
context.Background(),
ds.Name,
metav1.DeleteOptions{},
); err != nil {
s.T.Logf("Failed to delete NVIDIA device plugin DaemonSet %s/%s: %v", ds.Namespace, ds.Name, err)
}
})

Copilot uses AI. Check for mistakes.
s.T.Logf("NVIDIA device plugin DaemonSet %s/%s created successfully", ds.Namespace, ds.Name)

// Register cleanup to delete the DaemonSet when the test finishes
s.T.Cleanup(func() {
s.T.Logf("Cleaning up NVIDIA device plugin DaemonSet %s/%s...", ds.Namespace, ds.Name)
cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cleanupCancel()
deleteErr := s.Runtime.Cluster.Kube.Typed.AppsV1().DaemonSets(ds.Namespace).Delete(
cleanupCtx,
ds.Name,
metav1.DeleteOptions{},
)
if deleteErr != nil {
s.T.Logf("Failed to delete NVIDIA device plugin DaemonSet %s/%s: %v", ds.Namespace, ds.Name, deleteErr)
}
})
}

// waitForNvidiaDevicePluginDaemonsetReady waits for the NVIDIA device plugin pod to be running on the test node.
// Uses the existing WaitUntilPodRunning helper which handles CrashLoopBackOff and other failure states.
func waitForNvidiaDevicePluginDaemonsetReady(ctx context.Context, s *Scenario) {
s.T.Helper()

dsName := nvidiaDevicePluginDaemonsetName(s.Runtime.VM.KubeName)
s.T.Logf("Waiting for NVIDIA device plugin DaemonSet pod to be ready on node %s...", s.Runtime.VM.KubeName)

_, err := s.Runtime.Cluster.Kube.WaitUntilPodRunning(
ctx,
"kube-system",
fmt.Sprintf("name=%s", dsName),
fmt.Sprintf("spec.nodeName=%s", s.Runtime.VM.KubeName),
)
require.NoError(s.T, err, "timed out waiting for NVIDIA device plugin DaemonSet pod to be ready")

s.T.Logf("NVIDIA device plugin DaemonSet pod is ready")
}
Loading