-
Notifications
You must be signed in to change notification settings - Fork 249
Add e2e test for NVIDIA device plugin DaemonSet deployment with Renovate auto-update #7984
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
33fbea5
44195e4
d7f1abf
a365ad1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -134,3 +134,58 @@ func RemoveLeadingV(version string) string { | |
| } | ||
| return version | ||
| } | ||
|
|
||
| // GetGPUContainerImage returns the full container image URL for a GPU container image | ||
| // by looking up the downloadURL pattern and gpuVersion.latestVersion from components.json. | ||
| // The downloadURL pattern contains a wildcard (*) that gets replaced with the version. | ||
| func GetGPUContainerImage(downloadURLPattern string) string { | ||
| // Get the project root dynamically | ||
| _, filename, _, _ := runtime.Caller(0) | ||
| projectRoot := filepath.Dir(filepath.Dir(filepath.Dir(filename))) // Go up 3 levels from e2e/components/ | ||
| componentsPath := filepath.Join(projectRoot, "parts", "common", "components.json") | ||
|
|
||
| jsonBytes, err := os.ReadFile(componentsPath) | ||
| if err != nil { | ||
| return "" | ||
| } | ||
|
|
||
| gpuImages := gjson.GetBytes(jsonBytes, "GPUContainerImages") | ||
| for _, gpuImage := range gpuImages.Array() { | ||
| downloadURL := gpuImage.Get("downloadURL").String() | ||
| if strings.EqualFold(downloadURL, downloadURLPattern) { | ||
| version := gpuImage.Get("gpuVersion.latestVersion").String() | ||
| if version != "" { | ||
| return strings.Replace(downloadURL, "*", version, 1) | ||
| } | ||
| } | ||
| } | ||
| return "" | ||
| } | ||
|
|
||
| // GetE2EContainerImage returns the full container image URL for an e2e test container image | ||
| // by looking up the name and version.latestVersion from components.json E2EContainerImages section. | ||
| // The downloadURL pattern contains a wildcard (*) that gets replaced with the version. | ||
| func GetE2EContainerImage(name string) string { | ||
| // Get the project root dynamically | ||
| _, filename, _, _ := runtime.Caller(0) | ||
| projectRoot := filepath.Dir(filepath.Dir(filepath.Dir(filename))) // Go up 3 levels from e2e/components/ | ||
| componentsPath := filepath.Join(projectRoot, "parts", "common", "components.json") | ||
|
|
||
| jsonBytes, err := os.ReadFile(componentsPath) | ||
| if err != nil { | ||
| return "" | ||
| } | ||
|
Comment on lines
+174
to
+177
|
||
|
|
||
| e2eImages := gjson.GetBytes(jsonBytes, "E2EContainerImages") | ||
| for _, e2eImage := range e2eImages.Array() { | ||
| imageName := e2eImage.Get("name").String() | ||
| if strings.EqualFold(imageName, name) { | ||
| downloadURL := e2eImage.Get("downloadURL").String() | ||
| version := e2eImage.Get("version.latestVersion").String() | ||
| if version != "" { | ||
| return strings.Replace(downloadURL, "*", version, 1) | ||
| } | ||
| } | ||
| } | ||
| return "" | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,201 @@ | ||||||||||||||||||||||||||||||||||||||
| package e2e | ||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||
| import ( | ||||||||||||||||||||||||||||||||||||||
| "context" | ||||||||||||||||||||||||||||||||||||||
| "fmt" | ||||||||||||||||||||||||||||||||||||||
| "strings" | ||||||||||||||||||||||||||||||||||||||
| "time" | ||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||
| "github.com/Azure/agentbaker/e2e/components" | ||||||||||||||||||||||||||||||||||||||
| "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" | ||||||||||||||||||||||||||||||||||||||
| "github.com/stretchr/testify/require" | ||||||||||||||||||||||||||||||||||||||
| appsv1 "k8s.io/api/apps/v1" | ||||||||||||||||||||||||||||||||||||||
| corev1 "k8s.io/api/core/v1" | ||||||||||||||||||||||||||||||||||||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||||||||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||
| const ( | ||||||||||||||||||||||||||||||||||||||
| // nvidiaDevicePluginImageName is the name of the NVIDIA device plugin container image | ||||||||||||||||||||||||||||||||||||||
| // in components.json E2EContainerImages section. The version is managed via Renovate. | ||||||||||||||||||||||||||||||||||||||
| nvidiaDevicePluginImageName = "nvidia-k8s-device-plugin" | ||||||||||||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||
| // getNvidiaDevicePluginImage returns the full container image URL for the NVIDIA device plugin | ||||||||||||||||||||||||||||||||||||||
| // by reading the version from components.json E2EContainerImages section. | ||||||||||||||||||||||||||||||||||||||
| func getNvidiaDevicePluginImage() string { | ||||||||||||||||||||||||||||||||||||||
| return components.GetE2EContainerImage(nvidiaDevicePluginImageName) | ||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||
| // validateNvidiaDevicePluginServiceNotRunning verifies that the systemd-based | ||||||||||||||||||||||||||||||||||||||
| // NVIDIA device plugin service is not running (since we're testing the DaemonSet model). | ||||||||||||||||||||||||||||||||||||||
| func validateNvidiaDevicePluginServiceNotRunning(ctx context.Context, s *Scenario) { | ||||||||||||||||||||||||||||||||||||||
| s.T.Helper() | ||||||||||||||||||||||||||||||||||||||
| s.T.Logf("Verifying that nvidia-device-plugin.service is not running...") | ||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||
| // Check if the service exists and is inactive | ||||||||||||||||||||||||||||||||||||||
| // Using "is-active" which returns non-zero if not active | ||||||||||||||||||||||||||||||||||||||
| result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null || echo 'not-running'") | ||||||||||||||||||||||||||||||||||||||
| output := strings.TrimSpace(result.stdout) | ||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||
| // The service should either not exist or be inactive | ||||||||||||||||||||||||||||||||||||||
| if output == "active" { | ||||||||||||||||||||||||||||||||||||||
| s.T.Fatalf("nvidia-device-plugin.service is unexpectedly running - this test requires the systemd service to be disabled") | ||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+35
to
+42
|
||||||||||||||||||||||||||||||||||||||
| // Check if the service exists and is inactive | |
| // Using "is-active" which returns non-zero if not active | |
| result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null || echo 'not-running'") | |
| output := strings.TrimSpace(result.stdout) | |
| // The service should either not exist or be inactive | |
| if output == "active" { | |
| s.T.Fatalf("nvidia-device-plugin.service is unexpectedly running - this test requires the systemd service to be disabled") | |
| // Check the current service state using "is-active". | |
| // This will return "active", "inactive", "failed", "activating", "unknown", etc. | |
| result := execScriptOnVMForScenario(ctx, s, "systemctl is-active nvidia-device-plugin.service 2>/dev/null") | |
| output := strings.TrimSpace(result.stdout) | |
| // The service should either not exist or be in a non-running state. | |
| // Treat both "active" and "activating" as failures, since the service | |
| // must not be running when validating the DaemonSet-based deployment. | |
| if output == "active" || output == "activating" { | |
| s.T.Fatalf("nvidia-device-plugin.service is unexpectedly %s - this test requires the systemd service to be disabled", output) |
Copilot
AI
Feb 27, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nvidiaDevicePluginDaemonsetName truncates the name to 63 chars but doesn’t ensure the truncated result still conforms to DNS label rules (must end with an alphanumeric). If truncation cuts on a '-', Kubernetes can reject the object/label value. Mirror the existing truncatePodName behavior by trimming trailing '-' after truncation (and apply any needed sanitization).
Copilot
AI
Feb 27, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The DaemonSet uses Image: getNvidiaDevicePluginImage() without asserting it’s non-empty/valid. If components.json lookup fails, the pod will be created with an empty image and the test will fail with a confusing scheduling/pull error. Add an early require.NotEmpty (or propagate an error from GetE2EContainerImage) so failures clearly indicate a missing/misconfigured E2EContainerImages entry.
| Name: "nvidia-device-plugin-ctr", | |
| Image: getNvidiaDevicePluginImage(), | |
| Name: "nvidia-device-plugin-ctr", | |
| Image: func() string { | |
| img := getNvidiaDevicePluginImage() | |
| require.NotEmpty(t, img, "nvidia device plugin image must be configured in E2EContainerImages (components.json entry %q)", nvidiaDevicePluginImageName) | |
| return img | |
| }(), |
Copilot
AI
Feb 27, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The test deletes an existing DaemonSet and then recreates it without waiting for deletion to finish. If a prior run left it terminating, the create can fail with AlreadyExists / "object is being deleted" and make the test flaky. Consider polling until a GET returns NotFound (or using an update-based approach) before creating.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2070,5 +2070,15 @@ | |
| } | ||
| ] | ||
| } | ||
| ], | ||
| "E2EContainerImages": [ | ||
| { | ||
| "name": "nvidia-k8s-device-plugin", | ||
| "downloadURL": "mcr.microsoft.com/oss/v2/nvidia/k8s-device-plugin:*", | ||
| "version": { | ||
| "renovateTag": "registry=https://mcr.microsoft.com, name=oss/v2/nvidia/k8s-device-plugin", | ||
| "latestVersion": "v0.18.2-1" | ||
| } | ||
|
Comment on lines
+2078
to
+2081
|
||
| } | ||
| ] | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The PR description says a new Renovate custom manager was added to auto-update container image versions in e2e Go test files (via `// renovate:` comments), but this renovate.json change only updates package grouping. There is still no `customManagers` entry that targets `e2e/**/*.go` or parses `// renovate:` lines, so Renovate will not update Go test image strings as described. Either add the intended custom manager configuration or update the PR description to match the actual approach (version coming from components.json).