From 0fbf5bd74e97337a83c642fd8939b2c354ea051c Mon Sep 17 00:00:00 2001 From: Uday Bhaskar Date: Thu, 5 Feb 2026 01:07:10 +0530 Subject: [PATCH] GPUOP-540 GPUOP-543 (#1119) * update ttlForFailedWorkflows field to use duration * user can choose to skip reboot step in workflow (cherry picked from commit cfe54c802ddc0d7cd515eb3eb158bcfade336a42) --- .wordlist.txt | 3 ++- api/v1alpha1/deviceconfig_types.go | 7 ++++--- .../amd-gpu-operator.clusterserviceversion.yaml | 5 +++-- bundle/manifests/amd.com_deviceconfigs.yaml | 9 +++++---- config/crd/bases/amd.com_deviceconfigs.yaml | 9 +++++---- .../amd-gpu-operator.clusterserviceversion.yaml | 3 ++- docs/autoremediation/auto-remediation.md | 4 +++- helm-charts-k8s/Chart.lock | 2 +- helm-charts-k8s/crds/deviceconfig-crd.yaml | 8 +++++--- internal/controllers/remediation_handler.go | 15 ++++++++++++--- tests/helm-e2e/helm_e2e_test.go | 4 ++-- 11 files changed, 44 insertions(+), 25 deletions(-) diff --git a/.wordlist.txt b/.wordlist.txt index a6efcba9e..d109f261e 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -181,6 +181,7 @@ TODO TLS tolerations tst +TTL TtlForFailedWorkflows ubuntu UI @@ -200,4 +201,4 @@ VFs VMs webhook xgmi -YAML \ No newline at end of file +YAML diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 85e2db02f..5a23c38b1 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -93,10 +93,11 @@ type RemediationWorkflowSpec struct { //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"} ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"` - // Time to live for argo workflow object and its pods for a failed workflow in hours. By default, it is set to 24 hours + // Time to live for argo workflow object and its pods for a failed workflow. Accepts duration strings like "30s", "4h", "24h". By default, it is set to 24h //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"} - // +kubebuilder:default:=24 - TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"` + // +kubebuilder:default:="24h" + // +kubebuilder:validation:Pattern=`^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$` + TtlForFailedWorkflows string `json:"ttlForFailedWorkflows,omitempty"` // Tester image used to run tests and verify if remediation fixed the reported problem. //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TesterImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:testerImage"} diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index b95522ca2..171e210b9 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -32,7 +32,7 @@ metadata: capabilities: Seamless Upgrades categories: AI/Machine Learning,Monitoring containerImage: docker.io/rocm/gpu-operator:v1.4.0 - createdAt: "2026-01-28T11:30:39Z" + createdAt: "2026-02-01T12:58:28Z" description: |- Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) @@ -753,7 +753,8 @@ spec: x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:testerImage - description: Time to live for argo workflow object and its pods for a failed - workflow in hours. By default, it is set to 24 hours + workflow. Accepts duration strings like "30s", "4h", "24h". By default, + it is set to 24h displayName: TtlForFailedWorkflows path: remediationWorkflow.ttlForFailedWorkflows x-descriptors: diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index c7b9b1968..576007bee 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -1490,11 +1490,12 @@ spec: pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string ttlForFailedWorkflows: - default: 24 + default: 24h description: Time to live for argo workflow object and its pods - for a failed workflow in hours. By default, it is set to 24 - hours - type: integer + for a failed workflow. Accepts duration strings like "30s", + "4h", "24h". By default, it is set to 24h + pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$ + type: string type: object selector: additionalProperties: diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 66845d611..af37025f0 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -1486,11 +1486,12 @@ spec: pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string ttlForFailedWorkflows: - default: 24 + default: 24h description: Time to live for argo workflow object and its pods - for a failed workflow in hours. By default, it is set to 24 - hours - type: integer + for a failed workflow. Accepts duration strings like "30s", + "4h", "24h". By default, it is set to 24h + pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$ + type: string type: object selector: additionalProperties: diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index 1a9c05967..8ad50a8d1 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -724,7 +724,8 @@ spec: x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:testerImage - description: Time to live for argo workflow object and its pods for a failed - workflow in hours. By default, it is set to 24 hours + workflow. Accepts duration strings like "30s", "4h", "24h". By default, + it is set to 24h displayName: TtlForFailedWorkflows path: remediationWorkflow.ttlForFailedWorkflows x-descriptors: diff --git a/docs/autoremediation/auto-remediation.md b/docs/autoremediation/auto-remediation.md index c4dab6b72..a056fa2c5 100644 --- a/docs/autoremediation/auto-remediation.md +++ b/docs/autoremediation/auto-remediation.md @@ -106,7 +106,7 @@ type RemediationWorkflowSpec struct { > **Note:** The `default-conditional-workflow-mappings` ConfigMap is created automatically by the GPU Operator. -**TtlForFailedWorkflows** - Specifies the time-to-live (in hours) for failed workflow objects and their associated pods. Failed workflows are retained temporarily to allow inspection and troubleshooting. After the configured time period elapses, the failed workflow resources are automatically cleaned up. Default value is 24 hours. +**TtlForFailedWorkflows** - Defines the time-to-live (TTL) duration for retaining failed workflow objects and their associated pods before automatic cleanup. This field accepts a duration string in standard formats (e.g., "24h", "30m", "1h30m"). Retaining failed workflows allows for post-mortem analysis and troubleshooting. Once the specified duration expires, the workflow resources are automatically garbage collected by the system. The default retention period is 24 hours. **TesterImage** - Specifies the container image for executing GPU validation tests during remediation workflows. This image must align with `Spec.TestRunner.Image` specifications and runs test suites to verify GPU health after remediation completion. If unspecified, the default image is `docker.io/rocm/test-runner:v1.4.1`. @@ -193,6 +193,8 @@ The following example demonstrates a complete error mapping configuration: **recoveryPolicy** - Defines limits on remediation attempts to prevent excessive recovery cycles. Includes `maxAllowedRunsPerWindow` (maximum retry attempts) and `windowSize` (time window for counting attempts). When exceeded, the workflow pauses for manual intervention. +**skipRebootStep** - Controls whether the node reboot step is executed during the remediation workflow. The default workflow template includes an automatic reboot step to reinitialize GPU hardware after performing the recommended remediation actions. Set this field to `true` to skip the reboot step when the node has already been rebooted manually as part of the remediation process or when a reboot is not desired for the specific error condition. Default value is `false`. + ## Default Workflow Template > **Note:** The `default-template` is automatically created on the cluster by the GPU Operator. diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 82be640a0..21700a740 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -9,4 +9,4 @@ dependencies: repository: file://./charts/remediation version: v1.0.0 digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07 -generated: "2026-01-28T11:30:26.115644041Z" +generated: "2026-02-01T12:58:13.380331409Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 21bac56e5..0edb86212 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -1490,10 +1490,12 @@ spec: pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string ttlForFailedWorkflows: - default: 24 + default: 24h description: Time to live for argo workflow object and its pods - for a failed workflow in hours. By default, it is set to 24 hours - type: integer + for a failed workflow. Accepts duration strings like "30s", "4h", + "24h". By default, it is set to 24h + pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$ + type: string type: object selector: additionalProperties: diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go index d1f6d56d0..3a3e0b7c2 100644 --- a/internal/controllers/remediation_handler.go +++ b/internal/controllers/remediation_handler.go @@ -103,6 +103,7 @@ type ConditionWorkflowMapping struct { NotifyRemediationMessage string `json:"notifyRemediationMessage" yaml:"notifyRemediationMessage"` NotifyTestFailureMessage string `json:"notifyTestFailureMessage" yaml:"notifyTestFailureMessage"` RecoveryPolicy RecoveryPolicyConfig `json:"recoveryPolicy" yaml:"recoveryPolicy"` + SkipRebootStep bool `json:"skipRebootStep" yaml:"skipRebootStep"` } type ValidationTestsProfile struct { @@ -625,7 +626,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context }, }, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "suspend", Template: "suspend"}}}, - {Steps: []workflowv1alpha1.WorkflowStep{{Name: "reboot", Template: "reboot", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}}}}, + {Steps: []workflowv1alpha1.WorkflowStep{{Name: "reboot", Template: "reboot", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}, When: "{{workflow.parameters.skipRebootStep}} == 'false'"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "test", Template: "test", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}}}}, {Steps: []workflowv1alpha1.WorkflowStep{ { @@ -940,8 +941,12 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate wf.Spec.Entrypoint = wfTemplate.Spec.Entrypoint wf.Spec.ServiceAccountName = h.getServiceAccountName(ctx, devConfig) - ttlHours := devConfig.Spec.RemediationWorkflow.TtlForFailedWorkflows - ttlSeconds := int32(ttlHours * 3600) + ttlDuration, err := time.ParseDuration(devConfig.Spec.RemediationWorkflow.TtlForFailedWorkflows) + if err != nil { + log.FromContext(ctx).Error(err, "Failed to parse TTL duration, using default of 24h") + ttlDuration = 24 * time.Hour + } + ttlSeconds := int32(ttlDuration.Seconds()) wf.Spec.TTLStrategy = &workflowv1alpha1.TTLStrategy{ SecondsAfterCompletion: &ttlSeconds, } @@ -1066,6 +1071,10 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate Name: "drain_policy", Value: workflowv1alpha1.AnyStringPtr(string(drainPolicyJSONBytes)), }, + { + Name: "skipRebootStep", + Value: workflowv1alpha1.AnyStringPtr(mapping.SkipRebootStep), + }, }, } diff --git a/tests/helm-e2e/helm_e2e_test.go b/tests/helm-e2e/helm_e2e_test.go index e86f360ff..13f05c6ac 100644 --- a/tests/helm-e2e/helm_e2e_test.go +++ b/tests/helm-e2e/helm_e2e_test.go @@ -971,7 +971,7 @@ deviceConfig: enable: true conditionalWorkflows: name: "conditional-workflows-configmap" - ttlForFailedWorkflows: 36 + ttlForFailedWorkflows: 36h testerImage: "test.io/test/remediation-workflow-tester:v1.3.0" `, extraArgs: []string{"-f", tmpValuesYamlPath, "--set", "crds.defaultCR.upgrade=true"}, @@ -984,7 +984,7 @@ deviceConfig: ConditionalWorkflows: &corev1.LocalObjectReference{ Name: "conditional-workflows-configmap", }, - TtlForFailedWorkflows: 36, + TtlForFailedWorkflows: "36h", TesterImage: "test.io/test/remediation-workflow-tester:v1.3.0", }, },