From 0fbf5bd74e97337a83c642fd8939b2c354ea051c Mon Sep 17 00:00:00 2001
From: Uday Bhaskar <udayb@amd.com>
Date: Thu, 5 Feb 2026 01:07:10 +0530
Subject: [PATCH] GPUOP-540 GPUOP-543 (#1119)

* update ttlForFailedWorkflows field to use duration

* user can choose to skip reboot step in workflow

(cherry picked from commit cfe54c802ddc0d7cd515eb3eb158bcfade336a42)
---
 .wordlist.txt                                     |  3 ++-
 api/v1alpha1/deviceconfig_types.go                |  7 ++++---
 .../amd-gpu-operator.clusterserviceversion.yaml   |  5 +++--
 bundle/manifests/amd.com_deviceconfigs.yaml       |  9 +++++----
 config/crd/bases/amd.com_deviceconfigs.yaml       |  9 +++++----
 .../amd-gpu-operator.clusterserviceversion.yaml   |  3 ++-
 docs/autoremediation/auto-remediation.md          |  4 +++-
 helm-charts-k8s/Chart.lock                        |  2 +-
 helm-charts-k8s/crds/deviceconfig-crd.yaml        |  8 +++++---
 internal/controllers/remediation_handler.go       | 15 ++++++++++++---
 tests/helm-e2e/helm_e2e_test.go                   |  4 ++--
 11 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/.wordlist.txt b/.wordlist.txt
index a6efcba9e..d109f261e 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -181,6 +181,7 @@ TODO
 TLS
 tolerations
 tst
+TTL
 TtlForFailedWorkflows
 ubuntu
 UI
@@ -200,4 +201,4 @@ VFs
 VMs
 webhook
 xgmi
-YAML
\ No newline at end of file
+YAML
diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go
index 85e2db02f..5a23c38b1 100644
--- a/api/v1alpha1/deviceconfig_types.go
+++ b/api/v1alpha1/deviceconfig_types.go
@@ -93,10 +93,11 @@ type RemediationWorkflowSpec struct {
 	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"}
 	ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"`
 
-	// Time to live for argo workflow object and its pods for a failed workflow in hours. By default, it is set to 24 hours
+	// Time to live for argo workflow object and its pods for a failed workflow. Accepts duration strings like "30s", "4h", "24h". By default, it is set to 24h
 	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"}
-	// +kubebuilder:default:=24
-	TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"`
+	// +kubebuilder:default:="24h"
+	// +kubebuilder:validation:Pattern=`^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$`
+	TtlForFailedWorkflows string `json:"ttlForFailedWorkflows,omitempty"`
 
 	// Tester image used to run tests and verify if remediation fixed the reported problem.
 	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TesterImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:testerImage"}
diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
index b95522ca2..171e210b9 100644
--- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -32,7 +32,7 @@ metadata:
     capabilities: Seamless Upgrades
     categories: AI/Machine Learning,Monitoring
     containerImage: docker.io/rocm/gpu-operator:v1.4.0
-    createdAt: "2026-01-28T11:30:39Z"
+    createdAt: "2026-02-01T12:58:28Z"
     description: |-
       Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
       For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -753,7 +753,8 @@ spec:
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:testerImage
       - description: Time to live for argo workflow object and its pods for a failed
-          workflow in hours. By default, it is set to 24 hours
+          workflow. Accepts duration strings like "30s", "4h", "24h". By default,
+          it is set to 24h
         displayName: TtlForFailedWorkflows
         path: remediationWorkflow.ttlForFailedWorkflows
         x-descriptors:
diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml
index c7b9b1968..576007bee 100644
--- a/bundle/manifests/amd.com_deviceconfigs.yaml
+++ b/bundle/manifests/amd.com_deviceconfigs.yaml
@@ -1490,11 +1490,12 @@ spec:
                     pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   ttlForFailedWorkflows:
-                    default: 24
+                    default: 24h
                     description: Time to live for argo workflow object and its pods
-                      for a failed workflow in hours. By default, it is set to 24
-                      hours
-                    type: integer
+                      for a failed workflow. Accepts duration strings like "30s",
+                      "4h", "24h". By default, it is set to 24h
+                    pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
+                    type: string
                 type: object
               selector:
                 additionalProperties:
diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml
index 66845d611..af37025f0 100644
--- a/config/crd/bases/amd.com_deviceconfigs.yaml
+++ b/config/crd/bases/amd.com_deviceconfigs.yaml
@@ -1486,11 +1486,12 @@ spec:
                     pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   ttlForFailedWorkflows:
-                    default: 24
+                    default: 24h
                     description: Time to live for argo workflow object and its pods
-                      for a failed workflow in hours. By default, it is set to 24
-                      hours
-                    type: integer
+                      for a failed workflow. Accepts duration strings like "30s",
+                      "4h", "24h". By default, it is set to 24h
+                    pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
+                    type: string
                 type: object
               selector:
                 additionalProperties:
diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
index 1a9c05967..8ad50a8d1 100644
--- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
+++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
@@ -724,7 +724,8 @@ spec:
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:testerImage
       - description: Time to live for argo workflow object and its pods for a failed
-          workflow in hours. By default, it is set to 24 hours
+          workflow. Accepts duration strings like "30s", "4h", "24h". By default,
+          it is set to 24h
         displayName: TtlForFailedWorkflows
         path: remediationWorkflow.ttlForFailedWorkflows
         x-descriptors:
diff --git a/docs/autoremediation/auto-remediation.md b/docs/autoremediation/auto-remediation.md
index c4dab6b72..a056fa2c5 100644
--- a/docs/autoremediation/auto-remediation.md
+++ b/docs/autoremediation/auto-remediation.md
@@ -106,7 +106,7 @@ type RemediationWorkflowSpec struct {
 
 > **Note:** The `default-conditional-workflow-mappings` ConfigMap is created automatically by the GPU Operator.
 
-**TtlForFailedWorkflows** - Specifies the time-to-live (in hours) for failed workflow objects and their associated pods. Failed workflows are retained temporarily to allow inspection and troubleshooting. After the configured time period elapses, the failed workflow resources are automatically cleaned up. Default value is 24 hours.
+**TtlForFailedWorkflows** - Defines the time-to-live (TTL) duration for retaining failed workflow objects and their associated pods before automatic cleanup. This field accepts a duration string in standard formats (e.g., "24h", "30m", "1h30m"). Retaining failed workflows allows for post-mortem analysis and troubleshooting. Once the specified duration expires, the workflow resources are automatically garbage collected by the system. The default retention period is 24 hours.
 
 **TesterImage** - Specifies the container image for executing GPU validation tests during remediation workflows. This image must align with `Spec.TestRunner.Image` specifications and runs test suites to verify GPU health after remediation completion. If unspecified, the default image is `docker.io/rocm/test-runner:v1.4.1`.
 
@@ -193,6 +193,8 @@ The following example demonstrates a complete error mapping configuration:
 
 **recoveryPolicy** - Defines limits on remediation attempts to prevent excessive recovery cycles. Includes `maxAllowedRunsPerWindow` (maximum retry attempts) and `windowSize` (time window for counting attempts). When exceeded, the workflow pauses for manual intervention.
 
+**skipRebootStep** - Controls whether the node reboot step is executed during the remediation workflow. The default workflow template includes an automatic reboot step to reinitialize GPU hardware after performing the recommended remediation actions. Set this field to `true` to skip the reboot step when the node has already been rebooted manually as part of the remediation process or when a reboot is not desired for the specific error condition. Default value is `false`.
+
 ## Default Workflow Template
 
 > **Note:** The `default-template` is automatically created on the cluster by the GPU Operator.
diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
index 82be640a0..21700a740 100644
--- a/helm-charts-k8s/Chart.lock
+++ b/helm-charts-k8s/Chart.lock
@@ -9,4 +9,4 @@ dependencies:
   repository: file://./charts/remediation
   version: v1.0.0
 digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07
-generated: "2026-01-28T11:30:26.115644041Z"
+generated: "2026-02-01T12:58:13.380331409Z"
diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml
index 21bac56e5..0edb86212 100644
--- a/helm-charts-k8s/crds/deviceconfig-crd.yaml
+++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml
@@ -1490,10 +1490,12 @@ spec:
                     pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
                     type: string
                   ttlForFailedWorkflows:
-                    default: 24
+                    default: 24h
                     description: Time to live for argo workflow object and its pods
-                      for a failed workflow in hours. By default, it is set to 24 hours
-                    type: integer
+                      for a failed workflow. Accepts duration strings like "30s", "4h",
+                      "24h". By default, it is set to 24h
+                    pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
+                    type: string
                 type: object
               selector:
                 additionalProperties:
diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go
index d1f6d56d0..3a3e0b7c2 100644
--- a/internal/controllers/remediation_handler.go
+++ b/internal/controllers/remediation_handler.go
@@ -103,6 +103,7 @@ type ConditionWorkflowMapping struct {
 	NotifyRemediationMessage string                 `json:"notifyRemediationMessage" yaml:"notifyRemediationMessage"`
 	NotifyTestFailureMessage string                 `json:"notifyTestFailureMessage" yaml:"notifyTestFailureMessage"`
 	RecoveryPolicy           RecoveryPolicyConfig   `json:"recoveryPolicy" yaml:"recoveryPolicy"`
+	SkipRebootStep           bool                   `json:"skipRebootStep" yaml:"skipRebootStep"`
 }
 
 type ValidationTestsProfile struct {
@@ -625,7 +626,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 						},
 						},
 						{Steps: []workflowv1alpha1.WorkflowStep{{Name: "suspend", Template: "suspend"}}},
-						{Steps: []workflowv1alpha1.WorkflowStep{{Name: "reboot", Template: "reboot", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}}}},
+						{Steps: []workflowv1alpha1.WorkflowStep{{Name: "reboot", Template: "reboot", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}, When: "{{workflow.parameters.skipRebootStep}} == 'false'"}}},
 						{Steps: []workflowv1alpha1.WorkflowStep{{Name: "test", Template: "test", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}}}},
 						{Steps: []workflowv1alpha1.WorkflowStep{
 							{
@@ -940,8 +941,12 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate
 
 	wf.Spec.Entrypoint = wfTemplate.Spec.Entrypoint
 	wf.Spec.ServiceAccountName = h.getServiceAccountName(ctx, devConfig)
-	ttlHours := devConfig.Spec.RemediationWorkflow.TtlForFailedWorkflows
-	ttlSeconds := int32(ttlHours * 3600)
+	ttlDuration, err := time.ParseDuration(devConfig.Spec.RemediationWorkflow.TtlForFailedWorkflows)
+	if err != nil {
+		log.FromContext(ctx).Error(err, "Failed to parse TTL duration, using default of 24h")
+		ttlDuration = 24 * time.Hour
+	}
+	ttlSeconds := int32(ttlDuration.Seconds())
 	wf.Spec.TTLStrategy = &workflowv1alpha1.TTLStrategy{
 		SecondsAfterCompletion: &ttlSeconds,
 	}
@@ -1066,6 +1071,10 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate
 				Name:  "drain_policy",
 				Value: workflowv1alpha1.AnyStringPtr(string(drainPolicyJSONBytes)),
 			},
+			{
+				Name:  "skipRebootStep",
+				Value: workflowv1alpha1.AnyStringPtr(mapping.SkipRebootStep),
+			},
 		},
 	}
 
diff --git a/tests/helm-e2e/helm_e2e_test.go b/tests/helm-e2e/helm_e2e_test.go
index e86f360ff..13f05c6ac 100644
--- a/tests/helm-e2e/helm_e2e_test.go
+++ b/tests/helm-e2e/helm_e2e_test.go
@@ -971,7 +971,7 @@ deviceConfig:
       enable: true
       conditionalWorkflows:
         name: "conditional-workflows-configmap"
-      ttlForFailedWorkflows: 36
+      ttlForFailedWorkflows: 36h
       testerImage: "test.io/test/remediation-workflow-tester:v1.3.0"
 `,
 			extraArgs:            []string{"-f", tmpValuesYamlPath, "--set", "crds.defaultCR.upgrade=true"},
@@ -984,7 +984,7 @@ deviceConfig:
 					ConditionalWorkflows: &corev1.LocalObjectReference{
 						Name: "conditional-workflows-configmap",
 					},
-					TtlForFailedWorkflows: 36,
+					TtlForFailedWorkflows: "36h",
 					TesterImage:           "test.io/test/remediation-workflow-tester:v1.3.0",
 				},
 			},