Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ TODO
TLS
tolerations
tst
TTL
TtlForFailedWorkflows
ubuntu
UI
Expand All @@ -200,4 +201,4 @@ VFs
VMs
webhook
xgmi
YAML
YAML
7 changes: 4 additions & 3 deletions api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,11 @@ type RemediationWorkflowSpec struct {
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"}
ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"`

// Time to live for argo workflow object and its pods for a failed workflow in hours. By default, it is set to 24 hours
// Time to live for argo workflow object and its pods for a failed workflow. Accepts duration strings like "30s", "4h", "24h". By default, it is set to 24h
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"}
// +kubebuilder:default:=24
TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"`
// +kubebuilder:default:="24h"
// +kubebuilder:validation:Pattern=`^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$`
TtlForFailedWorkflows string `json:"ttlForFailedWorkflows,omitempty"`

// Tester image used to run tests and verify if remediation fixed the reported problem.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TesterImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:testerImage"}
Expand Down
5 changes: 3 additions & 2 deletions bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ metadata:
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/gpu-operator:v1.4.0
createdAt: "2026-01-28T11:30:39Z"
createdAt: "2026-02-01T12:58:28Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
Expand Down Expand Up @@ -753,7 +753,8 @@ spec:
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:testerImage
- description: Time to live for argo workflow object and its pods for a failed
workflow in hours. By default, it is set to 24 hours
workflow. Accepts duration strings like "30s", "4h", "24h". By default,
it is set to 24h
displayName: TtlForFailedWorkflows
path: remediationWorkflow.ttlForFailedWorkflows
x-descriptors:
Expand Down
9 changes: 5 additions & 4 deletions bundle/manifests/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1490,11 +1490,12 @@ spec:
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
ttlForFailedWorkflows:
default: 24
default: 24h
description: Time to live for argo workflow object and its pods
for a failed workflow in hours. By default, it is set to 24
hours
type: integer
for a failed workflow. Accepts duration strings like "30s",
"4h", "24h". By default, it is set to 24h
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
type: object
selector:
additionalProperties:
Expand Down
9 changes: 5 additions & 4 deletions config/crd/bases/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1486,11 +1486,12 @@ spec:
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
ttlForFailedWorkflows:
default: 24
default: 24h
description: Time to live for argo workflow object and its pods
for a failed workflow in hours. By default, it is set to 24
hours
type: integer
for a failed workflow. Accepts duration strings like "30s",
"4h", "24h". By default, it is set to 24h
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
type: object
selector:
additionalProperties:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -724,7 +724,8 @@ spec:
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:testerImage
- description: Time to live for argo workflow object and its pods for a failed
workflow in hours. By default, it is set to 24 hours
workflow. Accepts duration strings like "30s", "4h", "24h". By default,
it is set to 24h
displayName: TtlForFailedWorkflows
path: remediationWorkflow.ttlForFailedWorkflows
x-descriptors:
Expand Down
4 changes: 3 additions & 1 deletion docs/autoremediation/auto-remediation.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ type RemediationWorkflowSpec struct {

> **Note:** The `default-conditional-workflow-mappings` ConfigMap is created automatically by the GPU Operator.

**TtlForFailedWorkflows** - Specifies the time-to-live (in hours) for failed workflow objects and their associated pods. Failed workflows are retained temporarily to allow inspection and troubleshooting. After the configured time period elapses, the failed workflow resources are automatically cleaned up. Default value is 24 hours.
**TtlForFailedWorkflows** - Defines the time-to-live (TTL) for retaining failed workflow objects and their associated pods before automatic cleanup. This field accepts a Go-style duration string made of one or more number-plus-unit segments, using the units `ns`, `us`, `µs`, `ms`, `s`, `m`, or `h` (e.g., "24h", "30m", "1h30m"); days are not a supported unit, so write `48h` rather than `2d`. Retaining failed workflows allows for post-mortem analysis and troubleshooting. Once the specified duration expires, the workflow resources are automatically garbage collected by the system. The default value is `24h`.

**TesterImage** - Specifies the container image for executing GPU validation tests during remediation workflows. This image must align with `Spec.TestRunner.Image` specifications and runs test suites to verify GPU health after remediation completion. If unspecified, the default image is `docker.io/rocm/test-runner:v1.4.1`.

Expand Down Expand Up @@ -193,6 +193,8 @@ The following example demonstrates a complete error mapping configuration:

**recoveryPolicy** - Defines limits on remediation attempts to prevent excessive recovery cycles. Includes `maxAllowedRunsPerWindow` (maximum retry attempts) and `windowSize` (time window for counting attempts). When exceeded, the workflow pauses for manual intervention.

**skipRebootStep** - Controls whether the node reboot step is executed during the remediation workflow. The default workflow template includes an automatic reboot step to reinitialize GPU hardware after performing the recommended remediation actions. Set this field to `true` to skip the reboot step when the node has already been rebooted manually as part of the remediation process or when a reboot is not desired for the specific error condition. Default value is `false`.

## Default Workflow Template

> **Note:** The `default-template` is automatically created on the cluster by the GPU Operator.
Expand Down
2 changes: 1 addition & 1 deletion helm-charts-k8s/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ dependencies:
repository: file://./charts/remediation
version: v1.0.0
digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07
generated: "2026-01-28T11:30:26.115644041Z"
generated: "2026-02-01T12:58:13.380331409Z"
8 changes: 5 additions & 3 deletions helm-charts-k8s/crds/deviceconfig-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1490,10 +1490,12 @@ spec:
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
ttlForFailedWorkflows:
default: 24
default: 24h
description: Time to live for argo workflow object and its pods
for a failed workflow in hours. By default, it is set to 24 hours
type: integer
for a failed workflow. Accepts duration strings like "30s", "4h",
"24h". By default, it is set to 24h
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
type: object
selector:
additionalProperties:
Expand Down
15 changes: 12 additions & 3 deletions internal/controllers/remediation_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ type ConditionWorkflowMapping struct {
NotifyRemediationMessage string `json:"notifyRemediationMessage" yaml:"notifyRemediationMessage"`
NotifyTestFailureMessage string `json:"notifyTestFailureMessage" yaml:"notifyTestFailureMessage"`
RecoveryPolicy RecoveryPolicyConfig `json:"recoveryPolicy" yaml:"recoveryPolicy"`
SkipRebootStep bool `json:"skipRebootStep" yaml:"skipRebootStep"`
}

type ValidationTestsProfile struct {
Expand Down Expand Up @@ -625,7 +626,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
},
},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "suspend", Template: "suspend"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "reboot", Template: "reboot", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "reboot", Template: "reboot", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}, When: "{{workflow.parameters.skipRebootStep}} == 'false'"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "test", Template: "test", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}}}},
{Steps: []workflowv1alpha1.WorkflowStep{
{
Expand Down Expand Up @@ -940,8 +941,12 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate

wf.Spec.Entrypoint = wfTemplate.Spec.Entrypoint
wf.Spec.ServiceAccountName = h.getServiceAccountName(ctx, devConfig)
ttlHours := devConfig.Spec.RemediationWorkflow.TtlForFailedWorkflows
ttlSeconds := int32(ttlHours * 3600)
ttlDuration, err := time.ParseDuration(devConfig.Spec.RemediationWorkflow.TtlForFailedWorkflows)
if err != nil {
log.FromContext(ctx).Error(err, "Failed to parse TTL duration, using default of 24h")
ttlDuration = 24 * time.Hour
}
ttlSeconds := int32(ttlDuration.Seconds())
wf.Spec.TTLStrategy = &workflowv1alpha1.TTLStrategy{
SecondsAfterCompletion: &ttlSeconds,
}
Expand Down Expand Up @@ -1066,6 +1071,10 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate
Name: "drain_policy",
Value: workflowv1alpha1.AnyStringPtr(string(drainPolicyJSONBytes)),
},
{
Name: "skipRebootStep",
Value: workflowv1alpha1.AnyStringPtr(mapping.SkipRebootStep),
},
},
}

Expand Down
4 changes: 2 additions & 2 deletions tests/helm-e2e/helm_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -971,7 +971,7 @@ deviceConfig:
enable: true
conditionalWorkflows:
name: "conditional-workflows-configmap"
ttlForFailedWorkflows: 36
ttlForFailedWorkflows: 36h
testerImage: "test.io/test/remediation-workflow-tester:v1.3.0"
`,
extraArgs: []string{"-f", tmpValuesYamlPath, "--set", "crds.defaultCR.upgrade=true"},
Expand All @@ -984,7 +984,7 @@ deviceConfig:
ConditionalWorkflows: &corev1.LocalObjectReference{
Name: "conditional-workflows-configmap",
},
TtlForFailedWorkflows: 36,
TtlForFailedWorkflows: "36h",
TesterImage: "test.io/test/remediation-workflow-tester:v1.3.0",
},
},
Expand Down