diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index c4f2beaa6..85e2db02f 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -108,6 +108,22 @@ type RemediationWorkflowSpec struct { //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="MaxParallelWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows"} // +optional MaxParallelWorkflows int `json:"maxParallelWorkflows"` + + // Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator. + // If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation. + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeRemediationTaints",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationTaints"} + // +optional + NodeRemediationTaints []v1.Taint `json:"nodeRemediationTaints,omitempty"` + + // Node Remediation labels are custom labels that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator. + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeRemediationLabels",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationLabels"} + // +optional + NodeRemediationLabels map[string]string `json:"nodeRemediationLabels,omitempty"` + + // Node drain policy during remediation workflow execution + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeDrainPolicy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy"} + // +optional + NodeDrainPolicy *DrainSpec `json:"nodeDrainPolicy,omitempty"` } type RegistryTLS struct { @@ -319,6 +335,14 @@ type DrainSpec struct { // +optional // +kubebuilder:default:=-1 GracePeriodSeconds int `json:"gracePeriodSeconds,omitempty"` + // IgnoreDaemonSets indicates whether to ignore DaemonSet-managed pods + // +optional + // +kubebuilder:default:=true + IgnoreDaemonSets *bool `json:"ignoreDaemonSets,omitempty"` + // IgnoreNamespaces is the list of namespaces to ignore during node drain operation. + // This is useful to avoid draining pods from critical namespaces like 'kube-system', etc. + // +optional + IgnoreNamespaces []string `json:"ignoreNamespaces,omitempty"` } type PodDeletionSpec struct { diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 3dbd9d698..091e81e93 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -323,6 +323,16 @@ func (in *DrainSpec) DeepCopyInto(out *DrainSpec) { *out = new(bool) **out = **in } + if in.IgnoreDaemonSets != nil { + in, out := &in.IgnoreDaemonSets, &out.IgnoreDaemonSets + *out = new(bool) + **out = **in + } + if in.IgnoreNamespaces != nil { + in, out := &in.IgnoreNamespaces, &out.IgnoreNamespaces + *out = make([]string, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DrainSpec. 
@@ -738,6 +748,25 @@ func (in *RemediationWorkflowSpec) DeepCopyInto(out *RemediationWorkflowSpec) { *out = new(v1.LocalObjectReference) **out = **in } + if in.NodeRemediationTaints != nil { + in, out := &in.NodeRemediationTaints, &out.NodeRemediationTaints + *out = make([]v1.Taint, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.NodeRemediationLabels != nil { + in, out := &in.NodeRemediationLabels, &out.NodeRemediationLabels + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.NodeDrainPolicy != nil { + in, out := &in.NodeDrainPolicy, &out.NodeDrainPolicy + *out = new(DrainSpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemediationWorkflowSpec. diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index fac57f019..b95522ca2 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -32,7 +32,7 @@ metadata: capabilities: Seamless Upgrades categories: AI/Machine Learning,Monitoring containerImage: docker.io/rocm/gpu-operator:v1.4.0 - createdAt: "2025-12-09T09:27:50Z" + createdAt: "2026-01-28T11:30:39Z" description: |- Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) @@ -725,6 +725,27 @@ spec: path: remediationWorkflow.maxParallelWorkflows x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows + - description: Node drain policy during remediation workflow execution + displayName: NodeDrainPolicy + path: remediationWorkflow.nodeDrainPolicy + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy + - description: Node Remediation labels are custom labels that we can apply on + the node to specify that the node is undergoing remediation or needs attention + by the administrator. + displayName: NodeRemediationLabels + path: remediationWorkflow.nodeRemediationLabels + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationLabels + - description: Node Remediation taints are custom taints that we can apply on + the node to specify that the node is undergoing remediation or needs attention + by the administrator. If user does not specify any taints, the operator + will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" + to the node under remediation. + displayName: NodeRemediationTaints + path: remediationWorkflow.nodeRemediationTaints + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationTaints - description: Tester image used to run tests and verify if remediation fixed the reported problem. 
displayName: TesterImage diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index ae180f96a..c7b9b1968 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -596,6 +596,18 @@ spec: waits for a pod to shut down gracefully after receiving a termination signal type: integer + ignoreDaemonSets: + default: true + description: IgnoreDaemonSets indicates whether to ignore + DaemonSet-managed pods + type: boolean + ignoreNamespaces: + description: |- + IgnoreNamespaces is the list of namespaces to ignore during node drain operation. + This is useful to avoid draining pods from critical namespaces like 'kube-system', etc. + items: + type: string + type: array timeoutSeconds: default: 300 description: TimeoutSecond specifies the length of time @@ -1399,6 +1411,79 @@ spec: remediation workflows can be executed in parallel. 0 is the default value and it means no limit. type: integer + nodeDrainPolicy: + description: Node drain policy during remediation workflow execution + properties: + force: + default: false + description: Force indicates if force draining is allowed + type: boolean + gracePeriodSeconds: + default: -1 + description: GracePeriodSeconds indicates the time kubernetes + waits for a pod to shut down gracefully after receiving + a termination signal + type: integer + ignoreDaemonSets: + default: true + description: IgnoreDaemonSets indicates whether to ignore + DaemonSet-managed pods + type: boolean + ignoreNamespaces: + description: |- + IgnoreNamespaces is the list of namespaces to ignore during node drain operation. + This is useful to avoid draining pods from critical namespaces like 'kube-system', etc. + items: + type: string + type: array + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time in + seconds to wait before giving up drain, zero means infinite + minimum: 0 + type: integer + type: object + nodeRemediationLabels: + additionalProperties: + type: string + description: Node Remediation labels are custom labels that we + can apply on the node to specify that the node is undergoing + remediation or needs attention by the administrator. + type: object + nodeRemediationTaints: + description: |- + Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator. + If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a + node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint + key. + type: string + required: + - effect + - key + type: object + type: array testerImage: description: Tester image used to run tests and verify if remediation fixed the reported problem. 
diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 80e1beb12..66845d611 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -592,6 +592,18 @@ spec: waits for a pod to shut down gracefully after receiving a termination signal type: integer + ignoreDaemonSets: + default: true + description: IgnoreDaemonSets indicates whether to ignore + DaemonSet-managed pods + type: boolean + ignoreNamespaces: + description: |- + IgnoreNamespaces is the list of namespaces to ignore during node drain operation. + This is useful to avoid draining pods from critical namespaces like 'kube-system', etc. + items: + type: string + type: array timeoutSeconds: default: 300 description: TimeoutSecond specifies the length of time @@ -1395,6 +1407,79 @@ spec: remediation workflows can be executed in parallel. 0 is the default value and it means no limit. type: integer + nodeDrainPolicy: + description: Node drain policy during remediation workflow execution + properties: + force: + default: false + description: Force indicates if force draining is allowed + type: boolean + gracePeriodSeconds: + default: -1 + description: GracePeriodSeconds indicates the time kubernetes + waits for a pod to shut down gracefully after receiving + a termination signal + type: integer + ignoreDaemonSets: + default: true + description: IgnoreDaemonSets indicates whether to ignore + DaemonSet-managed pods + type: boolean + ignoreNamespaces: + description: |- + IgnoreNamespaces is the list of namespaces to ignore during node drain operation. + This is useful to avoid draining pods from critical namespaces like 'kube-system', etc. + items: + type: string + type: array + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time in + seconds to wait before giving up drain, zero means infinite + minimum: 0 + type: integer + type: object + nodeRemediationLabels: + additionalProperties: + type: string + description: Node Remediation labels are custom labels that we + can apply on the node to specify that the node is undergoing + remediation or needs attention by the administrator. + type: object + nodeRemediationTaints: + description: |- + Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator. + If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a + node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint + key. + type: string + required: + - effect + - key + type: object + type: array testerImage: description: Tester image used to run tests and verify if remediation fixed the reported problem. 
diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index 5a4d28ff2..1a9c05967 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -696,6 +696,27 @@ spec: path: remediationWorkflow.maxParallelWorkflows x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows + - description: Node drain policy during remediation workflow execution + displayName: NodeDrainPolicy + path: remediationWorkflow.nodeDrainPolicy + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy + - description: Node Remediation labels are custom labels that we can apply on + the node to specify that the node is undergoing remediation or needs attention + by the administrator. + displayName: NodeRemediationLabels + path: remediationWorkflow.nodeRemediationLabels + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationLabels + - description: Node Remediation taints are custom taints that we can apply on + the node to specify that the node is undergoing remediation or needs attention + by the administrator. If user does not specify any taints, the operator + will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" + to the node under remediation. + displayName: NodeRemediationTaints + path: remediationWorkflow.nodeRemediationTaints + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationTaints - description: Tester image used to run tests and verify if remediation fixed the reported problem. displayName: TesterImage diff --git a/docs/autoremediation/auto-remediation.md b/docs/autoremediation/auto-remediation.md index a619d7133..382ce32c2 100644 --- a/docs/autoremediation/auto-remediation.md +++ b/docs/autoremediation/auto-remediation.md @@ -1,204 +1,257 @@ -# Auto Remediation of GPU nodes using Argo Workflows +# Auto Remediation of GPU nodes -The GPU Operator supports remediation of GPU worker nodes that have moved into an unhealthy state due to GPU problems by triggering a workflow (set of steps) which attempts to remediate the issue. To achieve this, the GPU Operator makes use of Argo Workflows and its workflow templates. Argo Workflows is a popular open-source workflow engine for Kubernetes. It is lightweight and scalable. The GPU Operator, as part of its helm installation, installs the following: +The GPU Operator provides automatic remediation for GPU worker nodes that become unhealthy due to GPU-related issues. When such problems are detected, the operator triggers a workflow—a series of automated steps designed to restore the node to a healthy state. This functionality is powered by Argo Workflows, a lightweight and scalable open-source workflow engine for Kubernetes. Through the DeviceConfig Custom Resource, the GPU Operator offers extensive customization options for configuring remediation behavior. 
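In its simplest form, the feature is switched on from the DeviceConfig Custom Resource; the customization options described later in this document build on top of this minimal sketch (the resource name and namespace below are illustrative):

```yaml
apiVersion: amd.com/v1alpha1
kind: DeviceConfig
metadata:
  name: gpu-deviceconfig        # illustrative name
  namespace: kube-amd-gpu       # illustrative namespace
spec:
  remediationWorkflow:
    enable: true                # auto-remediation is disabled by default
```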
-1) Argo workflow controller as a k8s deployment -2) Argo CRDs for defining workflow templates and workflos +## Auto-Remediation Workflow Overview -GPU Operator installs Argo v3.6.5 +The following diagram illustrates the end-to-end flow of automatic remediation: -The source yaml to install it is present here: https://github.com/argoproj/argo-workflows/releases/download/v3.6.5/install.yaml +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ GPU Worker Node │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────────────────┐ │ +│ │ Device Metrics │ │ +│ │ Exporter │ Reports inband-RAS errors │ +│ └───────────┬────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────┐ │ +│ │ Node Problem │ Queries for inband-RAS errors │ +│ │ Detector (NPD) │ and marks node condition as True │ +│ └───────────┬────────────┘ │ +│ │ │ +└──────────────┼────────────────────────────────────────────────────────────────┘ + │ + │ Node condition status update + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Controller Node │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────────────────┐ │ +│ │ GPU Operator │ Observes node error conditions │ +│ │ │ │ +│ └───────────┬────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────┐ │ +│ │ Argo Workflow │ Triggers remediation workflow │ +│ │ Controller │ for the affected node │ +│ └────────────────────────┘ │ +│ │ │ +└──────────────┼────────────────────────────────────────────────────────────────┘ + │ + │ Executes remediation steps + ▼ + Affected GPU Worker Node +``` -It has been modified to fit the requirements of this feature. For example, the workflow server is not necessary, so it doesn't get deployed as part of the -GPU Operator-packaged argo installation +The Node Problem Detector (NPD) maintains a unique node condition for each error type, enabling users to configure different remediation actions tailored to specific error conditions. -## About Workflows and Workflow Templates +> **Note:** The GPU Operator prevents multiple concurrent workflows on the same node. When a node is tainted and a workflow is already executing, no additional workflows will be triggered on that node until the current workflow completes. -The workflow controller is responsible for running a workflow and managing its lifecycle. +## Pre-requisites -Argo workflows by default uses Kubernetes API Server(etcd) as its database. Once a workflow is triggered, the controller maintains the running state of the workflow and persists in the database. In case workflow controller restarts in between, we still have the state. +Automatic node remediation requires the following components to be enabled and running on the cluster: -A typical workflow refers a workflow template. A workflow template can either be used to define a specific work, or it can be used to orchestrate a workflow. Each task within a workflow is run inside a container. +1. **Device Metrics Exporter** - Reports unhealthy metrics and inband-RAS errors that are used to detect faulty GPUs. +2. **Node Problem Detector (NPD)** - An open-source Kubernetes component that runs on all nodes to identify node issues and report them to upstream controllers in the Kubernetes management stack. For more information about NPD configuration, see the [NPD documentation](../npd/node-problem-detector.md). 
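For reference, a GPU health condition set by NPD shows up under the node's `status.conditions`, and the operator reacts when the status flips to `True`. The condition below reuses the example values from this document; the condition type and reason string depend on the NPD configuration:

```yaml
status:
  conditions:
  - type: AMDGPUUnhealthy
    status: "True"
    reason: "Temperature Threshold Exceeded"   # example reason string
    lastHeartbeatTime: "2025-08-04T08:56:04Z"
    lastTransitionTime: "2025-08-04T08:56:04Z"
```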
-Creating a `workflow-template` on the cluster will store the template with its steps in k8s apiserver (etcd) but not trigger any action. -Creating a `workflow` which invokes a `workflow-template` will store the workflow in k8s apiserver(etcd) and also trigger the actual steps in the template. -GPU Operator creates the `workflow` which invokes the `workflow-template` to trigger remediation +## Installation -## Configuration to be handled by the User +The GPU Operator Helm installation includes the following Argo Workflows components: --> Toggling `RemediationWorkflow.Enable` to True. +1. Argo workflow controller (deployed as a Kubernetes deployment) +2. Argo CRDs for defining workflow templates and workflows --> NPD daemonset is relied upon to verify that the issue is fixed during the workflow run. Hence, user needs to add this toleration to NPD daemonset so that it can continue to be scheduled during the workflow run: +The GPU Operator installs Argo Workflows v3.6.5, using a [customized installation YAML](https://github.com/argoproj/argo-workflows/releases/download/v3.6.5/install.yaml) tailored for auto-remediation requirements. This customization excludes components not needed for remediation, such as the Argo workflow server. For more information about Argo Workflows concepts, refer to the [official documentation](https://argo-workflows.readthedocs.io/en/release-3.6/workflow-concepts/). - `amd-gpu-unhealthy:NoSchedule op=Exists` +> **Note:** By default, auto-remediation components (workflow controller and CRDs) are installed during Helm deployment. To disable the installation of these components, use the following Helm flag: +> +> ```bash +> --set remediation.enabled=false +> ``` -GPU Operator will handle adding this toleration for in-house components like KMM, metrics-exporter which should stay running during the workflow run +## Configuration and customization --> Remediation workflow uses a utility image for executing the steps. Specify the utility image in `Spec.CommonConfig.UtilsContainer` section of Device Config. If the UtilsContainer section is not specified, default image used is `docker.io/rocm/gpu-operator-utils:latest` +### Device Config configuration --> Specify the test runner image in field `RemediationWorkflow.TesterImage`. The image can be one of the images supported by `Spec.TestRunner.Image`. This image is used to test the GPUs after the remediation process is performed. If the field is not specified, default image used is `rocm/test-runner:agfhc-latest`. +The DeviceConfig Custom Resource includes a `RemediationWorkflowSpec` section for configuring and customizing the auto-remediation feature: --> If a workflow runs and fails, the node will remain in tainted state. If the user wants to go ahead and make the node schedulable again for workloads, the node should be untainted with: - `kubectl taint node amd-gpu-unhealthy:NoSchedule-` +```yaml +type RemediationWorkflowSpec struct { + Enable *bool -## How Workflows are triggered + ConditionalWorkflows *v1.LocalObjectReference -Node problem detector (NPD) can set the node conditions by listening to GPU health reported by device metrics exporter periodically. -GPU-Operator keeps monitoring the node conditions periodically and creates appropriate workflow based on the node condition status moving to `True`. 
For example, the below node condition would mean node is in a bad state: + TtlForFailedWorkflows int -```yaml - - lastHeartbeatTime: "2025-08-04T08:56:04Z" - lastTransitionTime: "2025-08-04T08:56:04Z" - reason: "Temperature Threshold Exceeded" - status: "True" - type: AMDGPUUnhealthy -``` + TesterImage string -When the status of the node condition is `False`, it means that node condition is currently fine and in good state. -These are the new fields introduced under the RemediationWorkflow field in the DeviceConfig CR: + MaxParallelWorkflows int -```yaml - type RemediationWorkflowSpec struct { - // enable remediation workflows. disabled by default - // enable if operator should automatically handle remediation of node incase of gpu issues - //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Enable",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:enable"} - Enable *bool `json:"enable,omitempty"` - - // Name of the ConfigMap that holds condition-to-workflow mappings. - //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"} - ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"` - - // Time to live for argo workflow object and its pods for a failed workflow in hours. By default, it is set to 24 hours - //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"} - // +kubebuilder:default:=24 - TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"` - } -``` -The mappings are present in the configmap referenced by the ConditionalWorkflows field. -GPU-Operator will create the `default-conditional-workflow-mappings` configmap on the cluster with some default mappings. The user can modify them if required and can add more mappings as well. If the user wants to use this default configmap, then they may leave the `RemediationWorkflow.ConditionalWorkflows` field empty in the CR. The user can also come up with their own configmap and mention the name of the configmap under `RemediationWorkflow.ConditionalWorkflows` if they do not want to use the default `default-conditional-workflow-mappings` configmap. - -Note: `default-conditional-workflow-mappings` will be created on the cluster by GPU-Operator + NodeRemediationLabels map[string]string -```yaml -apiVersion: v1 -kind: ConfigMap -data: - workflow: |- - - nodeCondition: "AMDGPUUnhealthy" - workflowTemplate: "default-template" - notifyMessage: "notification message for admin(if any) to take manual remediation action" - validationTestsProfile: - framework: "AGFHC" - recipe: "all_lvl4" - iterations: 1 - stopOnFailure: true - timeoutSeconds: 4800 + NodeRemediationTaints []v1.Taint + + NodeDrainPolicy *DrainSpec +} ``` -`NodeCondition` field refers to the node condition that the user wants the Operator to watch for and to trigger remediation workflow. +**Enable** - Controls whether automatic node remediation is enabled. Set this field to `true` to activate the auto-remediation feature in the cluster. -`WorkflowTemplate` will use the default-template in most cases which is discussed below. If user wants to use his own workflow template for a certain node condition, he can create the template in the cluster and mention the name of the template in this field but the recommended way is to let Operator handle it through the default-template. 
+**ConditionalWorkflows** - References a ConfigMap that contains mappings between node conditions and their corresponding remediation workflows. The GPU Operator automatically creates a `default-conditional-workflow-mappings` ConfigMap with predefined mappings. Users can either modify this default ConfigMap or create their own custom ConfigMap. If left empty, the default ConfigMap will be used automatically. More about the ConfigMap in [below section](auto-remediation.md#remediation-workflow-configmap). -`notifyMessage` contains remediation instructions for the admin in case the node problem requires manual action. Workflow will trigger a Kubernetes event with the content of **notifyMessage** to alert the admin. +> **Note:** The `default-conditional-workflow-mappings` ConfigMap is created automatically by the GPU Operator. -`validationTestsProfile` field refers to the AGFHC/RVS test-profile to be run by the workflow to verify that the problem is fixed. The test-profile will be passed onto testrunner for it to be run. +**TtlForFailedWorkflows** - Specifies the time-to-live (in hours) for failed workflow objects and their associated pods. Failed workflows are retained temporarily to allow inspection and troubleshooting. After the configured time period elapses, the failed workflow resources are automatically cleaned up. Default value is 24 hours. -```yaml - validationTestsProfile: - framework: "AGFHC" - recipe: "all_lvl4" - iterations: 1 - stopOnFailure: true - timeoutSeconds: 4800` - ``` +**TesterImage** - Specifies the container image for executing GPU validation tests during remediation workflows. This image must align with `Spec.TestRunner.Image` specifications and runs test suites to verify GPU health after remediation completion. If unspecified, the default image is `docker.io/rocm/test-runner:v1.4.1`. -If a user would like to run a testsuite as part of the workflow, these fields under `validationTestsProfile` are mandatory and they correspond to the fields of the same in the [Test Runner Documentation](../test/manual-test.md) +> **Note:** The default image supports only RVS test execution. For AGFHC test framework support within workflows, contact your AMD representative to obtain access to the AGFHC-enabled test runner image. -`physicalActionNeeded` field refers to the physical action the user has to take for certain conditions that will not be fixed by a reboot. The action will be mentioned for each of those conditions in the `default-conditional-workflow-mappings`. For conditions where reboot fixes the issue, this field will be left empty. +**MaxParallelWorkflows** - Limits the maximum number of remediation workflows that can execute concurrently across the cluster. This setting helps maintain minimum node availability by preventing excessive simultaneous remediation operations. A value of zero (default) means no limit is enforced. -This integration works on the basis that NPD applies different node conditions for different critical errors. +When the number of triggered workflows exceeds this limit, additional workflows are queued by the Argo workflow controller in a **Pending** state. Queued workflows remain pending until an active workflow completes, freeing a slot within the configured parallelism limit. -Note: Operator ensures that when a node is tainted and a workflow is already running, we don’t trigger any new workflows on the node. +**NodeRemediationLabels** - Defines custom labels to be applied to nodes during automatic remediation workflows. 
These labels persist throughout the remediation process and can be used for monitoring, tracking, or applying custom policies. -## Enable auto remediation +**NodeRemediationTaints** - Specifies custom taints to be applied to nodes during the remediation process. If no taints are specified, the Operator applies the default taint `amd-gpu-unhealthy:NoSchedule` to prevent workload scheduling on the affected node. -To enable this feature, the user needs to toggle `RemediationWorkflow.Enable` to true in the Device Config CR. It is disabled by default. -The most common CR users will be using will be of this form which will use the `default-conditional-workflow-mappings` for ConditionalWorkflows field unless the user wants to create their own configmap. +**NodeDrainPolicy** - Configures the pod eviction behavior when draining workloads from nodes during the remediation process. This policy controls how pods are removed, including timeout settings, grace periods, and namespace exclusions. See the [Node Drain Policy Configuration](#node-drain-policy-configuration) section below for detailed field descriptions. -```yaml - remediationWorkflow: - enable: true -``` +**Spec.CommonConfig.UtilsContainer** - Remediation workflow uses a utility image for executing the steps. Specify the utility image in `Spec.CommonConfig.UtilsContainer` section of Device Config. If the UtilsContainer section is not specified, default image used is `docker.io/rocm/gpu-operator-utils:latest` + +#### Node Drain Policy Configuration + +The `NodeDrainPolicy` field accepts a `DrainSpec` object with the following configurable parameters: + +**Force** - Enables forced draining of pods that do not respond to standard termination signals. When set to `true`, pods that cannot be evicted gracefully will be forcibly removed. Default value is `false`. + +**TimeoutSeconds** - Specifies the maximum time in seconds to wait for the drain operation to complete before giving up. A value of zero means infinite timeout, allowing the drain operation to continue indefinitely. Default value is `300` seconds (5 minutes). + +**GracePeriodSeconds** - Defines the grace period in seconds that Kubernetes allows for a pod to shut down gracefully after receiving a termination signal. This value overrides the pod's configured `terminationGracePeriodSeconds`. A value of `-1` uses each pod's own grace period setting. Default value is `-1`. + +**IgnoreDaemonSets** - When set to `true`, DaemonSet-managed pods are excluded from the drain operation. This is typically desired since DaemonSets are designed to run on all nodes and will automatically reschedule on the same node. Default value is `true`. + +**IgnoreNamespaces** - Defines a list of namespaces to exclude from pod eviction during the drain operation. Pods running in these namespaces will remain on the node, allowing critical infrastructure components to continue operating throughout the remediation process. By default, the following namespaces are excluded: `kube-system`, `cert-manager`, and the GPU Operator's namespace. -You can limit the number of nodes undergoing remediation simultaneously by setting the `maxParallelWorkflows` field in the Device Config custom resource. For example, to ensure no more than 5 nodes undergo remediation at the same time, configure the value as 5(as shown below). The default value is zero, which means there is no upper limit on the number of parallel workflows that can run simultaneously. 
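The drain policy and the other remediation fields described above all live under `spec.remediationWorkflow` in the DeviceConfig CR. The snippet below is an illustrative sketch: the field names follow the spec described above, while the label, taint, and namespace values are examples.

```yaml
spec:
  remediationWorkflow:
    enable: true
    maxParallelWorkflows: 5              # remediate at most 5 nodes at a time
    nodeRemediationLabels:
      amd.com/gpu-remediation: "in-progress"   # example label
    nodeRemediationTaints:
    - key: amd-gpu-unhealthy             # matches the default taint key
      effect: NoSchedule
    nodeDrainPolicy:
      force: false
      timeoutSeconds: 300
      gracePeriodSeconds: -1
      ignoreDaemonSets: true
      ignoreNamespaces:
      - kube-system
      - cert-manager
```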
+### Other Configuration options: + +**NPD Configuration** - NPD configuration is explained in more detail [here](../npd/node-problem-detector.md). The Node Problem Detector (NPD) DaemonSet must continue running during workflow execution to verify issue resolution. Add the following toleration to the NPD DaemonSet: + + `amd-gpu-unhealthy:NoSchedule op=Exists` + +The GPU Operator automatically applies this toleration to internal components such as KMM and metrics-exporter, ensuring they continue running during workflow execution. + +**Failed Workflow Handling** - If a remediation workflow fails, the affected node remains in a tainted state. To manually restore the node to a schedulable state for workloads, remove the taint using the following command: + + ```bash + kubectl taint node amd-gpu-unhealthy:NoSchedule- + ``` + +## Remediation Workflow ConfigMap + +The AMD GPU Operator automatically generates a default ConfigMap (`default-conditional-workflow-mappings`) derived from the latest AMD Service Action Guide. This ConfigMap establishes mappings between unique error codes (AFID) and their associated remediation workflows. Each mapping entry defines the error type, the workflow template to invoke for remediation, and workflow-specific parameters. The default ConfigMap is available in the [GPU Operator repository](https://github.com/ROCm/gpu-operator/blob/main/internal/controllers/remediation/configs/default-configmap.yaml) and includes all node conditions managed by the Operator by default. + +### Example Error Mapping Section + +The following example demonstrates a complete error mapping configuration: ```yaml - remediationWorkflow: - enable: true - maxParallelWorkflows: 5 +- nodeCondition: AMDGPUXgmi + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: Remove GPU tray from node.Confirm that all four screws on all eight OAMs are torqued as described in OAM Removal and Installation guideRe-install the GPU tray into node. + notifyTestFailureMessage: 'Remove the failing UBB assembly and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data.Install a new or known-good UBB assembly to the GPU tray.' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m ``` -When more workflows are triggered beyond the above workflow parallelism limit, the excess workflows are queued by the Argo workflow controller and enter a **Pending** state. They will remain in the queue until a running workflow finishes and a "slot" within the configured parallelism limit becomes available. +### ConfigMap Field Descriptions + +**nodeCondition** - Specifies a unique description for an error code (AFID). This value must match the corresponding node condition defined in the Node Problem Detector (NPD) configuration. + +**workflowTemplate** - Defines the Argo Workflows template to execute for this specific error condition. The `default-template` is used by default and provides comprehensive remediation steps (detailed below). While users can create and reference custom Argo workflow templates in the cluster, it is recommended to use the operator-managed `default-template` for consistency and maintainability. 
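As a sketch of that option, a user-supplied mappings ConfigMap can carry an entry that points at a custom template, and the DeviceConfig then references that ConfigMap through `conditionalWorkflows`. The ConfigMap and template names below are hypothetical; the `workflow` data key mirrors the default ConfigMap.

```yaml
# User-provided mappings ConfigMap (name is hypothetical)
apiVersion: v1
kind: ConfigMap
metadata:
  name: my-workflow-mappings
data:
  workflow: |-
    - nodeCondition: AMDGPUUnhealthy
      workflowTemplate: my-custom-template   # hypothetical user-created template
```

```yaml
# DeviceConfig referencing the custom ConfigMap instead of the default one
spec:
  remediationWorkflow:
    enable: true
    conditionalWorkflows:
      name: my-workflow-mappings
```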
+ +**validationTestsProfile** - Specifies the test framework and test suite to execute for validating GPU health after remediation. Supported frameworks include AGFHC and RVS. All fields under `validationTestsProfile` are mandatory and correspond to the parameters documented in the [Test Runner Documentation](../test/manual-test.md). + +**physicalActionNeeded** - Indicates whether manual physical intervention is required on the node (e.g., RMA of faulty GPU, hardware inspection, etc.). Specific actions are detailed in the `notifyRemediationMessage` field for each error condition. For issues resolved by a reboot, this field is set to `false`. + +**notifyRemediationMessage** - Provides detailed instructions for physical or manual actions when `physicalActionNeeded` is `true`. This message guides administrators through the required remediation steps to resolve the fault. + +**notifyTestFailureMessage** - Contains instructions to be displayed when validation tests fail after remediation attempts. This message typically includes escalation procedures and diagnostic information requirements. + +**recoveryPolicy** - Defines limits on remediation attempts to prevent excessive recovery cycles. Includes `maxAllowedRunsPerWindow` (maximum retry attempts) and `windowSize` (time window for counting attempts). When exceeded, the workflow pauses for manual intervention. ## Default Workflow Template -Note: `default-template` will be created on the cluster by GPU-Operator +> **Note:** The `default-template` is automatically created on the cluster by the GPU Operator. +The `default-template` workflow performs the following remediation steps: -`default-template` will perform the following steps: +1. **Label Node** - Applies custom labels to the node as specified in the `NodeRemediationLabels` field of the DeviceConfig Custom Resource. If no labels are configured, this step is skipped and the workflow proceeds to the next step. -1. Taint the node with `key = "AMD_GPU_Unhealthy”, op = equal, value = node_condition, effect = noSchedule ` +2. **Taint Node** - Apply taint with `key = "AMD_GPU_Unhealthy", op = equal, value = node_condition, effect = noSchedule` to prevent new workload scheduling. -2. Drain workloads/pods that are using AMD GPUs +3. **Drain Workloads** - Evict all pods utilizing AMD GPUs from the affected node. -3. Notify admin/user if manual intervention is required +4. **Notify Administrator** - Send notification if manual intervention is required for the detected issue. -4. Suspend workflow +5. **Suspend Workflow** - Pause workflow execution pending manual intervention or automatic resumption based on configured policies. -5. Reboot the node +6. **Reboot Node** - Perform node reboot to clear transient errors and reinitialize GPU hardware. -6. Run AGFHC/RVS tests to verify the GPUs are healthy post reboot. +7. **Validate GPUs** - Execute AGFHC/RVS validation tests to confirm GPU health after reboot. -7. Verify that the node condition has become False +8. **Verify Condition** - Confirm that the triggering node condition has been resolved (status changed to False). -8. Un-taint the node and this will make the GPUs available for scheduling again. +9. **Remove Taint** - Remove the node taint to restore GPU availability for workload scheduling. -For each step in the workflow template, a pod is spun up that performs the task. -For the case when user wants to create his own template, the argo CRDs are present on the cluster and the user can create any workflow template and refer it in the config-map. 
+10. **Remove Labels** - Removes all custom labels that were applied to the node in Step 1, restoring the node to its original label state. -Most steps in the default-template are self-explanatory. However, there are some details to be known about Step 2, 3 and 6 +Each workflow step is executed as a separate Kubernetes pod. For advanced use cases, users can create custom workflow templates using the Argo CRDs available on the cluster and reference them in the ConfigMap. -## Workflow Step 2: Check if physical intervention is required +While most workflow steps are self-explanatory, Steps 4, 5, and 7 require additional clarification. -As per AMD service action guide, many problems require user to intervene physically (checking wiring, screws, retorquing, etc.). The workflow, as per this, will raise a k8s event to suggest the physical action required to the user in such cases before suspending the workflow in step3. If a physical action is needed for a certain node condition, it will be present in the `physicalActionNeeded` field in the configmap mapping corresponding to that node condition. +### Workflow Step 4: Physical Intervention Check -The benefit of having this step is that admin can see which node is waiting for physical intervention. Once he fixes it physically, he can simply resume the workflow for validation using the label mentioned in Workflow Step3. +According to the AMD service action guide, certain GPU issues require physical intervention (e.g., checking wiring, securing screws, retorquing connections). When such conditions are detected, the workflow generates a Kubernetes event to notify the administrator of the required physical action before suspending at this step. The specific physical action for each node condition is defined in the `physicalActionNeeded` field within the corresponding ConfigMap mapping. -## Workflow Step 3: Suspend/Resume the Workflow +This step enables administrators to identify nodes awaiting physical intervention. After completing the necessary physical repairs, administrators can resume the workflow for validation using the label described in Workflow Step 4. -The GPU-Operator determines whether to resume the workflow after it has been paused in Step 2. This pause provides an opportunity for users to perform necessary manual actions. There are two primary scenarios where user intervention may be required: +### Workflow Step 5: Workflow Suspension and Resumption -1. **Excessive Node Remediation:** - Users can define a `RecoveryPolicy` in the `ConditionalWorkflowMappings` ConfigMap, specifying the maximum number of recovery attempts allowed within a given time window. If a node exceeds this limit, the workflow remains paused. +The GPU Operator determines whether to automatically resume the workflow after it pauses in Step 4. This pause accommodates scenarios requiring manual intervention. The workflow may remain suspended in two primary cases: + +1. **Excessive Remediation Attempts:** + When a `RecoveryPolicy` is configured in the `ConditionalWorkflowMappings` ConfigMap, it defines the maximum remediation attempts allowed within a specified time window. Nodes exceeding this threshold will have their workflows paused indefinitely until manual resumption. 2. **Physical Action Required:** - If a physical action is specified for a workflow in the `ConditionalWorkflowMappings` ConfigMap, the node will pause at this step, allowing the user to perform the required action. The user is also notified via an event. 
+ When a physical action is specified for a workflow in the `ConditionalWorkflowMappings` ConfigMap, the workflow pauses at this step, allowing administrators to perform the required maintenance. A notification event is generated to alert the user. + +If neither condition applies, the workflow automatically resumes without manual intervention. -If neither of these conditions apply, the workflow will automatically resume from this step. +#### Resuming a Paused Workflow -### Resuming a paused workflow -Whenever the user is satisfied that the workflow can be resumed, they can add the label `operator.amd.com/gpu-force-resume-workflow=true` to the relevant node. The operator will detect this label and resume the workflow. +To resume a suspended workflow, apply the label `operator.amd.com/gpu-force-resume-workflow=true` to the affected node. The operator detects this label and resumes workflow execution. -To abort the workflow, label the node with `operator.amd.com/gpu-abort-workflow=true`. The node will remain in a tainted state for manual intervention. If remediation is no longer desired, this label provides the option to delete the workflow while the node is paused. +To abort the workflow entirely, apply the label `operator.amd.com/gpu-abort-workflow=true` to the node. This keeps the node in a tainted state for manual remediation. This option is useful when automatic remediation is no longer desired and the workflow should be deleted while paused. -## Workflow Step 6: Run AGFHC/RVS tests - --> The user will mention the test-profile to pass to test runner to run in the configmap for each condition under `validationTestsProfile` +### Workflow Step 7: GPU Validation Testing --> The workflow step will ensure that a k8s job is created which spins up a test runner container which picks up that test-profile to run as part of this step. +This step executes comprehensive GPU health validation tests using the test runner: --> The test results will be checked by the workflow step and will ensure that the workflow moves ahead only if the tests pass. If the tests fail, the workflow will fail. +- **Test Profile Configuration:** The test profile for each node condition is specified in the `validationTestsProfile` field within the ConfigMap. -#### **Notes** -During helm installation of GPU Operator, by default, installation of remediation components like workflow controller and crds is enabled. If the admin does not require this auto remediation feature and would like to disable the installation of these components, they can simply pass this flag during the helm installation: +- **Test Execution:** The workflow creates a Kubernetes Job that launches a test runner container. This container retrieves and executes the specified test profile. - `--set remediation.enabled=false` +- **Result Verification:** The workflow evaluates test results and only proceeds if all tests pass successfully. If any test fails, the entire workflow terminates with a failure status. 
diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 92deade4c..82be640a0 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -9,4 +9,4 @@ dependencies: repository: file://./charts/remediation version: v1.0.0 digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07 -generated: "2026-01-07T10:51:28.442192317Z" +generated: "2026-01-28T11:30:26.115644041Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 6061e2c93..21bac56e5 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -600,6 +600,18 @@ spec: waits for a pod to shut down gracefully after receiving a termination signal type: integer + ignoreDaemonSets: + default: true + description: IgnoreDaemonSets indicates whether to ignore + DaemonSet-managed pods + type: boolean + ignoreNamespaces: + description: |- + IgnoreNamespaces is the list of namespaces to ignore during node drain operation. + This is useful to avoid draining pods from critical namespaces like 'kube-system', etc. + items: + type: string + type: array timeoutSeconds: default: 300 description: TimeoutSecond specifies the length of time @@ -1401,6 +1413,77 @@ spec: workflows can be executed in parallel. 0 is the default value and it means no limit. type: integer + nodeDrainPolicy: + description: Node drain policy during remediation workflow execution + properties: + force: + default: false + description: Force indicates if force draining is allowed + type: boolean + gracePeriodSeconds: + default: -1 + description: GracePeriodSeconds indicates the time kubernetes + waits for a pod to shut down gracefully after receiving a + termination signal + type: integer + ignoreDaemonSets: + default: true + description: IgnoreDaemonSets indicates whether to ignore DaemonSet-managed + pods + type: boolean + ignoreNamespaces: + description: |- + IgnoreNamespaces is the list of namespaces to ignore during node drain operation. + This is useful to avoid draining pods from critical namespaces like 'kube-system', etc. + items: + type: string + type: array + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time in seconds + to wait before giving up drain, zero means infinite + minimum: 0 + type: integer + type: object + nodeRemediationLabels: + additionalProperties: + type: string + description: Node Remediation labels are custom labels that we can + apply on the node to specify that the node is undergoing remediation + or needs attention by the administrator. + type: object + nodeRemediationTaints: + description: |- + Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator. + If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. 
+ format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + required: + - effect + - key + type: object + type: array testerImage: description: Tester image used to run tests and verify if remediation fixed the reported problem. diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index 682e0e641..2c110d8fc 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2026-01-07T10:51:41.178709798Z" +generated: "2026-01-28T11:30:38.160988877Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml index 6061e2c93..21bac56e5 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -600,6 +600,18 @@ spec: waits for a pod to shut down gracefully after receiving a termination signal type: integer + ignoreDaemonSets: + default: true + description: IgnoreDaemonSets indicates whether to ignore + DaemonSet-managed pods + type: boolean + ignoreNamespaces: + description: |- + IgnoreNamespaces is the list of namespaces to ignore during node drain operation. + This is useful to avoid draining pods from critical namespaces like 'kube-system', etc. + items: + type: string + type: array timeoutSeconds: default: 300 description: TimeoutSecond specifies the length of time @@ -1401,6 +1413,77 @@ spec: workflows can be executed in parallel. 0 is the default value and it means no limit. type: integer + nodeDrainPolicy: + description: Node drain policy during remediation workflow execution + properties: + force: + default: false + description: Force indicates if force draining is allowed + type: boolean + gracePeriodSeconds: + default: -1 + description: GracePeriodSeconds indicates the time kubernetes + waits for a pod to shut down gracefully after receiving a + termination signal + type: integer + ignoreDaemonSets: + default: true + description: IgnoreDaemonSets indicates whether to ignore DaemonSet-managed + pods + type: boolean + ignoreNamespaces: + description: |- + IgnoreNamespaces is the list of namespaces to ignore during node drain operation. + This is useful to avoid draining pods from critical namespaces like 'kube-system', etc. + items: + type: string + type: array + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time in seconds + to wait before giving up drain, zero means infinite + minimum: 0 + type: integer + type: object + nodeRemediationLabels: + additionalProperties: + type: string + description: Node Remediation labels are custom labels that we can + apply on the node to specify that the node is undergoing remediation + or needs attention by the administrator. + type: object + nodeRemediationTaints: + description: |- + Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator. + If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. 
The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + required: + - effect + - key + type: object + type: array testerImage: description: Tester image used to run tests and verify if remediation fixed the reported problem. diff --git a/internal/controllers/mock_remediation_handler.go b/internal/controllers/mock_remediation_handler.go index 7fb938222..42fe85334 100644 --- a/internal/controllers/mock_remediation_handler.go +++ b/internal/controllers/mock_remediation_handler.go @@ -126,6 +126,18 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) abortWorkflow(ctx, workflow a return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "abortWorkflow", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).abortWorkflow), ctx, workflow) } +// applyTolerationsToWorkflow mocks base method. +func (m *MockremediationMgrHelperAPI) applyTolerationsToWorkflow(wf *v1alpha10.Workflow, devConfig *v1alpha1.DeviceConfig, nodeCondition string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "applyTolerationsToWorkflow", wf, devConfig, nodeCondition) +} + +// applyTolerationsToWorkflow indicates an expected call of applyTolerationsToWorkflow. +func (mr *MockremediationMgrHelperAPIMockRecorder) applyTolerationsToWorkflow(wf, devConfig, nodeCondition any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "applyTolerationsToWorkflow", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).applyTolerationsToWorkflow), wf, devConfig, nodeCondition) +} + // attemptAbortWorkflowOnNode mocks base method. func (m *MockremediationMgrHelperAPI) attemptAbortWorkflowOnNode(ctx context.Context, node *v1.Node, wf *v1alpha10.Workflow) (bool, error) { m.ctrl.T.Helper() @@ -168,17 +180,17 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) canResumeWorkflowOnNode(ctx, } // checkIfTaintExists mocks base method. -func (m *MockremediationMgrHelperAPI) checkIfTaintExists(node *v1.Node, targetTaint v1.Taint) bool { +func (m *MockremediationMgrHelperAPI) checkIfTaintExists(node *v1.Node, devConfig *v1alpha1.DeviceConfig, nodeCondition string) bool { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "checkIfTaintExists", node, targetTaint) + ret := m.ctrl.Call(m, "checkIfTaintExists", node, devConfig, nodeCondition) ret0, _ := ret[0].(bool) return ret0 } // checkIfTaintExists indicates an expected call of checkIfTaintExists. -func (mr *MockremediationMgrHelperAPIMockRecorder) checkIfTaintExists(node, targetTaint any) *gomock.Call { +func (mr *MockremediationMgrHelperAPIMockRecorder) checkIfTaintExists(node, devConfig, nodeCondition any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "checkIfTaintExists", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).checkIfTaintExists), node, targetTaint) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "checkIfTaintExists", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).checkIfTaintExists), node, devConfig, nodeCondition) } // createDefaultConfigMap mocks base method. 
@@ -340,6 +352,34 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) getMaxAllowedRunsPerWindow(re return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getMaxAllowedRunsPerWindow", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getMaxAllowedRunsPerWindow), recoveryPolicy) } +// getNodeLabelsFromCR mocks base method. +func (m *MockremediationMgrHelperAPI) getNodeLabelsFromCR(ctx context.Context, devConfig *v1alpha1.DeviceConfig) []string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getNodeLabelsFromCR", ctx, devConfig) + ret0, _ := ret[0].([]string) + return ret0 +} + +// getNodeLabelsFromCR indicates an expected call of getNodeLabelsFromCR. +func (mr *MockremediationMgrHelperAPIMockRecorder) getNodeLabelsFromCR(ctx, devConfig any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getNodeLabelsFromCR", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getNodeLabelsFromCR), ctx, devConfig) +} + +// getNodeTaints mocks base method. +func (m *MockremediationMgrHelperAPI) getNodeTaints(ctx context.Context, devConfig *v1alpha1.DeviceConfig, nodeCondition string) []string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getNodeTaints", ctx, devConfig, nodeCondition) + ret0, _ := ret[0].([]string) + return ret0 +} + +// getNodeTaints indicates an expected call of getNodeTaints. +func (mr *MockremediationMgrHelperAPIMockRecorder) getNodeTaints(ctx, devConfig, nodeCondition any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getNodeTaints", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getNodeTaints), ctx, devConfig, nodeCondition) +} + // getRecentRecoveryCount mocks base method. func (m *MockremediationMgrHelperAPI) getRecentRecoveryCount(nodeName, nodeCondition string) int { m.ctrl.T.Helper() diff --git a/internal/controllers/remediation/scripts/applylabels.sh b/internal/controllers/remediation/scripts/applylabels.sh new file mode 100644 index 000000000..ffe072d75 --- /dev/null +++ b/internal/controllers/remediation/scripts/applylabels.sh @@ -0,0 +1,39 @@ +set -e +NODE_NAME="{{inputs.parameters.node_name}}" +NODE_LABELS="{{inputs.parameters.node_labels}}" + +# Check if jq is installed +if ! command -v jq &> /dev/null; then + echo "Error: jq is not present in the utils container. Proceeding without applying labels on the node" + exit 0 +fi + +# Get the length of the array +LENGTH=$(echo "$NODE_LABELS" | jq 'length') + +if [ "$LENGTH" -eq 0 ]; then + echo "No labels to apply" + exit 0 +fi + +echo "Applying $LENGTH labels to node '$NODE_NAME'..." + +# Loop through each label in the JSON array +for i in $(seq 0 $((LENGTH - 1))); do + LABEL=$(echo "$NODE_LABELS" | jq -r ".[$i]") + + if [ "$LABEL" == "null" ] || [ -z "$LABEL" ]; then + echo "Warning: Skipping empty label at index $i" + continue + fi + + echo "Applying label '$LABEL'..." + + if kubectl label node "$NODE_NAME" "$LABEL" --overwrite; then + echo "Successfully applied label '$LABEL'" + else + echo "Failed to apply label '$LABEL'" + fi +done + +echo "Done applying all labels to node '$NODE_NAME'" diff --git a/internal/controllers/remediation/scripts/drain.sh b/internal/controllers/remediation/scripts/drain.sh index 03aff201d..9267b92a1 100644 --- a/internal/controllers/remediation/scripts/drain.sh +++ b/internal/controllers/remediation/scripts/drain.sh @@ -1,29 +1,80 @@ set -e echo "Fetching node name..." 
NODE_NAME="{{inputs.parameters.node_name}}" +DRAIN_POLICY="{{inputs.parameters.drain_policy}}" + +# Check if jq is installed +if ! command -v jq &> /dev/null; then + echo "Error: jq is not present in the utils container. Cannot parse drain policy" + exit 1 +fi + +# Parse drain policy JSON and extract fields +FORCE=$(echo "$DRAIN_POLICY" | jq -r '.force') +TIMEOUT_SECONDS=$(echo "$DRAIN_POLICY" | jq -r '.timeoutSeconds') +GRACE_PERIOD_SECONDS=$(echo "$DRAIN_POLICY" | jq -r '.gracePeriodSeconds') +IGNORE_DAEMONSETS=$(echo "$DRAIN_POLICY" | jq -r '.ignoreDaemonSets') + +# Parse ignoreNamespaces as an array +if [ "$(echo "$DRAIN_POLICY" | jq -r '.ignoreNamespaces')" != "null" ]; then + readarray -t IGNORE_NAMESPACES < <(echo "$DRAIN_POLICY" | jq -r '.ignoreNamespaces[]') +else + IGNORE_NAMESPACES=() +fi + +echo "Drain policy configuration:" +echo " Force: $FORCE" +echo " Timeout: $TIMEOUT_SECONDS seconds" +echo " Grace period: $GRACE_PERIOD_SECONDS seconds" +echo " Ignore DaemonSets: $IGNORE_DAEMONSETS" +echo " Ignore Namespaces: ${IGNORE_NAMESPACES[*]}" + echo "Identified node: $NODE_NAME" -echo "Finding pods on node $NODE_NAME with volume mount path starting with /dev/dri..." -PODS=$(kubectl get pods --all-namespaces -o json | jq -r ' +echo "Finding pods on node $NODE_NAME matching the drain policy criteria..." + +# Convert IGNORE_NAMESPACES array to JSON array for jq +IGNORE_NAMESPACES_JSON=$(printf '%s\n' "${IGNORE_NAMESPACES[@]}" | jq -R . | jq -s .) + +PODS=$(kubectl get pods --all-namespaces -o json | jq --argjson ignoreNs "$IGNORE_NAMESPACES_JSON" --arg ignoreDaemonSets "$IGNORE_DAEMONSETS" -r ' .items[] | select(.spec.nodeName == "'"$NODE_NAME"'") | + select((.metadata.namespace as $ns | $ignoreNs | index($ns) | not)) | select( - ( - [.spec.volumes[]? | select(.hostPath?.path != null and (.hostPath.path | startswith("/dev/dri")))] - | length > 0 - ) or ( - [.spec.containers[]? | select(.resources.requests["amd.com/gpu"] != null)] - | length > 0 - ) + if $ignoreDaemonSets == "true" then + ([.metadata.ownerReferences[]? | select(.kind == "DaemonSet")] | length) == 0 + else + true + end ) | "\(.metadata.namespace) \(.metadata.name)" ') if [ -z "$PODS" ]; then - echo "No pods with /dev/dri mounts found on node $NODE_NAME." -else - echo "Evicting pods:" - echo "$PODS" - echo "$PODS" | while read -r ns name; do - echo "Deleting pod $name in namespace $ns" - kubectl delete pod "$name" -n "$ns" --grace-period=0 --force || true - done -fi \ No newline at end of file + echo "No pods matching the drain policy criteria found on node $NODE_NAME." 
+ exit 0 +fi + +echo "Draining pods:" +echo "$PODS" +echo "$PODS" | while read -r ns name; do + echo "Deleting pod $name in namespace $ns" + + # Build kubectl delete command with drain policy settings + DELETE_CMD="kubectl delete pod \"$name\" -n \"$ns\"" + + # Add --grace-period if specified + if [ "$GRACE_PERIOD_SECONDS" != "null" ] && [ -n "$GRACE_PERIOD_SECONDS" ]; then + DELETE_CMD="$DELETE_CMD --grace-period=$GRACE_PERIOD_SECONDS" + fi + + # Add --timeout if specified + if [ "$TIMEOUT_SECONDS" != "null" ] && [ -n "$TIMEOUT_SECONDS" ]; then + DELETE_CMD="$DELETE_CMD --timeout=${TIMEOUT_SECONDS}s" + fi + + # Add --force flag if FORCE is true + if [ "$FORCE" = "true" ]; then + DELETE_CMD="$DELETE_CMD --force" + fi + + eval "$DELETE_CMD" || true +done \ No newline at end of file diff --git a/internal/controllers/remediation/scripts/removelabels.sh b/internal/controllers/remediation/scripts/removelabels.sh new file mode 100644 index 000000000..5c789449e --- /dev/null +++ b/internal/controllers/remediation/scripts/removelabels.sh @@ -0,0 +1,47 @@ +set -e +NODE_NAME="{{inputs.parameters.node_name}}" +NODE_LABELS="{{inputs.parameters.node_labels}}" + +# Check if jq is installed +if ! command -v jq &> /dev/null; then + echo "Error: jq is not present in the utils container. Proceeding without removing labels from the node" + exit 0 +fi + +# Get the length of the array +LENGTH=$(echo "$NODE_LABELS" | jq 'length') + +if [ "$LENGTH" -eq 0 ]; then + echo "No labels to remove" + exit 0 +fi + +echo "Removing $LENGTH labels from node '$NODE_NAME'..." + +# Loop through each label in the JSON array +for i in $(seq 0 $((LENGTH - 1))); do + LABEL=$(echo "$NODE_LABELS" | jq -r ".[$i]") + + if [ "$LABEL" == "null" ] || [ -z "$LABEL" ]; then + echo "Warning: Skipping empty label at index $i" + continue + fi + + # Extract the key from the key=value format + LABEL_KEY="${LABEL%%=*}" + + if [ -z "$LABEL_KEY" ]; then + echo "Warning: Could not extract key from label '$LABEL'" + continue + fi + + echo "Removing label key '$LABEL_KEY' (from '$LABEL')..." + + if kubectl label node "$NODE_NAME" "$LABEL_KEY"-; then + echo "Successfully removed label '$LABEL_KEY'" + else + echo "Failed to remove label '$LABEL_KEY'" + fi +done + +echo "Done removing all labels from node '$NODE_NAME'" \ No newline at end of file diff --git a/internal/controllers/remediation/scripts/taint.sh b/internal/controllers/remediation/scripts/taint.sh index 006e3c527..999bac212 100644 --- a/internal/controllers/remediation/scripts/taint.sh +++ b/internal/controllers/remediation/scripts/taint.sh @@ -1,4 +1,28 @@ set -e NODE_NAME="{{inputs.parameters.node_name}}" -echo "Tainting node $NODE_NAME" -kubectl taint node "$NODE_NAME" amd-gpu-unhealthy="{{inputs.parameters.node_condition}}":NoSchedule --overwrite \ No newline at end of file +NODE_TAINTS="{{inputs.parameters.node_taints}}" + +# Check if jq is installed +if ! command -v jq &> /dev/null; then + echo "Error: jq is not present in the utils container. 
Proceeding without applying labels on the node" + exit 0 +fi + +# Get the length of the array +LENGTH=$(echo "$NODE_TAINTS" | jq 'length') + +for i in $(seq 0 $((LENGTH - 1))); do + TAINT=$(echo "$NODE_TAINTS" | jq -r ".[$i]") + if [ "$TAINT" == "null" ] || [ -z "$TAINT" ]; then + echo "Warning: Skipping empty taint at index $i" + continue + fi + echo "Tainting node $NODE_NAME with taint $TAINT" + if kubectl taint node "$NODE_NAME" "$TAINT" --overwrite; then + echo "Successfully applied taint '$TAINT'" + else + echo "Failed to apply taint '$TAINT'" + fi +done + +echo "Done applying all taints on node '$NODE_NAME'" \ No newline at end of file diff --git a/internal/controllers/remediation/scripts/untaint.sh b/internal/controllers/remediation/scripts/untaint.sh index f8481b268..4c3240f2a 100644 --- a/internal/controllers/remediation/scripts/untaint.sh +++ b/internal/controllers/remediation/scripts/untaint.sh @@ -1,4 +1,27 @@ set -e NODE_NAME="{{inputs.parameters.node_name}}" -echo "Untainting node $NODE_NAME" -kubectl taint node "$NODE_NAME" amd-gpu-unhealthy:NoSchedule- \ No newline at end of file +NODE_TAINTS="{{inputs.parameters.node_taints}}" + +# Check if jq is installed +if ! command -v jq &> /dev/null; then + echo "Error: jq is not present in the utils container. Proceeding without applying labels on the node" + exit 0 +fi + +LENGTH=$(echo "$NODE_TAINTS" | jq 'length') + +for i in $(seq 0 $((LENGTH - 1))); do + TAINT=$(echo "$NODE_TAINTS" | jq -r ".[$i]") + if [ "$TAINT" == "null" ] || [ -z "$TAINT" ]; then + echo "Warning: Skipping empty taint at index $i" + continue + fi + echo "Removing taint $TAINT from node $NODE_NAME" + if kubectl taint node "$NODE_NAME" "$TAINT"-; then + echo "Successfully removed taint '$TAINT'" + else + echo "Failed to remove taint '$TAINT'" + fi +done + +echo "Done removing all remediation taints on node '$NODE_NAME'" \ No newline at end of file diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go index f77c9d6b5..d1f6d56d0 100644 --- a/internal/controllers/remediation_handler.go +++ b/internal/controllers/remediation_handler.go @@ -34,6 +34,7 @@ package controllers import ( "context" + "encoding/json" "errors" "fmt" "os" @@ -63,7 +64,7 @@ const ( RemediationTaintKey = "amd-gpu-unhealthy" DefaultConfigMapSuffix = "default-conditional-workflow-mappings" DefaultTemplate = "default-template" - DefaultTestRunnerImage = "docker.io/rocm/test-runner:agfhc-latest" + DefaultTestRunnerImage = "docker.io/rocm/test-runner:v1.4.1" TestRunnerServiceAccount = "amd-gpu-operator-test-runner" AmdGpuRemediationRequired = "amd-gpu-remediation-required" AmdGpuRemediationSucceeded = "amd-gpu-remediation-succeeded" @@ -272,7 +273,7 @@ type remediationMgrHelperAPI interface { isRemediationDisabled(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) (bool, error) resumeSuspendedWorkflow(ctx context.Context, wfName, namespace string) error isDriverUpgradeInProgress(devCfg *amdv1alpha1.DeviceConfig, node *v1.Node) bool - checkIfTaintExists(node *v1.Node, targetTaint v1.Taint) bool + checkIfTaintExists(node *v1.Node, devConfig *amdv1alpha1.DeviceConfig, nodeCondition string) bool getWorkflowList(ctx context.Context, namespace string) (*workflowv1alpha1.WorkflowList, error) getWorkflowTemplate(ctx context.Context, workflowTemplateName, namespace string) (*workflowv1alpha1.WorkflowTemplate, error) getConfigMap(ctx context.Context, configmapName string, namespace string) (*v1.ConfigMap, error) @@ -311,6 +312,9 @@ type 
remediationMgrHelperAPI interface { handleSuspendedWorkflowsOnNode(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping, wf *workflowv1alpha1.Workflow) bool getWorkflowTaskScriptSource(scriptFileName string) (string, error) updateMaxParallelWorkflows(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) error + getNodeLabelsFromCR(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) []string + getNodeTaints(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodeCondition string) []string + applyTolerationsToWorkflow(wf *workflowv1alpha1.Workflow, devConfig *amdv1alpha1.DeviceConfig, nodeCondition string) } type remediationMgrHelper struct { @@ -427,10 +431,21 @@ func (h *remediationMgrHelper) isDriverUpgradeInProgress(devCfg *amdv1alpha1.Dev return false } -func (h *remediationMgrHelper) checkIfTaintExists(node *v1.Node, targetTaint v1.Taint) bool { +func (h *remediationMgrHelper) checkIfTaintExists(node *v1.Node, devConfig *amdv1alpha1.DeviceConfig, nodeCondition string) bool { + taints := make([]v1.Taint, 0) + if len(devConfig.Spec.RemediationWorkflow.NodeRemediationTaints) > 0 { + taints = devConfig.Spec.RemediationWorkflow.NodeRemediationTaints + } else { + taints = append(taints, v1.Taint{ + Key: RemediationTaintKey, + Effect: v1.TaintEffectNoSchedule, + }) + } for _, t := range node.Spec.Taints { - if t.Key == targetTaint.Key && t.Effect == targetTaint.Effect { - return true + for _, targetTaint := range taints { + if t.Key == targetTaint.Key && t.Effect == targetTaint.Effect { + return true + } } } return false @@ -572,6 +587,14 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context if err != nil { return nil, err } + applyLabelsSrc, err := h.getWorkflowTaskScriptSource("applylabels.sh") + if err != nil { + return nil, err + } + removeLabelsSrc, err := h.getWorkflowTaskScriptSource("removelabels.sh") + if err != nil { + return nil, err + } template := &workflowv1alpha1.WorkflowTemplate{ ObjectMeta: metav1.ObjectMeta{ @@ -584,6 +607,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context { Name: "inbuilt", Steps: []workflowv1alpha1.ParallelSteps{ + {Steps: []workflowv1alpha1.WorkflowStep{{Name: "applylabels", Template: "applylabels"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "taint", Template: "taint"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "drain", Template: "drain"}}}, {Steps: []workflowv1alpha1.WorkflowStep{ @@ -618,6 +642,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context }, }, }, + {Steps: []workflowv1alpha1.WorkflowStep{{Name: "failurecleanup", Template: "removelabels", When: "{{steps.test.exitCode}} != 0"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "failworkflow", Template: "failworkflow", When: "{{steps.test.exitCode}} != 0"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "wait", Template: "wait", When: "{{steps.test.exitCode}} == 0"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "untaint", Template: "untaint", When: "{{steps.test.exitCode}} == 0"}}}, @@ -636,6 +661,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context }, }, }, + {Steps: []workflowv1alpha1.WorkflowStep{{Name: "successcleanup", Template: "removelabels", When: "{{steps.test.exitCode}} == 0"}}}, }, }, { @@ -780,6 +806,40 @@ containers: Container: utilityContainer, }, }, + { + Name: "applylabels", + Inputs: workflowv1alpha1.Inputs{ + Parameters: []workflowv1alpha1.Parameter{ + { + Name: 
"node_name", + Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.node_name}}"), + }, + { + Name: "labels", + Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.labels}}"), + }, + }, + }, + Script: &workflowv1alpha1.ScriptTemplate{ + Source: applyLabelsSrc, + Container: utilityContainer, + }, + }, + { + Name: "removelabels", + Inputs: workflowv1alpha1.Inputs{ + Parameters: []workflowv1alpha1.Parameter{ + { + Name: "node_name", + Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.node_name}}"), + }, + }, + }, + Script: &workflowv1alpha1.ScriptTemplate{ + Source: removeLabelsSrc, + Container: utilityContainer, + }, + }, }, }, } @@ -891,18 +951,9 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate wf.Spec.Templates[i].NodeSelector = map[string]string{} } wf.Spec.Templates[i].NodeSelector["kubernetes.io/hostname"] = nodeName - - toleration := v1.Toleration{ - Key: RemediationTaintKey, - Operator: v1.TolerationOpExists, - Effect: v1.TaintEffectNoSchedule, - } - - if wf.Spec.Templates[i].Tolerations == nil { - wf.Spec.Templates[i].Tolerations = []v1.Toleration{} - } - wf.Spec.Templates[i].Tolerations = append(wf.Spec.Templates[i].Tolerations, toleration) } + // apply tolerations based on node taints + h.applyTolerationsToWorkflow(wf, devConfig, mapping.NodeCondition) testrunnerImage := DefaultTestRunnerImage @@ -915,6 +966,35 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate initContainerImage = devConfig.Spec.CommonConfig.InitContainerImage } + nodeLabels := h.getNodeLabelsFromCR(ctx, devConfig) + labelsJSONBytes, err := json.Marshal(nodeLabels) + if err != nil { + labelsJSONBytes = []byte("[]") + } + + nodeTaints := h.getNodeTaints(ctx, devConfig, mapping.NodeCondition) + taintsJSONBytes, err := json.Marshal(nodeTaints) + if err != nil { + taintsJSONBytes = []byte("[]") + } + + drainPolicy := devConfig.Spec.RemediationWorkflow.NodeDrainPolicy + if drainPolicy == nil { + // Set default drain policy if not specified + drainPolicy = &amdv1alpha1.DrainSpec{ + Force: ptr.To(true), + IgnoreDaemonSets: ptr.To(true), + TimeoutSeconds: 300, + GracePeriodSeconds: -1, + IgnoreNamespaces: []string{"kube-system", "cert-manager", devConfig.Namespace}, + } + } + + drainPolicyJSONBytes, err := json.Marshal(drainPolicy) + if err != nil { + drainPolicyJSONBytes = []byte("{}") + } + // Pass the args required to be used in the template wf.Spec.Arguments = workflowv1alpha1.Arguments{ Parameters: []workflowv1alpha1.Parameter{ @@ -974,11 +1054,22 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate Name: "initContainerImage", Value: workflowv1alpha1.AnyStringPtr(initContainerImage), }, + { + Name: "node_labels", + Value: workflowv1alpha1.AnyStringPtr(string(labelsJSONBytes)), + }, + { + Name: "node_taints", + Value: workflowv1alpha1.AnyStringPtr(string(taintsJSONBytes)), + }, + { + Name: "drain_policy", + Value: workflowv1alpha1.AnyStringPtr(string(drainPolicyJSONBytes)), + }, }, } return wf - } func (h *remediationMgrHelper) createWorkflow(ctx context.Context, workflow *workflowv1alpha1.Workflow) error { @@ -1041,7 +1132,7 @@ func (h *remediationMgrHelper) isWorkflowSchedulableOnNode(ctx context.Context, } // If taint already exists, skip the node - if hasTaint := h.checkIfTaintExists(node, taint); hasTaint { + if hasTaint := h.checkIfTaintExists(node, devConfig, mapping.NodeCondition); hasTaint { logger.Info(fmt.Sprintf("Taint %s already present on node %s, skipping creation of workflow", 
taint.Key, node.Name)) return false } @@ -1484,3 +1575,54 @@ func (h *remediationMgrHelper) abortWorkflow(ctx context.Context, wf *workflowv1 logger.Info(fmt.Sprintf("Workflow %s aborted successfully", wf.Name)) return nil } + +func (h *remediationMgrHelper) getNodeLabelsFromCR(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) []string { + nodeLabels := make([]string, 0) + for key, value := range devConfig.Spec.RemediationWorkflow.NodeRemediationLabels { + nodeLabels = append(nodeLabels, fmt.Sprintf("%s=%s", key, value)) + } + return nodeLabels +} + +func (h *remediationMgrHelper) getNodeTaintsFromCR(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) []string { + taints := make([]string, 0) + for _, taint := range devConfig.Spec.RemediationWorkflow.NodeRemediationTaints { + taints = append(taints, fmt.Sprintf("%s=%s:%s", taint.Key, taint.Value, taint.Effect)) + } + return taints +} + +// getNodeTaints returns the list of taints to be applied to the node during remediation. +// If no user configured taints are found in the DeviceConfig CR, it returns a default taint. +func (h *remediationMgrHelper) getNodeTaints(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodeCondition string) []string { + taints := h.getNodeTaintsFromCR(ctx, devConfig) + if len(taints) == 0 { + taints = append(taints, fmt.Sprintf("%s=%s:%s", RemediationTaintKey, nodeCondition, v1.TaintEffectNoSchedule)) + } + return taints +} + +func (h *remediationMgrHelper) applyTolerationsToWorkflow(wf *workflowv1alpha1.Workflow, devConfig *amdv1alpha1.DeviceConfig, nodeCondition string) { + taints := make([]v1.Taint, 0) + if len(devConfig.Spec.RemediationWorkflow.NodeRemediationTaints) > 0 { + taints = devConfig.Spec.RemediationWorkflow.NodeRemediationTaints + } else { + taints = append(taints, v1.Taint{ + Key: RemediationTaintKey, + Value: nodeCondition, + Effect: v1.TaintEffectNoSchedule, + }) + } + for i := range wf.Spec.Templates { + if wf.Spec.Templates[i].Tolerations == nil { + wf.Spec.Templates[i].Tolerations = []v1.Toleration{} + } + for _, taint := range taints { + wf.Spec.Templates[i].Tolerations = append(wf.Spec.Templates[i].Tolerations, v1.Toleration{ + Key: taint.Key, + Operator: v1.TolerationOpExists, + Effect: taint.Effect, + }) + } + } +} diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go index 5cdc2865f..286bc9eda 100644 --- a/internal/kmmmodule/kmmmodule.go +++ b/internal/kmmmodule/kmmmodule.go @@ -556,12 +556,25 @@ func setKMMModuleLoader(ctx context.Context, mod *kmmv1beta1.Module, devConfig * Value: "up", Operator: v1.TolerationOpEqual, }, - v1.Toleration{ + ) + + // Add tolerations to support running of kmm module loader during auto node remediation + if len(devConfig.Spec.RemediationWorkflow.NodeRemediationTaints) > 0 { + for _, taint := range devConfig.Spec.RemediationWorkflow.NodeRemediationTaints { + mod.Spec.Tolerations = append(mod.Spec.Tolerations, v1.Toleration{ + Key: taint.Key, + Operator: v1.TolerationOpExists, + Effect: taint.Effect, + }) + } + } else { + // Default toleration in case no custom taints are specified + mod.Spec.Tolerations = append(mod.Spec.Tolerations, v1.Toleration{ Key: "amd-gpu-unhealthy", Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoSchedule, - }, - ) + }) + } return nil } diff --git a/internal/metricsexporter/metricsexporter.go b/internal/metricsexporter/metricsexporter.go index b3775c1e9..b71d61753 100644 --- a/internal/metricsexporter/metricsexporter.go +++ b/internal/metricsexporter/metricsexporter.go @@ 
-477,17 +477,27 @@ func (nl *metricsExporter) SetMetricsExporterAsDesired(ds *appsv1.DaemonSet, dev } else { ds.Spec.Template.Spec.Tolerations = nil } - // Add tolerations for the node unhealthy conditions - gpuUnhealthyTolerations := []v1.Toleration{ - { + + // Add tolerations to support running of metrics exporter during auto node remediation + if len(devConfig.Spec.RemediationWorkflow.NodeRemediationTaints) > 0 { + for _, taint := range devConfig.Spec.RemediationWorkflow.NodeRemediationTaints { + toleration := v1.Toleration{ + Key: taint.Key, + Operator: v1.TolerationOpExists, + Effect: taint.Effect, + } + ds.Spec.Template.Spec.Tolerations = append(ds.Spec.Template.Spec.Tolerations, toleration) + } + } else { + // Default toleration in case no custom taints are specified + ds.Spec.Template.Spec.Tolerations = append(ds.Spec.Template.Spec.Tolerations, v1.Toleration{ Key: "amd-gpu-unhealthy", Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoSchedule, - }, + }) } - ds.Spec.Template.Spec.Tolerations = append(ds.Spec.Template.Spec.Tolerations, gpuUnhealthyTolerations...) - return controllerutil.SetControllerReference(devConfig, ds, nl.scheme) + return controllerutil.SetControllerReference(devConfig, ds, nl.scheme) } func (nl *metricsExporter) SetMetricsServiceAsDesired(svc *v1.Service, devConfig *amdv1alpha1.DeviceConfig) error { diff --git a/tests/e2e/remediation_test.go b/tests/e2e/remediation_test.go index 9955685fa..8c2e9d099 100644 --- a/tests/e2e/remediation_test.go +++ b/tests/e2e/remediation_test.go @@ -98,6 +98,7 @@ func (s *E2ESuite) populateDeviceConfig(c *C) *v1alpha1.DeviceConfig { devCfg := s.getDeviceConfig(c) devCfg.Spec.Driver.Enable = &driverEnable devCfg.Spec.RemediationWorkflow.Enable = &remediationEnable + devCfg.Spec.RemediationWorkflow.TesterImage = agfhcTestRunnerImage devCfg.Spec.MetricsExporter.Enable = &remediationEnable devCfg.Spec.MetricsExporter.Image = exporterImage devCfg.Spec.MetricsExporter.ImagePullPolicy = "Always" diff --git a/tests/helm-e2e/helm_e2e_test.go b/tests/helm-e2e/helm_e2e_test.go index d7fb32f19..e86f360ff 100644 --- a/tests/helm-e2e/helm_e2e_test.go +++ b/tests/helm-e2e/helm_e2e_test.go @@ -326,6 +326,7 @@ deviceConfig: timeoutSeconds: 600 # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, minus value means follow pod defined grace period gracePeriodSeconds: -2 + ignoreDaemonSets: true podDeletionPolicy: # -- whether force deletion is allowed or not force: false @@ -404,6 +405,7 @@ deviceConfig: Force: &boolFalse, TimeoutSeconds: 600, GracePeriodSeconds: -2, + IgnoreDaemonSets: &boolTrue, }, PodDeletionPolicy: &v1alpha1.PodDeletionSpec{ Force: &boolFalse,