ROCm · sajmera-pensando · Jan 30, 2026 · Jan 23, 2026 · Jan 23, 2026 · Jan 30, 2026
diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go
@@ -108,6 +108,22 @@ type RemediationWorkflowSpec struct {
 	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="MaxParallelWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows"}
 	// +optional
 	MaxParallelWorkflows int `json:"maxParallelWorkflows"`
+
+	// Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator.
+	// If the user does not specify any taints, the operator applies a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation.
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeRemediationTaints",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationTaints"}
+	// +optional
+	NodeRemediationTaints []v1.Taint `json:"nodeRemediationTaints,omitempty"`
+
+	// Node Remediation labels are custom labels that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator.
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeRemediationLabels",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationLabels"}
+	// +optional
+	NodeRemediationLabels map[string]string `json:"nodeRemediationLabels,omitempty"`
+
+	// NodeDrainPolicy specifies how the node is drained during remediation workflow execution.
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeDrainPolicy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy"}
+	// +optional
+	NodeDrainPolicy *DrainSpec `json:"nodeDrainPolicy,omitempty"`
 }
 
 type RegistryTLS struct {
@@ -319,6 +335,14 @@ type DrainSpec struct {
 	// +optional
 	// +kubebuilder:default:=-1
 	GracePeriodSeconds int `json:"gracePeriodSeconds,omitempty"`
+	// IgnoreDaemonSets indicates whether to ignore DaemonSet-managed pods
+	// +optional
+	// +kubebuilder:default:=true
+	IgnoreDaemonSets *bool `json:"ignoreDaemonSets,omitempty"`
+	// IgnoreNamespaces is the list of namespaces to ignore during node drain operation.
+	// This is useful to keep pods in critical namespaces such as 'kube-system' from being removed during the drain.
+	// +optional
+	IgnoreNamespaces []string `json:"ignoreNamespaces,omitempty"`
 }
 
 type PodDeletionSpec struct {

diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -32,7 +32,7 @@ metadata:
     capabilities: Seamless Upgrades
     categories: AI/Machine Learning,Monitoring
     containerImage: docker.io/rocm/gpu-operator:v1.4.0
-    createdAt: "2025-12-09T09:27:50Z"
+    createdAt: "2026-01-28T11:30:39Z"
     description: |-
       Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
       For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -725,6 +725,27 @@ spec:
         path: remediationWorkflow.maxParallelWorkflows
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
+      - description: Node drain policy during remediation workflow execution
+        displayName: NodeDrainPolicy
+        path: remediationWorkflow.nodeDrainPolicy
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy
+      - description: Node Remediation labels are custom labels that we can apply on
+          the node to specify that the node is undergoing remediation or needs attention
+          by the administrator.
+        displayName: NodeRemediationLabels
+        path: remediationWorkflow.nodeRemediationLabels
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationLabels
+      - description: Node Remediation taints are custom taints that we can apply on
+          the node to specify that the node is undergoing remediation or needs attention
+          by the administrator. If user does not specify any taints, the operator
+          will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule"
+          to the node under remediation.
+        displayName: NodeRemediationTaints
+        path: remediationWorkflow.nodeRemediationTaints
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationTaints
       - description: Tester image used to run tests and verify if remediation fixed
           the reported problem.
         displayName: TesterImage

diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml
@@ -596,6 +596,18 @@ spec:
                               waits for a pod to shut down gracefully after receiving
                               a termination signal
                             type: integer
+                          ignoreDaemonSets:
+                            default: true
+                            description: IgnoreDaemonSets indicates whether to ignore
+                              DaemonSet-managed pods
+                            type: boolean
+                          ignoreNamespaces:
+                            description: |-
+                              IgnoreNamespaces is the list of namespaces to ignore during node drain operation.
+                              This is useful to avoid draining pods from critical namespaces like 'kube-system', etc.
+                            items:
+                              type: string
+                            type: array
                           timeoutSeconds:
                             default: 300
                             description: TimeoutSecond specifies the length of time
@@ -1399,6 +1411,79 @@ spec:
                       remediation workflows can be executed in parallel. 0 is the
                       default value and it means no limit.
                     type: integer
+                  nodeDrainPolicy:
+                    description: Node drain policy during remediation workflow execution
+                    properties:
+                      force:
+                        default: false
+                        description: Force indicates if force draining is allowed
+                        type: boolean
+                      gracePeriodSeconds:
+                        default: -1
+                        description: GracePeriodSeconds indicates the time kubernetes
+                          waits for a pod to shut down gracefully after receiving
+                          a termination signal
+                        type: integer
+                      ignoreDaemonSets:
+                        default: true
+                        description: IgnoreDaemonSets indicates whether to ignore
+                          DaemonSet-managed pods
+                        type: boolean
+                      ignoreNamespaces:
+                        description: |-
+                          IgnoreNamespaces is the list of namespaces to ignore during node drain operation.
+                          This is useful to avoid draining pods from critical namespaces like 'kube-system', etc.
+                        items:
+                          type: string
+                        type: array
+                      timeoutSeconds:
+                        default: 300
+                        description: TimeoutSecond specifies the length of time in
+                          seconds to wait before giving up drain, zero means infinite
+                        minimum: 0
+                        type: integer
+                    type: object
+                  nodeRemediationLabels:
+                    additionalProperties:
+                      type: string
+                    description: Node Remediation labels are custom labels that we
+                      can apply on the node to specify that the node is undergoing
+                      remediation or needs attention by the administrator.
+                    type: object
+                  nodeRemediationTaints:
+                    description: |-
+                      Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator.
+                      If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation.
+                    items:
+                      description: |-
+                        The node this Taint is attached to has the "effect" on
+                        any pod that does not tolerate the Taint.
+                      properties:
+                        effect:
+                          description: |-
+                            Required. The effect of the taint on pods
+                            that do not tolerate the taint.
+                            Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
+                          type: string
+                        key:
+                          description: Required. The taint key to be applied to a
+                            node.
+                          type: string
+                        timeAdded:
+                          description: |-
+                            TimeAdded represents the time at which the taint was added.
+                            It is only written for NoExecute taints.
+                          format: date-time
+                          type: string
+                        value:
+                          description: The taint value corresponding to the taint
+                            key.
+                          type: string
+                      required:
+                      - effect
+                      - key
+                      type: object
+                    type: array
                   testerImage:
                     description: Tester image used to run tests and verify if remediation
                       fixed the reported problem.

diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml
@@ -592,6 +592,18 @@ spec:
                               waits for a pod to shut down gracefully after receiving
                               a termination signal
                             type: integer
+                          ignoreDaemonSets:
+                            default: true
+                            description: IgnoreDaemonSets indicates whether to ignore
+                              DaemonSet-managed pods
+                            type: boolean
+                          ignoreNamespaces:
+                            description: |-
+                              IgnoreNamespaces is the list of namespaces to ignore during node drain operation.
+                              This is useful to avoid draining pods from critical namespaces like 'kube-system', etc.
+                            items:
+                              type: string
+                            type: array
                           timeoutSeconds:
                             default: 300
                             description: TimeoutSecond specifies the length of time
@@ -1395,6 +1407,79 @@ spec:
                       remediation workflows can be executed in parallel. 0 is the
                       default value and it means no limit.
                     type: integer
+                  nodeDrainPolicy:
+                    description: Node drain policy during remediation workflow execution
+                    properties:
+                      force:
+                        default: false
+                        description: Force indicates if force draining is allowed
+                        type: boolean
+                      gracePeriodSeconds:
+                        default: -1
+                        description: GracePeriodSeconds indicates the time kubernetes
+                          waits for a pod to shut down gracefully after receiving
+                          a termination signal
+                        type: integer
+                      ignoreDaemonSets:
+                        default: true
+                        description: IgnoreDaemonSets indicates whether to ignore
+                          DaemonSet-managed pods
+                        type: boolean
+                      ignoreNamespaces:
+                        description: |-
+                          IgnoreNamespaces is the list of namespaces to ignore during node drain operation.
+                          This is useful to avoid draining pods from critical namespaces like 'kube-system', etc.
+                        items:
+                          type: string
+                        type: array
+                      timeoutSeconds:
+                        default: 300
+                        description: TimeoutSecond specifies the length of time in
+                          seconds to wait before giving up drain, zero means infinite
+                        minimum: 0
+                        type: integer
+                    type: object
+                  nodeRemediationLabels:
+                    additionalProperties:
+                      type: string
+                    description: Node Remediation labels are custom labels that we
+                      can apply on the node to specify that the node is undergoing
+                      remediation or needs attention by the administrator.
+                    type: object
+                  nodeRemediationTaints:
+                    description: |-
+                      Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator.
+                      If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation.
+                    items:
+                      description: |-
+                        The node this Taint is attached to has the "effect" on
+                        any pod that does not tolerate the Taint.
+                      properties:
+                        effect:
+                          description: |-
+                            Required. The effect of the taint on pods
+                            that do not tolerate the taint.
+                            Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
+                          type: string
+                        key:
+                          description: Required. The taint key to be applied to a
+                            node.
+                          type: string
+                        timeAdded:
+                          description: |-
+                            TimeAdded represents the time at which the taint was added.
+                            It is only written for NoExecute taints.
+                          format: date-time
+                          type: string
+                        value:
+                          description: The taint value corresponding to the taint
+                            key.
+                          type: string
+                      required:
+                      - effect
+                      - key
+                      type: object
+                    type: array
                   testerImage:
                     description: Tester image used to run tests and verify if remediation
                       fixed the reported problem.

diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
@@ -696,6 +696,27 @@ spec:
         path: remediationWorkflow.maxParallelWorkflows
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
+      - description: Node drain policy during remediation workflow execution
+        displayName: NodeDrainPolicy
+        path: remediationWorkflow.nodeDrainPolicy
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy
+      - description: Node Remediation labels are custom labels that we can apply on
+          the node to specify that the node is undergoing remediation or needs attention
+          by the administrator.
+        displayName: NodeRemediationLabels
+        path: remediationWorkflow.nodeRemediationLabels
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationLabels
+      - description: Node Remediation taints are custom taints that we can apply on
+          the node to specify that the node is undergoing remediation or needs attention
+          by the administrator. If user does not specify any taints, the operator
+          will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule"
+          to the node under remediation.
+        displayName: NodeRemediationTaints
+        path: remediationWorkflow.nodeRemediationTaints
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationTaints
       - description: Tester image used to run tests and verify if remediation fixed
           the reported problem.
         displayName: TesterImage