Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,22 @@ type RemediationWorkflowSpec struct {
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="MaxParallelWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows"}
// +optional
MaxParallelWorkflows int `json:"maxParallelWorkflows"`

// Node remediation taints are custom taints applied to a node to indicate that the node is undergoing remediation or needs attention from the administrator.
// If the user does not specify any taints, the operator applies a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeRemediationTaints",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationTaints"}
// +optional
NodeRemediationTaints []v1.Taint `json:"nodeRemediationTaints,omitempty"`

// Node remediation labels are custom labels applied to a node to indicate that the node is undergoing remediation or needs attention from the administrator.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeRemediationLabels",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationLabels"}
// +optional
NodeRemediationLabels map[string]string `json:"nodeRemediationLabels,omitempty"`

// Node drain policy during remediation workflow execution
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeDrainPolicy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy"}
// +optional
NodeDrainPolicy *DrainSpec `json:"nodeDrainPolicy,omitempty"`
}

type RegistryTLS struct {
Expand Down Expand Up @@ -319,6 +335,14 @@ type DrainSpec struct {
// +optional
// +kubebuilder:default:=-1
GracePeriodSeconds int `json:"gracePeriodSeconds,omitempty"`
// IgnoreDaemonSets indicates whether to ignore DaemonSet-managed pods
// +optional
// +kubebuilder:default:=true
IgnoreDaemonSets *bool `json:"ignoreDaemonSets,omitempty"`
// IgnoreNamespaces is the list of namespaces to ignore during the node drain operation.
// This is useful to avoid draining pods from critical namespaces such as 'kube-system'.
// +optional
IgnoreNamespaces []string `json:"ignoreNamespaces,omitempty"`
}

type PodDeletionSpec struct {
Expand Down
29 changes: 29 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 22 additions & 1 deletion bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ metadata:
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/gpu-operator:v1.4.0
createdAt: "2025-12-09T09:27:50Z"
createdAt: "2026-01-28T11:30:39Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
Expand Down Expand Up @@ -725,6 +725,27 @@ spec:
path: remediationWorkflow.maxParallelWorkflows
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
- description: Node drain policy during remediation workflow execution
displayName: NodeDrainPolicy
path: remediationWorkflow.nodeDrainPolicy
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy
- description: Node Remediation labels are custom labels that we can apply on
the node to specify that the node is undergoing remediation or needs attention
by the administrator.
displayName: NodeRemediationLabels
path: remediationWorkflow.nodeRemediationLabels
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationLabels
- description: Node Remediation taints are custom taints that we can apply on
the node to specify that the node is undergoing remediation or needs attention
by the administrator. If user does not specify any taints, the operator
will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule"
to the node under remediation.
displayName: NodeRemediationTaints
path: remediationWorkflow.nodeRemediationTaints
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationTaints
- description: Tester image used to run tests and verify if remediation fixed
the reported problem.
displayName: TesterImage
Expand Down
85 changes: 85 additions & 0 deletions bundle/manifests/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,18 @@ spec:
waits for a pod to shut down gracefully after receiving
a termination signal
type: integer
ignoreDaemonSets:
default: true
description: IgnoreDaemonSets indicates whether to ignore
DaemonSet-managed pods
type: boolean
ignoreNamespaces:
description: |-
IgnoreNamespaces is the list of namespaces to ignore during node drain operation.
This is useful to avoid draining pods from critical namespaces like 'kube-system', etc.
items:
type: string
type: array
timeoutSeconds:
default: 300
description: TimeoutSecond specifies the length of time
Expand Down Expand Up @@ -1399,6 +1411,79 @@ spec:
remediation workflows can be executed in parallel. 0 is the
default value and it means no limit.
type: integer
nodeDrainPolicy:
description: Node drain policy during remediation workflow execution
properties:
force:
default: false
description: Force indicates if force draining is allowed
type: boolean
gracePeriodSeconds:
default: -1
description: GracePeriodSeconds indicates the time kubernetes
waits for a pod to shut down gracefully after receiving
a termination signal
type: integer
ignoreDaemonSets:
default: true
description: IgnoreDaemonSets indicates whether to ignore
DaemonSet-managed pods
type: boolean
ignoreNamespaces:
description: |-
IgnoreNamespaces is the list of namespaces to ignore during node drain operation.
This is useful to avoid draining pods from critical namespaces like 'kube-system', etc.
items:
type: string
type: array
timeoutSeconds:
default: 300
description: TimeoutSecond specifies the length of time in
seconds to wait before giving up drain, zero means infinite
minimum: 0
type: integer
type: object
nodeRemediationLabels:
additionalProperties:
type: string
description: Node Remediation labels are custom labels that we
can apply on the node to specify that the node is undergoing
remediation or needs attention by the administrator.
type: object
nodeRemediationTaints:
description: |-
Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator.
If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation.
items:
description: |-
The node this Taint is attached to has the "effect" on
any pod that does not tolerate the Taint.
properties:
effect:
description: |-
Required. The effect of the taint on pods
that do not tolerate the taint.
Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: Required. The taint key to be applied to a
node.
type: string
timeAdded:
description: |-
TimeAdded represents the time at which the taint was added.
It is only written for NoExecute taints.
format: date-time
type: string
value:
description: The taint value corresponding to the taint
key.
type: string
required:
- effect
- key
type: object
type: array
testerImage:
description: Tester image used to run tests and verify if remediation
fixed the reported problem.
Expand Down
85 changes: 85 additions & 0 deletions config/crd/bases/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,18 @@ spec:
waits for a pod to shut down gracefully after receiving
a termination signal
type: integer
ignoreDaemonSets:
default: true
description: IgnoreDaemonSets indicates whether to ignore
DaemonSet-managed pods
type: boolean
ignoreNamespaces:
description: |-
IgnoreNamespaces is the list of namespaces to ignore during node drain operation.
This is useful to avoid draining pods from critical namespaces like 'kube-system', etc.
items:
type: string
type: array
timeoutSeconds:
default: 300
description: TimeoutSecond specifies the length of time
Expand Down Expand Up @@ -1395,6 +1407,79 @@ spec:
remediation workflows can be executed in parallel. 0 is the
default value and it means no limit.
type: integer
nodeDrainPolicy:
description: Node drain policy during remediation workflow execution
properties:
force:
default: false
description: Force indicates if force draining is allowed
type: boolean
gracePeriodSeconds:
default: -1
description: GracePeriodSeconds indicates the time kubernetes
waits for a pod to shut down gracefully after receiving
a termination signal
type: integer
ignoreDaemonSets:
default: true
description: IgnoreDaemonSets indicates whether to ignore
DaemonSet-managed pods
type: boolean
ignoreNamespaces:
description: |-
IgnoreNamespaces is the list of namespaces to ignore during node drain operation.
This is useful to avoid draining pods from critical namespaces like 'kube-system', etc.
items:
type: string
type: array
timeoutSeconds:
default: 300
description: TimeoutSecond specifies the length of time in
seconds to wait before giving up drain, zero means infinite
minimum: 0
type: integer
type: object
nodeRemediationLabels:
additionalProperties:
type: string
description: Node Remediation labels are custom labels that we
can apply on the node to specify that the node is undergoing
remediation or needs attention by the administrator.
type: object
nodeRemediationTaints:
description: |-
Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator.
If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation.
items:
description: |-
The node this Taint is attached to has the "effect" on
any pod that does not tolerate the Taint.
properties:
effect:
description: |-
Required. The effect of the taint on pods
that do not tolerate the taint.
Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: Required. The taint key to be applied to a
node.
type: string
timeAdded:
description: |-
TimeAdded represents the time at which the taint was added.
It is only written for NoExecute taints.
format: date-time
type: string
value:
description: The taint value corresponding to the taint
key.
type: string
required:
- effect
- key
type: object
type: array
testerImage:
description: Tester image used to run tests and verify if remediation
fixed the reported problem.
Expand Down
21 changes: 21 additions & 0 deletions config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,27 @@ spec:
path: remediationWorkflow.maxParallelWorkflows
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
- description: Node drain policy during remediation workflow execution
displayName: NodeDrainPolicy
path: remediationWorkflow.nodeDrainPolicy
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy
- description: Node Remediation labels are custom labels that we can apply on
the node to specify that the node is undergoing remediation or needs attention
by the administrator.
displayName: NodeRemediationLabels
path: remediationWorkflow.nodeRemediationLabels
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationLabels
- description: Node Remediation taints are custom taints that we can apply on
the node to specify that the node is undergoing remediation or needs attention
by the administrator. If user does not specify any taints, the operator
will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule"
to the node under remediation.
displayName: NodeRemediationTaints
path: remediationWorkflow.nodeRemediationTaints
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeRemediationTaints
- description: Tester image used to run tests and verify if remediation fixed
the reported problem.
displayName: TesterImage
Expand Down
Loading
Loading