Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ COPY --from=builder /opt/app-root/src/${TARGET} /usr/local/bin/manager
COPY --from=builder /opt/app-root/src/kubectl /usr/local/bin/kubectl
COPY --from=builder /opt/app-root/src/LICENSE /licenses/LICENSE
COPY --from=builder /opt/app-root/src/helm-charts-k8s/crds/deviceconfig-crd.yaml \
/opt/app-root/src/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/node-feature-discovery/crds/nfd-api-crds.yaml \
/opt/app-root/src/helm-charts-k8s/charts/kmm/crds/module-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/kmm/crds/nodemodulesconfig-crd.yaml \
Expand All @@ -63,6 +64,10 @@ COPY --from=builder /opt/app-root/src/helm-charts-openshift/crds/deviceconfig-cr
/opt/app-root/src/helm-charts-openshift/charts/kmm/crds/nodemodulesconfig-crd.yaml \
/opt/helm-charts-crds-openshift/

RUN mkdir -p /remediation
COPY --from=builder /opt/app-root/src/internal/controllers/remediation/configs /remediation/configs
COPY --from=builder /opt/app-root/src/internal/controllers/remediation/scripts /remediation/scripts

RUN microdnf update -y && \
microdnf install -y shadow-utils jq && \
microdnf clean all
Expand Down
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ KMM_OPERATOR_IMG_NAME ?= $(DOCKER_REGISTRY)/kernel-module-management-operator
#######################
# Helm Charts variables
YAML_FILES=bundle/manifests/amd-gpu-operator-node-metrics_rbac.authorization.k8s.io_v1_rolebinding.yaml bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml bundle/manifests/amd-gpu-operator-node-labeller_rbac.authorization.k8s.io_v1_clusterrolebinding.yaml bundle/manifests/amd-gpu-operator-node-metrics_monitoring.coreos.com_v1_servicemonitor.yaml config/samples/amd.com_deviceconfigs.yaml config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml example/deviceconfig_example.yaml config/default/kustomization.yaml
CRD_YAML_FILES = deviceconfig-crd.yaml
CRD_YAML_FILES = deviceconfig-crd.yaml remediationworkflowstatus-crd.yaml
K8S_KMM_CRD_YAML_FILES=module-crd.yaml nodemodulesconfig-crd.yaml
OPENSHIFT_KMM_CRD_YAML_FILES=module-crd.yaml nodemodulesconfig-crd.yaml
OPENSHIFT_CLUSTER_NFD_CRD_YAML_FILES=nodefeature-crd.yaml nodefeaturediscovery-crd.yaml nodefeaturerule-crd.yaml
Expand Down Expand Up @@ -592,6 +592,7 @@ helm-install-openshift:
helm-uninstall-openshift:
echo "Deleting all CRs before uninstalling operator..."
${KUBECTL_CMD} delete deviceconfigs.amd.com -n kube-amd-gpu --all
${KUBECTL_CMD} delete remediationworkflowstatuses.amd.com -n kube-amd-gpu --all
${KUBECTL_CMD} delete nodefeaturediscoveries.nfd.openshift.io -n kube-amd-gpu --all
echo "Uninstalling operator..."
helm uninstall amd-gpu-operator -n kube-amd-gpu
Expand All @@ -602,6 +603,7 @@ helm-install-k8s:
helm-uninstall-k8s:
echo "Deleting all device configs before uninstalling operator..."
${KUBECTL_CMD} delete deviceconfigs.amd.com -n kube-amd-gpu --all
${KUBECTL_CMD} delete remediationworkflowstatuses.amd.com -n kube-amd-gpu --all
echo "Uninstalling operator..."
helm uninstall amd-gpu-operator -n kube-amd-gpu

Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,12 @@ helm install amd-gpu-operator rocm/gpu-operator-charts \
--version=v1.4.0
```

#### Installation Options

* Skip NFD installation: `--set node-feature-discovery.enabled=false`
* Skip KMM installation: `--set kmm.enabled=false`
```{note}
Installation Options
- Skip NFD installation: `--set node-feature-discovery.enabled=false`
- Skip KMM installation: `--set kmm.enabled=false`
- Skip Auto Node Remediation: `--set remediation.enabled=false`
```

> [!WARNING]
> It is strongly recommended to use AMD-optimized KMM images included in the operator release. This is not required when installing the GPU Operator on Red Hat OpenShift.
Expand Down
18 changes: 18 additions & 0 deletions api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ type DriverSpec struct {
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BlacklistDrivers",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:blacklistDrivers"}
Blacklist *bool `json:"blacklist,omitempty"`

// NOTE: currently only for OpenShift cluster
// set to true to use source image to build driver image on the fly
// otherwise use installer debian/rpm packages from radeon repo to build driver image
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="UseSourceImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:useSourceImage"}
UseSourceImage *bool `json:"useSourceImage,omitempty"`

// radeon repo URL for fetching amdgpu installer if building driver image on the fly
// installer URL is https://repo.radeon.com/amdgpu-install by default
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="AMDGPUInstallerRepoURL",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL"}
Expand Down Expand Up @@ -421,11 +427,23 @@ type ImageSignSpec struct {
type ImageBuildSpec struct {
// image registry to fetch base image for building driver image, default value is docker.io, the builder will search for corresponding OS base image from given registry
// e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04
// Use spec.driver.imageRegistrySecret for authentication with private registries.
// NOTE: this field won't apply for OpenShift since OpenShift is using its own DriverToolKit image to build driver image
// +kubebuilder:default=docker.io
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BaseImageRegistry",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:baseImageRegistry"}
BaseImageRegistry string `json:"baseImageRegistry,omitempty"`

// SourceImageRepo specifies the image repository for the driver source code (OpenShift only).
// Used when spec.driver.useSourceImage is true. The operator automatically determines the image tag
// based on cluster RHEL version and spec.driver.version (format: coreos-<rhel>-<driver version>).
// Default: docker.io/rocm/amdgpu-driver
// Use spec.driver.imageRegistrySecret for authentication with private registries.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="SourceImageRepo",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:sourceImageRepo"}
SourceImageRepo string `json:"sourceImageRepo,omitempty"`

// TLS settings for fetching base image
// this field will be applied to SourceImageRepo as well
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BaseImageRegistryTLS",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:baseImageRegistryTLS"}
BaseImageRegistryTLS RegistryTLS `json:"baseImageRegistryTLS,omitempty"`
}

Expand Down
53 changes: 53 additions & 0 deletions api/v1alpha1/remediationwf_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package v1alpha1

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

//+kubebuilder:object:root=true
//+kubebuilder:resource:scope=Namespaced,shortName=rwfstatus
//+kubebuilder:subresource:status

// RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
// +operator-sdk:csv:customresourcedefinitions:displayName="RemediationWorkflowStatus",resources={{Module,v1beta1,modules.kmm.sigs.x-k8s.io},{Daemonset,v1,apps},{services,v1,core},{Pod,v1,core}}
type RemediationWorkflowStatus struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

// Status is a two-level map of string keys to the list of workflow runs recorded under them.
// NOTE(review): the meaning of the outer and inner keys (e.g. node name, then node condition)
// is not visible in this file — confirm against the remediation controller.
Status map[string]map[string][]WorkflowMetadata `json:"status,omitempty"`
}

// WorkflowMetadata is the element type of the lists stored in
// RemediationWorkflowStatus.Status; it identifies one workflow by name and start time.
type WorkflowMetadata struct {
// Name is the workflow's name; it is omitted from JSON when empty.
Name string `json:"name,omitempty"`
// StartTime is the workflow's start time, stored as a string; it is omitted from JSON when empty.
// NOTE(review): the timestamp format (e.g. RFC 3339) is not visible here — confirm against the writer.
StartTime string `json:"startTime,omitempty"`
}

//+kubebuilder:object:root=true

// RemediationWorkflowStatusList contains a list of RemediationWorkflowStatuses
type RemediationWorkflowStatusList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`

// Items holds the RemediationWorkflowStatus objects in this list.
// The "items" key has no omitempty, so it is always present in the serialized form.
Items []RemediationWorkflowStatus `json:"items"`
}

// init registers RemediationWorkflowStatus and RemediationWorkflowStatusList with the
// package-level SchemeBuilder when the package is loaded.
// NOTE(review): SchemeBuilder is declared elsewhere in this package — presumably the
// controller-runtime scheme.Builder; confirm.
func init() {
SchemeBuilder.Register(&RemediationWorkflowStatus{}, &RemediationWorkflowStatusList{})
}
104 changes: 104 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 41 additions & 1 deletion bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ metadata:
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/gpu-operator:v1.4.0
createdAt: "2025-10-24T01:39:09Z"
createdAt: "2025-11-03T10:08:51Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
Expand Down Expand Up @@ -303,6 +303,23 @@ spec:
path: driver.imageBuild
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:imageBuild
- description: 'image registry to fetch base image for building driver image,
default value is docker.io, the builder will search for corresponding OS
base image from given registry e.g. if your worker node is using Ubuntu
22.04, by default the base image would be docker.io/ubuntu:22.04 Use spec.driver.imageRegistrySecret
for authentication with private registries. NOTE: this field won''t apply
for OpenShift since OpenShift is using its own DriverToolKit image to build
driver image'
displayName: BaseImageRegistry
path: driver.imageBuild.baseImageRegistry
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:baseImageRegistry
- description: TLS settings for fetching base image this field will be applied
to SourceImageRepo as well
displayName: BaseImageRegistryTLS
path: driver.imageBuild.baseImageRegistryTLS
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:baseImageRegistryTLS
- description: If true, check if the container image already exists using plain
HTTP.
displayName: Insecure
Expand All @@ -314,6 +331,16 @@ spec:
path: driver.imageBuild.baseImageRegistryTLS.insecureSkipTLSVerify
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:insecureSkipTLSVerify
- description: 'SourceImageRepo specifies the image repository for the driver
source code (OpenShift only). Used when spec.driver.useSourceImage is true.
The operator automatically determines the image tag based on cluster RHEL
version and spec.driver.version (format: coreos-<rhel>-<driver version>).
Default: docker.io/rocm/amdgpu-driver Use spec.driver.imageRegistrySecret
for authentication with private registries.'
displayName: SourceImageRepo
path: driver.imageBuild.sourceImageRepo
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:sourceImageRepo
- description: secrets used for pull/push images from/to private registry specified
in driversImage
displayName: ImageRegistrySecret
Expand Down Expand Up @@ -429,6 +456,13 @@ spec:
path: driver.upgradePolicy.rebootRequired
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:rebootRequired
- description: 'NOTE: currently only for OpenShift cluster set to true to use
source image to build driver image on the fly otherwise use installer debian/rpm
packages from radeon repo to build driver image'
displayName: UseSourceImage
path: driver.useSourceImage
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:useSourceImage
- description: 'version of the drivers source code, can be used as part of image
of dockerfile source image default value for different OS is: ubuntu: 6.1.3,
coreOS: 6.2.2'
Expand Down Expand Up @@ -850,6 +884,9 @@ spec:
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus
version: v1alpha1
- kind: RemediationWorkflowStatus
name: remediationworkflowstatuses.amd.com
version: v1alpha1
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
Expand Down Expand Up @@ -1000,6 +1037,7 @@ spec:
- amd.com
resources:
- deviceconfigs
- remediationworkflowstatuses
verbs:
- create
- get
Expand All @@ -1011,12 +1049,14 @@ spec:
- amd.com
resources:
- deviceconfigs/finalizers
- remediationworkflowstatuses/finalizers
verbs:
- update
- apiGroups:
- amd.com
resources:
- deviceconfigs/status
- remediationworkflowstatuses/status
verbs:
- get
- patch
Expand Down
Loading
Loading