diff --git a/Dockerfile b/Dockerfile index 4bdcaef51..76a5d4440 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,6 +51,7 @@ COPY --from=builder /opt/app-root/src/${TARGET} /usr/local/bin/manager COPY --from=builder /opt/app-root/src/kubectl /usr/local/bin/kubectl COPY --from=builder /opt/app-root/src/LICENSE /licenses/LICENSE COPY --from=builder /opt/app-root/src/helm-charts-k8s/crds/deviceconfig-crd.yaml \ + /opt/app-root/src/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml \ /opt/app-root/src/helm-charts-k8s/charts/node-feature-discovery/crds/nfd-api-crds.yaml \ /opt/app-root/src/helm-charts-k8s/charts/kmm/crds/module-crd.yaml \ /opt/app-root/src/helm-charts-k8s/charts/kmm/crds/nodemodulesconfig-crd.yaml \ @@ -63,6 +64,10 @@ COPY --from=builder /opt/app-root/src/helm-charts-openshift/crds/deviceconfig-cr /opt/app-root/src/helm-charts-openshift/charts/kmm/crds/nodemodulesconfig-crd.yaml \ /opt/helm-charts-crds-openshift/ +RUN mkdir -p /remediation +COPY --from=builder /opt/app-root/src/internal/controllers/remediation/configs /remediation/configs +COPY --from=builder /opt/app-root/src/internal/controllers/remediation/scripts /remediation/scripts + RUN microdnf update -y && \ microdnf install -y shadow-utils jq && \ microdnf clean all diff --git a/Makefile b/Makefile index 9a7397337..bccf85840 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ KMM_OPERATOR_IMG_NAME ?= $(DOCKER_REGISTRY)/kernel-module-management-operator ####################### # Helm Charts variables YAML_FILES=bundle/manifests/amd-gpu-operator-node-metrics_rbac.authorization.k8s.io_v1_rolebinding.yaml bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml bundle/manifests/amd-gpu-operator-node-labeller_rbac.authorization.k8s.io_v1_clusterrolebinding.yaml bundle/manifests/amd-gpu-operator-node-metrics_monitoring.coreos.com_v1_servicemonitor.yaml config/samples/amd.com_deviceconfigs.yaml config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml 
example/deviceconfig_example.yaml config/default/kustomization.yaml -CRD_YAML_FILES = deviceconfig-crd.yaml +CRD_YAML_FILES = deviceconfig-crd.yaml remediationworkflowstatus-crd.yaml K8S_KMM_CRD_YAML_FILES=module-crd.yaml nodemodulesconfig-crd.yaml OPENSHIFT_KMM_CRD_YAML_FILES=module-crd.yaml nodemodulesconfig-crd.yaml OPENSHIFT_CLUSTER_NFD_CRD_YAML_FILES=nodefeature-crd.yaml nodefeaturediscovery-crd.yaml nodefeaturerule-crd.yaml @@ -592,6 +592,7 @@ helm-install-openshift: helm-uninstall-openshift: echo "Deleting all CRs before uninstalling operator..." ${KUBECTL_CMD} delete deviceconfigs.amd.com -n kube-amd-gpu --all + ${KUBECTL_CMD} delete remediationworkflowstatuses.amd.com -n kube-amd-gpu --all ${KUBECTL_CMD} delete nodefeaturediscoveries.nfd.openshift.io -n kube-amd-gpu --all echo "Uninstalling operator..." helm uninstall amd-gpu-operator -n kube-amd-gpu @@ -602,6 +603,7 @@ helm-install-k8s: helm-uninstall-k8s: echo "Deleting all device configs before uninstalling operator..." ${KUBECTL_CMD} delete deviceconfigs.amd.com -n kube-amd-gpu --all + ${KUBECTL_CMD} delete remediationworkflowstatuses.amd.com -n kube-amd-gpu --all echo "Uninstalling operator..." helm uninstall amd-gpu-operator -n kube-amd-gpu diff --git a/README.md b/README.md index 6abd919e4..f753cb0a4 100644 --- a/README.md +++ b/README.md @@ -70,10 +70,12 @@ helm install amd-gpu-operator rocm/gpu-operator-charts \ --version=v1.4.0 ``` -#### Installation Options - -* Skip NFD installation: `--set node-feature-discovery.enabled=false` -* Skip KMM installation: `--set kmm.enabled=false` +```{note} +Installation Options + - Skip NFD installation: `--set node-feature-discovery.enabled=false` + - Skip KMM installation: `--set kmm.enabled=false` + - Skip Auto Node Remediation: `--set remediation.enabled=false` +``` > [!WARNING] > It is strongly recommended to use AMD-optimized KMM images included in the operator release. This is not required when installing the GPU Operator on Red Hat OpenShift. 
diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 076101321..3b6c6ea68 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -154,6 +154,12 @@ type DriverSpec struct { //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BlacklistDrivers",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:blacklistDrivers"} Blacklist *bool `json:"blacklist,omitempty"` + // NOTE: currently only for OpenShift cluster + // set to true to use source image to build driver image on the fly + // otherwise use installer debian/rpm packages from radeon repo to build driver image + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="UseSourceImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:useSourceImage"} + UseSourceImage *bool `json:"useSourceImage,omitempty"` + // radeon repo URL for fetching amdgpu installer if building driver image on the fly // installer URL is https://repo.radeon.com/amdgpu-install by default //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="AMDGPUInstallerRepoURL",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL"} @@ -421,11 +427,23 @@ type ImageSignSpec struct { type ImageBuildSpec struct { // image registry to fetch base image for building driver image, default value is docker.io, the builder will search for corresponding OS base image from given registry // e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04 + // Use spec.driver.imageRegistrySecret for authentication with private registries. 
// NOTE: this field won't apply for OpenShift since OpenShift is using its own DriverToolKit image to build driver image // +kubebuilder:default=docker.io + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BaseImageRegistry",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:baseImageRegistry"} BaseImageRegistry string `json:"baseImageRegistry,omitempty"` + // SourceImageRepo specifies the image repository for the driver source code (OpenShift only). + // Used when spec.driver.useSourceImage is true. The operator automatically determines the image tag + // based on cluster RHEL version and spec.driver.version (format: coreos--). + // Default: docker.io/rocm/amdgpu-driver + // Use spec.driver.imageRegistrySecret for authentication with private registries. + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="SourceImageRepo",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:sourceImageRepo"} + SourceImageRepo string `json:"sourceImageRepo,omitempty"` + // TLS settings for fetching base image + // this field will be applied to SourceImageRepo as well + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BaseImageRegistryTLS",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:baseImageRegistryTLS"} BaseImageRegistryTLS RegistryTLS `json:"baseImageRegistryTLS,omitempty"` } diff --git a/api/v1alpha1/remediationwf_types.go b/api/v1alpha1/remediationwf_types.go new file mode 100644 index 000000000..36234ca25 --- /dev/null +++ b/api/v1alpha1/remediationwf_types.go @@ -0,0 +1,53 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the \"License\"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an \"AS IS\" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +//+kubebuilder:object:root=true +//+kubebuilder:resource:scope=Namespaced,shortName=rwfstatus +//+kubebuilder:subresource:status + +// RemediationWorkflowStatus keeps a record of recent remediation workflow runs. +// +operator-sdk:csv:customresourcedefinitions:displayName="RemediationWorkflowStatus",resources={{Module,v1beta1,modules.kmm.sigs.x-k8s.io},{Daemonset,v1,apps},{services,v1,core},{Pod,v1,core}} +type RemediationWorkflowStatus struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Status map[string]map[string][]WorkflowMetadata `json:"status,omitempty"` +} + +type WorkflowMetadata struct { + Name string `json:"name,omitempty"` + StartTime string `json:"startTime,omitempty"` +} + +//+kubebuilder:object:root=true + +// RemediationWorkflowStatusList contains a list of RemediationWorkflowStatuses +type RemediationWorkflowStatusList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + + Items []RemediationWorkflowStatus `json:"items"` +} + +func init() { + SchemeBuilder.Register(&RemediationWorkflowStatus{}, &RemediationWorkflowStatusList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index e1a2aa01a..3dbd9d698 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -350,6 +350,11 @@ func (in *DriverSpec) DeepCopyInto(out *DriverSpec) { *out = new(bool) **out = **in } + if in.UseSourceImage != nil { + 
in, out := &in.UseSourceImage, &out.UseSourceImage + *out = new(bool) + **out = **in + } in.ImageRegistryTLS.DeepCopyInto(&out.ImageRegistryTLS) if in.ImageRegistrySecret != nil { in, out := &in.ImageRegistrySecret, &out.ImageRegistrySecret @@ -745,6 +750,90 @@ func (in *RemediationWorkflowSpec) DeepCopy() *RemediationWorkflowSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RemediationWorkflowStatus) DeepCopyInto(out *RemediationWorkflowStatus) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + if in.Status != nil { + in, out := &in.Status, &out.Status + *out = make(map[string]map[string][]WorkflowMetadata, len(*in)) + for key, val := range *in { + var outVal map[string][]WorkflowMetadata + if val == nil { + (*out)[key] = nil + } else { + inVal := (*in)[key] + in, out := &inVal, &outVal + *out = make(map[string][]WorkflowMetadata, len(*in)) + for key, val := range *in { + var outVal []WorkflowMetadata + if val == nil { + (*out)[key] = nil + } else { + inVal := (*in)[key] + in, out := &inVal, &outVal + *out = make([]WorkflowMetadata, len(*in)) + copy(*out, *in) + } + (*out)[key] = outVal + } + } + (*out)[key] = outVal + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemediationWorkflowStatus. +func (in *RemediationWorkflowStatus) DeepCopy() *RemediationWorkflowStatus { + if in == nil { + return nil + } + out := new(RemediationWorkflowStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *RemediationWorkflowStatus) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *RemediationWorkflowStatusList) DeepCopyInto(out *RemediationWorkflowStatusList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]RemediationWorkflowStatus, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemediationWorkflowStatusList. +func (in *RemediationWorkflowStatusList) DeepCopy() *RemediationWorkflowStatusList { + if in == nil { + return nil + } + out := new(RemediationWorkflowStatusList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *RemediationWorkflowStatusList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ServiceMonitorConfig) DeepCopyInto(out *ServiceMonitorConfig) { *out = *in @@ -915,3 +1004,18 @@ func (in *VFIOConfigSpec) DeepCopy() *VFIOConfigSpec { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WorkflowMetadata) DeepCopyInto(out *WorkflowMetadata) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkflowMetadata. 
+func (in *WorkflowMetadata) DeepCopy() *WorkflowMetadata { + if in == nil { + return nil + } + out := new(WorkflowMetadata) + in.DeepCopyInto(out) + return out +} diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index e2a21da67..94294cea1 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -32,7 +32,7 @@ metadata: capabilities: Seamless Upgrades categories: AI/Machine Learning,Monitoring containerImage: docker.io/rocm/gpu-operator:v1.4.0 - createdAt: "2025-10-24T01:39:09Z" + createdAt: "2025-11-03T10:08:51Z" description: |- Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) @@ -303,6 +303,23 @@ spec: path: driver.imageBuild x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:imageBuild + - description: 'image registry to fetch base image for building driver image, + default value is docker.io, the builder will search for corresponding OS + base image from given registry e.g. if your worker node is using Ubuntu + 22.04, by default the base image would be docker.io/ubuntu:22.04 Use spec.driver.imageRegistrySecret + for authentication with private registries. 
NOTE: this field won''t apply + for OpenShift since OpenShift is using its own DriverToolKit image to build + driver image' + displayName: BaseImageRegistry + path: driver.imageBuild.baseImageRegistry + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:baseImageRegistry + - description: TLS settings for fetching base image this field will be applied + to SourceImageRepo as well + displayName: BaseImageRegistryTLS + path: driver.imageBuild.baseImageRegistryTLS + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:baseImageRegistryTLS - description: If true, check if the container image already exists using plain HTTP. displayName: Insecure @@ -314,6 +331,16 @@ spec: path: driver.imageBuild.baseImageRegistryTLS.insecureSkipTLSVerify x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:insecureSkipTLSVerify + - description: 'SourceImageRepo specifies the image repository for the driver + source code (OpenShift only). Used when spec.driver.useSourceImage is true. + The operator automatically determines the image tag based on cluster RHEL + version and spec.driver.version (format: coreos--). + Default: docker.io/rocm/amdgpu-driver Use spec.driver.imageRegistrySecret + for authentication with private registries.' 
+ displayName: SourceImageRepo + path: driver.imageBuild.sourceImageRepo + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:sourceImageRepo - description: secrets used for pull/push images from/to private registry specified in driversImage displayName: ImageRegistrySecret @@ -429,6 +456,13 @@ spec: path: driver.upgradePolicy.rebootRequired x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:rebootRequired + - description: 'NOTE: currently only for OpenShift cluster set to true to use + source image to build driver image on the fly otherwise use installer debian/rpm + packages from radeon repo to build driver image' + displayName: UseSourceImage + path: driver.useSourceImage + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:useSourceImage - description: 'version of the drivers source code, can be used as part of image of dockerfile source image default value for different OS is: ubuntu: 6.1.3, coreOS: 6.2.2' @@ -850,6 +884,9 @@ spec: x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus version: v1alpha1 + - kind: RemediationWorkflowStatus + name: remediationworkflowstatuses.amd.com + version: v1alpha1 description: |- Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) @@ -1000,6 +1037,7 @@ spec: - amd.com resources: - deviceconfigs + - remediationworkflowstatuses verbs: - create - get @@ -1011,12 +1049,14 @@ spec: - amd.com resources: - deviceconfigs/finalizers + - remediationworkflowstatuses/finalizers verbs: - update - apiGroups: - amd.com resources: - deviceconfigs/status + - remediationworkflowstatuses/status verbs: - get - patch diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index f237cdf77..d4941f3b7 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ 
b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -404,10 +404,13 @@ spec: description: |- image registry to fetch base image for building driver image, default value is docker.io, the builder will search for corresponding OS base image from given registry e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04 + Use spec.driver.imageRegistrySecret for authentication with private registries. NOTE: this field won't apply for OpenShift since OpenShift is using its own DriverToolKit image to build driver image type: string baseImageRegistryTLS: - description: TLS settings for fetching base image + description: |- + TLS settings for fetching base image + this field will be applied to SourceImageRepo as well properties: insecure: description: If true, check if the container image already @@ -418,6 +421,14 @@ spec: validation type: boolean type: object + sourceImageRepo: + description: |- + SourceImageRepo specifies the image repository for the driver source code (OpenShift only). + Used when spec.driver.useSourceImage is true. The operator automatically determines the image tag + based on cluster RHEL version and spec.driver.version (format: coreos--). + Default: docker.io/rocm/amdgpu-driver + Use spec.driver.imageRegistrySecret for authentication with private registries. 
+ type: string type: object imageRegistrySecret: description: secrets used for pull/push images from/to private @@ -623,6 +634,12 @@ spec: to perform reboot on worker nodes type: boolean type: object + useSourceImage: + description: |- + NOTE: currently only for OpenShift cluster + set to true to use source image to build driver image on the fly + otherwise use installer debian/rpm packages from radeon repo to build driver image + type: boolean version: description: |- version of the drivers source code, can be used as part of image of dockerfile source image diff --git a/bundle/manifests/amd.com_remediationworkflowstatuses.yaml b/bundle/manifests/amd.com_remediationworkflowstatuses.yaml new file mode 100644 index 000000000..7becfc992 --- /dev/null +++ b/bundle/manifests/amd.com_remediationworkflowstatuses.yaml @@ -0,0 +1,69 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.0 + creationTimestamp: null + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/name: amd-gpu + app.kubernetes.io/part-of: amd-gpu + name: remediationworkflowstatuses.amd.com +spec: + group: amd.com + names: + kind: RemediationWorkflowStatus + listKind: RemediationWorkflowStatusList + plural: remediationworkflowstatuses + shortNames: + - rwfstatus + singular: remediationworkflowstatus + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: RemediationWorkflowStatus keeps a record of recent remediation + workflow runs. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + status: + additionalProperties: + additionalProperties: + items: + properties: + name: + type: string + startTime: + type: string + type: object + type: array + type: object + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index b6ea3159e..c4bf1868a 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -400,10 +400,13 @@ spec: description: |- image registry to fetch base image for building driver image, default value is docker.io, the builder will search for corresponding OS base image from given registry e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04 + Use spec.driver.imageRegistrySecret for authentication with private registries. 
NOTE: this field won't apply for OpenShift since OpenShift is using its own DriverToolKit image to build driver image type: string baseImageRegistryTLS: - description: TLS settings for fetching base image + description: |- + TLS settings for fetching base image + this field will be applied to SourceImageRepo as well properties: insecure: description: If true, check if the container image already @@ -414,6 +417,14 @@ spec: validation type: boolean type: object + sourceImageRepo: + description: |- + SourceImageRepo specifies the image repository for the driver source code (OpenShift only). + Used when spec.driver.useSourceImage is true. The operator automatically determines the image tag + based on cluster RHEL version and spec.driver.version (format: coreos--). + Default: docker.io/rocm/amdgpu-driver + Use spec.driver.imageRegistrySecret for authentication with private registries. + type: string type: object imageRegistrySecret: description: secrets used for pull/push images from/to private @@ -619,6 +630,12 @@ spec: to perform reboot on worker nodes type: boolean type: object + useSourceImage: + description: |- + NOTE: currently only for OpenShift cluster + set to true to use source image to build driver image on the fly + otherwise use installer debian/rpm packages from radeon repo to build driver image + type: boolean version: description: |- version of the drivers source code, can be used as part of image of dockerfile source image diff --git a/config/crd/bases/amd.com_remediationworkflowstatuses.yaml b/config/crd/bases/amd.com_remediationworkflowstatuses.yaml new file mode 100644 index 000000000..651af2d92 --- /dev/null +++ b/config/crd/bases/amd.com_remediationworkflowstatuses.yaml @@ -0,0 +1,59 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.0 + name: remediationworkflowstatuses.amd.com +spec: + group: amd.com + names: + kind: RemediationWorkflowStatus + 
listKind: RemediationWorkflowStatusList + plural: remediationworkflowstatuses + shortNames: + - rwfstatus + singular: remediationworkflowstatus + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: RemediationWorkflowStatus keeps a record of recent remediation + workflow runs. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + status: + additionalProperties: + additionalProperties: + items: + properties: + name: + type: string + startTime: + type: string + type: object + type: array + type: object + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index c12a78495..f48702e60 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -6,6 +6,7 @@ kind: Kustomization # It should be run by config/default resources: - bases/amd.com_deviceconfigs.yaml +- bases/amd.com_remediationworkflowstatuses.yaml #+kubebuilder:scaffold:crdkustomizeresource patches: diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index 77e06d4a2..a189e3643 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ 
b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -274,6 +274,23 @@ spec: path: driver.imageBuild x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:imageBuild + - description: 'image registry to fetch base image for building driver image, + default value is docker.io, the builder will search for corresponding OS + base image from given registry e.g. if your worker node is using Ubuntu + 22.04, by default the base image would be docker.io/ubuntu:22.04 Use spec.driver.imageRegistrySecret + for authentication with private registries. NOTE: this field won''t apply + for OpenShift since OpenShift is using its own DriverToolKit image to build + driver image' + displayName: BaseImageRegistry + path: driver.imageBuild.baseImageRegistry + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:baseImageRegistry + - description: TLS settings for fetching base image this field will be applied + to SourceImageRepo as well + displayName: BaseImageRegistryTLS + path: driver.imageBuild.baseImageRegistryTLS + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:baseImageRegistryTLS - description: If true, check if the container image already exists using plain HTTP. displayName: Insecure @@ -285,6 +302,16 @@ spec: path: driver.imageBuild.baseImageRegistryTLS.insecureSkipTLSVerify x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:insecureSkipTLSVerify + - description: 'SourceImageRepo specifies the image repository for the driver + source code (OpenShift only). Used when spec.driver.useSourceImage is true. + The operator automatically determines the image tag based on cluster RHEL + version and spec.driver.version (format: coreos--). + Default: docker.io/rocm/amdgpu-driver Use spec.driver.imageRegistrySecret + for authentication with private registries.' 
+ displayName: SourceImageRepo + path: driver.imageBuild.sourceImageRepo + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:sourceImageRepo - description: secrets used for pull/push images from/to private registry specified in driversImage displayName: ImageRegistrySecret @@ -400,6 +427,13 @@ spec: path: driver.upgradePolicy.rebootRequired x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:rebootRequired + - description: 'NOTE: currently only for OpenShift cluster set to true to use + source image to build driver image on the fly otherwise use installer debian/rpm + packages from radeon repo to build driver image' + displayName: UseSourceImage + path: driver.useSourceImage + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:useSourceImage - description: 'version of the drivers source code, can be used as part of image of dockerfile source image default value for different OS is: ubuntu: 6.1.3, coreOS: 6.2.2' diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 9d08bf39f..2d0b992aa 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -77,6 +77,7 @@ rules: - amd.com resources: - deviceconfigs + - remediationworkflowstatuses verbs: - create - get @@ -88,12 +89,14 @@ rules: - amd.com resources: - deviceconfigs/finalizers + - remediationworkflowstatuses/finalizers verbs: - update - apiGroups: - amd.com resources: - deviceconfigs/status + - remediationworkflowstatuses/status verbs: - get - patch diff --git a/docs/dcm/applying-partition-profiles.rst b/docs/dcm/applying-partition-profiles.rst index 04a4147c5..16f84cce9 100644 --- a/docs/dcm/applying-partition-profiles.rst +++ b/docs/dcm/applying-partition-profiles.rst @@ -19,8 +19,8 @@ GPU Partitioning Workflow Setting GPU Partitioning ------------------------- -1. Add tolerations to all deployments in kube-system namespace -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +1. 
Add tolerations to all deployments and daemonsets in kube-system namespace +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Since tainting a node will bring down all pods/daemonsets, we need to add toleration to the Kubernetes system pods to prevent them from getting evicted. Pods in the system namespace are responsible for things like DNS, networking, proxy and the overall proper functioning of your node. @@ -34,6 +34,16 @@ Here we are patching all the deployments in the `kube-system` namespace with the kubectl get deployments -n kube-system -o json | jq -r '.items[] | .metadata.name' | xargs -I {} kubectl patch deployment {} -n kube-system --type='json' -p='[{"op": "add", "path": "/spec/template/spec/tolerations", "value": [{"key": "amd-dcm", "operator": "Equal", "value": "up", "effect": "NoExecute"}]}]' +We also need to patch all the daemonsets in the `kube-system` namespace to prevent CNI (e.g., Cilium) malfunction: + +.. tab-set:: + + .. tab-item:: Kubernetes + + .. code-block:: bash + + kubectl get daemonsets -n kube-system -o json | jq -r '.items[] | .metadata.name' | xargs -I {} kubectl patch daemonsets {} -n kube-system --type='json' -p='[{"op": "add", "path": "/spec/template/spec/tolerations", "value": [{"key": "amd-dcm", "operator": "Equal", "value": "up", "effect": "NoExecute"}]}]' + .. .. tab-item:: OpenShift .. .. code-block:: bash @@ -56,7 +66,37 @@ The above command is convenient as it adds the required tolerations all with a s 2. Create DCM Profile ConfigMap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Next you will need to create the Device Config Mangaer ConfigMap that specifies the different partitioning profiles you would like to set. Refer to the [Device Config Mangaer ConfigMap](../dcm/device-config-manager-configmap.html#configmap) page for more details on how to create the DCM ConfigMap. 
+Next you will need to create the Device Config Manager ConfigMap that specifies the different partitioning profiles you would like to set. Refer to the [Device Config Manager ConfigMap](../dcm/device-config-manager-configmap.html#configmap) page for more details on how to create the DCM ConfigMap. + +Before creating your partition profiles, ensure you use the correct compute and memory partition combinations for your GPU model. For detailed information on supported partition profiles by GPU model, refer to the `AMD GPU Partitioning documentation `_. + +**Checking Supported Partitions on Your System** + +You can verify the supported compute and memory partition modes directly on your GPU node by checking the sysfs files. SSH into your node and run the following commands: + +.. code-block:: bash + + # Check available compute partitions (e.g., SPX, DPX, QPX, CPX) + cat /sys/module/amdgpu/drivers/pci\:amdgpu/<BDF>/available_compute_partition + + # Check available memory partitions (e.g., NPS1, NPS2, NPS4, NPS8) + cat /sys/module/amdgpu/drivers/pci\:amdgpu/<BDF>/available_memory_partition + +Replace ``<BDF>`` with your GPU's PCI bus/device/function identifier (e.g., ``0000:87:00.0``). You can find the available BDFs by listing the directory contents: + +.. code-block:: bash + + ls /sys/module/amdgpu/drivers/pci\:amdgpu/ + +Example output: + +.. code-block:: bash + + $ cat /sys/module/amdgpu/drivers/pci\:amdgpu/0000\:87\:00.0/available_compute_partition + SPX, DPX, QPX, CPX + + $ cat /sys/module/amdgpu/drivers/pci\:amdgpu/0000\:87\:00.0/available_memory_partition + NPS1, NPS4, NPS8 Below is an example of how to create the `config-manager-config.yaml` file that has the following 2 profiles: @@ -124,6 +164,35 @@ Now apply the DCM ConfigMap to your cluster .. oc apply -f config-manager-config.yaml +After creating the ConfigMap, you need to associate it with the Device Config Manager by updating the DeviceConfig Custom Resource (CR): + +.. 
code-block:: yaml + + configManager: + # To enable/disable the config manager, enable to partition + enable: True + + # image for the device-config-manager container + image: "rocm/device-config-manager:v1.4.0" + + # image pull policy for config manager. Accepted values are Always, IfNotPresent, Never + imagePullPolicy: IfNotPresent + + # specify configmap name which stores profile config info + config: + name: "config-manager-config" + + # OPTIONAL + # toleration field for dcm pod to bypass nodes with specific taints + configManagerTolerations: + - key: "key1" + operator: "Equal" + value: "value1" + effect: "NoExecute" + +.. note:: + The ConfigMap name is of type ``string``. Ensure you change the ``spec/configManager/config/name`` to match the name of the config map you created (in this example, ``config-manager-config``). The Device-Config-Manager pod needs a ConfigMap to be present or else the pod does not come up. + 3. Add Taint to node ~~~~~~~~~~~~~~~~~~~~ @@ -181,11 +250,23 @@ You can also confirm that the label got applied by checking the node: 5. Verify GPU partitioning ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Connect to the node in your cluster via SSH and run amd-smi to confirm you now see the new partitions: +Use kubectl exec to run amd-smi inside the Device Config Manager pod to confirm you now see the new partitions: -.. code-block:: bash +.. tab-set:: - amd-smi list + .. tab-item:: Kubernetes + + .. code-block:: bash + + kubectl exec -n kube-amd-gpu -it [dcm-pod-name] -- amd-smi list + +.. .. tab-item:: OpenShift + +.. .. code-block:: bash + +.. oc exec -n kube-amd-gpu -it [dcm-pod-name] -- amd-smi list + +Replace ``[dcm-pod-name]`` with the actual name of your Device Config Manager pod (e.g., ``gpu-operator-device-config-manager-hn9rb``). 6. 
Remove Taint from the node ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -209,6 +290,8 @@ Remove the taint from the node to restart all previous workloads and allow the n Reverting back to SPX (no partitions) ------------------------------------- +To revert a node back to SPX mode (no partitions), apply the ``spx-profile`` label to the node: + .. tab-set:: .. tab-item:: Kubernetes @@ -224,7 +307,9 @@ Reverting back to SPX (no partitions) .. oc label node [nodename] dcm.amd.com/gpu-config-profile=spx-profile --overwrite Removing Partition Profile label --------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To completely remove the partition profile label from a node: .. tab-set:: @@ -241,7 +326,9 @@ Removing Partition Profile label .. oc label node [nodename] dcm.amd.com/gpu-config-profile- Removing DCM tolerations from all daemonsets in kube-system namespace ---------------------------------------------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +After completing partitioning operations, you can remove the DCM tolerations that were added to the kube-system namespace: .. tab-set:: diff --git a/docs/drivers/installation.md b/docs/drivers/installation.md index 764b1af05..01f3f6496 100644 --- a/docs/drivers/installation.md +++ b/docs/drivers/installation.md @@ -106,6 +106,8 @@ spec: driver: # enable operator to install out-of-tree amdgpu kernel module enable: true + # Specify the driver version by using ROCm version + version: "7.0" # blacklist is required for installing out-of-tree amdgpu kernel module # Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. 
# Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olmhtml#create-blacklist-for-installing-out-of-tree-kernel-module @@ -121,8 +123,24 @@ spec: # Make sure you created the secret within the namespace that KMM operator is running imageRegistrySecret: name: mysecret - # Specify the driver version by using ROCm version - version: "7.0" + # (Optional) Currently only for OpenShift cluster, set to true to use source code image to build driver within the cluster + # default is false and operator will use debian or rpm package from radeon repo to install driver + useSourceImage: false + # (Optional) configure the driver image build within the cluster + imageBuild: + # configure the registry to search for base image for building driver + # e.g. if you are using worker node with ubuntu 22.04 and baseImageRegistry is docker.io + # image builder will use docker.io/ubuntu:22.04 as base image + baseImageRegistry: docker.io + # sourceImageRepo: specify the amdgpu source code image repo for building driver + # the Operator will decide the image tag based on user provided driver version and system OS version + # e.g. if you input docker.io/rocm/amdgpu-driver the image tag will be coreos-- + # NOTE: currently only work for OpenShift cluster + # NOTE: will be used when spec.driver.useSourceImage is true + sourceImageRepo: docker.io/rocm/amdgpu-driver + baseImageRegistryTLS: + insecure: False # If True, check for the container image using plain HTTP + insecureSkipTLSVerify: False # If True, skip any TLS server certificate validation (useful for self-signed certificates) devicePlugin: devicePluginImage: rocm/k8s-device-plugin:latest @@ -141,6 +159,102 @@ spec: feature.node.kubernetes.io/amd-gpu: "true" ``` +```{note} +As for the configuration in `spec.driver.imageBuild`: +1. 
If the base OS image or source image is hosted in a registry that requires pull secrets to pull those images, you need to use `spec.driver.imageRegistrySecret` to inject the pull secret. +2. `spec.driver.imageRegistrySecret` was originally designed for providing secret to pull/push image to the repository specified in `spec.driver.image`, if unfortunately the base image and source image requires different secret to pull, please combine the access information into one single Kubernetes secret. + + ```bash + REGISTRY1=https://index.docker.io/v1/ + USER1=my-username-1 + PWD1=my-password-1 + REGISTRY2=another-registry.io:5000 + USER2=my-username-2 + PWD2=my-password-2 + cat > config.json </dev/null) + [[ -n "$BUILDER_TOKEN" ]] && break + sleep 1 + done + [ -z "$BUILDER_TOKEN" ] && { echo "❌ token not ready"; exit 1; } + + # 4. generate combined docker config + cat > config.json <:$(echo $BUILDER_TOKEN | base64 -d)" | base64 -w0)" + }, + "image-registry.openshift-image-registry.svc.cluster.local:5000": { + "auth": "$(echo -n ":$(echo $BUILDER_TOKEN | base64 -d)" | base64 -w0)" + }, + "${REGISTRY1}": { + "auth": "$(echo -n "${USER1}:${PWD1}" | base64 -w0)" + } + } + } + EOF + + # 5. create kubernetes secret + oc delete secret "${SECRET_NAME}" -n "$NS" --ignore-not-found + oc create secret generic "${SECRET_NAME}" \ + -n "$NS" \ + --type=kubernetes.io/dockerconfigjson \ + --from-file=.dockerconfigjson=config.json + + echo "✅ Secret '${SECRET_NAME}' created and ready." + ``` + +``` + + #### Configuration Reference To list existing `DeviceConfig` resources run `kubectl get deviceconfigs -A` @@ -168,6 +282,10 @@ To check the full spec of `DeviceConfig` definition run `kubectl get crds device | `imageSign.keySecret` | secret name of the private key
used to sign kernel modules after image building in cluster
see [secure boot](./secure-boot) doc for instructions to create the secret | | | `imageSign.certSecret` | secret name of the public key
used to sign kernel modules after image building in cluster
see [secure boot](./secure-boot) doc for instructions to create the secret | | | `tolerations` | List of tolerations that will be set for KMM module object and its components like build pod and worker pod | | +| `imageBuild.baseImageRegistry` | registry to host base OS image, e.g. when using Ubuntu 22.04 worker node with specified baseImageRegistry `docker.io` the operator will use base image from `docker.io/ubuntu:22.04` | `docker.io` | +| `imageBuild.baseImageRegistryTLS.insecure` | If true, check if the container image
already exists using plain HTTP | `false` | +| `imageBuild.baseImageRegistryTLS.insecureSkipTLSVerify` | If true, skip any TLS server certificate validation | `false` | +| `imageBuild.sourceImageRepo` | (Currently only applied to OpenShift) Image repository that hosts the amdgpu source code image. The operator automatically determines the image tag based on the user's system and `spec.driver.version`. E.g. for building driver from ROCm 7.0 + RHEL 9.6 + default source image repo, the image would be `docker.io/rocm/amdgpu-driver:coreos-9.6-7.0` | `docker.io/rocm/amdgpu-driver` | #### `spec.devicePlugin` Parameters diff --git a/docs/fulldeviceconfig.rst b/docs/fulldeviceconfig.rst index fc85f6fed..87576f112 100644 --- a/docs/fulldeviceconfig.rst +++ b/docs/fulldeviceconfig.rst @@ -69,12 +69,21 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM name: image-sign-private-key-secret certSecret: name: image-sign-public-key-secret + # (Optional) Currently only for OpenShift cluster, set to true to use source code image to build driver within the cluster + # default is false and operator will use debian or rpm package from radeon repo to install driver + useSourceImage: false # (Optional) configure the driver image build within the cluster imageBuild: # configure the registry to search for base image for building driver # e.g. if you are using worker node with ubuntu 22.04 and baseImageRegistry is docker.io # image builder will use docker.io/ubuntu:22.04 as base image baseImageRegistry: docker.io + # sourceImageRepo: specify the amdgpu source code image repo for building driver + # the Operator will decide the image tag based on user provided driver version and system OS version + # e.g. 
if you input docker.io/rocm/amdgpu-driver the image tag will be coreos-<RHEL version>-<driver version> + # NOTE: currently only works for OpenShift cluster + # NOTE: will be used when spec.driver.useSourceImage is true + sourceImageRepo: docker.io/rocm/amdgpu-driver baseImageRegistryTLS: insecure: False # If True, check for the container image using plain HTTP insecureSkipTLSVerify: False # If True, skip any TLS server certificate validation (useful for self-signed certificates) diff --git a/docs/installation/kubernetes-helm.md b/docs/installation/kubernetes-helm.md index f455c71fa..8a0d59c20 100644 --- a/docs/installation/kubernetes-helm.md +++ b/docs/installation/kubernetes-helm.md @@ -125,7 +125,8 @@ helm install amd-gpu-operator rocm/gpu-operator-charts \ ```{note} Installation Options - Skip NFD installation: `--set node-feature-discovery.enabled=false` - - Skip KMM installation: `--set kmm.enabled=false`.
Although KMM is a [Kubernetes-SIGs](https://github.com/kubernetes-sigs) maintained project, it is strongly recommended to use AMD optimized and published KMM images included in each operator release. + - Skip KMM installation: `--set kmm.enabled=false`
Although KMM is a [Kubernetes-SIGs](https://github.com/kubernetes-sigs) maintained project, it is strongly recommended to use AMD optimized and published KMM images included in each operator release. + - Skip Auto Node Remediation: `--set remediation.enabled=false` - Disable default DeviceConfig installation: `--set crds.defaultCR.install=false` ``` diff --git a/docs/releasenotes.md b/docs/releasenotes.md index d2b6898d2..05e88b34c 100644 --- a/docs/releasenotes.md +++ b/docs/releasenotes.md @@ -2,19 +2,27 @@ ## GPU Operator v1.4.1 Release Notes -The AMD GPU Operator v1.4.1 release extends platform support to OpenShift v4.20 +The AMD GPU Operator v1.4.1 release extends platform support to OpenShift v4.20 and introduces the ability to build `amdgpu` kernel modules directly within air-gapped clusters. ### Release Highlights -- **Device-Metrics-Exporter enhancements** +- **OpenShift Platform Support Enhancements** + - **Build Driver Images Directly within Disconnected OpenShift Clusters** + - Starting from v1.4.1, the AMD GPU Operator supports building driver kernel modules directly within disconnected OpenShift clusters. + - For Red Hat Enterprise Linux CoreOS (used by OpenShift), OpenShift will download source code and firmware from AMD provided [amdgpu-driver images](https://hub.docker.com/r/rocm/amdgpu-driver) into their [DriverToolKit](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/specialized_hardware_and_driver_enablement/driver-toolkit) and directly build the kernel modules from source code without dependency on lots of RPM packages. + - **Cluster Monitoring Enablement** + - The v1.4.1 AMD GPU Operator automatically creates the RBAC resources required by the OpenShift [Cluster Monitoring stack](https://rhobs-handbook.netlify.app/products/openshiftmonitoring/collecting_metrics.md/#configuring-prometheus-to-scrape-metrics). 
This removes one manual configuration step when setting up the OpenShift monitoring stack to scrape metrics from the device metrics exporter. +- **Device-Metrics-Exporter Enhancements** - **Enhanced Pod and Service Annotations** - - **Pod Annotations**, **Service Annotations** : Custom annotations can now be applied to exporter pods via the DeviceConfig CRD -- **Test Runner enhancements** + - Custom annotations can now be applied to exporter pods and services via the DeviceConfig CRD, providing greater flexibility in metadata management. +- **Test Runner Enhancements** - **Enhanced Test Result Events** - - Test runner Kubernetes events now include additional information: pod UID and test framework name (e.g., RVS, AGFHC) as event labels, providing more comprehensive test run information for improved tracking and diagnostics. + - Test runner Kubernetes events now include additional information such as pod UID and test framework name (e.g., RVS, AGFHC) as event labels, providing more comprehensive test run information for improved tracking and diagnostics. ### Fixes - 1. **Node Feature Discovery rule fix** - * Fix the PCI device ID for the Virtual Function (VF) of these GPU: MI308X and MI300X-HF + 1. **Node Feature Discovery Rule Fix** + * Fixed the PCI device ID for the Virtual Function (VF) of MI308X and MI300X-HF GPUs + 2. **Helm Chart Default DeviceConfig Fix** + * Fixed an issue where the Helm chart could not render the metrics exporter's pod resource API socket path in the default DeviceConfig when specified via `values.yaml` or the `--set` option. 
## GPU Operator v1.4.0 Release Notes diff --git a/docs/specialized_networks/airgapped-install-openshift.md b/docs/specialized_networks/airgapped-install-openshift.md index 98f41f868..b0864e6c6 100644 --- a/docs/specialized_networks/airgapped-install-openshift.md +++ b/docs/specialized_networks/airgapped-install-openshift.md @@ -1,27 +1,28 @@ -# Air-gapped Installation Guide for Openshift Environments +# Air-gapped Installation Guide for OpenShift Environments -This guide explains how to install the AMD GPU Operator in an air-gapped environment where the Openshift cluster has no external network connectivity. +This guide explains how to install the AMD GPU Operator in an air-gapped environment where the OpenShift cluster has no external network connectivity. ## Prerequisites 1. OpenShift 4.16+ -2. Assume users have followed [OpenShift Official Documentation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.19/html/disconnected_environments/mirroring-in-disconnected-environments) to install the air-gapped cluster and setup a Mirror Registry in Air-gapped environment. +2. Users should have followed the [OpenShift Official Documentation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.19/html/disconnected_environments/mirroring-in-disconnected-environments) to install the air-gapped cluster and set up a Mirror Registry. ![Air-gapped Installation Diagram](../_static/ocp_airgapped.png) ```{Note} - * In general users action item is only to provide the `ImageSetConfiguration` to configure the operator catalogs and images for mirroring the artifacts in the mirror container registry. - * Users may need to take extra step to manually copy mirrored artifacts to air-gapped system, in case the jump host is not allowed to directly push image to the mirror container registry. 
- * Most of the steps described in the graph above is automatically completed by the `oc-mirror` and other RedHat provided tool, which can be downloaded from [OpenShift official website](https://console.redhat.com/openshift/downloads). + * Users only need to provide the `ImageSetConfiguration` to configure operator catalogs and images for mirroring artifacts into the mirror container registry. + * Users may need to manually copy mirrored artifacts to the air-gapped system if the jump host cannot directly push images to the mirror container registry. + * Most steps in the diagram above are automatically completed by `oc-mirror` and other Red Hat provided tools, which can be downloaded from the [OpenShift official website](https://console.redhat.com/openshift/downloads). ``` -Here is an example of AMD GPU Operator required `ImageSetConfiguration` for users to mirror required catalogs and images into their mirror registry. +Here is an example `ImageSetConfiguration` for mirroring the required catalogs and images for the AMD GPU Operator into your mirror registry. ```{Warning} -1. The following `ImageSetConfiguration` file is just an example and it is incomplete. -2. Users need to configure the `storageConfig` part, for directly pushing artifacts to the mirror container registry or saving into local file storage. -3. Users may merge the `mirror` part of this example file with their own `ImageSetConfiguration`. -4. The detailed explanation of `ImageSetConfiguration` can be found from [OpenShift official documentation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.19/html/disconnected_environments/mirroring-in-disconnected-environments#using-oc-mirror_about-installing-oc-mirror-v2). +1. The following `ImageSetConfiguration` file is an incomplete example. +2. Users must configure the `storageConfig` section for either directly pushing artifacts to the mirror container registry or saving to local file storage. +3. 
Users may merge the `mirror` section of this example with their own `ImageSetConfiguration`. +4. Detailed explanation of `ImageSetConfiguration` can be found in the [OpenShift official documentation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.19/html/disconnected_environments/mirroring-in-disconnected-environments#using-oc-mirror_about-installing-oc-mirror-v2). +5. When mirroring the source image `docker.io/rocm/amdgpu-driver`, we strongly recommend making it accessible without requiring image pull secrets. If image pull secrets are required for pulling the source image, refer to the notes in [Driver Installation](../drivers/installation.md#install-out-of-tree-amd-gpu-drivers-with-operator) to configure the pull secret. ``` ```yaml @@ -46,7 +47,7 @@ mirror: type: ocp operators: - - catalog: registry.redhat.io/redhat/redhat-operator-index:v4.19 + - catalog: registry.redhat.io/redhat/redhat-operator-index:v4.19 # adjust the OpenShift version if needed packages: # Node Feature Discovery (NFD) - name: nfd @@ -64,15 +65,16 @@ mirror: # To get full list of released version # Either go to OperatorHub # Or check https://github.com/redhat-openshift-ecosystem/certified-operators/tree/main/operators/amd-gpu-operator - - catalog: registry.redhat.io/redhat/certified-operator-index:v4.19 + - catalog: registry.redhat.io/redhat/certified-operator-index:v4.19 # adjust the OpenShift version if needed packages: - name: amd-gpu-operator minVersion: "1.3.2" # adjust the version if needed channels: - name: alpha - # adjust the image tag if needed + # adjust the image tags if needed additionalImages: - name: registry.redhat.io/ubi9/ubi:latest + - name: registry.redhat.io/ubi9/ubi-minimal:latest - name: docker.io/rocm/gpu-operator:v1.3.1 - name: docker.io/rocm/gpu-operator-utils:v1.3.1 - name: docker.io/library/busybox:1.36 @@ -82,19 +84,34 @@ mirror: - name: docker.io/rocm/rocm-terminal:latest - name: docker.io/rocm/k8s-device-plugin:latest - name: 
docker.io/rocm/k8s-node-labeller:latest + # adjust RHEL version and ROCm version if needed for source image + # image tag format for CoreOS is coreos-- + - name: docker.io/rocm/amdgpu-driver:coreos-9.6-7.0.2 helm: {} ``` -3. After mirroring setup, assume users installed NFD, KMM and enabled internal image registry in air-gapped cluster, see [OpenShift OLM Installation](../installation/openshift-olm.md##configure-rnternal-registry) for details. +3. After mirroring setup, users should have installed NFD and KMM, and enabled the internal image registry in the air-gapped cluster. See [OpenShift OLM Installation](../installation/openshift-olm.md#configure-internal-registry) for details. -4. Users installed AMD GPU Operator in Air-gapped cluster without creating DeviceConfig. +4. Users should have installed the AMD GPU Operator in the air-gapped cluster without creating a `DeviceConfig`. ## Installation Steps -### 1. Build precompiled driver image +```{Note} +Starting from AMD GPU Operator v1.4.1, building the amdgpu driver directly within the disconnected cluster is supported. + +**Option 1:** Follow steps 1 and 2 to prepare a pre-compiled driver image in a connected environment, then import the pre-compiled image into the disconnected cluster. + +**Option 2:** Skip preparing a pre-compiled driver image and go directly to step 3, ensuring that: + +* The source image `docker.io/rocm/amdgpu-driver` is properly mirrored into your mirror image registry in the disconnected environment. + +* The `oc-mirror` generated `ImageDigestMirrorSet` and `ImageTagMirrorSet` are applied. +``` -Please build the pre-compiled driver image in the build cluster that has Internet access by following [Preparing Pre-compiled Driver Images](../drivers/precompiled-driver.md) and follow the steps for OpenShift section. +### 1. 
Build Precompiled Driver Image + +Build the pre-compiled driver image in a build cluster that has internet access by following [Preparing Pre-compiled Driver Images](../drivers/precompiled-driver.md) and the steps for OpenShift. After successfully pushing the driver image, save it by running: @@ -112,38 +129,40 @@ podman pull registry.example.com/amdgpu_kmod:coreos-9.6-5.14.0-570.45.1.el9_6.x8 podman save registry.example.com/amdgpu_kmod:coreos-9.6-5.14.0-570.45.1.el9_6.x86_64-7.0 -o driver-image.tar ``` -### 2. Import pre-compiled driver image +### 2. Import Pre-compiled Driver Image A. Import images ```{Note} -1. This step is for using the pre-compiled driver image within the cOpenShift internal registry (this is the OpenShift built-in image registry, not the mirror registry for Air-gapped installation). -2. For users who already push the pre-compiled driver image to other registry, they don't have to manually load it in internal registry, just skip to step 3 to specify the image URL in `spec.driver.image`. +1. This step is for using the pre-compiled driver image within the OpenShift internal registry (the OpenShift built-in image registry, not the mirror registry for air-gapped installation). +2. Users who have already pushed the pre-compiled driver image to another registry don't need to manually load it into the internal registry. Skip to step 3 and specify the image URL in `spec.driver.image`. 
``` * Import pre-compiled driver image -After copying the image files to the air-gapped cluster, please switch to the air-gapped cluster and use podman to load the image, re-tag if needed then push the image to desired image registry: +After copying the image files to the air-gapped cluster, switch to the air-gapped cluster and use podman to load the image, re-tag if needed, then push the image to the desired image registry: * Load the image file: `podman load -i driver-image.tar` - * Re-tag if needed `podman tag `, remember to tag the image to the gpu operator's namespace, e.g. if you are using gpu operator in `openshift-amd-gpu`, please tag the image to`image-registry.openshift-image-registry.svc:5000/openshift-amd-gpu/amdgpu_kmod`. - * Use podman to login to the image registry if needed, for OpenShift internal registry: + * Re-tag if needed: `podman tag `. Remember to tag the image to the GPU operator's namespace. For example, if using the GPU operator in `openshift-amd-gpu`, tag the image to `image-registry.openshift-image-registry.svc:5000/openshift-amd-gpu/amdgpu_kmod`. + * Use podman to log in to the image registry if needed. For OpenShift internal registry: ```bash podman login -u builder -p $(oc create token builder) image-registry.openshift-image-registry.svc:5000 ``` * Push the image: `podman push ` -B. Once imported, verify that the required images are located in the internal registry. +B. Verify that the required images are located in the internal registry. -For example, if you are using internal registry: +For example, if using the internal registry: ```bash $ oc get is -n openshift-amd-gpu NAME IMAGE REPOSITORY TAGS UPDATED amdgpu_kmod image-registry.openshift-image-registry.svc:5000/openshift-amd-gpu/amdgpu_kmod coreos-9.6-5.14.0-570.19.1.el9_6.x86_64-6.4.1 3 days ago ``` -### 3. Deployment of DeviceConfig in air-gapped environment +### 3. Deployment of DeviceConfig in Air-gapped Environment + +A. 
If pre-compiled driver images are present, the operator will directly pull and use the pre-compiled driver image. +B. If pre-compiled driver images are not present, the operator will build the kernel module based on the mirrored source image, which was previously mirrored from `docker.io/rocm/amdgpu-driver`. -A. Once all the required images and the precompiled driver are present in the internal registry we can now deploy the modified DeviceConfig. Note: the image variables are pointing to the internal registry instead the external ROCm repository. ```yaml apiVersion: amd.com/v1alpha1 kind: DeviceConfig @@ -152,10 +171,13 @@ metadata: namespace: openshift-amd-gpu spec: driver: - # 1. specify image here if you are NOT using OpenShift internal registry - # 2. specify the image without tag + # Specify image repo here if NOT using OpenShift internal registry without image tag + # Default value for OpenShift is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod #image: registry.example.com/amdgpu_kmod enable: true + # For OpenShift, set useSourceImage to true + # to enable building driver from source code image in air-gapped environment + useSourceImage: true version: "7.0" devicePlugin: enableNodeLabeller: true @@ -164,3 +186,24 @@ spec: selector: feature.node.kubernetes.io/amd-gpu: "true" ``` + +### 4. Verify the Deployment of DeviceConfig + +After successfully building the driver image and loading the `amdgpu` kernel modules, the metrics exporter and device plugin pods should be running in the same namespace as the `DeviceConfig`. + +```bash +$ oc get pods -n openshift-amd-gpu +NAME READY STATUS RESTARTS AGE +amd-gpu-operator-controller-manager-d7654f88d-4x4tx 1/1 Running 0 6h45m +test-cr-device-plugin-5w7dp 1/1 Running 0 2m +test-cr-metrics-exporter-5xwwr 1/1 Running 0 2m +``` + +If the driver loaded correctly without any issues, the device plugin will start advertising resources. 
If the device plugin is running but the node still shows zero `amd.com/gpu`, check the device plugin pod logs or `dmesg` on the node to detect potential failures. + +```bash +$ oc get node -oyaml | grep amd.com + gpu.operator.amd.com/openshift-amd-gpu.test-cr.driver: container + amd.com/gpu: "1" + amd.com/gpu: "1" +``` diff --git a/docs/test/auto-unhealthy-device-test.md b/docs/test/auto-unhealthy-device-test.md index c080ca4ef..da23a2fb4 100644 --- a/docs/test/auto-unhealthy-device-test.md +++ b/docs/test/auto-unhealthy-device-test.md @@ -228,7 +228,7 @@ data: "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600, + "TimeoutSeconds": 1200, "Arguments": "--parallel" } ] @@ -243,7 +243,7 @@ data: "Recipe": "mem", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600, + "TimeoutSeconds": 1200, "Arguments": "--parallel" } ] diff --git a/docs/test/manual-test.md b/docs/test/manual-test.md index f68d2dbc3..03c7d1c6b 100644 --- a/docs/test/manual-test.md +++ b/docs/test/manual-test.md @@ -74,7 +74,8 @@ data: "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ] } @@ -217,7 +218,8 @@ data: "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ] } @@ -374,7 +376,8 @@ data: "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ] } @@ -557,7 +560,8 @@ data: "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ] } @@ -571,7 +575,8 @@ data: "Recipe": "babel", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ] } @@ -719,7 +724,8 @@ data: "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - 
"TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ] } @@ -733,7 +739,8 @@ data: "Recipe": "babel", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ] } diff --git a/docs/test/pre-start-job-test.md b/docs/test/pre-start-job-test.md index faad76b9a..4326d3a04 100644 --- a/docs/test/pre-start-job-test.md +++ b/docs/test/pre-start-job-test.md @@ -79,7 +79,7 @@ data: "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600, + "TimeoutSeconds": 1200, "Arguments": "--parallel" } ] diff --git a/example/deviceconfig_example.yaml b/example/deviceconfig_example.yaml index afe73888b..0c5a69089 100644 --- a/example/deviceconfig_example.yaml +++ b/example/deviceconfig_example.yaml @@ -36,6 +36,21 @@ spec: # insecure: true # insecureSkipTLSVerify: true + ## (Optional) configure the driver image build within the cluster + #imageBuild: + # # configure the registry to search for base image for building driver + # # e.g. if you are using worker node with ubuntu 22.04 and baseImageRegistry is docker.io + # # image builder will use docker.io/ubuntu:22.04 as base image + # baseImageRegistry: docker.io + # # sourceImageRepo: specify the amdgpu source code image repo for building driver + # # the Operator will decide the image tag based on user provided driver version and system OS version + # # e.g. 
if you input docker.io/rocm/amdgpu-driver the image tag will be coreos-- + # # NOTE: currently only work for OpenShift cluster + # sourceImageRepo: docker.io/rocm/amdgpu-driver + # baseImageRegistryTLS: + # insecure: False # If True, check for the container image using plain HTTP + # insecureSkipTLSVerify: False # If True, skip any TLS server certificate validation (useful for self-signed certificates) + # Specify the image signing config for building + signing image within cluster #imageSign: # keySecret: @@ -86,7 +101,7 @@ spec: nodePort: 32500 # exporter image - image: docker.io/rocm/device-metrics-exporter:v1.4.0 + image: docker.io/rocm/device-metrics-exporter:v1.4.1 # image pull policy for metrics exporter # default value is IfNotPresent for valid tags, Always for no tag or "latest" tag imagePullPolicy: "IfNotPresent" @@ -101,7 +116,7 @@ spec: enable: True # testrunner image - image: docker.io/rocm/test-runner:v1.4.0 + image: docker.io/rocm/test-runner:v1.4.1 # image pull policy for the testrunner # default value is IfNotPresent for valid tags, Always for no tag or "latest" tag imagePullPolicy: "IfNotPresent" @@ -123,7 +138,7 @@ spec: # To enable/disable the config manager, enable to partition enable: True # image for the device-config-manager container - image: rocm/device-config-manager:v1.4.0 + image: rocm/device-config-manager:v1.4.1 # image pull policy for config manager set to always to pull image of latest version imagePullPolicy: IfNotPresent # specify configmap name which stores profile config info diff --git a/example/testrunner/config.json b/example/testrunner/config.json index a1c1b9914..84570eef2 100644 --- a/example/testrunner/config.json +++ b/example/testrunner/config.json @@ -10,7 +10,8 @@ "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ], "LogsExportConfig": [ diff --git a/example/testrunner/configmap.yaml b/example/testrunner/configmap.yaml index 
096f2234b..2a81f8600 100644 --- a/example/testrunner/configmap.yaml +++ b/example/testrunner/configmap.yaml @@ -17,7 +17,8 @@ data: "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ], "LogsExportConfig": [ diff --git a/example/testrunner/manual_test_job.yaml b/example/testrunner/manual_test_job.yaml index 5df6efee0..ec0688469 100644 --- a/example/testrunner/manual_test_job.yaml +++ b/example/testrunner/manual_test_job.yaml @@ -79,7 +79,8 @@ data: # file name within configmap should be config.json "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ] } diff --git a/example/testrunner/pre_start_job_check.yaml b/example/testrunner/pre_start_job_check.yaml index 888089418..82222bb8e 100644 --- a/example/testrunner/pre_start_job_check.yaml +++ b/example/testrunner/pre_start_job_check.yaml @@ -79,7 +79,8 @@ data: # file name within configmap should be config.json "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ] } @@ -93,7 +94,8 @@ data: # file name within configmap should be config.json "Recipe": "mem", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ], "LogsExportConfig": [ diff --git a/example/testrunner/schedule_test_cronjob.yaml b/example/testrunner/schedule_test_cronjob.yaml index c5a6de412..682a66966 100644 --- a/example/testrunner/schedule_test_cronjob.yaml +++ b/example/testrunner/schedule_test_cronjob.yaml @@ -79,7 +79,8 @@ data: # file name within configmap should be config.json "Recipe": "gst_single", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ] } @@ -93,7 +94,8 @@ data: # file name within configmap should be config.json "Recipe": 
"mem", "Iterations": 1, "StopOnFailure": true, - "TimeoutSeconds": 600 + "TimeoutSeconds": 1200, + "Arguments": "--parallel" } ], "LogsExportConfig": [ diff --git a/hack/k8s-patch/k8s-remediation-patch/metadata-patch/values.yaml b/hack/k8s-patch/k8s-remediation-patch/metadata-patch/values.yaml index 52383fc80..83339b213 100644 --- a/hack/k8s-patch/k8s-remediation-patch/metadata-patch/values.yaml +++ b/hack/k8s-patch/k8s-remediation-patch/metadata-patch/values.yaml @@ -1,2 +1,11 @@ controller: - image: "quay.io/argoproj/workflow-controller:v3.6.5" \ No newline at end of file + image: "quay.io/argoproj/workflow-controller:v3.6.5" + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + nodeSelector: {} \ No newline at end of file diff --git a/hack/k8s-patch/k8s-remediation-patch/template-patch/deployment.yaml b/hack/k8s-patch/k8s-remediation-patch/template-patch/deployment.yaml index d6637380a..a9bfc08a5 100644 --- a/hack/k8s-patch/k8s-remediation-patch/template-patch/deployment.yaml +++ b/hack/k8s-patch/k8s-remediation-patch/template-patch/deployment.yaml @@ -2988,6 +2988,11 @@ spec: labels: app: amd-gpu-operator-workflow-controller spec: + {{- with .Values.controller.affinity }} + affinity: + {{- toYaml . 
| nindent 8 }} + {{- end }} + nodeSelector: {{- toYaml .Values.controller.nodeSelector | nindent 8 }} containers: - args: [] command: @@ -3019,8 +3024,6 @@ spec: - ALL readOnlyRootFilesystem: true runAsNonRoot: true - nodeSelector: - kubernetes.io/os: linux priorityClassName: workflow-controller securityContext: runAsNonRoot: true diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml index dbaacc51a..985b72a64 100644 --- a/hack/k8s-patch/metadata-patch/values.yaml +++ b/hack/k8s-patch/metadata-patch/values.yaml @@ -230,6 +230,8 @@ deviceConfig: maxUnavailable: 1 # -- config manager tolerations configManagerTolerations: [] + remediationWorkflow: + enable: false # AMD GPU operator controller related configs controllerManager: manager: diff --git a/hack/k8s-patch/template-patch/argo-rbac.yaml b/hack/k8s-patch/template-patch/argo-rbac.yaml index d707b00da..c593dcd19 100644 --- a/hack/k8s-patch/template-patch/argo-rbac.yaml +++ b/hack/k8s-patch/template-patch/argo-rbac.yaml @@ -26,6 +26,14 @@ rules: - pods/log verbs: - '*' +- apiGroups: + - "" + resources: + - serviceaccounts + verbs: + - get + - list + - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/hack/k8s-patch/template-patch/default-deviceconfig.yaml b/hack/k8s-patch/template-patch/default-deviceconfig.yaml index a5cbaf7b3..3c40cff8a 100644 --- a/hack/k8s-patch/template-patch/default-deviceconfig.yaml +++ b/hack/k8s-patch/template-patch/default-deviceconfig.yaml @@ -1,4 +1,3 @@ - {{- if or (and .Release.IsInstall .Values.crds.defaultCR.install) (and .Release.IsUpgrade .Values.crds.defaultCR.upgrade) }} {{- if and (hasKey .Values "deviceConfig") (hasKey .Values.deviceConfig "spec") }} apiVersion: amd.com/v1alpha1 @@ -29,6 +28,14 @@ spec: driverType: {{ . }} {{- end }} + {{- if (hasKey . "useSourceImage") }} + useSourceImage: {{ .useSourceImage }} + {{- end }} + + {{- with .amdgpuInstallerRepoURL }} + amdgpuInstallerRepoURL: {{ . 
}} + {{- end }} + {{- with .vfioConfig }} vfioConfig: {{- with .deviceIDs }} @@ -221,6 +228,25 @@ spec: {{- toYaml . | nindent 6 }} {{- end }} + {{- with .podResourceAPISocketPath }} + podResourceAPISocketPath: {{ . }} + {{- end }} + + {{- with .resource }} + resource: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .podAnnotations }} + podAnnotations: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .serviceAnnotations }} + serviceAnnotations: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .rbacConfig }} rbacConfig: {{- if (hasKey . "enable") }} @@ -296,6 +322,10 @@ spec: {{- toYaml . | nindent 10 }} {{- end }} + {{- with .bearerTokenFile }} + bearerTokenFile: {{ . }} + {{- end }} + {{- with .tlsConfig }} tlsConfig: {{- toYaml . | nindent 10 }} @@ -389,5 +419,25 @@ spec: {{- end }} {{- end }} + {{- with .Values.deviceConfig.spec.remediationWorkflow }} + remediationWorkflow: + {{- if (hasKey . "enable") }} + enable: {{ .enable }} + {{- end }} + + {{- with .conditionalWorkflows }} + conditionalWorkflows: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .ttlForFailedWorkflows }} + ttlForFailedWorkflows: {{ . }} + {{- end }} + + {{- with .testerImage }} + testerImage: {{ . 
}} + {{- end }} + {{- end }} + {{- end }} {{- end }} diff --git a/hack/k8s-patch/template-patch/nic-nfd-default-rule.yaml b/hack/k8s-patch/template-patch/nic-nfd-default-rule.yaml index d9a2a6c87..da4251dd9 100644 --- a/hack/k8s-patch/template-patch/nic-nfd-default-rule.yaml +++ b/hack/k8s-patch/template-patch/nic-nfd-default-rule.yaml @@ -8,6 +8,20 @@ metadata: # source2: https://devicehunt.com/view/type/pci/vendor/1dd8 spec: rules: + - name: amd-vnic + labels: + feature.node.kubernetes.io/amd-vnic: "true" + matchAny: + - matchFeatures: + - feature: kernel.loadedmodule + matchExpressions: + ionic: {op: Exists} + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1dd8"]} # AMD Pensando Systems + device: {op: In, value: ["1003"]} # DSC Ethernet Controller VF + subsystem_vendor: {op: In, value: ["1dd8"]} + subsystem_device: {op: In, value: ["5201"]} # POLLARA-1Q400 100/200/400G 1-port Card - name: amd-nic labels: feature.node.kubernetes.io/amd-nic: "true" diff --git a/hack/k8s-patch/template-patch/post-delete-hook.yaml b/hack/k8s-patch/template-patch/post-delete-hook.yaml index ad54a95e6..da7c287d4 100644 --- a/hack/k8s-patch/template-patch/post-delete-hook.yaml +++ b/hack/k8s-patch/template-patch/post-delete-hook.yaml @@ -83,6 +83,9 @@ spec: if kubectl get crds deviceconfigs.amd.com > /dev/null 2>&1; then kubectl delete crds deviceconfigs.amd.com fi + if kubectl get crds remediationworkflowstatuses.amd.com > /dev/null 2>&1; then + kubectl delete crds remediationworkflowstatuses.amd.com + fi {{- if index .Values "node-feature-discovery" "enabled" }} if kubectl get crds nodefeaturegroups.nfd.k8s-sigs.io > /dev/null 2>&1; then kubectl delete crds nodefeaturegroups.nfd.k8s-sigs.io diff --git a/hack/k8s-patch/template-patch/pre-upgrade-hook.yaml b/hack/k8s-patch/template-patch/pre-upgrade-hook.yaml index 5a30e9da7..e93b101bd 100644 --- a/hack/k8s-patch/template-patch/pre-upgrade-hook.yaml +++ b/hack/k8s-patch/template-patch/pre-upgrade-hook.yaml @@ 
-224,6 +224,7 @@ spec: kubectl apply -f /opt/helm-charts-crds-k8s/module-crd.yaml kubectl apply -f /opt/helm-charts-crds-k8s/nodemodulesconfig-crd.yaml {{- end }} + kubectl apply -f /opt/helm-charts-crds-k8s/remediationworkflowstatus-crd.yaml restartPolicy: OnFailure {{- end }} # Run helm upgrade with --no-hooks to bypass the pre-upgrade hook \ No newline at end of file diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 5d21c3e2f..bb567af66 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -9,4 +9,4 @@ dependencies: repository: file://./charts/remediation version: v1.0.0 digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07 -generated: "2025-10-15T22:05:24.769635037Z" +generated: "2025-11-03T10:08:37.655536804Z" diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md index a9b86c505..80eae601f 100644 --- a/helm-charts-k8s/README.md +++ b/helm-charts-k8s/README.md @@ -70,10 +70,12 @@ helm install amd-gpu-operator rocm/gpu-operator-charts \ --version=v1.4.0 ``` -#### Installation Options - -* Skip NFD installation: `--set node-feature-discovery.enabled=false` -* Skip KMM installation: `--set kmm.enabled=false` +```{note} +Installation Options + - Skip NFD installation: `--set node-feature-discovery.enabled=false` + - Skip KMM installation: `--set kmm.enabled=false` + - Skip Auto Node Remediation: `--set remediation.enabled=false` +``` > [!WARNING] > It is strongly recommended to use AMD-optimized KMM images included in the operator release. This is not required when installing the GPU Operator on Red Hat OpenShift. 
@@ -312,4 +314,8 @@ Kubernetes: `>= 1.29.0-0` | kmm.webhookService.ports[0].protocol | string | `"TCP"` | | | kmm.webhookService.ports[0].targetPort | int | `9443` | | | kmm.webhookService.type | string | `"ClusterIP"` | | +| remediation-controller.controller.affinity.nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution[0].preference.matchExpressions[0].key | string | `"node-role.kubernetes.io/control-plane"` | | +| remediation-controller.controller.affinity.nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution[0].preference.matchExpressions[0].operator | string | `"Exists"` | | +| remediation-controller.controller.affinity.nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution[0].weight | int | `1` | | | remediation-controller.controller.image | string | `"quay.io/argoproj/workflow-controller:v3.6.5"` | | +| remediation-controller.controller.nodeSelector | object | `{}` | | diff --git a/helm-charts-k8s/charts/remediation/templates/deployment.yaml b/helm-charts-k8s/charts/remediation/templates/deployment.yaml index d6637380a..a9bfc08a5 100644 --- a/helm-charts-k8s/charts/remediation/templates/deployment.yaml +++ b/helm-charts-k8s/charts/remediation/templates/deployment.yaml @@ -2988,6 +2988,11 @@ spec: labels: app: amd-gpu-operator-workflow-controller spec: + {{- with .Values.controller.affinity }} + affinity: + {{- toYaml . 
| nindent 8 }} + {{- end }} + nodeSelector: {{- toYaml .Values.controller.nodeSelector | nindent 8 }} containers: - args: [] command: @@ -3019,8 +3024,6 @@ spec: - ALL readOnlyRootFilesystem: true runAsNonRoot: true - nodeSelector: - kubernetes.io/os: linux priorityClassName: workflow-controller securityContext: runAsNonRoot: true diff --git a/helm-charts-k8s/charts/remediation/values.yaml b/helm-charts-k8s/charts/remediation/values.yaml index 52383fc80..83339b213 100644 --- a/helm-charts-k8s/charts/remediation/values.yaml +++ b/helm-charts-k8s/charts/remediation/values.yaml @@ -1,2 +1,11 @@ controller: - image: "quay.io/argoproj/workflow-controller:v3.6.5" \ No newline at end of file + image: "quay.io/argoproj/workflow-controller:v3.6.5" + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + nodeSelector: {} \ No newline at end of file diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 97b637e41..752cf1413 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -409,10 +409,13 @@ spec: description: |- image registry to fetch base image for building driver image, default value is docker.io, the builder will search for corresponding OS base image from given registry e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04 + Use spec.driver.imageRegistrySecret for authentication with private registries. 
NOTE: this field won't apply for OpenShift since OpenShift is using its own DriverToolKit image to build driver image type: string baseImageRegistryTLS: - description: TLS settings for fetching base image + description: |- + TLS settings for fetching base image + this field will be applied to SourceImageRepo as well properties: insecure: description: If true, check if the container image already @@ -422,6 +425,14 @@ spec: description: If true, skip any TLS server certificate validation type: boolean type: object + sourceImageRepo: + description: |- + SourceImageRepo specifies the image repository for the driver source code (OpenShift only). + Used when spec.driver.useSourceImage is true. The operator automatically determines the image tag + based on cluster RHEL version and spec.driver.version (format: coreos--). + Default: docker.io/rocm/amdgpu-driver + Use spec.driver.imageRegistrySecret for authentication with private registries. + type: string type: object imageRegistrySecret: description: secrets used for pull/push images from/to private registry @@ -627,6 +638,12 @@ spec: perform reboot on worker nodes type: boolean type: object + useSourceImage: + description: |- + NOTE: currently only for OpenShift cluster + set to true to use source image to build driver image on the fly + otherwise use installer debian/rpm packages from radeon repo to build driver image + type: boolean version: description: |- version of the drivers source code, can be used as part of image of dockerfile source image diff --git a/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml b/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml new file mode 100644 index 000000000..aa5c0ac02 --- /dev/null +++ b/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml @@ -0,0 +1,74 @@ +--- +# Source: gpu-operator-charts/templates/remediationworkflowstatus-crd.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: remediationworkflowstatuses.amd.com + 
annotations: + controller-gen.kubebuilder.io/version: v0.17.0 + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + helm.sh/chart: gpu-operator-charts-v1.4.0 + app.kubernetes.io/name: gpu-operator-charts + app.kubernetes.io/instance: amd-gpu + app.kubernetes.io/version: "v1.4.0" + app.kubernetes.io/managed-by: Helm +spec: + group: amd.com + names: + kind: RemediationWorkflowStatus + listKind: RemediationWorkflowStatusList + plural: remediationworkflowstatuses + shortNames: + - rwfstatus + singular: remediationworkflowstatus + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: RemediationWorkflowStatus keeps a record of recent remediation + workflow runs. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + status: + additionalProperties: + additionalProperties: + items: + properties: + name: + type: string + startTime: + type: string + type: object + type: array + type: object + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/helm-charts-k8s/templates/argo-rbac.yaml b/helm-charts-k8s/templates/argo-rbac.yaml index d707b00da..c593dcd19 100644 --- a/helm-charts-k8s/templates/argo-rbac.yaml +++ b/helm-charts-k8s/templates/argo-rbac.yaml @@ -26,6 +26,14 @@ rules: - pods/log verbs: - '*' +- apiGroups: + - "" + resources: + - serviceaccounts + verbs: + - get + - list + - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/helm-charts-k8s/templates/default-deviceconfig.yaml b/helm-charts-k8s/templates/default-deviceconfig.yaml index a5cbaf7b3..3c40cff8a 100644 --- a/helm-charts-k8s/templates/default-deviceconfig.yaml +++ b/helm-charts-k8s/templates/default-deviceconfig.yaml @@ -1,4 +1,3 @@ - {{- if or (and .Release.IsInstall .Values.crds.defaultCR.install) (and .Release.IsUpgrade .Values.crds.defaultCR.upgrade) }} {{- if and (hasKey .Values "deviceConfig") (hasKey .Values.deviceConfig "spec") }} apiVersion: amd.com/v1alpha1 @@ -29,6 +28,14 @@ spec: driverType: {{ . }} {{- end }} + {{- if (hasKey . "useSourceImage") }} + useSourceImage: {{ .useSourceImage }} + {{- end }} + + {{- with .amdgpuInstallerRepoURL }} + amdgpuInstallerRepoURL: {{ . }} + {{- end }} + {{- with .vfioConfig }} vfioConfig: {{- with .deviceIDs }} @@ -221,6 +228,25 @@ spec: {{- toYaml . | nindent 6 }} {{- end }} + {{- with .podResourceAPISocketPath }} + podResourceAPISocketPath: {{ . }} + {{- end }} + + {{- with .resource }} + resource: + {{- toYaml . 
| nindent 6 }} + {{- end }} + + {{- with .podAnnotations }} + podAnnotations: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .serviceAnnotations }} + serviceAnnotations: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .rbacConfig }} rbacConfig: {{- if (hasKey . "enable") }} @@ -296,6 +322,10 @@ spec: {{- toYaml . | nindent 10 }} {{- end }} + {{- with .bearerTokenFile }} + bearerTokenFile: {{ . }} + {{- end }} + {{- with .tlsConfig }} tlsConfig: {{- toYaml . | nindent 10 }} @@ -389,5 +419,25 @@ spec: {{- end }} {{- end }} + {{- with .Values.deviceConfig.spec.remediationWorkflow }} + remediationWorkflow: + {{- if (hasKey . "enable") }} + enable: {{ .enable }} + {{- end }} + + {{- with .conditionalWorkflows }} + conditionalWorkflows: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .ttlForFailedWorkflows }} + ttlForFailedWorkflows: {{ . }} + {{- end }} + + {{- with .testerImage }} + testerImage: {{ . }} + {{- end }} + {{- end }} + {{- end }} {{- end }} diff --git a/helm-charts-k8s/templates/manager-rbac.yaml b/helm-charts-k8s/templates/manager-rbac.yaml index a09a11937..4a9547ec4 100644 --- a/helm-charts-k8s/templates/manager-rbac.yaml +++ b/helm-charts-k8s/templates/manager-rbac.yaml @@ -80,6 +80,7 @@ rules: - amd.com resources: - deviceconfigs + - remediationworkflowstatuses verbs: - create - get @@ -91,12 +92,14 @@ rules: - amd.com resources: - deviceconfigs/finalizers + - remediationworkflowstatuses/finalizers verbs: - update - apiGroups: - amd.com resources: - deviceconfigs/status + - remediationworkflowstatuses/status verbs: - get - patch diff --git a/helm-charts-k8s/templates/nic-nfd-default-rule.yaml b/helm-charts-k8s/templates/nic-nfd-default-rule.yaml index d9a2a6c87..da4251dd9 100644 --- a/helm-charts-k8s/templates/nic-nfd-default-rule.yaml +++ b/helm-charts-k8s/templates/nic-nfd-default-rule.yaml @@ -8,6 +8,20 @@ metadata: # source2: https://devicehunt.com/view/type/pci/vendor/1dd8 spec: rules: + - name: amd-vnic + labels: + 
feature.node.kubernetes.io/amd-vnic: "true" + matchAny: + - matchFeatures: + - feature: kernel.loadedmodule + matchExpressions: + ionic: {op: Exists} + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1dd8"]} # AMD Pensando Systems + device: {op: In, value: ["1003"]} # DSC Ethernet Controller VF + subsystem_vendor: {op: In, value: ["1dd8"]} + subsystem_device: {op: In, value: ["5201"]} # POLLARA-1Q400 100/200/400G 1-port Card - name: amd-nic labels: feature.node.kubernetes.io/amd-nic: "true" diff --git a/helm-charts-k8s/templates/post-delete-hook.yaml b/helm-charts-k8s/templates/post-delete-hook.yaml index ad54a95e6..da7c287d4 100644 --- a/helm-charts-k8s/templates/post-delete-hook.yaml +++ b/helm-charts-k8s/templates/post-delete-hook.yaml @@ -83,6 +83,9 @@ spec: if kubectl get crds deviceconfigs.amd.com > /dev/null 2>&1; then kubectl delete crds deviceconfigs.amd.com fi + if kubectl get crds remediationworkflowstatuses.amd.com > /dev/null 2>&1; then + kubectl delete crds remediationworkflowstatuses.amd.com + fi {{- if index .Values "node-feature-discovery" "enabled" }} if kubectl get crds nodefeaturegroups.nfd.k8s-sigs.io > /dev/null 2>&1; then kubectl delete crds nodefeaturegroups.nfd.k8s-sigs.io diff --git a/helm-charts-k8s/templates/pre-upgrade-hook.yaml b/helm-charts-k8s/templates/pre-upgrade-hook.yaml index 5a30e9da7..e93b101bd 100644 --- a/helm-charts-k8s/templates/pre-upgrade-hook.yaml +++ b/helm-charts-k8s/templates/pre-upgrade-hook.yaml @@ -224,6 +224,7 @@ spec: kubectl apply -f /opt/helm-charts-crds-k8s/module-crd.yaml kubectl apply -f /opt/helm-charts-crds-k8s/nodemodulesconfig-crd.yaml {{- end }} + kubectl apply -f /opt/helm-charts-crds-k8s/remediationworkflowstatus-crd.yaml restartPolicy: OnFailure {{- end }} # Run helm upgrade with --no-hooks to bypass the pre-upgrade hook \ No newline at end of file diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml index dbaacc51a..985b72a64 100644 --- 
a/helm-charts-k8s/values.yaml +++ b/helm-charts-k8s/values.yaml @@ -230,6 +230,8 @@ deviceConfig: maxUnavailable: 1 # -- config manager tolerations configManagerTolerations: [] + remediationWorkflow: + enable: false # AMD GPU operator controller related configs controllerManager: manager: diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index 9952ec320..2310380d5 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-10-15T22:05:38.558917303Z" +generated: "2025-11-03T10:08:49.883010865Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml index 97b637e41..752cf1413 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -409,10 +409,13 @@ spec: description: |- image registry to fetch base image for building driver image, default value is docker.io, the builder will search for corresponding OS base image from given registry e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04 + Use spec.driver.imageRegistrySecret for authentication with private registries. 
NOTE: this field won't apply for OpenShift since OpenShift is using its own DriverToolKit image to build driver image type: string baseImageRegistryTLS: - description: TLS settings for fetching base image + description: |- + TLS settings for fetching base image + this field will be applied to SourceImageRepo as well properties: insecure: description: If true, check if the container image already @@ -422,6 +425,14 @@ spec: description: If true, skip any TLS server certificate validation type: boolean type: object + sourceImageRepo: + description: |- + SourceImageRepo specifies the image repository for the driver source code (OpenShift only). + Used when spec.driver.useSourceImage is true. The operator automatically determines the image tag + based on cluster RHEL version and spec.driver.version (format: coreos--). + Default: docker.io/rocm/amdgpu-driver + Use spec.driver.imageRegistrySecret for authentication with private registries. + type: string type: object imageRegistrySecret: description: secrets used for pull/push images from/to private registry @@ -627,6 +638,12 @@ spec: perform reboot on worker nodes type: boolean type: object + useSourceImage: + description: |- + NOTE: currently only for OpenShift cluster + set to true to use source image to build driver image on the fly + otherwise use installer debian/rpm packages from radeon repo to build driver image + type: boolean version: description: |- version of the drivers source code, can be used as part of image of dockerfile source image diff --git a/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml b/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml new file mode 100644 index 000000000..aa5c0ac02 --- /dev/null +++ b/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml @@ -0,0 +1,74 @@ +--- +# Source: gpu-operator-charts/templates/remediationworkflowstatus-crd.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: 
remediationworkflowstatuses.amd.com + annotations: + controller-gen.kubebuilder.io/version: v0.17.0 + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + helm.sh/chart: gpu-operator-charts-v1.4.0 + app.kubernetes.io/name: gpu-operator-charts + app.kubernetes.io/instance: amd-gpu + app.kubernetes.io/version: "v1.4.0" + app.kubernetes.io/managed-by: Helm +spec: + group: amd.com + names: + kind: RemediationWorkflowStatus + listKind: RemediationWorkflowStatusList + plural: remediationworkflowstatuses + shortNames: + - rwfstatus + singular: remediationworkflowstatus + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: RemediationWorkflowStatus keeps a record of recent remediation + workflow runs. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + status: + additionalProperties: + additionalProperties: + items: + properties: + name: + type: string + startTime: + type: string + type: object + type: array + type: object + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/helm-charts-openshift/templates/manager-rbac.yaml b/helm-charts-openshift/templates/manager-rbac.yaml index 0a94cce45..fe673eb6c 100644 --- a/helm-charts-openshift/templates/manager-rbac.yaml +++ b/helm-charts-openshift/templates/manager-rbac.yaml @@ -80,6 +80,7 @@ rules: - amd.com resources: - deviceconfigs + - remediationworkflowstatuses verbs: - create - get @@ -91,12 +92,14 @@ rules: - amd.com resources: - deviceconfigs/finalizers + - remediationworkflowstatuses/finalizers verbs: - update - apiGroups: - amd.com resources: - deviceconfigs/status + - remediationworkflowstatuses/status verbs: - get - patch diff --git a/internal/controllers/device_config_reconciler.go b/internal/controllers/device_config_reconciler.go index 2a23c0851..90ef9e269 100644 --- a/internal/controllers/device_config_reconciler.go +++ b/internal/controllers/device_config_reconciler.go @@ -162,6 +162,9 @@ func (r *DeviceConfigReconciler) init(ctx context.Context) { //+kubebuilder:rbac:groups=amd.com,resources=deviceconfigs,verbs=get;list;watch;create;patch;update //+kubebuilder:rbac:groups=amd.com,resources=deviceconfigs/status,verbs=get;patch;update //+kubebuilder:rbac:groups=amd.com,resources=deviceconfigs/finalizers,verbs=update +//+kubebuilder:rbac:groups=amd.com,resources=remediationworkflowstatuses,verbs=get;list;watch;create;patch;update +//+kubebuilder:rbac:groups=amd.com,resources=remediationworkflowstatuses/status,verbs=get;patch;update 
+//+kubebuilder:rbac:groups=amd.com,resources=remediationworkflowstatuses/finalizers,verbs=update //+kubebuilder:rbac:groups=kmm.sigs.x-k8s.io,resources=modules,verbs=get;list;watch;create;patch;update;delete //+kubebuilder:rbac:groups=kmm.sigs.x-k8s.io,resources=modules/status,verbs=get;update;patch //+kubebuilder:rbac:groups=kmm.sigs.x-k8s.io,resources=modules/finalizers,verbs=get;update;watch diff --git a/internal/controllers/mock_remediation_handler.go b/internal/controllers/mock_remediation_handler.go index f08c1bcce..0ab77e2c7 100644 --- a/internal/controllers/mock_remediation_handler.go +++ b/internal/controllers/mock_remediation_handler.go @@ -27,6 +27,7 @@ package controllers import ( context "context" reflect "reflect" + time "time" v1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" v1alpha10 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1" @@ -111,6 +112,61 @@ func (m *MockremediationMgrHelperAPI) EXPECT() *MockremediationMgrHelperAPIMockR return m.recorder } +// abortWorkflow mocks base method. +func (m *MockremediationMgrHelperAPI) abortWorkflow(ctx context.Context, workflow *v1alpha10.Workflow) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "abortWorkflow", ctx, workflow) + ret0, _ := ret[0].(error) + return ret0 +} + +// abortWorkflow indicates an expected call of abortWorkflow. +func (mr *MockremediationMgrHelperAPIMockRecorder) abortWorkflow(ctx, workflow any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "abortWorkflow", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).abortWorkflow), ctx, workflow) +} + +// attemptAbortWorkflowOnNode mocks base method. 
+func (m *MockremediationMgrHelperAPI) attemptAbortWorkflowOnNode(ctx context.Context, node *v1.Node, wf *v1alpha10.Workflow) (bool, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "attemptAbortWorkflowOnNode", ctx, node, wf) + ret0, _ := ret[0].(bool) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// attemptAbortWorkflowOnNode indicates an expected call of attemptAbortWorkflowOnNode. +func (mr *MockremediationMgrHelperAPIMockRecorder) attemptAbortWorkflowOnNode(ctx, node, wf any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "attemptAbortWorkflowOnNode", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).attemptAbortWorkflowOnNode), ctx, node, wf) +} + +// attemptResumeWorkflowOnNode mocks base method. +func (m *MockremediationMgrHelperAPI) attemptResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping ConditionWorkflowMapping, wf *v1alpha10.Workflow) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "attemptResumeWorkflowOnNode", ctx, node, mapping, wf) +} + +// attemptResumeWorkflowOnNode indicates an expected call of attemptResumeWorkflowOnNode. +func (mr *MockremediationMgrHelperAPIMockRecorder) attemptResumeWorkflowOnNode(ctx, node, mapping, wf any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "attemptResumeWorkflowOnNode", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).attemptResumeWorkflowOnNode), ctx, node, mapping, wf) +} + +// canResumeWorkflowOnNode mocks base method. +func (m *MockremediationMgrHelperAPI) canResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping *ConditionWorkflowMapping) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "canResumeWorkflowOnNode", ctx, node, mapping) + ret0, _ := ret[0].(bool) + return ret0 +} + +// canResumeWorkflowOnNode indicates an expected call of canResumeWorkflowOnNode. 
+func (mr *MockremediationMgrHelperAPIMockRecorder) canResumeWorkflowOnNode(ctx, node, mapping any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "canResumeWorkflowOnNode", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).canResumeWorkflowOnNode), ctx, node, mapping) +} + // checkIfTaintExists mocks base method. func (m *MockremediationMgrHelperAPI) checkIfTaintExists(node *v1.Node, targetTaint v1.Taint) bool { m.ctrl.T.Helper() @@ -170,6 +226,21 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) createDefaultWorkflowTemplate return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "createDefaultWorkflowTemplate", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).createDefaultWorkflowTemplate), ctx, devConfig) } +// createRemediationWorkflowStatus mocks base method. +func (m *MockremediationMgrHelperAPI) createRemediationWorkflowStatus(ctx context.Context, namespace string) (*v1alpha1.RemediationWorkflowStatus, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "createRemediationWorkflowStatus", ctx, namespace) + ret0, _ := ret[0].(*v1alpha1.RemediationWorkflowStatus) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// createRemediationWorkflowStatus indicates an expected call of createRemediationWorkflowStatus. +func (mr *MockremediationMgrHelperAPIMockRecorder) createRemediationWorkflowStatus(ctx, namespace any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "createRemediationWorkflowStatus", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).createRemediationWorkflowStatus), ctx, namespace) +} + // createWorkflow mocks base method. 
func (m *MockremediationMgrHelperAPI) createWorkflow(ctx context.Context, workflow *v1alpha10.Workflow) error { m.ctrl.T.Helper() @@ -212,6 +283,34 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) deleteWorkflow(ctx, workflow return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "deleteWorkflow", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).deleteWorkflow), ctx, workflow) } +// dropOlderRecoveryAttemptsFromStatusCR mocks base method. +func (m *MockremediationMgrHelperAPI) dropOlderRecoveryAttemptsFromStatusCR(ctx context.Context, namespace string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "dropOlderRecoveryAttemptsFromStatusCR", ctx, namespace) + ret0, _ := ret[0].(error) + return ret0 +} + +// dropOlderRecoveryAttemptsFromStatusCR indicates an expected call of dropOlderRecoveryAttemptsFromStatusCR. +func (mr *MockremediationMgrHelperAPIMockRecorder) dropOlderRecoveryAttemptsFromStatusCR(ctx, namespace any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "dropOlderRecoveryAttemptsFromStatusCR", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).dropOlderRecoveryAttemptsFromStatusCR), ctx, namespace) +} + +// dropOlderRecoveryAttemptsInternal mocks base method. +func (m *MockremediationMgrHelperAPI) dropOlderRecoveryAttemptsInternal(nodeName, nodeCondition, windowSize string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "dropOlderRecoveryAttemptsInternal", nodeName, nodeCondition, windowSize) + ret0, _ := ret[0].(error) + return ret0 +} + +// dropOlderRecoveryAttemptsInternal indicates an expected call of dropOlderRecoveryAttemptsInternal. 
+func (mr *MockremediationMgrHelperAPIMockRecorder) dropOlderRecoveryAttemptsInternal(nodeName, nodeCondition, windowSize any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "dropOlderRecoveryAttemptsInternal", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).dropOlderRecoveryAttemptsInternal), nodeName, nodeCondition, windowSize) +} + // getConfigMap mocks base method. func (m *MockremediationMgrHelperAPI) getConfigMap(ctx context.Context, configmapName, namespace string) (*v1.ConfigMap, error) { m.ctrl.T.Helper() @@ -227,6 +326,92 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) getConfigMap(ctx, configmapNa return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getConfigMap", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getConfigMap), ctx, configmapName, namespace) } +// getMaxAllowedRunsPerWindow mocks base method. +func (m *MockremediationMgrHelperAPI) getMaxAllowedRunsPerWindow(recoveryPolicy *RecoveryPolicyConfig) int { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getMaxAllowedRunsPerWindow", recoveryPolicy) + ret0, _ := ret[0].(int) + return ret0 +} + +// getMaxAllowedRunsPerWindow indicates an expected call of getMaxAllowedRunsPerWindow. +func (mr *MockremediationMgrHelperAPIMockRecorder) getMaxAllowedRunsPerWindow(recoveryPolicy any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getMaxAllowedRunsPerWindow", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getMaxAllowedRunsPerWindow), recoveryPolicy) +} + +// getRecentRecoveryCount mocks base method. +func (m *MockremediationMgrHelperAPI) getRecentRecoveryCount(nodeName, nodeCondition string) int { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getRecentRecoveryCount", nodeName, nodeCondition) + ret0, _ := ret[0].(int) + return ret0 +} + +// getRecentRecoveryCount indicates an expected call of getRecentRecoveryCount. 
+func (mr *MockremediationMgrHelperAPIMockRecorder) getRecentRecoveryCount(nodeName, nodeCondition any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getRecentRecoveryCount", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getRecentRecoveryCount), nodeName, nodeCondition) +} + +// getRecoveryTrackerKey mocks base method. +func (m *MockremediationMgrHelperAPI) getRecoveryTrackerKey(nodeName, nodeCondition string) (string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getRecoveryTrackerKey", nodeName, nodeCondition) + ret0, _ := ret[0].(string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// getRecoveryTrackerKey indicates an expected call of getRecoveryTrackerKey. +func (mr *MockremediationMgrHelperAPIMockRecorder) getRecoveryTrackerKey(nodeName, nodeCondition any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getRecoveryTrackerKey", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getRecoveryTrackerKey), nodeName, nodeCondition) +} + +// getRemediationWorkflowStatus mocks base method. +func (m *MockremediationMgrHelperAPI) getRemediationWorkflowStatus(ctx context.Context, namespace string) (*v1alpha1.RemediationWorkflowStatus, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getRemediationWorkflowStatus", ctx, namespace) + ret0, _ := ret[0].(*v1alpha1.RemediationWorkflowStatus) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// getRemediationWorkflowStatus indicates an expected call of getRemediationWorkflowStatus. +func (mr *MockremediationMgrHelperAPIMockRecorder) getRemediationWorkflowStatus(ctx, namespace any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getRemediationWorkflowStatus", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getRemediationWorkflowStatus), ctx, namespace) +} + +// getServiceAccountName mocks base method. 
+func (m *MockremediationMgrHelperAPI) getServiceAccountName(ctx context.Context, devConfig *v1alpha1.DeviceConfig) string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getServiceAccountName", ctx, devConfig) + ret0, _ := ret[0].(string) + return ret0 +} + +// getServiceAccountName indicates an expected call of getServiceAccountName. +func (mr *MockremediationMgrHelperAPIMockRecorder) getServiceAccountName(ctx, devConfig any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getServiceAccountName", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getServiceAccountName), ctx, devConfig) +} + +// getWindowSize mocks base method. +func (m *MockremediationMgrHelperAPI) getWindowSize(recoveryPolicy *RecoveryPolicyConfig) string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getWindowSize", recoveryPolicy) + ret0, _ := ret[0].(string) + return ret0 +} + +// getWindowSize indicates an expected call of getWindowSize. +func (mr *MockremediationMgrHelperAPIMockRecorder) getWindowSize(recoveryPolicy any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getWindowSize", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getWindowSize), recoveryPolicy) +} + // getWorkflowList mocks base method. func (m *MockremediationMgrHelperAPI) getWorkflowList(ctx context.Context, namespace string) (*v1alpha10.WorkflowList, error) { m.ctrl.T.Helper() @@ -242,6 +427,21 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) getWorkflowList(ctx, namespac return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getWorkflowList", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getWorkflowList), ctx, namespace) } +// getWorkflowTaskScriptSource mocks base method. 
+func (m *MockremediationMgrHelperAPI) getWorkflowTaskScriptSource(scriptFileName string) (string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getWorkflowTaskScriptSource", scriptFileName) + ret0, _ := ret[0].(string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// getWorkflowTaskScriptSource indicates an expected call of getWorkflowTaskScriptSource. +func (mr *MockremediationMgrHelperAPIMockRecorder) getWorkflowTaskScriptSource(scriptFileName any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getWorkflowTaskScriptSource", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).getWorkflowTaskScriptSource), scriptFileName) +} + // getWorkflowTemplate mocks base method. func (m *MockremediationMgrHelperAPI) getWorkflowTemplate(ctx context.Context, workflowTemplateName, namespace string) (*v1alpha10.WorkflowTemplate, error) { m.ctrl.T.Helper() @@ -272,17 +472,31 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) getWorkflowUtilityImage(devCo } // handleExistingWorkflowsOnNode mocks base method. -func (m *MockremediationMgrHelperAPI) handleExistingWorkflowsOnNode(ctx context.Context, devConfig *v1alpha1.DeviceConfig, node *v1.Node) bool { +func (m *MockremediationMgrHelperAPI) handleExistingWorkflowsOnNode(ctx context.Context, devConfig *v1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping) bool { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "handleExistingWorkflowsOnNode", ctx, devConfig, node) + ret := m.ctrl.Call(m, "handleExistingWorkflowsOnNode", ctx, devConfig, node, mapping) ret0, _ := ret[0].(bool) return ret0 } // handleExistingWorkflowsOnNode indicates an expected call of handleExistingWorkflowsOnNode. 
-func (mr *MockremediationMgrHelperAPIMockRecorder) handleExistingWorkflowsOnNode(ctx, devConfig, node any) *gomock.Call { +func (mr *MockremediationMgrHelperAPIMockRecorder) handleExistingWorkflowsOnNode(ctx, devConfig, node, mapping any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "handleExistingWorkflowsOnNode", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).handleExistingWorkflowsOnNode), ctx, devConfig, node) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "handleExistingWorkflowsOnNode", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).handleExistingWorkflowsOnNode), ctx, devConfig, node, mapping) +} + +// handleSuspendedWorkflowsOnNode mocks base method. +func (m *MockremediationMgrHelperAPI) handleSuspendedWorkflowsOnNode(ctx context.Context, devConfig *v1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping, wf *v1alpha10.Workflow) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "handleSuspendedWorkflowsOnNode", ctx, devConfig, node, mapping, wf) + ret0, _ := ret[0].(bool) + return ret0 +} + +// handleSuspendedWorkflowsOnNode indicates an expected call of handleSuspendedWorkflowsOnNode. +func (mr *MockremediationMgrHelperAPIMockRecorder) handleSuspendedWorkflowsOnNode(ctx, devConfig, node, mapping, wf any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "handleSuspendedWorkflowsOnNode", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).handleSuspendedWorkflowsOnNode), ctx, devConfig, node, mapping, wf) } // isDriverUpgradeInProgress mocks base method. @@ -299,6 +513,48 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) isDriverUpgradeInProgress(dev return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isDriverUpgradeInProgress", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).isDriverUpgradeInProgress), devCfg, node) } +// isNodeLabelledForAbortWorkflow mocks base method. 
+func (m *MockremediationMgrHelperAPI) isNodeLabelledForAbortWorkflow(node *v1.Node) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "isNodeLabelledForAbortWorkflow", node) + ret0, _ := ret[0].(bool) + return ret0 +} + +// isNodeLabelledForAbortWorkflow indicates an expected call of isNodeLabelledForAbortWorkflow. +func (mr *MockremediationMgrHelperAPIMockRecorder) isNodeLabelledForAbortWorkflow(node any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isNodeLabelledForAbortWorkflow", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).isNodeLabelledForAbortWorkflow), node) +} + +// isNodeLabelledForForceResume mocks base method. +func (m *MockremediationMgrHelperAPI) isNodeLabelledForForceResume(ctx context.Context, node *v1.Node) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "isNodeLabelledForForceResume", ctx, node) + ret0, _ := ret[0].(bool) + return ret0 +} + +// isNodeLabelledForForceResume indicates an expected call of isNodeLabelledForForceResume. +func (mr *MockremediationMgrHelperAPIMockRecorder) isNodeLabelledForForceResume(ctx, node any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isNodeLabelledForForceResume", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).isNodeLabelledForForceResume), ctx, node) +} + +// isRecoveryPolicyViolated mocks base method. +func (m *MockremediationMgrHelperAPI) isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "isRecoveryPolicyViolated", ctx, nodeName, mapping) + ret0, _ := ret[0].(bool) + return ret0 +} + +// isRecoveryPolicyViolated indicates an expected call of isRecoveryPolicyViolated. 
+func (mr *MockremediationMgrHelperAPIMockRecorder) isRecoveryPolicyViolated(ctx, nodeName, mapping any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isRecoveryPolicyViolated", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).isRecoveryPolicyViolated), ctx, nodeName, mapping) +} + // isRemediationDisabled mocks base method. func (m *MockremediationMgrHelperAPI) isRemediationDisabled(ctx context.Context, devConfig *v1alpha1.DeviceConfig) (bool, error) { m.ctrl.T.Helper() @@ -314,6 +570,20 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) isRemediationDisabled(ctx, de return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isRemediationDisabled", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).isRemediationDisabled), ctx, devConfig) } +// isStatusSynced mocks base method. +func (m *MockremediationMgrHelperAPI) isStatusSynced(ctx context.Context) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "isStatusSynced", ctx) + ret0, _ := ret[0].(bool) + return ret0 +} + +// isStatusSynced indicates an expected call of isStatusSynced. +func (mr *MockremediationMgrHelperAPIMockRecorder) isStatusSynced(ctx any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isStatusSynced", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).isStatusSynced), ctx) +} + // isWorkflowSchedulableOnNode mocks base method. func (m *MockremediationMgrHelperAPI) isWorkflowSchedulableOnNode(ctx context.Context, devConfig *v1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping) bool { m.ctrl.T.Helper() @@ -342,6 +612,76 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) populateWorkflow(ctx, wfTempl return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "populateWorkflow", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).populateWorkflow), ctx, wfTemplate, mapping, nodeName, devCfg) } +// registerRecoveryAttempt mocks base method. 
+func (m *MockremediationMgrHelperAPI) registerRecoveryAttempt(ctx context.Context, nodeName, nodeCondition, namespace, wfName string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "registerRecoveryAttempt", ctx, nodeName, nodeCondition, namespace, wfName) + ret0, _ := ret[0].(error) + return ret0 +} + +// registerRecoveryAttempt indicates an expected call of registerRecoveryAttempt. +func (mr *MockremediationMgrHelperAPIMockRecorder) registerRecoveryAttempt(ctx, nodeName, nodeCondition, namespace, wfName any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "registerRecoveryAttempt", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).registerRecoveryAttempt), ctx, nodeName, nodeCondition, namespace, wfName) +} + +// registerRecoveryAttemptInternal mocks base method. +func (m *MockremediationMgrHelperAPI) registerRecoveryAttemptInternal(nodeName, nodeCondition, namespace string, startTime time.Time) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "registerRecoveryAttemptInternal", nodeName, nodeCondition, namespace, startTime) + ret0, _ := ret[0].(error) + return ret0 +} + +// registerRecoveryAttemptInternal indicates an expected call of registerRecoveryAttemptInternal. +func (mr *MockremediationMgrHelperAPIMockRecorder) registerRecoveryAttemptInternal(nodeName, nodeCondition, namespace, startTime any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "registerRecoveryAttemptInternal", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).registerRecoveryAttemptInternal), nodeName, nodeCondition, namespace, startTime) +} + +// registerRecoveryAttemptToStatusCR mocks base method. 
+func (m *MockremediationMgrHelperAPI) registerRecoveryAttemptToStatusCR(ctx context.Context, nodeName, nodeCondition, namespace, wfName string, startTime time.Time) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "registerRecoveryAttemptToStatusCR", ctx, nodeName, nodeCondition, namespace, wfName, startTime) + ret0, _ := ret[0].(error) + return ret0 +} + +// registerRecoveryAttemptToStatusCR indicates an expected call of registerRecoveryAttemptToStatusCR. +func (mr *MockremediationMgrHelperAPIMockRecorder) registerRecoveryAttemptToStatusCR(ctx, nodeName, nodeCondition, namespace, wfName, startTime any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "registerRecoveryAttemptToStatusCR", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).registerRecoveryAttemptToStatusCR), ctx, nodeName, nodeCondition, namespace, wfName, startTime) +} + +// removeAbortWorkflowLabelFromNode mocks base method. +func (m *MockremediationMgrHelperAPI) removeAbortWorkflowLabelFromNode(ctx context.Context, node *v1.Node) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "removeAbortWorkflowLabelFromNode", ctx, node) + ret0, _ := ret[0].(error) + return ret0 +} + +// removeAbortWorkflowLabelFromNode indicates an expected call of removeAbortWorkflowLabelFromNode. +func (mr *MockremediationMgrHelperAPIMockRecorder) removeAbortWorkflowLabelFromNode(ctx, node any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "removeAbortWorkflowLabelFromNode", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).removeAbortWorkflowLabelFromNode), ctx, node) +} + +// removeForceResumeWorkflowLabelFromNode mocks base method. 
+func (m *MockremediationMgrHelperAPI) removeForceResumeWorkflowLabelFromNode(ctx context.Context, node *v1.Node) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "removeForceResumeWorkflowLabelFromNode", ctx, node) + ret0, _ := ret[0].(error) + return ret0 +} + +// removeForceResumeWorkflowLabelFromNode indicates an expected call of removeForceResumeWorkflowLabelFromNode. +func (mr *MockremediationMgrHelperAPIMockRecorder) removeForceResumeWorkflowLabelFromNode(ctx, node any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "removeForceResumeWorkflowLabelFromNode", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).removeForceResumeWorkflowLabelFromNode), ctx, node) +} + // resumeSuspendedWorkflow mocks base method. func (m *MockremediationMgrHelperAPI) resumeSuspendedWorkflow(ctx context.Context, wfName, namespace string) error { m.ctrl.T.Helper() @@ -356,6 +696,20 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) resumeSuspendedWorkflow(ctx, return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "resumeSuspendedWorkflow", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).resumeSuspendedWorkflow), ctx, wfName, namespace) } +// syncInternalMapFromStatusCR mocks base method. +func (m *MockremediationMgrHelperAPI) syncInternalMapFromStatusCR(ctx context.Context, namespace string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "syncInternalMapFromStatusCR", ctx, namespace) + ret0, _ := ret[0].(error) + return ret0 +} + +// syncInternalMapFromStatusCR indicates an expected call of syncInternalMapFromStatusCR. +func (mr *MockremediationMgrHelperAPIMockRecorder) syncInternalMapFromStatusCR(ctx, namespace any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "syncInternalMapFromStatusCR", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).syncInternalMapFromStatusCR), ctx, namespace) +} + // validateNodeConditions mocks base method. 
func (m *MockremediationMgrHelperAPI) validateNodeConditions(ctx context.Context, devConfig *v1alpha1.DeviceConfig, node *v1.Node, mappings map[string]ConditionWorkflowMapping) (ConditionWorkflowMapping, error) { m.ctrl.T.Helper() diff --git a/internal/controllers/remediation/configs/default-configmap.yaml b/internal/controllers/remediation/configs/default-configmap.yaml new file mode 100644 index 000000000..4435fffe4 --- /dev/null +++ b/internal/controllers/remediation/configs/default-configmap.yaml @@ -0,0 +1,332 @@ +- nodeCondition: AMDGPUXgmi + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: Remove GPU tray from node. Confirm that all four screws on all eight OAMs are torqued as described in OAM Removal and Installation guide. Re-install the GPU tray into node. + notifyTestFailureMessage: 'Remove the failing UBB assembly and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good UBB assembly to the GPU tray.'
+ recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUBadPageRetirementThreshold + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + notifyTestFailureMessage: Check test run logs for next steps. + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUThrottle + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: thermal + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: Ensure that all platform fans are operational. For DLC, ensure that all liquid pumps, heat exchangers, etc. are operational, and that there are no leaks. Ensure that a static low fan speed has not been configured on the system. Fans must be set statically at full speed or in the dynamic mode that allows the BMC to adjust fan speeds based on observed component temperatures. For DLC, ensure that a static low pump speed has not been configured on the system. Pumps must be set statically at full speed or in the dynamic mode that allows the BMC to adjust pump speeds based on observed component temperatures. Verify data center temperatures are within the acceptable range. + notifyTestFailureMessage: Check test run logs for next steps.
+ recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUPcieLinkSpeedNotCorrect + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: pcie_link_status + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: '(1) Validate the PCIe links by running the following AGFHC command. Note if PCIe Link Speed Not Correct, or PCIe Link Width Not Correct, or if there are reported AER: device recovery failed in dmesg. /opt/amd/agfhc/agfhc -t pcie_link_status (2) Verify upstream devices (switches, root port) are working properly, by checking for error messages on the host. (See vendor diagnostics.) If there are host-related issues, resolve those and return to step 1. If there are no host-related issues, continue to step 3. (3) Check the mechanical connection of upstream devices (switches, root port). See vendor instructions. If issues are found and corrected, return to step 1. If no issues are found, continue to step 4. (4) Remove GPU tray from node. See vendor instructions. (5) Confirm that all four screws on all eight OAMs are torqued as described in OAM Removal and Installation. (6) Re-install the GPU tray into node. See vendor instructions. (7) Repeat steps 1 and 2. (8) If the problem still has not been resolved, then replace OAM (OAM location to be detected by OS/ System SEL-based monitoring). Send OS AER logs / System SEL event logs and dmesg with the RMA (see SA2 - RMA the Failing OAM).' + notifyTestFailureMessage: 'Remove the failing UBB assembly and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good UBB assembly to the GPU tray.'
+ recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUBootFailed + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: false + notifyRemediationMessage: Rerun the known failing workload. + notifyTestFailureMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUGemmPerformance + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + notifyTestFailureMessage: Check test run logs for next steps. + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUHBMError + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: false + notifyRemediationMessage: Rerun the known failing workload.
+ notifyTestFailureMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUPcieLinkWidthNotCorrect + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: pcie_link_status + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: '(1) Validate the PCIe links by running the following AGFHC command. Note if PCIe Link Speed Not Correct, or PCIe Link Width Not Correct, or if there are reported AER: device recovery failed in dmesg. /opt/amd/agfhc/agfhc -t pcie_link_status (2) Verify upstream devices (switches, root port) are working properly, by checking for error messages on the host. (See vendor diagnostics.) If there are host-related issues, resolve those and return to step 1. If there are no host-related issues, continue to step 3. (3) Check the mechanical connection of upstream devices (switches, root port). See vendor instructions. If issues are found and corrected, return to step 1. If no issues are found, continue to step 4. (4) Remove GPU tray from node. See vendor instructions. (5) Confirm that all four screws on all eight OAMs are torqued as described in OAM Removal and Installation. (6) Re-install the GPU tray into node. See vendor instructions. (7) Repeat steps 1 and 2. (8) If the problem still has not been resolved, then replace OAM (OAM location to be detected by OS/ System SEL-based monitoring). Send OS AER logs / System SEL event logs and dmesg with the RMA (see SA2 - RMA the Failing OAM).'
+ notifyTestFailureMessage: 'Remove the failing UBB assembly and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good UBB assembly to the GPU tray.' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUPcieBandwidthPerformance + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: pcie_link_status + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: '(1) Validate the PCIe links by running the following AGFHC command. Note if PCIe Link Speed Not Correct, or PCIe Link Width Not Correct, or if there are reported AER: device recovery failed in dmesg. /opt/amd/agfhc/agfhc -t pcie_link_status (2) Verify upstream devices (switches, root port) are working properly, by checking for error messages on the host. (See vendor diagnostics.) If there are host-related issues, resolve those and return to step 1. If there are no host-related issues, continue to step 3. (3) Check the mechanical connection of upstream devices (switches, root port). See vendor instructions. If issues are found and corrected, return to step 1. If no issues are found, continue to step 4. (4) Remove GPU tray from node. See vendor instructions. (5) Confirm that all four screws on all eight OAMs are torqued as described in OAM Removal and Installation. (6) Re-install the GPU tray into node. See vendor instructions. (7) Repeat steps 1 and 2. (8) If the problem still has not been resolved, then replace OAM (OAM location to be detected by OS/ System SEL-based monitoring). Send OS AER logs / System SEL event logs and dmesg with the RMA (see SA2 - RMA the Failing OAM).'
+ notifyTestFailureMessage: 'Remove the failing UBB assembly and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good UBB assembly to the GPU tray.' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUDeviceInternalError + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + notifyTestFailureMessage: Check test run logs for next steps. + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUXgmiBandwidthPerformance + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: 'Remove the failing UBB assembly and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good UBB assembly to the GPU tray.' + notifyTestFailureMessage: Check test run logs for next steps.
+ recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUWatchdogTimeoutWdt + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: false + notifyRemediationMessage: Rerun the known failing workload. + notifyTestFailureMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUProgramFaultDueToHardwareError + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: false + notifyRemediationMessage: Rerun the known failing workload. + notifyTestFailureMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUGeneralApplicationFault + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: false + notifyRemediationMessage: Rerun the known failing workload.
+ notifyTestFailureMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUHbmBandwidthPerformance + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + notifyTestFailureMessage: Check test run logs for next steps. + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUHwsHang + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: false + notifyRemediationMessage: Rerun the known failing workload. + notifyTestFailureMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).'
+ recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUPcieAer + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: pcie_link_status + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: 'Validate the PCIe links by running the following AGFHC command. Note if PCIe Link Speed Not Correct, or PCIe Link Width Not Correct, or if there are reported AER: device recovery failed in dmesg. /opt/amd/agfhc/agfhc -t pcie_link_status. Verify upstream devices (switches, root port) are working properly, by checking for error messages on the host. (See vendor diagnostics.) If there are host-related issues, resolve those and return to step 1. If there are no host-related issues, continue to step 3. Check the mechanical connection of upstream devices (switches, root port). See vendor instructions. If issues are found and corrected, return to step 1. If no issues are found, continue to step 4. Remove GPU tray from node. See vendor instructions. Confirm that all four screws on all eight OAMs are torqued as described in OAM Removal and Installation. Re-install the GPU tray into node. See vendor instructions. Repeat steps 1 and 2. If the problem still has not been resolved, then replace OAM (OAM location to be detected by OS/ System SEL-based monitoring). Send OS AER logs / System SEL event logs and dmesg with the RMA (see SA2 - RMA the Failing OAM).' + notifyTestFailureMessage: 'Remove the failing UBB assembly and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good UBB assembly to the GPU tray.'
+ recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUHardwareAssertionHwa + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: false + notifyRemediationMessage: Rerun the known failing workload. + notifyTestFailureMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUOnDieEcc + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + notifyTestFailureMessage: Check test run logs for next steps.
+ recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUEndToEndCrc + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + notifyTestFailureMessage: Check test run logs for next steps. + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUHplFailure + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: rochpl_isolation + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: false + notifyRemediationMessage: "" + notifyTestFailureMessage: 'Remove the failing UBB assembly and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good UBB assembly to the GPU tray.' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUWafl + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: Remove GPU tray from node. Confirm that all four screws on all eight OAMs are torqued as described in OAM Removal and Installation guide. Re-install the GPU tray into node.
+ notifyTestFailureMessage: 'Remove the failing UBB assembly and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good UBB assembly to the GPU tray.' + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: AMDGPUAccuracyCheckFailureAcf + workflowTemplate: default-template + validationTestsProfile: + framework: AGFHC + recipe: all_lvl4 + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: true + notifyRemediationMessage: 'Remove the failing OAM (see OAM Removal and Installation) and return to AMD, along with the relevant failure details: at a minimum this should be the RF event that indicated the original fail, and if that RF event includes an additional data URI, the CPER and/or the decoded JSON from the CPER as pointed by the additional data. Install a new or known-good OAM (see OAM Removal and Installation).' + notifyTestFailureMessage: Check test run logs for next steps. + recoveryPolicy: + maxAllowedRunsPerWindow: 3 + windowSize: 15m +- nodeCondition: "AMDGPUUnhealthy" + workflowTemplate: "default-template" + validationTestsProfile: + framework: "AGFHC" + recipe: "all_lvl4" + iterations: 1 + stopOnFailure: true + timeoutSeconds: 4800 + physicalActionNeeded: false + notifyTestFailureMessage: Check test run logs for next steps. \ No newline at end of file diff --git a/internal/controllers/remediation/scripts/drain.sh b/internal/controllers/remediation/scripts/drain.sh new file mode 100644 index 000000000..03aff201d --- /dev/null +++ b/internal/controllers/remediation/scripts/drain.sh @@ -0,0 +1,29 @@ +set -e +echo "Fetching node name..."
+NODE_NAME="{{inputs.parameters.node_name}}" +echo "Identified node: $NODE_NAME" +echo "Finding pods on node $NODE_NAME with volume mount path starting with /dev/dri..." +PODS=$(kubectl get pods --all-namespaces -o json | jq -r ' + .items[] | + select(.spec.nodeName == "'"$NODE_NAME"'") | + select( + ( + [.spec.volumes[]? | select(.hostPath?.path != null and (.hostPath.path | startswith("/dev/dri")))] + | length > 0 + ) or ( + [.spec.containers[]? | select(.resources.requests["amd.com/gpu"] != null)] + | length > 0 + ) + ) | + "\(.metadata.namespace) \(.metadata.name)" +') +if [ -z "$PODS" ]; then + echo "No pods with /dev/dri mounts found on node $NODE_NAME." +else + echo "Evicting pods:" + echo "$PODS" + echo "$PODS" | while read -r ns name; do + echo "Deleting pod $name in namespace $ns" + kubectl delete pod "$name" -n "$ns" --grace-period=0 --force || true + done +fi \ No newline at end of file diff --git a/internal/controllers/remediation/scripts/notify.sh b/internal/controllers/remediation/scripts/notify.sh new file mode 100644 index 000000000..11c321407 --- /dev/null +++ b/internal/controllers/remediation/scripts/notify.sh @@ -0,0 +1,27 @@ +set -e +NODE_NAME="{{inputs.parameters.nodeName}}" +NOTIFY_MESSAGE="{{inputs.parameters.notifyMessage}}" +EVENT_NAME="{{inputs.parameters.eventName}}" + +kubectl create -f - </dev/null; then + echo "Error: Job $JOB_NAME was not created in namespace $NAMESPACE" + exit 1 +fi + +timeout=$((TIMEOUTSECONDS * ITERATIONS)) +elapsed=0 +echo "Overall timeout for the job is set to $timeout seconds." +echo "Waiting for Job $JOB_NAME to complete..." + +while true; do + job_status=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[0].type}' 2>/dev/null || true) + if [ "$job_status" = "Complete" ]; then + echo "Test runner job completed successfully." 
+ kubectl logs -n $NAMESPACE job/$JOB_NAME + echo "Detailed run report can be found at /var/log/amd-test-runner" + exit 0 + elif [ "$job_status" = "Failed" ]; then + echo "Test runner job failed." + kubectl logs -n $NAMESPACE job/$JOB_NAME + echo "Detailed run report can be found at /var/log/amd-test-runner" + exit 1 + else + echo "Test runner job is still running. Waiting..." + sleep 60 + elapsed=$((elapsed + 60)) + if [ "$elapsed" -gt "$timeout" ]; then + echo "Timeout reached. Job did not complete within the specified time." + exit 1 + fi + fi +done \ No newline at end of file diff --git a/internal/controllers/remediation/scripts/untaint.sh b/internal/controllers/remediation/scripts/untaint.sh new file mode 100644 index 000000000..f8481b268 --- /dev/null +++ b/internal/controllers/remediation/scripts/untaint.sh @@ -0,0 +1,4 @@ +set -e +NODE_NAME="{{inputs.parameters.node_name}}" +echo "Untainting node $NODE_NAME" +kubectl taint node "$NODE_NAME" amd-gpu-unhealthy:NoSchedule- \ No newline at end of file diff --git a/internal/controllers/remediation/scripts/wait.sh b/internal/controllers/remediation/scripts/wait.sh new file mode 100644 index 000000000..3b4522c75 --- /dev/null +++ b/internal/controllers/remediation/scripts/wait.sh @@ -0,0 +1,24 @@ +set -e +NODE_NAME="{{inputs.parameters.node_name}}" +echo "Waiting for {{inputs.parameters.node_condition}} condition to be False on node $NODE_NAME for 2 consecutive minutes (timeout: 15 minutes)" +STABLE_COUNT=0 +TOTAL_WAIT=0 +while [ "$TOTAL_WAIT" -lt 15 ]; do + STATUS=$(kubectl get node "$NODE_NAME" -o jsonpath="{.status.conditions[?(@.type=='{{inputs.parameters.node_condition}}')].status}") + echo "[$(date)] {{inputs.parameters.node_condition}} status: $STATUS" + if [ "$STATUS" = "False" ]; then + STABLE_COUNT=$((STABLE_COUNT + 1)) + echo "Condition is stable (False) for $STABLE_COUNT minute(s)" + if [ "$STABLE_COUNT" -ge 2 ]; then + echo "Condition has been False for 2 consecutive checks (~2 minutes). 
Proceeding..." + exit 0 + fi + else + STABLE_COUNT=0 + echo "Condition is not stable (status: $STATUS)." + fi + sleep 60 + TOTAL_WAIT=$((TOTAL_WAIT + 1)) +done +echo "{{inputs.parameters.node_condition}} did not remain False for 2 consecutive minutes within 15 minutes. Exiting with failure." +exit 1 \ No newline at end of file diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go index 15d0fa766..87c888e69 100644 --- a/internal/controllers/remediation_handler.go +++ b/internal/controllers/remediation_handler.go @@ -36,7 +36,10 @@ import ( "context" "errors" "fmt" + "os" + "path/filepath" "strings" + "sync" "time" "gopkg.in/yaml.v3" @@ -64,16 +67,37 @@ const ( AmdGpuRemediationSucceeded = "amd-gpu-remediation-succeeded" AmdGpuRemediationFailed = "amd-gpu-remediation-failed" DefaultUtilityImage = "docker.io/rocm/gpu-operator-utils:latest" + // DefaultRecoveryPolicyWindowSize - defines the time window size for recovery policy + DefaultRecoveryPolicyWindowSize = "15m" + // DefaultRecoveryPolicyMaxRunsPerWindow - defines the max allowed runs per window for recovery policy + // If a specific node condition is hit more than this number of times within the window size, no new remediation workflows will be scheduled + DefaultRecoveryPolicyMaxRunsPerWindow = 3 + DefaultTimeFormatLayout = "2006-01-02 15:04:05 UTC" + DefaultStatusCRCleanupWindowSize = "72h" + // Below is the label and value needed to be added to node to force resume a suspended workflow + ForceResumeWorkflowLabelKey = "operator.amd.com/gpu-force-resume-workflow" + ForceResumeWorkflowLabelValue = "true" + // Below is the label and value needed to be added to node to abort an ongoing workflow + AbortWorkflowLabelKey = "operator.amd.com/gpu-abort-workflow" + AbortWorkflowLabelValue = "true" + RemediationFilesPath = "/remediation" ) +type RecoveryPolicyConfig struct { + MaxAllowedRunsPerWindow int `json:"maxAllowedRunsPerWindow" yaml:"maxAllowedRunsPerWindow"` + 
WindowSize string `json:"windowSize" yaml:"windowSize"` +} + // ConditionWorkflowMapping defines a single condition-to-workflow mapping. // This is used when parsing the ConfigMap specified in the DeviceConfig. type ConditionWorkflowMapping struct { - NodeCondition string `json:"nodeCondition" yaml:"nodeCondition"` - WorkflowTemplate string `json:"workflowTemplate" yaml:"workflowTemplate"` - ValidationTests ValidationTestsProfile `json:"validationTestsProfile" yaml:"validationTestsProfile"` - PhysicalActionNeeded string `json:"physicalActionNeeded" yaml:"physicalActionNeeded"` - NotifyMessage string `json:"notifyMessage" yaml:"notifyMessage"` + NodeCondition string `json:"nodeCondition" yaml:"nodeCondition"` + WorkflowTemplate string `json:"workflowTemplate" yaml:"workflowTemplate"` + ValidationTests ValidationTestsProfile `json:"validationTestsProfile" yaml:"validationTestsProfile"` + PhysicalActionNeeded bool `json:"physicalActionNeeded" yaml:"physicalActionNeeded"` + NotifyRemediationMessage string `json:"notifyRemediationMessage" yaml:"notifyRemediationMessage"` + NotifyTestFailureMessage string `json:"notifyTestFailureMessage" yaml:"notifyTestFailureMessage"` + RecoveryPolicy RecoveryPolicyConfig `json:"recoveryPolicy" yaml:"recoveryPolicy"` } type ValidationTestsProfile struct { @@ -113,7 +137,6 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1 // Don't handle remediation if disabled remediationDisabled, err := n.helper.isRemediationDisabled(ctx, devConfig) - if err != nil { return res, err } @@ -127,6 +150,21 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1 return res, err } + // Clear any older recovery attempts from the status CR + if err := n.helper.dropOlderRecoveryAttemptsFromStatusCR(ctx, devConfig.Namespace); err != nil { + logger.Error(err, "Failed to drop older recovery attempts from status CR") + return res, err + } + + // If statusSynced is false, we need to populate the internal 
map from the status CR + if !n.helper.isStatusSynced(ctx) { + if err := n.helper.syncInternalMapFromStatusCR(ctx, devConfig.Namespace); err != nil { + logger.Error(err, "Failed to sync internal map from status CR") + return res, err + } + logger.Info("Internal map synced from status CR successfully") + } + var mappingsList []ConditionWorkflowMapping if err = yaml.Unmarshal([]byte(configMap.Data["workflow"]), &mappingsList); err != nil { return res, fmt.Errorf("failed to parse workflows from ConfigMap: %w", err) @@ -145,13 +183,12 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1 logger.Info(fmt.Sprintf("Node conditions validations for node %s failed with error: %v", node.Name, err)) continue } - canSchedule := n.helper.isWorkflowSchedulableOnNode(ctx, devConfig, &node, mapping) - if !canSchedule { + createNewWorkflow := n.helper.handleExistingWorkflowsOnNode(ctx, devConfig, &node, mapping) + if !createNewWorkflow { continue } - - createNewWorkflow := n.helper.handleExistingWorkflowsOnNode(ctx, devConfig, &node) - if !createNewWorkflow { + canSchedule := n.helper.isWorkflowSchedulableOnNode(ctx, devConfig, &node, mapping) + if !canSchedule { continue } logger.Info(fmt.Sprintf("GPU Condition: %s observed and node: %s is unhealthy. 
Starting Remediation Workflow: %s", mapping.NodeCondition, node.Name, mapping.WorkflowTemplate)) @@ -173,8 +210,20 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1 errs = errors.Join(errs, err) continue } - logger.Info(fmt.Sprintf("Remediation Workflow for the condition is created successfully on node %s using template %s", node.Name, mapping.WorkflowTemplate)) + + // Drop older recovery attempts from internal map based on the window size + windowSize := n.helper.getWindowSize(&mapping.RecoveryPolicy) + if err := n.helper.dropOlderRecoveryAttemptsInternal(node.Name, mapping.NodeCondition, windowSize); err != nil { + logger.Error(err, fmt.Sprintf("Failed to drop older recovery attempts for node %s and condition %s", node.Name, mapping.NodeCondition)) + return res, err + } + + // Register the recovery attempt in internal map + if err := n.helper.registerRecoveryAttempt(ctx, node.Name, mapping.NodeCondition, devConfig.Namespace, wf.Name); err != nil { + logger.Error(err, fmt.Sprintf("Failed to register recovery attempt for node %s", node.Name)) + return res, err + } } logger.Info("Requeue for any node conditions that may be present") return res, errs @@ -213,6 +262,7 @@ func (n *remediationMgr) HandleDelete(ctx context.Context, deviceConfig *amdv1al //go:generate mockgen -source=remediation_handler.go -package=controllers -destination=mock_remediation_handler.go remediationMgrHelperAPI type remediationMgrHelperAPI interface { + getServiceAccountName(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) string isRemediationDisabled(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) (bool, error) resumeSuspendedWorkflow(ctx context.Context, wfName, namespace string) error isDriverUpgradeInProgress(devCfg *amdv1alpha1.DeviceConfig, node *v1.Node) bool @@ -229,21 +279,65 @@ type remediationMgrHelperAPI interface { deleteWorkflow(ctx context.Context, workflow *workflowv1alpha1.Workflow) error validateNodeConditions(ctx 
context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node, mappings map[string]ConditionWorkflowMapping) (ConditionWorkflowMapping, error) isWorkflowSchedulableOnNode(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping) bool - handleExistingWorkflowsOnNode(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node) bool + handleExistingWorkflowsOnNode(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping) bool getWorkflowUtilityImage(devConfig *amdv1alpha1.DeviceConfig) v1.Container + createRemediationWorkflowStatus(ctx context.Context, namespace string) (*amdv1alpha1.RemediationWorkflowStatus, error) + getRemediationWorkflowStatus(ctx context.Context, namespace string) (*amdv1alpha1.RemediationWorkflowStatus, error) + getRecentRecoveryCount(nodeName string, nodeCondition string) int + dropOlderRecoveryAttemptsFromStatusCR(ctx context.Context, namespace string) error + dropOlderRecoveryAttemptsInternal(nodeName string, nodeCondition string, windowSize string) error + registerRecoveryAttempt(ctx context.Context, nodeName string, nodeCondition string, namespace string, wfName string) error + registerRecoveryAttemptInternal(nodeName string, nodeCondition string, namespace string, startTime time.Time) error + registerRecoveryAttemptToStatusCR(ctx context.Context, nodeName string, nodeCondition string, namespace string, wfName string, startTime time.Time) error + getRecoveryTrackerKey(nodeName string, nodeCondition string) (string, error) + getMaxAllowedRunsPerWindow(recoveryPolicy *RecoveryPolicyConfig) int + getWindowSize(recoveryPolicy *RecoveryPolicyConfig) string + isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool + canResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping *ConditionWorkflowMapping) bool + syncInternalMapFromStatusCR(ctx context.Context, namespace string) 
error + isStatusSynced(ctx context.Context) bool + isNodeLabelledForForceResume(ctx context.Context, node *v1.Node) bool + removeForceResumeWorkflowLabelFromNode(ctx context.Context, node *v1.Node) error + isNodeLabelledForAbortWorkflow(node *v1.Node) bool + removeAbortWorkflowLabelFromNode(ctx context.Context, node *v1.Node) error + abortWorkflow(ctx context.Context, workflow *workflowv1alpha1.Workflow) error + attemptAbortWorkflowOnNode(ctx context.Context, node *v1.Node, wf *workflowv1alpha1.Workflow) (bool, error) + attemptResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping ConditionWorkflowMapping, wf *workflowv1alpha1.Workflow) + handleSuspendedWorkflowsOnNode(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping, wf *workflowv1alpha1.Workflow) bool + getWorkflowTaskScriptSource(scriptFileName string) (string, error) } type remediationMgrHelper struct { - client client.Client - k8sInterface kubernetes.Interface + client client.Client + k8sInterface kubernetes.Interface + recoveryTracker *sync.Map + statusSynced bool + serviceAccountName string } // Initialize remediation manager helper interface func newRemediationMgrHelperHandler(client client.Client, k8sInterface kubernetes.Interface) remediationMgrHelperAPI { return &remediationMgrHelper{ - client: client, - k8sInterface: k8sInterface, + client: client, + k8sInterface: k8sInterface, + recoveryTracker: new(sync.Map), + statusSynced: false, + } +} + +func (h *remediationMgrHelper) getServiceAccountName(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) string { + if h.serviceAccountName == "" { + sas := v1.ServiceAccountList{} + if err := h.client.List(ctx, &sas, client.InNamespace(devConfig.Namespace)); err == nil { + for _, sa := range sas.Items { + if strings.HasSuffix(sa.Name, "controller-manager") { + h.serviceAccountName = sa.Name + break + } + } + } } + return h.serviceAccountName } func (h *remediationMgrHelper) 
isRemediationDisabled(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) (bool, error) { @@ -351,22 +445,19 @@ func (h *remediationMgrHelper) getConfigMap(ctx context.Context, configmapName s Name: configmapName, Namespace: namespace, }, cm) - if err != nil { - return nil, err - } - return cm, nil + return cm, err } func (h *remediationMgrHelper) createDefaultConfigMap(ctx context.Context, name string, namespace string) (*v1.ConfigMap, error) { + logger := log.FromContext(ctx) + + yamlBytes, err := os.ReadFile(filepath.Join(RemediationFilesPath, "configs/default-configmap.yaml")) + if err != nil { + logger.Error(err, "Failed to read default remediation workflows file") + return nil, err + } - workflowYaml := `- nodeCondition: "AMDGPUUnhealthy" - workflowTemplate: "default-template" - validationTestsProfile: - framework: "AGFHC" - recipe: "all_lvl4" - iterations: 1 - stopOnFailure: true - timeoutSeconds: 4800` + workflowYaml := string(yamlBytes) defaultCfgMap := &v1.ConfigMap{ TypeMeta: metav1.TypeMeta{ @@ -382,8 +473,9 @@ func (h *remediationMgrHelper) createDefaultConfigMap(ctx context.Context, name }, } - err := h.client.Create(ctx, defaultCfgMap) + err = h.client.Create(ctx, defaultCfgMap) if err != nil { + logger.Error(err, "Failed to create default remediation configmap") return nil, err } return defaultCfgMap, nil @@ -397,6 +489,15 @@ func (h *remediationMgrHelper) deleteConfigMap(ctx context.Context, name, namesp return h.client.Delete(ctx, cm) } +func (h *remediationMgrHelper) getWorkflowTaskScriptSource(scriptFileName string) (string, error) { + scriptPath := filepath.Join(RemediationFilesPath, "scripts", scriptFileName) + yamlBytes, err := os.ReadFile(scriptPath) + if err != nil { + return "", fmt.Errorf("failed to read script file %q: %w", scriptFileName, err) + } + return string(yamlBytes), nil +} + func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) 
(*workflowv1alpha1.WorkflowTemplate, error) { utilityContainer := h.getWorkflowUtilityImage(devConfig) @@ -406,6 +507,11 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context rebootContainer.Command = []string{"/nsenter", "--all", "--target=1", "--", "/sbin/reboot", "-f"} rebootContainer.SecurityContext = &v1.SecurityContext{Privileged: ptr.To(true)} + notifySrc, err := h.getWorkflowTaskScriptSource("notify.sh") + if err != nil { + return nil, err + } + notifyTemplate := &workflowv1alpha1.WorkflowTemplate{ ObjectMeta: metav1.ObjectMeta{ Name: "event-notify-template", @@ -430,36 +536,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context }, }, Script: &workflowv1alpha1.ScriptTemplate{ - Source: ` -set -e -NODE_NAME="{{inputs.parameters.nodeName}}" -NOTIFY_MESSAGE="{{inputs.parameters.notifyMessage}}" -EVENT_NAME="{{inputs.parameters.eventName}}" - -kubectl create -f - < 0 - ) or ( - [.spec.containers[]? | select(.resources.requests["amd.com/gpu"] != null)] - | length > 0 - ) - ) | - "\(.metadata.namespace) \(.metadata.name)" -') -if [ -z "$PODS" ]; then - echo "No pods with /dev/dri mounts found on node $NODE_NAME." 
-else - echo "Evicting pods:" - echo "$PODS" - echo "$PODS" | while read -r ns name; do - echo "Deleting pod $name in namespace $ns" - kubectl delete pod "$name" -n "$ns" --grace-period=0 --force || true - done -fi -`, + Source: drainSrc, Container: utilityContainer, }, }, @@ -664,148 +726,7 @@ containers: }, }, Script: &workflowv1alpha1.ScriptTemplate{ - Source: ` -set -e -NODE_NAME="{{inputs.parameters.node_name}}" -JOB_NAME="test-runner-manual-trigger-${NODE_NAME}" -CM_NAME="manual-config-map-${NODE_NAME}" -FRAMEWORK="{{inputs.parameters.framework}}" -RECIPE="{{inputs.parameters.recipe}}" -ITERATIONS="{{inputs.parameters.iterations}}" -STOPONFAILURE="{{inputs.parameters.stopOnFailure}}" -TIMEOUTSECONDS="{{inputs.parameters.timeoutSeconds}}" -TESTRUNNERIMAGE="{{inputs.parameters.testRunnerImage}}" -TESTRUNNERSA="{{inputs.parameters.testRunnerServiceAccount}}" -NAMESPACE="{{inputs.parameters.namespace}}" - -if [ -z "$FRAMEWORK" ] || [ -z "$RECIPE" ] || [ -z "$ITERATIONS" ] || [ -z "$STOPONFAILURE" ] || [ -z "$TIMEOUTSECONDS" ]; then - echo "Validation profile incomplete, skipping configmap and job creation. Please enter framework, recipe, iterations, stopOnFailure, timeoutSeconds as per testrunner requirements" - exit 0 -fi - -echo "Creating test runner Job $JOB_NAME and ConfigMap $CM_NAME..." - -cat </dev/null || true) - if [ "$job_status" = "Complete" ]; then - echo "Test runner job completed successfully." - kubectl logs -n $NAMESPACE job/$JOB_NAME - echo "Detailed run report can be found at /var/log/amd-test-runner" - exit 0 - elif [ "$job_status" = "Failed" ]; then - echo "Test runner job failed." - kubectl logs -n $NAMESPACE job/$JOB_NAME - echo "Detailed run report can be found at /var/log/amd-test-runner" - exit 1 - else - echo "Test runner job is still running. Waiting..." 
- sleep 60 - fi -done -`, + Source: testSrc, Container: utilityContainer, }, }, @@ -824,32 +745,7 @@ done }, }, Script: &workflowv1alpha1.ScriptTemplate{ - Source: ` -set -e -NODE_NAME="{{inputs.parameters.node_name}}" -echo "Waiting for {{inputs.parameters.node_condition}} condition to be False on node $NODE_NAME for 2 consecutive minutes (timeout: 15 minutes)" -STABLE_COUNT=0 -TOTAL_WAIT=0 -while [ "$TOTAL_WAIT" -lt 15 ]; do - STATUS=$(kubectl get node "$NODE_NAME" -o jsonpath="{.status.conditions[?(@.type=='{{inputs.parameters.node_condition}}')].status}") - echo "[$(date)] {{inputs.parameters.node_condition}} status: $STATUS" - if [ "$STATUS" = "False" ]; then - STABLE_COUNT=$((STABLE_COUNT + 1)) - echo "Condition is stable (False) for $STABLE_COUNT minute(s)" - if [ "$STABLE_COUNT" -ge 2 ]; then - echo "Condition has been False for 2 consecutive checks (~2 minutes). Proceeding..." - exit 0 - fi - else - STABLE_COUNT=0 - echo "Condition is not stable (status: $STATUS)." - fi - sleep 60 - TOTAL_WAIT=$((TOTAL_WAIT + 1)) -done -echo "{{inputs.parameters.node_condition}} did not remain False for 2 consecutive minutes within 15 minutes. Exiting with failure." 
-exit 1 -`, + Source: waitSrc, Container: utilityContainer, }, }, @@ -864,22 +760,14 @@ exit 1 }, }, Script: &workflowv1alpha1.ScriptTemplate{ - Source: ` -set -e -NODE_NAME="{{inputs.parameters.node_name}}" -echo "Untainting node $NODE_NAME" -kubectl taint node "$NODE_NAME" amd-gpu-unhealthy:NoSchedule- -`, + Source: untaintSrc, Container: utilityContainer, }, }, { Name: "failWorkflow", Script: &workflowv1alpha1.ScriptTemplate{ - Source: ` -echo "Failing workflow" -exit 1 -`, + Source: `echo "Failing workflow" && exit 1`, Container: utilityContainer, }, }, @@ -903,7 +791,6 @@ func (h *remediationMgrHelper) createDefaultObjects(ctx context.Context, devConf } else { cfgMapName = devConfig.Name + "-" + DefaultConfigMapSuffix } - // Create default configmap if required cm, err := h.getConfigMap(ctx, cfgMapName, devConfig.Namespace) if err != nil { @@ -931,6 +818,17 @@ func (h *remediationMgrHelper) createDefaultObjects(ctx context.Context, devConf logger.Info("Created default workflow template successfully") } + // Create Default RemediationWorkflowStatus if required + _, err = h.getRemediationWorkflowStatus(ctx, devConfig.Namespace) + if err != nil { + logger.Error(err, fmt.Sprintf("Failed to fetch RemediationWorkflowStatus %s", "default")) + if _, err = h.createRemediationWorkflowStatus(ctx, devConfig.Namespace); err != nil { + logger.Error(err, "Failed to create default remediation workflow status") + return nil, err + } + logger.Info("Created default remediation workflow status successfully") + } + return cm, nil } @@ -944,7 +842,7 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate } wf.Spec.Entrypoint = wfTemplate.Spec.Entrypoint - wf.Spec.ServiceAccountName = "amd-gpu-operator-gpu-operator-charts-controller-manager" + wf.Spec.ServiceAccountName = h.getServiceAccountName(ctx, devConfig) ttlHours := devConfig.Spec.RemediationWorkflow.TtlForFailedWorkflows ttlSeconds := int32(ttlHours * 3600) wf.Spec.TTLStrategy = 
&workflowv1alpha1.TTLStrategy{ @@ -1020,11 +918,11 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate }, { Name: "notifyMessage", - Value: workflowv1alpha1.AnyStringPtr(mapping.NotifyMessage), + Value: workflowv1alpha1.AnyStringPtr(mapping.NotifyRemediationMessage), }, { Name: "notifyErrorMessage", - Value: workflowv1alpha1.AnyStringPtr(fmt.Sprintf("Remediation for node condition %s failed on node %s", mapping.NodeCondition, nodeName)), + Value: workflowv1alpha1.AnyStringPtr(fmt.Sprintf("Remediation for node condition %s failed on node %s. %s", mapping.NodeCondition, nodeName, mapping.NotifyTestFailureMessage)), }, { Name: "notifySuccessMessage", @@ -1110,7 +1008,7 @@ func (h *remediationMgrHelper) isWorkflowSchedulableOnNode(ctx context.Context, return true } -func (h *remediationMgrHelper) handleExistingWorkflowsOnNode(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node) bool { +func (h *remediationMgrHelper) handleExistingWorkflowsOnNode(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping) bool { logger := log.FromContext(ctx) wfList, err := h.getWorkflowList(ctx, devConfig.Namespace) if err != nil { @@ -1120,35 +1018,87 @@ func (h *remediationMgrHelper) handleExistingWorkflowsOnNode(ctx context.Context // If a workflow is already running on that node, then skip the node but resume/delete workflow if needed for _, wf := range wfList.Items { - if strings.HasPrefix(wf.Name, fmt.Sprintf("%s-", node.Name)) { - if wf.Status.Phase == workflowv1alpha1.WorkflowSucceeded { - if err := h.deleteWorkflow(ctx, &wf); err != nil { - logger.Error(err, fmt.Sprintf("Failed to delete workflow %s on node %v", wf.Name, node.Name)) - return false - } - logger.Info(fmt.Sprintf("Deleted completed workflow %s on node %v", wf.Name, node.Name)) - } else if wf.Status.Phase == workflowv1alpha1.WorkflowRunning { - stages := wf.Status.Nodes - for _, wfStage := range stages { - if 
wfStage.Type == "Suspend" && wfStage.Phase == "Running" { - logger.Info(fmt.Sprintf("Found suspended workflow %s on node %s. Attempting resume.", wf.Name, node.Name)) - if err := h.resumeSuspendedWorkflow(ctx, wf.Name, wf.Namespace); err != nil { - logger.Error(err, fmt.Sprintf("Failed to resume workflow %s on node %s", wf.Name, node.Name)) - } else { - logger.Info(fmt.Sprintf("successfully resumed workflow %s on node %v", wf.Name, node.Name)) - } - return false - } - } - logger.Info(fmt.Sprintf("Workflow: %s already running on the node: %s, skipping creation of workflow", wf.Name, node.Name)) + if !strings.HasPrefix(wf.Name, fmt.Sprintf("%s-", node.Name)) { + continue + } + if wf.Status.Phase == workflowv1alpha1.WorkflowSucceeded { + if err := h.deleteWorkflow(ctx, &wf); err != nil { + logger.Error(err, fmt.Sprintf("Failed to delete workflow %s on node %v", wf.Name, node.Name)) return false } - break + logger.Info(fmt.Sprintf("Deleted completed workflow %s on node %v", wf.Name, node.Name)) + } else if wf.Status.Phase == workflowv1alpha1.WorkflowRunning { + return h.handleSuspendedWorkflowsOnNode(ctx, devConfig, node, mapping, &wf) } } return true } +func (h *remediationMgrHelper) handleSuspendedWorkflowsOnNode(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping, wf *workflowv1alpha1.Workflow) bool { + logger := log.FromContext(ctx) + if wf.Status.Phase != workflowv1alpha1.WorkflowRunning { + return false + } + stages := wf.Status.Nodes + for _, wfStage := range stages { + if wfStage.Type == "Suspend" && wfStage.Phase == "Running" { + logger.Info(fmt.Sprintf("Suspended workflow %s found on node %s", wf.Name, node.Name)) + // Check if the workflow can be aborted, and attempt abort + // If aborted, return true so that new workflow can be created + canAbort, err := h.attemptAbortWorkflowOnNode(ctx, node, wf) + if canAbort && err == nil { + return true + } + + // Check if the workflow can be resumed, and attempt 
resume + h.attemptResumeWorkflowOnNode(ctx, node, mapping, wf) + // irrespective of whether it was resumed or not, return false to avoid creating a new workflow + return false + } + } + return false +} + +func (h *remediationMgrHelper) attemptAbortWorkflowOnNode(ctx context.Context, node *v1.Node, wf *workflowv1alpha1.Workflow) (bool, error) { + logger := log.FromContext(ctx) + canAbort := h.isNodeLabelledForAbortWorkflow(node) + if canAbort { + logger.Info(fmt.Sprintf("Found abort label on node %s. Attempting abort workflow %s", node.Name, wf.Name)) + if err := h.abortWorkflow(ctx, wf); err != nil { + logger.Error(err, fmt.Sprintf("Failed to abort workflow %s on node %s", wf.Name, node.Name)) + return true, fmt.Errorf("Failed to abort workflow %s on node %s", wf.Name, node.Name) + } + if err := h.removeAbortWorkflowLabelFromNode(ctx, node); err != nil { + return true, err + } + logger.Info(fmt.Sprintf("Aborted and deleted workflow %s on node %s.", wf.Name, node.Name)) + return true, nil + } + return canAbort, nil +} + +func (h *remediationMgrHelper) attemptResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping ConditionWorkflowMapping, wf *workflowv1alpha1.Workflow) { + logger := log.FromContext(ctx) + // Check if the workflow can be resumed + canResume := h.canResumeWorkflowOnNode(ctx, node, &mapping) + if canResume { + logger.Info(fmt.Sprintf("Attempting to resume suspended workflow %q on node %q.", wf.Name, node.Name)) + if err := h.resumeSuspendedWorkflow(ctx, wf.Name, wf.Namespace); err != nil { + logger.Error(err, fmt.Sprintf("Failed to resume workflow %s", wf.Name)) + return + } + resume := h.isNodeLabelledForForceResume(ctx, node) + if resume { + // Remove the label after allowing resumption + if err := h.removeForceResumeWorkflowLabelFromNode(ctx, node); err != nil { + logger.Error(err, fmt.Sprintf("Failed to remove force resume label from node %s", node.Name)) + return + } + } + logger.Info(fmt.Sprintf("Resumed suspended workflow %q on node 
%q.", wf.Name, node.Name)) + } +} + func (h *remediationMgrHelper) getWorkflowUtilityImage(devConfig *amdv1alpha1.DeviceConfig) v1.Container { output := v1.Container{} if devConfig.Spec.CommonConfig.UtilsContainer.Image != "" { @@ -1162,3 +1112,344 @@ func (h *remediationMgrHelper) getWorkflowUtilityImage(devConfig *amdv1alpha1.De return output } + +func (h *remediationMgrHelper) getRecentRecoveryCount(nodeName string, nodeCondition string) int { + // get the length of the slice of attempts for the given node and condition + key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition) + if err != nil { + return 0 + } + + attempts, ok := h.recoveryTracker.Load(key) + if !ok { + return 0 + } + if attemptsSlice, ok := attempts.([]time.Time); ok { + // Return the length of the slice as the count of recent recovery attempts + return len(attemptsSlice) + } + return 0 +} + +func (h *remediationMgrHelper) dropOlderRecoveryAttemptsInternal(nodeName string, nodeCondition string, windowSize string) error { + key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition) + if err != nil { + return fmt.Errorf("failed to get recovery tracker key: %w", err) + } + + attempts, _ := h.recoveryTracker.LoadOrStore(key, []time.Time{}) + if attemptsSlice, ok := attempts.([]time.Time); ok { + windowSizeDuration, err := time.ParseDuration(windowSize) + if err != nil { + return fmt.Errorf("failed to parse window size %s: %w", windowSize, err) + } + + // Filter out attempts older than the window size + cutoffTime := time.Now().UTC().Add(-time.Duration(windowSizeDuration)) + startIndex := len(attemptsSlice) + for i, attempt := range attemptsSlice { + if attempt.After(cutoffTime) { + startIndex = i + break + } + } + filtered := attemptsSlice[startIndex:] + h.recoveryTracker.Store(key, filtered) + } else { + return fmt.Errorf("failed to cast recovery tracker value to []time.Time") + } + + return nil +} + +func (h *remediationMgrHelper) dropOlderRecoveryAttemptsFromStatusCR(ctx context.Context, 
namespace string) error { + windowSize := DefaultStatusCRCleanupWindowSize + wfStatus, err := h.getRemediationWorkflowStatus(ctx, namespace) + if err != nil { + return fmt.Errorf("failed to get remediation workflow status: %w", err) + } + + if wfStatus.Status == nil { + return nil // Nothing to drop + } + + wfStatusCopy := wfStatus.DeepCopy() + windowSizeDuration, err := time.ParseDuration(windowSize) + if err != nil { + return fmt.Errorf("failed to parse window size %s: %w", windowSize, err) + } + + cutoffTime := time.Now().UTC().Add(-time.Duration(windowSizeDuration)) + + for nodeName, conditions := range wfStatus.Status { + for nodeCondition, attempts := range conditions { + filtered := []amdv1alpha1.WorkflowMetadata{} + for _, attempt := range attempts { + attemptTime, err := time.Parse(DefaultTimeFormatLayout, attempt.StartTime) + if err != nil { + return fmt.Errorf("failed to parse attempt start time %s: %w", attempt.StartTime, err) + } + if attemptTime.After(cutoffTime) { + filtered = append(filtered, attempt) + } + } + // Update the status for the node and condition with the filtered attempts + if len(filtered) > 0 { + wfStatus.Status[nodeName][nodeCondition] = filtered + } else { + // If no attempts are left, remove the condition from the node + delete(wfStatus.Status[nodeName], nodeCondition) + } + } + // If no conditions are left for the node, remove the node from the status + if len(wfStatus.Status[nodeName]) == 0 { + delete(wfStatus.Status, nodeName) + } + } + + if err := h.client.Status().Patch(ctx, wfStatus, client.MergeFrom(wfStatusCopy)); err != nil { + return fmt.Errorf("failed to patch remediation workflow status: %w", err) + } + + return nil +} + +func (h *remediationMgrHelper) registerRecoveryAttempt(ctx context.Context, nodeName string, nodeCondition string, namespace string, wfName string) error { + startTime := time.Now().UTC() + + // Register the recovery attempt in internal map + if err := h.registerRecoveryAttemptInternal(nodeName, 
nodeCondition, namespace, startTime); err != nil { + return fmt.Errorf("failed to register recovery attempt: %w", err) + } + + // Register the recovery attempt in the status CR + if err := h.registerRecoveryAttemptToStatusCR(ctx, nodeName, nodeCondition, namespace, wfName, startTime); err != nil { + return fmt.Errorf("failed to register recovery attempt in status CR: %w", err) + } + + return nil +} + +func (h *remediationMgrHelper) registerRecoveryAttemptToStatusCR(ctx context.Context, nodeName string, nodeCondition string, namespace string, wfName string, startTime time.Time) error { + wfStatus, err := h.getRemediationWorkflowStatus(ctx, namespace) + if err != nil { + return fmt.Errorf("failed to get remediation workflow status: %w", err) + } + + wfStatusCopy := wfStatus.DeepCopy() + + if wfStatus.Status == nil { + wfStatus.Status = make(map[string]map[string][]amdv1alpha1.WorkflowMetadata) + } + if wfStatus.Status[nodeName] == nil { + wfStatus.Status[nodeName] = make(map[string][]amdv1alpha1.WorkflowMetadata) + } + if wfStatus.Status[nodeName][nodeCondition] == nil { + wfStatus.Status[nodeName][nodeCondition] = []amdv1alpha1.WorkflowMetadata{} + } + + // Create a new WorkflowMetadata entry + metadata := amdv1alpha1.WorkflowMetadata{ + Name: wfName, + StartTime: startTime.Format(DefaultTimeFormatLayout), + } + + // Append the new metadata entry to the status + wfStatus.Status[nodeName][nodeCondition] = append(wfStatus.Status[nodeName][nodeCondition], metadata) + + // Patch the wfStatus with the new entry + if err := h.client.Status().Patch(ctx, wfStatus, client.MergeFrom(wfStatusCopy)); err != nil { + return fmt.Errorf("failed to patch remediation workflow status: %w", err) + } + return nil +} + +func (h *remediationMgrHelper) registerRecoveryAttemptInternal(nodeName string, nodeCondition string, namespace string, startTime time.Time) error { + key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition) + if err != nil { + return fmt.Errorf("failed to get 
recovery tracker key: %w", err) + } + + attempts, _ := h.recoveryTracker.LoadOrStore(key, []time.Time{}) + if attemptsSlice, ok := attempts.([]time.Time); ok { + attemptsSlice = append(attemptsSlice, startTime) + h.recoveryTracker.Store(key, attemptsSlice) + } else { + return fmt.Errorf("failed to cast recovery tracker value to []time.Time") + } + + return nil +} + +func (h *remediationMgrHelper) getRecoveryTrackerKey(nodeName string, nodeCondition string) (string, error) { + key := fmt.Sprintf("%s-%s", nodeName, nodeCondition) + return key, nil +} + +func (h *remediationMgrHelper) getMaxAllowedRunsPerWindow(recoveryPolicy *RecoveryPolicyConfig) int { + if recoveryPolicy == nil || recoveryPolicy.MaxAllowedRunsPerWindow == 0 { + return DefaultRecoveryPolicyMaxRunsPerWindow + } + return recoveryPolicy.MaxAllowedRunsPerWindow +} + +func (h *remediationMgrHelper) getWindowSize(recoveryPolicy *RecoveryPolicyConfig) string { + if recoveryPolicy == nil || recoveryPolicy.WindowSize == "" { + return DefaultRecoveryPolicyWindowSize + } + return recoveryPolicy.WindowSize +} + +func (h *remediationMgrHelper) createRemediationWorkflowStatus(ctx context.Context, namespace string) (*amdv1alpha1.RemediationWorkflowStatus, error) { + wfstatus := &amdv1alpha1.RemediationWorkflowStatus{ + ObjectMeta: metav1.ObjectMeta{ + Name: "default", + Namespace: namespace, + }, + Status: make(map[string]map[string][]amdv1alpha1.WorkflowMetadata), + } + + if err := h.client.Create(ctx, wfstatus); err != nil { + return nil, fmt.Errorf("failed to create remediation workflow status: %w", err) + } + return wfstatus, nil +} + +func (h *remediationMgrHelper) getRemediationWorkflowStatus(ctx context.Context, namespace string) (*amdv1alpha1.RemediationWorkflowStatus, error) { + wfstatus := &amdv1alpha1.RemediationWorkflowStatus{} + err := h.client.Get(ctx, client.ObjectKey{Name: "default", Namespace: namespace}, wfstatus) + if err != nil { + return nil, fmt.Errorf("failed to get remediation workflow 
status: %w", err) + } + return wfstatus, nil +} + +func (h *remediationMgrHelper) syncInternalMapFromStatusCR(ctx context.Context, namespace string) error { + wfStatus, err := h.getRemediationWorkflowStatus(ctx, namespace) + if err != nil { + return fmt.Errorf("failed to get remediation workflow status: %w", err) + } + + if wfStatus.Status == nil { + h.statusSynced = true + return nil // Nothing to sync + } + + for nodeName, conditions := range wfStatus.Status { + for nodeCondition, attempts := range conditions { + key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition) + if err != nil { + return fmt.Errorf("failed to get recovery tracker key: %w", err) + } + + attemptTimes := make([]time.Time, len(attempts)) + for i, attempt := range attempts { + attemptTime, err := time.Parse(DefaultTimeFormatLayout, attempt.StartTime) + if err != nil { + return fmt.Errorf("failed to parse attempt start time %s: %w", attempt.StartTime, err) + } + attemptTimes[i] = attemptTime + } + h.recoveryTracker.Store(key, attemptTimes) + } + } + + h.statusSynced = true + return nil +} + +func (h *remediationMgrHelper) isStatusSynced(ctx context.Context) bool { + return h.statusSynced +} + +func (h *remediationMgrHelper) isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool { + logger := log.FromContext(ctx) + + maxAllowedRuns := h.getMaxAllowedRunsPerWindow(&mapping.RecoveryPolicy) + recentRecoveryCount := h.getRecentRecoveryCount(nodeName, mapping.NodeCondition) + + logger.Info(fmt.Sprintf("Recent recovery count for node %s and condition %s: %d", nodeName, mapping.NodeCondition, recentRecoveryCount)) + logger.Info(fmt.Sprintf("Max allowed runs per window for node %s and condition %s: %d", nodeName, mapping.NodeCondition, maxAllowedRuns)) + return recentRecoveryCount > maxAllowedRuns +} + +func (h *remediationMgrHelper) isNodeLabelledForForceResume(ctx context.Context, nodeObj *v1.Node) bool { + if labelValue, exists := 
nodeObj.Labels[ForceResumeWorkflowLabelKey]; exists && labelValue == ForceResumeWorkflowLabelValue { + return true + } + return false +} + +func (h *remediationMgrHelper) removeForceResumeWorkflowLabelFromNode(ctx context.Context, nodeObj *v1.Node) error { + logger := log.FromContext(ctx) + + if labelValue, exists := nodeObj.Labels[ForceResumeWorkflowLabelKey]; exists { + if labelValue == ForceResumeWorkflowLabelValue { + original := nodeObj.DeepCopy() + delete(nodeObj.Labels, ForceResumeWorkflowLabelKey) + + if err := h.client.Patch(ctx, nodeObj, client.MergeFrom(original)); err != nil { + logger.Error(err, fmt.Sprintf("Failed to remove label %q from node %s using Patch", ForceResumeWorkflowLabelKey, nodeObj.Name)) + return err + } + logger.Info(fmt.Sprintf("Successfully removed label %q from node %s", ForceResumeWorkflowLabelKey, nodeObj.Name)) + } + } + return nil +} + +func (h *remediationMgrHelper) canResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping *ConditionWorkflowMapping) bool { + logger := log.FromContext(ctx) + + // Check if the recovery policy is violated, if so, do not allow resumption + recoveryPolicyViolated := h.isRecoveryPolicyViolated(ctx, node.Name, mapping) + if recoveryPolicyViolated { + logger.Info(fmt.Sprintf("Recovery policy is violated for node %s with condition %s, not allowing workflow resumption", node.Name, mapping.NodeCondition)) + return false + } + + // if no physical action is needed, allow resumption of workflow + if !mapping.PhysicalActionNeeded { + return true + } + + // in case physical action is needed, check if the node is labelled for force resume + resume := h.isNodeLabelledForForceResume(ctx, node) + if !resume { + logger.Info(fmt.Sprintf("Node %s is not labelled for force resume, not allowing workflow resumption", node.Name)) + } + return resume +} + +func (h *remediationMgrHelper) isNodeLabelledForAbortWorkflow(node *v1.Node) bool { + if labelValue, exists := node.Labels[AbortWorkflowLabelKey]; exists && 
labelValue == AbortWorkflowLabelValue { + return true + } + return false +} + +func (h *remediationMgrHelper) removeAbortWorkflowLabelFromNode(ctx context.Context, nodeObj *v1.Node) error { + logger := log.FromContext(ctx) + if labelValue, exists := nodeObj.Labels[AbortWorkflowLabelKey]; exists && labelValue == AbortWorkflowLabelValue { + original := nodeObj.DeepCopy() + delete(nodeObj.Labels, AbortWorkflowLabelKey) + if err := h.client.Patch(ctx, nodeObj, client.MergeFrom(original)); err != nil { + logger.Error(err, fmt.Sprintf("Failed to remove label %q on node %s", AbortWorkflowLabelKey, nodeObj.Name)) + return err + } + logger.Info(fmt.Sprintf("Successfully removed label %q from node %s", AbortWorkflowLabelKey, nodeObj.Name)) + } + return nil +} + +func (h *remediationMgrHelper) abortWorkflow(ctx context.Context, wf *workflowv1alpha1.Workflow) error { + logger := log.FromContext(ctx) + + // Delete the workflow + if err := h.client.Delete(ctx, wf); err != nil { + return fmt.Errorf("failed to delete workflow %s: %w", wf.Name, err) + } + + logger.Info(fmt.Sprintf("Workflow %s aborted successfully", wf.Name)) + return nil +} diff --git a/internal/kmmmodule/dockerfiles/DockerfileTemplate.coreos b/internal/kmmmodule/dockerfiles/DockerfileTemplate.rpm.coreos similarity index 100% rename from internal/kmmmodule/dockerfiles/DockerfileTemplate.coreos rename to internal/kmmmodule/dockerfiles/DockerfileTemplate.rpm.coreos diff --git a/internal/kmmmodule/dockerfiles/DockerfileTemplate.srcimg.coreos b/internal/kmmmodule/dockerfiles/DockerfileTemplate.srcimg.coreos new file mode 100644 index 000000000..5b01c78ba --- /dev/null +++ b/internal/kmmmodule/dockerfiles/DockerfileTemplate.srcimg.coreos @@ -0,0 +1,37 @@ +ARG DTK_AUTO +ARG RHEL_VERSION +ARG SOURCE_IMAGE_REPO=docker.io/rocm/amdgpu-source +ARG DRIVERS_VERSION + +FROM ${SOURCE_IMAGE_REPO}:coreos-${RHEL_VERSION}-${DRIVERS_VERSION} as sources +FROM ${DTK_AUTO} as builder + +ARG KERNEL_VERSION + +COPY --from=sources 
/amdgpu_src/driver /amdgpu_src + +WORKDIR /amdgpu_src +RUN make modules + +RUN mkdir -p /lib/modules/${KERNEL_VERSION}/extra && \ + rm -f /lib/modules/${KERNEL_VERSION}/kernel/drivers/gpu/drm/amd/amdgpu/amdgpu.ko.xz && \ + find /amdgpu_src -type f -name "*.ko" -print -exec cp {} /lib/modules/${KERNEL_VERSION}/extra/ \; ; \ + depmod ${KERNEL_VERSION} && \ + find /lib/modules/${KERNEL_VERSION} -name "*.ko.xz" -exec xz -d {} \; && \ + depmod ${KERNEL_VERSION} + +RUN mkdir -p /modules_files && \ + mkdir -p /amdgpu_ko_files && \ + mkdir -p /kernel_files && \ + cp /lib/modules/${KERNEL_VERSION}/modules.* /modules_files/ && \ + cp -r /lib/modules/${KERNEL_VERSION}/extra/* /amdgpu_ko_files/ && \ + cp -r /lib/modules/${KERNEL_VERSION}/kernel/* /kernel_files/ + +FROM registry.redhat.io/ubi9/ubi-minimal + +ARG KERNEL_VERSION + +COPY --from=builder /amdgpu_ko_files /opt/lib/modules/${KERNEL_VERSION}/extra +COPY --from=builder /kernel_files /opt/lib/modules/${KERNEL_VERSION}/kernel +COPY --from=builder /modules_files /opt/lib/modules/${KERNEL_VERSION}/ +COPY --from=sources /amdgpu_src/firmware /firmwareDir/updates/amdgpu \ No newline at end of file diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go index ed7dd3998..5cdc2865f 100644 --- a/internal/kmmmodule/kmmmodule.go +++ b/internal/kmmmodule/kmmmodule.go @@ -82,13 +82,17 @@ const ( defaultInstallerRepoURL = "https://repo.radeon.com" defaultInitContainerImage = "busybox:1.36" defaultBaseImageRegistry = "docker.io" + defaultSourceImageRepo = "docker.io/rocm/amdgpu-driver" + nfdOSReleaseLabelKey = "feature.node.kubernetes.io/system-os_release.VERSION_ID" ) var ( //go:embed dockerfiles/DockerfileTemplate.ubuntu dockerfileTemplateUbuntu string - //go:embed dockerfiles/DockerfileTemplate.coreos - dockerfileTemplateCoreOS string + //go:embed dockerfiles/DockerfileTemplate.srcimg.coreos + dockerfileTemplateCoreOSFromSrcImage string + //go:embed dockerfiles/DockerfileTemplate.rpm.coreos + 
dockerfileTemplateCoreOSFromRPM string //go:embed devdockerfiles/devdockerfile.txt dockerfileDevTemplateUbuntu string //go:embed dockerfiles/DockerfileTemplate.ubuntu.gim @@ -164,10 +168,16 @@ func (km *kmmModule) SetBuildConfigMapAsDesired(buildCM *v1.ConfigMap, devConfig buildCM.Data = make(map[string]string) } if km.isOpenShift { - buildCM.Data["dockerfile"] = dockerfileTemplateCoreOS switch devConfig.Spec.Driver.DriverType { case utils.DriverTypeVFPassthrough: buildCM.Data["dockerfile"] = dockerfileTemplateGIMCoreOS + case utils.DriverTypeContainer: + fallthrough + default: + buildCM.Data["dockerfile"] = dockerfileTemplateCoreOSFromRPM + if devConfig.Spec.Driver.UseSourceImage != nil && *devConfig.Spec.Driver.UseSourceImage { + buildCM.Data["dockerfile"] = dockerfileTemplateCoreOSFromSrcImage + } } } else { dockerfile, err := resolveDockerfile(buildCM.Name, devConfig) @@ -185,6 +195,40 @@ var driverLabels = map[string]string{ "24.04": "noble", } +func parseRHELHelper(regExp *regexp.Regexp, osImage string) string { + matches := regExp.FindStringSubmatch(osImage) + if len(matches) >= 3 { + return fmt.Sprintf("%s.%s", matches[1], matches[2]) + } + return "" +} + +func parseRHELVersion(labels map[string]string, osImage string) string { + // firstly check if NFD label for RHEL version is present + // if yes, use it directly + if labels != nil { + if rhelVersion, found := labels[nfdOSReleaseLabelKey]; found { + return rhelVersion + } + } + + // if NFD label not found, parse the RHEL version from OS image string + // https://github.com/openshift/release-controller/blob/c4b8d4c3c7674884f2e479c35a8876428aa08de8/pkg/rhcos/rhcos.go#L38-L42 + // OpenShift < 4.19 Legacy format: e.g., 418.94.202410090804-0 → 9.4 + reLegacy := regexp.MustCompile(`\b4\d+\.(\d)(\d+)\.\d+-\d+\b`) + // OpenShift >= 4.19 Modern format: e.g., 9.6.20250121-0 or 10.0.20260101-0 → 9.6 or 10.0 + reModern := regexp.MustCompile(`\b(\d+)\.(\d+)\.\d+-\d+\b`) + + switch { + case 
reLegacy.MatchString(osImage): + return parseRHELHelper(reLegacy, osImage) + case reModern.MatchString(osImage): + return parseRHELHelper(reModern, osImage) + } + + return "" +} + func resolveDockerfile(cmName string, devConfig *amdv1alpha1.DeviceConfig) (string, error) { splits := strings.SplitN(cmName, "-", -1) osDistro := splits[0] @@ -225,10 +269,16 @@ func resolveDockerfile(cmName string, devConfig *amdv1alpha1.DeviceConfig) (stri dockerfileTemplate = strings.Replace(dockerfileTemplate, "$$BASEIMG_REGISTRY/ubuntu:$$VERSION", fmt.Sprintf("%v:$$VERSION", internalUbuntuBaseImage), -1) } case "coreos": - dockerfileTemplate = dockerfileTemplateCoreOS switch devConfig.Spec.Driver.DriverType { case utils.DriverTypeVFPassthrough: dockerfileTemplate = dockerfileTemplateGIMCoreOS + case utils.DriverTypeContainer: + fallthrough + default: + dockerfileTemplate = dockerfileTemplateCoreOSFromRPM + if devConfig.Spec.Driver.UseSourceImage != nil && *devConfig.Spec.Driver.UseSourceImage { + dockerfileTemplate = dockerfileTemplateCoreOSFromSrcImage + } } // FIX ME // add the RHEL back when it is fully supported @@ -553,6 +603,10 @@ func getKM(devConfig *amdv1alpha1.DeviceConfig, node v1.Node, inTreeModuleToRemo return kmmv1beta1.KernelMapping{}, "", err } + // OpenShift source image build needs RHEL version and source image registry + rhelVersion := "" + sourceImageRepo := defaultSourceImageRepo + if isOpenShift { if driversVersion == "" { driversVersion = defaultOcDriversVersion @@ -561,6 +615,10 @@ func getKM(devConfig *amdv1alpha1.DeviceConfig, node v1.Node, inTreeModuleToRemo driversImage = defaultOcDriversImageTemplate } driversImage = addNodeInfoSuffixToImageTag(driversImage, osName, driversVersion, devConfig) + rhelVersion = parseRHELVersion(node.Labels, node.Status.NodeInfo.OSImage) + if devConfig.Spec.Driver.ImageBuild.SourceImageRepo != "" { + sourceImageRepo = devConfig.Spec.Driver.ImageBuild.SourceImageRepo + } } else { if driversVersion == "" { driversVersion, err 
= utils.GetDefaultDriversVersion(node) @@ -632,6 +690,21 @@ func getKM(devConfig *amdv1alpha1.DeviceConfig, node v1.Node, inTreeModuleToRemo kmmBuild.BaseImageRegistryTLS.Insecure = true kmmBuild.BaseImageRegistryTLS.InsecureSkipTLSVerify = true } + if isOpenShift { + if rhelVersion != "" { + kmmBuild.BuildArgs = append(kmmBuild.BuildArgs, + kmmv1beta1.BuildArg{ + Name: "RHEL_VERSION", + Value: rhelVersion, + }) + } + kmmBuild.BuildArgs = append(kmmBuild.BuildArgs, + kmmv1beta1.BuildArg{ + Name: "SOURCE_IMAGE_REPO", + Value: sourceImageRepo, + }, + ) + } return kmmv1beta1.KernelMapping{ Literal: node.Status.NodeInfo.KernelVersion, diff --git a/internal/kmmmodule/kmmmodule_test.go b/internal/kmmmodule/kmmmodule_test.go index f9f305097..67f17f763 100644 --- a/internal/kmmmodule/kmmmodule_test.go +++ b/internal/kmmmodule/kmmmodule_test.go @@ -677,4 +677,58 @@ var _ = Describe("getKernelMappings", func() { } } }) + + It("test parseRHELVersion", func() { + testCases := []struct { + labels map[string]string + input string + expected string + }{ + { + // legacy format + input: "Red Hat Enterprise Linux CoreOS 418.94.202410090804-0 (Pecan)", + expected: "9.4", + }, + { + // legacy format + input: "Red Hat Enterprise Linux CoreOS 419.96.202410090804-0 (Plow)", + expected: "9.6", + }, + { + input: "Red Hat Enterprise Linux CoreOS 9.6.20250916-0 (Plow)", + expected: "9.6", + }, + { + labels: map[string]string{"feature.node.kubernetes.io/system-os_release.VERSION_ID": "9.6"}, + input: "Red Hat Enterprise Linux CoreOS 9.6.20250916-0 (Plow)", + expected: "9.6", + }, + { + input: "Red Hat Enterprise Linux CoreOS 10.0.20260101-0 (Future)", + expected: "10.0", + }, + { + labels: map[string]string{"feature.node.kubernetes.io/system-os_release.VERSION_ID": "10.0"}, + input: "Red Hat Enterprise Linux CoreOS 10.0.20250916-0 (Future)", + expected: "10.0", + }, + { + input: "Red Hat Enterprise Linux CoreOS 10.1.20260101-0 (Future)", + expected: "10.1", + }, + { + input: "Red Hat 
Enterprise Linux CoreOS 10.10.20260101-0 (Future)", + expected: "10.10", + }, + { + input: "Red Hat Enterprise Linux CoreOS 11.0.20260101-0 (Future)", + expected: "11.0", + }, + } + + for _, tc := range testCases { + result := parseRHELVersion(tc.labels, tc.input) + Expect(result).To(Equal(tc.expected)) + } + }) }) diff --git a/internal/utils_container/Dockerfile b/internal/utils_container/Dockerfile index aba704fb3..1478c9df9 100644 --- a/internal/utils_container/Dockerfile +++ b/internal/utils_container/Dockerfile @@ -9,7 +9,7 @@ LABEL summary="AMD GPU Operator Utility Image" LABEL description="The AMD GPU Operator utils image is a utility image used by the AMD GPU Operator. The operator controller employs this image as the container's base layer to automate tasks on target worker nodes." # Install nsenter from util-linux package -RUN microdnf install -y util-linux pciutils kmod tar && \ +RUN microdnf install -y util-linux pciutils kmod tar jq && \ cp /usr/bin/nsenter /nsenter && \ microdnf clean all @@ -17,8 +17,8 @@ ADD LICENSE /licenses/LICENSE # Install kubectl and oc RUN mkdir -p /oc && cd /oc && \ - curl -SsLO 'https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux-amd64-rhel9-4.19.13.tar.gz' && \ - tar -xzf openshift-client-linux-amd64-rhel9-4.19.13.tar.gz -C /oc && \ + curl -SsLO 'https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux-amd64-rhel9.tar.gz' && \ + tar -xzf openshift-client-linux-amd64-rhel9.tar.gz -C /oc && \ cp ./kubectl /usr/local/bin && chmod +x /usr/local/bin/kubectl && \ cp ./oc /usr/local/bin && chmod +x /usr/local/bin/oc && \ rm -rf /oc diff --git a/tests/helm-e2e/helm_e2e_test.go b/tests/helm-e2e/helm_e2e_test.go index b6129bea1..d7fb32f19 100644 --- a/tests/helm-e2e/helm_e2e_test.go +++ b/tests/helm-e2e/helm_e2e_test.go @@ -28,6 +28,7 @@ import ( . 
"gopkg.in/check.v1" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" @@ -164,6 +165,11 @@ func (s *E2ESuite) verifyDevicePlugin(expect, actual *v1alpha1.DeviceConfigSpec) reflect.DeepEqual(expect.DevicePlugin, actual.DevicePlugin) } +func (s *E2ESuite) verifyRemediationWorkflow(expect, actual *v1alpha1.DeviceConfigSpec) bool { + return expect != nil && actual != nil && + reflect.DeepEqual(expect.RemediationWorkflow, actual.RemediationWorkflow) +} + func (s *E2ESuite) writeYAMLToFile(yamlContent string) error { os.Remove(tmpValuesYamlPath) file, err := os.Create(tmpValuesYamlPath) @@ -294,9 +300,11 @@ deviceConfig: name: publicKeySecret imageBuild: baseImageRegistry: quay.io + sourceImageRepo: custom.io/rocm/amdgpu-driver baseImageRegistryTLS: insecure: true insecureSkipTLSVerify: false + useSourceImage: true tolerations: - key: "example-key" operator: "Equal" @@ -370,8 +378,10 @@ deviceConfig: Name: "publicKeySecret", }, }, + UseSourceImage: &boolTrue, ImageBuild: v1alpha1.ImageBuildSpec{ BaseImageRegistry: "quay.io", + SourceImageRepo: "custom.io/rocm/amdgpu-driver", BaseImageRegistryTLS: v1alpha1.RegistryTLS{ Insecure: &boolTrue, InsecureSkipTLSVerify: &boolFalse, @@ -764,6 +774,19 @@ deviceConfig: imagePullPolicy: "Always" config: name: metricsConfig + podResourceAPISocketPath: /var/lib/kubelet/pod-resources-custom + resource: + limits: + cpu: "4" + memory: "8G" + requests: + cpu: "1" + memory: "1G" + podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "5001" + serviceAnnotations: + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" tolerations: - key: "example-key" operator: "Equal" @@ -810,6 +833,7 @@ deviceConfig: credentials: name: test key: test123 + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token tlsConfig: keyFile: /etc/credential `, @@ -828,6 +852,24 @@ deviceConfig: 
Config: v1alpha1.MetricsConfig{ Name: "metricsConfig", }, + PodResourceAPISocketPath: "/var/lib/kubelet/pod-resources-custom", + Resource: &corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4"), + corev1.ResourceMemory: resource.MustParse("8G"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + PodAnnotations: map[string]string{ + "prometheus.io/scrape": "true", + "prometheus.io/port": "5001", + }, + ServiceAnnotations: map[string]string{ + "service.beta.kubernetes.io/aws-load-balancer-type": "nlb", + }, Tolerations: []corev1.Toleration{ { Key: "example-key", @@ -896,6 +938,7 @@ deviceConfig: Key: "test123", }, }, + BearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token", TLSConfig: &monitoringv1.TLSConfig{ KeyFile: "/etc/credential", }, @@ -905,6 +948,46 @@ deviceConfig: }, verifyFunc: s.verifyMetricsExporter, }, + { + description: "upgrade with rendering spec.remediationWorkflow", + valuesYAML: ` +deviceConfig: + spec: + selector: + kubernetes.io/hostname: "node123" + feature.node.kubernetes.io/amd-gpu: "true" + driver: + enable: true + blacklist: true + image: "test.io/username/repo" + commonConfig: + initContainerImage: busybox:1.37 + devicePlugin: + devicePluginImage: test/k8s-device-plugin:latest + devicePluginImagePullPolicy: Always + remediationWorkflow: + enable: true + conditionalWorkflows: + name: "conditional-workflows-configmap" + ttlForFailedWorkflows: 36 + testerImage: "test.io/test/remediation-workflow-tester:v1.3.0" +`, + extraArgs: []string{"-f", tmpValuesYamlPath, "--set", "crds.defaultCR.upgrade=true"}, + helmFunc: s.upgradeHelmChart, + expectHelmCommandErr: false, + expectDefaultCR: true, + expectSpec: &v1alpha1.DeviceConfigSpec{ + RemediationWorkflow: v1alpha1.RemediationWorkflowSpec{ + Enable: &boolTrue, + ConditionalWorkflows: &corev1.LocalObjectReference{ + Name: 
"conditional-workflows-configmap", + }, + TtlForFailedWorkflows: 36, + TesterImage: "test.io/test/remediation-workflow-tester:v1.3.0", + }, + }, + verifyFunc: s.verifyRemediationWorkflow, + }, } for _, tc := range testCases {