Open
Conversation
- Replace nvidia-device-plugin with nvidia-gpu-resource-driver
- Add ResourceClaimTemplate configs for GPU resource management
- Update Ollama, Tdarr worker, and DCGM exporter for DRA compatibility
Contributor
--- kubernetes/apps/kube-system/nvidia-device-plugin/app Kustomization: kube-system/nvidia-device-plugin HelmRelease: kube-system/nvidia-device-plugin
+++ kubernetes/apps/kube-system/nvidia-device-plugin/app Kustomization: kube-system/nvidia-device-plugin HelmRelease: kube-system/nvidia-device-plugin
@@ -1,49 +0,0 @@
----
-apiVersion: helm.toolkit.fluxcd.io/v2
-kind: HelmRelease
-metadata:
- labels:
- kustomize.toolkit.fluxcd.io/name: nvidia-device-plugin
- kustomize.toolkit.fluxcd.io/namespace: kube-system
- name: nvidia-device-plugin
- namespace: kube-system
-spec:
- chartRef:
- kind: OCIRepository
- name: nvidia-device-plugin
- install:
- crds: CreateReplace
- strategy:
- name: RetryOnFailure
- interval: 1h
- rollback:
- cleanupOnFail: true
- recreate: true
- upgrade:
- cleanupOnFail: true
- crds: CreateReplace
- remediation:
- remediateLastFailure: true
- retries: 2
- strategy:
- name: RemediateOnFailure
- values:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: nvidia.com/gpu.present
- operator: Exists
- config:
- map:
- default: |-
- version: v1
- sharing:
- timeSlicing:
- renameByDefault: false
- resources:
- - name: nvidia.com/gpu
- replicas: 6
- runtimeClassName: nvidia
-
--- kubernetes/apps/kube-system/nvidia-device-plugin/app Kustomization: kube-system/nvidia-device-plugin OCIRepository: kube-system/nvidia-device-plugin
+++ kubernetes/apps/kube-system/nvidia-device-plugin/app Kustomization: kube-system/nvidia-device-plugin OCIRepository: kube-system/nvidia-device-plugin
@@ -1,18 +0,0 @@
----
-apiVersion: source.toolkit.fluxcd.io/v1
-kind: OCIRepository
-metadata:
- labels:
- kustomize.toolkit.fluxcd.io/name: nvidia-device-plugin
- kustomize.toolkit.fluxcd.io/namespace: kube-system
- name: nvidia-device-plugin
- namespace: kube-system
-spec:
- interval: 15m
- layerSelector:
- mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
- operation: copy
- ref:
- tag: 0.17.4
- url: oci://ghcr.io/home-operations/charts-mirror/nvidia-device-plugin
-
--- kubernetes/apps/kube-system/nvidia-device-plugin/app Kustomization: kube-system/nvidia-device-plugin RuntimeClass: kube-system/nvidia
+++ kubernetes/apps/kube-system/nvidia-device-plugin/app Kustomization: kube-system/nvidia-device-plugin RuntimeClass: kube-system/nvidia
@@ -1,10 +0,0 @@
----
-apiVersion: node.k8s.io/v1
-handler: nvidia
-kind: RuntimeClass
-metadata:
- labels:
- kustomize.toolkit.fluxcd.io/name: nvidia-device-plugin
- kustomize.toolkit.fluxcd.io/namespace: kube-system
- name: nvidia
-
--- kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: ai/ollama
+++ kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: ai/ollama
@@ -9,13 +9,13 @@
namespace: ai
spec:
decryption:
provider: sops
deletionPolicy: WaitForTermination
dependsOn:
- - name: nvidia-device-plugin
+ - name: nvidia-gpu-resource-driver
namespace: kube-system
- name: rook-ceph-cluster
namespace: rook-ceph
interval: 1h
patches:
- patch: |-
--- kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: kube-system/nvidia-device-plugin
+++ kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: kube-system/nvidia-device-plugin
@@ -1,62 +0,0 @@
----
-apiVersion: kustomize.toolkit.fluxcd.io/v1
-kind: Kustomization
-metadata:
- labels:
- kustomize.toolkit.fluxcd.io/name: cluster-apps
- kustomize.toolkit.fluxcd.io/namespace: flux-system
- name: nvidia-device-plugin
- namespace: kube-system
-spec:
- decryption:
- provider: sops
- deletionPolicy: WaitForTermination
- healthCheckExprs:
- - apiVersion: apps/v1
- current: status.desiredNumberScheduled == status.numberReady
- failed: status.desiredNumberScheduled != status.numberReady
- kind: DaemonSet
- healthChecks:
- - apiVersion: helm.toolkit.fluxcd.io/v2
- kind: HelmRelease
- name: nvidia-device-plugin
- namespace: kube-system
- - apiVersion: apps/v1
- kind: DaemonSet
- name: nvidia-device-plugin
- namespace: kube-system
- interval: 1h
- patches:
- - patch: |-
- apiVersion: helm.toolkit.fluxcd.io/v2
- kind: HelmRelease
- metadata:
- name: _
- spec:
- install:
- crds: CreateReplace
- strategy:
- name: RetryOnFailure
- rollback:
- cleanupOnFail: true
- recreate: true
- upgrade:
- cleanupOnFail: true
- crds: CreateReplace
- strategy:
- name: RemediateOnFailure
- remediation:
- remediateLastFailure: true
- retries: 2
- target:
- group: helm.toolkit.fluxcd.io
- kind: HelmRelease
- path: ./kubernetes/apps/kube-system/nvidia-device-plugin/app
- prune: true
- sourceRef:
- kind: GitRepository
- name: flux-system
- namespace: flux-system
- targetNamespace: kube-system
- wait: false
-
--- kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: media/tdarr-worker
+++ kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: media/tdarr-worker
@@ -11,13 +11,13 @@
components:
- ../../../../components/nfs-scaler
decryption:
provider: sops
deletionPolicy: WaitForTermination
dependsOn:
- - name: nvidia-device-plugin
+ - name: nvidia-gpu-resource-driver
namespace: kube-system
interval: 1h
patches:
- patch: |-
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
--- kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: observability/dcgm-exporter
+++ kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: observability/dcgm-exporter
@@ -9,13 +9,13 @@
namespace: observability
spec:
decryption:
provider: sops
deletionPolicy: WaitForTermination
dependsOn:
- - name: nvidia-device-plugin
+ - name: nvidia-gpu-resource-driver
namespace: kube-system
interval: 1h
patches:
- patch: |-
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
--- kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: kube-system/nvidia-gpu-resource-driver
+++ kubernetes/apps Kustomization: flux-system/cluster-apps Kustomization: kube-system/nvidia-gpu-resource-driver
@@ -0,0 +1,53 @@
+---
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+ labels:
+ kustomize.toolkit.fluxcd.io/name: cluster-apps
+ kustomize.toolkit.fluxcd.io/namespace: flux-system
+ name: nvidia-gpu-resource-driver
+ namespace: kube-system
+spec:
+ decryption:
+ provider: sops
+ deletionPolicy: WaitForTermination
+ healthChecks:
+ - apiVersion: helm.toolkit.fluxcd.io/v2
+ kind: HelmRelease
+ name: nvidia-gpu-resource-driver
+ namespace: kube-system
+ interval: 1h
+ patches:
+ - patch: |-
+ apiVersion: helm.toolkit.fluxcd.io/v2
+ kind: HelmRelease
+ metadata:
+ name: _
+ spec:
+ install:
+ crds: CreateReplace
+ strategy:
+ name: RetryOnFailure
+ rollback:
+ cleanupOnFail: true
+ recreate: true
+ upgrade:
+ cleanupOnFail: true
+ crds: CreateReplace
+ strategy:
+ name: RemediateOnFailure
+ remediation:
+ remediateLastFailure: true
+ retries: 2
+ target:
+ group: helm.toolkit.fluxcd.io
+ kind: HelmRelease
+ path: ./kubernetes/apps/kube-system/nvidia-gpu-resource-driver/app
+ prune: true
+ sourceRef:
+ kind: GitRepository
+ name: flux-system
+ namespace: flux-system
+ targetNamespace: kube-system
+ wait: false
+
--- kubernetes/apps/media/tdarr/worker Kustomization: media/tdarr-worker HelmRelease: media/tdarr-worker
+++ kubernetes/apps/media/tdarr/worker Kustomization: media/tdarr-worker HelmRelease: media/tdarr-worker
@@ -51,26 +51,29 @@
transcodecpuWorkers: 0
transcodegpuWorkers: 2
image:
repository: ghcr.io/haveagitgat/tdarr_node
tag: 2.49.01@sha256:34f9cca6dcc0eb0dc10aff63faa587947ff0e0b9e7a0a8c0db02891fcd394fca
resources:
+ claims:
+ - name: gpu
limits:
memory: 8Gi
- nvidia.com/gpu: 1
requests:
cpu: 10m
memory: 512Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
defaultPodOptions:
- runtimeClassName: nvidia
+ resourceClaims:
+ - name: gpu
+ resourceClaimTemplateName: tdarr-worker
securityContext:
runAsGroup: 1000
runAsNonRoot: false
runAsUser: 1000
persistence:
media:
--- kubernetes/apps/media/tdarr/worker Kustomization: media/tdarr-worker ResourceClaimTemplate: media/tdarr-worker
+++ kubernetes/apps/media/tdarr/worker Kustomization: media/tdarr-worker ResourceClaimTemplate: media/tdarr-worker
@@ -0,0 +1,17 @@
+---
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaimTemplate
+metadata:
+ labels:
+ kustomize.toolkit.fluxcd.io/name: tdarr-worker
+ kustomize.toolkit.fluxcd.io/namespace: media
+ name: tdarr-worker
+ namespace: media
+spec:
+ spec:
+ devices:
+ requests:
+ - exactly:
+ deviceClassName: gpu.nvidia.com
+ name: gpu
+
--- kubernetes/apps/observability/dcgm-exporter/app Kustomization: observability/dcgm-exporter HelmRelease: observability/dcgm-exporter
+++ kubernetes/apps/observability/dcgm-exporter/app Kustomization: observability/dcgm-exporter HelmRelease: observability/dcgm-exporter
@@ -42,12 +42,12 @@
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu.present
operator: Exists
+ kubernetesDRA:
+ enabled: true
resources:
limits:
memory: 1Gi
- nvidia.com/gpu: 1
- runtimeClassName: nvidia
--- kubernetes/apps/observability/dcgm-exporter/app Kustomization: observability/dcgm-exporter ResourceClaimTemplate: observability/dcgm-exporter
+++ kubernetes/apps/observability/dcgm-exporter/app Kustomization: observability/dcgm-exporter ResourceClaimTemplate: observability/dcgm-exporter
@@ -0,0 +1,17 @@
+---
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaimTemplate
+metadata:
+ labels:
+ kustomize.toolkit.fluxcd.io/name: dcgm-exporter
+ kustomize.toolkit.fluxcd.io/namespace: observability
+ name: dcgm-exporter
+ namespace: observability
+spec:
+ spec:
+ devices:
+ requests:
+ - exactly:
+ deviceClassName: gpu.nvidia.com
+ name: gpu
+
--- kubernetes/apps/ai/ollama/app Kustomization: ai/ollama HelmRelease: ai/ollama
+++ kubernetes/apps/ai/ollama/app Kustomization: ai/ollama HelmRelease: ai/ollama
@@ -61,26 +61,29 @@
path: /
port: 80
initialDelaySeconds: 0
periodSeconds: 10
timeoutSeconds: 1
resources:
+ claims:
+ - name: gpu
limits:
memory: 2Gi
- nvidia.com/gpu: 1
requests:
cpu: 10m
memory: 32Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
defaultPodOptions:
- runtimeClassName: nvidia
+ resourceClaims:
+ - name: gpu
+ resourceClaimTemplateName: ollama
securityContext:
runAsGroup: 1000
runAsNonRoot: true
runAsUser: 1000
persistence:
config:
--- kubernetes/apps/ai/ollama/app Kustomization: ai/ollama ResourceClaimTemplate: ai/ollama
+++ kubernetes/apps/ai/ollama/app Kustomization: ai/ollama ResourceClaimTemplate: ai/ollama
@@ -0,0 +1,17 @@
+---
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaimTemplate
+metadata:
+ labels:
+ kustomize.toolkit.fluxcd.io/name: ollama
+ kustomize.toolkit.fluxcd.io/namespace: ai
+ name: ollama
+ namespace: ai
+spec:
+ spec:
+ devices:
+ requests:
+ - exactly:
+ deviceClassName: gpu.nvidia.com
+ name: gpu
+
--- kubernetes/apps/kube-system/nvidia-gpu-resource-driver/app Kustomization: kube-system/nvidia-gpu-resource-driver HelmRelease: kube-system/nvidia-gpu-resource-driver
+++ kubernetes/apps/kube-system/nvidia-gpu-resource-driver/app Kustomization: kube-system/nvidia-gpu-resource-driver HelmRelease: kube-system/nvidia-gpu-resource-driver
@@ -0,0 +1,64 @@
+---
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+ labels:
+ kustomize.toolkit.fluxcd.io/name: nvidia-gpu-resource-driver
+ kustomize.toolkit.fluxcd.io/namespace: kube-system
+ name: nvidia-gpu-resource-driver
+ namespace: kube-system
+spec:
+ chartRef:
+ kind: OCIRepository
+ name: nvidia-gpu-resource-driver
+ install:
+ crds: CreateReplace
+ strategy:
+ name: RetryOnFailure
+ interval: 1h
+ postRenderers:
+ - kustomize:
+ patches:
+ - patch: |
+ - op: replace
+ path: /spec/template/spec/containers/0/volumeMounts/2/mountPath
+ value: "/var/cdi/static"
+ target:
+ kind: DaemonSet
+ name: nvidia-dra-driver-gpu-kubelet-plugin
+ - patch: |
+ - op: replace
+ path: /spec/template/spec/volumes/2/hostPath/path
+ value: "/var/cdi/static"
+ target:
+ kind: DaemonSet
+ name: nvidia-dra-driver-gpu-kubelet-plugin
+ - patch: |
+ - op: replace
+ path: /spec/template/spec/containers/0/env/3/value
+ value: "/var/cdi/static"
+ target:
+ kind: DaemonSet
+ name: nvidia-dra-driver-gpu-kubelet-plugin
+ rollback:
+ cleanupOnFail: true
+ recreate: true
+ upgrade:
+ cleanupOnFail: true
+ crds: CreateReplace
+ remediation:
+ remediateLastFailure: true
+ retries: 2
+ strategy:
+ name: RemediateOnFailure
+ values:
+ gpuResourcesEnabledOverride: true
+ image:
+ repository: ghcr.io/hydazz/k8s-dra-driver-gpu
+ tag: v25.8.0-dev@sha256:e49ee7160a2a99d2e7278c879595329fe96eac15afc94ee7e36a6b5fcdbb3ada
+ resources:
+ computeDomains:
+ enabled: false
+ gpus:
+ enabled: true
+
--- kubernetes/apps/kube-system/nvidia-gpu-resource-driver/app Kustomization: kube-system/nvidia-gpu-resource-driver OCIRepository: kube-system/nvidia-gpu-resource-driver
+++ kubernetes/apps/kube-system/nvidia-gpu-resource-driver/app Kustomization: kube-system/nvidia-gpu-resource-driver OCIRepository: kube-system/nvidia-gpu-resource-driver
@@ -0,0 +1,18 @@
+---
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: OCIRepository
+metadata:
+ labels:
+ kustomize.toolkit.fluxcd.io/name: nvidia-gpu-resource-driver
+ kustomize.toolkit.fluxcd.io/namespace: kube-system
+ name: nvidia-gpu-resource-driver
+ namespace: kube-system
+spec:
+ interval: 15m
+ layerSelector:
+ mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
+ operation: copy
+ ref:
+ tag: 25.3.2
+ url: oci://ghcr.io/hydazz/charts-mirror/nvidia-dra-driver-gpu
+ |
Contributor
--- HelmRelease: observability/dcgm-exporter DaemonSet: observability/dcgm-exporter
+++ HelmRelease: observability/dcgm-exporter DaemonSet: observability/dcgm-exporter
@@ -24,13 +24,12 @@
metadata:
labels:
app.kubernetes.io/name: dcgm-exporter
app.kubernetes.io/instance: dcgm-exporter
app.kubernetes.io/component: dcgm-exporter
spec:
- runtimeClassName: nvidia
priorityClassName: system-node-critical
serviceAccountName: dcgm-exporter
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
@@ -66,12 +65,14 @@
- name: DCGM_EXPORTER_LISTEN
value: :9400
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
+ - name: KUBERNETES_ENABLE_DRA
+ value: 'true'
ports:
- name: metrics
containerPort: 9400
volumeMounts:
- name: pod-gpu-resources
readOnly: true
@@ -92,8 +93,7 @@
port: 9400
scheme: HTTP
initialDelaySeconds: 45
resources:
limits:
memory: 1Gi
- nvidia.com/gpu: 1
--- HelmRelease: observability/dcgm-exporter ClusterRole: observability/dcgm-exporter-read-pods
+++ HelmRelease: observability/dcgm-exporter ClusterRole: observability/dcgm-exporter-read-pods
@@ -0,0 +1,22 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ name: dcgm-exporter-read-pods
+ labels:
+ app.kubernetes.io/name: dcgm-exporter
+ app.kubernetes.io/instance: dcgm-exporter
+ app.kubernetes.io/component: dcgm-exporter
+ app.kubernetes.io/managed-by: Helm
+rules:
+- apiGroups:
+ - ''
+ - resource.k8s.io
+ resources:
+ - pods
+ - resourceslices
+ verbs:
+ - get
+ - list
+ - watch
+
--- HelmRelease: observability/dcgm-exporter ClusterRoleBinding: observability/dcgm-exporter-read-pods
+++ HelmRelease: observability/dcgm-exporter ClusterRoleBinding: observability/dcgm-exporter-read-pods
@@ -0,0 +1,19 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: dcgm-exporter-read-pods
+ labels:
+ app.kubernetes.io/name: dcgm-exporter
+ app.kubernetes.io/instance: dcgm-exporter
+ app.kubernetes.io/component: dcgm-exporter
+ app.kubernetes.io/managed-by: Helm
+subjects:
+- kind: ServiceAccount
+ name: dcgm-exporter
+ namespace: observability
+roleRef:
+ kind: ClusterRole
+ name: dcgm-exporter-read-pods
+ apiGroup: rbac.authorization.k8s.io
+
--- HelmRelease: kube-system/nvidia-device-plugin ServiceAccount: kube-system/nvidia-device-plugin-service-account
+++ HelmRelease: kube-system/nvidia-device-plugin ServiceAccount: kube-system/nvidia-device-plugin-service-account
@@ -1,11 +0,0 @@
----
-apiVersion: v1
-kind: ServiceAccount
-metadata:
- name: nvidia-device-plugin-service-account
- namespace: kube-system
- labels:
- app.kubernetes.io/name: nvidia-device-plugin
- app.kubernetes.io/instance: nvidia-device-plugin
- app.kubernetes.io/managed-by: Helm
-
--- HelmRelease: kube-system/nvidia-device-plugin ConfigMap: kube-system/nvidia-device-plugin-configs
+++ HelmRelease: kube-system/nvidia-device-plugin ConfigMap: kube-system/nvidia-device-plugin-configs
@@ -1,20 +0,0 @@
----
-apiVersion: v1
-kind: ConfigMap
-metadata:
- name: nvidia-device-plugin-configs
- namespace: kube-system
- labels:
- app.kubernetes.io/name: nvidia-device-plugin
- app.kubernetes.io/instance: nvidia-device-plugin
- app.kubernetes.io/managed-by: Helm
-data:
- default: |-
- version: v1
- sharing:
- timeSlicing:
- renameByDefault: false
- resources:
- - name: nvidia.com/gpu
- replicas: 6
-
--- HelmRelease: kube-system/nvidia-device-plugin ClusterRole: kube-system/nvidia-device-plugin-role
+++ HelmRelease: kube-system/nvidia-device-plugin ClusterRole: kube-system/nvidia-device-plugin-role
@@ -1,19 +0,0 @@
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
- name: nvidia-device-plugin-role
- labels:
- app.kubernetes.io/name: nvidia-device-plugin
- app.kubernetes.io/instance: nvidia-device-plugin
- app.kubernetes.io/managed-by: Helm
-rules:
-- apiGroups:
- - ''
- resources:
- - nodes
- verbs:
- - get
- - list
- - watch
-
--- HelmRelease: kube-system/nvidia-device-plugin ClusterRoleBinding: kube-system/nvidia-device-plugin-role-binding
+++ HelmRelease: kube-system/nvidia-device-plugin ClusterRoleBinding: kube-system/nvidia-device-plugin-role-binding
@@ -1,18 +0,0 @@
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
- name: nvidia-device-plugin-role-binding
- labels:
- app.kubernetes.io/name: nvidia-device-plugin
- app.kubernetes.io/instance: nvidia-device-plugin
- app.kubernetes.io/managed-by: Helm
-subjects:
-- kind: ServiceAccount
- name: nvidia-device-plugin-service-account
- namespace: kube-system
-roleRef:
- kind: ClusterRole
- name: nvidia-device-plugin-role
- apiGroup: rbac.authorization.k8s.io
-
--- HelmRelease: kube-system/nvidia-device-plugin DaemonSet: kube-system/nvidia-device-plugin
+++ HelmRelease: kube-system/nvidia-device-plugin DaemonSet: kube-system/nvidia-device-plugin
@@ -1,170 +0,0 @@
----
-apiVersion: apps/v1
-kind: DaemonSet
-metadata:
- name: nvidia-device-plugin
- namespace: kube-system
- labels:
- app.kubernetes.io/name: nvidia-device-plugin
- app.kubernetes.io/instance: nvidia-device-plugin
- app.kubernetes.io/managed-by: Helm
-spec:
- selector:
- matchLabels:
- app.kubernetes.io/name: nvidia-device-plugin
- app.kubernetes.io/instance: nvidia-device-plugin
- updateStrategy:
- type: RollingUpdate
- template:
- metadata:
- labels:
- app.kubernetes.io/name: nvidia-device-plugin
- app.kubernetes.io/instance: nvidia-device-plugin
- spec:
- priorityClassName: system-node-critical
- runtimeClassName: nvidia
- securityContext: {}
- serviceAccountName: nvidia-device-plugin-service-account
- shareProcessNamespace: true
- initContainers:
- - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.4
- name: nvidia-device-plugin-init
- command:
- - config-manager
- env:
- - name: ONESHOT
- value: 'true'
- - name: KUBECONFIG
- value: ''
- - name: NODE_NAME
- valueFrom:
- fieldRef:
- fieldPath: spec.nodeName
- - name: NODE_LABEL
- value: nvidia.com/device-plugin.config
- - name: CONFIG_FILE_SRCDIR
- value: /available-configs
- - name: CONFIG_FILE_DST
- value: /config/config.yaml
- - name: DEFAULT_CONFIG
- value: null
- - name: FALLBACK_STRATEGIES
- value: named,single
- - name: SEND_SIGNAL
- value: 'false'
- - name: SIGNAL
- value: ''
- - name: PROCESS_TO_SIGNAL
- value: ''
- volumeMounts:
- - name: available-configs
- mountPath: /available-configs
- - name: config
- mountPath: /config
- containers:
- - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.4
- name: nvidia-device-plugin-sidecar
- command:
- - config-manager
- env:
- - name: ONESHOT
- value: 'false'
- - name: KUBECONFIG
- value: ''
- - name: NODE_NAME
- valueFrom:
- fieldRef:
- fieldPath: spec.nodeName
- - name: NODE_LABEL
- value: nvidia.com/device-plugin.config
- - name: CONFIG_FILE_SRCDIR
- value: /available-configs
- - name: CONFIG_FILE_DST
- value: /config/config.yaml
- - name: DEFAULT_CONFIG
- value: null
- - name: FALLBACK_STRATEGIES
- value: named,single
- - name: SEND_SIGNAL
- value: 'true'
- - name: SIGNAL
- value: '1'
- - name: PROCESS_TO_SIGNAL
- value: nvidia-device-plugin
- volumeMounts:
- - name: available-configs
- mountPath: /available-configs
- - name: config
- mountPath: /config
- securityContext:
- capabilities:
- add:
- - SYS_ADMIN
- - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.4
- imagePullPolicy: IfNotPresent
- name: nvidia-device-plugin-ctr
- command:
- - nvidia-device-plugin
- env:
- - name: MPS_ROOT
- value: /run/nvidia/mps
- - name: CONFIG_FILE
- value: /config/config.yaml
- - name: NVIDIA_MIG_MONITOR_DEVICES
- value: all
- - name: NVIDIA_VISIBLE_DEVICES
- value: all
- - name: NVIDIA_DRIVER_CAPABILITIES
- value: compute,utility
- securityContext:
- capabilities:
- add:
- - SYS_ADMIN
- volumeMounts:
- - name: kubelet-device-plugins-dir
- mountPath: /var/lib/kubelet/device-plugins
- - name: mps-shm
- mountPath: /dev/shm
- - name: mps-root
- mountPath: /mps
- - name: cdi-root
- mountPath: /var/run/cdi
- - name: available-configs
- mountPath: /available-configs
- - name: config
- mountPath: /config
- volumes:
- - name: kubelet-device-plugins-dir
- hostPath:
- path: /var/lib/kubelet/device-plugins
- type: Directory
- - name: mps-root
- hostPath:
- path: /run/nvidia/mps
- type: DirectoryOrCreate
- - name: mps-shm
- hostPath:
- path: /run/nvidia/mps/shm
- - name: cdi-root
- hostPath:
- path: /var/run/cdi
- type: DirectoryOrCreate
- - name: available-configs
- configMap:
- name: nvidia-device-plugin-configs
- - name: config
- emptyDir: {}
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: nvidia.com/gpu.present
- operator: Exists
- tolerations:
- - key: CriticalAddonsOnly
- operator: Exists
- - effect: NoSchedule
- key: nvidia.com/gpu
- operator: Exists
-
--- HelmRelease: kube-system/nvidia-device-plugin DaemonSet: kube-system/nvidia-device-plugin-mps-control-daemon
+++ HelmRelease: kube-system/nvidia-device-plugin DaemonSet: kube-system/nvidia-device-plugin-mps-control-daemon
@@ -1,168 +0,0 @@
----
-apiVersion: apps/v1
-kind: DaemonSet
-metadata:
- name: nvidia-device-plugin-mps-control-daemon
- namespace: kube-system
- labels:
- app.kubernetes.io/name: nvidia-device-plugin
- app.kubernetes.io/instance: nvidia-device-plugin
- app.kubernetes.io/managed-by: Helm
-spec:
- selector:
- matchLabels:
- app.kubernetes.io/name: nvidia-device-plugin
- app.kubernetes.io/instance: nvidia-device-plugin
- updateStrategy:
- type: RollingUpdate
- template:
- metadata:
- labels:
- app.kubernetes.io/name: nvidia-device-plugin
- app.kubernetes.io/instance: nvidia-device-plugin
- spec:
- priorityClassName: system-node-critical
- runtimeClassName: nvidia
- securityContext: {}
- serviceAccountName: nvidia-device-plugin-service-account
- shareProcessNamespace: true
- initContainers:
- - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.4
- name: mps-control-daemon-mounts
- command:
- - mps-control-daemon
- - mount-shm
- securityContext:
- privileged: true
- volumeMounts:
- - name: mps-root
- mountPath: /mps
- mountPropagation: Bidirectional
- - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.4
- name: mps-control-daemon-init
- command:
- - config-manager
- env:
- - name: ONESHOT
- value: 'true'
- - name: KUBECONFIG
- value: ''
- - name: NODE_NAME
- valueFrom:
- fieldRef:
- fieldPath: spec.nodeName
- - name: NODE_LABEL
- value: nvidia.com/device-plugin.config
- - name: CONFIG_FILE_SRCDIR
- value: /available-configs
- - name: CONFIG_FILE_DST
- value: /config/config.yaml
- - name: DEFAULT_CONFIG
- value: null
- - name: FALLBACK_STRATEGIES
- value: named,single
- - name: SEND_SIGNAL
- value: 'false'
- - name: SIGNAL
- value: ''
- - name: PROCESS_TO_SIGNAL
- value: ''
- volumeMounts:
- - name: available-configs
- mountPath: /available-configs
- - name: config
- mountPath: /config
- containers:
- - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.4
- name: mps-control-daemon-sidecar
- command:
- - config-manager
- env:
- - name: ONESHOT
- value: 'false'
- - name: KUBECONFIG
- value: ''
- - name: NODE_NAME
- valueFrom:
- fieldRef:
- fieldPath: spec.nodeName
- - name: NODE_LABEL
- value: nvidia.com/device-plugin.config
- - name: CONFIG_FILE_SRCDIR
- value: /available-configs
- - name: CONFIG_FILE_DST
- value: /config/config.yaml
- - name: DEFAULT_CONFIG
- value: null
- - name: FALLBACK_STRATEGIES
- value: named,single
- - name: SEND_SIGNAL
- value: 'true'
- - name: SIGNAL
- value: '1'
- - name: PROCESS_TO_SIGNAL
- value: /usr/bin/mps-control-daemon
- volumeMounts:
- - name: available-configs
- mountPath: /available-configs
- - name: config
- mountPath: /config
- - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.4
- imagePullPolicy: IfNotPresent
- name: mps-control-daemon-ctr
- command:
- - mps-control-daemon
- env:
- - name: NODE_NAME
- valueFrom:
- fieldRef:
- apiVersion: v1
- fieldPath: spec.nodeName
- - name: CONFIG_FILE
- value: /config/config.yaml
- - name: NVIDIA_MIG_MONITOR_DEVICES
- value: all
- - name: NVIDIA_VISIBLE_DEVICES
- value: all
- - name: NVIDIA_DRIVER_CAPABILITIES
- value: compute,utility
- securityContext:
- privileged: true
- volumeMounts:
- - name: mps-shm
- mountPath: /dev/shm
- - name: mps-root
- mountPath: /mps
- - name: available-configs
- mountPath: /available-configs
- - name: config
- mountPath: /config
- volumes:
- - name: mps-root
- hostPath:
- path: /run/nvidia/mps
- type: DirectoryOrCreate
- - name: mps-shm
- hostPath:
- path: /run/nvidia/mps/shm
- - name: available-configs
- configMap:
- name: nvidia-device-plugin-configs
- - name: config
- emptyDir: {}
- nodeSelector:
- nvidia.com/mps.capable: 'true'
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: nvidia.com/gpu.present
- operator: Exists
- tolerations:
- - key: CriticalAddonsOnly
- operator: Exists
- - effect: NoSchedule
- key: nvidia.com/gpu
- operator: Exists
-
--- HelmRelease: ai/ollama Deployment: ai/ollama
+++ HelmRelease: ai/ollama Deployment: ai/ollama
@@ -26,21 +26,23 @@
app.kubernetes.io/instance: ollama
app.kubernetes.io/name: ollama
spec:
enableServiceLinks: false
serviceAccountName: default
automountServiceAccountToken: true
- runtimeClassName: nvidia
securityContext:
runAsGroup: 1000
runAsNonRoot: true
runAsUser: 1000
hostIPC: false
hostNetwork: false
hostPID: false
dnsPolicy: ClusterFirst
+ resourceClaims:
+ - name: gpu
+ resourceClaimTemplateName: ollama
containers:
- env:
- name: HOME
value: /config
- name: OLLAMA_HOST
value: 0.0.0.0:80
@@ -64,15 +66,16 @@
path: /
port: 80
initialDelaySeconds: 0
periodSeconds: 10
timeoutSeconds: 1
resources:
+ claims:
+ - name: gpu
limits:
memory: 2Gi
- nvidia.com/gpu: 1
requests:
cpu: 10m
memory: 32Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
--- HelmRelease: media/tdarr-worker Deployment: media/tdarr-worker
+++ HelmRelease: media/tdarr-worker Deployment: media/tdarr-worker
@@ -26,21 +26,23 @@
app.kubernetes.io/instance: tdarr-worker
app.kubernetes.io/name: tdarr-worker
spec:
enableServiceLinks: false
serviceAccountName: default
automountServiceAccountToken: true
- runtimeClassName: nvidia
securityContext:
runAsGroup: 1000
runAsNonRoot: false
runAsUser: 1000
hostIPC: false
hostNetwork: false
hostPID: false
dnsPolicy: ClusterFirst
+ resourceClaims:
+ - name: gpu
+ resourceClaimTemplateName: tdarr-worker
containers:
- args:
- --max-old-space-size=16384
- /app/Tdarr_Node/srcug/main.js
command:
- /usr/bin/node
@@ -65,15 +67,16 @@
value: '0'
- name: transcodegpuWorkers
value: '2'
image: ghcr.io/haveagitgat/tdarr_node:2.49.01@sha256:34f9cca6dcc0eb0dc10aff63faa587947ff0e0b9e7a0a8c0db02891fcd394fca
name: app
resources:
+ claims:
+ - name: gpu
limits:
memory: 8Gi
- nvidia.com/gpu: 1
requests:
cpu: 10m
memory: 512Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
--- HelmRelease: kube-system/nvidia-gpu-resource-driver ServiceAccount: kube-system/compute-domain-daemon-service-account
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver ServiceAccount: kube-system/compute-domain-daemon-service-account
@@ -0,0 +1,7 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: compute-domain-daemon-service-account
+ namespace: kube-system
+
--- HelmRelease: kube-system/nvidia-gpu-resource-driver ServiceAccount: kube-system/nvidia-gpu-resource-driver-nvidia-dra-driver-gpu-service-account
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver ServiceAccount: kube-system/nvidia-gpu-resource-driver-nvidia-dra-driver-gpu-service-account
@@ -0,0 +1,11 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: nvidia-gpu-resource-driver-nvidia-dra-driver-gpu-service-account
+ namespace: kube-system
+ labels:
+ app.kubernetes.io/managed-by: Helm
+ app.kubernetes.io/name: nvidia-dra-driver-gpu
+ app.kubernetes.io/instance: nvidia-gpu-resource-driver
+
--- HelmRelease: kube-system/nvidia-gpu-resource-driver ClusterRole: kube-system/nvidia-dra-driver-gpu-role
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver ClusterRole: kube-system/nvidia-dra-driver-gpu-role
@@ -0,0 +1,130 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ name: nvidia-dra-driver-gpu-role
+ namespace: kube-system
+rules:
+- apiGroups:
+ - resource.nvidia.com
+ resources:
+ - computedomains
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - update
+ - patch
+ - delete
+- apiGroups:
+ - resource.nvidia.com
+ resources:
+ - computedomains/status
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - update
+ - patch
+ - delete
+- apiGroups:
+ - resource.k8s.io
+ resources:
+ - resourceclaims
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - update
+ - patch
+ - delete
+- apiGroups:
+ - resource.k8s.io
+ resources:
+ - resourceclaimtemplates
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - update
+ - patch
+ - delete
+- apiGroups:
+ - resource.k8s.io
+ resources:
+ - deviceclasses
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - update
+ - patch
+ - delete
+- apiGroups:
+ - resource.k8s.io
+ resources:
+ - resourceslices
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - update
+ - patch
+ - delete
+- apiGroups:
+ - resource.k8s.io
+ resources:
+ - resourceclaims/status
+ verbs:
+ - update
+- apiGroups:
+ - apps
+ resources:
+ - daemonsets
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - update
+ - patch
+ - delete
+- apiGroups:
+ - apps
+ resources:
+ - deployments
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - update
+ - patch
+ - delete
+- apiGroups:
+ - ''
+ resources:
+ - nodes
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - update
+ - patch
+ - delete
+- apiGroups:
+ - ''
+ resources:
+ - pods
+ verbs:
+ - get
+ - list
+ - watch
+
--- HelmRelease: kube-system/nvidia-gpu-resource-driver ClusterRole: kube-system/compute-domain-daemon-role
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver ClusterRole: kube-system/compute-domain-daemon-role
@@ -0,0 +1,19 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ name: compute-domain-daemon-role
+ namespace: kube-system
+rules:
+- apiGroups:
+ - resource.nvidia.com
+ resources:
+ - computedomains
+ - computedomains/status
+ verbs:
+ - get
+ - list
+ - watch
+ - update
+ - patch
+
--- HelmRelease: kube-system/nvidia-gpu-resource-driver ClusterRoleBinding: kube-system/nvidia-dra-driver-gpu-role-binding
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver ClusterRoleBinding: kube-system/nvidia-dra-driver-gpu-role-binding
@@ -0,0 +1,15 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: nvidia-dra-driver-gpu-role-binding
+ namespace: kube-system
+subjects:
+- kind: ServiceAccount
+ name: nvidia-gpu-resource-driver-nvidia-dra-driver-gpu-service-account
+ namespace: kube-system
+roleRef:
+ kind: ClusterRole
+ name: nvidia-dra-driver-gpu-role
+ apiGroup: rbac.authorization.k8s.io
+
--- HelmRelease: kube-system/nvidia-gpu-resource-driver ClusterRoleBinding: kube-system/compute-domain-daemon-role-binding
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver ClusterRoleBinding: kube-system/compute-domain-daemon-role-binding
@@ -0,0 +1,15 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: compute-domain-daemon-role-binding
+ namespace: kube-system
+subjects:
+- kind: ServiceAccount
+ name: compute-domain-daemon-service-account
+ namespace: kube-system
+roleRef:
+ kind: ClusterRole
+ name: compute-domain-daemon-role
+ apiGroup: rbac.authorization.k8s.io
+
--- HelmRelease: kube-system/nvidia-gpu-resource-driver DaemonSet: kube-system/nvidia-dra-driver-gpu-kubelet-plugin
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver DaemonSet: kube-system/nvidia-dra-driver-gpu-kubelet-plugin
@@ -0,0 +1,139 @@
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: nvidia-dra-driver-gpu-kubelet-plugin
+ namespace: kube-system
+ labels:
+ app.kubernetes.io/managed-by: Helm
+ app.kubernetes.io/name: nvidia-dra-driver-gpu
+ app.kubernetes.io/instance: nvidia-gpu-resource-driver
+spec:
+ selector:
+ matchLabels:
+ nvidia-dra-driver-gpu-component: kubelet-plugin
+ updateStrategy:
+ type: RollingUpdate
+ template:
+ metadata:
+ labels:
+ app.kubernetes.io/name: nvidia-dra-driver-gpu
+ app.kubernetes.io/instance: nvidia-gpu-resource-driver
+ nvidia-dra-driver-gpu-component: kubelet-plugin
+ spec:
+ priorityClassName: system-node-critical
+ serviceAccountName: nvidia-gpu-resource-driver-nvidia-dra-driver-gpu-service-account
+ securityContext: {}
+ initContainers:
+ - name: init-container
+ image: ghcr.io/hydazz/k8s-dra-driver-gpu:v25.8.0-dev@sha256:e49ee7160a2a99d2e7278c879595329fe96eac15afc94ee7e36a6b5fcdbb3ada
+ securityContext:
+ privileged: true
+ command:
+ - bash
+ - /usr/bin/kubelet-plugin-prestart.sh
+ env:
+ - name: NVIDIA_DRIVER_ROOT
+ value: /
+ - name: NVIDIA_VISIBLE_DEVICES
+ value: void
+ volumeMounts:
+ - name: driver-root-parent
+ mountPath: /driver-root-parent
+ readOnly: true
+ containers:
+ - name: gpus
+ securityContext:
+ privileged: true
+ image: ghcr.io/hydazz/k8s-dra-driver-gpu:v25.8.0-dev@sha256:e49ee7160a2a99d2e7278c879595329fe96eac15afc94ee7e36a6b5fcdbb3ada
+ imagePullPolicy: IfNotPresent
+ command:
+ - bash
+ - -c
+ args:
+ - |-
+ # Conditionally mask the params file to prevent this container from
+ # recreating any missing GPU device nodes. This is necessary, for
+ # example, when running under nvkind to limit the set GPUs governed
+ # by the plugin even though it has cgroup access to all of them.
+ if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then
+ cp /proc/driver/nvidia/params root/gpu-params
+ sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
+ mount --bind root/gpu-params /proc/driver/nvidia/params
+ fi
+ gpu-kubelet-plugin -v 6
+ resources: {}
+ env:
+ - name: MASK_NVIDIA_DRIVER_PARAMS
+ value: ''
+ - name: NVIDIA_DRIVER_ROOT
+ value: /
+ - name: NVIDIA_VISIBLE_DEVICES
+ value: void
+ - name: CDI_ROOT
+ value: /var/run/cdi
+ - name: NVIDIA_MIG_CONFIG_DEVICES
+ value: all
+ - name: NODE_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: spec.nodeName
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ - name: IMAGE_NAME
+ value: ghcr.io/hydazz/k8s-dra-driver-gpu:v25.8.0-dev@sha256:e49ee7160a2a99d2e7278c879595329fe96eac15afc94ee7e36a6b5fcdbb3ada
+ volumeMounts:
+ - name: plugins-registry
+ mountPath: /var/lib/kubelet/plugins_registry
+ - name: plugins
+ mountPath: /var/lib/kubelet/plugins
+ mountPropagation: Bidirectional
+ - name: cdi
+ mountPath: /var/run/cdi
+ - name: driver-root
+ mountPath: /driver-root
+ readOnly: true
+ mountPropagation: HostToContainer
+ volumes:
+ - name: plugins-registry
+ hostPath:
+ path: /var/lib/kubelet/plugins_registry
+ - name: plugins
+ hostPath:
+ path: /var/lib/kubelet/plugins
+ - name: cdi
+ hostPath:
+ path: /var/run/cdi
+ - name: driver-root-parent
+ hostPath:
+ path: /
+ type: DirectoryOrCreate
+ - name: driver-root
+ hostPath:
+ path: /
+ type: DirectoryOrCreate
+ - name: host-dev
+ hostPath:
+ path: /dev
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: feature.node.kubernetes.io/pci-10de.present
+ operator: In
+ values:
+ - 'true'
+ - matchExpressions:
+ - key: feature.node.kubernetes.io/cpu-model.vendor_id
+ operator: In
+ values:
+ - NVIDIA
+ - matchExpressions:
+ - key: nvidia.com/gpu.present
+ operator: In
+ values:
+ - 'true'
+
--- HelmRelease: kube-system/nvidia-gpu-resource-driver DeviceClass: kube-system/gpu.nvidia.com
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver DeviceClass: kube-system/gpu.nvidia.com
@@ -0,0 +1,11 @@
+---
+apiVersion: resource.k8s.io/v1
+kind: DeviceClass
+metadata:
+ name: gpu.nvidia.com
+spec:
+ selectors:
+ - cel:
+ expression: device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type
+ == 'gpu'
+
--- HelmRelease: kube-system/nvidia-gpu-resource-driver DeviceClass: kube-system/mig.nvidia.com
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver DeviceClass: kube-system/mig.nvidia.com
@@ -0,0 +1,11 @@
+---
+apiVersion: resource.k8s.io/v1
+kind: DeviceClass
+metadata:
+ name: mig.nvidia.com
+spec:
+ selectors:
+ - cel:
+ expression: device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type
+ == 'mig'
+
--- HelmRelease: kube-system/nvidia-gpu-resource-driver ValidatingAdmissionPolicy: kube-system/resourceslices-policy-nvidia-dra-driver-gpu
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver ValidatingAdmissionPolicy: kube-system/resourceslices-policy-nvidia-dra-driver-gpu
@@ -0,0 +1,42 @@
+---
+apiVersion: admissionregistration.k8s.io/v1
+kind: ValidatingAdmissionPolicy
+metadata:
+ name: resourceslices-policy-nvidia-dra-driver-gpu
+spec:
+ failurePolicy: Fail
+ matchConstraints:
+ resourceRules:
+ - apiGroups:
+ - resource.k8s.io
+ apiVersions:
+ - v1
+ - v1beta1
+ - v1beta2
+ operations:
+ - CREATE
+ - UPDATE
+ - DELETE
+ resources:
+ - resourceslices
+ matchConditions:
+ - name: isRestrictedUser
+ expression: request.userInfo.username == "system:serviceaccount:kube-system:nvidia-gpu-resource-driver-nvidia-dra-driver-gpu-service-account"
+ variables:
+ - name: userNodeName
+ expression: request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('')
+ - name: objectNodeName
+ expression: '(request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("")'
+ - name: nodeSelectorValue
+ expression: '(request.operation == "DELETE" ? oldObject : object).spec.?nodeSelector.orValue(null)'
+ - name: allNodesValue
+ expression: '(request.operation == "DELETE" ? oldObject : object).spec.?allNodes.orValue(false)'
+ validations:
+ - expression: variables.userNodeName != ""
+ message: no node association found for user, this user must run in a pod on a
+ node and ServiceAccountTokenPodNodeInfo must be enabled
+ - expression: variables.userNodeName == variables.objectNodeName || variables.allNodesValue
+ == true || variables.nodeSelectorValue != null
+ messageExpression: '"this user running on node ''"+variables.userNodeName+"''
+ may not modify cluster or node resourceslices"'
+
--- HelmRelease: kube-system/nvidia-gpu-resource-driver ValidatingAdmissionPolicyBinding: kube-system/resourceslices-policy-nvidia-dra-driver-gpu
+++ HelmRelease: kube-system/nvidia-gpu-resource-driver ValidatingAdmissionPolicyBinding: kube-system/resourceslices-policy-nvidia-dra-driver-gpu
@@ -0,0 +1,10 @@
+---
+apiVersion: admissionregistration.k8s.io/v1
+kind: ValidatingAdmissionPolicyBinding
+metadata:
+ name: resourceslices-policy-nvidia-dra-driver-gpu
+spec:
+ policyName: resourceslices-policy-nvidia-dra-driver-gpu
+ validationActions:
+ - Deny
+ |
8f6defe to
cd67fe6
Compare
4ec0764 to
b167a78
Compare
6dbd57d to
57cd8c8
Compare
8f99de2 to
aa330fa
Compare
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
Once the nvidia DRA actually works out of the box.
DRA works, but CDI is broken, as nvidia dra has hardcoded CDI paths
https://github.com/NVIDIA/k8s-dra-driver-gpu/blob/cfe35ffd3d2c8872a83c029e26376ca168e2c409/deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml#L131-L132
NVIDIA/k8s-dra-driver-gpu#605 (comment)
DCGM exporter probably needs tweaking,
kubernetes DRA appears to be broken too