From 97bd8646bee884f10900d9b7b7906e0c9743995c Mon Sep 17 00:00:00 2001 From: Manuel Huber Date: Fri, 17 Oct 2025 14:07:15 -0700 Subject: [PATCH] [DRAFT]node-feature-rules: Add 0x2321 as CC-capable device --- assets/state-cc-manager/0500_daemonset.yaml | 4 +++- .../gpu-operator/templates/nodefeaturerules.yaml | 10 +++++++++- deployments/gpu-operator/values.yaml | 4 +++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/assets/state-cc-manager/0500_daemonset.yaml b/assets/state-cc-manager/0500_daemonset.yaml index ec851dd5c..aa2c9d35d 100644 --- a/assets/state-cc-manager/0500_daemonset.yaml +++ b/assets/state-cc-manager/0500_daemonset.yaml @@ -34,7 +34,9 @@ spec: fieldRef: fieldPath: spec.nodeName - name: CC_CAPABLE_DEVICE_IDS - value: "0x2322,0x2331" + # TODO: revisit. This list was reduced in commit 03688e3f61433cbf3bb8e2fad241d12672b04836. + # It should be kept in sync with deployments/gpu-operator/templates/nodefeaturerules.yaml. + value: "0x2322,0x2321,0x2331" # always use runc for driver containers - name: NVIDIA_VISIBLE_DEVICES value: void diff --git a/deployments/gpu-operator/templates/nodefeaturerules.yaml b/deployments/gpu-operator/templates/nodefeaturerules.yaml index 6076b3d31..2e4096aa3 100644 --- a/deployments/gpu-operator/templates/nodefeaturerules.yaml +++ b/deployments/gpu-operator/templates/nodefeaturerules.yaml @@ -89,6 +89,15 @@ spec: matchExpressions: vendor: {op: In, value: ["10de"]} device: {op: In, value: ["2322"]} + - name: "NVIDIA H100L 94GB" + labels: + "nvidia.com/gpu.H100L": "true" + "nvidia.com/gpu.family": "hopper" + matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["10de"]} + device: {op: In, value: ["2321"]} - name: "NVIDIA CC Enabled" labels: "nvidia.com/cc.capable": "true" @@ -104,4 +113,3 @@ spec: nvidia.com/gpu.family: {op: In, value: ["hopper"]} tdx.enabled: {op: IsTrue} {{- end }} - diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 33154a2d4..45d44e57c 
100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -511,7 +511,9 @@ ccManager: imagePullSecrets: [] env: - name: CC_CAPABLE_DEVICE_IDS - value: "0x2339,0x2331,0x2330,0x2324,0x2322,0x233d" + # TODO: 0x233d does not seem to be listed in deployments/gpu-operator/templates/nodefeaturerules.yaml or assets/state-cc-manager/0500_daemonset.yaml. + # It was added to assets/state-cc-manager/0500_daemonset.yaml in commit 094e28f5056cf5ebfcf5c0a6277672cdda2c9e08 but was later removed. + value: "0x2339,0x2331,0x2330,0x2324,0x2322,0x2321,0x233d" resources: {} node-feature-discovery: