diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index ea4b21d86..4a1fb7290 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -94,6 +94,8 @@ type ClusterPolicySpec struct { CCManager CCManagerSpec `json:"ccManager,omitempty"` // HostPaths defines various paths on the host needed by GPU Operator components HostPaths HostPathsSpec `json:"hostPaths,omitempty"` + // FabricManager component spec + FabricManager FabricManagerSpec `json:"fabricManager,omitempty"` } // Runtime defines container runtime type @@ -1724,6 +1726,38 @@ type CDIConfigSpec struct { Default *bool `json:"default,omitempty"` } +// FabricMode defines the Fabric Manager mode +type FabricMode string + +const ( + // FabricModeFullPassthrough indicates Full-passthrough mode (FABRIC_MODE=0) + FabricModeFullPassthrough FabricMode = "full-passthrough" + // FabricModeSharedNVSwitch indicates Shared NVSwitch Virtualization mode (FABRIC_MODE=1) + FabricModeSharedNVSwitch FabricMode = "shared-nvswitch" +) + +func (f FabricMode) String() string { + switch f { + case FabricModeFullPassthrough: + return "full-passthrough" + case FabricModeSharedNVSwitch: + return "shared-nvswitch" + default: + return "" + } +} + +// FabricManagerSpec defines the properties for NVIDIA Fabric Manager configuration +type FabricManagerSpec struct { + // Mode indicates the Fabric Manager mode + // +kubebuilder:validation:Enum=full-passthrough;shared-nvswitch + // +kubebuilder:default=full-passthrough + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Fabric Manager Mode" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:full-passthrough,urn:alm:descriptor:com.tectonic.ui:select:shared-nvswitch" + Mode FabricMode `json:"mode,omitempty"` +} + // MIGStrategy indicates MIG mode type MIGStrategy string @@ -2218,3 +2252,18 @@ func (c *MIGPartedConfigSpec) GetName() string { func (c *VGPUDevicesConfigSpec) GetName() string { return ptr.Deref(c, VGPUDevicesConfigSpec{}).Name } + +// IsSharedNVSwitchMode returns true if Fabric Manager is configured for Shared NVSwitch mode +func (f *FabricManagerSpec) IsSharedNVSwitchMode() bool { + return f.Mode == FabricModeSharedNVSwitch +} + +// ValidateFabricManagerConfig validates the Fabric Manager configuration +func (c *ClusterPolicySpec) ValidateFabricManagerConfig() error { + if c.SandboxWorkloads.DefaultWorkload == "vm-passthrough" && + c.FabricManager.IsSharedNVSwitchMode() && + !c.Driver.IsEnabled() { + return fmt.Errorf("driver must be enabled when using vm-passthrough with Fabric Manager Shared NVSwitch mode") + } + return nil +} diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index 9e68fdb37..5b86cd8e2 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -209,6 +209,7 @@ func (in *ClusterPolicySpec) DeepCopyInto(out *ClusterPolicySpec) { in.KataManager.DeepCopyInto(&out.KataManager) in.CCManager.DeepCopyInto(&out.CCManager) out.HostPaths = in.HostPaths + out.FabricManager = in.FabricManager } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterPolicySpec. @@ -788,6 +789,21 @@ func (in *EnvVar) DeepCopy() *EnvVar { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. 
in must be non-nil. +func (in *FabricManagerSpec) DeepCopyInto(out *FabricManagerSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FabricManagerSpec. +func (in *FabricManagerSpec) DeepCopy() *FabricManagerSpec { + if in == nil { + return nil + } + out := new(FabricManagerSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GDRCopySpec) DeepCopyInto(out *GDRCopySpec) { *out = *in diff --git a/assets/state-driver/0400_configmap.yaml b/assets/state-driver/0400_configmap.yaml index 67aa1e2ca..3ab40437c 100644 --- a/assets/state-driver/0400_configmap.yaml +++ b/assets/state-driver/0400_configmap.yaml @@ -22,8 +22,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/assets/state-driver/0500_daemonset.yaml b/assets/state-driver/0500_daemonset.yaml index 853cf6fc9..c6023e9b3 100644 --- a/assets/state-driver/0500_daemonset.yaml +++ b/assets/state-driver/0500_daemonset.yaml @@ -50,29 +50,29 @@ spec: command: ["driver-manager"] args: ["uninstall_driver"] env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - # always use runc for driver containers - - name: NVIDIA_VISIBLE_DEVICES - value: void - - name: ENABLE_GPU_POD_EVICTION - value: "true" - - name: ENABLE_AUTO_DRAIN - value: "false" - - name: DRAIN_USE_FORCE - value: "false" - - name: DRAIN_POD_SELECTOR_LABEL - value: "" - - name: DRAIN_TIMEOUT_SECONDS - value: "0s" - - name: DRAIN_DELETE_EMPTYDIR_DATA - value: "false" - - name: OPERATOR_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + # always use runc for driver containers + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: ENABLE_GPU_POD_EVICTION + value: "true" + - name: ENABLE_AUTO_DRAIN + value: "false" + - name: DRAIN_USE_FORCE + value: "false" + - name: DRAIN_POD_SELECTOR_LABEL + value: "" + - name: DRAIN_TIMEOUT_SECONDS + value: "0s" + - name: DRAIN_DELETE_EMPTYDIR_DATA + value: "false" + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace securityContext: privileged: true volumeMounts: @@ -89,193 +89,203 @@ spec: mountPath: /run/mellanox/drivers mountPropagation: HostToContainer containers: - - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - name: nvidia-driver-ctr - command: ["nvidia-driver"] - args: ["init"] - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: NODE_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - securityContext: - privileged: true - seLinuxOptions: - level: "s0" - volumeMounts: - - name: run-nvidia - mountPath: /run/nvidia - mountPropagation: Bidirectional - - name: run-nvidia-fabricmanager - mountPath: /run/nvidia-fabricmanager - - name: run-nvidia-topologyd - mountPath: /run/nvidia-topologyd - - name: var-log - mountPath: /var/log - - name: dev-log - mountPath: /dev/log - - 
name: host-os-release - mountPath: "/host-etc/os-release" - readOnly: true - - name: mlnx-ofed-usr-src - mountPath: /run/mellanox/drivers/usr/src - mountPropagation: HostToContainer - - name: run-mellanox-drivers - mountPath: /run/mellanox/drivers - mountPropagation: HostToContainer - - name: sysfs-memory-online - mountPath: /sys/devices/system/memory/auto_online_blocks - - name: firmware-search-path - mountPath: /sys/module/firmware_class/parameters/path - - name: nv-firmware - mountPath: /lib/firmware - - name: driver-startup-probe-script - mountPath: /usr/local/bin/startup-probe.sh - subPath: startup-probe.sh - startupProbe: - exec: - command: - - sh - - /usr/local/bin/startup-probe.sh - initialDelaySeconds: 60 - failureThreshold: 120 - successThreshold: 1 - periodSeconds: 10 - timeoutSeconds: 60 - lifecycle: - preStop: + - image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + name: nvidia-driver-ctr + command: ["nvidia-driver"] + args: ["init"] + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: run-nvidia + mountPath: /run/nvidia + mountPropagation: Bidirectional + - name: run-nvidia-fabricmanager + mountPath: /run/nvidia-fabricmanager + - name: run-nvidia-topologyd + mountPath: /run/nvidia-topologyd + - name: var-log + mountPath: /var/log + - name: dev-log + mountPath: /dev/log + - name: host-os-release + mountPath: "/host-etc/os-release" + readOnly: true + - name: mlnx-ofed-usr-src + mountPath: /run/mellanox/drivers/usr/src + mountPropagation: HostToContainer + - name: run-mellanox-drivers + mountPath: /run/mellanox/drivers + mountPropagation: HostToContainer + - name: sysfs-memory-online + mountPath: /sys/devices/system/memory/auto_online_blocks + - name: firmware-search-path + mountPath: /sys/module/firmware_class/parameters/path + - name: nv-firmware + mountPath: /lib/firmware + - name: driver-startup-probe-script + mountPath: /usr/local/bin/startup-probe.sh + subPath: startup-probe.sh + startupProbe: + exec: + command: + - sh + - /usr/local/bin/startup-probe.sh + initialDelaySeconds: 60 + failureThreshold: 120 + successThreshold: 1 + periodSeconds: 10 + timeoutSeconds: 60 + lifecycle: + preStop: + exec: + command: + [ + "/bin/sh", + "-c", + "rm -f /run/nvidia/validations/.driver-ctr-ready", + ] + - image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + name: nvidia-peermem-ctr + command: ["nvidia-driver"] + # takes care of loading nvidia_peermem whenever it gets dynamically unloaded during MOFED driver re-install/update + args: ["reload_nvidia_peermem"] + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: run-nvidia + mountPath: /run/nvidia + mountPropagation: Bidirectional + - name: var-log + mountPath: /var/log + - name: dev-log + mountPath: /dev/log + readOnly: true + - name: run-mellanox-drivers + mountPath: /run/mellanox/drivers + mountPropagation: HostToContainer + startupProbe: + exec: + command: [sh, -c, "nvidia-driver probe_nvidia_peermem"] + initialDelaySeconds: 10 + failureThreshold: 120 + successThreshold: 1 + periodSeconds: 10 + timeoutSeconds: 10 + livenessProbe: exec: - command: ["/bin/sh", "-c", "rm -f /run/nvidia/validations/.driver-ctr-ready"] - - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - name: nvidia-peermem-ctr - command: ["nvidia-driver"] - # takes care of loading nvidia_peermem 
whenever it gets dynamically unloaded during MOFED driver re-install/update - args: ["reload_nvidia_peermem"] - securityContext: - privileged: true - seLinuxOptions: - level: "s0" - volumeMounts: - - name: run-nvidia - mountPath: /run/nvidia - mountPropagation: Bidirectional - - name: var-log - mountPath: /var/log - - name: dev-log - mountPath: /dev/log - readOnly: true - - name: run-mellanox-drivers - mountPath: /run/mellanox/drivers - mountPropagation: HostToContainer - startupProbe: - exec: - command: - [sh, -c, 'nvidia-driver probe_nvidia_peermem'] - initialDelaySeconds: 10 - failureThreshold: 120 - successThreshold: 1 - periodSeconds: 10 - timeoutSeconds: 10 - livenessProbe: - exec: - command: - [sh, -c, 'nvidia-driver probe_nvidia_peermem'] - periodSeconds: 30 - initialDelaySeconds: 30 - failureThreshold: 1 - successThreshold: 1 - timeoutSeconds: 10 - - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - name: nvidia-fs-ctr - command: [bash, -xc] - args: ["until [ -d /run/nvidia/driver/usr/src ] && lsmod | grep nvidia; do echo Waiting for nvidia-driver to be installed...; sleep 10; done; exec nvidia-gds-driver install"] - securityContext: - privileged: true - seLinuxOptions: - level: "s0" - volumeMounts: - - name: run-nvidia - mountPath: /run/nvidia - mountPropagation: HostToContainer - - name: var-log - mountPath: /var/log - - name: dev-log - mountPath: /dev/log - readOnly: true - startupProbe: - exec: - command: - [sh, -c, 'lsmod | grep nvidia_fs'] - initialDelaySeconds: 10 - failureThreshold: 120 - successThreshold: 1 - periodSeconds: 10 - timeoutSeconds: 10 - - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - name: nvidia-gdrcopy-ctr - command: [bash, -xc] - args: ["until [ -d /run/nvidia/driver/usr/src ] && lsmod | grep nvidia; do echo Waiting for nvidia-driver to be installed...; sleep 10; done; exec nvidia-gdrcopy-driver install"] - securityContext: - privileged: true - seLinuxOptions: - level: "s0" - volumeMounts: - - name: run-nvidia - mountPath: /run/nvidia - mountPropagation: HostToContainer - - name: var-log - mountPath: /var/log - - name: dev-log - mountPath: /dev/log - readOnly: true - startupProbe: - exec: - command: - [sh, -c, 'lsmod | grep gdrdrv'] - initialDelaySeconds: 10 - failureThreshold: 120 - successThreshold: 1 - periodSeconds: 10 - timeoutSeconds: 10 - # Only kept when OpenShift DriverToolkit side-car is enabled. 
- - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - name: openshift-driver-toolkit-ctr - command: [bash, -xc] - args: ["until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ]; do echo Waiting for nvidia-driver-ctr container to prepare the shared directory ...; sleep 10; done; exec /mnt/shared-nvidia-driver-toolkit/ocp_dtk_entrypoint dtk-build-driver"] - securityContext: - # currently mandatory as 'nvidia-installer' loads (and - # unloads) the kernel module as part of the build process - privileged: true - seLinuxOptions: - level: "s0" - env: - - name: RHCOS_VERSION - value: "FILLED BY THE OPERATOR" - # always use runc for driver containers - - name: NVIDIA_VISIBLE_DEVICES - value: void - volumeMounts: - # corresponding volumes are dynamically injected by the - # operator when the OCP DriverToolkit side-car is enabled - - name: shared-nvidia-driver-toolkit - mountPath: /mnt/shared-nvidia-driver-toolkit - - name: var-log - mountPath: /var/log - - name: mlnx-ofed-usr-src - mountPath: /run/mellanox/drivers/usr/src - mountPropagation: HostToContainer - - name: host-os-release - mountPath: /host-etc/os-release - readOnly: true + command: [sh, -c, "nvidia-driver probe_nvidia_peermem"] + periodSeconds: 30 + initialDelaySeconds: 30 + failureThreshold: 1 + successThreshold: 1 + timeoutSeconds: 10 + - image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + name: nvidia-fs-ctr + command: [bash, -xc] + args: + [ + "until [ -d /run/nvidia/driver/usr/src ] && lsmod | grep nvidia; do echo Waiting for nvidia-driver to be installed...; sleep 10; done; exec nvidia-gds-driver install", + ] + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: run-nvidia + mountPath: /run/nvidia + mountPropagation: HostToContainer + - name: var-log + mountPath: /var/log + - name: dev-log + mountPath: /dev/log + readOnly: true + startupProbe: + exec: + command: [sh, -c, "lsmod | grep nvidia_fs"] + initialDelaySeconds: 10 + failureThreshold: 120 + successThreshold: 1 + periodSeconds: 10 + timeoutSeconds: 10 + - image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + name: nvidia-gdrcopy-ctr + command: [bash, -xc] + args: + [ + "until [ -d /run/nvidia/driver/usr/src ] && lsmod | grep nvidia; do echo Waiting for nvidia-driver to be installed...; sleep 10; done; exec nvidia-gdrcopy-driver install", + ] + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: run-nvidia + mountPath: /run/nvidia + mountPropagation: HostToContainer + - name: var-log + mountPath: /var/log + - name: dev-log + mountPath: /dev/log + readOnly: true + startupProbe: + exec: + command: [sh, -c, "lsmod | grep gdrdrv"] + initialDelaySeconds: 10 + failureThreshold: 120 + successThreshold: 1 + periodSeconds: 10 + timeoutSeconds: 10 + # Only kept when OpenShift DriverToolkit side-car is enabled. 
+ - image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + name: openshift-driver-toolkit-ctr + command: [bash, -xc] + args: + [ + "until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ]; do echo Waiting for nvidia-driver-ctr container to prepare the shared directory ...; sleep 10; done; exec /mnt/shared-nvidia-driver-toolkit/ocp_dtk_entrypoint dtk-build-driver", + ] + securityContext: + # currently mandatory as 'nvidia-installer' loads (and + # unloads) the kernel module as part of the build process + privileged: true + seLinuxOptions: + level: "s0" + env: + - name: RHCOS_VERSION + value: "FILLED BY THE OPERATOR" + # always use runc for driver containers + - name: NVIDIA_VISIBLE_DEVICES + value: void + volumeMounts: + # corresponding volumes are dynamically injected by the + # operator when the OCP DriverToolkit side-car is enabled + - name: shared-nvidia-driver-toolkit + mountPath: /mnt/shared-nvidia-driver-toolkit + - name: var-log + mountPath: /var/log + - name: mlnx-ofed-usr-src + mountPath: /run/mellanox/drivers/usr/src + mountPropagation: HostToContainer + - name: host-os-release + mountPath: /host-etc/os-release + readOnly: true volumes: - name: run-nvidia hostPath: diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 379e98d87..b8a6ad74a 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -1057,6 +1057,17 @@ spec: type: string type: object type: object + fabricManager: + description: FabricManager component spec + properties: + mode: + default: full-passthrough + description: Mode indicates the Fabric Manager mode + enum: + - full-passthrough + - shared-nvswitch + type: string + type: object gdrcopy: description: GDRCopy component spec properties: diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 379e98d87..b8a6ad74a 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -1057,6 +1057,17 @@ spec: type: string type: object type: object + fabricManager: + description: FabricManager component spec + properties: + mode: + default: full-passthrough + description: Mode indicates the Fabric Manager mode + enum: + - full-passthrough + - shared-nvswitch + type: string + type: object gdrcopy: description: GDRCopy component spec properties: diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 4ea634ebe..f2f5ece52 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -42,6 +42,7 @@ const ( commonGPULabelValue = "true" commonOperandsLabelKey = "nvidia.com/gpu.deploy.operands" commonOperandsLabelValue = "true" + driverLabelKey = "nvidia.com/gpu.deploy.driver" migManagerLabelKey = "nvidia.com/gpu.deploy.mig-manager" migManagerLabelValue = "true" migCapableLabelKey = "nvidia.com/mig.capable" @@ -116,9 +117,10 @@ var gpuNodeLabels = map[string]string{ } type gpuWorkloadConfiguration struct { - config string - node string - log logr.Logger + config string + node string + log logr.Logger + clusterPolicy *gpuv1.ClusterPolicy } // OpenShiftDriverToolkit contains the values required to deploy @@ -322,6 +324,18 @@ func isValidWorkloadConfig(workloadConfig string) bool { return ok } +// shouldDeployDriverForVMPassthrough returns true if driver should be deployed for vm-passthrough workload +// based on Fabric Manager configuration +func (w *gpuWorkloadConfiguration) 
shouldDeployDriverForVMPassthrough() bool { + if w.config != gpuWorkloadConfigVMPassthrough { + return false + } + if w.clusterPolicy == nil { + return false + } + return w.clusterPolicy.Spec.FabricManager.IsSharedNVSwitchMode() +} + // getWorkloadConfig returns the GPU workload configured for the node. // If an error occurs when searching for the workload config, // return defaultGPUWorkloadConfig. @@ -382,6 +396,16 @@ func (w *gpuWorkloadConfiguration) addGPUStateLabels(labels map[string]string) b modified = true } } + + // Add conditional driver deployment for vm-passthrough workload + if w.shouldDeployDriverForVMPassthrough() { + if _, ok := labels[driverLabelKey]; !ok { + w.log.Info("Setting node label for driver deployment in vm-passthrough with Fabric Manager shared-nvswitch mode", "NodeName", w.node, "Label", driverLabelKey, "Value", "true") + labels[driverLabelKey] = "true" + modified = true + } + } + if w.config == gpuWorkloadConfigContainer && hasMIGCapableGPU(labels) && !hasMIGManagerLabel(labels) { w.log.Info("Setting node label", "NodeName", w.node, "Label", migManagerLabelKey, "Value", migManagerLabelValue) labels[migManagerLabelKey] = migManagerLabelValue @@ -506,7 +530,7 @@ func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) { "Error", err, "defaultGPUWorkloadConfig", defaultGPUWorkloadConfig) } n.logger.Info("GPU workload configuration", "NodeName", node.Name, "GpuWorkloadConfig", config) - gpuWorkloadConfig := &gpuWorkloadConfiguration{config, node.Name, n.logger} + gpuWorkloadConfig := &gpuWorkloadConfiguration{config, node.Name, n.logger, n.singleton} if !hasCommonGPULabel(labels) && hasGPULabels(labels) { n.logger.Info("Node has GPU(s)", "NodeName", node.Name) // label the node with common Nvidia GPU label diff --git a/controllers/state_manager_test.go b/controllers/state_manager_test.go index bd1641e94..894ae35fd 100644 --- a/controllers/state_manager_test.go +++ b/controllers/state_manager_test.go @@ -19,6 +19,8 @@ package controllers import ( "testing" + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" @@ -186,3 +188,224 @@ func TestHasMIGCapableGPU(t *testing.T) { } } } + +func TestGpuWorkloadConfiguration_ShouldDeployDriverForVMPassthrough(t *testing.T) { + tests := []struct { + name string + config string + clusterPolicy *gpuv1.ClusterPolicy + expected bool + }{ + { + name: "non-vm-passthrough workload", + config: gpuWorkloadConfigContainer, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + expected: false, + }, + { + name: "vm-passthrough with nil cluster policy", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: nil, + expected: false, + }, + { + name: "vm-passthrough with shared-nvswitch mode", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + expected: true, + }, + { + name: "vm-passthrough with full-passthrough mode", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + }, + expected: false, + }, + { + name: "vm-passthrough with default (empty) fabric manager mode", + config: 
gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: "", // empty defaults to full-passthrough + }, + }, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + workloadConfig := &gpuWorkloadConfiguration{ + config: tt.config, + node: "test-node", + log: logr.Discard(), + clusterPolicy: tt.clusterPolicy, + } + + result := workloadConfig.shouldDeployDriverForVMPassthrough() + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestGpuWorkloadConfiguration_AddGPUStateLabels(t *testing.T) { + tests := []struct { + name string + config string + clusterPolicy *gpuv1.ClusterPolicy + inputLabels map[string]string + expectedLabels map[string]string + expectModified bool + }{ + { + name: "vm-passthrough with shared-nvswitch adds driver label", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + inputLabels: map[string]string{}, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + "nvidia.com/gpu.deploy.driver": "true", + }, + expectModified: true, + }, + { + name: "vm-passthrough with full-passthrough does not add driver label", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + }, + inputLabels: map[string]string{}, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + }, + expectModified: true, + }, + { + name: "container workload is not affected", + config: gpuWorkloadConfigContainer, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + inputLabels: map[string]string{ + "existing-label": "value", + }, + expectedLabels: map[string]string{ + "existing-label": "value", + "nvidia.com/gpu.deploy.driver": "true", + "nvidia.com/gpu.deploy.gpu-feature-discovery": "true", + "nvidia.com/gpu.deploy.container-toolkit": "true", + "nvidia.com/gpu.deploy.device-plugin": "true", + "nvidia.com/gpu.deploy.dcgm": "true", + "nvidia.com/gpu.deploy.dcgm-exporter": "true", + "nvidia.com/gpu.deploy.node-status-exporter": "true", + "nvidia.com/gpu.deploy.operator-validator": "true", + }, + expectModified: true, + }, + { + name: "vm-passthrough with nil cluster policy does not add driver label", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: nil, + inputLabels: map[string]string{}, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + }, + expectModified: true, + }, + { + name: "driver label already exists - no modification", + 
config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + inputLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + "nvidia.com/gpu.deploy.driver": "true", + }, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + "nvidia.com/gpu.deploy.driver": "true", + }, + expectModified: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + workloadConfig := &gpuWorkloadConfiguration{ + config: tt.config, + node: "test-node", + log: logr.Discard(), + clusterPolicy: tt.clusterPolicy, + } + + // Make a copy of input labels to avoid modifying the test data + labels := make(map[string]string) + for k, v := range tt.inputLabels { + labels[k] = v + } + + modified := workloadConfig.addGPUStateLabels(labels) + + assert.Equal(t, tt.expectModified, modified) + assert.Equal(t, tt.expectedLabels, labels) + }) + } +} diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 379e98d87..b8a6ad74a 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -1057,6 +1057,17 @@ spec: type: string type: object type: object + fabricManager: + description: FabricManager component spec + properties: + mode: + default: full-passthrough + description: Mode indicates the Fabric Manager mode + enum: + - full-passthrough + - shared-nvswitch + type: string + type: object gdrcopy: description: GDRCopy component spec properties: diff --git a/internal/state/driver.go b/internal/state/driver.go index b0e6484de..068406597 100644 --- a/internal/state/driver.go +++ b/internal/state/driver.go @@ -269,7 +269,7 @@ func (s *stateDriver) getManifestObjects(ctx context.Context, cr *nvidiav1alpha1 for _, nodePool := range nodePools { // Construct a unique driver spec per node pool. Each node pool // should have a unique nodeSelector and name. 
- driverSpec, err := getDriverSpec(cr, nodePool) + driverSpec, err := getDriverSpec(cr, nodePool, &clusterPolicy) if err != nil { return nil, fmt.Errorf("failed to construct driver spec: %w", err) } @@ -542,7 +542,18 @@ func sanitizeDriverLabels(labels map[string]string) map[string]string { return sanitizedLabels } -func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool) (*driverSpec, error) { +func getFabricModeValue(mode gpuv1.FabricMode) string { + switch mode { + case gpuv1.FabricModeSharedNVSwitch: + return "1" // Shared NVSwitch Virtualization mode + case gpuv1.FabricModeFullPassthrough: + return "0" // Full-passthrough mode + default: + return "0" // Default to full-passthrough + } +} + +func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool, clusterPolicy *gpuv1.ClusterPolicy) (*driverSpec, error) { if cr == nil { return nil, fmt.Errorf("no NVIDIADriver CR provided") } @@ -569,6 +580,16 @@ func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool) (*driverS spec.Labels = sanitizeDriverLabels(spec.Labels) + // Add Fabric Manager environment variable if fabric manager is configured + if clusterPolicy != nil { + fabricModeValue := getFabricModeValue(clusterPolicy.Spec.FabricManager.Mode) + fabricManagerEnv := nvidiav1alpha1.EnvVar{ + Name: "FABRIC_MANAGER_FABRIC_MODE", + Value: fabricModeValue, + } + spec.Env = append(spec.Env, fabricManagerEnv) + } + return &driverSpec{ Spec: spec, AppName: nvidiaDriverAppName, diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go index 75d8b04f9..98ba1bac2 100644 --- a/internal/state/driver_test.go +++ b/internal/state/driver_test.go @@ -35,6 +35,7 @@ import ( "k8s.io/client-go/kubernetes/scheme" "k8s.io/utils/ptr" + gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" "github.com/NVIDIA/gpu-operator/internal/render" ) @@ -950,9 +951,9 @@ func TestGetDriverSpecMultipleNodePools(t *testing.T) { }, } - spec1, err := getDriverSpec(cr, pool1) + spec1, err := getDriverSpec(cr, pool1, nil) require.NoError(t, err) - spec2, err := getDriverSpec(cr, pool2) + spec2, err := getDriverSpec(cr, pool2, nil) require.NoError(t, err) // Verify each spec has correct values @@ -973,3 +974,147 @@ func TestGetDriverSpecMultipleNodePools(t *testing.T) { _, exists := spec2.Spec.NodeSelector["test-key"] assert.False(t, exists) } + +func TestGetDriverSpecFabricManagerEnvVar(t *testing.T) { + cr := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{ + UID: apitypes.UID("test-uid-fabric"), + }, + Spec: nvidiav1alpha1.NVIDIADriverSpec{ + DriverType: nvidiav1alpha1.GPU, + UsePrecompiled: ptr.To(true), + Repository: "nvcr.io/nvidia", + Image: "driver", + Version: "535.104.05", + Manager: nvidiav1alpha1.DriverManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "k8s-driver-manager", + Version: "v0.6.8", + }, + }, + } + + nodePool := nodePool{ + osRelease: "ubuntu", + osVersion: "22.04", + kernel: "5.15.0-generic", + nodeSelector: map[string]string{ + "feature.node.kubernetes.io/kernel-version.full": "5.15.0-generic", + "feature.node.kubernetes.io/system-os_release.VERSION_ID": "22.04", + }, + } + + tests := []struct { + name string + clusterPolicy *gpuv1.ClusterPolicy + expectedEnvVar string + expectedEnvValue string + expectEnvVar bool + }{ + { + name: "no cluster policy", + clusterPolicy: nil, + expectEnvVar: false, + }, + { + name: "fabric manager shared-nvswitch mode", + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: 
gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + expectedEnvVar: "FABRIC_MANAGER_FABRIC_MODE", + expectedEnvValue: "1", + expectEnvVar: true, + }, + { + name: "fabric manager full-passthrough mode", + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + }, + expectedEnvVar: "FABRIC_MANAGER_FABRIC_MODE", + expectedEnvValue: "0", + expectEnvVar: true, + }, + { + name: "fabric manager default mode (empty)", + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: "", + }, + }, + }, + expectedEnvVar: "FABRIC_MANAGER_FABRIC_MODE", + expectedEnvValue: "0", + expectEnvVar: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + spec, err := getDriverSpec(cr, nodePool, tt.clusterPolicy) + require.NoError(t, err) + + if tt.expectEnvVar { + // Find the Fabric Manager environment variable + found := false + for _, envVar := range spec.Spec.Env { + if envVar.Name == tt.expectedEnvVar { + assert.Equal(t, tt.expectedEnvValue, envVar.Value) + found = true + break + } + } + assert.True(t, found, "Expected environment variable %s not found", tt.expectedEnvVar) + } else { + // Verify no Fabric Manager env var is set + for _, envVar := range spec.Spec.Env { + assert.NotEqual(t, "FABRIC_MANAGER_FABRIC_MODE", envVar.Name, "Unexpected Fabric Manager env var found") + } + } + }) + } +} + +func TestGetFabricModeValue(t *testing.T) { + tests := []struct { + name string + mode gpuv1.FabricMode + expected string + }{ + { + name: "shared-nvswitch mode", + mode: gpuv1.FabricModeSharedNVSwitch, + expected: "1", + }, + { + name: "full-passthrough mode", + mode: gpuv1.FabricModeFullPassthrough, + expected: "0", + }, + { + name: "empty mode defaults to full-passthrough", + mode: "", + expected: "0", + }, + { + name: "unknown mode defaults to full-passthrough", + mode: "unknown-mode", + expected: "0", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := getFabricModeValue(tt.mode) + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/manifests/state-driver/0400_configmap.yaml b/manifests/state-driver/0400_configmap.yaml index 55ba3df55..34802a6d5 100644 --- a/manifests/state-driver/0400_configmap.yaml +++ b/manifests/state-driver/0400_configmap.yaml @@ -26,8 +26,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}"
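
A minimal, illustrative ClusterPolicy excerpt (not part of the diff) showing how the new fabricManager field is meant to be combined with vm-passthrough sandbox workloads. The fabricManager.mode values and the driver requirement come from this change; the surrounding field names (sandboxWorkloads.enabled, sandboxWorkloads.defaultWorkload, driver.enabled) are assumed from the existing ClusterPolicy API rather than introduced here.

    apiVersion: nvidia.com/v1
    kind: ClusterPolicy
    metadata:
      name: cluster-policy
    spec:
      sandboxWorkloads:
        enabled: true                    # assumed existing field
        defaultWorkload: vm-passthrough  # referenced by ValidateFabricManagerConfig
      driver:
        enabled: true                    # required: ValidateFabricManagerConfig rejects shared-nvswitch + vm-passthrough with the driver disabled
      fabricManager:
        mode: shared-nvswitch            # new field; omitting it defaults to full-passthrough

With a spec like this, the node labeler is expected to add nvidia.com/gpu.deploy.driver=true even on vm-passthrough nodes, the rendered driver spec gains FABRIC_MANAGER_FABRIC_MODE=1, and the driver startup probe tolerates an nvidia-smi failure as long as the nvidia kernel module is loaded.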