Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ type ClusterPolicySpec struct {
CCManager CCManagerSpec `json:"ccManager,omitempty"`
// HostPaths defines various paths on the host needed by GPU Operator components
HostPaths HostPathsSpec `json:"hostPaths,omitempty"`
// FabricManager component spec
FabricManager FabricManagerSpec `json:"fabricManager,omitempty"`
}

// Runtime defines container runtime type
Expand Down Expand Up @@ -1724,6 +1726,38 @@ type CDIConfigSpec struct {
Default *bool `json:"default,omitempty"`
}

// FabricMode is the string-typed selector for the Fabric Manager operating mode.
type FabricMode string

// Recognized Fabric Manager modes.
const (
	// FabricModeFullPassthrough selects Full-passthrough mode (FABRIC_MODE=0).
	FabricModeFullPassthrough FabricMode = "full-passthrough"
	// FabricModeSharedNVSwitch selects Shared NVSwitch Virtualization mode (FABRIC_MODE=1).
	FabricModeSharedNVSwitch FabricMode = "shared-nvswitch"
)

// String returns the textual form of a recognized mode. Any value other than
// the two declared constants yields the empty string.
func (f FabricMode) String() string {
	recognized := f == FabricModeFullPassthrough || f == FabricModeSharedNVSwitch
	if !recognized {
		return ""
	}
	// For recognized modes the underlying string is exactly the canonical name.
	return string(f)
}

// FabricManagerSpec defines the properties for NVIDIA Fabric Manager configuration.
// It is the type of the FabricManager field of ClusterPolicySpec, serialized
// under the "fabricManager" JSON key.
type FabricManagerSpec struct {
// Mode indicates the Fabric Manager mode: "full-passthrough" (FABRIC_MODE=0)
// or "shared-nvswitch" (FABRIC_MODE=1). The markers below restrict the field
// to those two values and default it to "full-passthrough".
// +kubebuilder:validation:Enum=full-passthrough;shared-nvswitch
// +kubebuilder:default=full-passthrough
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Fabric Manager Mode"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:full-passthrough,urn:alm:descriptor:com.tectonic.ui:select:shared-nvswitch"
Mode FabricMode `json:"mode,omitempty"`
}

// MIGStrategy indicates MIG mode. It is a named string type.
type MIGStrategy string

Expand Down Expand Up @@ -2218,3 +2252,18 @@ func (c *MIGPartedConfigSpec) GetName() string {
// GetName returns the Name field of the VGPUDevicesConfigSpec. The receiver is
// passed through ptr.Deref with a zero-value VGPUDevicesConfigSpec as the
// default, so a nil receiver is expected to yield the zero-value Name instead
// of panicking.
func (c *VGPUDevicesConfigSpec) GetName() string {
	spec := ptr.Deref(c, VGPUDevicesConfigSpec{})
	return spec.Name
}

// IsSharedNVSwitchMode reports whether Fabric Manager is configured for
// Shared NVSwitch mode, i.e. whether Mode equals FabricModeSharedNVSwitch.
//
// A nil receiver is treated as "not configured" and yields false rather than
// panicking on the field access, so the method is safe to call on an optional
// (pointer) spec.
func (f *FabricManagerSpec) IsSharedNVSwitchMode() bool {
	if f == nil {
		return false
	}
	return f.Mode == FabricModeSharedNVSwitch
}

// ValidateFabricManagerConfig checks the Fabric Manager settings against the
// rest of the ClusterPolicySpec. It returns an error only when all three of
// the following hold: the default sandbox workload is "vm-passthrough",
// Fabric Manager is in Shared NVSwitch mode, and the driver is not enabled.
// In every other case it returns nil.
func (c *ClusterPolicySpec) ValidateFabricManagerConfig() error {
	// Each guard rules out one of the three conditions; they are evaluated in
	// the same order as a short-circuiting && chain would evaluate them.
	if c.SandboxWorkloads.DefaultWorkload != "vm-passthrough" {
		return nil
	}
	if !c.FabricManager.IsSharedNVSwitchMode() {
		return nil
	}
	if c.Driver.IsEnabled() {
		return nil
	}
	return fmt.Errorf("driver must be enabled when using vm-passthrough with Fabric Manager Shared NVSwitch mode")
}
16 changes: 16 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 8 additions & 2 deletions assets/state-driver/0400_configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ data:
fi

if ! nvidia-smi; then
echo "nvidia-smi failed"
exit 1
# For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices
# Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1
if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then
echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)"
else
echo "nvidia-smi failed"
exit 1
fi
fi

GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}"
Expand Down
Loading