diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index ea4b21d86..a9e863c7a 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -1429,9 +1429,8 @@ type GDRCopySpec struct { // MIGPartedConfigSpec defines custom mig-parted config for NVIDIA MIG Manager container type MIGPartedConfigSpec struct { - // ConfigMap name + // ConfigMap name. If not specified, MIG configuration will be dynamically generated from hardware. // +kubebuilder:validation:Optional - // +kubebuilder:default=default-mig-parted-config // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ConfigMap Name" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text" diff --git a/assets/state-mig-manager/0200_role.yaml b/assets/state-mig-manager/0200_role.yaml index 5396cbeaa..c9fd933ad 100644 --- a/assets/state-mig-manager/0200_role.yaml +++ b/assets/state-mig-manager/0200_role.yaml @@ -21,3 +21,13 @@ rules: - list - watch - delete +- apiGroups: + - "" + resources: + - configmaps + verbs: + - create + - get + - list + - update + - patch diff --git a/assets/state-mig-manager/0400_configmap.yaml b/assets/state-mig-manager/0400_configmap.yaml deleted file mode 100644 index 90cdde095..000000000 --- a/assets/state-mig-manager/0400_configmap.yaml +++ /dev/null @@ -1,625 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: default-mig-parted-config - namespace: "FILLED BY THE OPERATOR" -data: - config.yaml: | - version: v1 - mig-configs: - all-disabled: - - devices: all - mig-enabled: false - - all-enabled: - - devices: all - mig-enabled: true - mig-devices: {} - - # A100-40GB, A800-40GB - all-1g.5gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.5gb": 7 - - all-1g.5gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.5gb+me": 1 - - 
all-2g.10gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.10gb": 3 - - all-3g.20gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.20gb": 2 - - all-4g.20gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.20gb": 1 - - all-7g.40gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.40gb": 1 - - # RTX-PRO-6000-96GB - all-1g.24gb.gfx: - - devices: all - mig-enabled: true - mig-devices: - "1g.24gb+gfx": 4 - - all-1g.24gb.me.all: - - devices: all - mig-enabled: true - mig-devices: - "1g.24gb+me.all": 1 - - all-1g.24gb-me: - - devices: all - mig-enabled: true - mig-devices: - "1g.24gb-me": 4 - - all-2g.48gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.48gb": 2 - - all-2g.48gb.gfx: - - devices: all - mig-enabled: true - mig-devices: - "2g.48gb+gfx": 2 - - all-2g.48gb.me.all: - - devices: all - mig-enabled: true - mig-devices: - "2g.48gb+me.all": 1 - - all-2g.48gb-me: - - devices: all - mig-enabled: true - mig-devices: - "2g.48gb-me": 2 - - all-4g.96gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.96gb": 1 - - all-4g.96gb.gfx: - - devices: all - mig-enabled: true - mig-devices: - "4g.96gb+gfx": 1 - - # H100-80GB, H800-80GB, A100-80GB, A800-80GB, A100-40GB, A800-40GB - all-1g.10gb: - # H100-80GB, H800-80GB, A100-80GB, A800-80GB - - device-filter: ["0x233010DE", "0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE", "0x232410DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.10gb": 7 - - # A100-40GB, A800-40GB - - device-filter: ["0x20B010DE", "0x20B110DE", "0x20F110DE", "0x20F610DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.10gb": 4 - - # H100-80GB, H800-80GB, A100-80GB, A800-80GB - all-1g.10gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.10gb+me": 1 - - # H100-80GB, H800-80GB, A100-80GB, A800-80GB - all-1g.20gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.20gb": 4 - - # GB200, B200 - all-1g.23gb: - - devices: all - 
mig-enabled: true - mig-devices: - "1g.23gb": 7 - - # GB200, B200 - all-1g.23gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.23gb+me": 1 - - all-1g.24gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.24gb+me": 1 - - all-2g.20gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.20gb": 3 - - all-3g.40gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.40gb": 2 - - all-4g.40gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.40gb": 1 - - all-7g.80gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.80gb": 1 - - # A30-24GB - all-1g.6gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.6gb": 4 - - all-1g.6gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.6gb+me": 1 - - all-2g.12gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.12gb": 2 - - all-2g.12gb.me: - - devices: all - mig-enabled: true - mig-devices: - "2g.12gb+me": 1 - - all-4g.24gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.24gb": 1 - - # H100 NVL, H800 NVL, GH200 - all-1g.12gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.12gb": 7 - - all-1g.12gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.12gb+me": 1 - - all-1g.24gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.24gb": 4 - - all-1g.45gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.45gb": 4 - - all-1g.47gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.47gb": 4 - - all-2g.24gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.24gb": 3 - - all-2g.45gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.45gb": 3 - - all-2g.47gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.47gb": 3 - - # H100 NVL, H800 NVL - all-3g.47gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.47gb": 2 - - all-4g.47gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.47gb": 1 - - all-7g.94gb: - - devices: all - mig-enabled: true - mig-devices: - 
"7g.94gb": 1 - - # H100-96GB, PG506-96GB, GH200 - all-3g.48gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.48gb": 2 - - all-3g.90gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.90gb": 2 - - all-3g.93gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.93gb": 2 - - all-3g.95gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.95gb": 2 - - all-4g.48gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.48gb": 1 - - all-4g.90gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.90gb": 1 - - all-4g.93gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.93gb": 1 - - all-4g.95gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.95gb": 1 - - all-7g.96gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.96gb": 1 - - all-7g.180gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.180gb": 1 - - all-7g.186gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.186gb": 1 - - all-7g.189gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.189gb": 1 - - # GB200 HGX, B200, GH200 144G HBM3e, H200-141GB, H200 NVL, H100-96GB, GH200, H100 NVL, H800 NVL, H100-80GB, H800-80GB, A800-40GB, A800-80GB, A100-40GB, A100-80GB, A30-24GB, PG506-96GB - all-balanced: - # GB200 HGX - - device-filter: ["0x294110DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.23gb": 2 - "2g.47gb": 1 - "3g.93gb": 1 - - # RTX-PRO-6000-96GB - - device-filter: ["0x2BB510DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.24gb": 2 - "2g.48gb": 1 - - # B200 - - device-filter: ["0x290110DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.23gb": 2 - "2g.45gb": 1 - "3g.90gb": 1 - - # GH200 144G HBM3e - - device-filter: ["0x234810DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.18gb": 2 - "2g.36gb": 1 - "3g.72gb": 1 - - # H200 141GB, H200 NVL - - device-filter: ["0x233510DE", "0x233B10DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.18gb": 2 - "2g.35gb": 1 - 
"3g.71gb": 1 - - # H100 NVL, H800 NVL - - device-filter: ["0x232110DE", "0x233A10DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.12gb": 2 - "2g.24gb": 1 - "3g.47gb": 1 - - # H100-80GB, H800-80GB, A100-80GB, A800-80GB - - device-filter: ["0x233010DE", "0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE", "0x232410DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.10gb": 2 - "2g.20gb": 1 - "3g.40gb": 1 - - # A100-40GB, A800-40GB - - device-filter: ["0x20B010DE", "0x20B110DE", "0x20F110DE", "0x20F610DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.5gb": 2 - "2g.10gb": 1 - "3g.20gb": 1 - - # A30-24GB - - device-filter: "0x20B710DE" - devices: all - mig-enabled: true - mig-devices: - "1g.6gb": 2 - "2g.12gb": 1 - - # H100-96GB, PG506-96GB, GH200, H20 - - device-filter: ["0x234210DE", "0x233D10DE", "0x20B610DE", "0x232910DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.12gb": 2 - "2g.24gb": 1 - "3g.48gb": 1 - - # B300 - - device-filter: ["0x318210DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.34gb": 2 - "2g.67gb": 1 - "3g.135gb": 1 - - # GB300 - - device-filter: ["0x31C210DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.35gb": 2 - "2g.70gb": 1 - "3g.139gb": 1 - - # H200-141GB, GH200 144G HBM3e - all-1g.18gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.18gb": 7 - - all-1g.18gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.18gb+me": 1 - - all-1g.35gb: - # H200-141GB - - device-filter: ["0x233510DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.35gb": 4 - # GB300 - - device-filter: ["0x31C210DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.35gb": 7 - - all-2g.35gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.35gb": 3 - - all-3g.71gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.71gb": 2 - - all-4g.71gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.71gb": 1 - - all-7g.141gb: - - 
devices: all - mig-enabled: true - mig-devices: - "7g.141gb": 1 - - # GH200 144G HBM3e - all-1g.36gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.36gb": 4 - - all-2g.36gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.36gb": 3 - - all-3g.72gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.72gb": 2 - - all-4g.72gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.72gb": 1 - - all-7g.144gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.144gb": 1 - - # B300 - all-1g.34gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.34gb": 7 - - all-1g.34gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.34gb+me": 1 - - all-1g.67gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.67gb": 4 - - all-2g.67gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.67gb": 3 - - all-3g.135gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.135gb": 2 - - all-4g.135gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.135gb": 1 - - all-7g.269gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.269gb": 1 - - # GB300 - all-1g.35gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.35gb+me": 1 - - all-1g.70gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.70gb": 4 - - all-2g.70gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.70gb": 3 - - all-3g.139gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.139gb": 2 - - all-4g.139gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.139gb": 1 - - all-7g.278gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.278gb": 1 diff --git a/assets/state-mig-manager/0600_daemonset.yaml b/assets/state-mig-manager/0600_daemonset.yaml index 1a9076169..c5ec8e283 100644 --- a/assets/state-mig-manager/0600_daemonset.yaml +++ b/assets/state-mig-manager/0600_daemonset.yaml @@ -47,8 +47,14 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName - - name: CONFIG_FILE - value: 
"/mig-parted-config/config.yaml" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace - name: GPU_CLIENTS_FILE value: "/gpu-clients/clients.yaml" - name: DEFAULT_GPU_CLIENTS_NAMESPACE @@ -66,8 +72,6 @@ spec: mountPath: /run/nvidia/validations - mountPath: /sys name: host-sys - - mountPath: /mig-parted-config - name: mig-parted-config - mountPath: /host name: host-root mountPropagation: HostToContainer @@ -87,9 +91,6 @@ spec: hostPath: path: /sys type: Directory - - name: mig-parted-config - configMap: - name: "FILLED_BY_OPERATOR" - name: run-nvidia-validations hostPath: path: "/run/nvidia/validations" diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 379e98d87..8d47b8ef5 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -1403,8 +1403,8 @@ spec: - "" type: string name: - default: default-mig-parted-config - description: ConfigMap name + description: ConfigMap name. If not specified, MIG configuration + will be dynamically generated from hardware. type: string type: object enabled: diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 379e98d87..8d47b8ef5 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -1403,8 +1403,8 @@ spec: - "" type: string name: - default: default-mig-parted-config - description: ConfigMap name + description: ConfigMap name. If not specified, MIG configuration + will be dynamically generated from hardware. 
type: string type: object enabled: diff --git a/controllers/object_controls.go b/controllers/object_controls.go index f811e8568..0e784fb26 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -23,13 +23,12 @@ import ( "fmt" "os" "path" + "path/filepath" "regexp" "sort" "strconv" "strings" - "path/filepath" - apiconfigv1 "github.com/openshift/api/config/v1" apiimagev1 "github.com/openshift/api/image/v1" secv1 "github.com/openshift/api/security/v1" @@ -98,8 +97,6 @@ const ( ValidatorRuntimeClassEnvName = "VALIDATOR_RUNTIME_CLASS" // MigStrategyEnvName indicates env name for passing MIG strategy MigStrategyEnvName = "MIG_STRATEGY" - // MigPartedDefaultConfigMapName indicates name of ConfigMap containing default mig-parted config - MigPartedDefaultConfigMapName = "default-mig-parted-config" // MigDefaultGPUClientsConfigMapName indicates name of ConfigMap containing default gpu-clients MigDefaultGPUClientsConfigMapName = "default-gpu-clients" // DCGMRemoteEngineEnvName indicates env name to specify remote DCGM host engine ip:port @@ -528,14 +525,6 @@ func createConfigMap(n ClusterPolicyController, configMapIdx int) (gpuv1.State, return gpuv1.Disabled, nil } - // avoid creating default 'mig-parted-config' ConfigMap if custom one is provided - if obj.Name == MigPartedDefaultConfigMapName { - if name, isCustom := gpuv1.GetConfigMapName(config.MIGManager.Config, MigPartedDefaultConfigMapName); isCustom { - logger.Info("Not creating resource, custom ConfigMap provided", "Name", name) - return gpuv1.Ready, nil - } - } - // avoid creating default 'gpu-clients' ConfigMap if custom one is provided if obj.Name == MigDefaultGPUClientsConfigMapName { if name, isCustom := gpuv1.GetConfigMapName(config.MIGManager.GPUClientsConfig, MigDefaultGPUClientsConfigMapName); isCustom { @@ -1898,15 +1887,32 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) - // set 
ConfigMap name for "mig-parted-config" Volume - for i, vol := range obj.Spec.Template.Spec.Volumes { - if !strings.Contains(vol.Name, "mig-parted-config") { - continue + // Only mount config volume and set CONFIG_FILE when custom config is provided + hasCustomConfig := config.MIGManager.Config != nil && config.MIGManager.Config.Name != "" + + if hasCustomConfig { + // CUSTOM CONFIG: Add ConfigMap volume/mount and set CONFIG_FILE + configMapName, _ := gpuv1.GetConfigMapName(config.MIGManager.Config, "") + + // Add mig-parted-config volume + migConfigVolume := createConfigMapVolume(configMapName, nil) + migConfigVolume.Name = "mig-parted-config" // Use standard name for consistency + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, migConfigVolume) + + // Add mig-parted-config volumeMount to container + migConfigMount := corev1.VolumeMount{ + Name: "mig-parted-config", + MountPath: "/mig-parted-config", + ReadOnly: true, } + obj.Spec.Template.Spec.Containers[0].VolumeMounts = append( + obj.Spec.Template.Spec.Containers[0].VolumeMounts, + migConfigMount, + ) - name, _ := gpuv1.GetConfigMapName(config.MIGManager.Config, MigPartedDefaultConfigMapName) - obj.Spec.Template.Spec.Volumes[i].ConfigMap.Name = name - break + // Add CONFIG_FILE env var pointing to mounted ConfigMap + // NOTE: Assumes custom ConfigMap has key "config.yaml" + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "CONFIG_FILE", "/mig-parted-config/config.yaml") } // set ConfigMap name for "gpu-clients" Volume diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 379e98d87..8d47b8ef5 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -1403,8 +1403,8 @@ spec: - "" type: string name: - default: default-mig-parted-config - description: ConfigMap name + description: ConfigMap name. 
If not specified, MIG configuration + will be dynamically generated from hardware. type: string type: object enabled: diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 14129a68d..7ae138dc8 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -343,9 +343,18 @@ migManager: env: [] resources: {} # MIG configuration - # Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations(i.e with create=true). - # Use "data" to build an integrated ConfigMap from a set of configurations as - # part of this helm chart. An example of setting "data" might be: + # NOTE: MIG manager automatically generates configuration from hardware on each node. + # Only provide a custom config if you need settings that differ from hardware discovery. + # + # To use an existing ConfigMap: + # - Set name="your-configmap-name" with create=false + # - ConfigMap MUST have a key named "config.yaml" + # + # To create a new ConfigMap via Helm: + # - Set create=true, name="your-configmap-name", and provide data below + # - If create=true but data is empty, ConfigMap creation is skipped + # + # Example of creating a custom ConfigMap: # config: # name: custom-mig-parted-configs # create: true @@ -377,9 +386,11 @@ migManager: default: "all-disabled" # Create a ConfigMap (default: false) create: false - # ConfigMap name (either existing or to create a new one with create=true above) + # ConfigMap name (either an existing ConfigMap or one to be created with create=true) + # If name is provided, mig-manager will use this config instead of the auto-generated one. + # REQUIREMENT: Custom ConfigMaps must contain a key named "config.yaml" name: "" - # Data section for the ConfigMap to create (i.e only applies when create=true) + # Data section for the ConfigMap (required only if create=true) data: {} gpuClientsConfig: name: ""