# Patch summary (reviewer header; everything from the first "diff --git" onward is the unchanged patch text).
#
# Purpose: replace hard-coded container environment values in the nvidia-device-plugin Helm chart
# with lookups into values.yaml, and add the matching defaults.
#
# Files touched, all under deployments/helm/nvidia-device-plugin/:
#   - Chart.yaml: chart `version` goes from "0.18.0" to "0.19.0"; `appVersion` stays "0.18.0" (context line).
#   - templates/daemonset-device-plugin.yml, templates/daemonset-gfd.yml,
#     templates/daemonset-mps-control-daemon.yml:
#       * first config-manager container: ONESHOT, NODE_LABEL, CONFIG_FILE_SRCDIR, CONFIG_FILE_DST,
#         SEND_SIGNAL, SIGNAL and PROCESS_TO_SIGNAL now come from `.Values.configManager.init.*`.
#       * second config-manager container: ONESHOT, SEND_SIGNAL and SIGNAL come from
#         `.Values.configManager.sidecar.*`; NODE_LABEL, CONFIG_FILE_SRCDIR and CONFIG_FILE_DST come
#         from `.Values.configManager.init.*`; PROCESS_TO_SIGNAL stays a per-template literal
#         ("nvidia-device-plugin", "gpu-feature-discovery", "/usr/bin/mps-control-daemon").
#       * main container: CONFIG_FILE and NVIDIA_MIG_MONITOR_DEVICES come from `.Values.env.*` in all
#         three templates; NVIDIA_VISIBLE_DEVICES and NVIDIA_DRIVER_CAPABILITIES are templated in the
#         device-plugin and mps-control-daemon hunks.
#       * every newly templated value is piped through `quote`.
#   - values.yaml: adds the `configManager.init`, `configManager.sidecar` and `env` / `env.nvidia`
#     maps. Each default equals the literal removed from the templates, so a default render should
#     be unchanged apart from quoting; the trailing "# SIGHUP" comment on the removed SIGNAL lines is dropped.
#
# NOTE(review): this copy of the patch has lost its line breaks and indentation, so hunk line counts
#   and context whitespace cannot be checked here; regenerate it from the branch before applying.
# NOTE(review): the sidecar reads three settings from `configManager.init.*` and values.yaml defines
#   no sidecar equivalents. This looks like deliberate sharing, but the key placement under `init`
#   is surprising; confirm the intent.
# NOTE(review): `mountPath: /available-configs` remains a literal context line while
#   CONFIG_FILE_SRCDIR becomes configurable; presumably overriding `configFileSrcDir` alone would
#   point config-manager at an unmounted path. Verify against the volume definitions.
# NOTE(review): `configManager.init.configFileDst` and `env.configFile` are independent values that
#   both default to "/config/config.yaml"; presumably they must be kept in sync. Confirm.
# NOTE(review): the values.yaml comment says `env` applies to gpu-feature-discovery as well, but the
#   gfd hunks shown here only template CONFIG_FILE and NVIDIA_MIG_MONITOR_DEVICES; whether that
#   DaemonSet sets the other NVIDIA_* variables outside the hunks is not visible from this patch.
# NOTE(review): the DEFAULT_CONFIG and FALLBACK_STRATEGIES context lines are still unquoted template
#   expressions, unlike the lines this patch changes. Out of scope here, but worth a follow-up.
diff --git a/deployments/helm/nvidia-device-plugin/Chart.yaml b/deployments/helm/nvidia-device-plugin/Chart.yaml index be2d66b8a..2dc1f3021 100644 --- a/deployments/helm/nvidia-device-plugin/Chart.yaml +++ b/deployments/helm/nvidia-device-plugin/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: nvidia-device-plugin type: application description: A Helm chart for the nvidia-device-plugin on Kubernetes -version: "0.18.0" +version: "0.19.0" appVersion: "0.18.0" kubeVersion: ">= 1.10.0-0" home: https://github.com/NVIDIA/k8s-device-plugin diff --git a/deployments/helm/nvidia-device-plugin/templates/daemonset-device-plugin.yml b/deployments/helm/nvidia-device-plugin/templates/daemonset-device-plugin.yml index 6cfa5042b..71e8c5094 100644 --- a/deployments/helm/nvidia-device-plugin/templates/daemonset-device-plugin.yml +++ b/deployments/helm/nvidia-device-plugin/templates/daemonset-device-plugin.yml @@ -63,7 +63,7 @@ spec: command: ["config-manager"] env: - name: ONESHOT - value: "true" + value: {{ .Values.configManager.init.oneshot | quote }} - name: KUBECONFIG value: "" - name: NODE_NAME @@ -71,21 +71,21 @@ spec: fieldRef: fieldPath: "spec.nodeName" - name: NODE_LABEL - value: "nvidia.com/device-plugin.config" + value: {{ .Values.configManager.init.nodeLabel | quote }} - name: CONFIG_FILE_SRCDIR - value: "/available-configs" + value: {{ .Values.configManager.init.configFileSrcDir | quote }} - name: CONFIG_FILE_DST - value: "/config/config.yaml" + value: {{ .Values.configManager.init.configFileDst | quote }} - name: DEFAULT_CONFIG value: {{ .Values.config.default }} - name: FALLBACK_STRATEGIES value: {{ join "," .Values.config.fallbackStrategies }} - name: SEND_SIGNAL - value: "false" + value: {{ .Values.configManager.init.sendSignal | quote }} - name: SIGNAL - value: "" + value: {{ .Values.configManager.init.signal | quote }} - name: PROCESS_TO_SIGNAL - value: "" + value: {{ .Values.configManager.init.processToSignal | quote }} volumeMounts: - name: available-configs 
mountPath: /available-configs @@ -99,7 +99,7 @@ spec: command: ["config-manager"] env: - name: ONESHOT - value: "false" + value: {{ .Values.configManager.sidecar.oneshot | quote }} - name: KUBECONFIG value: "" - name: NODE_NAME @@ -107,19 +107,19 @@ spec: fieldRef: fieldPath: "spec.nodeName" - name: NODE_LABEL - value: "nvidia.com/device-plugin.config" + value: {{ .Values.configManager.init.nodeLabel | quote }} - name: CONFIG_FILE_SRCDIR - value: "/available-configs" + value: {{ .Values.configManager.init.configFileSrcDir | quote }} - name: CONFIG_FILE_DST - value: "/config/config.yaml" + value: {{ .Values.configManager.init.configFileDst | quote }} - name: DEFAULT_CONFIG value: {{ .Values.config.default }} - name: FALLBACK_STRATEGIES value: {{ join "," .Values.config.fallbackStrategies }} - name: SEND_SIGNAL - value: "true" + value: {{ .Values.configManager.sidecar.sendSignal | quote }} - name: SIGNAL - value: "1" # SIGHUP + value: {{ .Values.configManager.sidecar.signal | quote }} - name: PROCESS_TO_SIGNAL value: "nvidia-device-plugin" volumeMounts: @@ -187,20 +187,20 @@ spec: {{- end }} {{- if $options.hasConfigMap }} - name: CONFIG_FILE - value: /config/config.yaml + value: {{ .Values.env.configFile | quote }} {{- end }} {{- if $options.addMigMonitorDevices }} - name: NVIDIA_MIG_MONITOR_DEVICES - value: all + value: {{ .Values.env.nvidia.migMonitorDevices | quote }} {{- end }} {{- if typeIs "string" .Values.deviceDiscoveryStrategy }} - name: DEVICE_DISCOVERY_STRATEGY value: {{ .Values.deviceDiscoveryStrategy }} {{- end }} - name: NVIDIA_VISIBLE_DEVICES - value: all + value: {{ .Values.env.nvidia.visibleDevices | quote }} - name: NVIDIA_DRIVER_CAPABILITIES - value: compute,utility + value: {{ .Values.env.nvidia.driverCapabilities | quote }} securityContext: {{- include "nvidia-device-plugin.securityContext" . 
| nindent 10 }} volumeMounts: diff --git a/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml b/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml index 9a7a5e516..69d25e263 100644 --- a/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml +++ b/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml @@ -65,7 +65,7 @@ spec: command: ["config-manager"] env: - name: ONESHOT - value: "true" + value: {{ .Values.configManager.init.oneshot | quote }} - name: KUBECONFIG value: "" - name: NODE_NAME @@ -73,21 +73,21 @@ spec: fieldRef: fieldPath: "spec.nodeName" - name: NODE_LABEL - value: "nvidia.com/device-plugin.config" + value: {{ .Values.configManager.init.nodeLabel | quote }} - name: CONFIG_FILE_SRCDIR - value: "/available-configs" + value: {{ .Values.configManager.init.configFileSrcDir | quote }} - name: CONFIG_FILE_DST - value: "/config/config.yaml" + value: {{ .Values.configManager.init.configFileDst | quote }} - name: DEFAULT_CONFIG value: {{ .Values.config.default }} - name: FALLBACK_STRATEGIES value: {{ join "," .Values.config.fallbackStrategies }} - name: SEND_SIGNAL - value: "false" + value: {{ .Values.configManager.init.sendSignal | quote }} - name: SIGNAL - value: "" + value: {{ .Values.configManager.init.signal | quote }} - name: PROCESS_TO_SIGNAL - value: "" + value: {{ .Values.configManager.init.processToSignal | quote }} volumeMounts: - name: available-configs mountPath: /available-configs @@ -101,7 +101,7 @@ spec: command: ["config-manager"] env: - name: ONESHOT - value: "false" + value: {{ .Values.configManager.sidecar.oneshot | quote }} - name: KUBECONFIG value: "" - name: NODE_NAME @@ -109,19 +109,19 @@ spec: fieldRef: fieldPath: "spec.nodeName" - name: NODE_LABEL - value: "nvidia.com/device-plugin.config" + value: {{ .Values.configManager.init.nodeLabel | quote }} - name: CONFIG_FILE_SRCDIR - value: "/available-configs" + value: {{ .Values.configManager.init.configFileSrcDir | quote }} - name: 
CONFIG_FILE_DST - value: "/config/config.yaml" + value: {{ .Values.configManager.init.configFileDst | quote }} - name: DEFAULT_CONFIG value: {{ .Values.config.default }} - name: FALLBACK_STRATEGIES value: {{ join "," .Values.config.fallbackStrategies }} - name: SEND_SIGNAL - value: "true" + value: {{ .Values.configManager.sidecar.sendSignal | quote }} - name: SIGNAL - value: "1" # SIGHUP + value: {{ .Values.configManager.sidecar.signal | quote }} - name: PROCESS_TO_SIGNAL value: "gpu-feature-discovery" volumeMounts: @@ -167,11 +167,11 @@ spec: {{- end }} {{- if $options.hasConfigMap }} - name: CONFIG_FILE - value: /config/config.yaml + value: {{ .Values.env.configFile | quote }} {{- end }} {{- if $options.addMigMonitorDevices }} - name: NVIDIA_MIG_MONITOR_DEVICES - value: all + value: {{ .Values.env.nvidia.migMonitorDevices | quote }} {{- end }} {{- if typeIs "string" .Values.deviceDiscoveryStrategy }} - name: DEVICE_DISCOVERY_STRATEGY diff --git a/deployments/helm/nvidia-device-plugin/templates/daemonset-mps-control-daemon.yml b/deployments/helm/nvidia-device-plugin/templates/daemonset-mps-control-daemon.yml index da37aba6d..00972f641 100644 --- a/deployments/helm/nvidia-device-plugin/templates/daemonset-mps-control-daemon.yml +++ b/deployments/helm/nvidia-device-plugin/templates/daemonset-mps-control-daemon.yml @@ -79,7 +79,7 @@ spec: command: ["config-manager"] env: - name: ONESHOT - value: "true" + value: {{ .Values.configManager.init.oneshot | quote }} - name: KUBECONFIG value: "" - name: NODE_NAME @@ -87,21 +87,21 @@ spec: fieldRef: fieldPath: "spec.nodeName" - name: NODE_LABEL - value: "nvidia.com/device-plugin.config" + value: {{ .Values.configManager.init.nodeLabel | quote }} - name: CONFIG_FILE_SRCDIR - value: "/available-configs" + value: {{ .Values.configManager.init.configFileSrcDir | quote }} - name: CONFIG_FILE_DST - value: "/config/config.yaml" + value: {{ .Values.configManager.init.configFileDst | quote }} - name: DEFAULT_CONFIG value: {{ 
.Values.config.default }} - name: FALLBACK_STRATEGIES value: {{ join "," .Values.config.fallbackStrategies }} - name: SEND_SIGNAL - value: "false" + value: {{ .Values.configManager.init.sendSignal | quote }} - name: SIGNAL - value: "" + value: {{ .Values.configManager.init.signal | quote }} - name: PROCESS_TO_SIGNAL - value: "" + value: {{ .Values.configManager.init.processToSignal | quote }} volumeMounts: - name: available-configs mountPath: /available-configs @@ -116,7 +116,7 @@ spec: command: ["config-manager"] env: - name: ONESHOT - value: "false" + value: {{ .Values.configManager.sidecar.oneshot | quote }} - name: KUBECONFIG value: "" - name: NODE_NAME @@ -124,19 +124,19 @@ spec: fieldRef: fieldPath: "spec.nodeName" - name: NODE_LABEL - value: "nvidia.com/device-plugin.config" + value: {{ .Values.configManager.init.nodeLabel | quote }} - name: CONFIG_FILE_SRCDIR - value: "/available-configs" + value: {{ .Values.configManager.init.configFileSrcDir | quote }} - name: CONFIG_FILE_DST - value: "/config/config.yaml" + value: {{ .Values.configManager.init.configFileDst | quote }} - name: DEFAULT_CONFIG value: {{ .Values.config.default }} - name: FALLBACK_STRATEGIES value: {{ join "," .Values.config.fallbackStrategies }} - name: SEND_SIGNAL - value: "true" + value: {{ .Values.configManager.sidecar.sendSignal | quote }} - name: SIGNAL - value: "1" + value: {{ .Values.configManager.sidecar.signal | quote }} - name: PROCESS_TO_SIGNAL value: "/usr/bin/mps-control-daemon" volumeMounts: @@ -161,16 +161,16 @@ spec: {{- end }} {{- if $options.hasConfigMap }} - name: CONFIG_FILE - value: /config/config.yaml + value: {{ .Values.env.configFile | quote }} {{- end }} {{- if $options.addMigMonitorDevices }} - name: NVIDIA_MIG_MONITOR_DEVICES - value: all + value: {{ .Values.env.nvidia.migMonitorDevices | quote }} {{- end }} - name: NVIDIA_VISIBLE_DEVICES - value: all + value: {{ .Values.env.nvidia.visibleDevices | quote }} - name: NVIDIA_DRIVER_CAPABILITIES - value: 
compute,utility + value: {{ .Values.env.nvidia.driverCapabilities | quote }} securityContext: privileged: true volumeMounts: diff --git a/deployments/helm/nvidia-device-plugin/values.yaml b/deployments/helm/nvidia-device-plugin/values.yaml index a02688cfa..6a95ac185 100644 --- a/deployments/helm/nvidia-device-plugin/values.yaml +++ b/deployments/helm/nvidia-device-plugin/values.yaml @@ -27,6 +27,55 @@ config: # List of fallback strategies to attempt if no config is selected and no default is provided fallbackStrategies: ["named" , "single"] +# Configuration Manager Environment Variables +# Controls config-manager init and sidecar containers that handle dynamic config updates +configManager: + # Init container configuration (runs once on pod startup) + init: + # Kubernetes node label to watch for device plugin configuration + nodeLabel: "nvidia.com/device-plugin.config" + # Directory containing available configuration files + configFileSrcDir: "/available-configs" + # Destination path for the active configuration file + configFileDst: "/config/config.yaml" + # Whether to send signal after config update (init container: false) + sendSignal: "false" + # Signal number to send (empty for init container) + signal: "" + # Process name to signal (empty for init container) + processToSignal: "" + # Run once and exit (init container: true) + oneshot: "true" + + # Sidecar container configuration (runs continuously for hot-reload) + sidecar: + # Run continuously to watch for config changes + oneshot: "false" + # Send signal to main container after config update + sendSignal: "true" + # SIGHUP (1) to trigger config reload + signal: "1" + # Process name varies by DaemonSet and is set automatically in templates + +# Main Container Environment Variables +# Apply to device-plugin, gpu-feature-discovery, and mps-control-daemon containers +env: + # Path to the active configuration file + configFile: "/config/config.yaml" + + # NVIDIA runtime environment variables + nvidia: + # MIG 
(Multi-Instance GPU) devices to monitor + # Options: "all" or comma-separated GPU indices + migMonitorDevices: "all" + # GPU devices visible to containers + # Options: "all", "none", or comma-separated GPU indices/UUIDs + visibleDevices: "all" + # NVIDIA driver capabilities to enable + # Common values: "compute,utility" for standard workloads + # Full list: compute, compat32, graphics, utility, video, display, ngx + driverCapabilities: "compute,utility" + compatWithCPUManager: null migStrategy: null failOnInitError: null