From b1679ad477dbb047290a17bc2340356cb9500103 Mon Sep 17 00:00:00 2001
From: sulixu
Date: Wed, 25 Feb 2026 11:37:23 -0800
Subject: [PATCH 1/3] enable nvidia-cdi-refresh.path and nvidia-cdi-refresh.service

Add two helpers to install.sh and call them right after the cached
NVIDIA packages are installed:

* ensure_cdi_refresh_units enables nvidia-cdi-refresh.path and
  nvidia-cdi-refresh.service when `systemctl cat` can find them, and
  reports any unit that is missing.
* ensure_runtime_cdi_spec runs `nvidia-ctk cdi list` and regenerates
  /var/run/cdi/nvidia.yaml with `nvidia-ctk cdi generate` when the list
  command fails or shows no runtime.nvidia.com/gpu devices.

Missing units are reported but are not fatal. The helper is called as a
bare statement while `set -euo pipefail` is active, so a non-zero return
would terminate the script right after printing "Proceeding without
automatic CDI refresh".
---
 install.sh | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/install.sh b/install.sh
index 26aede0..d35b1b2 100644
--- a/install.sh
+++ b/install.sh
@@ -21,6 +21,68 @@ echo "Open gridd: $open_gridd"
 set -euo pipefail
 
 # install cached nvidia debian packages for container runtime compatibility
+ensure_cdi_refresh_units() {
+    local missing_units=()
+    local units=("nvidia-cdi-refresh.path" "nvidia-cdi-refresh.service")
+
+    for unit in "${units[@]}"; do
+        if systemctl cat "${unit}" >/dev/null 2>&1; then
+            systemctl enable --now "${unit}"
+        else
+            missing_units+=("${unit}")
+        fi
+    done
+
+    if [[ ${#missing_units[@]} -gt 0 ]]; then
+        echo "Missing expected systemd units: ${missing_units[*]}."
+        echo "Proceeding without automatic CDI refresh; containers may fail without manual nvidia-ctk cdi generate."
+        return 0 # best-effort: a non-zero return would abort the script under 'set -e'
+    fi
+
+    return 0
+}
+
+ensure_runtime_cdi_spec() {
+    local tmpfile
+    tmpfile=$(mktemp)
+
+    if ! nvidia-ctk cdi list >"${tmpfile}" 2>&1; then
+        echo "nvidia-ctk cdi list failed; attempting regeneration."
+        mkdir -p /var/run/cdi
+        if ! nvidia-ctk cdi generate --output=/var/run/cdi/nvidia.yaml; then
+            echo "Unable to generate CDI specification; see above logs."
+            cat "${tmpfile}"
+            rm -f "${tmpfile}"
+            return 1
+        fi
+        if ! nvidia-ctk cdi list >"${tmpfile}"; then
+            echo "nvidia-ctk cdi list still failing after regeneration."
+            cat "${tmpfile}"
+            rm -f "${tmpfile}"
+            return 1
+        fi
+    fi
+
+    if ! grep -q "runtime.nvidia.com/gpu" "${tmpfile}"; then
+        echo "runtime.nvidia.com/gpu devices not found; forcing CDI spec regeneration."
+        mkdir -p /var/run/cdi
+        if ! nvidia-ctk cdi generate --output=/var/run/cdi/nvidia.yaml; then
+            echo "Unable to generate CDI specification containing runtime.nvidia.com aliases."
+            cat "${tmpfile}"
+            rm -f "${tmpfile}"
+            return 1
+        fi
+        if ! nvidia-ctk cdi list >"${tmpfile}" || ! grep -q "runtime.nvidia.com/gpu" "${tmpfile}"; then
+            echo "CDI specification still missing runtime.nvidia.com devices after regeneration."
+            cat "${tmpfile}"
+            rm -f "${tmpfile}"
+            return 1
+        fi
+    fi
+
+    rm -f "${tmpfile}"
+}
+
 install_cached_nvidia_packages() {
 for apt_package in $NVIDIA_PACKAGES; do
   dpkg -i --force-overwrite /opt/gpu/${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}*
@@ -28,6 +90,8 @@ done
 }
 
 use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packages 10 3
+ensure_cdi_refresh_units
+ensure_runtime_cdi_spec
 
 # blacklist nouveau driver, nvidia driver dependency
 cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf

From dc0478f336660298882c004f47a43bf7f7d0b5cc Mon Sep 17 00:00:00 2001
From: sulixu
Date: Wed, 25 Feb 2026 16:15:46 -0800
Subject: [PATCH 2/3] update

Split enabling from starting. ensure_cdi_refresh_units now only runs
`systemctl enable` (no `--now`) at package-install time, and the CDI
spec check is no longer run at that point.

A new start_cdi_refresh_units helper starts whichever of the two units
exist, warns when a start fails or when nothing could be started, and
then runs ensure_runtime_cdi_spec. It is called after the
DRIVER_KIND == "cuda" block and before the containerd config.d setup.
---
 install.sh | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/install.sh b/install.sh
index d35b1b2..7b0b4eb 100644
--- a/install.sh
+++ b/install.sh
@@ -27,7 +27,7 @@ ensure_cdi_refresh_units() {
 
     for unit in "${units[@]}"; do
         if systemctl cat "${unit}" >/dev/null 2>&1; then
-            systemctl enable --now "${unit}"
+            systemctl enable "${unit}"
         else
             missing_units+=("${unit}")
         fi
@@ -83,6 +83,27 @@ ensure_runtime_cdi_spec() {
     rm -f "${tmpfile}"
 }
 
+start_cdi_refresh_units() {
+    local units=("nvidia-cdi-refresh.path" "nvidia-cdi-refresh.service")
+    local started=false
+
+    for unit in "${units[@]}"; do
+        if systemctl cat "${unit}" >/dev/null 2>&1; then
+            if systemctl start "${unit}"; then
+                started=true
+            else
+                echo "Warning: failed to start ${unit}; will fall back to nvidia-ctk cdi generate."
+            fi
+        fi
+    done
+
+    if ! $started; then
+        echo "Warning: unable to start any nvidia-cdi-refresh units; falling back to manual CDI generation."
+    fi
+
+    ensure_runtime_cdi_spec
+}
+
 install_cached_nvidia_packages() {
 for apt_package in $NVIDIA_PACKAGES; do
   dpkg -i --force-overwrite /opt/gpu/${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}*
@@ -91,7 +112,6 @@ done
 
 use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packages 10 3
 ensure_cdi_refresh_units
-ensure_runtime_cdi_spec
 
 # blacklist nouveau driver, nvidia driver dependency
 cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf
@@ -167,6 +187,8 @@ if [[ "${DRIVER_KIND}" == "cuda" ]]; then
     bash /opt/gpu/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
 fi
 
+start_cdi_refresh_units
+
 mkdir -p /etc/containerd/config.d
 cp /opt/gpu/10-nvidia-runtime.toml /etc/containerd/config.d/10-nvidia-runtime.toml
 

From a910c631cf6341b8c14bf1787e7143fda48af7e1 Mon Sep 17 00:00:00 2001
From: sulixu
Date: Wed, 25 Feb 2026 18:13:47 -0800
Subject: [PATCH 3/3] add log

Log which CDI refresh units were enabled and which were started, and
add log_cdi_refresh_status, which prints the `systemctl is-enabled`
state of both units and whether nvidia-cdi-refresh.service is active.
start_cdi_refresh_units calls it after ensure_runtime_cdi_spec.
---
 install.sh | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/install.sh b/install.sh
index 7b0b4eb..56ef127 100644
--- a/install.sh
+++ b/install.sh
@@ -24,10 +24,12 @@ set -euo pipefail
 ensure_cdi_refresh_units() {
     local missing_units=()
     local units=("nvidia-cdi-refresh.path" "nvidia-cdi-refresh.service")
+    local enabled=()
 
     for unit in "${units[@]}"; do
         if systemctl cat "${unit}" >/dev/null 2>&1; then
             systemctl enable "${unit}"
+            enabled+=("${unit}")
         else
             missing_units+=("${unit}")
         fi
@@ -39,6 +41,10 @@ ensure_cdi_refresh_units() {
         return 0 # best-effort: a non-zero return would abort the script under 'set -e'
     fi
 
+    if [[ ${#enabled[@]} -gt 0 ]]; then
+        echo "Enabled CDI refresh units: ${enabled[*]}"
+    fi
+
     return 0
 }
 
@@ -83,14 +89,40 @@ ensure_runtime_cdi_spec() {
     rm -f "${tmpfile}"
 }
 
+log_cdi_refresh_status() {
+    local service_status path_status
+
+    if systemctl is-enabled nvidia-cdi-refresh.service &>/dev/null; then
+        service_status="enabled"
+    else
+        service_status="disabled"
+    fi
+
+    if systemctl is-enabled nvidia-cdi-refresh.path &>/dev/null; then
+        path_status="enabled"
+    else
+        path_status="disabled"
+    fi
+
+    echo "CDI refresh units status: service=${service_status}, path=${path_status}"
+
+    if systemctl is-active nvidia-cdi-refresh.service &>/dev/null; then
+        echo "nvidia-cdi-refresh.service is active"
+    else
+        echo "Warning: nvidia-cdi-refresh.service is not active"
+    fi
+}
+
 start_cdi_refresh_units() {
     local units=("nvidia-cdi-refresh.path" "nvidia-cdi-refresh.service")
     local started=false
+    local started_units=()
 
     for unit in "${units[@]}"; do
         if systemctl cat "${unit}" >/dev/null 2>&1; then
             if systemctl start "${unit}"; then
                 started=true
+                started_units+=("${unit}")
             else
                 echo "Warning: failed to start ${unit}; will fall back to nvidia-ctk cdi generate."
             fi
@@ -99,9 +131,12 @@ start_cdi_refresh_units() {
 
     if ! $started; then
         echo "Warning: unable to start any nvidia-cdi-refresh units; falling back to manual CDI generation."
+    else
+        echo "Started CDI refresh units: ${started_units[*]}"
     fi
 
     ensure_runtime_cdi_spec
+    log_cdi_refresh_status
 }
 
 install_cached_nvidia_packages() {