diff --git a/install.sh b/install.sh
index 26aede0..56ef127 100644
--- a/install.sh
+++ b/install.sh
@@ -21,6 +21,134 @@ echo "Open gridd: $open_gridd"
 set -euo pipefail
 
 # install cached nvidia debian packages for container runtime compatibility
+# Enable the nvidia-cdi-refresh systemd units when both are present.
+# Returns 1 when a unit is missing; callers invoke this best-effort (|| true)
+# because the whole script executes under set -euo pipefail.
+ensure_cdi_refresh_units() {
+  local missing_units=()
+  local units=("nvidia-cdi-refresh.path" "nvidia-cdi-refresh.service")
+  local enabled=()
+
+  for unit in "${units[@]}"; do
+    if systemctl cat "${unit}" >/dev/null 2>&1; then
+      systemctl enable "${unit}"
+      enabled+=("${unit}")
+    else
+      missing_units+=("${unit}")
+    fi
+  done
+
+  if [[ ${#missing_units[@]} -gt 0 ]]; then
+    echo "Missing expected systemd units: ${missing_units[*]}."
+    echo "Proceeding without automatic CDI refresh; containers may fail without manual nvidia-ctk cdi generate."
+    return 1
+  fi
+
+  if [[ ${#enabled[@]} -gt 0 ]]; then
+    echo "Enabled CDI refresh units: ${enabled[*]}"
+  fi
+
+  return 0
+}
+
+# Verify that `nvidia-ctk cdi list` works and advertises runtime.nvidia.com/gpu
+# devices, regenerating the spec at /var/run/cdi/nvidia.yaml when it does not.
+ensure_runtime_cdi_spec() {
+  local tmpfile
+  tmpfile=$(mktemp)
+
+  if ! nvidia-ctk cdi list >"${tmpfile}" 2>&1; then
+    echo "nvidia-ctk cdi list failed; attempting regeneration."
+    mkdir -p /var/run/cdi
+    if ! nvidia-ctk cdi generate --output=/var/run/cdi/nvidia.yaml; then
+      echo "Unable to generate CDI specification; see above logs."
+      cat "${tmpfile}"
+      rm -f "${tmpfile}"
+      return 1
+    fi
+    if ! nvidia-ctk cdi list >"${tmpfile}" 2>&1; then
+      echo "nvidia-ctk cdi list still failing after regeneration."
+      cat "${tmpfile}"
+      rm -f "${tmpfile}"
+      return 1
+    fi
+  fi
+
+  if ! grep -q "runtime.nvidia.com/gpu" "${tmpfile}"; then
+    echo "runtime.nvidia.com/gpu devices not found; forcing CDI spec regeneration."
+    mkdir -p /var/run/cdi
+    if ! nvidia-ctk cdi generate --output=/var/run/cdi/nvidia.yaml; then
+      echo "Unable to generate CDI specification containing runtime.nvidia.com aliases."
+      cat "${tmpfile}"
+      rm -f "${tmpfile}"
+      return 1
+    fi
+    if ! nvidia-ctk cdi list >"${tmpfile}" 2>&1 || ! grep -q "runtime.nvidia.com/gpu" "${tmpfile}"; then
+      echo "CDI specification still missing runtime.nvidia.com devices after regeneration."
+      cat "${tmpfile}"
+      rm -f "${tmpfile}"
+      return 1
+    fi
+  fi
+
+  rm -f "${tmpfile}"
+}
+
+# Log the enabled/active state of the CDI refresh units for install diagnostics.
+log_cdi_refresh_status() {
+  local service_status path_status
+
+  if systemctl is-enabled nvidia-cdi-refresh.service &>/dev/null; then
+    service_status="enabled"
+  else
+    service_status="disabled"
+  fi
+
+  if systemctl is-enabled nvidia-cdi-refresh.path &>/dev/null; then
+    path_status="enabled"
+  else
+    path_status="disabled"
+  fi
+
+  echo "CDI refresh units status: service=${service_status}, path=${path_status}"
+
+  if systemctl is-active nvidia-cdi-refresh.service &>/dev/null; then
+    echo "nvidia-cdi-refresh.service is active"
+  else
+    echo "Warning: nvidia-cdi-refresh.service is not active"
+  fi
+}
+
+# Start the CDI refresh units (best effort), then verify the CDI spec and log
+# unit status.  Failures are downgraded to warnings so the installer continues.
+start_cdi_refresh_units() {
+  local units=("nvidia-cdi-refresh.path" "nvidia-cdi-refresh.service")
+  local started=false
+  local started_units=()
+
+  for unit in "${units[@]}"; do
+    if systemctl cat "${unit}" >/dev/null 2>&1; then
+      if systemctl start "${unit}"; then
+        started=true
+        started_units+=("${unit}")
+      else
+        echo "Warning: failed to start ${unit}; will fall back to nvidia-ctk cdi generate."
+      fi
+    fi
+  done
+
+  if ! $started; then
+    echo "Warning: unable to start any nvidia-cdi-refresh units; falling back to manual CDI generation."
+  else
+    echo "Started CDI refresh units: ${started_units[*]}"
+  fi
+
+  # Guarded: a bare call that returns 1 would abort the script under set -e
+  # and skip the status log below.
+  ensure_runtime_cdi_spec || echo "Warning: CDI spec verification failed; run 'nvidia-ctk cdi generate' manually if containers cannot see GPUs."
+  log_cdi_refresh_status
+}
+
 install_cached_nvidia_packages() {
 for apt_package in $NVIDIA_PACKAGES; do
 dpkg -i --force-overwrite /opt/gpu/${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}*
@@ -28,6 +156,7 @@ done
 }
 
 use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packages 10 3
+ensure_cdi_refresh_units || true # best effort: the function returns 1 on missing units and the script runs under set -euo pipefail
 
 # blacklist nouveau driver, nvidia driver dependency
 cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf
@@ -103,6 +232,8 @@ if [[ "${DRIVER_KIND}" == "cuda" ]]; then
   bash /opt/gpu/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
 fi
 
+start_cdi_refresh_units
+
 mkdir -p /etc/containerd/config.d
 cp /opt/gpu/10-nvidia-runtime.toml /etc/containerd/config.d/10-nvidia-runtime.toml
 