Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,132 @@ echo "Open gridd: $open_gridd"
set -euo pipefail

# CDI refresh helpers, followed by installation of the cached nvidia debian packages for container runtime compatibility
#######################################
# Enable the nvidia-cdi-refresh systemd units that are installed on the host.
# Each unit is probed with `systemctl cat`; units that exist are enabled,
# units that do not exist are reported as missing.
# Globals:   none
# Arguments: none
# Outputs:   status messages on stdout
# Returns:   0 when every expected unit exists and was enabled;
#            1 when a unit is missing or `systemctl enable` failed for it.
#######################################
ensure_cdi_refresh_units() {
  local -a units=("nvidia-cdi-refresh.path" "nvidia-cdi-refresh.service")
  local -a missing_units=()
  local -a enabled=()
  local -a failed_units=()
  local unit

  for unit in "${units[@]}"; do
    if ! systemctl cat "${unit}" >/dev/null 2>&1; then
      missing_units+=("${unit}")
    elif systemctl enable "${unit}"; then
      enabled+=("${unit}")
    else
      # Check the result explicitly: when the caller suppresses errexit a
      # failed enable would otherwise be silently recorded as a success.
      echo "Failed to enable ${unit}."
      failed_units+=("${unit}")
    fi
  done

  if [[ ${#missing_units[@]} -gt 0 ]]; then
    echo "Missing expected systemd units: ${missing_units[*]}."
    echo "Proceeding without automatic CDI refresh; containers may fail without manual nvidia-ctk cdi generate."
    return 1
  fi

  if [[ ${#enabled[@]} -gt 0 ]]; then
    echo "Enabled CDI refresh units: ${enabled[*]}"
  fi

  if [[ ${#failed_units[@]} -gt 0 ]]; then
    return 1
  fi

  return 0
}

#######################################
# Verify that `nvidia-ctk cdi list` works and reports runtime.nvidia.com/gpu
# devices, regenerating /var/run/cdi/nvidia.yaml when it does not.
# Globals:   none
# Arguments: none
# Outputs:   progress messages on stdout; on failure, the captured output of
#            the most recent `nvidia-ctk cdi list` invocation
# Returns:   0 when the listing contains runtime.nvidia.com/gpu;
#            1 when listing/generation fails or the devices are still absent.
#######################################
ensure_runtime_cdi_spec() {
  local -r cdi_dir="/var/run/cdi"
  local -r cdi_spec="${cdi_dir}/nvidia.yaml"
  local -r runtime_device="runtime.nvidia.com/gpu"
  local tmpfile
  local rc=0

  if ! tmpfile=$(mktemp); then
    echo "Unable to create a temporary file for nvidia-ctk cdi list output."
    return 1
  fi

  # Step 1: the listing itself must succeed; regenerate the spec once if not.
  if ! nvidia-ctk cdi list >"${tmpfile}" 2>&1; then
    echo "nvidia-ctk cdi list failed; attempting regeneration."
    if ! { mkdir -p "${cdi_dir}" && nvidia-ctk cdi generate --output="${cdi_spec}"; }; then
      echo "Unable to generate CDI specification; see above logs."
      rc=1
    elif ! nvidia-ctk cdi list >"${tmpfile}" 2>&1; then
      echo "nvidia-ctk cdi list still failing after regeneration."
      rc=1
    fi
  fi

  # Step 2: the listing must mention the runtime device. -F matches the name
  # literally so the dots are not treated as regex wildcards.
  # NOTE(review): this assumes a plain `nvidia-ctk cdi generate` emits
  # runtime.nvidia.com devices - confirm against the installed toolkit version.
  if [[ ${rc} -eq 0 ]] && ! grep -qF -- "${runtime_device}" "${tmpfile}"; then
    echo "runtime.nvidia.com/gpu devices not found; forcing CDI spec regeneration."
    if ! { mkdir -p "${cdi_dir}" && nvidia-ctk cdi generate --output="${cdi_spec}"; }; then
      echo "Unable to generate CDI specification containing runtime.nvidia.com aliases."
      rc=1
    elif ! nvidia-ctk cdi list >"${tmpfile}" 2>&1 \
      || ! grep -qF -- "${runtime_device}" "${tmpfile}"; then
      echo "CDI specification still missing runtime.nvidia.com devices after regeneration."
      rc=1
    fi
  fi

  # Single exit path: dump the captured listing on failure and always remove
  # the temporary file.
  if [[ ${rc} -ne 0 ]]; then
    cat "${tmpfile}"
  fi
  rm -f -- "${tmpfile}"
  return "${rc}"
}

#######################################
# Print the enablement state of the nvidia-cdi-refresh service and path units,
# followed by whether the service unit is currently active.
# Globals:   none
# Arguments: none
# Outputs:   two status lines on stdout
#######################################
log_cdi_refresh_status() {
  # Assume "disabled" and upgrade to "enabled" only when systemctl confirms it.
  local svc_state="disabled"
  local path_state="disabled"

  if systemctl is-enabled nvidia-cdi-refresh.service &>/dev/null; then
    svc_state="enabled"
  fi

  if systemctl is-enabled nvidia-cdi-refresh.path &>/dev/null; then
    path_state="enabled"
  fi

  echo "CDI refresh units status: service=${svc_state}, path=${path_state}"

  if ! systemctl is-active nvidia-cdi-refresh.service &>/dev/null; then
    echo "Warning: nvidia-cdi-refresh.service is not active"
  else
    echo "nvidia-cdi-refresh.service is active"
  fi
}

#######################################
# Start the nvidia-cdi-refresh systemd units that are installed, then validate
# the CDI specification and log the units' status.
# Units for which `systemctl cat` fails are skipped without a message.
# Globals:   none
# Arguments: none
# Outputs:   progress and warning messages on stdout
# Returns:   the status of log_cdi_refresh_status (the last command run)
#######################################
start_cdi_refresh_units() {
  local -a units=("nvidia-cdi-refresh.path" "nvidia-cdi-refresh.service")
  local -a started_units=()
  local unit

  for unit in "${units[@]}"; do
    if ! systemctl cat "${unit}" >/dev/null 2>&1; then
      continue
    fi

    if systemctl start "${unit}"; then
      started_units+=("${unit}")
    else
      echo "Warning: failed to start ${unit}; will fall back to nvidia-ctk cdi generate."
    fi
  done

  if [[ ${#started_units[@]} -eq 0 ]]; then
    echo "Warning: unable to start any nvidia-cdi-refresh units; falling back to manual CDI generation."
  else
    echo "Started CDI refresh units: ${started_units[*]}"
  fi

  # The spec check runs regardless of whether any unit started.
  ensure_runtime_cdi_spec
  log_cdi_refresh_status
}

#######################################
# Install the cached NVIDIA .deb packages from /opt/gpu with dpkg.
# Globals:   NVIDIA_PACKAGES (read) - whitespace-separated package names
#            NVIDIA_CONTAINER_TOOLKIT_VER (read) - version prefix used to
#            locate each package file
# Arguments: none
# Returns:   0 when every package installs; otherwise the status of the first
#            failing dpkg invocation.
#######################################
install_cached_nvidia_packages() {
  local apt_package

  # NVIDIA_PACKAGES is a whitespace-separated list; word splitting is intended.
  # shellcheck disable=SC2086
  for apt_package in $NVIDIA_PACKAGES; do
    # Stop at the first failure so a later successful package cannot mask it
    # when the caller runs this function with errexit suppressed.
    # The trailing glob stays unquoted so it expands to the package file(s).
    dpkg -i --force-overwrite "/opt/gpu/${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}"* || return
  done
}

use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packages 10 3
# ensure_cdi_refresh_units returns non-zero when the refresh units are not
# installed or cannot be enabled, and announces that installation is
# "proceeding" without them. A bare call would abort the whole script under
# `set -e`, so the status is handled explicitly here.
if ! ensure_cdi_refresh_units; then
  echo "Warning: nvidia-cdi-refresh units were not enabled; continuing installation."
fi

# Blacklist the nouveau driver (an nvidia driver dependency) by copying the
# modprobe configuration shipped in /opt/gpu into /etc/modprobe.d.
cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf
Expand Down Expand Up @@ -103,6 +222,8 @@ if [[ "${DRIVER_KIND}" == "cuda" ]]; then
bash /opt/gpu/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
fi

# Start the installed nvidia-cdi-refresh units, validate the CDI spec and log
# the units' status. Under `set -e` a non-zero return aborts the script here.
start_cdi_refresh_units

# Copy the NVIDIA runtime drop-in from /opt/gpu into containerd's config.d
# directory, creating the directory first if needed.
mkdir -p /etc/containerd/config.d
cp /opt/gpu/10-nvidia-runtime.toml /etc/containerd/config.d/10-nvidia-runtime.toml

Expand Down
Loading