Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 23 additions & 173 deletions parts/linux/cloud-init/artifacts/aks-log-collector.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ COLLECT_NETNS=$(<<<"$CONFIG" jq -esRr 'try fromjson catch null | .netns? // fals
ZIP="aks_logs.zip"

# Log bundle upload max size is limited to 100MB
ENFORCE_MAX_ZIP_SIZE="${ENFORCE_MAX_ZIP_SIZE:-true}"
MAX_SIZE=104857600

# File globs to include
Expand All @@ -52,85 +53,13 @@ for FILE in $(<<<"$CONFIG" jq -esRr 'try fromjson catch ("" | halt_error) | .fil
GLOBS+=("${FILE}")
done

# Add extra_namespaces to the top to make sure those pod logs are included
for NAMESPACE in $(<<<"$CONFIG" jq -esRr 'try fromjson catch ("" | halt_error) | .pod_log_namespaces[]'); do
GLOBS+=("/var/log/pods/${NAMESPACE}_*/**/*")
done

# AKS specific entries
GLOBS+=(/etc/cni/net.d/*)
GLOBS+=(/etc/containerd/*)
GLOBS+=(/etc/default/kubelet)
GLOBS+=(/etc/kubernetes/manifests/*)
GLOBS+=(/var/log/azure-cni*)
GLOBS+=(/var/log/azure-cns*)
GLOBS+=(/var/log/azure-ipam*)
GLOBS+=(/var/log/azure-vnet*)
GLOBS+=(/var/log/cilium-cni*)
GLOBS+=(/var/run/azure-vnet*)
GLOBS+=(/var/run/azure-cns*)

# GPU specific entries
GLOBS+=(/var/log/nvidia*.log)
GLOBS+=(/var/log/azure/nvidia*.log)
GLOBS+=(/var/log/fabricmanager*.log)

# based on MANIFEST_FULL from Azure Linux Agent's log collector
# https://github.com/Azure/WALinuxAgent/blob/master/azurelinuxagent/common/logcollector_manifests.py
GLOBS+=(/var/lib/waagent/provisioned)
GLOBS+=(/etc/fstab)
GLOBS+=(/etc/ssh/sshd_config)
GLOBS+=(/boot/grub*/grub.c*)
GLOBS+=(/boot/grub*/menu.lst)
GLOBS+=(/etc/*-release)
GLOBS+=(/etc/HOSTNAME)
GLOBS+=(/etc/hostname)
GLOBS+=(/etc/apt/sources.list)
GLOBS+=(/etc/apt/sources.list.d/*)
GLOBS+=(/etc/network/interfaces)
GLOBS+=(/etc/network/interfaces.d/*.cfg)
GLOBS+=(/etc/netplan/*.yaml)
GLOBS+=(/etc/nsswitch.conf)
GLOBS+=(/etc/resolv.conf)
GLOBS+=(/run/systemd/resolve/stub-resolv.conf)
GLOBS+=(/run/resolvconf/resolv.conf)
GLOBS+=(/etc/sysconfig/iptables)
GLOBS+=(/etc/sysconfig/network)
GLOBS+=(/etc/sysconfig/network/ifcfg-eth*)
GLOBS+=(/etc/sysconfig/network/routes)
GLOBS+=(/etc/sysconfig/network-scripts/ifcfg-eth*)
GLOBS+=(/etc/sysconfig/network-scripts/route-eth*)
GLOBS+=(/etc/ufw/ufw.conf)
GLOBS+=(/etc/waagent.conf)
GLOBS+=(/var/lib/hyperv/.kvp_pool_*)
GLOBS+=(/var/lib/dhcp/dhclient.eth0.leases)
GLOBS+=(/var/lib/dhclient/dhclient-eth0.leases)
GLOBS+=(/var/lib/wicked/lease-eth0-dhcp-ipv4.xml)
GLOBS+=(/var/log/azure/custom-script/handler.log)
GLOBS+=(/var/log/azure/run-command/handler.log)
GLOBS+=(/var/lib/waagent/ovf-env.xml)
GLOBS+=(/var/lib/waagent/*/status/*.status)
GLOBS+=(/var/lib/waagent/*/config/*.settings)
GLOBS+=(/var/lib/waagent/*/config/HandlerState)
GLOBS+=(/var/lib/waagent/*/config/HandlerStatus)
GLOBS+=(/var/lib/waagent/SharedConfig.xml)
GLOBS+=(/var/lib/waagent/ManagedIdentity-*.json)
GLOBS+=(/var/lib/waagent/waagent_status.json)
GLOBS+=(/var/lib/waagent/*/error.json)
GLOBS+=(/var/log/cloud-init*)
GLOBS+=(/var/log/azure/aks/aks-node.pcap)
GLOBS+=(/var/log/azure/*/*)
GLOBS+=(/var/log/azure/*/*/*)
GLOBS+=(/var/log/syslog*)
GLOBS+=(/var/log/rsyslog*)
GLOBS+=(/var/log/messages*)
GLOBS+=(/var/log/kern*)
GLOBS+=(/var/log/dmesg*)
GLOBS+=(/var/log/dpkg*)
GLOBS+=(/var/log/yum*)
GLOBS+=(/var/log/boot*)
GLOBS+=(/var/log/auth*)
GLOBS+=(/var/log/secure*)
GLOBS+=(/var/log/journal*)
GLOBS+=(/etc/default/kubelet)

Comment on lines 56 to 63
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change removes most of the default AKS log bundle content (CNI/containerd/k8s manifests, waagent, network state, etc.) and leaves only the pcap + a small set of logs. That’s a major behavior regression for supportability. If the goal is to add the pcap for secure TLS bootstrap debugging, keep the existing globs/collectors and append the pcap, or gate the reduced collection behind a debug tag/flag.

Copilot uses AI. Check for mistakes.
### END CONFIGURATION

Expand Down Expand Up @@ -189,114 +118,35 @@ mkdir collect
# Collect general information and create the ZIP in the first place
zip -DZ deflate "${ZIP}" /proc/@(cmdline|cpuinfo|filesystems|interrupts|loadavg|meminfo|modules|mounts|slabinfo|stat|uptime|version*|vmstat) /proc/net/*

# Include some disk listings
collectToZip collect/file_listings.txt find /dev /etc /var/lib/waagent /var/log -ls

# Collect system information
collectToZip collect/blkid.txt blkid $(find /dev -type b ! -name 'sr*')
collectToZip collect/du_bytes.txt df -al
collectToZip collect/du_inodes.txt df -ail
collectToZip collect/diskinfo.txt lsblk
collectToZip collect/lscpu.txt lscpu
collectToZip collect/lscpu.json lscpu -J
collectToZip collect/lshw.txt lshw
collectToZip collect/lshw.json lshw -json
collectToZip collect/lsipc.txt lsipc
collectToZip collect/lsns.json lsns -J --output-all
collectToZip collect/lspci.txt lspci -vkPP
collectToZip collect/lsscsi.txt lsscsi -vv
collectToZip collect/lsvmbus.txt lsvmbus -vv
collectToZip collect/sysctl.txt sysctl -a
collectToZip collect/systemctl-status.txt systemctl status --all -fr

# Collect logs of the Nvidia services if present
collectToZip collect/journalctl_nvidia-dcgm.txt journalctl -u nvidia-dcgm --no-pager
collectToZip collect/journalctl_nvidia-dcgm-exporter.txt journalctl -u nvidia-dcgm-exporter --no-pager
collectToZip collect/journalctl_nvidia-device-plugin.txt journalctl -u nvidia-device-plugin --no-pager

# Collect container runtime information
collectToZip collect/crictl_version.txt crictl version
collectToZip collect/crictl_info.json crictl info -o json
collectToZip collect/crictl_images.json crictl images -o json
collectToZip collect/crictl_imagefsinfo.json crictl imagefsinfo -o json
collectToZip collect/crictl_pods.json crictl pods -o json
collectToZip collect/crictl_ps.json crictl ps -o json
collectToZip collect/crictl_stats.json crictl stats -o json
collectToZip collect/crictl_statsp.json crictl statsp -o json

# Collect network information
collectToZip collect/conntrack.txt conntrack -L
collectToZip collect/conntrack_stats.txt conntrack -S
collectToZip collect/ip_4_addr.json ip -4 -d -j addr show
collectToZip collect/ip_4_neighbor.json ip -4 -d -j neighbor show
collectToZip collect/ip_4_route.json ip -4 -d -j route show
collectToZip collect/ip_4_tcpmetrics.json ip -4 -d -j tcpmetrics show
collectToZip collect/ip_6_addr.json ip -6 -d -j addr show table all
collectToZip collect/ip_6_neighbor.json ip -6 -d -j neighbor show
collectToZip collect/ip_6_route.json ip -6 -d -j route show table all
collectToZip collect/ip_6_tcpmetrics.json ip -6 -d -j tcpmetrics show
collectToZip collect/ip_link.json ip -d -j link show
collectToZip collect/ip_netconf.json ip -d -j netconf show
collectToZip collect/ip_netns.json ip -d -j netns show
collectToZip collect/ip_rule.json ip -d -j rule show

if [ "${COLLECT_IPTABLES}" = "true" ]; then
collectToZip collect/iptables.txt iptables -L -vn --line-numbers
collectToZip collect/ip6tables.txt ip6tables -L -vn --line-numbers
fi

if [ "${COLLECT_NFTABLES}" = "true" ]; then
collectToZip collect/nftables.txt nft -n list ruleset 2>&1
fi

collectToZip collect/ss.txt ss -anoempiO --cgroup
collectToZip collect/ss_stats.txt ss -s

# Collect network information from network namespaces
if [ "${COLLECT_NETNS}" = "true" ]; then
for NETNS in $(ip -j netns list | jq -r '.[].name'); do
mkdir -p "collect/ip_netns_${NETNS}/"
collectToZip collect/ip_netns_${NETNS}/conntrack.txt ip netns exec "${NETNS}" conntrack -L
collectToZip collect/ip_netns_${NETNS}/conntrack_stats.txt ip netns exec "${NETNS}" conntrack -S
collectToZip collect/ip_netns_${NETNS}/ip_4_addr.json ip -n "${NETNS}" -4 -d -j addr show
collectToZip collect/ip_netns_${NETNS}/ip_4_neighbor.json ip -n "${NETNS}" -4 -d -j neighbor show
collectToZip collect/ip_netns_${NETNS}/ip_4_route.json ip -n "${NETNS}" -4 -d -j route show
collectToZip collect/ip_netns_${NETNS}/ip_4_tcpmetrics.json ip -n "${NETNS}" -4 -d -j tcpmetrics show
collectToZip collect/ip_netns_${NETNS}/ip_6_addr.json ip -n "${NETNS}" -6 -d -j addr show table all
collectToZip collect/ip_netns_${NETNS}/ip_6_neighbor.json ip -n "${NETNS}" -6 -d -j neighbor show
collectToZip collect/ip_netns_${NETNS}/ip_6_route.json ip -n "${NETNS}" -6 -d -j route show table all
collectToZip collect/ip_netns_${NETNS}/ip_6_tcpmetrics.json ip -n "${NETNS}" -6 -d -j tcpmetrics show
collectToZip collect/ip_netns_${NETNS}/ip_link.json ip -n "${NETNS}" -d -j link show
collectToZip collect/ip_netns_${NETNS}/ip_netconf.json ip -n "${NETNS}" -d -j netconf show
collectToZip collect/ip_netns_${NETNS}/ip_netns.json ip -n "${NETNS}" -d -j netns show
collectToZip collect/ip_netns_${NETNS}/ip_rule.json ip -n "${NETNS}" -d -j rule show
if [ "${COLLECT_IPTABLES}" = "true" ]; then
collectToZip collect/ip_netns_${NETNS}/iptables.txt ip netns exec "${NETNS}" iptables -L -vn --line-numbers
collectToZip collect/ip_netns_${NETNS}/ip6tables.txt ip netns exec "${NETNS}" ip6tables -L -vn --line-numbers
fi
if [ "${COLLECT_NFTABLES}" = "true" ]; then
collectToZip collect/ip_netns_${NETNS}/nftables.txt nft -n list ruleset
fi
collectToZip collect/ip_netns_${NETNS}/ss.txt ip netns exec "${NETNS}" ss -anoempiO --cgroup
collectToZip collect/ip_netns_${NETNS}/ss_stats.txt ip netns exec "${NETNS}" ss -s
done
fi

# Add each file sequentially to the zip archive. This is slightly less efficient than adding them
# all at once, but allows us to easily check when we've exceeded the maximum file size and stop
# adding things to the archive.
echo "Adding log files to zip archive..."
for file in ${GLOBS[*]}; do
if test -e $file; then
zip -g -DZ deflate -u "${ZIP}" $file -x '*.sock'
if test -e "$file"; then
if [ "${ENFORCE_MAX_ZIP_SIZE}" = "true" ]; then
# If the archive is already at or over the max size, stop adding files.
FILE_SIZE=$(stat --printf "%s" "${ZIP}" 2>/dev/null || echo 0)
if [ "$FILE_SIZE" -ge "$MAX_SIZE" ]; then
echo "WARNING: ZIP file size $FILE_SIZE >= $MAX_SIZE; not adding more files."
break
fi
fi

# The API for the log bundle has a max file size (defined above, usually 100MB), so if
# adding this last file made the zip go over that size, remove that file and stop processing.
FILE_SIZE=$(stat --printf "%s" ${ZIP})
if [ "$FILE_SIZE" -ge "$MAX_SIZE" ]; then
echo "WARNING: ZIP file size $FILE_SIZE >= $MAX_SIZE; removing last log file and terminating adding more files."
zip -d "${ZIP}" $file
break
zip -g -DZ deflate -u "${ZIP}" "$file" -x '*.sock'

if [ "${ENFORCE_MAX_ZIP_SIZE}" = "true" ]; then
# The API for the log bundle has a max file size (defined above, usually 100MB), so if
# adding this last file made the zip go over that size, remove that file and try the next one.
# Using continue instead of break ensures smaller subsequent files can still be included.
FILE_SIZE=$(stat --printf "%s" "${ZIP}")
if [ "$FILE_SIZE" -ge "$MAX_SIZE" ]; then
echo "WARNING: ZIP file size $FILE_SIZE >= $MAX_SIZE after adding $file; removing it and trying next file."
zip -d "${ZIP}" "$file"
Comment on lines +143 to +148
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ENFORCE_MAX_ZIP_SIZE logic change introduces a subtle behavioral difference that could cause issues:

Old behavior (lines removed): When zip exceeded MAX_SIZE, the script would:

  1. Remove the last file that pushed it over
  2. STOP processing (break)

New behavior (lines 150-169): When zip exceeds MAX_SIZE:

  1. Skip adding more files until a file is added
  2. If that file pushes over limit, remove it and CONTINUE trying next files
  3. Only stops when hitting the pre-check at line 153

Problem: The continue logic (line 165-169) is inside the "if test -e" block, so smaller files after a large one can still be added. However, the pre-check (line 150-157) will prevent ANY files from being added once the limit is hit, creating inconsistent behavior depending on file ordering.

Recommendation: Simplify to always break when limit is exceeded (restore original behavior), OR move the pre-check outside the file loop for consistent "stop when full" behavior.

Suggested change
# adding this last file made the zip go over that size, remove that file and try the next one.
# Using continue instead of break ensures smaller subsequent files can still be included.
FILE_SIZE=$(stat --printf "%s" "${ZIP}")
if [ "$FILE_SIZE" -ge "$MAX_SIZE" ]; then
echo "WARNING: ZIP file size $FILE_SIZE >= $MAX_SIZE after adding $file; removing it and trying next file."
zip -d "${ZIP}" "$file"
# adding this last file made the zip go over that size, remove that file and stop adding more.
FILE_SIZE=$(stat --printf "%s" "${ZIP}")
if [ "$FILE_SIZE" -ge "$MAX_SIZE" ]; then
echo "WARNING: ZIP file size $FILE_SIZE >= $MAX_SIZE after adding $file; removing it and stopping further additions."
zip -d "${ZIP}" "$file"
break

Copilot uses AI. Check for mistakes.
fi
fi
fi
done
Expand Down
13 changes: 13 additions & 0 deletions parts/linux/cloud-init/artifacts/aks-pcap.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[Unit]
Description=AKS PCAP
Wants=network-online.target
After=network-online.target

[Service]
Type=oneshot
RemainAfterExit=yes
SuccessExitStatus=124
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SuccessExitStatus=124 suggests a normal timeout exit should be treated as success, but the current aks-pcap.sh uses || which prevents the unit from ever exiting with 124 (it will run the collector and return that exit code instead). Align the unit semantics with the script (either remove SuccessExitStatus=124 or change the script to preserve the 124 exit code on timeout).

Suggested change
SuccessExitStatus=124

Copilot uses AI. Check for mistakes.
ExecStart=/bin/bash /opt/azure/containers/aks-pcap.sh

[Install]
WantedBy=multi-user.target
6 changes: 6 additions & 0 deletions parts/linux/cloud-init/artifacts/aks-pcap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Capture TLS traffic (port 443) on eth0 for up to 5 minutes to aid secure TLS
# bootstrap debugging, writing the capture to the AKS log directory so the log
# collector can bundle it into the support ZIP.
#
# Note: no 'set -e' on purpose — timeout(1) exits 124 when the capture window
# elapses normally, and we need to keep running afterwards.
set -uxo pipefail

mkdir -p /var/log/azure/aks

# timeout(1) exits 124 when the 300s window elapses (the expected path);
# any other non-zero status indicates a real tcpdump failure.
timeout 300 tcpdump -i eth0 -s 0 -w /var/log/azure/aks/aks-node.pcap 'port 443'
rc=$?

if [ "$rc" -ne 0 ]; then
    # Bundle the pcap (and other logs) for upload on any non-success exit,
    # including the normal timeout.
    /opt/azure/containers/aks-log-collector.sh >/var/log/azure/aks/cse-aks-log-collector.log 2>&1
    # Propagate the original tcpdump/timeout status instead of the collector's,
    # so the systemd unit's SuccessExitStatus=124 treats a normal timeout as
    # success (previously '||' swallowed the 124 and returned the collector's
    # exit code instead).
    exit "$rc"
fi
4 changes: 4 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,10 @@ Environment="BOOTSTRAP_FLAGS=${BOOTSTRAP_CLIENT_FLAGS}"
# once bootstrap tokens are no longer a fallback, kubelet.service needs to be a RequiredBy=
WantedBy=kubelet.service
EOF
# start the PCAP service (not supported on Flatcar or AzureLinux OS Guard)
if ! isFlatcar "$OS" && ! isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then
systemctlEnableAndStartNoBlock aks-pcap 30 || exit $ERR_AKS_PCAP_START_FAILURE
fi

# explicitly start secure TLS bootstrapping ahead of kubelet
systemctlEnableAndStartNoBlock secure-tls-bootstrap 30 || exit $ERR_SECURE_TLS_BOOTSTRAP_START_FAILURE
Expand Down
2 changes: 2 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ ERR_LOCALDNS_BINARY_ERR=219 # Localdns binary not found or not executable.

ERR_SECURE_TLS_BOOTSTRAP_START_FAILURE=220 # Error starting the secure TLS bootstrap systemd service

ERR_AKS_PCAP_START_FAILURE=221 # Error starting aks-pcap service

ERR_CLOUD_INIT_FAILED=223 # Error indicating that cloud-init returned exit code 1 in cse_cmd.sh
ERR_NVIDIA_DRIVER_INSTALL=224 # Error determining if nvidia driver install should be skipped
ERR_NVIDIA_GPG_KEY_DOWNLOAD_TIMEOUT=225 # Timeout waiting for NVIDIA GPG key download
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ installDeps() {

dnf_makecache || exit $ERR_APT_UPDATE_TIMEOUT
dnf_update || exit $ERR_APT_DIST_UPGRADE_TIMEOUT
for dnf_package in ca-certificates check-restart cifs-utils cloud-init-azure-kvp conntrack-tools cracklib dnf-automatic ebtables ethtool fuse inotify-tools iotop iproute ipset iptables jq logrotate lsof nmap-ncat nfs-utils pam pigz psmisc rsyslog socat sysstat traceroute util-linux xz zip blobfuse2 nftables iscsi-initiator-utils device-mapper-multipath; do
for dnf_package in tcpdump ca-certificates check-restart cifs-utils cloud-init-azure-kvp conntrack-tools cracklib dnf-automatic ebtables ethtool fuse inotify-tools iotop iproute ipset iptables jq logrotate lsof nmap-ncat nfs-utils pam pigz psmisc rsyslog socat sysstat traceroute util-linux xz zip blobfuse2 nftables iscsi-initiator-utils device-mapper-multipath; do
if ! dnf_install 30 1 600 $dnf_package; then
exit $ERR_APT_INSTALL_TIMEOUT
fi
Expand Down
Loading