From c8fc59de7cff7f5d868de756857ba489960d70e1 Mon Sep 17 00:00:00 2001 From: hbc Date: Wed, 11 Feb 2026 15:59:59 -0800 Subject: [PATCH 1/8] feat: add qemu vm helper --- .gitignore | 5 +- hack/qemu/user-data.yaml | 40 +++ hack/qemu/vm.sh | 521 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 565 insertions(+), 1 deletion(-) create mode 100644 hack/qemu/user-data.yaml create mode 100755 hack/qemu/vm.sh diff --git a/.gitignore b/.gitignore index b79aa92..c1fccb8 100644 --- a/.gitignore +++ b/.gitignore @@ -50,4 +50,7 @@ config.json # Build artifacts /build/ -/dist/ \ No newline at end of file +/dist/ + +# qemu vm state +.vm diff --git a/hack/qemu/user-data.yaml b/hack/qemu/user-data.yaml new file mode 100644 index 0000000..a8a856d --- /dev/null +++ b/hack/qemu/user-data.yaml @@ -0,0 +1,40 @@ +#cloud-config + +# User configuration +users: + - name: ubuntu + sudo: ALL=(ALL) NOPASSWD:ALL + shell: /bin/bash + groups: [sudo] + lock_passwd: false + ssh_authorized_keys: + - __SSH_PUBLIC_KEY__ + +# Package management +package_update: true +package_upgrade: false +packages: + - curl + - jq + - apt-transport-https + - ca-certificates + - net-tools + - vim + +# Mount host repo into guest via virtio-9p +mounts: + - ["flexnode", "/flex-node", "9p", "trans=virtio,version=9p2000.L,nofail", "0", "0"] + +# Run commands on first boot +runcmd: + - mkdir -p /flex-node + - mount -a + - echo "hello, world" + +# Write files +write_files: + - path: /etc/flexnode/provisioned + content: | + provisioned=true + timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + permissions: "0644" diff --git a/hack/qemu/vm.sh b/hack/qemu/vm.sh new file mode 100755 index 0000000..18e9462 --- /dev/null +++ b/hack/qemu/vm.sh @@ -0,0 +1,521 @@ +#!/usr/bin/env bash +# +# vm.sh - Manage QEMU-based Ubuntu VMs with cloud-init support +# +# Usage: +# ./hack/qemu/vm.sh [options] +# +# Commands: +# start Create and start a VM +# stop Stop a running VM +# logs Show VM serial console logs +# +# Start options: +# -n, --name 
VM name (default: flexnode-vm) +# -m, --memory Memory in MB (default: 2048) +# -c, --cpus Number of CPUs (default: 2) +# -d, --disk-size Disk size (default: 20G) +# -p, --ssh-port Host port forwarded to guest SSH (default: 2222) +# -i, --image Path to Ubuntu cloud image (downloaded if not present) +# -u, --user-data Path to cloud-init user-data file (default: hack/qemu/user-data.yaml) +# --no-snapshot Use the base image directly instead of creating a snapshot +# +# Stop options: +# -n, --name VM name (default: flexnode-vm) +# -f, --force Force kill (SIGKILL) instead of graceful shutdown (SIGTERM) +# --clean Also remove disk, seed ISO, and log files +# +# Logs options: +# -n, --name VM name (default: flexnode-vm) +# -f, --follow Follow log output (like tail -f) +# +# Examples: +# ./hack/qemu/vm.sh start +# ./hack/qemu/vm.sh start -n my-vm --memory 4096 --cpus 4 +# ./hack/qemu/vm.sh stop +# ./hack/qemu/vm.sh stop --force --clean +# ./hack/qemu/vm.sh logs +# ./hack/qemu/vm.sh logs --follow +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" + +# ------------------------------------------------------------------- +# Detect host architecture +# ------------------------------------------------------------------- +HOST_ARCH="$(uname -m)" +case "${HOST_ARCH}" in + x86_64) GUEST_ARCH="amd64" ;; + aarch64|arm64) GUEST_ARCH="arm64" ;; + *) echo "[ERROR] Unsupported host architecture: ${HOST_ARCH}" >&2; exit 1 ;; +esac + +# ------------------------------------------------------------------- +# Defaults +# ------------------------------------------------------------------- +VM_NAME="flexnode-vm" +MEMORY="2048" +CPUS="2" +DISK_SIZE="20G" +SSH_PORT="2222" +USE_SNAPSHOT=true +FORCE=false +CLEAN=false + +VM_DIR="${REPO_ROOT}/.vm" +IMAGE_BASE_URL="https://cloud-images.ubuntu.com/minimal/releases/noble/release" +IMAGE_URL="${IMAGE_BASE_URL}/ubuntu-24.04-minimal-cloudimg-${GUEST_ARCH}.img" +IMAGE_FILE="" +USER_DATA="${SCRIPT_DIR}/user-data.yaml" + +# ------------------------------------------------------------------- +# Helpers +# ------------------------------------------------------------------- +info() { echo "[INFO] $*"; } +warn() { echo "[WARN] $*" >&2; } +error() { echo "[ERROR] $*" >&2; exit 1; } + +usage() { + cat <<'EOF' +Usage: + ./hack/qemu/vm.sh [options] + +Commands: + start Create and start a VM + stop Stop a running VM + logs Show VM serial console logs + +Start options: + -n, --name VM name (default: flexnode-vm) + -m, --memory Memory in MB (default: 2048) + -c, --cpus Number of CPUs (default: 2) + -d, --disk-size Disk size (default: 20G) + -p, --ssh-port Host port forwarded to guest SSH (default: 2222) + -i, --image Path to Ubuntu cloud image (downloaded if not present) + -u, --user-data Path to cloud-init user-data file (default: hack/qemu/user-data.yaml) + --no-snapshot Use the base image directly instead of creating a snapshot + +Stop options: + -n, --name VM name (default: flexnode-vm) + -f, --force Force kill (SIGKILL) instead of graceful shutdown (SIGTERM) + --clean Also remove disk, 
seed ISO, and log files + +Logs options: + -n, --name VM name (default: flexnode-vm) + -f, --follow Follow log output (like tail -f) + +Examples: + ./hack/qemu/vm.sh start + ./hack/qemu/vm.sh start -n my-vm --memory 4096 --cpus 4 + ./hack/qemu/vm.sh stop + ./hack/qemu/vm.sh stop --force --clean + ./hack/qemu/vm.sh logs + ./hack/qemu/vm.sh logs --follow +EOF + exit 0 +} + +check_deps() { + local qemu_bin + if [[ "${GUEST_ARCH}" == "arm64" ]]; then + qemu_bin="qemu-system-aarch64" + else + qemu_bin="qemu-system-x86_64" + fi + + local missing=() + for cmd in "${qemu_bin}" qemu-img; do + if ! command -v "$cmd" &>/dev/null; then + missing+=("$cmd") + fi + done + + # We need at least one ISO generation tool + if ! command -v mkisofs &>/dev/null && ! command -v genisoimage &>/dev/null && ! command -v hdiutil &>/dev/null; then + missing+=("mkisofs (or genisoimage or hdiutil)") + fi + + if [[ ${#missing[@]} -gt 0 ]]; then + echo "" + echo "Missing required dependencies: ${missing[*]}" + echo "" + echo "Install on macOS:" + echo " brew install qemu cdrtools" + echo "" + echo "Install on Ubuntu/Debian:" + echo " sudo apt-get install qemu-system-x86 qemu-utils genisoimage" + echo "" + exit 1 + fi +} + +# Build a cloud-init NoCloud seed ISO without requiring cloud-localds. +# Uses mkisofs, genisoimage, or hdiutil (macOS) — whichever is available. 
+create_seed_iso() { + local iso_path="$1" + local user_data="$2" + local meta_data="$3" + + local staging + staging="$(mktemp -d)" + cp "${user_data}" "${staging}/user-data" + cp "${meta_data}" "${staging}/meta-data" + + if command -v mkisofs &>/dev/null; then + mkisofs -output "${iso_path}" -volid cidata -joliet -rock \ + "${staging}/user-data" "${staging}/meta-data" + elif command -v genisoimage &>/dev/null; then + genisoimage -output "${iso_path}" -volid cidata -joliet -rock \ + "${staging}/user-data" "${staging}/meta-data" + elif command -v hdiutil &>/dev/null; then + hdiutil makehybrid -o "${iso_path}" -joliet -iso \ + -default-volume-name cidata "${staging}" + else + rm -rf "${staging}" + error "No ISO generation tool found" + fi + + rm -rf "${staging}" +} + +# =================================================================== +# Command: start +# =================================================================== +cmd_start() { + while [[ $# -gt 0 ]]; do + case "$1" in + -n|--name) VM_NAME="$2"; shift 2 ;; + -m|--memory) MEMORY="$2"; shift 2 ;; + -c|--cpus) CPUS="$2"; shift 2 ;; + -d|--disk-size) DISK_SIZE="$2"; shift 2 ;; + -p|--ssh-port) SSH_PORT="$2"; shift 2 ;; + -i|--image) IMAGE_FILE="$2"; shift 2 ;; + -u|--user-data) USER_DATA="$2"; shift 2 ;; + --no-snapshot) USE_SNAPSHOT=false; shift ;; + -h|--help) usage ;; + *) error "Unknown option: $1" ;; + esac + done + + check_deps + mkdir -p "${VM_DIR}" + + # --------------------------------------------------------------- + # Download Ubuntu cloud image if needed + # --------------------------------------------------------------- + if [[ -z "${IMAGE_FILE}" ]]; then + IMAGE_FILE="${VM_DIR}/ubuntu-cloud.img" + fi + + if [[ ! -f "${IMAGE_FILE}" ]]; then + info "Downloading Ubuntu cloud image..." 
+ info "URL: ${IMAGE_URL}" + curl -L -o "${IMAGE_FILE}" "${IMAGE_URL}" + info "Download complete: ${IMAGE_FILE}" + else + info "Using existing image: ${IMAGE_FILE}" + fi + + # --------------------------------------------------------------- + # Create VM disk (snapshot backed by the cloud image) + # --------------------------------------------------------------- + VM_DISK="${VM_DIR}/${VM_NAME}.qcow2" + + if [[ "${USE_SNAPSHOT}" == true ]]; then + info "Creating snapshot disk: ${VM_DISK} (backed by base image)" + qemu-img create -f qcow2 -b "${IMAGE_FILE}" -F qcow2 "${VM_DISK}" "${DISK_SIZE}" + else + info "Copying base image to: ${VM_DISK}" + cp "${IMAGE_FILE}" "${VM_DISK}" + qemu-img resize "${VM_DISK}" "${DISK_SIZE}" + fi + + # --------------------------------------------------------------- + # Resolve local SSH public key + # --------------------------------------------------------------- + SSH_PUB_KEY="" + for key_file in "${HOME}/.ssh/id_ed25519.pub" "${HOME}/.ssh/id_rsa.pub" "${HOME}/.ssh/id_ecdsa.pub"; do + if [[ -f "${key_file}" ]]; then + SSH_PUB_KEY="$(cat "${key_file}")" + info "Using SSH public key: ${key_file}" + break + fi + done + + if [[ -z "${SSH_PUB_KEY}" ]]; then + warn "No SSH public key found in ~/.ssh/. The VM will not have key-based SSH access." + fi + + # --------------------------------------------------------------- + # Render user-data with SSH key into .vm/ + # --------------------------------------------------------------- + RENDERED_USER_DATA="${VM_DIR}/user-data.yaml" + + if [[ ! 
-f "${USER_DATA}" ]]; then + error "User-data template not found: ${USER_DATA}" + fi + + if [[ -n "${SSH_PUB_KEY}" ]]; then + sed "s|__SSH_PUBLIC_KEY__|${SSH_PUB_KEY}|g" "${USER_DATA}" > "${RENDERED_USER_DATA}" + else + # Remove the placeholder line entirely if no key is available + sed '/__SSH_PUBLIC_KEY__/d' "${USER_DATA}" > "${RENDERED_USER_DATA}" + fi + info "Rendered user-data: ${RENDERED_USER_DATA}" + + # --------------------------------------------------------------- + # Build cloud-init seed ISO + # --------------------------------------------------------------- + SEED_ISO="${VM_DIR}/${VM_NAME}-seed.iso" + META_DATA="${VM_DIR}/meta-data" + + # Create minimal meta-data + cat > "${META_DATA}" </dev/null | grep -q 1; then + ACCEL="-accel hvf" + fi + ;; + Linux) + if [[ -r /dev/kvm ]]; then + ACCEL="-accel kvm" + fi + ;; + esac + + # --------------------------------------------------------------- + # Launch VM in background + # --------------------------------------------------------------- + QEMU_PID_FILE="${VM_DIR}/${VM_NAME}.pid" + QEMU_LOG="${VM_DIR}/${VM_NAME}.log" + + info "============================================" + info " Launching VM: ${VM_NAME}" + info " Arch: ${GUEST_ARCH} (${HOST_ARCH})" + info " Memory: ${MEMORY} MB" + info " CPUs: ${CPUS}" + info " Disk: ${VM_DISK}" + info " SSH port: ${SSH_PORT} -> 22" + info " Mount: ${REPO_ROOT} -> /flex-node" + info " Log: ${QEMU_LOG}" + info " PID file: ${QEMU_PID_FILE}" + info "============================================" + + # shellcheck disable=SC2086 + "${QEMU_BIN}" \ + ${MACHINE_ARGS} \ + ${ACCEL} \ + -m "${MEMORY}" \ + -smp "${CPUS}" \ + -drive file="${VM_DISK}",format=qcow2,if=virtio \ + -drive file="${SEED_ISO}",format=raw,if=virtio \ + -netdev user,id=net0,hostfwd=tcp::"${SSH_PORT}"-:22 \ + -device virtio-net-pci,netdev=net0 \ + -virtfs local,path="${REPO_ROOT}",mount_tag=flexnode,security_model=mapped-xattr,id=flexnode0 \ + -daemonize \ + -pidfile "${QEMU_PID_FILE}" \ + -serial 
file:"${QEMU_LOG}" \ + -display none + + QEMU_PID="$(cat "${QEMU_PID_FILE}")" + info "VM started in background (PID: ${QEMU_PID})" + + # --------------------------------------------------------------- + # Wait for SSH to become available + # --------------------------------------------------------------- + info "Waiting for SSH to become available on localhost:${SSH_PORT}..." + + MAX_ATTEMPTS=60 + ATTEMPT=0 + while [[ ${ATTEMPT} -lt ${MAX_ATTEMPTS} ]]; do + ATTEMPT=$((ATTEMPT + 1)) + + # Check that the QEMU process is still alive + if ! kill -0 "${QEMU_PID}" 2>/dev/null; then + echo "" + error "QEMU process exited unexpectedly. Check log: ${QEMU_LOG}" + fi + + if ssh -o BatchMode=yes -o ConnectTimeout=2 -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null -p "${SSH_PORT}" ubuntu@localhost \ + "true" 2>/dev/null; then + break + fi + + printf "." + sleep 3 + done + echo "" + + if [[ ${ATTEMPT} -ge ${MAX_ATTEMPTS} ]]; then + warn "SSH did not become available after ${MAX_ATTEMPTS} attempts." + warn "The VM may still be booting. Check log: ${QEMU_LOG}" + echo "" + echo "You can try connecting manually:" + echo "" + echo " ssh -o StrictHostKeyChecking=no -p ${SSH_PORT} ubuntu@localhost" + echo "" + echo "To stop the VM:" + echo " ./hack/qemu/vm.sh stop -n ${VM_NAME}" + exit 1 + fi + + info "VM is ready!" + echo "" + echo " ssh -o StrictHostKeyChecking=no -p ${SSH_PORT} ubuntu@localhost" + echo "" + echo "To stop the VM:" + echo " ./hack/qemu/vm.sh stop -n ${VM_NAME}" + echo "" +} + +# =================================================================== +# Command: stop +# =================================================================== +cmd_stop() { + while [[ $# -gt 0 ]]; do + case "$1" in + -n|--name) VM_NAME="$2"; shift 2 ;; + -f|--force) FORCE=true; shift ;; + --clean) CLEAN=true; shift ;; + -h|--help) usage ;; + *) error "Unknown option: $1" ;; + esac + done + + local pid_file="${VM_DIR}/${VM_NAME}.pid" + + if [[ ! 
-f "${pid_file}" ]]; then + error "PID file not found: ${pid_file}. Is the VM running?" + fi + + local pid + pid="$(cat "${pid_file}")" + + if ! kill -0 "${pid}" 2>/dev/null; then + warn "Process ${pid} is not running. Cleaning up stale PID file." + rm -f "${pid_file}" + else + if [[ "${FORCE}" == true ]]; then + info "Force killing VM '${VM_NAME}' (PID: ${pid})..." + kill -9 "${pid}" + else + info "Stopping VM '${VM_NAME}' (PID: ${pid})..." + kill "${pid}" + + # Wait for process to exit + local timeout=15 + while kill -0 "${pid}" 2>/dev/null && [[ ${timeout} -gt 0 ]]; do + sleep 1 + timeout=$((timeout - 1)) + done + + if kill -0 "${pid}" 2>/dev/null; then + warn "VM did not stop gracefully, sending SIGKILL..." + kill -9 "${pid}" 2>/dev/null || true + fi + fi + + rm -f "${pid_file}" + info "VM '${VM_NAME}' stopped." + fi + + if [[ "${CLEAN}" == true ]]; then + info "Cleaning up VM artifacts..." + rm -f "${VM_DIR}/${VM_NAME}.qcow2" + rm -f "${VM_DIR}/${VM_NAME}-seed.iso" + rm -f "${VM_DIR}/${VM_NAME}.log" + rm -f "${VM_DIR}/user-data.yaml" + rm -f "${VM_DIR}/meta-data" + info "Cleanup complete." + fi +} + +# =================================================================== +# Command: logs +# =================================================================== +cmd_logs() { + local follow=false + + while [[ $# -gt 0 ]]; do + case "$1" in + -n|--name) VM_NAME="$2"; shift 2 ;; + -f|--follow) follow=true; shift ;; + -h|--help) usage ;; + *) error "Unknown option: $1" ;; + esac + done + + local log_file="${VM_DIR}/${VM_NAME}.log" + + if [[ ! -f "${log_file}" ]]; then + error "Log file not found: ${log_file}. Has the VM been started?" + fi + + if [[ "${follow}" == true ]]; then + info "Following logs for '${VM_NAME}' (Ctrl-C to stop)..." 
+ tail -f "${log_file}" + else + cat "${log_file}" + fi +} + +# =================================================================== +# Main: dispatch subcommand +# =================================================================== +if [[ $# -lt 1 ]]; then + usage +fi + +COMMAND="$1" +shift + +case "${COMMAND}" in + start) cmd_start "$@" ;; + stop) cmd_stop "$@" ;; + logs) cmd_logs "$@" ;; + -h|--help) usage ;; + *) error "Unknown command: ${COMMAND}. Use 'start', 'stop', or 'logs'." ;; +esac From b9860b2bca4d22b9e353eb44892bb9658ad2ff65 Mon Sep 17 00:00:00 2001 From: hbc Date: Wed, 11 Feb 2026 16:01:38 -0800 Subject: [PATCH 2/8] chore: simlify signal handling --- main.go | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/main.go b/main.go index 7edddd7..5eb1a8c 100644 --- a/main.go +++ b/main.go @@ -34,19 +34,9 @@ func main() { rootCmd.AddCommand(NewVersionCommand()) // Set up context with signal handling - ctx, cancel := context.WithCancel(context.Background()) + ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer cancel() - // Handle shutdown signals - sigCh := make(chan os.Signal, 1) - signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) - go func() { - <-sigCh - // Use a basic logger for shutdown signal since context may not be available - fmt.Println("Received shutdown signal, cancelling operations...") - cancel() - }() - // Set up persistent pre-run to initialize config and logger rootCmd.PersistentPreRunE = func(cmd *cobra.Command, args []string) error { // Skip config loading for version command From 232a1070ff2ed735a93fc0dd107a31ca0ed7ea93 Mon Sep 17 00:00:00 2001 From: hbc Date: Wed, 11 Feb 2026 16:49:09 -0800 Subject: [PATCH 3/8] fix: RuntimeConfig -> RuncConfig --- pkg/config/structs.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/config/structs.go b/pkg/config/structs.go index 3c1cc4a..a17e1f8 100644 --- a/pkg/config/structs.go +++ 
b/pkg/config/structs.go @@ -10,7 +10,7 @@ type Config struct { Containerd ContainerdConfig `json:"containerd"` Kubernetes KubernetesConfig `json:"kubernetes"` CNI CNIConfig `json:"cni"` - Runc RuntimeConfig `json:"runc"` + Runc RuncConfig `json:"runc"` Node NodeConfig `json:"node"` Paths PathsConfig `json:"paths"` Npd NPDConfig `json:"npd"` @@ -84,8 +84,8 @@ type KubernetesConfig struct { URLTemplate string `json:"urlTemplate"` } -// RuntimeConfig holds configuration settings for the container runtime (runc). -type RuntimeConfig struct { +// RuncConfig holds configuration settings for the container runtime (runc). +type RuncConfig struct { Version string `json:"version"` URL string `json:"url"` } From 4c1b2f6d29df9d95e219a3b29fea7ef14e6b5061 Mon Sep 17 00:00:00 2001 From: hbc Date: Wed, 11 Feb 2026 19:48:18 -0800 Subject: [PATCH 4/8] fix: add minimal bootstrap process --- commands.go | 52 +++++++++++++++++++ main.go | 8 +-- pkg/bootstrapper/minimal.go | 44 ++++++++++++++++ .../system_configuration_installer.go | 7 +-- pkg/config/config.go | 3 ++ 5 files changed, 108 insertions(+), 6 deletions(-) create mode 100644 pkg/bootstrapper/minimal.go diff --git a/commands.go b/commands.go index bf8d798..75bfb1a 100644 --- a/commands.go +++ b/commands.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "time" "github.com/sirupsen/logrus" @@ -25,6 +26,57 @@ var ( BuildTime = "unknown" ) +func NewApplyCommand() *cobra.Command { + var flagMode string + + cmd := &cobra.Command{ + Use: "apply", + Short: "Apply configuration to the node", + RunE: func(cmd *cobra.Command, args []string) error { + ctx := cmd.Context() + + logger := logger.GetLoggerFromContext(ctx) + + cfg, err := config.LoadConfig(configPath) + if err != nil { + return fmt.Errorf("failed to load config from %s: %w", configPath, err) + } + + var b interface { + Bootstrap(context.Context) (*bootstrapper.ExecutionResult, error) + } + + if strings.EqualFold(flagMode, "minimal") { + logger.Info("Using 
minimal bootstrapper mode") + b = bootstrapper.NewMinimal(cfg, logger) + } else { + logger.Info("Using full bootstrapper mode") + b = bootstrapper.New(cfg, logger) + } + + result, err := b.Bootstrap(ctx) + if err != nil { + return err + } + + fmt.Printf( + "Bootstrap completed with success: %t, duration: %v, steps: %d\n", + result.Success, result.Duration, result.StepCount, + ) + if !result.Success { + fmt.Printf("Bootstrap failed with error: %s\n", result.Error) + return fmt.Errorf("bootstrap failed: %s", result.Error) + } + + return nil + }, + } + + cmd.Flags().StringVar(&flagMode, "mode", "minimal", "minimal or full") + + return cmd +} + // NewAgentCommand creates a new agent command func NewAgentCommand() *cobra.Command { cmd := &cobra.Command{ diff --git a/main.go b/main.go index 5eb1a8c..d54579a 100644 --- a/main.go +++ b/main.go @@ -19,9 +19,10 @@ var ( func main() { rootCmd := &cobra.Command{ - Use: "aks-flex-node", - Short: "AKS Flex Node Agent", - Long: "Azure Kubernetes Service Flex Node Agent for edge computing scenarios", + Use: "aks-flex-node", + Short: "AKS Flex Node Agent", + Long: "Azure Kubernetes Service Flex Node Agent for edge computing scenarios", + SilenceUsage: true, } // Add global flags for configuration @@ -29,6 +30,7 @@ func main() { // Don't mark as required globally - we'll check in PersistentPreRunE for commands that need it // Add commands + rootCmd.AddCommand(NewApplyCommand()) rootCmd.AddCommand(NewAgentCommand()) rootCmd.AddCommand(NewUnbootstrapCommand()) rootCmd.AddCommand(NewVersionCommand()) diff --git a/pkg/bootstrapper/minimal.go b/pkg/bootstrapper/minimal.go new file mode 100644 index 0000000..c1ae49e --- /dev/null +++ b/pkg/bootstrapper/minimal.go @@ -0,0 +1,44 @@ +package bootstrapper + +import ( + "context" + + "github.com/sirupsen/logrus" + "go.goms.io/aks/AKSFlexNode/pkg/components/containerd" + "go.goms.io/aks/AKSFlexNode/pkg/components/kube_binaries" + "go.goms.io/aks/AKSFlexNode/pkg/components/kubelet" + 
"go.goms.io/aks/AKSFlexNode/pkg/components/npd" + "go.goms.io/aks/AKSFlexNode/pkg/components/runc" + "go.goms.io/aks/AKSFlexNode/pkg/components/services" + "go.goms.io/aks/AKSFlexNode/pkg/components/system_configuration" + "go.goms.io/aks/AKSFlexNode/pkg/config" +) + +type MinimalBootstrapper struct { + *BaseExecutor +} + +func NewMinimal(cfg *config.Config, logger *logrus.Logger) *MinimalBootstrapper { + return &MinimalBootstrapper{ + BaseExecutor: NewBaseExecutor(cfg, logger), + } +} + +func (b *MinimalBootstrapper) Bootstrap(ctx context.Context) (*ExecutionResult, error) { + // Define the bootstrap steps in order - using modules directly + steps := []Executor{ + system_configuration.NewInstaller(b.logger), + runc.NewInstaller(b.logger), + containerd.NewInstaller(b.logger), + kube_binaries.NewInstaller(b.logger), + kubelet.NewInstaller(b.logger), + npd.NewInstaller(b.logger), + services.NewInstaller(b.logger), + } + + return b.ExecuteSteps(ctx, steps, "bootstrap") +} + +func (b *MinimalBootstrapper) Unbootstrap(ctx context.Context) (*ExecutionResult, error) { + return nil, nil +} diff --git a/pkg/components/system_configuration/system_configuration_installer.go b/pkg/components/system_configuration/system_configuration_installer.go index 1eb669b..bdd3d87 100644 --- a/pkg/components/system_configuration/system_configuration_installer.go +++ b/pkg/components/system_configuration/system_configuration_installer.go @@ -33,9 +33,10 @@ func (i *Installer) Execute(ctx context.Context) error { } // Configure resolv.conf - if err := i.configureResolvConf(); err != nil { - return fmt.Errorf("failed to configure resolv.conf: %w", err) - } + // FIXME: this doesn't make sense to me, so disable for now + // if err := i.configureResolvConf(); err != nil { + // return fmt.Errorf("failed to configure resolv.conf: %w", err) + // } i.logger.Info("System configuration completed successfully") return nil diff --git a/pkg/config/config.go b/pkg/config/config.go index 498d91e..34e8685 
100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -259,6 +259,9 @@ func (c *Config) Validate() error { if c.Azure.TenantID == "" { return fmt.Errorf("azure.tenantId is required") } + if c.Azure.TargetCluster == nil { + return fmt.Errorf("azure.targetCluster configuration is required") + } if c.Azure.TargetCluster.Location == "" { return fmt.Errorf("azure.targetCluster.location is required") } From 530e677c72de6ac86c5d4818f4bd212112566aea Mon Sep 17 00:00:00 2001 From: hbc Date: Wed, 11 Feb 2026 23:16:12 -0800 Subject: [PATCH 5/8] doc: describe agent node host environment baseline --- docs/node-env.md | 288 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 docs/node-env.md diff --git a/docs/node-env.md b/docs/node-env.md new file mode 100644 index 0000000..8dc3c15 --- /dev/null +++ b/docs/node-env.md @@ -0,0 +1,288 @@ +# Agent Node Host Environment + +## Overview + +Each Kubernetes agent (worker) node must be provisioned with the software and services +required to join a cluster and run workloads. Beyond this minimal baseline, certain +scenarios demand additional setup; for example, GPU-capable nodes may need NVIDIA +drivers and the appropriate device plugin to expose GPU resources to Kubernetes. + +At the same time, agent nodes are routinely restarted, patched, or replaced as part of +ongoing maintenance and upgrade cycles. The mechanisms for performing these lifecycle +operations vary across cloud and on-prem environments, depending on the available APIs +and underlying infrastructure. + +To ensure AKS flex nodes can function consistently across this broad range of environments, +this document defines the baseline runtime assumptions and requirements, and describes +the expected behaviors for key lifecycle operations. + +### Non-Goals + +- We will limit the support scope to Linux-based nodes and focus on Ubuntu distro for now. 
+ This is because Ubuntu is the widely and commonly available Linux distribution + across the target environments. + +## Baseline Environment Requirements + +### CPU Only Nodes + +- A Linux-based OS with `systemd` init system; +- Modern Linux kernel (currently LTS or supported release) enabled with cgroup v2; +- Swap disabled; +- Syslog with rotation configured; +- Time synchronization configured; +- Proper host level DNS setup; +- Outbound connectivity to cluster control plane endpoint; +- Container runtime: + * `containerd` w/ 2.0+ version; + * `runc` +- Kubernetes components: + * `kubelet` matching with the target worker node version; + * Control plane public CA certificate(s); + * TLS bootstrap configurations; + * Other cloud provider binaries; +- NFTables / IPtables installed for Kubernetes network policies; +- Network forward, IP masquerade and bridge settings configured for Kubernetes networking; +- Support tools / binaries (e.g., `curl`, `ping`, etc) for diagnostics and troubleshooting; + +### GPU-Capable Nodes + +- All of the above CPU node requirements; +- GPU drivers and runtime (e.g. NVIDIA drivers and CUDA toolkit for NVIDIA GPUs) + compatible with OS kernel; +- RDMA, SR-IOV and InfiniBand drivers and runtime for GPU direct communication (if applicable); +- Updated container runtime with GPU support; + +### Additional Requirements + +- Node identity for identifying and authenticating the node to cluster control plane; +- CNI plugin binaries and configurations; +- Node-problem-detector; +- Node local DNS caching; +- Background auto-repair agent; +- Pre-cached container images for critical system components; +- Support for adding optional feature layers & customizations during node image + baking or bootstrapping process; +- In some environments, pre-built VHD images might not be available. In such + cases, the node bootstrapping process should also handle the initial OS image baking and provisioning to ensure a consistent baseline environment. 
+- In some environments, the node might have limited outbound connectivity
+  (e.g., no direct access to public internet). In such cases, the node bootstrapping process should also handle pulling necessary components
+  through proxy or fallback endpoints.
+
+## Node Lifecycle Operations
+
+This section describes lifecycle activities across heterogeneous environments.
+Each operation defines:
+
+- **Inputs**: what information/config is required
+- **Actions**: what the platform does
+- **Expected behaviors**: node and cluster-level outcomes
+- **Failure handling**: what happens when things go wrong
+
+### Node VHD Image Baking
+
+**Purpose**: Produce a base node image (VHD or similar) that satisfies baseline
+requirements and can be instantiated consistently across environments.
+
+**Inputs**:
+
+- Base OS distribution and version (e.g., Ubuntu 24.04)
+- System configurations
+- Versions of container runtime, kubelet, and other components
+- Optional feature layers (e.g., GPU drivers)
+
+**Actions**:
+
+- Install and configure system settings & tunings
+- Install container runtime and Kubernetes components
+- **Leave out**: cluster-specific configurations / credentials
+
+**Expected behaviors**:
+
+- Produced image is **immutable** and **reproducible** given the same inputs.
+- Sources for all installed components **MUST** be pinned with qualified versions
+  and checksums for traceability and security.
+- Every baking step fully completes without partial failures.
+- Image is able to boot successfully and reach a "ready-to-bootstrap" state.
+- GPU images boot with drivers loaded.
+
+**Failure handling**:
+
+- Build pipeline produces actionable error messages
+- Failed builds do not produce or overwrite existing images
+
+### Node Bootstrapping
+
+**Purpose**: Turn a newly created machine instance into a functional Kubernetes
+node that can join the cluster and serve workloads.
+ +**Inputs**: + +- Cluster endpoint (API server URL, CA bundle) +- Kubelet bootstrap credentials +- Node configuration (e.g., kubelet config, runtime settings, node labels/taints) +- Environment-specific instance metadata (node name, region/zone). Can be + exposed later via cloud provider. + +**Actions**: + +- Ensure network & container runtime are ready +- Render kubelet configuration and start kubelet +- Kubelet performs TLS bootstrapping to obtain node credentials and join the + cluster +- Deploy and enable per node workloads + +**Expected behaviors**: + +- Node becomes `Ready` within a target SLA +- Node labels/taints are applied correctly +- Node reports correct capacity/allocatable resources, including GPU + if applicable +- Bootstrap process is **idempotent** and can be safely re-run on the same node + for transient failures + +**Failure handling**: + +- Ability to detect and report failure details and kind of failure + (i.e., transient vs terminal) for better troubleshooting and remediation + +### Node Bootstrapping w/ Baking + +**Purpose**: In environments without pre-baked images, the bootstrapping process should also handle the initial image baking and provisioning to ensure a consistent baseline environment. + +**Inputs**: + +- Same as Node VHD Image Baking and Node Bootstrapping +- (Optional) fallback/alternative endpoints for pulling necessary components in + environments with limited outbound connectivity + +**Actions**: + +- Perform image baking steps as described in Node VHD Image Baking +- Proceed with bootstrapping steps as described in Node Bootstrapping + +**Expected behaviors**: + +- All expected behaviors from both Node VHD Image Baking and Node Bootstrapping +- In addition, the process should be resilient to transient failures both phases + and support **idempotent** retries. 
+
+**Failure handling**:
+
+- All failure handling mechanisms from both Node VHD Image Baking and Node Bootstrapping
+
+### Node Rebooting & Repairing
+
+_TODO_: This part needs more work and discussions
+
+**Purpose**: Handle planned and unplanned node reboots and repairs while
+maintaining node health and minimizing disruption to workloads.
+
+**Node Rebooting**
+
+- Inputs: N/A
+- Expected behaviors:
+  * Node is cordoned/drained before planned reboot
+  * Node becomes `Ready` within a target SLA after reboot
+
+**Node Repairing**
+
+- Inputs: N/A
+- Expected behaviors:
+  * Monitoring components detect node issues and trigger repair actions
+  * Impacted services are restarted
+
+**Failure handling**:
+
+- If node fails to recover within a defined SLA, it should be marked as
+  unhealthy and trigger replacement workflow.
+- In case of repair failures, exponential backoff retries should be attempted;
+  errors should be exposed for troubleshooting and alerting.
+
+### Node Components Version Upgrades
+
+_TODO_: This part needs more details and breakdown designs
+
+**Purpose**: Upgrade node components to newer versions.
+ +**Inputs**: + +- Target versions for components +- Upgrade strategy (e.g., in-place vs replacement) + +**Actions**: + +- Cordon/drain node to evict workloads +- In-place upgrade: + * Install newer versions of components/configurations + * Restart necessary services + * Verify node health and functionality + * Uncordon node +- Replacement upgrade: + * Deprovision existing node and underlying resources + * Provision new node with updated image or configurations + * Join new node to cluster and verify health + +**Expected behaviors**: + +- Node is reporting expected versions for components after upgrade +- In-place upgrade process is idempotent and can be safely retried in case of + transient failures + +**Failure handling**: + +- Failures should be reported for troubleshooting and alerting +- In-place upgrade failures should not leave node open to scheduling. + Provide rollback if possible or recommend node replacement otherwise. + +### Node Re-imaging + +**Purpose**: Re-image a node to restore it to a known good state, either for +recovering from failures or applying updates. + +**Inputs**: + +- Same as Node Bootstrapping, plus: +- Target node and target node image + +**Actions**: + +- Cordon and drain the node to evict workloads +- Re-image the underlying machine instance with the target image +- Perform bootstrapping steps to rejoin the cluster + +**Expected behaviors**: + +- Re-image results in a clean, baseline-compliant host state. +- Node returns to `Ready` state within a target SLA. +- Node identity (e.g., name) is preserved after re-imaging. +- Re-image is idempotent and can be safely retried in case of transient failures. + +**Failure handling**: + +- Re-image failures should be reported for troubleshooting and alerting +- Re-image failed node should not be left in schedulable state. + +### Node Deletion + +**Purpose**: Remove a node from the cluster intentionally, either for +scaling down, decommissioning, or drifting replacement. 
+
+**Inputs**: node name
+
+**Actions**:
+
+- Cordon and drain the node to evict workloads if not forced deletion
+- Delete node object from cluster
+- Deprovision underlying compute / network resources if applicable
+
+**Expected behaviors**:
+
+- Node is gracefully removed from cluster and workloads are rescheduled
+- No orphaned resources are left behind
+
+**Failure handling**:
+
+- PDB violations or other issues preventing eviction should be reported clearly
+- Infrastructure resource cleanup failures should be retried and alerted
+  if not successful after SLA
\ No newline at end of file

From 17991a8cfe07d4599fecdb287019d188d669f22f Mon Sep 17 00:00:00 2001
From: hbc
Date: Wed, 11 Feb 2026 23:27:18 -0800
Subject: [PATCH 6/8] doc: refine wordings

---
 docs/node-env.md | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/docs/node-env.md b/docs/node-env.md
index 8dc3c15..89d3690 100644
--- a/docs/node-env.md
+++ b/docs/node-env.md
@@ -21,15 +21,22 @@ the expected behaviors for key lifecycle operations.
 - We will limit the support scope to Linux-based nodes and focus on Ubuntu
   distro for now. This is because Ubuntu is the widely and commonly available
   Linux distribution across the target environments.
+- Credential management (bootstrap token rotation, CA renewal, etc) is out of scope
+  for this document, but will be handled by the operations described below.
+- Extra security hardening and compliance requirements are out of scope for this document,
+  but can be added as optional layers on top of the baseline environment in the future.
+- Detailed GPU device plugin requirements and enablement strategies will be addressed in
+  a separate document.
## Baseline Environment Requirements ### CPU Only Nodes - A Linux-based OS with `systemd` init system; -- Modern Linux kernel (currently LTS or supported release) enabled with cgroup v2; +- Modern Linux kernel (currently LTS or supported release, minimum 5.19) enabled + with cgroup v2, namespaces, overlayfs, eBPF etc for container support. - Swap disabled; -- Syslog with rotation configured; +- System level logging enabled with rotation configured; - Time synchronization configured; - Proper host level DNS setup; - Outbound connectivity to cluster control plane endpoint; @@ -64,10 +71,11 @@ the expected behaviors for key lifecycle operations. - Support for adding optional feature layers & customizations during node image baking or bootstrapping process; - In some environments, pre-built VHD images might not be available. In such - cases, the node bootstrapping process should also handle the initial OS image baking and provisioning to ensure a consistent baseline environment. + cases, the node bootstrapping process should also handle the initial OS image + baking and provisioning to ensure a consistent baseline environment. - In some environments, the node might have limited outbound connectivity - (e.g., no direct access to public internet). In such cases, the node bootstrapping process should also handle pulling necessary components - through proxy or fallback endpoints. + (e.g., no direct access to public internet). In such cases, the node bootstrapping + process should also handle pulling necessary components through proxy or fallback endpoints. ## Node Lifecycle Operations @@ -148,7 +156,9 @@ node that can join the cluster and serve workloads. ### Node Bootstrapping w/ Baking -**Purpose**: In environments without pre-baked images, the bootstrapping process should also handle the initial image baking and provisioning to ensure a consistent baseline environment. 
+**Purpose**: In environments without pre-baked images, the bootstrapping process +should also handle the initial image baking and provisioning to ensure a +consistent baseline environment. **Inputs**: @@ -180,14 +190,14 @@ maintaining node health and minimizing disruption to workloads. **Node Rebooting** -- Inputs: N/A +- Inputs: node name and reboot type (planned vs unplanned) - Expected behaviors: * Node is cordoned/drained before planned reboot * Node becomes `Ready` within a target SLA after reboot **Node Repairing** -- Inputs: N/A +- Inputs: node name and repair category - Expected behaviors: * Monitoring components detect node issues and trigger repair actions * Impacted services are being restarted @@ -203,7 +213,8 @@ maintaining node health and minimizing disruption to workloads. _TODO_: This part needs more details and breakdown designs -**Purpose**: Upgrade on node components to newer versions. +**Purpose**: Upgrade on node components (kubelet, container runtime, +CNI plugins) to newer versions. **Inputs**: @@ -266,7 +277,7 @@ recovering from failures or applying updates. ### Node Deletion **Purpose**: Remove a node from the cluster intentionally, either for -scaling down, decommissioning, or drifting replacement. +scaling down, decommissioning, or drift replacement. **Inputs**: node name From 3e0cb10805ea1440a4ff05bc7fb7c52a9677955c Mon Sep 17 00:00:00 2001 From: hbc Date: Wed, 11 Feb 2026 23:45:57 -0800 Subject: [PATCH 7/8] doc: wording fix --- docs/node-env.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/node-env.md b/docs/node-env.md index 89d3690..484763c 100644 --- a/docs/node-env.md +++ b/docs/node-env.md @@ -21,8 +21,8 @@ the expected behaviors for key lifecycle operations. - We will limit the support scope to Linux-based nodes and focus on Ubuntu distro for now. This is because Ubuntu is the widely and commonly available Linux distribution across the target environments. 
-- Credential management (bootstrap token rotation, CA renewal, etc) is out of scope
-  for this document, but will be handled by the operations described below.
+- Credential management (bootstrap token distribution & rotation, CA renewal, etc)
+  is out of scope for this document, but will be handled by the operations described below.
 - Extra security hardening and compliance requirements are out of scope for this document,
   but can be added as optional layers on top of the baseline environment in the future.
 - Detailed GPU device plugin requirements and enablement strategies will be addressed in

From 5fb712b411be6c68bf466178957c22fa9e4d8da9 Mon Sep 17 00:00:00 2001
From: hbc
Date: Thu, 12 Feb 2026 10:33:39 -0800
Subject: [PATCH 8/8] doc: feedbacks

---
 docs/node-env.md | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/docs/node-env.md b/docs/node-env.md
index 484763c..bc1f1f9 100644
--- a/docs/node-env.md
+++ b/docs/node-env.md
@@ -40,17 +40,20 @@ the expected behaviors for key lifecycle operations.
- Time synchronization configured; - Proper host level DNS setup; - Outbound connectivity to cluster control plane endpoint; -- Container runtime: +- Container runtime components: * `containerd` w/ 2.0+ version; * `runc` - Kubernetes components: * `kubelet` matching with the target worker node version; - * Control plane public CA certificate(s); - * TLS bootstrap configurations; * Other cloud provider binaries; - NFTables / IPtables installed for Kubernetes network policies; - Network forward, IP masquerade and bridge settings configured for Kubernetes networking; - Support tools / binaries (e.g., `curl`, `ping`, etc) for diagnostics and troubleshooting; +- Configurations: + * Standard container runtime configurations layout on the host; + * Standard Kubernetes node configurations layout on the host; + * Control plane public CA certificate(s); + * TLS bootstrap configurations; ### GPU-Capable Nodes @@ -58,7 +61,8 @@ the expected behaviors for key lifecycle operations. - GPU drivers and runtime (e.g. NVIDIA drivers and CUDA toolkit for NVIDIA GPUs) compatible with OS kernel; - RDMA, SR-IOV and InfiniBand drivers and runtime for GPU direct communication (if applicable); -- Updated container runtime with GPU support; +- Configurations: + * Updated container runtime configurations with support for GPU drivers and runtimes; ### Additional Requirements @@ -66,6 +70,7 @@ the expected behaviors for key lifecycle operations. - CNI plugin binaries and configurations; - Node-problem-detector; - Node local DNS caching; +- VPN components for cross region/cloud connectivity; - Background auto-repair agent; - Pre-cached container images for critical system components; - Support for adding optional feature layers & customizations during node image @@ -107,13 +112,19 @@ requirements and can be instantiated consistently across environments. **Expected behaviors**: -- Produced image is **immutable** and **reproducible** giving the same inputs. 
+- Produced image is **immutable**[^1] and **reproducible**[^2] given the same inputs.
 - Sources for all installed components **MUST** be pinned with qualified versions
   and checksums for traceability and security.
 - Every baking step fully completes without partial failures.
 - Image is able to boot successfully and reach a "ready-to-bootstrap" state.
 - GPU image boot with drivers loaded.
 
+[^1]: Immutable means once the image is built and published, it should not be modified.
+  Any updates or changes should trigger a new image build with a new version/tag.
+
+[^2]: Reproducible means given the same inputs and build process, the output image
+  should be identical in its installed components and configuration setup.
+
 **Failure handling**:
 
 - Build pipeline produces actionable error messages
@@ -127,10 +138,11 @@ node that can join the cluster and serve workloads.
 
 **Inputs**:
 
 - Cluster endpoint (API server URL, CA bundle)
-- Kubelet bootstrap credentials
+- Kubelet bootstrap credentials (node identity credentials)
 - Node configuration (e.g., kubelet config, runtime settings, node labels/taints)
 - Environment-specific instance metadata (node name, region/zone). Can be
   exposed later via cloud provider.
+- VPN configurations for cross region/cloud connectivity if applicable
 
 **Actions**: