From 789a6d15a691abaefe2720f2ff6639da887eb746 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 28 Feb 2026 05:53:34 +0000 Subject: [PATCH 1/6] Initial plan From 5dd892a8fcf9707bc2ce3d071cbc85e0ae430df3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 28 Feb 2026 06:03:51 +0000 Subject: [PATCH 2/6] Add E2E test cases for apply -f usage with kubeadm join flow Co-authored-by: bcho <1975118+bcho@users.noreply.github.com> --- hack/e2e/README.md | 39 ++++-- hack/e2e/infra/main.bicep | 88 +++++++++++++- hack/e2e/lib/cleanup.sh | 24 ++-- hack/e2e/lib/infra.sh | 20 ++- hack/e2e/lib/node-join.sh | 247 ++++++++++++++++++++++++++++++++++++-- hack/e2e/lib/validate.sh | 18 +-- hack/e2e/run.sh | 30 +++-- 7 files changed, 410 insertions(+), 56 deletions(-) diff --git a/hack/e2e/README.md b/hack/e2e/README.md index 2f1e418..7582b10 100644 --- a/hack/e2e/README.md +++ b/hack/e2e/README.md @@ -1,7 +1,8 @@ # AKS Flex Node E2E Tests -End-to-end tests that provision an AKS cluster and two Ubuntu VMs in Azure, join -them as flex nodes (one via MSI, one via bootstrap token), and run smoke tests. +End-to-end tests that provision an AKS cluster and three Ubuntu VMs in Azure, +join them as flex nodes (one via MSI, one via bootstrap token, one via kubeadm +join using `apply -f`), and run smoke tests. ## Prerequisites @@ -27,8 +28,8 @@ export E2E_LOCATION=westus2 make e2e ``` -This will build the agent binary, deploy infrastructure via Bicep, join both -nodes, run validations, collect logs, and tear everything down. +This will build the agent binary, deploy infrastructure via Bicep, join all +three nodes, run validations, collect logs, and tear everything down. ## Commands @@ -38,10 +39,11 @@ omitted it defaults to `all`. | Command | Description | |---------|-------------| | `all` | Full flow: build, infra, join, validate, cleanup (default) | -| `infra` | Deploy AKS cluster + 2 VMs via Bicep | -| `join` | Join both nodes to the cluster | +| `infra` | Deploy AKS cluster + 3 VMs via Bicep | +| `join` | Join all nodes to the cluster | | `join-msi` | Join only the MSI-authenticated node | | `join-token` | Join only the bootstrap-token node | +| `join-kubeadm` | Join only the kubeadm node (`apply -f` with `KubeadmNodeJoin`) | | `validate` | Verify nodes joined and run smoke tests | | `smoke` | Run smoke tests only (nginx pods on flex nodes) | | `logs` | Collect logs from VMs | @@ -72,6 +74,21 @@ Additional environment variables: | `AZURE_SUBSCRIPTION_ID` | (auto-detected) | Azure subscription | | `AZURE_TENANT_ID` | (auto-detected) | Azure tenant | +## Node Join Modes + +The E2E suite tests three node join methods: + +| VM | Auth Mode | Join Method | +|----|-----------|-------------| +| `vm-e2e-msi-*` | Managed Identity (MSI) | `aks-flex-node agent --config config.json` | +| `vm-e2e-token-*` | Bootstrap Token | `aks-flex-node agent --config config.json` | +| `vm-e2e-kubeadm-*` | Bootstrap Token | `aks-flex-node apply -f kubeadm-join.json` | + +The kubeadm VM uses the `apply -f` command with a JSON action file that +contains a sequence of component actions (configure OS, download CRI/kube/CNI +binaries, start containerd, then `KubeadmNodeJoin`) to join the cluster using +the kubeadm join flow. + ## Iterative Development The subcommands make it easy to deploy infrastructure once and iterate on the @@ -84,6 +101,7 @@ join or validation steps without re-provisioning every time. # Iterate on the join logic ./hack/e2e/run.sh join-msi ./hack/e2e/run.sh join-token +./hack/e2e/run.sh join-kubeadm # Run validation ./hack/e2e/run.sh validate @@ -114,11 +132,11 @@ make e2e-cleanup # Tear down resources hack/e2e/ run.sh Main entry point / orchestrator infra/ - main.bicep Bicep template (AKS + VNet + NSG + 2 VMs + role assignments) + main.bicep Bicep template (AKS + VNet + NSG + 3 VMs + role assignments) lib/ common.sh Logging, prereqs, config, state management, SSH helpers infra.sh Bicep deployment, output extraction, kubeconfig fetch - node-join.sh MSI and token node join logic + node-join.sh MSI, token, and kubeadm node join logic validate.sh Node-ready checks and smoke tests (nginx pods) cleanup.sh Log collection and Azure resource teardown ``` @@ -135,7 +153,10 @@ previous one left off. Use `run.sh status` to inspect it. your SSH key is available (defaults to `~/.ssh/id_rsa.pub`). Check the state file for the correct VM public IPs with `run.sh status`. - **Node not joining**: Run `run.sh logs` to pull `journalctl` and agent logs - from both VMs. Logs are saved to `$E2E_WORK_DIR/logs/`. + from all VMs. Logs are saved to `$E2E_WORK_DIR/logs/`. +- **Kubeadm join failures**: Check `kubeadm-agent-journal.log` and + `kubeadm-kubelet.log` in the logs directory. The `apply -f` approach runs + sequentially; each action step must succeed before the next one starts. - **Timeouts**: Adjust `E2E_SSH_WAIT_TIMEOUT`, `E2E_NODE_JOIN_TIMEOUT`, or `E2E_POD_READY_TIMEOUT` environment variables (in seconds). - **Leftover resources**: If a previous run didn't clean up, run diff --git a/hack/e2e/infra/main.bicep b/hack/e2e/infra/main.bicep index 06da42c..0073957 100644 --- a/hack/e2e/infra/main.bicep +++ b/hack/e2e/infra/main.bicep @@ -34,11 +34,12 @@ param tags object = {} // --------------------------------------------------------------------------- // Variables // --------------------------------------------------------------------------- -var clusterName = 'aks-e2e-${nameSuffix}' -var msiVmName = 'vm-e2e-msi-${nameSuffix}' -var tokenVmName = 'vm-e2e-token-${nameSuffix}' -var vnetName = 'vnet-e2e-${nameSuffix}' -var nsgName = 'nsg-e2e-${nameSuffix}' +var clusterName = 'aks-e2e-${nameSuffix}' +var msiVmName = 'vm-e2e-msi-${nameSuffix}' +var tokenVmName = 'vm-e2e-token-${nameSuffix}' +var kubeadmVmName = 'vm-e2e-kubeadm-${nameSuffix}' +var vnetName = 'vnet-e2e-${nameSuffix}' +var nsgName = 'nsg-e2e-${nameSuffix}' var subnetAksName = 'snet-aks' var subnetVmName = 'snet-vm' @@ -158,6 +159,16 @@ resource pipToken 'Microsoft.Network/publicIPAddresses@2023-11-01' = { } } +resource pipKubeadm 'Microsoft.Network/publicIPAddresses@2023-11-01' = { + name: '${kubeadmVmName}-pip' + location: location + tags: tags + sku: { name: 'Standard' } + properties: { + publicIPAllocationMethod: 'Static' + } +} + // --------------------------------------------------------------------------- // NICs // --------------------------------------------------------------------------- @@ -205,6 +216,28 @@ resource nicToken 'Microsoft.Network/networkInterfaces@2023-11-01' = { } } +resource nicKubeadm 'Microsoft.Network/networkInterfaces@2023-11-01' = { + name: '${kubeadmVmName}-nic' + location: location + tags: tags + properties: { + ipConfigurations: [ + { + name: 'ipconfig1' + properties: { + subnet: { + id: vnet.properties.subnets[1].id + } + publicIPAddress: { + id: pipKubeadm.id + } + privateIPAllocationMethod: 'Dynamic' + } + } + ] + } +} + // --------------------------------------------------------------------------- // VM: MSI (system-assigned managed identity) // --------------------------------------------------------------------------- @@ -292,6 +325,48 @@ resource vmToken 'Microsoft.Compute/virtualMachines@2024-03-01' = { } } +// --------------------------------------------------------------------------- +// VM: Kubeadm (no managed identity - uses apply -f with kubeadm join flow) +// --------------------------------------------------------------------------- +resource vmKubeadm 'Microsoft.Compute/virtualMachines@2024-03-01' = { + name: kubeadmVmName + location: location + tags: tags + properties: { + hardwareProfile: { vmSize: vmSize } + osProfile: { + computerName: kubeadmVmName + adminUsername: adminUsername + linuxConfiguration: { + disablePasswordAuthentication: true + ssh: { + publicKeys: [ + { + path: '/home/${adminUsername}/.ssh/authorized_keys' + keyData: sshPublicKey + } + ] + } + } + } + storageProfile: { + imageReference: { + publisher: 'Canonical' + offer: '0001-com-ubuntu-server-jammy' + sku: '22_04-lts-gen2' + version: 'latest' + } + osDisk: { + createOption: 'FromImage' + managedDisk: { storageAccountType: 'StandardSSD_LRS' } + } + } + networkProfile: { + networkInterfaces: [ { id: nicKubeadm.id } ] + } + } +} + // --------------------------------------------------------------------------- // Role assignments: grant MSI VM permissions on the AKS cluster // --------------------------------------------------------------------------- @@ -331,4 +406,7 @@ output msiVmPrincipalId string = vmMsi.identity.principalId output tokenVmName string = vmToken.name output tokenVmIp string = pipToken.properties.ipAddress +output kubeadmVmName string = vmKubeadm.name +output kubeadmVmIp string = pipKubeadm.properties.ipAddress + output adminUsername string = adminUsername diff --git a/hack/e2e/lib/cleanup.sh b/hack/e2e/lib/cleanup.sh index 2ed7676..16ecb3a 100755 --- a/hack/e2e/lib/cleanup.sh +++ b/hack/e2e/lib/cleanup.sh @@ -43,16 +43,17 @@ _collect_vm_logs() { } # --------------------------------------------------------------------------- -# collect_logs - Collect logs from both VMs +# collect_logs - Collect logs from all VMs # --------------------------------------------------------------------------- collect_logs() { log_section "Collecting Logs" mkdir -p "${E2E_LOG_DIR}" - local msi_vm_ip token_vm_ip + local msi_vm_ip token_vm_ip kubeadm_vm_ip msi_vm_ip="$(state_get msi_vm_ip)" token_vm_ip="$(state_get token_vm_ip)" + kubeadm_vm_ip="$(state_get kubeadm_vm_ip)" if [[ -n "${msi_vm_ip}" ]]; then _collect_vm_logs "${msi_vm_ip}" "msi" || true @@ -62,6 +63,10 @@ collect_logs() { _collect_vm_logs "${token_vm_ip}" "token" || true fi + if [[ -n "${kubeadm_vm_ip}" ]]; then + _collect_vm_logs "${kubeadm_vm_ip}" "kubeadm" || true + fi + # Also capture cluster-side info { echo "=== Nodes ===" @@ -91,11 +96,12 @@ cleanup() { return 0 fi - local resource_group cluster_name msi_vm_name token_vm_name + local resource_group cluster_name msi_vm_name token_vm_name kubeadm_vm_name resource_group="$(state_get resource_group)" cluster_name="$(state_get cluster_name)" msi_vm_name="$(state_get msi_vm_name)" token_vm_name="$(state_get token_vm_name)" + kubeadm_vm_name="$(state_get kubeadm_vm_name)" local deployment_name deployment_name="$(state_get deployment_name)" @@ -105,16 +111,20 @@ cleanup() { fi # Delete VMs first (faster than waiting for full RG delete) - log_info "[1/4] Deleting MSI VM: ${msi_vm_name}..." + log_info "[1/5] Deleting MSI VM: ${msi_vm_name}..." az vm delete --resource-group "${resource_group}" --name "${msi_vm_name}" \ --force-deletion yes --yes --no-wait 2>/dev/null || true - log_info "[2/4] Deleting Token VM: ${token_vm_name}..." + log_info "[2/5] Deleting Token VM: ${token_vm_name}..." az vm delete --resource-group "${resource_group}" --name "${token_vm_name}" \ --force-deletion yes --yes --no-wait 2>/dev/null || true + log_info "[3/5] Deleting Kubeadm VM: ${kubeadm_vm_name}..." + az vm delete --resource-group "${resource_group}" --name "${kubeadm_vm_name}" \ + --force-deletion yes --yes --no-wait 2>/dev/null || true + # Clean up leftover networking resources tied to our deployment - log_info "[3/4] Cleaning up networking resources..." + log_info "[4/5] Cleaning up networking resources..." local run_id="${GITHUB_RUN_ID:-}" if [[ -n "${run_id}" ]]; then for res_type in networkInterfaces publicIPAddresses networkSecurityGroups disks; do @@ -126,7 +136,7 @@ cleanup() { done fi - log_info "[4/4] Deleting AKS cluster: ${cluster_name}..." + log_info "[5/5] Deleting AKS cluster: ${cluster_name}..." az aks delete --resource-group "${resource_group}" --name "${cluster_name}" \ --yes --no-wait 2>/dev/null || true diff --git a/hack/e2e/lib/infra.sh b/hack/e2e/lib/infra.sh index 1ba134d..b30a7d6 100755 --- a/hack/e2e/lib/infra.sh +++ b/hack/e2e/lib/infra.sh @@ -105,7 +105,7 @@ infra_deploy() { -o json) local cluster_name cluster_id msi_vm_name msi_vm_ip msi_vm_principal_id - local token_vm_name token_vm_ip admin_username + local token_vm_name token_vm_ip kubeadm_vm_name kubeadm_vm_ip admin_username cluster_name=$(echo "${outputs}" | jq -r '.clusterName.value') cluster_id=$(echo "${outputs}" | jq -r '.clusterId.value') @@ -114,6 +114,8 @@ infra_deploy() { msi_vm_principal_id=$(echo "${outputs}" | jq -r '.msiVmPrincipalId.value') token_vm_name=$(echo "${outputs}" | jq -r '.tokenVmName.value') token_vm_ip=$(echo "${outputs}" | jq -r '.tokenVmIp.value') + kubeadm_vm_name=$(echo "${outputs}" | jq -r '.kubeadmVmName.value') + kubeadm_vm_ip=$(echo "${outputs}" | jq -r '.kubeadmVmIp.value') admin_username=$(echo "${outputs}" | jq -r '.adminUsername.value') # Persist to state @@ -124,6 +126,8 @@ infra_deploy() { state_set "msi_vm_principal_id" "${msi_vm_principal_id}" state_set "token_vm_name" "${token_vm_name}" state_set "token_vm_ip" "${token_vm_ip}" + state_set "kubeadm_vm_name" "${kubeadm_vm_name}" + state_set "kubeadm_vm_ip" "${kubeadm_vm_ip}" state_set "admin_username" "${admin_username}" state_set "resource_group" "${E2E_RESOURCE_GROUP}" state_set "location" "${E2E_LOCATION}" @@ -131,23 +135,27 @@ infra_deploy() { state_set "tenant_id" "${AZURE_TENANT_ID}" state_set "deployment_name" "${deployment_name}" - log_info "Cluster: ${cluster_name} (${cluster_id})" - log_info "MSI VM: ${msi_vm_name} @ ${msi_vm_ip}" - log_info "Token VM: ${token_vm_name} @ ${token_vm_ip}" + log_info "Cluster: ${cluster_name} (${cluster_id})" + log_info "MSI VM: ${msi_vm_name} @ ${msi_vm_ip}" + log_info "Token VM: ${token_vm_name} @ ${token_vm_ip}" + log_info "Kubeadm VM: ${kubeadm_vm_name} @ ${kubeadm_vm_ip}" # Get kubeconfig and extract cluster info infra_get_kubeconfig - # Wait for SSH on both VMs (in parallel) - log_info "Waiting for SSH on both VMs..." + # Wait for SSH on all VMs (in parallel) + log_info "Waiting for SSH on all VMs..." wait_for_ssh "${msi_vm_ip}" & local pid_msi=$! wait_for_ssh "${token_vm_ip}" & local pid_token=$! + wait_for_ssh "${kubeadm_vm_ip}" & + local pid_kubeadm=$! local ssh_failed=0 wait "${pid_msi}" || ssh_failed=1 wait "${pid_token}" || ssh_failed=1 + wait "${pid_kubeadm}" || ssh_failed=1 if [[ "${ssh_failed}" -eq 1 ]]; then log_error "One or more VMs not reachable via SSH" diff --git a/hack/e2e/lib/node-join.sh b/hack/e2e/lib/node-join.sh index f56ace3..3b1eb06 100755 --- a/hack/e2e/lib/node-join.sh +++ b/hack/e2e/lib/node-join.sh @@ -3,13 +3,15 @@ # hack/e2e/lib/node-join.sh - Bootstrap flex nodes into the AKS cluster # # Functions: -# node_join_msi - Install Azure CLI + MSI auth, deploy binary, run agent -# node_join_token - Create bootstrap token/RBAC, deploy binary, run agent -# node_join_all - Join both nodes (MSI first, then token) +# node_join_msi - Install Azure CLI + MSI auth, deploy binary, run agent +# node_join_token - Create bootstrap token/RBAC, deploy binary, run agent +# node_join_kubeadm - Create bootstrap token, deploy binary, run apply -f +# with a KubeadmNodeJoin action (kubeadm join flow) +# node_join_all - Join all nodes (MSI, token, and kubeadm) in parallel # # Each function: -# 1. Generates the appropriate config.json -# 2. SCPs the binary + config onto the VM +# 1. Generates the appropriate config / action file +# 2. SCPs the binary + config/action file onto the VM # 3. Starts the agent via systemd-run # 4. Waits for kubelet to report running # ============================================================================= @@ -309,15 +311,231 @@ EOF } # --------------------------------------------------------------------------- -# node_join_all - Join both nodes in parallel +# node_join_kubeadm - Join the Kubeadm VM using apply -f with KubeadmNodeJoin +# --------------------------------------------------------------------------- +node_join_kubeadm() { + log_section "Joining Kubeadm Node (apply -f)" + local start + start=$(timer_start) + + local vm_ip + vm_ip="$(state_get kubeadm_vm_ip)" + local server_url + server_url="$(state_get server_url)" + local ca_cert_data + ca_cert_data="$(state_get ca_cert_data)" + + # Step 1: Create bootstrap token & RBAC in the cluster + log_info "Creating bootstrap token and RBAC resources for kubeadm join..." + local token_id token_secret bootstrap_token expiration + + token_id="$(openssl rand -hex 3)" + token_secret="$(openssl rand -hex 8)" + bootstrap_token="${token_id}.${token_secret}" + + # Use a portable date command for expiration (24h from now) + if date --version &>/dev/null; then + # GNU date + expiration="$(date -u -d "+24 hours" +"%Y-%m-%dT%H:%M:%SZ")" + else + # BSD/macOS date + expiration="$(date -u -v+24H +"%Y-%m-%dT%H:%M:%SZ")" + fi + + log_info "Token ID: ${token_id} | Expires: ${expiration}" + + # Create the bootstrap token secret + kubectl apply -f - < "${action_file}" <&1 || true +fi +REMOTE + + log_success "Kubeadm node joined via apply -f in $(timer_elapsed "${start}")s" +} + +# --------------------------------------------------------------------------- +# node_join_all - Join all nodes in parallel # --------------------------------------------------------------------------- node_join_all() { - log_section "Joining Both Nodes (parallel)" + log_section "Joining All Nodes (parallel)" local start start=$(timer_start) - local msi_pid token_pid - local msi_exit=0 token_exit=0 + local msi_pid token_pid kubeadm_pid + local msi_exit=0 token_exit=0 kubeadm_exit=0 node_join_msi & msi_pid=$! @@ -325,8 +543,12 @@ node_join_all() { node_join_token & token_pid=$! + node_join_kubeadm & + kubeadm_pid=$! + wait "${msi_pid}" || msi_exit=$? wait "${token_pid}" || token_exit=$? + wait "${kubeadm_pid}" || kubeadm_exit=$? local duration duration=$(timer_elapsed "${start}") @@ -337,11 +559,14 @@ node_join_all() { if [[ "${token_exit}" -ne 0 ]]; then log_error "Token node join failed (exit ${token_exit})" fi + if [[ "${kubeadm_exit}" -ne 0 ]]; then + log_error "Kubeadm node join failed (exit ${kubeadm_exit})" + fi - if [[ "${msi_exit}" -ne 0 || "${token_exit}" -ne 0 ]]; then + if [[ "${msi_exit}" -ne 0 || "${token_exit}" -ne 0 || "${kubeadm_exit}" -ne 0 ]]; then log_error "Node joins failed (${duration}s)" return 1 fi - log_success "Both nodes joined in ${duration}s" + log_success "All nodes joined in ${duration}s" } diff --git a/hack/e2e/lib/validate.sh b/hack/e2e/lib/validate.sh index 5ee92b8..f40ff1b 100755 --- a/hack/e2e/lib/validate.sh +++ b/hack/e2e/lib/validate.sh @@ -4,9 +4,9 @@ # # Functions: # validate_node_joined - Wait for a specific node to appear in kubectl -# validate_all_nodes - Verify both MSI and token nodes joined +# validate_all_nodes - Verify MSI, token, and kubeadm nodes joined # smoke_test