diff --git a/hack/e2e/README.md b/hack/e2e/README.md index 2f1e418..1066529 100644 --- a/hack/e2e/README.md +++ b/hack/e2e/README.md @@ -1,7 +1,8 @@ # AKS Flex Node E2E Tests -End-to-end tests that provision an AKS cluster and two Ubuntu VMs in Azure, join -them as flex nodes (one via MSI, one via bootstrap token), and run smoke tests. +End-to-end tests that provision an AKS cluster and three Ubuntu VMs in Azure, +join them as flex nodes (one via MSI, one via bootstrap token, one via kubeadm +join using `apply -f`), and run smoke tests. ## Prerequisites @@ -27,8 +28,8 @@ export E2E_LOCATION=westus2 make e2e ``` -This will build the agent binary, deploy infrastructure via Bicep, join both -nodes, run validations, collect logs, and tear everything down. +This will build the agent binary, deploy infrastructure via Bicep, join all +three nodes, run validations, collect logs, and tear everything down. ## Commands @@ -38,10 +39,11 @@ omitted it defaults to `all`. | Command | Description | |---------|-------------| | `all` | Full flow: build, infra, join, validate, cleanup (default) | -| `infra` | Deploy AKS cluster + 2 VMs via Bicep | -| `join` | Join both nodes to the cluster | +| `infra` | Deploy AKS cluster + 3 VMs via Bicep | +| `join` | Join all nodes to the cluster | | `join-msi` | Join only the MSI-authenticated node | | `join-token` | Join only the bootstrap-token node | +| `join-kubeadm` | Join only the kubeadm node (`apply -f` with `KubeadmNodeJoin`) | | `validate` | Verify nodes joined and run smoke tests | | `smoke` | Run smoke tests only (nginx pods on flex nodes) | | `logs` | Collect logs from VMs | @@ -72,6 +74,21 @@ Additional environment variables: | `AZURE_SUBSCRIPTION_ID` | (auto-detected) | Azure subscription | | `AZURE_TENANT_ID` | (auto-detected) | Azure tenant | +## Node Join Modes + +The E2E suite tests three node join methods: + +| VM | Auth Mode | Join Method | +|----|-----------|-------------| +| `vm-e2e-msi-*` | Managed Identity (MSI) | 
`aks-flex-node agent --config config.json` | +| `vm-e2e-token-*` | Bootstrap Token | `aks-flex-node agent --config config.json` | +| `vm-e2e-kubeadm-*` | Bootstrap Token | `aks-flex-node apply -f kubeadm-join.json` | + +The kubeadm VM uses the `apply -f` command with a JSON action file that +contains a sequence of component actions (configure OS, download CRI/kube/CNI +binaries, start containerd, then `KubeadmNodeJoin`) to join the cluster using +the kubeadm join flow. + ## Iterative Development The subcommands make it easy to deploy infrastructure once and iterate on the @@ -84,6 +101,7 @@ join or validation steps without re-provisioning every time. # Iterate on the join logic ./hack/e2e/run.sh join-msi ./hack/e2e/run.sh join-token +./hack/e2e/run.sh join-kubeadm # Run validation ./hack/e2e/run.sh validate @@ -114,13 +132,16 @@ make e2e-cleanup # Tear down resources hack/e2e/ run.sh Main entry point / orchestrator infra/ - main.bicep Bicep template (AKS + VNet + NSG + 2 VMs + role assignments) + main.bicep Bicep template (AKS + VNet + NSG + 3 VMs + role assignments) lib/ - common.sh Logging, prereqs, config, state management, SSH helpers - infra.sh Bicep deployment, output extraction, kubeconfig fetch - node-join.sh MSI and token node join logic - validate.sh Node-ready checks and smoke tests (nginx pods) - cleanup.sh Log collection and Azure resource teardown + common.sh Logging, prereqs, config, state management, SSH helpers + infra.sh Bicep deployment, output extraction, kubeconfig fetch + node-join.sh Shared helper (_deploy_and_start_agent) + node_join_all orchestration + node-join-msi.sh MSI auth node join (node_join_msi) + node-join-token.sh Bootstrap token node join (node_join_token) + node-join-kubeadm.sh Kubeadm apply -f node join (node_join_kubeadm) + validate.sh Node-ready checks and smoke tests (nginx pods) + cleanup.sh Log collection and Azure resource teardown ``` ## State File @@ -135,7 +156,10 @@ previous one left off. 
Use `run.sh status` to inspect it. your SSH key is available (defaults to `~/.ssh/id_rsa.pub`). Check the state file for the correct VM public IPs with `run.sh status`. - **Node not joining**: Run `run.sh logs` to pull `journalctl` and agent logs - from both VMs. Logs are saved to `$E2E_WORK_DIR/logs/`. + from all VMs. Logs are saved to `$E2E_WORK_DIR/logs/`. +- **Kubeadm join failures**: Check `kubeadm-agent-journal.log` and + `kubeadm-kubelet.log` in the logs directory. The `apply -f` approach runs + sequentially; each action step must succeed before the next one starts. - **Timeouts**: Adjust `E2E_SSH_WAIT_TIMEOUT`, `E2E_NODE_JOIN_TIMEOUT`, or `E2E_POD_READY_TIMEOUT` environment variables (in seconds). - **Leftover resources**: If a previous run didn't clean up, run diff --git a/hack/e2e/infra/main.bicep b/hack/e2e/infra/main.bicep index 06da42c..491c7d7 100644 --- a/hack/e2e/infra/main.bicep +++ b/hack/e2e/infra/main.bicep @@ -5,8 +5,10 @@ // - AKS cluster (1-node control plane) // - VM with system-assigned managed identity (MSI auth mode) // - VM without managed identity (bootstrap token auth mode) +// - VM without managed identity (kubeadm apply -f auth mode) // -// Both VMs run Ubuntu 22.04 LTS, have public IPs, and allow SSH ingress. +// All flex-node VMs run Ubuntu 24.04 LTS, have public IPs, and allow SSH +// ingress. VM creation is delegated to the reusable modules/vm.bicep module. 
// ============================================================================= @description('Azure region for all resources.') @@ -34,11 +36,12 @@ param tags object = {} // --------------------------------------------------------------------------- // Variables // --------------------------------------------------------------------------- -var clusterName = 'aks-e2e-${nameSuffix}' -var msiVmName = 'vm-e2e-msi-${nameSuffix}' -var tokenVmName = 'vm-e2e-token-${nameSuffix}' -var vnetName = 'vnet-e2e-${nameSuffix}' -var nsgName = 'nsg-e2e-${nameSuffix}' +var clusterName = 'aks-e2e-${nameSuffix}' +var msiVmName = 'vm-e2e-msi-${nameSuffix}' +var tokenVmName = 'vm-e2e-token-${nameSuffix}' +var kubeadmVmName = 'vm-e2e-kubeadm-${nameSuffix}' +var vnetName = 'vnet-e2e-${nameSuffix}' +var nsgName = 'nsg-e2e-${nameSuffix}' var subnetAksName = 'snet-aks' var subnetVmName = 'snet-vm' @@ -136,159 +139,47 @@ resource aksCluster 'Microsoft.ContainerService/managedClusters@2024-01-01' = { } // --------------------------------------------------------------------------- -// Public IPs for VMs +// Flex-node VMs (via reusable module) // --------------------------------------------------------------------------- -resource pipMsi 'Microsoft.Network/publicIPAddresses@2023-11-01' = { - name: '${msiVmName}-pip' - location: location - tags: tags - sku: { name: 'Standard' } - properties: { - publicIPAllocationMethod: 'Static' - } -} - -resource pipToken 'Microsoft.Network/publicIPAddresses@2023-11-01' = { - name: '${tokenVmName}-pip' - location: location - tags: tags - sku: { name: 'Standard' } - properties: { - publicIPAllocationMethod: 'Static' - } -} - -// --------------------------------------------------------------------------- -// NICs -// --------------------------------------------------------------------------- -resource nicMsi 'Microsoft.Network/networkInterfaces@2023-11-01' = { - name: '${msiVmName}-nic' - location: location - tags: tags - properties: { - ipConfigurations: [ - { 
- name: 'ipconfig1' - properties: { - subnet: { - id: vnet.properties.subnets[1].id - } - publicIPAddress: { - id: pipMsi.id - } - privateIPAllocationMethod: 'Dynamic' - } - } - ] - } -} - -resource nicToken 'Microsoft.Network/networkInterfaces@2023-11-01' = { - name: '${tokenVmName}-nic' - location: location - tags: tags - properties: { - ipConfigurations: [ - { - name: 'ipconfig1' - properties: { - subnet: { - id: vnet.properties.subnets[1].id - } - publicIPAddress: { - id: pipToken.id - } - privateIPAllocationMethod: 'Dynamic' - } - } - ] +module vmMsi 'modules/vm.bicep' = { + name: 'deploy-vm-msi' + params: { + location: location + vmName: msiVmName + vmSize: vmSize + adminUsername: adminUsername + sshPublicKey: sshPublicKey + subnetId: vnet.properties.subnets[1].id + assignManagedIdentity: true + tags: tags } } -// --------------------------------------------------------------------------- -// VM: MSI (system-assigned managed identity) -// --------------------------------------------------------------------------- -resource vmMsi 'Microsoft.Compute/virtualMachines@2024-03-01' = { - name: msiVmName - location: location - tags: tags - identity: { - type: 'SystemAssigned' - } - properties: { - hardwareProfile: { vmSize: vmSize } - osProfile: { - computerName: msiVmName - adminUsername: adminUsername - linuxConfiguration: { - disablePasswordAuthentication: true - ssh: { - publicKeys: [ - { - path: '/home/${adminUsername}/.ssh/authorized_keys' - keyData: sshPublicKey - } - ] - } - } - } - storageProfile: { - imageReference: { - publisher: 'Canonical' - offer: '0001-com-ubuntu-server-jammy' - sku: '22_04-lts-gen2' - version: 'latest' - } - osDisk: { - createOption: 'FromImage' - managedDisk: { storageAccountType: 'StandardSSD_LRS' } - } - } - networkProfile: { - networkInterfaces: [ { id: nicMsi.id } ] - } +module vmToken 'modules/vm.bicep' = { + name: 'deploy-vm-token' + params: { + location: location + vmName: tokenVmName + vmSize: vmSize + adminUsername: 
adminUsername + sshPublicKey: sshPublicKey + subnetId: vnet.properties.subnets[1].id + assignManagedIdentity: false + tags: tags } } -// --------------------------------------------------------------------------- -// VM: Token (no managed identity) -// --------------------------------------------------------------------------- -resource vmToken 'Microsoft.Compute/virtualMachines@2024-03-01' = { - name: tokenVmName - location: location - tags: tags - properties: { - hardwareProfile: { vmSize: vmSize } - osProfile: { - computerName: tokenVmName - adminUsername: adminUsername - linuxConfiguration: { - disablePasswordAuthentication: true - ssh: { - publicKeys: [ - { - path: '/home/${adminUsername}/.ssh/authorized_keys' - keyData: sshPublicKey - } - ] - } - } - } - storageProfile: { - imageReference: { - publisher: 'Canonical' - offer: '0001-com-ubuntu-server-jammy' - sku: '22_04-lts-gen2' - version: 'latest' - } - osDisk: { - createOption: 'FromImage' - managedDisk: { storageAccountType: 'StandardSSD_LRS' } - } - } - networkProfile: { - networkInterfaces: [ { id: nicToken.id } ] - } +module vmKubeadm 'modules/vm.bicep' = { + name: 'deploy-vm-kubeadm' + params: { + location: location + vmName: kubeadmVmName + vmSize: vmSize + adminUsername: adminUsername + sshPublicKey: sshPublicKey + subnetId: vnet.properties.subnets[1].id + assignManagedIdentity: false + tags: tags } } @@ -297,10 +188,10 @@ resource vmToken 'Microsoft.Compute/virtualMachines@2024-03-01' = { // --------------------------------------------------------------------------- // Azure Kubernetes Service Cluster Admin Role resource roleClusterAdmin 'Microsoft.Authorization/roleAssignments@2022-04-01' = { - name: guid(aksCluster.id, vmMsi.id, 'aks-cluster-admin') + name: guid(aksCluster.id, msiVmName, 'aks-cluster-admin') scope: aksCluster properties: { - principalId: vmMsi.identity.principalId + principalId: vmMsi.outputs.principalId principalType: 'ServicePrincipal' roleDefinitionId: 
subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '0ab0b1a8-8aac-4efd-b8c2-3ee1fb270be8') } @@ -308,10 +199,10 @@ resource roleClusterAdmin 'Microsoft.Authorization/roleAssignments@2022-04-01' = // Azure Kubernetes Service RBAC Cluster Admin resource roleRbacAdmin 'Microsoft.Authorization/roleAssignments@2022-04-01' = { - name: guid(aksCluster.id, vmMsi.id, 'aks-rbac-cluster-admin') + name: guid(aksCluster.id, msiVmName, 'aks-rbac-cluster-admin') scope: aksCluster properties: { - principalId: vmMsi.identity.principalId + principalId: vmMsi.outputs.principalId principalType: 'ServicePrincipal' roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b1ff04bb-8a4e-4dc4-8eb5-8693973ce19b') } @@ -324,11 +215,15 @@ output clusterName string = aksCluster.name output clusterId string = aksCluster.id output clusterFqdn string = aksCluster.properties.fqdn -output msiVmName string = vmMsi.name -output msiVmIp string = pipMsi.properties.ipAddress -output msiVmPrincipalId string = vmMsi.identity.principalId +output msiVmName string = vmMsi.outputs.vmName +output msiVmIp string = vmMsi.outputs.publicIpAddress +output msiVmPrincipalId string = vmMsi.outputs.principalId -output tokenVmName string = vmToken.name -output tokenVmIp string = pipToken.properties.ipAddress +output tokenVmName string = vmToken.outputs.vmName +output tokenVmIp string = vmToken.outputs.publicIpAddress + +output kubeadmVmName string = vmKubeadm.outputs.vmName +output kubeadmVmIp string = vmKubeadm.outputs.publicIpAddress output adminUsername string = adminUsername + diff --git a/hack/e2e/infra/modules/vm.bicep b/hack/e2e/infra/modules/vm.bicep new file mode 100644 index 0000000..b1b9e56 --- /dev/null +++ b/hack/e2e/infra/modules/vm.bicep @@ -0,0 +1,135 @@ +// ============================================================================= +// modules/vm.bicep - Reusable Ubuntu flex-node VM module +// +// Creates a public IP, NIC, and Ubuntu VM in the given subnet. 
+// The VHD image defaults to Ubuntu 24.04 LTS (Noble) but can be overridden. +// ============================================================================= + +@description('Azure region for all resources.') +param location string + +@description('VM name (also used as prefix for NIC and public IP names).') +param vmName string + +@description('VM size.') +param vmSize string + +@description('Admin username.') +param adminUsername string + +@description('SSH public key.') +@secure() +param sshPublicKey string + +@description('Subnet resource ID to attach the NIC to.') +param subnetId string + +@description('Whether to assign a system-assigned managed identity to the VM.') +param assignManagedIdentity bool = false + +@description('Marketplace image publisher.') +param imagePublisher string = 'Canonical' + +@description('Marketplace image offer.') +param imageOffer string = 'ubuntu-24_04-lts' + +@description('Marketplace image SKU.') +param imageSku string = 'server' + +@description('Marketplace image version.') +param imageVersion string = 'latest' + +@description('Tags applied to all resources in this module.') +param tags object = {} + +// --------------------------------------------------------------------------- +// Public IP +// --------------------------------------------------------------------------- +resource pip 'Microsoft.Network/publicIPAddresses@2023-11-01' = { + name: '${vmName}-pip' + location: location + tags: tags + sku: { name: 'Standard' } + properties: { + publicIPAllocationMethod: 'Static' + } +} + +// --------------------------------------------------------------------------- +// NIC +// --------------------------------------------------------------------------- +resource nic 'Microsoft.Network/networkInterfaces@2023-11-01' = { + name: '${vmName}-nic' + location: location + tags: tags + properties: { + ipConfigurations: [ + { + name: 'ipconfig1' + properties: { + subnet: { + id: subnetId + } + publicIPAddress: { + id: pip.id + } + 
privateIPAllocationMethod: 'Dynamic' + } + } + ] + } +} + +// --------------------------------------------------------------------------- +// VM +// --------------------------------------------------------------------------- +resource vm 'Microsoft.Compute/virtualMachines@2024-03-01' = { + name: vmName + location: location + tags: tags + identity: assignManagedIdentity ? { + type: 'SystemAssigned' + } : { + type: 'None' + } + properties: { + hardwareProfile: { vmSize: vmSize } + osProfile: { + computerName: vmName + adminUsername: adminUsername + linuxConfiguration: { + disablePasswordAuthentication: true + ssh: { + publicKeys: [ + { + path: '/home/${adminUsername}/.ssh/authorized_keys' + keyData: sshPublicKey + } + ] + } + } + } + storageProfile: { + imageReference: { + publisher: imagePublisher + offer: imageOffer + sku: imageSku + version: imageVersion + } + osDisk: { + createOption: 'FromImage' + managedDisk: { storageAccountType: 'StandardSSD_LRS' } + } + } + networkProfile: { + networkInterfaces: [ { id: nic.id } ] + } + } +} + +// --------------------------------------------------------------------------- +// Outputs +// --------------------------------------------------------------------------- +output vmName string = vm.name +output publicIpAddress string = pip.properties.ipAddress +output principalId string = assignManagedIdentity ? 
vm.identity.principalId : '' diff --git a/hack/e2e/lib/cleanup.sh b/hack/e2e/lib/cleanup.sh index 2ed7676..16ecb3a 100755 --- a/hack/e2e/lib/cleanup.sh +++ b/hack/e2e/lib/cleanup.sh @@ -43,16 +43,17 @@ _collect_vm_logs() { } # --------------------------------------------------------------------------- -# collect_logs - Collect logs from both VMs +# collect_logs - Collect logs from all VMs # --------------------------------------------------------------------------- collect_logs() { log_section "Collecting Logs" mkdir -p "${E2E_LOG_DIR}" - local msi_vm_ip token_vm_ip + local msi_vm_ip token_vm_ip kubeadm_vm_ip msi_vm_ip="$(state_get msi_vm_ip)" token_vm_ip="$(state_get token_vm_ip)" + kubeadm_vm_ip="$(state_get kubeadm_vm_ip)" if [[ -n "${msi_vm_ip}" ]]; then _collect_vm_logs "${msi_vm_ip}" "msi" || true @@ -62,6 +63,10 @@ collect_logs() { _collect_vm_logs "${token_vm_ip}" "token" || true fi + if [[ -n "${kubeadm_vm_ip}" ]]; then + _collect_vm_logs "${kubeadm_vm_ip}" "kubeadm" || true + fi + # Also capture cluster-side info { echo "=== Nodes ===" @@ -91,11 +96,12 @@ cleanup() { return 0 fi - local resource_group cluster_name msi_vm_name token_vm_name + local resource_group cluster_name msi_vm_name token_vm_name kubeadm_vm_name resource_group="$(state_get resource_group)" cluster_name="$(state_get cluster_name)" msi_vm_name="$(state_get msi_vm_name)" token_vm_name="$(state_get token_vm_name)" + kubeadm_vm_name="$(state_get kubeadm_vm_name)" local deployment_name deployment_name="$(state_get deployment_name)" @@ -105,16 +111,20 @@ cleanup() { fi # Delete VMs first (faster than waiting for full RG delete) - log_info "[1/4] Deleting MSI VM: ${msi_vm_name}..." + log_info "[1/5] Deleting MSI VM: ${msi_vm_name}..." az vm delete --resource-group "${resource_group}" --name "${msi_vm_name}" \ --force-deletion yes --yes --no-wait 2>/dev/null || true - log_info "[2/4] Deleting Token VM: ${token_vm_name}..." + log_info "[2/5] Deleting Token VM: ${token_vm_name}..." 
az vm delete --resource-group "${resource_group}" --name "${token_vm_name}" \ --force-deletion yes --yes --no-wait 2>/dev/null || true + log_info "[3/5] Deleting Kubeadm VM: ${kubeadm_vm_name}..." + az vm delete --resource-group "${resource_group}" --name "${kubeadm_vm_name}" \ + --force-deletion yes --yes --no-wait 2>/dev/null || true + # Clean up leftover networking resources tied to our deployment - log_info "[3/4] Cleaning up networking resources..." + log_info "[4/5] Cleaning up networking resources..." local run_id="${GITHUB_RUN_ID:-}" if [[ -n "${run_id}" ]]; then for res_type in networkInterfaces publicIPAddresses networkSecurityGroups disks; do @@ -126,7 +136,7 @@ cleanup() { done fi - log_info "[4/4] Deleting AKS cluster: ${cluster_name}..." + log_info "[5/5] Deleting AKS cluster: ${cluster_name}..." az aks delete --resource-group "${resource_group}" --name "${cluster_name}" \ --yes --no-wait 2>/dev/null || true diff --git a/hack/e2e/lib/infra.sh b/hack/e2e/lib/infra.sh index 1ba134d..b30a7d6 100755 --- a/hack/e2e/lib/infra.sh +++ b/hack/e2e/lib/infra.sh @@ -105,7 +105,7 @@ infra_deploy() { -o json) local cluster_name cluster_id msi_vm_name msi_vm_ip msi_vm_principal_id - local token_vm_name token_vm_ip admin_username + local token_vm_name token_vm_ip kubeadm_vm_name kubeadm_vm_ip admin_username cluster_name=$(echo "${outputs}" | jq -r '.clusterName.value') cluster_id=$(echo "${outputs}" | jq -r '.clusterId.value') @@ -114,6 +114,8 @@ infra_deploy() { msi_vm_principal_id=$(echo "${outputs}" | jq -r '.msiVmPrincipalId.value') token_vm_name=$(echo "${outputs}" | jq -r '.tokenVmName.value') token_vm_ip=$(echo "${outputs}" | jq -r '.tokenVmIp.value') + kubeadm_vm_name=$(echo "${outputs}" | jq -r '.kubeadmVmName.value') + kubeadm_vm_ip=$(echo "${outputs}" | jq -r '.kubeadmVmIp.value') admin_username=$(echo "${outputs}" | jq -r '.adminUsername.value') # Persist to state @@ -124,6 +126,8 @@ infra_deploy() { state_set "msi_vm_principal_id" 
"${msi_vm_principal_id}" state_set "token_vm_name" "${token_vm_name}" state_set "token_vm_ip" "${token_vm_ip}" + state_set "kubeadm_vm_name" "${kubeadm_vm_name}" + state_set "kubeadm_vm_ip" "${kubeadm_vm_ip}" state_set "admin_username" "${admin_username}" state_set "resource_group" "${E2E_RESOURCE_GROUP}" state_set "location" "${E2E_LOCATION}" @@ -131,23 +135,27 @@ infra_deploy() { state_set "tenant_id" "${AZURE_TENANT_ID}" state_set "deployment_name" "${deployment_name}" - log_info "Cluster: ${cluster_name} (${cluster_id})" - log_info "MSI VM: ${msi_vm_name} @ ${msi_vm_ip}" - log_info "Token VM: ${token_vm_name} @ ${token_vm_ip}" + log_info "Cluster: ${cluster_name} (${cluster_id})" + log_info "MSI VM: ${msi_vm_name} @ ${msi_vm_ip}" + log_info "Token VM: ${token_vm_name} @ ${token_vm_ip}" + log_info "Kubeadm VM: ${kubeadm_vm_name} @ ${kubeadm_vm_ip}" # Get kubeconfig and extract cluster info infra_get_kubeconfig - # Wait for SSH on both VMs (in parallel) - log_info "Waiting for SSH on both VMs..." + # Wait for SSH on all VMs (in parallel) + log_info "Waiting for SSH on all VMs..." wait_for_ssh "${msi_vm_ip}" & local pid_msi=$! wait_for_ssh "${token_vm_ip}" & local pid_token=$! + wait_for_ssh "${kubeadm_vm_ip}" & + local pid_kubeadm=$! 
local ssh_failed=0 wait "${pid_msi}" || ssh_failed=1 wait "${pid_token}" || ssh_failed=1 + wait "${pid_kubeadm}" || ssh_failed=1 if [[ "${ssh_failed}" -eq 1 ]]; then log_error "One or more VMs not reachable via SSH" diff --git a/hack/e2e/lib/node-join-kubeadm.sh b/hack/e2e/lib/node-join-kubeadm.sh new file mode 100644 index 0000000..23eae1e --- /dev/null +++ b/hack/e2e/lib/node-join-kubeadm.sh @@ -0,0 +1,353 @@ +#!/usr/bin/env bash +# ============================================================================= +# hack/e2e/lib/node-join-kubeadm.sh - Join an AKS flex node using the kubeadm join flow +# +# Functions: +# node_join_kubeadm - Create bootstrap token, generate action file, +# run aks-flex-node apply -f (KubeadmNodeJoin) +# ============================================================================= +set -euo pipefail + +[[ -n "${_E2E_NODE_JOIN_KUBEADM_LOADED:-}" ]] && return 0 +readonly _E2E_NODE_JOIN_KUBEADM_LOADED=1 + +# shellcheck disable=SC1091 +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +# --------------------------------------------------------------------------- +# node_join_kubeadm - Join the Kubeadm VM using apply -f with KubeadmNodeJoin +# --------------------------------------------------------------------------- +node_join_kubeadm() { + log_section "Joining Kubeadm Node (apply -f)" + local start + start=$(timer_start) + + local vm_ip + vm_ip="$(state_get kubeadm_vm_ip)" + local server_url + server_url="$(state_get server_url)" + local ca_cert_data + ca_cert_data="$(state_get ca_cert_data)" + + # Step 1: Create bootstrap token & RBAC in the cluster + log_info "Creating bootstrap token and RBAC resources for kubeadm join..." 
+ local token_id token_secret bootstrap_token expiration + + token_id="$(openssl rand -hex 3)" + token_secret="$(openssl rand -hex 8)" + bootstrap_token="${token_id}.${token_secret}" + + # Use a portable date command for expiration (24h from now) + if date --version &>/dev/null; then + # GNU date + expiration="$(date -u -d "+24 hours" +"%Y-%m-%dT%H:%M:%SZ")" + else + # BSD/macOS date + expiration="$(date -u -v+24H +"%Y-%m-%dT%H:%M:%SZ")" + fi + + log_info "Token ID: ${token_id} | Expires: ${expiration}" + + # Create the bootstrap token secret + kubectl apply -f - < "${action_file}" <&1 | sudo tee /var/log/aks-flex-node/aks-flex-node.log + +if systemctl is-active --quiet kubelet; then + echo "kubelet is running" +else + echo "kubelet status:" + systemctl status kubelet --no-pager -l 2>&1 || true +fi +REMOTE + + log_success "Kubeadm node joined via apply -f in $(timer_elapsed "${start}")s" +} diff --git a/hack/e2e/lib/node-join-msi.sh b/hack/e2e/lib/node-join-msi.sh new file mode 100644 index 0000000..846d42e --- /dev/null +++ b/hack/e2e/lib/node-join-msi.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash +# ============================================================================= +# hack/e2e/lib/node-join-msi.sh - Join an AKS flex node using MSI auth +# +# Functions: +# node_join_msi - Install Azure CLI + MSI auth, deploy binary, run agent +# ============================================================================= +set -euo pipefail + +[[ -n "${_E2E_NODE_JOIN_MSI_LOADED:-}" ]] && return 0 +readonly _E2E_NODE_JOIN_MSI_LOADED=1 + +# shellcheck disable=SC1091 +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +# --------------------------------------------------------------------------- +# node_join_msi - Join the MSI VM +# --------------------------------------------------------------------------- +node_join_msi() { + log_section "Joining MSI Node" + local start + start=$(timer_start) + + local vm_ip + vm_ip="$(state_get msi_vm_ip)" + local cluster_id + 
cluster_id="$(state_get cluster_id)" + local subscription_id + subscription_id="$(state_get subscription_id)" + local tenant_id + tenant_id="$(state_get tenant_id)" + local location + location="$(state_get location)" + local server_url + server_url="$(state_get server_url)" + local ca_cert_data + ca_cert_data="$(state_get ca_cert_data)" + + # Step 1: Install Azure CLI on VM and log in with MSI + log_info "Installing Azure CLI on MSI VM (${vm_ip})..." + remote_exec "${vm_ip}" 'bash -s' <<'AZURECLI' +set -euo pipefail + +MAX_RETRIES=5 +RETRY_DELAY=15 +for attempt in $(seq 1 $MAX_RETRIES); do + while sudo fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do + sleep 5 + done + + if sudo apt-get update -qq && curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash; then + echo "Azure CLI installed" + break + fi + + if [ "$attempt" -lt "$MAX_RETRIES" ]; then + sudo dpkg --configure -a 2>/dev/null || true + sleep $RETRY_DELAY + else + echo "Azure CLI installation failed after ${MAX_RETRIES} attempts" + exit 1 + fi +done + +az login --identity --output none +sudo az login --identity --output none +echo "Azure CLI authenticated with managed identity" +AZURECLI + + # Step 2: Generate MSI config + local config_file="${E2E_WORK_DIR}/config-msi.json" + cat > "${config_file}" </dev/null 2>&1; then + # GNU date + expiration="$(date -u -d "+24 hours" +"%Y-%m-%dT%H:%M:%SZ")" + else + # BSD/macOS date + expiration="$(date -u -v+24H +"%Y-%m-%dT%H:%M:%SZ")" + fi + + log_info "Token ID: ${token_id} | Expires: ${expiration}" + + # Create the bootstrap token secret + kubectl apply -f - < "${config_file}" </dev/null 2>&1; do - sleep 5 - done - - if sudo apt-get update -qq && curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash; then - echo "Azure CLI installed" - break - fi - - if [ "$attempt" -lt "$MAX_RETRIES" ]; then - sudo dpkg --configure -a 2>/dev/null || true - sleep $RETRY_DELAY - else - echo "Azure CLI installation failed after ${MAX_RETRIES} attempts" - exit 1 - fi -done - -az 
login --identity --output none -sudo az login --identity --output none -echo "Azure CLI authenticated with managed identity" -AZURECLI - - # Step 2: Generate MSI config - local config_file="${E2E_WORK_DIR}/config-msi.json" - cat > "${config_file}" </dev/null 2>&1; then - # GNU date - expiration="$(date -u -d "+24 hours" +"%Y-%m-%dT%H:%M:%SZ")" - else - # BSD/macOS date - expiration="$(date -u -v+24H +"%Y-%m-%dT%H:%M:%SZ")" - fi - - log_info "Token ID: ${token_id} | Expires: ${expiration}" - - # Create the bootstrap token secret - kubectl apply -f - < "${config_file}" < - Wait for a specific node to appear in kubectl -# validate_all_nodes - Verify both MSI and token nodes joined +# validate_all_nodes - Verify MSI, token, and kubeadm nodes joined # smoke_test