Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
*.so
*.dylib
aks-flex-node
AKSFlexNode

# Test binary, built with `go test -c`
*.test
Expand Down Expand Up @@ -42,6 +43,7 @@ Thumbs.db

# Config files with sensitive data (keep sample config)
config.json
Standard_D8pds_v6_sku.json
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rm?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes


# Environment files with secrets
.env
Expand Down
34 changes: 32 additions & 2 deletions commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ var (
BuildTime = "unknown"
)

// Unbootstrap command flags
var cleanupMode string

// NewAgentCommand creates a new agent command
func NewAgentCommand() *cobra.Command {
cmd := &cobra.Command{
Expand All @@ -44,12 +47,19 @@ func NewUnbootstrapCommand() *cobra.Command {
cmd := &cobra.Command{
Use: "unbootstrap",
Short: "Remove AKS node configuration and Arc connection",
Long: "Clean up and remove all AKS node components and Arc registration from this machine",
Long: `Clean up and remove all AKS node components and Arc registration from this machine.

For private clusters (config has private: true), this also handles VPN cleanup:
--cleanup-mode=local Remove node and local VPN config, keep Gateway (default)
--cleanup-mode=full Remove everything including Gateway VM and Azure resources`,
RunE: func(cmd *cobra.Command, args []string) error {
return runUnbootstrap(cmd.Context())
},
}

cmd.Flags().StringVar(&cleanupMode, "cleanup-mode", "local",
"[private cluster only] Cleanup mode: 'local' (keep Gateway) or 'full' (remove all Azure resources)")

return cmd
}

Expand Down Expand Up @@ -87,6 +97,13 @@ func runAgent(ctx context.Context) error {
return err
}

// Print visible success message
fmt.Println()
fmt.Println("========================================")
fmt.Println(" Join process finished successfully!")
fmt.Println("========================================")
fmt.Println()

// After successful bootstrap, transition to daemon mode
logger.Info("Bootstrap completed successfully, transitioning to daemon mode...")
return runDaemonLoop(ctx, cfg)
Expand All @@ -101,14 +118,27 @@ func runUnbootstrap(ctx context.Context) error {
return fmt.Errorf("failed to load config from %s: %w", configPath, err)
}

// Pass cleanup mode to config so the PrivateClusterUninstall step can read it
if cfg.Azure.TargetCluster != nil {
cfg.Azure.TargetCluster.CleanupMode = cleanupMode
}

bootstrapExecutor := bootstrapper.New(cfg, logger)
result, err := bootstrapExecutor.Unbootstrap(ctx)
if err != nil {
return err
}

// Handle and log the result (unbootstrap is more lenient with failures)
return handleExecutionResult(result, "unbootstrap", logger)
if err := handleExecutionResult(result, "unbootstrap", logger); err != nil {
return err
}

// Print final success message
fmt.Println()
fmt.Println("\033[0;32mSUCCESS:\033[0m Unbootstrap completed successfully!")

return nil
}

// runVersion displays version information
Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@ require (
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3 v3.0.0-beta.2
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5 v5.0.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0
github.com/Azure/go-autorest/autorest/to v0.4.1
github.com/google/renameio/v2 v2.0.2
github.com/google/uuid v1.6.0
Expand Down
12 changes: 10 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,22 @@ github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDo
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3 v3.0.0-beta.2 h1:qiir/pptnHqp6hV8QwV+IExYIf6cPsXBfUDUXQ27t2Y=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v3 v3.0.0-beta.2/go.mod h1:jVRrRDLCOuif95HDYC23ADTMlvahB7tMdl519m9Iyjc=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0 h1:z7Mqz6l0EFH549GvHEqfjKvi+cRScxLWbaoeLm9wxVQ=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v6 v6.4.0/go.mod h1:v6gbfH+7DG7xH2kUNs+ZJ9tF6O3iNnR85wMtmr+F54o=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5 v5.0.0 h1:5n7dPVqsWfVKw+ZiEKSd3Kzu7gwBkbEBkeXb8rgaE9Q=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5 v5.0.0/go.mod h1:HcZY0PHPo/7d75p99lB6lK0qYOP4vLRJUBpiehYXtLQ=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0 h1:7UuAn4ljE+H3GQ7qts3c7oAaMRvge68EgyckoNP/1Ro=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/hybridcompute/armhybridcompute v1.2.0/go.mod h1:F2eDq/BGK2LOEoDtoHbBOphaPqcjT0K/Y5Am8vf7+0w=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0 h1:PTFGRSlMKCQelWwxUyYVEUqseBJVemLyqWJjvMyt0do=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0/go.mod h1:LRr2FzBTQlONPPa5HREE5+RjSCTXl7BwOvYOaWTqCaI=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1 h1:7CBQ+Ei8SP2c6ydQTGCCrS35bDxgTMfoP2miAwK++OU=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1/go.mod h1:c/wcGeGx5FUPbM/JltUYHZcKmigwyVLJlDq+4HdtXaw=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v3 v3.1.0 h1:2qsIIvxVT+uE6yrNldntJKlLRgxGbZ85kgtz5SNBhMw=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v3 v3.1.0/go.mod h1:AW8VEadnhw9xox+VaVd9sP7NjzOAnaZBLRH6Tq3cJ38=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0 h1:HYGD75g0bQ3VO/Omedm54v4LrD3B1cGImuRF3AJ5wLo=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6 v6.2.0/go.mod h1:ulHyBFJOI0ONiRL4vcJTmS7rx18jQQlEPmAgo80cRdM=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0 h1:Dd+RhdJn0OTtVGaeDLZpcumkIVCtA/3/Fo42+eoYvVM=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0/go.mod h1:5kakwfW5CjC9KK+Q4wjXAg+ShuIm2mBMua0ZFj2C8PE=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0 h1:wxQx2Bt4xzPIKvW59WQf1tJNx/ZZKPfN+EhPX3Z6CYY=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions v1.3.0/go.mod h1:TpiwjwnW/khS0LKs4vW5UmmT9OWcxaveS8U7+tlknzo=
github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs=
github.com/Azure/go-autorest v14.2.0+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24=
github.com/Azure/go-autorest/autorest/to v0.4.1 h1:CxNHBqdzTr7rLtdrtb5CMjJcDut+WNGCVv7OmS5+lTc=
Expand Down
3 changes: 3 additions & 0 deletions pkg/bootstrapper/bootstrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"go.goms.io/aks/AKSFlexNode/pkg/components/services"
"go.goms.io/aks/AKSFlexNode/pkg/components/system_configuration"
"go.goms.io/aks/AKSFlexNode/pkg/config"
"go.goms.io/aks/AKSFlexNode/pkg/privatecluster"
)

// Bootstrapper executes bootstrap steps sequentially
Expand All @@ -33,6 +34,7 @@ func New(cfg *config.Config, logger *logrus.Logger) *Bootstrapper {
func (b *Bootstrapper) Bootstrap(ctx context.Context) (*ExecutionResult, error) {
// Define the bootstrap steps in order - using modules directly
steps := []Executor{
privatecluster.NewInstaller(b.logger), // VPN/Gateway setup (if private cluster)
arc.NewInstaller(b.logger), // Setup Arc
services.NewUnInstaller(b.logger), // Stop kubelet before setup
system_configuration.NewInstaller(b.logger), // Configure system (early)
Expand All @@ -51,6 +53,7 @@ func (b *Bootstrapper) Bootstrap(ctx context.Context) (*ExecutionResult, error)
// Unbootstrap executes all cleanup steps sequentially (in reverse order of bootstrap)
func (b *Bootstrapper) Unbootstrap(ctx context.Context) (*ExecutionResult, error) {
steps := []Executor{
privatecluster.NewUninstaller(b.logger), // Node removal + VPN teardown (if private cluster)
services.NewUnInstaller(b.logger), // Stop services first
npd.NewUnInstaller(b.logger), // Uninstall Node Problem Detector
kubelet.NewUnInstaller(b.logger), // Clean kubelet configuration
Expand Down
8 changes: 6 additions & 2 deletions pkg/config/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,12 @@ type BootstrapTokenConfig struct {

// TargetClusterConfig holds configuration for the target AKS cluster the ARC machine will connect to.
type TargetClusterConfig struct {
ResourceID string `json:"resourceId"` // Full resource ID of the target AKS cluster
Location string `json:"location"` // Azure region of the cluster (e.g., "eastus", "westus2")
ResourceID string `json:"resourceId"` // Full resource ID of the target AKS cluster
Location string `json:"location"` // Azure region of the cluster (e.g., "eastus", "westus2")
IsPrivateCluster bool `json:"private" mapstructure:"private"` // Whether this is a private AKS cluster (requires Gateway/VPN setup)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reconsider the name "private cluster", since this also applies to other use cases: a VM/BM within a VPC on a third-party cloud, or a physical machine behind an office firewall.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Agreed. VPN connections are not limited to private clusters. Currently, "private: true" is used as the trigger condition; this can be changed to "gateway: true" or other conditions in the future to support more network scenarios.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we update it in a follow-up PR, since the current PR is already large?

GatewayVMSize string `json:"gatewayVMSize,omitempty" mapstructure:"gatewayVMSize"` // VPN Gateway VM size (defaults to "Standard_D2s_v3")
GatewayPort int `json:"gatewayPort,omitempty" mapstructure:"gatewayPort"` // VPN Gateway port (defaults to 51820)
CleanupMode string `json:"-"` // Runtime-only, set by CLI flag for unbootstrap
Name string // will be populated from ResourceID
ResourceGroup string // will be populated from ResourceID
SubscriptionID string // will be populated from ResourceID
Expand Down
99 changes: 99 additions & 0 deletions pkg/privatecluster/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Private AKS Cluster - Edge Node Join/Leave
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consolidate with create_private_cluster.md into one usage doc?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, will do it.


## Prerequisites

### 1. Login to Azure CLI

```bash
az login
```

> **Note:** When running the agent with `sudo`, use `sudo -E` to preserve your environment so the agent can reuse your Azure CLI login.

### 2. Create a Private AKS Cluster

Create a Private AKS cluster with AAD and Azure RBAC enabled, and assign the required roles to your user.

See: [create_private_cluster.md](create_private_cluster.md)

### 3. Prepare Configuration File

Create a `config.json` with `"private": true` in the `targetCluster` section:

```json
{
"azure": {
"subscriptionId": "<SUBSCRIPTION_ID>",
"tenantId": "<TENANT_ID>",
"targetCluster": {
"resourceId": "/subscriptions/<SUB_ID>/resourceGroups/<RG>/providers/Microsoft.ContainerService/managedClusters/<CLUSTER_NAME>",
"location": "eastus2",
"private": true
},
"arc": {
"enabled": true,
"resourceGroup": "<RG>",
"location": "eastus2"
}
},
"kubernetes": {
"version": "1.33.0"
},
"containerd": {
"version": "1.7.11",
"pauseImage": "mcr.microsoft.com/oss/kubernetes/pause:3.6"
},
"agent": {
"logLevel": "info",
"logDir": "/var/log/aks-flex-node"
}
}
```

## Join Private AKS Cluster

### 1. Build the project

```bash
go build -o aks-flex-node .
```

### 2. Join the cluster

When the config has `"private": true`, the `agent` command automatically sets up the Gateway/VPN before bootstrapping:

```bash
sudo -E ./aks-flex-node agent --config config.json
```

This will:
1. Detect private cluster from config
2. Set up Gateway VM and VPN tunnel (WireGuard)
3. Run normal bootstrap (Arc, containerd, kubelet, etc.)
4. Enter daemon mode for status monitoring

### 3. Verify

```bash
kubectl get nodes
```

## Leave Private AKS Cluster

When the config has `"private": true`, the `unbootstrap` command automatically handles VPN/Gateway cleanup:

```bash
sudo -E ./aks-flex-node unbootstrap --config config.json [--cleanup-mode <local|full>]
```

### Mode Comparison

| Mode | Command | Description |
|------|---------|-------------|
| `local` (default) | `sudo -E ./aks-flex-node unbootstrap --config config.json` | Remove node and local VPN config, **keep Gateway** for other nodes |
| `full` | `sudo -E ./aks-flex-node unbootstrap --config config.json --cleanup-mode full` | Remove all components **including Gateway VM and Azure resources** |

### When to use each mode

- **`--cleanup-mode=local`** (default): Other nodes are still using the Gateway, or you plan to rejoin later
- **`--cleanup-mode=full`**: This is the last node leaving and you want to clean up all Azure resources (Gateway VM, subnet, NSG, public IP)
Loading
Loading