Skip to content
Draft
1 change: 1 addition & 0 deletions cli/cmd/bootstrap_gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ func AddBootstrapGcpCmd(parent *cobra.Command, opts *GlobalOptions) {
flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.SSHPublicKeyPath, "ssh-public-key-path", "~/.ssh/id_rsa.pub", "SSH Public Key Path (default: ~/.ssh/id_rsa.pub)")
flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.SSHPrivateKeyPath, "ssh-private-key-path", "~/.ssh/id_rsa", "SSH Private Key Path (default: ~/.ssh/id_rsa)")
flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Preemptible, "preemptible", false, "Use preemptible VMs for Codesphere infrastructure (default: false)")
flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Spot, "spot", false, "Use Spot VMs for Codesphere infrastructure. Falls back to standard VMs if spot capacity unavailable (default: false)")
flags.IntVar(&bootstrapGcpCmd.CodesphereEnv.DatacenterID, "datacenter-id", 1, "Datacenter ID (default: 1)")
flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.CustomPgIP, "custom-pg-ip", "", "Custom PostgreSQL IP (optional)")
flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.InstallConfigPath, "install-config", "config.yaml", "Path to install config file (optional)")
Expand Down
1 change: 1 addition & 0 deletions docs/oms_beta_bootstrap-gcp.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ oms beta bootstrap-gcp [flags]
--registry-user string Custom Registry username (only for GitHub registry type) (optional)
--secrets-dir string Directory for secrets (default: /etc/codesphere/secrets) (default "/etc/codesphere/secrets")
--secrets-file string Path to secrets files (optional) (default "prod.vault.yaml")
--spot Use Spot VMs for Codesphere infrastructure. Falls back to standard VMs if spot capacity unavailable (default: false)
--ssh-private-key-path string SSH Private Key Path (default: ~/.ssh/id_rsa) (default "~/.ssh/id_rsa")
--ssh-public-key-path string SSH Public Key Path (default: ~/.ssh/id_rsa.pub) (default "~/.ssh/id_rsa.pub")
--ssh-quiet Suppress SSH command output (default: true) (default true)
Expand Down
162 changes: 156 additions & 6 deletions internal/bootstrap/gcp/gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ type CodesphereEnvironment struct {
InstallHash string `json:"install_hash"`
InstallSkipSteps []string `json:"install_skip_steps"`
Preemptible bool `json:"preemptible"`
Spot bool `json:"spot"`
WriteConfig bool `json:"-"`
GatewayIP string `json:"gateway_ip"`
PublicGatewayIP string `json:"public_gateway_ip"`
Expand Down Expand Up @@ -306,9 +307,22 @@ func (b *GCPBootstrapper) ValidateInput() error {
return err
}

err = b.validateVMProvisioningOptions()
if err != nil {
return err
}

return b.validateGithubParams()
}

// validateVMProvisioningOptions ensures the mutually exclusive VM
// provisioning flags (--spot and --preemptible) were not both supplied.
func (b *GCPBootstrapper) validateVMProvisioningOptions() error {
	if !(b.Env.Spot && b.Env.Preemptible) {
		return nil
	}
	return fmt.Errorf("cannot specify both --spot and --preemptible flags; use --spot for the newer spot VM model")
}

// validateInstallVersion checks if the specified install version exists and contains the required installer artifact
func (b *GCPBootstrapper) validateInstallVersion() error {
if b.Env.InstallLocal != "" {
Expand Down Expand Up @@ -686,6 +700,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {
wg := sync.WaitGroup{}
errCh := make(chan error, len(vmDefs))
resultCh := make(chan vmResult, len(vmDefs))
logCh := make(chan string, len(vmDefs))
rootDiskSize := int64(200)
if b.Env.RegistryType == RegistryTypeGitHub {
rootDiskSize = 50
Expand All @@ -694,6 +709,47 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {
wg.Add(1)
go func(vm VMDef) {
defer wg.Done()

existingInstance, err := b.GCPClient.GetInstance(projectID, zone, vm.Name)
if err != nil {
if !isNotFoundError(err) {
errCh <- fmt.Errorf("failed to get instance %s: %w", vm.Name, err)
return
}
}
if existingInstance != nil {
instanceStatus := existingInstance.GetStatus()
if instanceStatus == "TERMINATED" || instanceStatus == "STOPPED" || instanceStatus == "SUSPENDED" {
// Start the stopped instance
err = b.GCPClient.StartInstance(projectID, zone, vm.Name)
if err != nil {
errCh <- fmt.Errorf("failed to start stopped instance %s: %w", vm.Name, err)
return
}
}

// Wait until the instance is RUNNING and IPs are populated.
readyInstance, err := b.waitForInstanceRunning(projectID, zone, vm.Name, vm.ExternalIP)
if err != nil {
errCh <- fmt.Errorf("instance %s did not become ready: %w", vm.Name, err)
return
}

externalIP := ""
internalIP := readyInstance.GetNetworkInterfaces()[0].GetNetworkIP()
if len(readyInstance.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 {
externalIP = readyInstance.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP()
}
Comment on lines +738 to +742
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When an instance already exists, this branch returns IPs from the current instance record without ensuring the VM is actually ready (e.g., RUNNING) and without validating that required IPs are present. If the instance is in an intermediate state or the network interface/NAT IP isn’t populated yet, this will record empty IPs and likely break subsequent SSH/provisioning steps. Consider polling GetInstance until status is RUNNING and the expected internal/external IPs are non-empty (or returning a clear error if they can’t be obtained).

Copilot uses AI. Check for mistakes.
resultCh <- vmResult{
vmType: vm.Tags[0],
name: vm.Name,
externalIP: externalIP,
internalIP: internalIP,
}
return
}

// Instance doesn't exist, create it
disks := []*computepb.AttachedDisk{
{
Boot: protoBool(true),
Expand Down Expand Up @@ -737,9 +793,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {
Tags: &computepb.Tags{
Items: vm.Tags,
},
Scheduling: &computepb.Scheduling{
Preemptible: &b.Env.Preemptible,
},
Scheduling: b.buildSchedulingConfig(),
NetworkInterfaces: []*computepb.NetworkInterface{
{
Network: protoString(network),
Expand Down Expand Up @@ -767,9 +821,9 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {
}
}

err = b.GCPClient.CreateInstance(projectID, zone, instance)
if err != nil && !isAlreadyExistsError(err) {
errCh <- fmt.Errorf("failed to create instance %s: %w", vm.Name, err)
err = b.createInstanceWithFallback(projectID, zone, instance, vm.Name, logCh)
if err != nil {
errCh <- err
return
}

Expand Down Expand Up @@ -802,6 +856,11 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {

close(errCh)
close(resultCh)
close(logCh)

for msg := range logCh {
b.stlog.Logf("%s", msg)
}

var errs []error
for err := range errCh {
Expand Down Expand Up @@ -843,6 +902,93 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error {
return nil
}

// buildSchedulingConfig returns the Compute Engine scheduling settings
// derived from the environment's spot/preemptible flags. Spot takes
// priority over preemptible (validation rejects both being set); with
// neither flag set, an empty Scheduling struct yields standard VMs.
func (b *GCPBootstrapper) buildSchedulingConfig() *computepb.Scheduling {
	switch {
	case b.Env.Spot:
		// Spot model: terminate on host maintenance, no automatic
		// restart, and stop (rather than delete) when preempted.
		return &computepb.Scheduling{
			ProvisioningModel:         protoString("SPOT"),
			OnHostMaintenance:         protoString("TERMINATE"),
			AutomaticRestart:          protoBool(false),
			InstanceTerminationAction: protoString("STOP"),
		}
	case b.Env.Preemptible:
		// Legacy preemptible VM model.
		return &computepb.Scheduling{Preemptible: protoBool(true)}
	default:
		return &computepb.Scheduling{}
	}
}

// createInstanceWithFallback creates the given instance, treating an
// already-existing instance as success. When spot VMs are requested and
// creation fails for lack of spot capacity, it retries once with default
// (standard on-demand) scheduling and reports the fallback on logCh.
func (b *GCPBootstrapper) createInstanceWithFallback(projectID, zone string, instance *computepb.Instance, vmName string, logCh chan<- string) error {
	createErr := b.GCPClient.CreateInstance(projectID, zone, instance)
	switch {
	case createErr == nil, isAlreadyExistsError(createErr):
		return nil
	case b.Env.Spot && isSpotCapacityError(createErr):
		logCh <- fmt.Sprintf("Spot capacity unavailable for %s, falling back to standard VM", vmName)
		// An empty scheduling config requests a standard VM.
		instance.Scheduling = &computepb.Scheduling{}
		if retryErr := b.GCPClient.CreateInstance(projectID, zone, instance); retryErr != nil && !isAlreadyExistsError(retryErr) {
			return fmt.Errorf("failed to create instance %s (fallback to standard VM): %w", vmName, retryErr)
		}
		return nil
	default:
		return fmt.Errorf("failed to create instance %s: %w", vmName, createErr)
	}
}

// waitForInstanceRunning polls GetInstance until the instance reports
// status RUNNING with an internal IP assigned (and, when needsExternalIP
// is true, a NAT IP on the first access config). It returns the ready
// instance, or an error on a failed poll or once the polling budget
// (60 attempts, 5s apart) is exhausted.
func (b *GCPBootstrapper) waitForInstanceRunning(projectID, zone, name string, needsExternalIP bool) (*computepb.Instance, error) {
	const (
		maxAttempts  = 60
		pollInterval = 5 * time.Second
	)

	for attempt := 0; attempt < maxAttempts; attempt++ {
		inst, err := b.GCPClient.GetInstance(projectID, zone, name)
		if err != nil {
			return nil, fmt.Errorf("failed to poll instance %s: %w", name, err)
		}

		if instanceReady(inst, needsExternalIP) {
			return inst, nil
		}

		// No point sleeping after the final attempt.
		if attempt != maxAttempts-1 {
			time.Sleep(pollInterval)
		}
	}

	return nil, fmt.Errorf("timed out waiting for instance %s to be RUNNING with IPs assigned after %s",
		name, time.Duration(maxAttempts)*pollInterval)
}

// instanceReady reports whether inst is RUNNING with the required IPs populated.
func instanceReady(inst *computepb.Instance, needsExternalIP bool) bool {
	if inst.GetStatus() != "RUNNING" {
		return false
	}
	nics := inst.GetNetworkInterfaces()
	if len(nics) == 0 || nics[0].GetNetworkIP() == "" {
		return false
	}
	if !needsExternalIP {
		return true
	}
	accessConfigs := nics[0].GetAccessConfigs()
	return len(accessConfigs) > 0 && accessConfigs[0].GetNatIP() != ""
}

// isSpotCapacityError reports whether err indicates that GCP could not
// satisfy a spot VM request due to capacity/stockout conditions.
func isSpotCapacityError(err error) bool {
	if err == nil {
		return false
	}
	if status.Code(err) == codes.ResourceExhausted {
		return true
	}
	msg := err.Error()
	// Known capacity-related markers surfaced in GCP error messages.
	for _, marker := range []string{
		"ZONE_RESOURCE_POOL_EXHAUSTED",
		"UNSUPPORTED_OPERATION",
		"stockout",
		"does not have enough resources",
	} {
		if strings.Contains(msg, marker) {
			return true
		}
	}
	return false
}

// EnsureGatewayIPAddresses reserves 2 static external IP addresses for the ingress
// controllers of the cluster.
func (b *GCPBootstrapper) EnsureGatewayIPAddresses() error {
Expand Down Expand Up @@ -1591,6 +1737,10 @@ func isAlreadyExistsError(err error) bool {
return status.Code(err) == codes.AlreadyExists || strings.Contains(err.Error(), "already exists")
}

// isNotFoundError reports whether err indicates a missing GCP resource,
// either via the gRPC NotFound code or a "not found" substring in the
// error message.
func isNotFoundError(err error) bool {
	// Guard against nil: status.Code(nil) is OK (not NotFound), so the
	// original short-circuit would fall through to err.Error() and panic.
	// Mirrors the nil guard in isSpotCapacityError.
	if err == nil {
		return false
	}
	return status.Code(err) == codes.NotFound || strings.Contains(err.Error(), "not found")
}

// readSSHKey reads an SSH key file, expanding ~ in the path
func (b *GCPBootstrapper) readSSHKey(path string) (string, error) {
realPath := util.ExpandPath(path)
Expand Down
21 changes: 21 additions & 0 deletions internal/bootstrap/gcp/gcp_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ type GCPClientManager interface {
CreateFirewallRule(projectID string, rule *computepb.Firewall) error
CreateInstance(projectID, zone string, instance *computepb.Instance) error
GetInstance(projectID, zone, instanceName string) (*computepb.Instance, error)
StartInstance(projectID, zone, instanceName string) error
CreateAddress(projectID, region string, address *computepb.Address) (string, error)
GetAddress(projectID, region, addressName string) (*computepb.Address, error)
EnsureDNSManagedZone(projectID, zoneName, dnsName, description string) error
Expand Down Expand Up @@ -552,6 +553,26 @@ func (c *GCPClient) GetInstance(projectID, zone, instanceName string) (*computep
})
}

// StartInstance starts a stopped Compute Engine instance in the specified
// project and zone, blocking until the start operation completes.
func (c *GCPClient) StartInstance(projectID, zone, instanceName string) error {
	client, err := compute.NewInstancesRESTClient(c.ctx)
	if err != nil {
		return err
	}
	defer util.IgnoreError(client.Close)

	req := &computepb.StartInstanceRequest{
		Project:  projectID,
		Zone:     zone,
		Instance: instanceName,
	}
	op, err := client.Start(c.ctx, req)
	if err != nil {
		return err
	}

	// Block until GCP reports the long-running operation as finished.
	return op.Wait(c.ctx)
}

// CreateAddress creates a new static IP address in the specified project and region.
func (c *GCPClient) CreateAddress(projectID, region string, address *computepb.Address) (string, error) {
client, err := compute.NewAddressesRESTClient(c.ctx)
Expand Down
Loading