From 918169663cdf73c71761b6ca14040c7e68716609 Mon Sep 17 00:00:00 2001 From: OliverTrautvetter <66372584+OliverTrautvetter@users.noreply.github.com> Date: Wed, 4 Mar 2026 09:33:25 +0100 Subject: [PATCH 1/7] feat: add support for Spot VMs and validation for provisioning options --- cli/cmd/bootstrap_gcp.go | 1 + internal/bootstrap/gcp/gcp.go | 119 +++++++++++++- internal/bootstrap/gcp/gcp_client.go | 21 +++ internal/bootstrap/gcp/gcp_test.go | 232 +++++++++++++++++++++++++-- internal/bootstrap/gcp/mocks.go | 63 ++++++++ 5 files changed, 415 insertions(+), 21 deletions(-) diff --git a/cli/cmd/bootstrap_gcp.go b/cli/cmd/bootstrap_gcp.go index b96a38a6..9855f21a 100644 --- a/cli/cmd/bootstrap_gcp.go +++ b/cli/cmd/bootstrap_gcp.go @@ -69,6 +69,7 @@ func AddBootstrapGcpCmd(parent *cobra.Command, opts *GlobalOptions) { flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.SSHPublicKeyPath, "ssh-public-key-path", "~/.ssh/id_rsa.pub", "SSH Public Key Path (default: ~/.ssh/id_rsa.pub)") flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.SSHPrivateKeyPath, "ssh-private-key-path", "~/.ssh/id_rsa", "SSH Private Key Path (default: ~/.ssh/id_rsa)") flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Preemptible, "preemptible", false, "Use preemptible VMs for Codesphere infrastructure (default: false)") + flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Spot, "spot", false, "Use Spot VMs for Codesphere infrastructure. Falls back to standard VMs if spot capacity unavailable (default: false)") flags.IntVar(&bootstrapGcpCmd.CodesphereEnv.DatacenterID, "datacenter-id", 1, "Datacenter ID (default: 1)") flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.CustomPgIP, "custom-pg-ip", "", "Custom PostgreSQL IP (optional)") flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.InstallConfigPath, "install-config", "config.yaml", "Path to install config file (optional)") diff --git a/internal/bootstrap/gcp/gcp.go b/internal/bootstrap/gcp/gcp.go index 71373096..b227d7d8 100644 --- a/internal/bootstrap/gcp/gcp.go +++ b/internal/bootstrap/gcp/gcp.go @@ -93,6 +93,7 @@ type CodesphereEnvironment struct { InstallHash string `json:"install_hash"` InstallSkipSteps []string `json:"install_skip_steps"` Preemptible bool `json:"preemptible"` + Spot bool `json:"spot"` WriteConfig bool `json:"-"` GatewayIP string `json:"gateway_ip"` PublicGatewayIP string `json:"public_gateway_ip"` @@ -306,9 +307,22 @@ func (b *GCPBootstrapper) ValidateInput() error { return err } + err = b.validateVMProvisioningOptions() + if err != nil { + return err + } + return b.validateGithubParams() } +// validateVMProvisioningOptions checks that spot and preemptible options are not both set +func (b *GCPBootstrapper) validateVMProvisioningOptions() error { + if b.Env.Spot && b.Env.Preemptible { + return fmt.Errorf("cannot specify both --spot and --preemptible flags; use --spot for the newer spot VM model") + } + return nil +} + // validateInstallVersion checks if the specified install version exists and contains the required installer artifact func (b *GCPBootstrapper) validateInstallVersion() error { if b.Env.InstallLocal != "" { @@ -694,6 +708,43 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { wg.Add(1) go func(vm VMDef) { defer wg.Done() + + existingInstance, err := b.GCPClient.GetInstance(projectID, zone, vm.Name) + if err == nil && existingInstance != nil { + instanceStatus := existingInstance.GetStatus() + if instanceStatus == "TERMINATED" || instanceStatus == "STOPPED" || instanceStatus == "SUSPENDED" { + // Start the stopped instance + err = b.GCPClient.StartInstance(projectID, zone, vm.Name) + if err != nil { + errCh <- fmt.Errorf("failed to start stopped instance %s: %w", vm.Name, err) + return + } + // Re-fetch instance to get updated IPs after start + existingInstance, err = b.GCPClient.GetInstance(projectID, zone, vm.Name) + if err != nil { + errCh <- fmt.Errorf("failed to get instance %s after start: %w", vm.Name, err) + return + } + } + + externalIP := "" + internalIP := "" + if len(existingInstance.GetNetworkInterfaces()) > 0 { + internalIP = existingInstance.GetNetworkInterfaces()[0].GetNetworkIP() + if len(existingInstance.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 { + externalIP = existingInstance.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() + } + } + resultCh <- vmResult{ + vmType: vm.Tags[0], + name: vm.Name, + externalIP: externalIP, + internalIP: internalIP, + } + return + } + + // Instance doesn't exist, create it disks := []*computepb.AttachedDisk{ { Boot: protoBool(true), @@ -737,9 +788,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { Tags: &computepb.Tags{ Items: vm.Tags, }, - Scheduling: &computepb.Scheduling{ - Preemptible: &b.Env.Preemptible, - }, + Scheduling: b.buildSchedulingConfig(), NetworkInterfaces: []*computepb.NetworkInterface{ { Network: protoString(network), @@ -767,9 +816,9 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { } } - err = b.GCPClient.CreateInstance(projectID, zone, instance) - if err != nil && !isAlreadyExistsError(err) { - errCh <- fmt.Errorf("failed to create instance %s: %w", vm.Name, err) + err = b.createInstanceWithFallback(projectID, zone, instance, vm.Name) + if err != nil { + errCh <- err return } @@ -843,6 +892,64 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { return nil } +// buildSchedulingConfig creates the scheduling configuration based on spot/preemptible settings +func (b *GCPBootstrapper) buildSchedulingConfig() *computepb.Scheduling { + if b.Env.Spot { + return &computepb.Scheduling{ + ProvisioningModel: protoString("SPOT"), + OnHostMaintenance: protoString("TERMINATE"), + AutomaticRestart: protoBool(false), + InstanceTerminationAction: protoString("STOP"), + } + } + if b.Env.Preemptible { + return &computepb.Scheduling{ + Preemptible: protoBool(true), + } + } + + return &computepb.Scheduling{} +} + +// createInstanceWithFallback attempts to create an instance with the configured settings. +// If spot VMs are enabled and creation fails due to capacity issues, it falls back to standard VMs. +func (b *GCPBootstrapper) createInstanceWithFallback(projectID, zone string, instance *computepb.Instance, vmName string) error { + err := b.GCPClient.CreateInstance(projectID, zone, instance) + if err == nil { + return nil + } + + if isAlreadyExistsError(err) { + return nil + } + + if b.Env.Spot && isSpotCapacityError(err) { + b.stlog.Logf("Spot capacity unavailable for %s, falling back to standard VM", vmName) + instance.Scheduling = &computepb.Scheduling{} + err = b.GCPClient.CreateInstance(projectID, zone, instance) + if err != nil && !isAlreadyExistsError(err) { + return fmt.Errorf("failed to create instance %s (fallback to standard VM): %w", vmName, err) + } + return nil + } + + return fmt.Errorf("failed to create instance %s: %w", vmName, err) +} + +// isSpotCapacityError checks if the error is related to spot VM capacity issues +func isSpotCapacityError(err error) bool { + if err == nil { + return false + } + errStr := err.Error() + return strings.Contains(errStr, "ZONE_RESOURCE_POOL_EXHAUSTED") || + strings.Contains(errStr, "UNSUPPORTED_OPERATION") || + strings.Contains(errStr, "stockout") || + strings.Contains(errStr, "does not have enough resources") || + strings.Contains(errStr, "quota") || + status.Code(err) == codes.ResourceExhausted +} + // EnsureGatewayIPAddresses reserves 2 static external IP addresses for the ingress // controllers of the cluster. func (b *GCPBootstrapper) EnsureGatewayIPAddresses() error { diff --git a/internal/bootstrap/gcp/gcp_client.go b/internal/bootstrap/gcp/gcp_client.go index b487343a..4fc8de6f 100644 --- a/internal/bootstrap/gcp/gcp_client.go +++ b/internal/bootstrap/gcp/gcp_client.go @@ -48,6 +48,7 @@ type GCPClientManager interface { CreateFirewallRule(projectID string, rule *computepb.Firewall) error CreateInstance(projectID, zone string, instance *computepb.Instance) error GetInstance(projectID, zone, instanceName string) (*computepb.Instance, error) + StartInstance(projectID, zone, instanceName string) error CreateAddress(projectID, region string, address *computepb.Address) (string, error) GetAddress(projectID, region, addressName string) (*computepb.Address, error) EnsureDNSManagedZone(projectID, zoneName, dnsName, description string) error @@ -552,6 +553,26 @@ func (c *GCPClient) GetInstance(projectID, zone, instanceName string) (*computep }) } +// StartInstance starts a stopped Compute Engine instance in the specified project and zone. +func (c *GCPClient) StartInstance(projectID, zone, instanceName string) error { + client, err := compute.NewInstancesRESTClient(c.ctx) + if err != nil { + return err + } + defer util.IgnoreError(client.Close) + + op, err := client.Start(c.ctx, &computepb.StartInstanceRequest{ + Project: projectID, + Zone: zone, + Instance: instanceName, + }) + if err != nil { + return err + } + + return op.Wait(c.ctx) +} + // CreateAddress creates a new static IP address in the specified project and region. func (c *GCPClient) CreateAddress(projectID, region string, address *computepb.Address) (string, error) { client, err := compute.NewAddressesRESTClient(c.ctx) diff --git a/internal/bootstrap/gcp/gcp_test.go b/internal/bootstrap/gcp/gcp_test.go index 4fe6ccea..c47e2bf1 100644 --- a/internal/bootstrap/gcp/gcp_test.go +++ b/internal/bootstrap/gcp/gcp_test.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "strings" + "sync" "os" @@ -169,9 +170,11 @@ var _ = Describe("GCP Bootstrapper", func() { gc.EXPECT().CreateFirewallRule(projectId, mock.Anything).Return(nil).Times(5) // 11. EnsureComputeInstances - gc.EXPECT().CreateInstance(projectId, "us-central1-a", mock.Anything).Return(nil).Times(9) - // GetInstance calls to retrieve IPs + // Track GetInstance calls per VM name + instanceCalls := make(map[string]int) + var instanceMu sync.Mutex ipResp := &computepb.Instance{ + Status: protoString("RUNNING"), NetworkInterfaces: []*computepb.NetworkInterface{ { NetworkIP: protoString("10.0.0.1"), @@ -181,9 +184,19 @@ var _ = Describe("GCP Bootstrapper", func() { }, }, } - - gc.EXPECT().GetInstance(projectId, "us-central1-a", mock.Anything).Return(ipResp, nil).Times(9) + gc.EXPECT().GetInstance(projectId, "us-central1-a", mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + instanceMu.Lock() + defer instanceMu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + // First call, instance doesn't exist + return nil, fmt.Errorf("not found") + } + // Second call, return instance with IPs + return ipResp, nil + }).Times(18) fw.EXPECT().ReadFile(mock.Anything).Return([]byte("fake-key"), nil).Times(9) + gc.EXPECT().CreateInstance(projectId, "us-central1-a", mock.Anything).Return(nil).Times(9) // 12. EnsureGatewayIPAddresses gc.EXPECT().GetAddress(projectId, "us-central1", "gateway").Return(nil, fmt.Errorf("not found")) @@ -384,6 +397,17 @@ var _ = Describe("GCP Bootstrapper", func() { }) + Context("When both --spot and --preemptible are specified", func() { + BeforeEach(func() { + csEnv.Spot = true + csEnv.Preemptible = true + }) + It("fails with a clear error message", func() { + err := bs.ValidateInput() + Expect(err).To(MatchError(MatchRegexp("cannot specify both --spot and --preemptible"))) + }) + }) + }) Describe("EnsureInstallConfig", func() { @@ -945,14 +969,10 @@ var _ = Describe("GCP Bootstrapper", func() { }) Describe("Valid EnsureComputeInstances", func() { It("creates all instances", func() { - // Mock ReadFile for SSH key (called 9 times in parallel) - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Times(9) - - // Mock CreateInstance (9 times) - gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Times(9) - - // Mock GetInstance (9 times) + instanceCalls := make(map[string]int) + var mu sync.Mutex ipResp := &computepb.Instance{ + Status: protoString("RUNNING"), NetworkInterfaces: []*computepb.NetworkInterface{ { NetworkIP: protoString("10.0.0.x"), @@ -962,7 +982,20 @@ var _ = Describe("GCP Bootstrapper", func() { }, }, } - gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(ipResp, nil).Times(9) + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + // First call, instance doesn't exist + return nil, fmt.Errorf("not found") + } + // Second call, return instance with IPs + return ipResp, nil + }).Times(18) + + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Times(9) + gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Times(9) err := bs.EnsureComputeInstances() Expect(err).NotTo(HaveOccurred()) @@ -975,7 +1008,8 @@ var _ = Describe("GCP Bootstrapper", func() { Describe("Invalid cases", func() { It("fails when SSH key read fails", func() { - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return(nil, fmt.Errorf("read error")).Maybe() + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("not found")).Maybe() + fw.EXPECT().ReadFile(mock.Anything).Return(nil, fmt.Errorf("read error")).Maybe() err := bs.EnsureComputeInstances() Expect(err).To(HaveOccurred()) @@ -983,7 +1017,8 @@ var _ = Describe("GCP Bootstrapper", func() { }) It("fails when CreateInstance fails", func() { - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Maybe() + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("not found")).Maybe() + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Maybe() gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(fmt.Errorf("create error")).Maybe() err := bs.EnsureComputeInstances() @@ -992,7 +1027,8 @@ var _ = Describe("GCP Bootstrapper", func() { }) It("fails when GetInstance fails", func() { - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Maybe() + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("not found")).Maybe() + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Maybe() gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Maybe() gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("get error")).Maybe() @@ -1001,6 +1037,172 @@ var _ = Describe("GCP Bootstrapper", func() { Expect(err.Error()).To(ContainSubstring("error ensuring compute instances")) }) }) + + Describe("Spot VM functionality", func() { + It("creates spot VMs when spot flag is enabled", func() { + csEnv.Spot = true + + // Track GetInstance calls per VM name + instanceCalls := make(map[string]int) + var mu sync.Mutex + ipResp := &computepb.Instance{ + Status: protoString("RUNNING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + return nil, fmt.Errorf("not found") + } + return ipResp, nil + }).Times(18) + + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Times(9) + + // Verify CreateInstance is called with SPOT provisioning model + gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.MatchedBy(func(instance *computepb.Instance) bool { + return instance.Scheduling != nil && + instance.Scheduling.ProvisioningModel != nil && + *instance.Scheduling.ProvisioningModel == "SPOT" + })).Return(nil).Times(9) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("falls back to standard VM when spot capacity is exhausted", func() { + csEnv.Spot = true + + // Track GetInstance calls per VM name + instanceCalls := make(map[string]int) + var mu sync.Mutex + ipResp := &computepb.Instance{ + Status: protoString("RUNNING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + return nil, fmt.Errorf("not found") + } + return ipResp, nil + }).Times(18) + + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Times(9) + + createCalls := make(map[string]int) + gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone string, instance *computepb.Instance) error { + mu.Lock() + defer mu.Unlock() + name := *instance.Name + createCalls[name]++ + if createCalls[name] == 1 { + return fmt.Errorf("ZONE_RESOURCE_POOL_EXHAUSTED") + } + return nil + }).Times(18) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("restarts stopped VMs instead of creating new ones", func() { + instanceCalls := make(map[string]int) + var mu sync.Mutex + stoppedResp := &computepb.Instance{ + Status: protoString("TERMINATED"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + runningResp := &computepb.Instance{ + Status: protoString("RUNNING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + // First call, VM exists but is stopped + return stoppedResp, nil + } + // After StartInstance, VM is running + return runningResp, nil + }).Times(18) + + gc.EXPECT().StartInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Times(9) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("uses existing running VMs without starting them", func() { + runningResp := &computepb.Instance{ + Status: protoString("RUNNING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(runningResp, nil).Times(9) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("handles VMs in intermediate states (STAGING/PROVISIONING)", func() { + provisioningResp := &computepb.Instance{ + Status: protoString("STAGING"), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString("10.0.0.x"), + AccessConfigs: []*computepb.AccessConfig{ + {NatIP: protoString("1.2.3.x")}, + }, + }, + }, + } + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(provisioningResp, nil).Times(9) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + }) }) Describe("EnsureGatewayIPAddresses", func() { diff --git a/internal/bootstrap/gcp/mocks.go b/internal/bootstrap/gcp/mocks.go index 495ad6e3..a24742b3 100644 --- a/internal/bootstrap/gcp/mocks.go +++ b/internal/bootstrap/gcp/mocks.go @@ -1320,3 +1320,66 @@ func (_c *MockGCPClientManager_GetProjectByName_Call) RunAndReturn(run func(fold _c.Call.Return(run) return _c } + +// StartInstance provides a mock function for the type MockGCPClientManager +func (_mock *MockGCPClientManager) StartInstance(projectID string, zone string, instanceName string) error { + ret := _mock.Called(projectID, zone, instanceName) + + if len(ret) == 0 { + panic("no return value specified for StartInstance") + } + + var r0 error + if returnFunc, ok := ret.Get(0).(func(string, string, string) error); ok { + r0 = returnFunc(projectID, zone, instanceName) + } else { + r0 = ret.Error(0) + } + return r0 +} + +// MockGCPClientManager_StartInstance_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'StartInstance' +type MockGCPClientManager_StartInstance_Call struct { + *mock.Call +} + +// StartInstance is a helper method to define mock.On call +// - projectID string +// - zone string +// - instanceName string +func (_e *MockGCPClientManager_Expecter) StartInstance(projectID interface{}, zone interface{}, instanceName interface{}) *MockGCPClientManager_StartInstance_Call { + return &MockGCPClientManager_StartInstance_Call{Call: _e.mock.On("StartInstance", projectID, zone, instanceName)} +} + +func (_c *MockGCPClientManager_StartInstance_Call) Run(run func(projectID string, zone string, instanceName string)) *MockGCPClientManager_StartInstance_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + var arg1 string + if args[1] != nil { + arg1 = args[1].(string) + } + var arg2 string + if args[2] != nil { + arg2 = args[2].(string) + } + run( + arg0, + arg1, + arg2, + ) + }) + return _c +} + +func (_c *MockGCPClientManager_StartInstance_Call) Return(err error) *MockGCPClientManager_StartInstance_Call { + _c.Call.Return(err) + return _c +} + +func (_c *MockGCPClientManager_StartInstance_Call) RunAndReturn(run func(projectID string, zone string, instanceName string) error) *MockGCPClientManager_StartInstance_Call { + _c.Call.Return(run) + return _c +} From 1ff3b6f3e8952ef03b7d7fa0bfda47b733025d53 Mon Sep 17 00:00:00 2001 From: OliverTrautvetter <66372584+OliverTrautvetter@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:47:49 +0000 Subject: [PATCH 2/7] chore(docs): Auto-update docs and licenses Signed-off-by: OliverTrautvetter <66372584+OliverTrautvetter@users.noreply.github.com> --- docs/oms_beta_bootstrap-gcp.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/oms_beta_bootstrap-gcp.md b/docs/oms_beta_bootstrap-gcp.md index 33c0fd95..1582a4e3 100644 --- a/docs/oms_beta_bootstrap-gcp.md +++ b/docs/oms_beta_bootstrap-gcp.md @@ -47,6 +47,7 @@ oms beta bootstrap-gcp [flags] --registry-user string Custom Registry username (only for GitHub registry type) (optional) --secrets-dir string Directory for secrets (default: /etc/codesphere/secrets) (default "/etc/codesphere/secrets") --secrets-file string Path to secrets files (optional) (default "prod.vault.yaml") + --spot Use Spot VMs for Codesphere infrastructure. Falls back to standard VMs if spot capacity unavailable (default: false) --ssh-private-key-path string SSH Private Key Path (default: ~/.ssh/id_rsa) (default "~/.ssh/id_rsa") --ssh-public-key-path string SSH Public Key Path (default: ~/.ssh/id_rsa.pub) (default "~/.ssh/id_rsa.pub") --ssh-quiet Suppress SSH command output (default: true) (default true) From f5cdbb3c8d6d45eab87909b40b28baee90b24707 Mon Sep 17 00:00:00 2001 From: OliverTrautvetter <66372584+OliverTrautvetter@users.noreply.github.com> Date: Thu, 5 Mar 2026 12:22:04 +0100 Subject: [PATCH 3/7] Update internal/bootstrap/gcp/gcp.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- internal/bootstrap/gcp/gcp.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/internal/bootstrap/gcp/gcp.go b/internal/bootstrap/gcp/gcp.go index fee5d408..042a3c0a 100644 --- a/internal/bootstrap/gcp/gcp.go +++ b/internal/bootstrap/gcp/gcp.go @@ -710,7 +710,14 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { defer wg.Done() existingInstance, err := b.GCPClient.GetInstance(projectID, zone, vm.Name) - if err == nil && existingInstance != nil { + if err != nil { + st, ok := status.FromError(err) + if !ok || st.Code() != codes.NotFound { + errCh <- fmt.Errorf("failed to get instance %s: %w", vm.Name, err) + return + } + } + if existingInstance != nil { instanceStatus := existingInstance.GetStatus() if instanceStatus == "TERMINATED" || instanceStatus == "STOPPED" || instanceStatus == "SUSPENDED" { // Start the stopped instance From 7b78e189d09ed8ae4873034b6a87aa859a45a1ac Mon Sep 17 00:00:00 2001 From: OliverTrautvetter <66372584+OliverTrautvetter@users.noreply.github.com> Date: Thu, 5 Mar 2026 12:27:30 +0100 Subject: [PATCH 4/7] feat: enhance instance creation logging for spot VM fallback --- internal/bootstrap/gcp/gcp.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/internal/bootstrap/gcp/gcp.go b/internal/bootstrap/gcp/gcp.go index fee5d408..c13f59fe 100644 --- a/internal/bootstrap/gcp/gcp.go +++ b/internal/bootstrap/gcp/gcp.go @@ -700,6 +700,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { wg := sync.WaitGroup{} errCh := make(chan error, len(vmDefs)) resultCh := make(chan vmResult, len(vmDefs)) + logCh := make(chan string, len(vmDefs)) rootDiskSize := int64(200) if b.Env.RegistryType == RegistryTypeGitHub { rootDiskSize = 50 @@ -816,7 +817,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { } } - err = b.createInstanceWithFallback(projectID, zone, instance, vm.Name) + err = b.createInstanceWithFallback(projectID, zone, instance, vm.Name, logCh) if err != nil { errCh <- err return @@ -851,6 +852,11 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { close(errCh) close(resultCh) + close(logCh) + + for msg := range logCh { + b.stlog.Logf("%s", msg) + } var errs []error for err := range errCh { @@ -913,7 +919,7 @@ func (b *GCPBootstrapper) buildSchedulingConfig() *computepb.Scheduling { // createInstanceWithFallback attempts to create an instance with the configured settings. // If spot VMs are enabled and creation fails due to capacity issues, it falls back to standard VMs. -func (b *GCPBootstrapper) createInstanceWithFallback(projectID, zone string, instance *computepb.Instance, vmName string) error { +func (b *GCPBootstrapper) createInstanceWithFallback(projectID, zone string, instance *computepb.Instance, vmName string, logCh chan<- string) error { err := b.GCPClient.CreateInstance(projectID, zone, instance) if err == nil { return nil @@ -924,7 +930,7 @@ func (b *GCPBootstrapper) createInstanceWithFallback(projectID, zone string, ins } if b.Env.Spot && isSpotCapacityError(err) { - b.stlog.Logf("Spot capacity unavailable for %s, falling back to standard VM", vmName) + logCh <- fmt.Sprintf("Spot capacity unavailable for %s, falling back to standard VM", vmName) instance.Scheduling = &computepb.Scheduling{} err = b.GCPClient.CreateInstance(projectID, zone, instance) if err != nil && !isAlreadyExistsError(err) { From 2c6a4243063bd70e680439a21d040707eb870e08 Mon Sep 17 00:00:00 2001 From: OliverTrautvetter <66372584+OliverTrautvetter@users.noreply.github.com> Date: Thu, 5 Mar 2026 13:18:58 +0100 Subject: [PATCH 5/7] test: improve GetInstance error handling in GCPBootstrapper tests --- internal/bootstrap/gcp/gcp.go | 7 +++++-- internal/bootstrap/gcp/gcp_test.go | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/internal/bootstrap/gcp/gcp.go b/internal/bootstrap/gcp/gcp.go index 8a0836a4..ac973b49 100644 --- a/internal/bootstrap/gcp/gcp.go +++ b/internal/bootstrap/gcp/gcp.go @@ -712,8 +712,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { existingInstance, err := b.GCPClient.GetInstance(projectID, zone, vm.Name) if err != nil { - st, ok := status.FromError(err) - if !ok || st.Code() != codes.NotFound { + if !isNotFoundError(err) { errCh <- fmt.Errorf("failed to get instance %s: %w", vm.Name, err) return } @@ -1711,6 +1710,10 @@ func isAlreadyExistsError(err error) bool { return status.Code(err) == codes.AlreadyExists || strings.Contains(err.Error(), "already exists") } +func isNotFoundError(err error) bool { + return status.Code(err) == codes.NotFound || strings.Contains(err.Error(), "not found") +} + // readSSHKey reads an SSH key file, expanding ~ in the path func (b *GCPBootstrapper) readSSHKey(path string) (string, error) { realPath := util.ExpandPath(path) diff --git a/internal/bootstrap/gcp/gcp_test.go b/internal/bootstrap/gcp/gcp_test.go index 196ca03a..1d91ddb4 100644 --- a/internal/bootstrap/gcp/gcp_test.go +++ b/internal/bootstrap/gcp/gcp_test.go @@ -1029,10 +1029,21 @@ var _ = Describe("GCP Bootstrapper", func() { }) It("fails when GetInstance fails", func() { - gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("not found")).Maybe() + instanceCalls := make(map[string]int) + var mu sync.Mutex + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn( + func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + return nil, fmt.Errorf("not found") + } + return nil, fmt.Errorf("get error") + }, + ).Maybe() fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Maybe() gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Maybe() - gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("get error")).Maybe() err := bs.EnsureComputeInstances() Expect(err).To(HaveOccurred()) From e234f5f619aaf130d47a3c99b9951663f83f63d1 Mon Sep 17 00:00:00 2001 From: OliverTrautvetter <66372584+OliverTrautvetter@users.noreply.github.com> Date: Thu, 5 Mar 2026 13:49:46 +0100 Subject: [PATCH 6/7] Update internal/bootstrap/gcp/gcp.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- internal/bootstrap/gcp/gcp.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/bootstrap/gcp/gcp.go b/internal/bootstrap/gcp/gcp.go index ac973b49..2fbea36e 100644 --- a/internal/bootstrap/gcp/gcp.go +++ b/internal/bootstrap/gcp/gcp.go @@ -958,7 +958,6 @@ func isSpotCapacityError(err error) bool { strings.Contains(errStr, "UNSUPPORTED_OPERATION") || strings.Contains(errStr, "stockout") || strings.Contains(errStr, "does not have enough resources") || - strings.Contains(errStr, "quota") || status.Code(err) == codes.ResourceExhausted } From af02bd5753a009dc3ae045013d9c9deea69b7a1f Mon Sep 17 00:00:00 2001 From: OliverTrautvetter <66372584+OliverTrautvetter@users.noreply.github.com> Date: Thu, 5 Mar 2026 14:46:25 +0100 Subject: [PATCH 7/7] feat: implement waitForInstanceRunning to ensure instance readiness and IP assignment --- internal/bootstrap/gcp/gcp.go | 52 +++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/internal/bootstrap/gcp/gcp.go b/internal/bootstrap/gcp/gcp.go index 2fbea36e..c99c666d 100644 --- a/internal/bootstrap/gcp/gcp.go +++ b/internal/bootstrap/gcp/gcp.go @@ -726,21 +726,19 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { errCh <- fmt.Errorf("failed to start stopped instance %s: %w", vm.Name, err) return } - // Re-fetch instance to get updated IPs after start - existingInstance, err = b.GCPClient.GetInstance(projectID, zone, vm.Name) - if err != nil { - errCh <- fmt.Errorf("failed to get instance %s after start: %w", vm.Name, err) - return - } + } + + // Wait until the instance is RUNNING and IPs are populated. + readyInstance, err := b.waitForInstanceRunning(projectID, zone, vm.Name, vm.ExternalIP) + if err != nil { + errCh <- fmt.Errorf("instance %s did not become ready: %w", vm.Name, err) + return } externalIP := "" - internalIP := "" - if len(existingInstance.GetNetworkInterfaces()) > 0 { - internalIP = existingInstance.GetNetworkInterfaces()[0].GetNetworkIP() - if len(existingInstance.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 { - externalIP = existingInstance.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() - } + internalIP := readyInstance.GetNetworkInterfaces()[0].GetNetworkIP() + if len(readyInstance.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 { + externalIP = readyInstance.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() } resultCh <- vmResult{ vmType: vm.Tags[0], @@ -948,6 +946,36 @@ func (b *GCPBootstrapper) createInstanceWithFallback(projectID, zone string, ins return fmt.Errorf("failed to create instance %s: %w", vmName, err) } +// waitForInstanceRunning polls GetInstance until the instance status is RUNNING +// and its internal IP (and external IP, when needsExternalIP is true) are populated. +// It returns the ready instance or an error if the deadline is exceeded. +func (b *GCPBootstrapper) waitForInstanceRunning(projectID, zone, name string, needsExternalIP bool) (*computepb.Instance, error) { + const ( + maxAttempts = 60 + pollInterval = 5 * time.Second + ) + for attempt := range maxAttempts { + inst, err := b.GCPClient.GetInstance(projectID, zone, name) + if err != nil { + return nil, fmt.Errorf("failed to poll instance %s: %w", name, err) + } + + if inst.GetStatus() == "RUNNING" && + len(inst.GetNetworkInterfaces()) > 0 && + inst.GetNetworkInterfaces()[0].GetNetworkIP() != "" && + (!needsExternalIP || (len(inst.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 && + inst.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() != "")) { + return inst, nil + } + + if attempt < maxAttempts-1 { + time.Sleep(pollInterval) + } + } + return nil, fmt.Errorf("timed out waiting for instance %s to be RUNNING with IPs assigned after %s", + name, time.Duration(maxAttempts)*pollInterval) +} + // isSpotCapacityError checks if the error is related to spot VM capacity issues func isSpotCapacityError(err error) bool { if err == nil {