-
Notifications
You must be signed in to change notification settings - Fork 2
feat: add support for Spot VMs and validation for provisioning options #213
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9181696
1b7f037
1ff3b6f
f5cdbb3
7b78e18
bd82e50
2c6a424
e234f5f
af02bd5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -93,6 +93,7 @@ type CodesphereEnvironment struct { | |
| InstallHash string `json:"install_hash"` | ||
| InstallSkipSteps []string `json:"install_skip_steps"` | ||
| Preemptible bool `json:"preemptible"` | ||
| Spot bool `json:"spot"` | ||
| WriteConfig bool `json:"-"` | ||
| GatewayIP string `json:"gateway_ip"` | ||
| PublicGatewayIP string `json:"public_gateway_ip"` | ||
|
|
@@ -306,9 +307,22 @@ func (b *GCPBootstrapper) ValidateInput() error { | |
| return err | ||
| } | ||
|
|
||
| err = b.validateVMProvisioningOptions() | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| return b.validateGithubParams() | ||
| } | ||
|
|
||
| // validateVMProvisioningOptions checks that spot and preemptible options are not both set | ||
| func (b *GCPBootstrapper) validateVMProvisioningOptions() error { | ||
| if b.Env.Spot && b.Env.Preemptible { | ||
| return fmt.Errorf("cannot specify both --spot and --preemptible flags; use --spot for the newer spot VM model") | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| // validateInstallVersion checks if the specified install version exists and contains the required installer artifact | ||
| func (b *GCPBootstrapper) validateInstallVersion() error { | ||
| if b.Env.InstallLocal != "" { | ||
|
|
@@ -686,6 +700,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { | |
| wg := sync.WaitGroup{} | ||
| errCh := make(chan error, len(vmDefs)) | ||
| resultCh := make(chan vmResult, len(vmDefs)) | ||
| logCh := make(chan string, len(vmDefs)) | ||
| rootDiskSize := int64(200) | ||
| if b.Env.RegistryType == RegistryTypeGitHub { | ||
| rootDiskSize = 50 | ||
|
|
@@ -694,6 +709,47 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { | |
| wg.Add(1) | ||
| go func(vm VMDef) { | ||
| defer wg.Done() | ||
|
|
||
| existingInstance, err := b.GCPClient.GetInstance(projectID, zone, vm.Name) | ||
| if err != nil { | ||
| if !isNotFoundError(err) { | ||
| errCh <- fmt.Errorf("failed to get instance %s: %w", vm.Name, err) | ||
| return | ||
| } | ||
| } | ||
| if existingInstance != nil { | ||
| instanceStatus := existingInstance.GetStatus() | ||
| if instanceStatus == "TERMINATED" || instanceStatus == "STOPPED" || instanceStatus == "SUSPENDED" { | ||
| // Start the stopped instance | ||
| err = b.GCPClient.StartInstance(projectID, zone, vm.Name) | ||
| if err != nil { | ||
| errCh <- fmt.Errorf("failed to start stopped instance %s: %w", vm.Name, err) | ||
| return | ||
| } | ||
| } | ||
|
|
||
| // Wait until the instance is RUNNING and IPs are populated. | ||
| readyInstance, err := b.waitForInstanceRunning(projectID, zone, vm.Name, vm.ExternalIP) | ||
| if err != nil { | ||
| errCh <- fmt.Errorf("instance %s did not become ready: %w", vm.Name, err) | ||
| return | ||
| } | ||
|
|
||
| externalIP := "" | ||
| internalIP := readyInstance.GetNetworkInterfaces()[0].GetNetworkIP() | ||
| if len(readyInstance.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 { | ||
| externalIP = readyInstance.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() | ||
| } | ||
|
Comment on lines
+738
to
+742
|
||
| resultCh <- vmResult{ | ||
| vmType: vm.Tags[0], | ||
| name: vm.Name, | ||
| externalIP: externalIP, | ||
| internalIP: internalIP, | ||
| } | ||
| return | ||
| } | ||
|
|
||
| // Instance doesn't exist, create it | ||
| disks := []*computepb.AttachedDisk{ | ||
| { | ||
| Boot: protoBool(true), | ||
|
|
@@ -737,9 +793,7 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { | |
| Tags: &computepb.Tags{ | ||
| Items: vm.Tags, | ||
| }, | ||
| Scheduling: &computepb.Scheduling{ | ||
| Preemptible: &b.Env.Preemptible, | ||
| }, | ||
| Scheduling: b.buildSchedulingConfig(), | ||
| NetworkInterfaces: []*computepb.NetworkInterface{ | ||
| { | ||
| Network: protoString(network), | ||
|
|
@@ -767,9 +821,9 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { | |
| } | ||
| } | ||
|
|
||
| err = b.GCPClient.CreateInstance(projectID, zone, instance) | ||
| if err != nil && !isAlreadyExistsError(err) { | ||
| errCh <- fmt.Errorf("failed to create instance %s: %w", vm.Name, err) | ||
| err = b.createInstanceWithFallback(projectID, zone, instance, vm.Name, logCh) | ||
| if err != nil { | ||
| errCh <- err | ||
| return | ||
| } | ||
|
|
||
|
|
@@ -802,6 +856,11 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { | |
|
|
||
| close(errCh) | ||
| close(resultCh) | ||
| close(logCh) | ||
|
|
||
| for msg := range logCh { | ||
| b.stlog.Logf("%s", msg) | ||
| } | ||
|
|
||
| var errs []error | ||
| for err := range errCh { | ||
|
|
@@ -843,6 +902,93 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { | |
| return nil | ||
| } | ||
|
|
||
| // buildSchedulingConfig creates the scheduling configuration based on spot/preemptible settings | ||
| func (b *GCPBootstrapper) buildSchedulingConfig() *computepb.Scheduling { | ||
| if b.Env.Spot { | ||
| return &computepb.Scheduling{ | ||
| ProvisioningModel: protoString("SPOT"), | ||
| OnHostMaintenance: protoString("TERMINATE"), | ||
| AutomaticRestart: protoBool(false), | ||
| InstanceTerminationAction: protoString("STOP"), | ||
| } | ||
| } | ||
| if b.Env.Preemptible { | ||
| return &computepb.Scheduling{ | ||
| Preemptible: protoBool(true), | ||
| } | ||
| } | ||
|
|
||
| return &computepb.Scheduling{} | ||
| } | ||
|
|
||
| // createInstanceWithFallback attempts to create an instance with the configured settings. | ||
| // If spot VMs are enabled and creation fails due to capacity issues, it falls back to standard VMs. | ||
| func (b *GCPBootstrapper) createInstanceWithFallback(projectID, zone string, instance *computepb.Instance, vmName string, logCh chan<- string) error { | ||
| err := b.GCPClient.CreateInstance(projectID, zone, instance) | ||
| if err == nil { | ||
| return nil | ||
| } | ||
|
|
||
| if isAlreadyExistsError(err) { | ||
| return nil | ||
| } | ||
|
|
||
| if b.Env.Spot && isSpotCapacityError(err) { | ||
| logCh <- fmt.Sprintf("Spot capacity unavailable for %s, falling back to standard VM", vmName) | ||
| instance.Scheduling = &computepb.Scheduling{} | ||
OliverTrautvetter marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| err = b.GCPClient.CreateInstance(projectID, zone, instance) | ||
| if err != nil && !isAlreadyExistsError(err) { | ||
| return fmt.Errorf("failed to create instance %s (fallback to standard VM): %w", vmName, err) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| return fmt.Errorf("failed to create instance %s: %w", vmName, err) | ||
| } | ||
|
|
||
| // waitForInstanceRunning polls GetInstance until the instance status is RUNNING | ||
| // and its internal IP (and external IP, when needsExternalIP is true) are populated. | ||
| // It returns the ready instance or an error if the deadline is exceeded. | ||
| func (b *GCPBootstrapper) waitForInstanceRunning(projectID, zone, name string, needsExternalIP bool) (*computepb.Instance, error) { | ||
| const ( | ||
| maxAttempts = 60 | ||
| pollInterval = 5 * time.Second | ||
| ) | ||
| for attempt := range maxAttempts { | ||
| inst, err := b.GCPClient.GetInstance(projectID, zone, name) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to poll instance %s: %w", name, err) | ||
| } | ||
|
|
||
| if inst.GetStatus() == "RUNNING" && | ||
| len(inst.GetNetworkInterfaces()) > 0 && | ||
| inst.GetNetworkInterfaces()[0].GetNetworkIP() != "" && | ||
| (!needsExternalIP || (len(inst.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 && | ||
| inst.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() != "")) { | ||
| return inst, nil | ||
| } | ||
|
|
||
| if attempt < maxAttempts-1 { | ||
| time.Sleep(pollInterval) | ||
| } | ||
| } | ||
| return nil, fmt.Errorf("timed out waiting for instance %s to be RUNNING with IPs assigned after %s", | ||
| name, time.Duration(maxAttempts)*pollInterval) | ||
| } | ||
|
|
||
| // isSpotCapacityError checks if the error is related to spot VM capacity issues | ||
| func isSpotCapacityError(err error) bool { | ||
| if err == nil { | ||
| return false | ||
| } | ||
| errStr := err.Error() | ||
| return strings.Contains(errStr, "ZONE_RESOURCE_POOL_EXHAUSTED") || | ||
| strings.Contains(errStr, "UNSUPPORTED_OPERATION") || | ||
| strings.Contains(errStr, "stockout") || | ||
| strings.Contains(errStr, "does not have enough resources") || | ||
| status.Code(err) == codes.ResourceExhausted | ||
| } | ||
|
|
||
| // EnsureGatewayIPAddresses reserves 2 static external IP addresses for the ingress | ||
| // controllers of the cluster. | ||
| func (b *GCPBootstrapper) EnsureGatewayIPAddresses() error { | ||
|
|
@@ -1591,6 +1737,10 @@ func isAlreadyExistsError(err error) bool { | |
| return status.Code(err) == codes.AlreadyExists || strings.Contains(err.Error(), "already exists") | ||
| } | ||
|
|
||
| func isNotFoundError(err error) bool { | ||
| return status.Code(err) == codes.NotFound || strings.Contains(err.Error(), "not found") | ||
| } | ||
|
|
||
| // readSSHKey reads an SSH key file, expanding ~ in the path | ||
| func (b *GCPBootstrapper) readSSHKey(path string) (string, error) { | ||
| realPath := util.ExpandPath(path) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.