Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
8cd3286
feat: added two new fields to cr and generated boilerplate
hmbill694 Dec 11, 2025
d280f12
feat: add checks to see if we have a missing artifact in provisioner
hmbill694 Dec 11, 2025
0ad7e70
refactor: rename fields to be more descriptive
hmbill694 Dec 11, 2025
40f5411
feat: add new values to config to handle retrying totally failed syncs
hmbill694 Dec 11, 2025
d3bc6e9
feat: wired up
hmbill694 Dec 12, 2025
36fe3fb
refactor: moved to time based failure
hmbill694 Dec 12, 2025
ac4ad75
refactor: let library check on sync
hmbill694 Dec 12, 2025
8583667
refactor: remove other unneeded fields
hmbill694 Dec 12, 2025
60c612f
refactor: formatting
hmbill694 Dec 12, 2025
b4ca4d5
chore: generate manifests again
hmbill694 Dec 12, 2025
e7a8cb4
fix: messy names and duplicate env var targeting
hmbill694 Dec 12, 2025
6e4f304
feat: added more condition reasons
hmbill694 Dec 12, 2025
9930590
feat: added new phase for visibility into retry loop
hmbill694 Dec 12, 2025
e114b8d
feat: handling new phase in controller and service code
hmbill694 Dec 12, 2025
5287187
chore: regenerate chart to include new phase
hmbill694 Dec 12, 2025
610f34c
fix: wait 10 seconds before checking sync status again
hmbill694 Dec 12, 2025
6c138e7
chore: regenerate resources after adding LastFailureTime to spec
hmbill694 Dec 12, 2025
799e8fd
fix: resolve finalizer conflict and now respecting backoff behavior
hmbill694 Dec 12, 2025
8dbf19e
chore: regenerate the chart
hmbill694 Dec 12, 2025
139fe04
fix: pr feedback
hmbill694 Dec 16, 2025
e1b65e0
fix: resolve nit
hmbill694 Dec 16, 2025
07ee5f8
fix: consistent names
hmbill694 Dec 16, 2025
bdbe599
fix: better boolean name
hmbill694 Dec 16, 2025
be78d45
fix: 10 as concurrency limit on sync. Average number of labs per sync…
hmbill694 Jan 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 17 additions & 12 deletions api/v1alpha1/vmdiskimage_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,24 @@ const (

// Condition Reasons
const (
ReasonResourceCreationFailed string = "ResourceCreationFailed"
ReasonResouceUpdateFailed string = "ResourceUpdateFailed"
ReasonQueued string = "Queued"
ReasonSyncing string = "Syncing"
ReasonRetryLimitExceeded string = "RetryLimitExceeded"
ReasonSynced string = "Synced"
ReasonResourceCreationFailed string = "ResourceCreationFailed"
ReasonResouceUpdateFailed string = "ResourceUpdateFailed"
ReasonQueued string = "Queued"
ReasonSyncing string = "Syncing"
ReasonRetryLimitExceeded string = "RetryLimitExceeded"
ReasonMissingSourceArtifact string = "MissingSourceArtifact"
ReasonSyncAttemptDurationExceeded string = "SyncAttemptDurationExceeded"
ReasonUnknownSyncFailure string = "UnknownSyncFailure"
ReasonSynced string = "Synced"
)

// CRD phases
const (
PhaseQueued string = "Queued"
PhaseSyncing string = "Syncing"
PhaseReady string = "Ready"
PhaseFailed string = "Failed"
PhaseQueued string = "Queued"
PhaseSyncing string = "Syncing"
PhaseReady string = "Ready"
PhaseRetryableFailure string = "RetryableFailure"
PhaseFailed string = "Failed"
)

// VMDiskImage Labels
Expand Down Expand Up @@ -94,7 +98,7 @@ type VMDiskImageStatus struct {
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
// Important: Run "make" to regenerate code after modifying this file

// +kubebuilder:validation:Enum=Queued;Syncing;Ready;Failed
// +kubebuilder:validation:Enum=Queued;Syncing;Ready;Failed;RetryableFailure
Phase string `json:"phase"`

// A human-readable message providing more details about the current phase.
Expand All @@ -103,7 +107,8 @@ type VMDiskImageStatus struct {
// Conditions of the VMDiskImage resource.
Conditions []metav1.Condition `json:"conditions,omitempty"`

FailureCount int `json:"failureCount,omitempty"`
FailureCount int `json:"failureCount,omitempty"`
LastFailureTime *metav1.Time `json:"lastFailureTime,omitempty"`
}

// +kubebuilder:object:root=true
Expand Down
4 changes: 4 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ spec:
type: array
failureCount:
type: integer
lastFailureTime:
format: date-time
type: string
message:
description: A human-readable message providing more details about the current phase.
type: string
Expand All @@ -146,6 +149,7 @@ spec:
- Syncing
- Ready
- Failed
- RetryableFailure
type: string
required:
- phase
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ spec:
type: array
failureCount:
type: integer
lastFailureTime:
format: date-time
type: string
message:
description: A human-readable message providing more details about
the current phase.
Expand All @@ -150,6 +153,7 @@ spec:
- Syncing
- Ready
- Failed
- RetryableFailure
type: string
required:
- phase
Expand Down
4 changes: 4 additions & 0 deletions dist/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ spec:
type: array
failureCount:
type: integer
lastFailureTime:
format: date-time
type: string
message:
description: A human-readable message providing more details about
the current phase.
Expand All @@ -158,6 +161,7 @@ spec:
- Syncing
- Ready
- Failed
- RetryableFailure
type: string
required:
- phase
Expand Down
42 changes: 24 additions & 18 deletions internal/vm-disk-image/config/vmdi-controller-config-reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,44 @@ import (
)

const (
defaultConcurrency = 10 // TODO: We will need to tune this default
defaultRetryLimit = 2
defaultBackoffDuration = 10 * time.Second
defaultMaxSyncDuration = 1 * time.Hour
defaultConcurrency = 10
defaultMaxBackoffDelay = 1 * time.Hour
defaultMaxSyncDuration = 12 * time.Hour
defaultMaxSyncAttemptRetries = 3
defaultMaxSyncAttemptDuration = 1 * time.Hour
)

type VMDiskImageControllerConfig struct {
Concurrency int
RetryLimit int
RetryBackoffDuration time.Duration
MaxSyncDuration time.Duration
Concurrency int
MaxBackoffDelay time.Duration
MaxSyncDuration time.Duration
MaxSyncAttemptDuration time.Duration
MaxSyncAttemptRetries int
}

// LoadVMDIControllerConfigFromEnv reads the VMDiskImage controller's config values from the environment.
// Locally these come from your "env"; in production they come from a ConfigMap.
func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig {
// The maximum number of VMDIs we can have syncing at one time.
concurrency := corecfg.GetIntEnvOrDefault("CONCURRENCY", defaultConcurrency)
concurrency := corecfg.GetIntEnvOrDefault("MAX_VMDI_SYNC_CONCURRENCY", defaultConcurrency)

// How many times we will retry a failed sync.
retryLimit := corecfg.GetIntEnvOrDefault("RETRY_LIMIT", defaultRetryLimit)
// The longest we will ever wait to retry.
maxBackoffDelay := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxBackoffDelay)

// How long we want to wait before trying to resync a failed VMDI.
retryBackoffDuration := corecfg.GetDurationEnvOrDefault("RETRY_BACKOFF_DURATION", defaultBackoffDuration)
// How long we will try to run a sync before we fail it forever.
maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_DURATION", defaultMaxSyncDuration)

// How long we will let a VMDI sit in syncing status.
maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_DURATION", defaultMaxSyncDuration)
maxAttemptDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptDuration)

// How many times we will retry within a given sync attempt.
maxSyncAttemptRetries := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries)

return VMDiskImageControllerConfig{
Concurrency: concurrency,
RetryLimit: retryLimit,
RetryBackoffDuration: retryBackoffDuration,
MaxSyncDuration: maxSyncDuration,
Concurrency: concurrency,
MaxBackoffDelay: maxBackoffDelay,
MaxSyncAttemptDuration: maxAttemptDuration,
MaxSyncAttemptRetries: maxSyncAttemptRetries,
MaxSyncDuration: maxSyncDuration,
}
}
32 changes: 15 additions & 17 deletions internal/vm-disk-image/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,9 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return r.VMDiskImageOrchestrator.DeleteResource(ctx, &VMDiskImage)
}

resourceHasFinalizer := !crutils.ContainsFinalizer(&VMDiskImage, crdv1.VMDiskImageFinalizer)
if resourceHasFinalizer {
err := r.AddControllerFinalizer(ctx, &VMDiskImage)
if err != nil {
return r.HandleResourceUpdateError(ctx, &VMDiskImage, err, "Failed to add finalizer to our resource")
}

resourceMissingFinalizer := !crutils.ContainsFinalizer(&VMDiskImage, crdv1.VMDiskImageFinalizer)
if resourceMissingFinalizer {
return r.AddControllerFinalizer(ctx, &VMDiskImage)
}

currentPhase := VMDiskImage.Status.Phase
Expand All @@ -93,6 +89,8 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return r.AttemptSyncingOfResource(ctx, &VMDiskImage)
case crdv1.PhaseSyncing:
return r.TransitonFromSyncing(ctx, &VMDiskImage)
case crdv1.PhaseRetryableFailure:
return r.AttemptRetry(ctx, &VMDiskImage)
case crdv1.PhaseReady, crdv1.PhaseFailed:
return ctrl.Result{}, nil
default:
Expand All @@ -112,18 +110,18 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error {

resourceGenerator := &vmdi.Generator{}
vmdiProvisioner := vmdi.K8sVMDIProvisioner{
Client: client,
ResourceGenerator: resourceGenerator,
MaxSyncDuration: config.MaxSyncDuration,
RetryLimit: config.RetryLimit,
Client: client,
ResourceGenerator: resourceGenerator,
MaxSyncAttemptDuration: config.MaxSyncAttemptDuration,
MaxSyncAttemptRetries: config.MaxSyncAttemptRetries,
}
orchestrator := vmdi.Orchestrator{
Client: client,
Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
Provisioner: vmdiProvisioner,
RetryLimit: config.RetryLimit,
RetryBackoff: config.RetryBackoffDuration,
SyncLimit: config.Concurrency,
Client: client,
Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
Provisioner: vmdiProvisioner,
MaxRetryBackoff: config.MaxBackoffDelay,
MaxSyncTime: config.MaxSyncDuration,
ConcurrentSyncLimit: config.Concurrency,
}
reconciler := &VMDiskImageReconciler{
Scheme: mgr.GetScheme(),
Expand Down
Loading