-
Notifications
You must be signed in to change notification settings - Fork 9
feat(fracmanager): add handling for upload queue overflow and disk space exhaustion #266
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -129,6 +129,8 @@ func (r *fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *active | |
| curInfo := old.instance.Info() | ||
| r.stats.sealing.Add(curInfo) | ||
|
|
||
| r.active.Suspend(old.Suspended()) | ||
|
|
||
| wg := sync.WaitGroup{} | ||
| wg.Add(1) | ||
| // since old.WaitWriteIdle() can take some time, we don't want to do it under the lock | ||
|
|
@@ -151,6 +153,31 @@ func (r *fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *active | |
| return old, wg.Wait, nil | ||
| } | ||
|
|
||
| func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) bool { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: seems like return value is never used |
||
| r.mu.Lock() | ||
| defer r.mu.Unlock() | ||
|
|
||
| if maxQueue > 0 && r.stats.sealing.count >= int(maxQueue) { | ||
| r.active.Suspend(true) | ||
| return true | ||
| } | ||
|
|
||
| if maxSize > 0 && r.diskUsage() > maxSize { | ||
| r.active.Suspend(true) | ||
| return true | ||
| } | ||
|
|
||
| r.active.Suspend(false) | ||
| return false | ||
| } | ||
|
|
||
| func (r *fractionRegistry) diskUsage() uint64 { | ||
| return r.active.instance.Info().FullSize() + | ||
| r.stats.sealed.totalSizeOnDisk + | ||
| r.stats.sealing.totalSizeOnDisk + | ||
| r.stats.offloading.totalSizeOnDisk | ||
| } | ||
|
|
||
| // addActive sets a new active fraction and updates the complete fractions list. | ||
| func (r *fractionRegistry) addActive(a *activeProxy) { | ||
| r.muAll.Lock() | ||
|
|
@@ -227,6 +254,10 @@ func (r *fractionRegistry) EvictLocal(shouldOffload bool, sizeLimit uint64) ([]* | |
| // Fractions older than retention period are permanently deleted. | ||
| // Returns removed fractions or empty slice if nothing to remove. | ||
| func (r *fractionRegistry) EvictRemote(retention time.Duration) []*remoteProxy { | ||
| if retention == 0 { | ||
| return nil | ||
| } | ||
|
|
||
| r.mu.Lock() | ||
| defer r.mu.Unlock() | ||
|
|
||
|
|
@@ -248,6 +279,42 @@ func (r *fractionRegistry) EvictRemote(retention time.Duration) []*remoteProxy { | |
| return evicted | ||
| } | ||
|
|
||
| // EvictOverflowed removes oldest fractions from offloading queue when it exceeds size limit. | ||
| // Selects fractions that haven't finished offloading yet to minimize data loss. | ||
| // Used when offloading queue grows too large due to slow remote storage performance. | ||
| func (r *fractionRegistry) EvictOverflowed(sizeLimit uint64) []*sealedProxy { | ||
| if sizeLimit == 0 { | ||
| return nil | ||
| } | ||
|
|
||
| r.mu.Lock() | ||
| defer r.mu.Unlock() | ||
|
|
||
| // Fast path: skip processing if within size limits | ||
| if r.stats.offloading.totalSizeOnDisk <= sizeLimit { | ||
| return nil | ||
| } | ||
|
|
||
| count := 0 | ||
| evicted := []*sealedProxy{} | ||
| // filter fractions | ||
| for _, item := range r.offloading { | ||
| // keep items that are within limits or already offloaded | ||
| if r.stats.offloading.totalSizeOnDisk <= sizeLimit || item.remote != nil { | ||
| r.offloading[count] = item | ||
| count++ | ||
| continue | ||
| } | ||
| evicted = append(evicted, item) | ||
| r.stats.offloading.Sub(item.instance.Info()) | ||
| } | ||
|
|
||
| r.offloading = r.offloading[:count] | ||
| r.rebuildAllFractions() | ||
|
|
||
| return evicted | ||
| } | ||
|
|
||
| // PromoteToSealed moves fractions from sealing to local queue when sealing completes. | ||
| // Maintains strict ordering - younger fractions wait for older ones to seal first. | ||
| func (r *fractionRegistry) PromoteToSealed(active *activeProxy, sealed *frac.Sealed) { | ||
|
|
@@ -322,6 +389,11 @@ func (r *fractionRegistry) removeFromOffloading(sealed *sealedProxy) { | |
| count++ | ||
| } | ||
| } | ||
|
|
||
| if count == len(r.offloading) { // not found to remove (can be removed earlier in EvictOverflowed) | ||
| return | ||
| } | ||
|
|
||
| r.offloading = r.offloading[:count] | ||
| r.stats.offloading.Sub(sealed.instance.Info()) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ type lifecycleManager struct { | |
| provider *fractionProvider // provider for fraction operations | ||
| flags *StateManager // storage state flags | ||
| registry *fractionRegistry // fraction state registry | ||
| tasks *TaskManager // Background offloading tasks | ||
|
|
||
| sealingWg sync.WaitGroup | ||
| } | ||
|
|
@@ -36,18 +37,26 @@ func newLifecycleManager( | |
| provider: provider, | ||
| flags: flags, | ||
| registry: registry, | ||
| tasks: NewTaskManager(), | ||
| } | ||
| } | ||
|
|
||
| // Maintain performs periodic lifecycle management tasks. | ||
| // It coordinates rotation, offloading, cleanup based on configuration. | ||
| func (lc *lifecycleManager) Maintain(ctx context.Context, config *Config, wg *sync.WaitGroup) { | ||
| lc.rotate(config.FracSize, wg) | ||
| if config.OffloadingEnabled { | ||
| lc.offloadLocal(ctx, config.TotalSize, wg) | ||
| lc.cleanRemote(config.OffloadingRetention, wg) | ||
| func (lc *lifecycleManager) Maintain(ctx context.Context, cfg *Config, wg *sync.WaitGroup) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: Maybe you can add a wrapper method for this. Now it's done like this in fm.mu.Lock()
// Perform fraction maintenance (rotation, truncating, offloading, etc.)
fm.lc.Maintain(ctx, cfg, wg)
fm.mu.Unlock()I do not like that we grab mutexes somewhere in the related code. So I suggest to do just: // Perform fraction maintenance (rotation, truncating, offloading, etc.)
fm.Maintain(ctx, cfg, wg) |
||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: Empty line. |
||
| suspendThreshold := cfg.TotalSize + cfg.TotalSize/100 + cfg.OffloadingQueueSize | ||
| lc.registry.SuspendIfOverCapacity(cfg.SealingQueueLen, suspendThreshold) | ||
|
|
||
| lc.rotate(cfg.FracSize, wg) | ||
| if cfg.OffloadingEnabled { | ||
| lc.offloadLocal(ctx, cfg.TotalSize, cfg.OffloadingRetryDelay, wg) | ||
| if cfg.OffloadingQueueSize > 0 { | ||
| lc.removeOverflowed(cfg.OffloadingQueueSize, wg) | ||
| } | ||
|
Comment on lines
+54
to
+56
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Shouldn't we check for overflow before making an attempt to offload sealed fractions? |
||
| lc.cleanRemote(cfg.OffloadingRetention, wg) | ||
| } else { | ||
| lc.cleanLocal(config.TotalSize, wg) | ||
| lc.cleanLocal(cfg.TotalSize, wg) | ||
| } | ||
| lc.updateOldestMetric() | ||
| lc.SyncInfoCache() | ||
|
|
@@ -113,17 +122,18 @@ func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { | |
|
|
||
| // offloadLocal starts offloading of local fractions to remote storage. | ||
| // Selects fractions based on disk space usage and retention policy. | ||
| func (lc *lifecycleManager) offloadLocal(ctx context.Context, sizeLimit uint64, wg *sync.WaitGroup) { | ||
| func (lc *lifecycleManager) offloadLocal(ctx context.Context, sizeLimit uint64, retryDelay time.Duration, wg *sync.WaitGroup) { | ||
| toOffload, err := lc.registry.EvictLocal(true, sizeLimit) | ||
| if err != nil { | ||
| logger.Fatal("error releasing old fractions:", zap.Error(err)) | ||
| } | ||
| for _, sealed := range toOffload { | ||
| wg.Add(1) | ||
| go func() { | ||
| lc.tasks.Run(sealed.instance.BaseFileName, ctx, func(ctx context.Context) { | ||
| defer wg.Done() | ||
|
|
||
| remote, _ := lc.tryOffload(ctx, sealed.instance) | ||
| remote := lc.offloadWithRetry(ctx, sealed.instance, retryDelay) | ||
|
|
||
| lc.registry.PromoteToRemote(sealed, remote) | ||
|
|
||
| if remote == nil { | ||
|
|
@@ -136,7 +146,41 @@ func (lc *lifecycleManager) offloadLocal(ctx context.Context, sizeLimit uint64, | |
| // free up local resources | ||
| sealed.instance.Suicide() | ||
| maintenanceTruncateTotal.Add(1) | ||
| }() | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| // OffloadWithRetry attempts to offload a fraction with retries until success or cancellation. | ||
| // Returns the remote fraction instance and a boolean indicating whether offloading was not canceled. | ||
| func (lc *lifecycleManager) offloadWithRetry(ctx context.Context, sealed *frac.Sealed, retryDelay time.Duration) *frac.Remote { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Previously we attempted to offload fraction Now, there might be more than Also |
||
| start := time.Now() | ||
| for i := 0; ; i++ { | ||
| remote, err := lc.tryOffload(ctx, sealed) | ||
| if err == nil { | ||
| return remote | ||
| } | ||
|
|
||
| logger.Warn( | ||
| "fail to offload fraction", | ||
| zap.String("name", sealed.BaseFileName), | ||
| zap.Duration("offloading_time", time.Since(start)), | ||
| zap.Int("attempts", i), | ||
| zap.Error(err), | ||
| ) | ||
|
|
||
| select { | ||
| case <-ctx.Done(): | ||
| logger.Info( | ||
| "fraction offloading was stopped", | ||
| zap.String("name", sealed.BaseFileName), | ||
| zap.Duration("offloading_time", time.Since(start)), | ||
| zap.Int("attempts", i), | ||
| zap.Error(ctx.Err()), | ||
| ) | ||
| return nil | ||
| case <-time.After(retryDelay): | ||
| // Wait before next retry attempt | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -163,9 +207,6 @@ func (lc *lifecycleManager) tryOffload(ctx context.Context, sealed *frac.Sealed) | |
|
|
||
| // cleanRemote deletes outdated remote fractions based on retention policy. | ||
| func (lc *lifecycleManager) cleanRemote(retention time.Duration, wg *sync.WaitGroup) { | ||
| if retention == 0 { | ||
| return | ||
| } | ||
| toDelete := lc.registry.EvictRemote(retention) | ||
| wg.Add(1) | ||
| go func() { | ||
|
|
@@ -207,3 +248,18 @@ func (lc *lifecycleManager) updateOldestMetric() { | |
| oldestFracTime.WithLabelValues("remote").Set((time.Duration(lc.registry.OldestTotal()) * time.Millisecond).Seconds()) | ||
| oldestFracTime.WithLabelValues("local").Set((time.Duration(lc.registry.OldestLocal()) * time.Millisecond).Seconds()) | ||
| } | ||
|
|
||
| // removeOverflowed removes fractions from offloading queue that exceed size limit | ||
| // Stops ongoing offloading tasks and cleans up both local and remote resources. | ||
| func (lc *lifecycleManager) removeOverflowed(sizeLimit uint64, wg *sync.WaitGroup) { | ||
| evicted := lc.registry.EvictOverflowed(sizeLimit) | ||
| for _, item := range evicted { | ||
| wg.Add(1) | ||
| go func() { | ||
| defer wg.Done() | ||
| // Cancel the offloading task - this operation may take significant time | ||
| // hence executed in a separate goroutine to avoid blocking | ||
| lc.tasks.Cancel(item.instance.BaseFileName) | ||
| }() | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add corresponding validations (of the new config options) into
config/validation.go.