From 38a7993c75c774e93cde5622a3f2cf1b1334fe90 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Mon, 24 Nov 2025 18:04:52 +0300 Subject: [PATCH] feat(fracmanager): implement fraction snapshots with wait group reference counting --- fracmanager/fracmanager.go | 17 +-- fracmanager/fracmanager_for_tests.go | 2 +- fracmanager/fracmanager_test.go | 25 ++-- fracmanager/fracs_stats.go | 4 + fracmanager/fraction_provider.go | 23 +++- fracmanager/fraction_registry.go | 168 ++++++++++--------------- fracmanager/fractions_snapshot.go | 93 ++++++++++++++ fracmanager/lifecycle_manager.go | 83 ++++-------- fracmanager/lifecycle_manager_test.go | 123 ++++++++++++++---- fracmanager/proxy_frac.go | 174 -------------------------- fracmanager/sync_appender.go | 59 +++++++++ storeapi/grpc_async_search.go | 8 +- storeapi/grpc_fetch.go | 5 +- storeapi/grpc_search.go | 5 +- storeapi/grpc_v1.go | 5 +- 15 files changed, 409 insertions(+), 385 deletions(-) create mode 100644 fracmanager/fractions_snapshot.go delete mode 100644 fracmanager/proxy_frac.go create mode 100644 fracmanager/sync_appender.go diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index 50d7e1f3..da8d9935 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -76,18 +76,19 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client) (*FracManager, func cancel() wg.Wait() - // freeze active fraction to prevent new writes - active := lc.registry.Active() - if err := active.Finalize(); err != nil { + // finalize appender fraction to prevent new writes + appender := lc.registry.Appender() + if err := appender.Finalize(); err != nil { logger.Fatal("shutdown fraction freezing error", zap.Error(err)) } - active.WaitWriteIdle() + appender.WaitWriteIdle() stopIdx() lc.SyncInfoCache() - sealOnShutdown(active.instance, provider, cfg.MinSealFracSize) + // Seal active fraction + sealOnShutdown(appender.frac, provider, cfg.MinSealFracSize) logger.Info("fracmanager's workers are stopped", zap.Int64("took_ms", time.Since(n).Milliseconds())) } @@ -95,8 +96,8 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client) (*FracManager, func return &fm, stop, nil } -func (fm *FracManager) Fractions() List { - return fm.lc.registry.AllFractions() +func (fm *FracManager) FractionsSnapshot() (List, ReleaseSnapshot) { + return fm.lc.registry.FractionsSnapshot() } func (fm *FracManager) Oldest() uint64 { @@ -116,7 +117,7 @@ func (fm *FracManager) Append(ctx context.Context, docs, metas storage.DocBlock) return ctx.Err() default: // Try to append data to the currently active fraction - err := fm.lc.registry.Active().Append(docs, metas) + err := fm.lc.registry.Appender().Append(docs, metas) if err != nil { logger.Info("append fail", zap.Error(err)) if err == ErrFractionNotWritable { diff --git a/fracmanager/fracmanager_for_tests.go b/fracmanager/fracmanager_for_tests.go index ab7cd851..c4ec1cad 100644 --- a/fracmanager/fracmanager_for_tests.go +++ b/fracmanager/fracmanager_for_tests.go @@ -3,7 +3,7 @@ package fracmanager import "sync" func (fm *FracManager) WaitIdleForTests() { - fm.lc.registry.Active().WaitWriteIdle() + fm.lc.registry.Appender().WaitWriteIdle() } func (fm *FracManager) SealForcedForTests() { diff --git a/fracmanager/fracmanager_test.go b/fracmanager/fracmanager_test.go index 89437904..43afe89e 100644 --- a/fracmanager/fracmanager_test.go +++ b/fracmanager/fracmanager_test.go @@ -52,30 +52,33 @@ func TestSealingOnShutdown(t *testing.T) { cfg.MinSealFracSize = 0 // to ensure that the frac will not be sealed 
on shutdown cfg, fm, stop := setupFracManager(t, cfg) appendDocsToFracManager(t, fm, 10) - activeName := fm.Fractions()[0].Info().Name() + + fractions := fm.lc.registry.all.fractions + activeName := fractions[0].Info().Name() + stop() // second start cfg.MinSealFracSize = 1 // to ensure that the frac will be sealed on shutdown cfg, fm, stop = setupFracManager(t, cfg) - assert.Equal(t, 1, len(fm.Fractions()), "should have one fraction") - assert.Equal(t, activeName, fm.Fractions()[0].Info().Name(), "fraction should have the same name") - _, ok := fm.Fractions()[0].(*fractionProxy).impl.(*frac.Active) + fractions = fm.lc.registry.all.fractions + assert.Equal(t, 1, len(fractions), "should have one fraction") + assert.Equal(t, activeName, fractions[0].Info().Name(), "fraction should have the same name") + _, ok := fractions[0].(*frac.Active) assert.True(t, ok, "fraction should be active") - stop() // third start _, fm, stop = setupFracManager(t, cfg) - assert.Equal(t, 2, len(fm.Fractions()), "should have 2 fraction: new active and old sealed") - _, ok = fm.Fractions()[0].(*fractionProxy).impl.(*frac.Sealed) + fractions = fm.lc.registry.all.fractions + assert.Equal(t, 2, len(fractions), "should have 2 fraction: new active and old sealed") + _, ok = fractions[0].(*frac.Sealed) assert.True(t, ok, "first fraction should be sealed") - assert.Equal(t, activeName, fm.Fractions()[0].Info().Name(), "sealed fraction should have the same name") - assert.Equal(t, uint32(0), fm.Fractions()[1].Info().DocsTotal, "active fraction should be empty") - _, ok = fm.Fractions()[1].(*fractionProxy).impl.(*frac.Active) + assert.Equal(t, activeName, fractions[0].Info().Name(), "sealed fraction should have the same name") + assert.Equal(t, uint32(0), fractions[1].Info().DocsTotal, "active fraction should be empty") + _, ok = fractions[1].(*frac.Active) assert.True(t, ok, "new fraction should be active") - stop() } diff --git a/fracmanager/fracs_stats.go b/fracmanager/fracs_stats.go index 968b8b41..c70bbd37 100644 --- a/fracmanager/fracs_stats.go +++ b/fracmanager/fracs_stats.go @@ -95,3 +95,7 @@ func (s *registryStats) SetMetrics() { s.offloading.SetMetrics(dataSizeTotal, "offloading") s.remotes.SetMetrics(dataSizeTotal, "remotes") } + +func (s registryStats) TotalSizeOnDiskLocal() uint64 { + return s.sealing.totalSizeOnDisk + s.sealed.totalSizeOnDisk +} diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index e2915598..cb4a6eec 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -8,13 +8,16 @@ import ( "time" "github.com/oklog/ulid/v2" + "go.uber.org/zap" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" + "github.com/ozontech/seq-db/util" ) const fileBasePattern = "seq-db-" @@ -107,8 +110,11 @@ func (fp *fractionProvider) CreateActive() *frac.Active { // Seal converts an active fraction to a sealed one // Process includes sorting, indexing, and data optimization for reading -func (fp *fractionProvider) Seal(active *frac.Active) (*frac.Sealed, error) { - src, err := frac.NewActiveSealingSource(active, fp.config.SealParams) +func (fp *fractionProvider) Seal(a *frac.Active) (*frac.Sealed, error) { + sealsTotal.Inc() + now := time.Now() + + src, err := frac.NewActiveSealingSource(a, fp.config.SealParams) if err != nil { return 
nil, err } @@ -117,7 +123,18 @@ func (fp *fractionProvider) Seal(active *frac.Active) (*frac.Sealed, error) { return nil, err } - return fp.NewSealedPreloaded(active.BaseFileName, preloaded), nil + s := fp.NewSealedPreloaded(a.BaseFileName, preloaded) + + sealingTime := time.Since(now) + sealsDoneSeconds.Observe(sealingTime.Seconds()) + + logger.Info( + "fraction sealed", + zap.String("fraction", filepath.Base(s.BaseFileName)), + zap.Float64("time_spent_s", util.DurationToUnit(sealingTime, "s")), + ) + + return s, nil } // Offload uploads fraction to S3 storage and returns a remote fraction diff --git a/fracmanager/fraction_registry.go b/fracmanager/fraction_registry.go index 2aeca819..f69d07bd 100644 --- a/fracmanager/fraction_registry.go +++ b/fracmanager/fraction_registry.go @@ -18,18 +18,18 @@ type fractionRegistry struct { mu sync.RWMutex // main mutex for protecting registry state // lifecycle queues (FIFO order, oldest at lower indexes) - sealing []*activeProxy // fractions being sealed (0-5 typical) - sealed []*sealedProxy // local sealed fractions (can be thousands) - offloading []*sealedProxy // fractions being offloaded (0-5 typical) - remotes []*remoteProxy // offloaded fractions (can be thousands) + sealing []*activeSyncDestroyable // fractions being sealed (0-5 typical) + sealed []*sealedSyncDestroyable // local sealed fractions (can be thousands) + offloading []*sealedSyncDestroyable // fractions being offloaded (0-5 typical) + remotes []*remoteSyncDestroyable // offloaded fractions (can be thousands) stats registryStats // size statistics for monitoring oldestTotal uint64 // creation time of oldest fraction in all list including remote oldestLocal uint64 // creation time of oldest fraction in local or offloading queues - muAll sync.RWMutex // protects active, all, and oldestTotal fields - active *activeProxy // currently active writable fraction - all []frac.Fraction // all fractions in creation order (read-only view) + muAll sync.RWMutex // protects active, all, and oldestTotal fields + appender *syncAppender // currently active writable fraction + all *fractionsSnapshot // all fractions in creation order (read-only view) } // NewFractionRegistry creates and initializes a new fraction registry instance. @@ -40,49 +40,38 @@ func NewFractionRegistry(active *frac.Active, sealed []*frac.Sealed, remotes []* return nil, errors.New("active fraction must be specified") } - r := fractionRegistry{ - active: &activeProxy{ - proxy: &fractionProxy{impl: active}, - instance: active, - }, - } + r := fractionRegistry{appender: &syncAppender{frac: active}} // initialize local sealed fractions for _, sealed := range sealed { r.stats.sealed.Add(sealed.Info()) - r.sealed = append(r.sealed, &sealedProxy{ - proxy: &fractionProxy{impl: sealed}, - instance: sealed, - }) + r.sealed = append(r.sealed, &sealedSyncDestroyable{sealed: sealed}) } // initialize remote fractions for _, remote := range remotes { r.stats.remotes.Add(remote.Info()) - r.remotes = append(r.remotes, &remoteProxy{ - proxy: &fractionProxy{impl: remote}, - instance: remote, - }) + r.remotes = append(r.remotes, &remoteSyncDestroyable{remote: remote}) } r.updateOldestLocal() - r.rebuildAllFractions() + r.rebuildSnapshot() return &r, nil } -// Active returns the currently active writable fraction. -func (r *fractionRegistry) Active() *activeProxy { +// Appender returns the currently active writable fraction. 
+func (r *fractionRegistry) Appender() *syncAppender { r.muAll.RLock() defer r.muAll.RUnlock() - return r.active + return r.appender } -// AllFractions returns a read-only view of all fractions in creation order. -func (r *fractionRegistry) AllFractions() []frac.Fraction { +// FractionsSnapshot returns a read-only view of all fractions in creation order. +func (r *fractionRegistry) FractionsSnapshot() ([]frac.Fraction, ReleaseSnapshot) { r.muAll.RLock() defer r.muAll.RUnlock() - return r.all + return r.all.Fractions() } // Stats returns current size statistics of the registry. @@ -90,7 +79,7 @@ func (r *fractionRegistry) Stats() registryStats { r.mu.RLock() defer r.mu.RUnlock() - r.stats.active.Set(r.active.instance.Info()) + r.stats.active.Set(r.appender.frac.Info()) return r.stats } @@ -112,23 +101,26 @@ func (r *fractionRegistry) OldestLocal() uint64 { // Moves previous active fraction to sealing queue. // Updates statistics and maintains chronological order. // Should be called when creating a new fraction. -func (r *fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *activeProxy) (*activeProxy, func(), error) { +func (r *fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *frac.Active) (*activeSyncDestroyable, func(), error) { r.mu.Lock() defer r.mu.Unlock() - if r.active.instance.Info().DocsOnDisk <= maxSize { + if r.appender.frac.Info().DocsOnDisk <= maxSize { return nil, nil, nil } - old := r.active - r.sealing = append(r.sealing, old) + old := r.appender + sealing := &activeSyncDestroyable{active: old.frac} + r.sealing = append(r.sealing, sealing) r.addActive(newActive()) + r.rebuildSnapshot() + if err := old.Finalize(); err != nil { - return old, nil, err + return nil, nil, err } - curInfo := old.instance.Info() + curInfo := old.frac.Info() r.stats.sealing.Add(curInfo) wg := sync.WaitGroup{} @@ -139,7 +131,7 @@ func (r *fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *active defer wg.Done() old.WaitWriteIdle() // can be long enough - finalInfo := old.instance.Info() + finalInfo := old.frac.Info() r.mu.Lock() defer r.mu.Unlock() @@ -150,32 +142,21 @@ func (r *fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *active r.stats.sealing.Add(finalInfo) }() - return old, wg.Wait, nil + return sealing, wg.Wait, nil } // addActive sets a new active fraction and updates the complete fractions list. -func (r *fractionRegistry) addActive(a *activeProxy) { +func (r *fractionRegistry) addActive(a *frac.Active) { r.muAll.Lock() defer r.muAll.Unlock() - r.active = a - r.all = append(r.all, a.proxy) -} - -// trimAll removes the oldest fractions from the complete fractions list. -// Used when fractions are evicted or deleted from the system. -func (r *fractionRegistry) trimAll(count int) { - r.muAll.Lock() - defer r.muAll.Unlock() - - r.all = r.all[count:] - r.updateOldestTotal() + r.appender = &syncAppender{frac: a} } // EvictLocal removes oldest local fractions to free disk space. // If shouldOffload is true, moves fractions to offloading queue instead of deleting. // Returns evicted fractions or error if insufficient space is released. 
-func (r *fractionRegistry) EvictLocal(shouldOffload bool, sizeLimit uint64) ([]*sealedProxy, error) { +func (r *fractionRegistry) EvictLocal(shouldOffload bool, sizeLimit uint64) ([]*sealedSyncDestroyable, error) { r.mu.Lock() defer r.mu.Unlock() @@ -185,16 +166,14 @@ func (r *fractionRegistry) EvictLocal(shouldOffload bool, sizeLimit uint64) ([]* ) // calculate total used disk space - totalUsedSize := r.stats.sealed.totalSizeOnDisk + - r.stats.sealing.totalSizeOnDisk + - r.active.instance.Info().FullSize() + totalUsedSize := r.stats.TotalSizeOnDiskLocal() + r.appender.frac.Info().FullSize() // determine how many oldest fractions need to be removed to meet size limit for _, item := range r.sealed { if totalUsedSize-releasingSize <= sizeLimit { break } - info := item.instance.Info() + info := item.sealed.Info() releasingSize += info.FullSize() r.stats.sealed.Sub(info) count++ @@ -215,10 +194,10 @@ func (r *fractionRegistry) EvictLocal(shouldOffload bool, sizeLimit uint64) ([]* if shouldOffload { for _, item := range evicted { r.offloading = append(r.offloading, item) - r.stats.offloading.Add(item.instance.Info()) + r.stats.offloading.Add(item.sealed.Info()) } } else { - r.trimAll(count) // permanently remove + r.rebuildSnapshot() r.updateOldestLocal() // oldest local can be changed here } @@ -228,14 +207,14 @@ func (r *fractionRegistry) EvictLocal(shouldOffload bool, sizeLimit uint64) ([]* // EvictRemote removes oldest remote fractions based on retention policy. // Fractions older than retention period are permanently deleted. // Returns removed fractions or empty slice if nothing to remove. -func (r *fractionRegistry) EvictRemote(retention time.Duration) []*remoteProxy { +func (r *fractionRegistry) EvictRemote(retention time.Duration) []*remoteSyncDestroyable { r.mu.Lock() defer r.mu.Unlock() count := 0 // find fractions older than retention period for _, item := range r.remotes { - info := item.instance.Info() + info := item.remote.Info() if time.Since(time.UnixMilli(int64(info.CreationTime))) <= retention { break // stop at first fraction within retention } @@ -245,14 +224,14 @@ func (r *fractionRegistry) EvictRemote(retention time.Duration) []*remoteProxy { evicted := r.remotes[:count] r.remotes = r.remotes[count:] - r.trimAll(count) // remove from complete list + r.rebuildSnapshot() return evicted } // PromoteToSealed moves fractions from sealing to local queue when sealing completes. // Maintains strict ordering - younger fractions wait for older ones to seal first. 
-func (r *fractionRegistry) PromoteToSealed(active *activeProxy, sealed *frac.Sealed) { +func (r *fractionRegistry) PromoteToSealed(active *activeSyncDestroyable, sealed *frac.Sealed) { r.mu.Lock() defer r.mu.Unlock() @@ -265,22 +244,23 @@ func (r *fractionRegistry) PromoteToSealed(active *activeProxy, sealed *frac.Sea break // maintain order - wait for previous fractions to complete } promotedCount++ - r.sealed = append(r.sealed, &sealedProxy{ - proxy: item.proxy, - instance: item.sealed, - }) + r.sealed = append(r.sealed, &sealedSyncDestroyable{sealed: item.sealed}) r.stats.sealed.Add(item.sealed.Info()) - r.stats.sealing.Sub(item.instance.Info()) + r.stats.sealing.Sub(item.active.Info()) + } + + if promotedCount > 0 { + // remove promoted fractions from sealing queue and rebuild snapshot + r.sealing = r.sealing[promotedCount:] + r.rebuildSnapshot() } - // remove promoted fractions from sealing queue - r.sealing = r.sealing[promotedCount:] } // PromoteToRemote moves fractions from offloading to remote queue when offloading completes. // Special case: handles fractions that don't require offloading (remote == nil). // Maintains strict ordering - younger fractions wait for older ones to offload. -func (r *fractionRegistry) PromoteToRemote(sealed *sealedProxy, remote *frac.Remote) { +func (r *fractionRegistry) PromoteToRemote(sealed *sealedSyncDestroyable, remote *frac.Remote) { r.mu.Lock() defer r.mu.Unlock() @@ -298,62 +278,42 @@ func (r *fractionRegistry) PromoteToRemote(sealed *sealedProxy, remote *frac.Rem break // maintain order - wait for previous fractions to complete } promotedCount++ - r.remotes = append(r.remotes, &remoteProxy{ - proxy: item.proxy, - instance: item.remote, - }) + r.remotes = append(r.remotes, &remoteSyncDestroyable{remote: item.remote}) r.stats.remotes.Add(item.remote.Info()) - r.stats.offloading.Sub(item.instance.Info()) + r.stats.offloading.Sub(item.sealed.Info()) } if promotedCount > 0 { // remove promoted fractions from offloading queue r.offloading = r.offloading[promotedCount:] r.updateOldestLocal() + r.rebuildSnapshot() } } // removeFromOffloading removes a specific fraction from offloading queue. // O(n) operation that rebuilds the all fractions list. -func (r *fractionRegistry) removeFromOffloading(sealed *sealedProxy) { +func (r *fractionRegistry) removeFromOffloading(sealed *sealedSyncDestroyable) { count := 0 // filter out the target fraction for _, item := range r.offloading { - if sealed != item { + if sealed.sealed != item.sealed { r.offloading[count] = item count++ } } r.offloading = r.offloading[:count] - r.stats.offloading.Sub(sealed.instance.Info()) + r.stats.offloading.Sub(sealed.sealed.Info()) - // oldest local can be changed here r.updateOldestLocal() - - // rebuild complete list since we modified the middle of the queue - r.rebuildAllFractions() + r.rebuildSnapshot() } -// rebuildAllFractions reconstructs the all fractions list in correct chronological order. +// rebuildSnapshot reconstructs the all fractions list in correct chronological order. // Order: remote (oldest) → offloading → sealed → sealing → active (newest) // Expensive O(n) operation used when direct list modification is insufficient. 
-func (r *fractionRegistry) rebuildAllFractions() { - all := make([]frac.Fraction, 0, len(r.all)) - - // collect fractions in correct chronological order: from oldest (remote) to newest (active) - for _, remote := range r.remotes { - all = append(all, remote.proxy) - } - for _, offloaded := range r.offloading { - all = append(all, offloaded.proxy) - } - for _, sealed := range r.sealed { - all = append(all, sealed.proxy) - } - for _, active := range r.sealing { - all = append(all, active.proxy) - } - all = append(all, r.active.proxy) +func (r *fractionRegistry) rebuildSnapshot() { + all := buildFractionsSnapshot(r.remotes, r.offloading, r.sealed, r.sealing, r.appender.frac) r.muAll.Lock() defer r.muAll.Unlock() @@ -365,7 +325,7 @@ func (r *fractionRegistry) rebuildAllFractions() { // updateOldestTotal recalculates the creation time of the oldest fraction. // Called after modifications of the complete fractions list. func (r *fractionRegistry) updateOldestTotal() { - r.oldestTotal = r.all[0].Info().CreationTime + r.oldestTotal = r.all.fractions[0].Info().CreationTime } // updateOldestLocal recalculates the creation time of the oldest local fraction. @@ -373,12 +333,12 @@ func (r *fractionRegistry) updateOldestTotal() { // Called after modifications func (r *fractionRegistry) updateOldestLocal() { if len(r.offloading) > 0 { - r.oldestLocal = r.offloading[0].proxy.Info().CreationTime + r.oldestLocal = r.offloading[0].sealed.Info().CreationTime } else if len(r.sealed) > 0 { - r.oldestLocal = r.sealed[0].proxy.Info().CreationTime + r.oldestLocal = r.sealed[0].sealed.Info().CreationTime } else if len(r.sealing) > 0 { - r.oldestLocal = r.sealing[0].proxy.Info().CreationTime + r.oldestLocal = r.sealing[0].active.Info().CreationTime } else { - r.oldestLocal = r.active.proxy.Info().CreationTime + r.oldestLocal = r.appender.frac.Info().CreationTime } } diff --git a/fracmanager/fractions_snapshot.go b/fracmanager/fractions_snapshot.go new file mode 100644 index 00000000..be38fd12 --- /dev/null +++ b/fracmanager/fractions_snapshot.go @@ -0,0 +1,93 @@ +package fracmanager + +import ( + "errors" + "sync" + + "github.com/ozontech/seq-db/frac" +) + +var ( + ErrFractionNotWritable = errors.New("fraction is not writable") +) + +type ReleaseSnapshot func() + +type fractionsSnapshot struct { + wg sync.WaitGroup + fractions []frac.Fraction +} + +func (fs *fractionsSnapshot) Fractions() ([]frac.Fraction, ReleaseSnapshot) { + fs.wg.Add(1) + return fs.fractions, fs.wg.Done +} + +func buildFractionsSnapshot( + remotes []*remoteSyncDestroyable, + offloading, locals []*sealedSyncDestroyable, + sealing []*activeSyncDestroyable, + active *frac.Active, +) *fractionsSnapshot { + capacity := 1 + + len(remotes) + + len(offloading) + + len(locals) + + len(sealing) + + fs := fractionsSnapshot{fractions: make([]frac.Fraction, 0, capacity)} + + // Collect fractions in correct chronological order: from oldest (remote) to newest (active) + for _, r := range remotes { + r.wg = &fs.wg + fs.fractions = append(fs.fractions, r.remote) + } + for _, o := range offloading { + o.wg = &fs.wg + fs.fractions = append(fs.fractions, o.sealed) + } + for _, l := range locals { + l.wg = &fs.wg + fs.fractions = append(fs.fractions, l.sealed) + } + for _, s := range sealing { + s.wg = &fs.wg + fs.fractions = append(fs.fractions, s.active) + } + + fs.fractions = append(fs.fractions, active) + + return &fs +} + +type activeSyncDestroyable struct { + wg *sync.WaitGroup + active *frac.Active + sealed *frac.Sealed +} + +func (d activeSyncDestroyable) 
Destroy() { + d.wg.Wait() + d.active.Release() +} + +type sealedSyncDestroyable struct { + wg *sync.WaitGroup + sealed *frac.Sealed + remote *frac.Remote +} + +func (d sealedSyncDestroyable) Destroy() { + d.wg.Wait() + d.sealed.Suicide() +} + +type remoteSyncDestroyable struct { + wg *sync.WaitGroup + remote *frac.Remote +} + +func (d remoteSyncDestroyable) Destroy() { + d.wg.Wait() + d.remote.Suicide() +} diff --git a/fracmanager/lifecycle_manager.go b/fracmanager/lifecycle_manager.go index 63934868..0c73a5a5 100644 --- a/fracmanager/lifecycle_manager.go +++ b/fracmanager/lifecycle_manager.go @@ -2,7 +2,6 @@ package fracmanager import ( "context" - "path/filepath" "sync" "time" @@ -10,7 +9,6 @@ import ( "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/util" ) // lifecycleManager manages the complete lifecycle of fractions. @@ -60,37 +58,10 @@ func (lc *lifecycleManager) SyncInfoCache() { } } -// seal converts an active fraction to sealed state. -// It freezes writes, waits for pending operations, then seals the fraction. -func (lc *lifecycleManager) seal(active *activeProxy) error { - sealsTotal.Inc() - now := time.Now() - sealed, err := lc.provider.Seal(active.instance) - if err != nil { - return err - } - sealingTime := time.Since(now) - sealsDoneSeconds.Observe(sealingTime.Seconds()) - - logger.Info( - "fraction sealed", - zap.String("fraction", filepath.Base(sealed.BaseFileName)), - zap.Float64("time_spent_s", util.DurationToUnit(sealingTime, "s")), - ) - - lc.infoCache.Add(sealed.Info()) - lc.registry.PromoteToSealed(active, sealed) - active.proxy.Redirect(sealed) - active.instance.Release() - return nil -} - // rotate checks if active fraction needs rotation based on size limit. // Creates new active fraction and starts sealing the previous one. 
func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { - activeToSeal, waitBeforeSealing, err := lc.registry.RotateIfFull(maxSize, func() *activeProxy { - return newActiveProxy(lc.provider.CreateActive()) - }) + activeToSeal, waitBeforeSealing, err := lc.registry.RotateIfFull(maxSize, lc.provider.CreateActive) if err != nil { logger.Fatal("active fraction rotation error", zap.Error(err)) } @@ -105,9 +76,14 @@ func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { defer lc.sealingWg.Done() waitBeforeSealing() - if err := lc.seal(activeToSeal); err != nil { + sealed, err := lc.provider.Seal(activeToSeal.active) + if err != nil { logger.Fatal("sealing error", zap.Error(err)) } + + lc.infoCache.Add(sealed.Info()) + lc.registry.PromoteToSealed(activeToSeal, sealed) + activeToSeal.Destroy() }() } @@ -118,23 +94,20 @@ func (lc *lifecycleManager) offloadLocal(ctx context.Context, sizeLimit uint64, if err != nil { logger.Fatal("error releasing old fractions:", zap.Error(err)) } + wg.Add(len(toOffload)) for _, sealed := range toOffload { - wg.Add(1) go func() { defer wg.Done() - remote, _ := lc.tryOffload(ctx, sealed.instance) + remote, _ := lc.tryOffload(ctx, sealed.sealed) lc.registry.PromoteToRemote(sealed, remote) if remote == nil { - sealed.proxy.Redirect(emptyFraction{}) - lc.infoCache.Remove(sealed.instance.Info().Name()) - } else { - sealed.proxy.Redirect(remote) + lc.infoCache.Remove(sealed.sealed.Info().Name()) } // free up local resources - sealed.instance.Suicide() + sealed.Destroy() maintenanceTruncateTotal.Add(1) }() } @@ -167,15 +140,14 @@ func (lc *lifecycleManager) cleanRemote(retention time.Duration, wg *sync.WaitGr return } toDelete := lc.registry.EvictRemote(retention) - wg.Add(1) - go func() { - defer wg.Done() - for _, remote := range toDelete { - remote.proxy.Redirect(emptyFraction{}) - lc.infoCache.Remove(remote.instance.Info().Name()) - remote.instance.Suicide() - } - }() + wg.Add(len(toDelete)) + for _, remote := range toDelete { + go func() { + defer wg.Done() + lc.infoCache.Remove(remote.remote.Info().Name()) + remote.Destroy() + }() + } } // cleanLocal deletes outdated local fractions when offloading is disabled. @@ -190,16 +162,15 @@ func (lc *lifecycleManager) cleanLocal(sizeLimit uint64, wg *sync.WaitGroup) { } } - wg.Add(1) - go func() { - defer wg.Done() - for _, sealed := range toDelete { - sealed.proxy.Redirect(emptyFraction{}) - lc.infoCache.Remove(sealed.instance.Info().Name()) - sealed.instance.Suicide() + wg.Add(len(toDelete)) + for _, sealed := range toDelete { + go func() { + defer wg.Done() + lc.infoCache.Remove(sealed.sealed.Info().Name()) + sealed.Destroy() maintenanceTruncateTotal.Add(1) - } - }() + }() + } } // updateOldestMetric updates the prometheus metric with oldest fraction timestamp. 
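Reviewer note: none of the lifecycle paths above free a fraction directly any more; they call Destroy(), which blocks until every snapshot that still references the fraction has been released. Below is a minimal, self-contained sketch of this wait-group reference counting for the single-snapshot case (snapshot and destroyable are illustrative names, not the seq-db types):

package main

import (
	"fmt"
	"sync"
)

// snapshot is an immutable view of a resource set. Acquire pins it;
// the returned func releases the pin.
type snapshot struct {
	wg    sync.WaitGroup
	items []string
}

func (s *snapshot) Acquire() ([]string, func()) {
	s.wg.Add(1)
	return s.items, s.wg.Done
}

// destroyable ties a resource to the wait group of the snapshot it was
// published in, so destruction waits for all readers of that snapshot.
type destroyable struct {
	wg   *sync.WaitGroup
	name string
}

func (d *destroyable) Destroy() {
	d.wg.Wait() // blocks while any reader still holds the snapshot
	fmt.Println("destroyed:", d.name)
}

func main() {
	s := &snapshot{items: []string{"frac-01", "frac-02"}}
	d := &destroyable{wg: &s.wg, name: "frac-01"}

	items, release := s.Acquire()

	done := make(chan struct{})
	go func() { // eviction runs concurrently with the reader
		d.Destroy()
		close(done)
	}()

	fmt.Println("searching over", items) // safe: Destroy is still blocked
	release()                            // last pin dropped; Destroy proceeds
	<-done
}

TestPendingDestroy in the test diff below asserts exactly this ordering against the real registry: cleanup of evicted fractions completes only after the last FractionsSnapshot reader calls release.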
diff --git a/fracmanager/lifecycle_manager_test.go b/fracmanager/lifecycle_manager_test.go index b7afb715..fb8c1f2d 100644 --- a/fracmanager/lifecycle_manager_test.go +++ b/fracmanager/lifecycle_manager_test.go @@ -1,14 +1,20 @@ package fracmanager import ( + "math" "math/rand" "path/filepath" "sync" "testing" + "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/parser" + "github.com/ozontech/seq-db/seq" ) func setupLifecycle(t testing.TB, cfg *Config) (*lifecycleManager, func()) { @@ -31,21 +37,18 @@ func TestFracInfoCache(t *testing.T) { lc, tearDown := setupLifecycle(t, nil) defer tearDown() - var total uint64 - fillRotateAndCheck := func(names map[string]struct{}) { - active := lc.registry.Active() - appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + active := lc.registry.Appender() + appendDocsToActive(t, active.frac, 10+rand.Intn(10)) wg := sync.WaitGroup{} lc.rotate(0, &wg) wg.Wait() - info := active.proxy.Info() + info := active.frac.Info() _, ok := lc.infoCache.Get(info.Name()) assert.True(t, ok) - total += info.FullSize() names[info.Name()] = struct{}{} } @@ -53,12 +56,13 @@ func TestFracInfoCache(t *testing.T) { for range 10 { fillRotateAndCheck(first) } - halfSize := total + halfSize := lc.registry.Stats().TotalSizeOnDiskLocal() second := map[string]struct{}{} for range 10 { fillRotateAndCheck(second) } + total := lc.registry.Stats().TotalSizeOnDiskLocal() wg := sync.WaitGroup{} lc.cleanLocal(total-halfSize, &wg) @@ -80,18 +84,14 @@ func TestCapacityExceeded(t *testing.T) { defer tearDown() const fracsCount = 10 - var total uint64 fillAndRotate := func() { - active := lc.registry.Active() - appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + active := lc.registry.Appender() + appendDocsToActive(t, active.frac, 10+rand.Intn(10)) wg := sync.WaitGroup{} lc.rotate(0, &wg) wg.Wait() - - info := active.proxy.Info() - total += info.FullSize() } assert.False(t, lc.flags.IsCapacityExceeded(), "expect data dir is empty") @@ -102,6 +102,8 @@ func TestCapacityExceeded(t *testing.T) { } assert.False(t, lc.flags.IsCapacityExceeded(), "there should be no deletions and the flag is false") + total := lc.registry.Stats().TotalSizeOnDiskLocal() + wg := sync.WaitGroup{} lc.cleanLocal(total, &wg) wg.Wait() @@ -121,20 +123,15 @@ func TestOldestMetrics(t *testing.T) { defer tearDown() const fracsCount = 10 - var total uint64 - fillAndRotate := func() { - active := lc.registry.Active() - appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + active := lc.registry.Appender() + appendDocsToActive(t, active.frac, 10+rand.Intn(10)) wg := sync.WaitGroup{} lc.rotate(0, &wg) wg.Wait() - - info := active.proxy.Info() - total += info.FullSize() } - firstFracTime := lc.registry.Active().proxy.Info().CreationTime + firstFracTime := lc.registry.Appender().frac.Info().CreationTime for range fracsCount { fillAndRotate() } @@ -143,12 +140,15 @@ func TestOldestMetrics(t *testing.T) { assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should point to the very first fraction when all data is local") assert.Equal(t, firstFracTime, lc.registry.OldestLocal(), "should point to the first fraction when nothing is offloaded") - halfSize := total - halfwayFracTime := lc.registry.Active().proxy.Info().CreationTime + halfSize := lc.registry.Stats().TotalSizeOnDiskLocal() + + halfwayFracTime := lc.registry.Appender().frac.Info().CreationTime for range 
fracsCount { fillAndRotate() } + total := lc.registry.Stats().TotalSizeOnDiskLocal() + wg := sync.WaitGroup{} lc.offloadLocal(t.Context(), total-halfSize, &wg) wg.Wait() @@ -158,3 +158,80 @@ func TestOldestMetrics(t *testing.T) { assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should still reference the first fraction after offload") assert.Equal(t, halfwayFracTime, lc.registry.OldestLocal(), "should point to the oldest remaining local fraction after offload") } + +func TestPendingDestroy(t *testing.T) { + lc, tearDown := setupLifecycle(t, nil) + defer tearDown() + + const ( + fracsCount = 10 + docsPerFrac = 10 + ) + // appending docs to `fracsCount` fractions where the last is active and the rest are sealed + wg := sync.WaitGroup{} + for range fracsCount - 1 { + appendDocsToActive(t, lc.registry.Appender().frac, docsPerFrac) + lc.rotate(0, &wg) + } + appendDocsToActive(t, lc.registry.Appender().frac, docsPerFrac) + + // wait sealing complete + wg.Wait() + + // take all fracs to search + fractions1, release1 := lc.registry.FractionsSnapshot() + + // delete all sealing fracs + lc.cleanLocal(lc.registry.Appender().frac.Info().FullSize(), &wg) + + var ( + beforeRelease time.Time + afterCleanup time.Time + ) + + cleanup := sync.WaitGroup{} + cleanup.Add(1) + go func() { + // cleanup is pending, so run it in a goroutine + // waiting for cleanup to finish + defer cleanup.Done() + wg.Wait() + afterCleanup = time.Now() + }() + + queryAst, err := parser.ParseSeqQL("*", seq.Mapping{}) + require.NoError(t, err, "failed to parse query") + params := processor.SearchParams{ + AST: queryAst.Root, + From: seq.MID(0), + To: seq.MID(math.MaxUint64), + Limit: math.MaxInt32, + } + + for _, f := range fractions1 { + qpr, err := f.Search(t.Context(), params) + assert.NoError(t, err, "failed to search") + assert.Equal(t, docsPerFrac, len(qpr.IDs)) + } + + beforeRelease = time.Now() + release1() + + cleanup.Wait() + assert.Less(t, beforeRelease, afterCleanup, "we expect cleanup to happen after release") + + fractions2, release2 := lc.registry.FractionsSnapshot() + + assert.Len(t, fractions2, 1, "only one active fraction should remain") + singleName := fractions2[0].Info().Name() + + for _, f := range fractions1 { + if f.Info().Name() == singleName { + continue + } + assert.Panics(t, func() { + _, _ = f.Search(t.Context(), params) + }, "searching by destroyed faction is expected to trigger a panic") + } + release2() +} diff --git a/fracmanager/proxy_frac.go b/fracmanager/proxy_frac.go deleted file mode 100644 index 949e2412..00000000 --- a/fracmanager/proxy_frac.go +++ /dev/null @@ -1,174 +0,0 @@ -package fracmanager - -import ( - "context" - "errors" - "math" - "sync" - "time" - - "go.uber.org/zap" - - "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" - "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/metric" - "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/util" -) - -var ( - _ frac.Fraction = (*fractionProxy)(nil) - _ frac.Fraction = (*emptyFraction)(nil) - - ErrFractionNotWritable = errors.New("fraction is not writable") -) - -// fractionProxy provides thread-safe access to a fraction with atomic replacement -// Used to switch fraction implementations (active → sealed → remote) without blocking readers. -// Lifecycle: Created for each fraction, persists through state transitions. 
-type fractionProxy struct { - mu sync.RWMutex - impl frac.Fraction // Current fraction implementation -} - -func (p *fractionProxy) Redirect(f frac.Fraction) { - p.mu.Lock() - defer p.mu.Unlock() - p.impl = f -} - -func (p *fractionProxy) Info() *common.Info { - p.mu.RLock() - defer p.mu.RUnlock() - return p.impl.Info() -} - -func (p *fractionProxy) IsIntersecting(from, to seq.MID) bool { - p.mu.RLock() - defer p.mu.RUnlock() - return p.impl.IsIntersecting(from, to) -} - -func (p *fractionProxy) Contains(mid seq.MID) bool { - p.mu.RLock() - defer p.mu.RUnlock() - return p.impl.Contains(mid) -} - -func (p *fractionProxy) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - p.mu.RLock() - defer p.mu.RUnlock() - return p.impl.Fetch(ctx, ids) -} - -func (p *fractionProxy) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - p.mu.RLock() - defer p.mu.RUnlock() - return p.impl.Search(ctx, params) -} - -// activeProxy manages an active (writable) fraction -// Tracks pending write operations and provides freeze capability. -// Lifecycle: Created when fraction becomes active, destroyed after sealing. -type activeProxy struct { - proxy *fractionProxy // Thread-safe fraction access - instance *frac.Active // Actual active fraction instance - sealed *frac.Sealed // Sealed version (set after sealing) - - mu sync.RWMutex // Protects readonly state - wg sync.WaitGroup // Tracks pending write operations - - finalized bool // Whether fraction is frozen for writes -} - -func newActiveProxy(active *frac.Active) *activeProxy { - return &activeProxy{ - proxy: &fractionProxy{impl: active}, - instance: active, - } -} - -// Append adds documents to the active fraction -func (p *activeProxy) Append(docs, meta []byte) error { - p.mu.RLock() - if p.finalized { - p.mu.RUnlock() - return ErrFractionNotWritable - } - p.wg.Add(1) // Important: wg.Add() inside lock to prevent race with WaitWriteIdle() - p.mu.RUnlock() - - return p.instance.Append(docs, meta, &p.wg) -} - -// WaitWriteIdle waits for all pending write operations to complete -// Used before sealing to ensure data consistency. -func (p *activeProxy) WaitWriteIdle() { - start := time.Now() - logger.Info("waiting fraction to stop write...", zap.String("name", p.instance.BaseFileName)) - p.wg.Wait() - waitTime := util.DurationToUnit(time.Since(start), "s") - logger.Info("write is stopped", - zap.String("name", p.instance.BaseFileName), - zap.Float64("time_wait_s", waitTime)) -} - -// Finalize marks the fraction as read-only and prevents new writes from starting after finalize. -func (p *activeProxy) Finalize() error { - p.mu.Lock() - defer p.mu.Unlock() - - if p.finalized { - return errors.New("fraction is already finalized") - } - p.finalized = true - - return nil -} - -// sealedProxy represents a sealed fraction that may be offloaded -// Tracks both local sealed instance and remote version if offloaded. -type sealedProxy struct { - proxy *fractionProxy // Thread-safe fraction access - instance *frac.Sealed // Local sealed fraction - remote *frac.Remote // Remote version (if offloaded) -} - -// remoteProxy represents an offloaded fraction -type remoteProxy struct { - proxy *fractionProxy // Thread-safe fraction access - instance *frac.Remote // Remote fraction instance -} - -// emptyFraction represents a missing or deleted fraction -// Returns empty results for all operations. -// Used as placeholder when fraction is removed but references still exist. 
-type emptyFraction struct { -} - -func (emptyFraction) Info() *common.Info { - return &common.Info{ - Path: "empty", - From: math.MaxUint64, - To: 0, - } -} - -func (emptyFraction) IsIntersecting(_, _ seq.MID) bool { - return false -} - -func (emptyFraction) Contains(mid seq.MID) bool { - return false -} - -func (emptyFraction) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - return nil, nil -} - -func (emptyFraction) Search(_ context.Context, params processor.SearchParams) (*seq.QPR, error) { - metric.CountersTotal.WithLabelValues("empty_data_provider").Inc() - return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil -} diff --git a/fracmanager/sync_appender.go b/fracmanager/sync_appender.go new file mode 100644 index 00000000..d4e5c613 --- /dev/null +++ b/fracmanager/sync_appender.go @@ -0,0 +1,59 @@ +package fracmanager + +import ( + "errors" + "sync" + "time" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/util" + "go.uber.org/zap" +) + +type syncAppender struct { + frac *frac.Active // Actual active fraction instance + + mu sync.RWMutex // Protects readonly state + wg sync.WaitGroup // Tracks pending write operations + + finalized bool // Whether fraction is frozen for writes +} + +// Append adds documents to the active fraction +func (a *syncAppender) Append(docs, meta []byte) error { + a.mu.RLock() + if a.finalized { + a.mu.RUnlock() + return ErrFractionNotWritable + } + a.wg.Add(1) // Important: wg.Add() inside lock to prevent race with WaitWriteIdle() + a.mu.RUnlock() + + return a.frac.Append(docs, meta, &a.wg) +} + +// WaitWriteIdle waits for all pending write operations to complete +// Used before sealing to ensure data consistency. +func (a *syncAppender) WaitWriteIdle() { + start := time.Now() + logger.Info("waiting fraction to stop write...", zap.String("name", a.frac.BaseFileName)) + a.wg.Wait() + waitTime := util.DurationToUnit(time.Since(start), "s") + logger.Info("write is stopped", + zap.String("name", a.frac.BaseFileName), + zap.Float64("time_wait_s", waitTime)) +} + +// Finalize marks the fraction as read-only and prevents new writes from starting after finalize. 
+func (a *syncAppender) Finalize() error { + a.mu.Lock() + if a.finalized { + a.mu.Unlock() + return errors.New("fraction is already finalized") + } + a.finalized = true + a.mu.Unlock() + + return nil +} diff --git a/storeapi/grpc_async_search.go b/storeapi/grpc_async_search.go index 518ed19f..d44df1b2 100644 --- a/storeapi/grpc_async_search.go +++ b/storeapi/grpc_async_search.go @@ -46,8 +46,12 @@ func (g *GrpcV1) StartAsyncSearch( Retention: r.Retention.AsDuration(), WithDocs: r.WithDocs, } - fracs := g.fracManager.Fractions().FilterInRange(seq.MillisToMID(uint64(r.From)), seq.MillisToMID(uint64(r.To))) - if err := g.asyncSearcher.StartSearch(req, fracs); err != nil { + + fracs, release := g.fracManager.FractionsSnapshot() + defer release() + + filtered := fracs.FilterInRange(seq.MillisToMID(uint64(r.From)), seq.MillisToMID(uint64(r.To))) + if err := g.asyncSearcher.StartSearch(req, filtered); err != nil { return nil, err } diff --git a/storeapi/grpc_fetch.go b/storeapi/grpc_fetch.go index d640618c..7ddaa5ab 100644 --- a/storeapi/grpc_fetch.go +++ b/storeapi/grpc_fetch.go @@ -68,7 +68,10 @@ func (g *GrpcV1) doFetch(ctx context.Context, req *storeapi.FetchRequest, stream dp := acquireDocFieldsFilter(req.FieldsFilter) defer releaseDocFieldsFilter(dp) - docsStream := newDocsStream(ctx, ids, g.fetchData.docFetcher, g.fracManager.Fractions()) + fractions, release := g.fracManager.FractionsSnapshot() + defer release() + + docsStream := newDocsStream(ctx, ids, g.fetchData.docFetcher, fractions) for _, id := range ids { workTime := time.Now() doc, err := docsStream.Next() diff --git a/storeapi/grpc_search.go b/storeapi/grpc_search.go index 93f0ba64..38b90e2d 100644 --- a/storeapi/grpc_search.go +++ b/storeapi/grpc_search.go @@ -164,13 +164,16 @@ func (g *GrpcV1) doSearch( } searchTr := tr.NewChild("search iteratively") + fractions, release := g.fracManager.FractionsSnapshot() qpr, err := g.searchData.searcher.SearchDocs( ctx, - g.fracManager.Fractions(), + fractions, searchParams, tr, ) + release() searchTr.Done() + if err != nil { if code, ok := parseStoreError(err); ok { return &storeapi.SearchResponse{Code: code}, nil diff --git a/storeapi/grpc_v1.go b/storeapi/grpc_v1.go index c7ec6705..d4007ac1 100644 --- a/storeapi/grpc_v1.go +++ b/storeapi/grpc_v1.go @@ -98,6 +98,9 @@ type GrpcV1 struct { } func NewGrpcV1(cfg APIConfig, fracManager *fracmanager.FracManager, mappingProvider MappingProvider) *GrpcV1 { + fractions, release := fracManager.FractionsSnapshot() + defer release() + g := &GrpcV1{ config: cfg, fracManager: fracManager, @@ -113,7 +116,7 @@ func NewGrpcV1(cfg APIConfig, fracManager *fracmanager.FracManager, mappingProvi }, asyncSearcher: asyncsearcher.MustStartAsync( cfg.Search.Async, mappingProvider, - fracManager.Fractions(), + fractions, ), }
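The storeapi call sites above all follow the same contract: a snapshot must be released exactly once, on every return path, otherwise Destroy() of an evicted fraction blocks forever. A sketch of that caller-side discipline, assuming the FractionsSnapshot API introduced in this patch (searchAll and the zero-valued params are illustrative only):

package example

import (
	"context"

	"github.com/ozontech/seq-db/frac/processor"
	"github.com/ozontech/seq-db/fracmanager"
)

// searchAll pins the current fraction set for the duration of the scan.
func searchAll(ctx context.Context, fm *fracmanager.FracManager, params processor.SearchParams) error {
	fracs, release := fm.FractionsSnapshot()
	defer release() // runs on every path, including early error returns

	for _, f := range fracs {
		if _, err := f.Search(ctx, params); err != nil {
			return err
		}
	}
	return nil
}

Note the deliberate exception in grpc_search.go: release() is called explicitly right after SearchDocs rather than deferred, so the snapshot is not pinned while the response is still being built.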