From 245b8755166c21405ec8dd75202f45ad9029f97c Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Thu, 6 Nov 2025 22:20:38 +0300 Subject: [PATCH 01/28] refactor(fracmanager): using fifo queues of fractions --- cmd/seq-db/seq-db.go | 1 + frac/active.go | 97 +--- frac/active_indexer.go | 21 +- frac/active_indexer_test.go | 5 +- frac/empty.go | 50 -- frac/fraction.go | 3 - frac/fraction_test.go | 59 ++- frac/remote.go | 98 ++-- frac/sealed.go | 109 +---- fracmanager/config.go | 1 + fracmanager/frac_info_cache_test.go | 200 -------- fracmanager/fracmanager.go | 637 ++++++-------------------- fracmanager/fracmanager_for_tests.go | 20 +- fracmanager/fracmanager_test.go | 215 ++------- fracmanager/fracs_stats.go | 85 ++++ fracmanager/fraction_provider.go | 2 +- fracmanager/fraction_provider_test.go | 33 +- fracmanager/fraction_registry.go | 385 ++++++++++++++++ fracmanager/lifecycle_manager.go | 209 +++++++++ fracmanager/lifecycle_manager_test.go | 160 +++++++ fracmanager/loader.go | 8 +- fracmanager/loader_test.go | 98 +++- fracmanager/proxy_frac.go | 284 +++++------- fracmanager/sealer_test.go | 10 +- fracmanager/searcher_test.go | 1 - indexer/test_doc_provider.go | 41 +- storeapi/grpc_v1_test.go | 7 +- storeapi/store.go | 18 +- 28 files changed, 1377 insertions(+), 1480 deletions(-) delete mode 100644 frac/empty.go create mode 100644 fracmanager/fracs_stats.go create mode 100644 fracmanager/fraction_registry.go create mode 100644 fracmanager/lifecycle_manager.go create mode 100644 fracmanager/lifecycle_manager_test.go diff --git a/cmd/seq-db/seq-db.go b/cmd/seq-db/seq-db.go index 2be1c1f9..97720478 100644 --- a/cmd/seq-db/seq-db.go +++ b/cmd/seq-db/seq-db.go @@ -259,6 +259,7 @@ func startStore( MaintenanceDelay: 0, CacheGCDelay: 0, CacheCleanupDelay: 0, + MinSealFracSize: uint64(cfg.Storage.TotalSize) * consts.DefaultMinSealPercent / 100, SealParams: common.SealParams{ IDsZstdLevel: cfg.Compression.SealedZstdCompressionLevel, LIDsZstdLevel: 
cfg.Compression.SealedZstdCompressionLevel, diff --git a/frac/active.go b/frac/active.go index 82810773..6485c9c4 100644 --- a/frac/active.go +++ b/frac/active.go @@ -35,10 +35,6 @@ type Active struct { BaseFileName string - useMu sync.RWMutex - suicided bool - released bool - infoMu sync.RWMutex info *common.Info @@ -269,40 +265,18 @@ func (f *Active) String() string { } func (f *Active) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Fetch(ctx, ids) + if f.Info().DocsTotal == 0 { // it is empty active fraction state + return nil, nil } - return dp.Fetch(ids) + return f.createDataProvider(ctx).Fetch(ids) } func (f *Active) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Search(ctx, params) - } - return dp.Search(params) -} - -func (f *Active) DataProvider(ctx context.Context) (*activeDataProvider, func()) { - f.useMu.RLock() - - if f.suicided || f.released || f.Info().DocsTotal == 0 { // it is empty active fraction state - if f.suicided { - metric.CountersTotal.WithLabelValues("fraction_suicided").Inc() - } - f.useMu.RUnlock() - return nil, func() {} - } - - // it is ordinary active fraction state - dp := f.createDataProvider(ctx) - return dp, func() { - dp.release() - f.useMu.RUnlock() + if f.Info().DocsTotal == 0 { // it is empty active fraction state + metric.CountersTotal.WithLabelValues("empty_data_provider").Inc() + return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil } + return f.createDataProvider(ctx).Search(params) } func (f *Active) createDataProvider(ctx context.Context) *activeDataProvider { @@ -338,49 +312,24 @@ func (f *Active) IsIntersecting(from, to seq.MID) bool { } func (f *Active) Release() { - f.useMu.Lock() - f.released = true - f.useMu.Unlock() - f.releaseMem() if !f.Config.KeepMetaFile 
{ - f.removeMetaFile() + util.RemoveFile(f.metaFile.Name()) } if !f.Config.SkipSortDocs { // we use sorted docs in sealed fraction so we can remove original docs of active fraction - f.removeDocsFiles() + util.RemoveFile(f.docsFile.Name()) } } -// Offload for [Active] fraction is no-op. -// -// Since search within [Active] fraction is too costly (we have to replay the whole index in memory), -// we decided to support offloading only for [Sealed] fractions. -func (f *Active) Offload(context.Context, storage.Uploader) (bool, error) { - return false, nil -} - func (f *Active) Suicide() { - f.useMu.Lock() - released := f.released - f.suicided = true - f.released = true - f.useMu.Unlock() - - if released { // fraction can be suicided after release - if f.Config.KeepMetaFile { - f.removeMetaFile() // meta was not removed while release - } - if f.Config.SkipSortDocs { - f.removeDocsFiles() // docs was not removed while release - } - } else { // was not release - f.releaseMem() - f.removeMetaFile() - f.removeDocsFiles() - } + f.releaseMem() + + util.RemoveFile(f.metaFile.Name()) + util.RemoveFile(f.docsFile.Name()) + util.RemoveFile(f.BaseFileName + consts.SdocsFileSuffix) } func (f *Active) releaseMem() { @@ -393,24 +342,12 @@ func (f *Active) releaseMem() { if err := f.metaFile.Close(); err != nil { logger.Error("can't close meta file", zap.String("frac", f.BaseFileName), zap.Error(err)) } + if err := f.docsFile.Close(); err != nil { + logger.Error("can't close docs file", zap.String("frac", f.BaseFileName), zap.Error(err)) + } f.RIDs = nil f.MIDs = nil f.TokenList = nil f.DocsPositions = nil } - -func (f *Active) removeDocsFiles() { - if err := f.docsFile.Close(); err != nil { - logger.Error("can't close docs file", zap.String("frac", f.BaseFileName), zap.Error(err)) - } - if err := os.Remove(f.docsFile.Name()); err != nil { - logger.Error("can't delete docs file", zap.String("frac", f.BaseFileName), zap.Error(err)) - } -} - -func (f *Active) removeMetaFile() { - if err 
:= os.Remove(f.metaFile.Name()); err != nil { - logger.Error("can't delete metas file", zap.String("frac", f.BaseFileName), zap.Error(err)) - } -} diff --git a/frac/active_indexer.go b/frac/active_indexer.go index 0422c105..f1d31a6f 100644 --- a/frac/active_indexer.go +++ b/frac/active_indexer.go @@ -18,8 +18,6 @@ type ActiveIndexer struct { ch chan *indexTask chMerge chan *mergeTask workerCount int - - stopFn func() } type indexTask struct { @@ -34,12 +32,14 @@ type mergeTask struct { tokenLIDs *TokenLIDs } -func NewActiveIndexer(workerCount, chLen int) *ActiveIndexer { - return &ActiveIndexer{ +func NewActiveIndexer(workerCount, chLen int) (*ActiveIndexer, func()) { + idx := ActiveIndexer{ ch: make(chan *indexTask, chLen), chMerge: make(chan *mergeTask, chLen), workerCount: workerCount, } + stopIdx := idx.start() + return &idx, stopIdx } func (ai *ActiveIndexer) Index(frac *Active, metas []byte, wg *sync.WaitGroup, sw *stopwatch.Stopwatch) { @@ -53,7 +53,7 @@ func (ai *ActiveIndexer) Index(frac *Active, metas []byte, wg *sync.WaitGroup, s m.Stop() } -func (ai *ActiveIndexer) Start() { +func (ai *ActiveIndexer) start() func() { wg := sync.WaitGroup{} wg.Add(ai.workerCount) @@ -72,13 +72,10 @@ func (ai *ActiveIndexer) Start() { }() } - ai.stopFn = func() { + return func() { close(ai.ch) close(ai.chMerge) - wg.Wait() - - ai.stopFn = nil } } @@ -88,12 +85,6 @@ func (ai *ActiveIndexer) mergeWorker() { } } -func (ai *ActiveIndexer) Stop() { - if ai.stopFn != nil { - ai.stopFn() - } -} - var metaDataPool = sync.Pool{ New: func() any { return new(indexer.MetaData) diff --git a/frac/active_indexer_test.go b/frac/active_indexer_test.go index faa07a8f..fc2585c6 100644 --- a/frac/active_indexer_test.go +++ b/frac/active_indexer_test.go @@ -76,9 +76,8 @@ func getTestProcessor() *indexer.Processor { func BenchmarkIndexer(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx := NewActiveIndexer(8, 8) - idx.Start() - defer idx.Stop() + idx, stop := NewActiveIndexer(8, 8) + 
defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) readers := splitLogsToBulks(allLogs, 1000) diff --git a/frac/empty.go b/frac/empty.go deleted file mode 100644 index f4748c47..00000000 --- a/frac/empty.go +++ /dev/null @@ -1,50 +0,0 @@ -package frac - -import ( - "context" - "math" - - "github.com/ozontech/seq-db/frac/common" - "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/metric" - "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/storage" -) - -var EmptyFraction Fraction = Empty{ - info: &common.Info{ - Path: "empty", - From: math.MaxUint64, - To: 0, - }, -} - -type Empty struct { - info *common.Info -} - -func (Empty) Fetch(context.Context, []seq.ID) ([][]byte, error) { - metric.CountersTotal.WithLabelValues("empty_fraction_fetch").Inc() - return nil, nil -} - -func (Empty) Search(_ context.Context, params processor.SearchParams) (*seq.QPR, error) { - metric.CountersTotal.WithLabelValues("empty_fraction_search").Inc() - return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil -} - -func (e Empty) Info() *common.Info { - return e.info -} -func (Empty) IsIntersecting(seq.MID, seq.MID) bool { - return false -} -func (Empty) Contains(mid seq.MID) bool { - return false -} - -func (Empty) Offload(ctx context.Context, u storage.Uploader) (bool, error) { - return false, nil -} - -func (Empty) Suicide() {} diff --git a/frac/fraction.go b/frac/fraction.go index 89929c8b..9e80c838 100644 --- a/frac/fraction.go +++ b/frac/fraction.go @@ -13,7 +13,6 @@ import ( "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/metric" "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/storage" ) type Fraction interface { @@ -22,8 +21,6 @@ type Fraction interface { Contains(mid seq.MID) bool Fetch(context.Context, []seq.ID) ([][]byte, error) Search(context.Context, processor.SearchParams) (*seq.QPR, error) - Offload(ctx context.Context, u 
storage.Uploader) (bool, error) - Suicide() } var ( diff --git a/frac/fraction_test.go b/frac/fraction_test.go index 920445ce..f1091192 100644 --- a/frac/fraction_test.go +++ b/frac/fraction_test.go @@ -41,6 +41,7 @@ type FractionTestSuite struct { mapping seq.Mapping tokenizers map[seq.TokenizerType]tokenizer.Tokenizer activeIndexer *ActiveIndexer + stopIndexer func() sealParams common.SealParams fraction Fraction @@ -49,12 +50,11 @@ type FractionTestSuite struct { } func (s *FractionTestSuite) SetupSuiteCommon() { - s.activeIndexer = NewActiveIndexer(4, 10) - s.activeIndexer.Start() + s.activeIndexer, s.stopIndexer = NewActiveIndexer(4, 10) } func (s *FractionTestSuite) TearDownSuiteCommon() { - s.activeIndexer.Stop() + s.stopIndexer() } func (s *FractionTestSuite) SetupTestCommon() { @@ -96,6 +96,9 @@ func (s *FractionTestSuite) SetupTestCommon() { } func (s *FractionTestSuite) TearDownTestCommon() { + if s.fraction != nil { + s.fraction = nil + } err := os.RemoveAll(s.tmpDir) s.NoError(err, "Failed to remove tmp dir") } @@ -1348,15 +1351,10 @@ func (s *ActiveFractionTestSuite) SetupTest() { } func (s *ActiveFractionTestSuite) TearDownTest() { - if s.fraction != nil { - active, ok := s.fraction.(*Active) - if ok { - active.Release() - } else { - s.Require().Fail("fraction is not of Active type") - } - s.fraction.Suicide() - s.fraction = nil + if active, ok := s.fraction.(*Active); ok { + active.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Active type") } s.TearDownTestCommon() @@ -1407,17 +1405,11 @@ func (s *ActiveReplayedFractionTestSuite) Replay(frac *Active) Fraction { } func (s *ActiveReplayedFractionTestSuite) TearDownTest() { - if s.fraction != nil { - active, ok := s.fraction.(*Active) - if ok { - active.Release() - } else { - s.Require().Fail("fraction is not of Active type") - } - s.fraction.Suicide() - s.fraction = nil + if active, ok := s.fraction.(*Active); ok { + active.Release() + } else { + s.Require().Nil(s.fraction, 
"fraction is not of Active type") } - s.TearDownTestCommon() } @@ -1448,9 +1440,10 @@ func (s *SealedFractionTestSuite) SetupTest() { } func (s *SealedFractionTestSuite) TearDownTest() { - if s.fraction != nil { - s.fraction.Suicide() - s.fraction = nil + if sealed, ok := s.fraction.(*Sealed); ok { + sealed.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Sealed type") } s.TearDownTestCommon() } @@ -1483,9 +1476,10 @@ func (s *SealedLoadedFractionTestSuite) SetupTest() { } func (s *SealedLoadedFractionTestSuite) TearDownTest() { - if s.fraction != nil { - s.fraction.Suicide() - s.fraction = nil + if sealed, ok := s.fraction.(*Sealed); ok { + sealed.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Sealed type") } s.TearDownTestCommon() } @@ -1496,7 +1490,7 @@ func (s *SealedLoadedFractionTestSuite) TearDownSuite() { func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *Sealed { sealed := s.newSealed(bulks...) - sealed.close("closed") + sealed.Release() indexCache := &IndexCache{ MIDs: cache.NewCache[[]byte](nil, nil), @@ -1589,9 +1583,10 @@ func (s *RemoteFractionTestSuite) SetupTest() { } func (s *RemoteFractionTestSuite) TearDownTest() { - if s.fraction != nil { - s.fraction.Suicide() - s.fraction = nil + if remote, ok := s.fraction.(*Remote); ok { + remote.Suicide() + } else { + s.Require().Nil(s.fraction, "fraction is not of Remote type") } s.TearDownTestCommon() } diff --git a/frac/remote.go b/frac/remote.go index d5e85340..13f25f17 100644 --- a/frac/remote.go +++ b/frac/remote.go @@ -17,7 +17,6 @@ import ( "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/metric" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" @@ -42,9 +41,6 @@ type Remote struct { info *common.Info - useMu sync.RWMutex - suicided bool - docsFile 
storage.ImmutableFile docsCache *cache.Cache[[]byte] docsReader storage.DocsReader @@ -116,54 +112,52 @@ func (f *Remote) Contains(mid seq.MID) bool { } func (f *Remote) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Fetch(ctx, ids) + dp, err := f.createDataProvider(ctx) + if err != nil { + return nil, err + } return dp.Fetch(ids) } func (f *Remote) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Search(ctx, params) + dp, err := f.createDataProvider(ctx) + if err != nil { + return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, err } return dp.Search(params) } -func (f *Remote) DataProvider(ctx context.Context) (*sealedDataProvider, func()) { - f.useMu.RLock() - - if f.suicided { - metric.CountersTotal.WithLabelValues("fraction_suicided").Inc() - f.useMu.RUnlock() - return nil, func() {} - } - - defer func() { - if panicData := recover(); panicData != nil { - f.useMu.RUnlock() - panic(panicData) - } - }() - +func (f *Remote) createDataProvider(ctx context.Context) (*sealedDataProvider, error) { if err := f.load(); err != nil { logger.Error( "will create empty data provider: cannot load remote fraction", zap.String("fraction", f.Info().Name()), zap.Error(err), ) - f.useMu.RUnlock() - return nil, func() {} + return nil, err } + return &sealedDataProvider{ + ctx: ctx, + info: f.info, + config: f.Config, + docsReader: &f.docsReader, + blocksOffsets: f.blocksData.BlocksOffsets, + lidsTable: f.blocksData.LIDsTable, + lidsLoader: lids.NewLoader(&f.indexReader, f.indexCache.LIDs), + tokenBlockLoader: token.NewBlockLoader(f.BaseFileName, &f.indexReader, f.indexCache.Tokens), + tokenTableLoader: token.NewTableLoader(f.BaseFileName, &f.indexReader, f.indexCache.TokenTable), - dp := f.createDataProvider(ctx) - return dp, func() 
{ - dp.release() - f.useMu.RUnlock() - } + idsTable: &f.blocksData.IDsTable, + idsProvider: seqids.NewProvider( + &f.indexReader, + f.indexCache.MIDs, + f.indexCache.RIDs, + f.indexCache.Params, + &f.blocksData.IDsTable, + f.info.BinaryDataVer, + ), + }, nil } func (f *Remote) Info() *common.Info { @@ -174,15 +168,7 @@ func (f *Remote) IsIntersecting(from, to seq.MID) bool { return f.info.IsIntersecting(from, to) } -func (f *Remote) Offload(context.Context, storage.Uploader) (bool, error) { - panic("BUG: remote fraction cannot be offloaded") -} - func (f *Remote) Suicide() { - f.useMu.Lock() - f.suicided = true - f.useMu.Unlock() - util.MustRemoveFileByPath(f.BaseFileName + consts.RemoteFractionSuffix) f.docsCache.Release() @@ -208,32 +194,6 @@ func (f *Remote) String() string { return fracToString(f, "remote") } -func (f *Remote) createDataProvider(ctx context.Context) *sealedDataProvider { - return &sealedDataProvider{ - ctx: ctx, - fractionTypeLabel: "remote", - - info: f.info, - config: f.Config, - docsReader: &f.docsReader, - blocksOffsets: f.blocksData.BlocksOffsets, - lidsTable: f.blocksData.LIDsTable, - lidsLoader: lids.NewLoader(&f.indexReader, f.indexCache.LIDs), - tokenBlockLoader: token.NewBlockLoader(f.BaseFileName, &f.indexReader, f.indexCache.Tokens), - tokenTableLoader: token.NewTableLoader(f.BaseFileName, &f.indexReader, f.indexCache.TokenTable), - - idsTable: &f.blocksData.IDsTable, - idsProvider: seqids.NewProvider( - &f.indexReader, - f.indexCache.MIDs, - f.indexCache.RIDs, - f.indexCache.Params, - &f.blocksData.IDsTable, - f.info.BinaryDataVer, - ), - } -} - func (f *Remote) load() error { f.loadMu.Lock() defer f.loadMu.Unlock() diff --git a/frac/sealed.go b/frac/sealed.go index 0755164f..38657be0 100644 --- a/frac/sealed.go +++ b/frac/sealed.go @@ -19,7 +19,6 @@ import ( "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/metric" 
"github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/util" @@ -36,9 +35,6 @@ type Sealed struct { info *common.Info - useMu sync.RWMutex - suicided bool - docsFile *os.File docsCache *cache.Cache[[]byte] docsReader storage.DocsReader @@ -189,20 +185,15 @@ func (f *Sealed) load() { // Offload saves `.docs` (or `.sdocs`) and `.index` files into remote storage. // It does not free any of the occupied memory (nor on disk nor in memory). func (f *Sealed) Offload(ctx context.Context, u storage.Uploader) (bool, error) { - f.useMu.Lock() - defer f.useMu.Unlock() - g, gctx := errgroup.WithContext(ctx) - g.Go(func() error { - f.openDocs() - return u.Upload(gctx, f.docsFile) - }) + f.loadMu.Lock() + f.openDocs() + f.openIndex() + f.loadMu.Unlock() - g.Go(func() error { - f.openIndex() - return u.Upload(gctx, f.indexFile) - }) + g.Go(func() error { return u.Upload(gctx, f.docsFile) }) + g.Go(func() error { return u.Upload(gctx, f.indexFile) }) if err := g.Wait(); err != nil { return true, err @@ -220,15 +211,25 @@ func (f *Sealed) Offload(ctx context.Context, u storage.Uploader) (bool, error) return true, nil } -func (f *Sealed) Suicide() { - f.useMu.Lock() - f.suicided = true - f.useMu.Unlock() +func (f *Sealed) Release() { + if f.docsFile != nil { + if err := f.docsFile.Close(); err != nil { + logger.Error("can't close docs file", zap.String("frac", f.BaseFileName), zap.Error(err)) + } + } - f.close("suicide") + if f.indexFile != nil { + if err := f.indexFile.Close(); err != nil { + logger.Error("can't close index file", zap.String("frac", f.BaseFileName), zap.Error(err)) + } + } f.docsCache.Release() f.indexCache.Release() +} + +func (f *Sealed) Suicide() { + f.Release() // make some atomic magic, to be more stable on removing fractions oldPath := f.BaseFileName + consts.DocsFileSuffix @@ -294,82 +295,20 @@ func (f *Sealed) Suicide() { } } -func (f *Sealed) close(hint string) { - f.loadMu.Lock() - defer f.loadMu.Unlock() - - if 
!f.isLoaded { - return - } - - if f.docsFile != nil { // docs file may not be opened since it's loaded lazily - if err := f.docsFile.Close(); err != nil { - logger.Error("can't close docs file", - zap.String("frac", f.BaseFileName), - zap.String("type", "sealed"), - zap.String("hint", hint), - zap.Error(err)) - } - } - - if err := f.indexFile.Close(); err != nil { - logger.Error("can't close index file", - zap.String("frac", f.BaseFileName), - zap.String("type", "sealed"), - zap.String("hint", hint), - zap.Error(err)) - } -} - func (f *Sealed) String() string { return fracToString(f, "sealed") } func (f *Sealed) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Fetch(ctx, ids) - } - return dp.Fetch(ids) + return f.createDataProvider(ctx).Fetch(ids) } func (f *Sealed) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Search(ctx, params) - } - return dp.Search(params) -} - -func (f *Sealed) DataProvider(ctx context.Context) (*sealedDataProvider, func()) { - f.useMu.RLock() - - if f.suicided { - metric.CountersTotal.WithLabelValues("fraction_suicided").Inc() - f.useMu.RUnlock() - return nil, func() {} - } - - defer func() { - if panicData := recover(); panicData != nil { - f.useMu.RUnlock() - panic(panicData) - } - }() - - f.load() - - dp := f.createDataProvider(ctx) - - return dp, func() { - dp.release() - f.useMu.RUnlock() - } + return f.createDataProvider(ctx).Search(params) } func (f *Sealed) createDataProvider(ctx context.Context) *sealedDataProvider { + f.load() return &sealedDataProvider{ ctx: ctx, fractionTypeLabel: "sealed", diff --git a/fracmanager/config.go b/fracmanager/config.go index 30f442b9..de96c957 100644 --- a/fracmanager/config.go +++ b/fracmanager/config.go @@ -26,6 +26,7 @@ type Config struct { SealParams 
common.SealParams SortCacheSize uint64 // size for docs cache for active fraction Fraction frac.Config + MinSealFracSize uint64 OffloadingEnabled bool OffloadingRetention time.Duration diff --git a/fracmanager/frac_info_cache_test.go b/fracmanager/frac_info_cache_test.go index 126171ad..fbc64ae9 100644 --- a/fracmanager/frac_info_cache_test.go +++ b/fracmanager/frac_info_cache_test.go @@ -4,18 +4,12 @@ import ( "encoding/json" "os" "path/filepath" - "sort" - "sync" "testing" - insaneJSON "github.com/ozontech/insane-json" "github.com/stretchr/testify/assert" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" - "github.com/ozontech/seq-db/indexer" - "github.com/ozontech/seq-db/seq" testscommon "github.com/ozontech/seq-db/tests/common" ) @@ -258,197 +252,3 @@ func TestUnusedFractionsCleanup(t *testing.T) { assert.NoError(t, err) assert.Equal(t, []byte("{}"), cacheStr) } - -func rotateAndSeal(fm *FracManager) frac.Fraction { - active := fm.rotate() - fm.seal(active) - return active.ref.instance -} - -func TestFracInfoSavedToCache(t *testing.T) { - const maxSize = 10000 - - cfg, fm, stop := setupFracManager(t, &Config{ - FracSize: 100, - TotalSize: maxSize * 2, - }) - defer stop() - - dp := indexer.NewTestDocProvider() - metaRoot := insaneJSON.Spawn() - defer insaneJSON.Release(metaRoot) - - infos := map[string]*common.Info{} - totalSize := uint64(0) - cnt := 1 - for totalSize < maxSize { - addDummyDoc(t, fm, dp, seq.SimpleID(cnt)) - cnt++ - fracInstance := rotateAndSeal(fm) - totalSize += fracInstance.Info().FullSize() - info := fracInstance.Info() - infos[info.Name()] = info - dp.TryReset() - } - - err := fm.fracCache.SyncWithDisk() - assert.NoError(t, err) - - fracCacheFromDisk, err := loadFracCache(cfg.DataDir) - assert.NoError(t, err) - assert.Equal(t, fracCacheFromDisk, fm.fracCache.cache) - assert.Equal(t, fracCacheFromDisk, infos) -} - -type item struct { - value string - size int -} - -type 
evictingQueue struct { - values []item - size int - maxSize int -} - -func newEvictingQueue(maxSize int) evictingQueue { - return evictingQueue{ - values: []item{}, - maxSize: maxSize, - size: 0, - } -} - -func (q *evictingQueue) Add(v item) { - q.values = append(q.values, v) - q.size += v.size - - for q.size > q.maxSize { - q.size -= q.values[0].size - q.values = q.values[1:] - } -} - -func (q *evictingQueue) GetItems() []item { - return q.values -} - -func appendGlob(files []string, dataDir, glob string) []string { - docs, _ := filepath.Glob(filepath.Join(dataDir, glob)) - files = append(files, docs...) - return files -} - -func TestExtraFractionsRemoved(t *testing.T) { - const maxSize = 5500 - const times = 10 - - q := newEvictingQueue(maxSize) - - cfg, fm, stop := setupFracManager(t, &Config{ - FracSize: 100, - TotalSize: maxSize, - }) - - dp := indexer.NewTestDocProvider() - infos := map[string]*common.Info{} - - for i := 1; i < times+1; i++ { - addDummyDoc(t, fm, dp, seq.SimpleID(i)) - fracInstance := rotateAndSeal(fm) - info := fracInstance.Info() - q.Add(item{ - value: info.Name(), - size: int(fracInstance.Info().FullSize()), - }) - infos[info.Name()] = info - dp.TryReset() - } - - expectedFracs := []string{} - for _, itemValue := range q.GetItems() { - expectedFracs = append(expectedFracs, itemValue.value) - } - - sealWG := sync.WaitGroup{} - suicideWG := sync.WaitGroup{} - - fm.maintenance(&sealWG, &suicideWG) // shrinkSizes should be called - sealWG.Wait() - suicideWG.Wait() - - stop() - - fracsOnDisk := []string{} - fracCacheFromDisk, err := loadFracCache(cfg.DataDir) - - assert.NoError(t, err) - for k := range fracCacheFromDisk { - fracsOnDisk = append(fracsOnDisk, k) - } - - sort.Strings(expectedFracs) - sort.Strings(fracsOnDisk) - - assert.Equal(t, expectedFracs, fracsOnDisk) -} - -func TestMissingCacheFilesDeleted(t *testing.T) { - const maxSize = 5500 - const times = 10 - - cfg, fm, stop := setupFracManager(t, &Config{ - FracSize: 100, - TotalSize: 
maxSize, - }) - - dp := indexer.NewTestDocProvider() - metaRoot := insaneJSON.Spawn() - defer insaneJSON.Release(metaRoot) - - for i := 1; i < times+1; i++ { - addDummyDoc(t, fm, dp, seq.SimpleID(i)) - rotateAndSeal(fm) - dp.TryReset() - } - - // make sure the disk is in sync with the in-memory fraction cache - sealWG := sync.WaitGroup{} - suicideWG := sync.WaitGroup{} - - fm.maintenance(&sealWG, &suicideWG) // shrinkSizes should be called - sealWG.Wait() - suicideWG.Wait() - - stop() - - // remove the fraction files - dataDir := cfg.DataDir - files := []string{} - files = appendGlob(files, dataDir, "*.docs") - files = appendGlob(files, dataDir, "*.sdocs") - files = appendGlob(files, dataDir, "*.index") - files = appendGlob(files, dataDir, "*.meta") - for _, file := range files { - err := os.RemoveAll(file) - assert.NoError(t, err) - } - - // create a new fracmanager that will read the fraction cache file - - _, fm2, stop2 := setupFracManager(t, cfg) - - sealWG2 := sync.WaitGroup{} - suicideWG2 := sync.WaitGroup{} - - fm2.maintenance(&sealWG2, &suicideWG2) // shrinkSizes should be called - sealWG2.Wait() - suicideWG2.Wait() - - stop2() - - // make sure the missing files are removed from the fraction cache - fracCacheFromDisk, err := loadFracCacheContent(dataDir) - assert.NoError(t, err) - assert.Equal(t, fracCacheFromDisk, []byte("{}")) -} diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index fd89750f..62c1faf0 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -2,12 +2,10 @@ package fracmanager import ( "context" - "errors" "path/filepath" "sync" "time" - "go.uber.org/atomic" "go.uber.org/zap" "github.com/ozontech/seq-db/config" @@ -19,431 +17,123 @@ import ( "github.com/ozontech/seq-db/util" ) +// FracManager manages database fractions with lifecycle operations type FracManager struct { - ctx context.Context - config *Config - - cacheMaintainer *CacheMaintainer - - fracCache *fracInfoCache - - fracMu sync.RWMutex - 
localFracs []*fracRef - remoteFracs []*frac.Remote - active activeRef - - indexer *frac.ActiveIndexer - fracProvider *fractionProvider - - oldestCTLocal atomic.Uint64 - oldestCTRemote atomic.Uint64 - - flags *StateManager - - stopFn func() - statWG sync.WaitGroup - mntcWG sync.WaitGroup - cacheWG sync.WaitGroup - - s3cli *s3.Client -} - -type fracRef struct { - instance frac.Fraction + mu sync.Mutex // todo: get rid of mutex after removing SealForcedForTests method + lc *lifecycleManager } -type activeRef struct { - ref *fracRef // ref contains a back reference to the fraction in the slice - frac *proxyFrac +var defaultStorageState = StorageState{ + CapacityExceeded: false, } -func (fm *FracManager) newActiveRef(active *frac.Active) activeRef { - f := newProxyFrac(active, fm.fracProvider) - return activeRef{ - frac: f, - ref: &fracRef{instance: f}, - } -} - -func New(ctx context.Context, cfg *Config, s3cli *s3.Client) (*FracManager, error) { +// New creates and initializes a new fraction manager +// Starts all background workers: +// - indexer, +// - cache cleaner, +// - fraction rotation +// - stats updating +// +// Returns the manager instance and a stop function to gracefully shutdown +func New(ctx context.Context, cfg *Config, s3cli *s3.Client) (*FracManager, func(), error) { FillConfigWithDefault(cfg) - cacheMaintainer := NewCacheMaintainer(cfg.CacheSize, cfg.SortCacheSize, newDefaultCacheMetrics()) - readLimiter := storage.NewReadLimiter(config.ReaderWorkers, storeBytesRead) - indexer := frac.NewActiveIndexer(config.IndexWorkers, config.IndexWorkers) - indexer.Start() - - flags, err := NewStateManager(cfg.DataDir, StorageState{}) + idx, stopIdx := frac.NewActiveIndexer(config.IndexWorkers, config.IndexWorkers) + cache := NewCacheMaintainer(cfg.CacheSize, cfg.SortCacheSize, newDefaultCacheMetrics()) + provider := newFractionProvider(cfg, s3cli, cache, readLimiter, idx) + infoCache := NewFracInfoCache(filepath.Join(cfg.DataDir, consts.FracCacheFileSuffix)) + + 
// Load existing fractions into registry + loader := NewLoader(cfg, provider, infoCache) + registry, err := loader.Load(ctx) if err != nil { - logger.Fatal("state manager initiation error", zap.Error(err)) + return nil, nil, err } - fm := &FracManager{ - config: cfg, - ctx: ctx, - s3cli: s3cli, - flags: flags, - cacheMaintainer: cacheMaintainer, - indexer: indexer, - fracProvider: newFractionProvider(cfg, s3cli, cacheMaintainer, readLimiter, indexer), - fracCache: NewFracInfoCache(filepath.Join(cfg.DataDir, consts.FracCacheFileSuffix)), + // Initialize storage state manager to track capacity status + storageState, err := NewStateManager(cfg.DataDir, defaultStorageState) + if err != nil { + return nil, nil, err } - err = fm.load(ctx) - return fm, err -} + // Create lc manager to handle fraction maintenance + lc := newLifecycleManager(infoCache, provider, storageState, registry) + fm := FracManager{lc: lc} -func (fm *FracManager) maintenance(sealWg, cleanupWg *sync.WaitGroup) { - logger.Debug("maintenance started") + // Start background workers and get stop function + wg := sync.WaitGroup{} + ctx, cancel := context.WithCancel(ctx) - n := time.Now() - if fm.Active().Info().DocsOnDisk > fm.config.FracSize { - active := fm.rotate() + startStatsWorker(ctx, registry, &wg) + startMaintWorker(ctx, cfg, &fm, &wg) + startCacheWorker(ctx, cfg, cache, &wg) - sealWg.Add(1) - go func() { - fm.seal(active) - sealWg.Done() - }() - } - - fm.cleanupFractions(cleanupWg) - fm.removeStaleFractions(cleanupWg, fm.config.OffloadingRetention) - fm.updateOldestCT() + stop := func() { + n := time.Now() + logger.Info("start stopping fracmanager's workers") - if err := fm.fracCache.SyncWithDisk(); err != nil { - logger.Error("can't sync frac-cache", zap.Error(err)) - } + cancel() + wg.Wait() - logger.Debug("maintenance finished", zap.Int64("took_ms", time.Since(n).Milliseconds())) -} + // Freeze active fraction to prevent new writes + active := lc.registry.Active() + if err := 
active.Finalize(); err != nil { + logger.Fatal("shutdown fraction freezing error", zap.Error(err)) + } + active.WaitWriteIdle() -func (fm *FracManager) Oldest() uint64 { - local, remote := fm.oldestCTLocal.Load(), fm.oldestCTRemote.Load() - if local != 0 && remote != 0 { - return min(local, remote) - } - return local -} + // Stop indexer + stopIdx() -func (fm *FracManager) updateOldestCT() { - fm.updateOldestCTFor(fm.getLocalFracs(), &fm.oldestCTLocal, "local") - fm.updateOldestCTFor(fm.getRemoteFracs(), &fm.oldestCTRemote, "remote") -} + // Save info cache + lc.SyncInfoCache() -func (fm *FracManager) updateOldestCTFor( - fracs List, v *atomic.Uint64, storageType string, -) { - oldestByCT := fracs.GetOldestFrac() + // Seal active fraction + sealOnShutdown(active.instance, provider, cfg.MinSealFracSize) - if oldestByCT == nil { - v.Store(0) - return + logger.Info("fracmanager's workers are stopped", zap.Int64("took_ms", time.Since(n).Milliseconds())) } - newOldestCT := oldestByCT.Info().CreationTime - prevOldestCT := v.Swap(newOldestCT) - - if newOldestCT != prevOldestCT { - logger.Info( - "new oldest by creation time", - zap.String("fraction", oldestByCT.Info().Name()), - zap.String("storage_type", storageType), - zap.Time("creation_time", time.UnixMilli(int64(newOldestCT))), - ) - } + return &fm, stop, nil } -func (fm *FracManager) shiftFirstFrac() frac.Fraction { - fm.fracMu.Lock() - defer fm.fracMu.Unlock() - - if len(fm.localFracs) == 0 { - return nil - } - - outsider := fm.localFracs[0].instance - fm.localFracs[0] = nil - fm.localFracs = fm.localFracs[1:] - return outsider +func (fm *FracManager) Fractions() List { + return fm.lc.registry.AllFractions() } -// removeStaleFractions removes [frac.Remote] fractions from external storage. -// Decision is based on the retention period provided by user. 
-func (fm *FracManager) removeStaleFractions(cleanupWg *sync.WaitGroup, retention time.Duration) { - // User did not provide retention period so keep all remote fractions alive. - // It's safe to do because we do not keep anything locally (but maybe we will eventually run out of inodes). - if retention <= 0 { - return - } - - var ( - staleFractions []*frac.Remote - freshFractions []*frac.Remote - ) - - fm.fracMu.Lock() - - for _, f := range fm.remoteFracs { - ct := time.UnixMilli(int64(f.Info().CreationTime)) - if time.Since(ct) < retention { - freshFractions = append(freshFractions, f) - continue - } - staleFractions = append(staleFractions, f) - } - - fm.remoteFracs = freshFractions - - fm.fracMu.Unlock() - - cleanupWg.Add(1) - go func() { - defer cleanupWg.Done() - - for _, f := range staleFractions { - ct := time.UnixMilli(int64(f.Info().CreationTime)) - - logger.Info( - "removing stale remote fraction", - zap.String("fraction", f.Info().Name()), - zap.Time("creation_time", ct), - zap.String("retention", retention.String()), - ) - - fm.fracCache.Remove(f.Info().Name()) - f.Suicide() - } - }() +func (fm *FracManager) Oldest() uint64 { + return fm.lc.registry.OldestTotal() } func (fm *FracManager) Flags() *StateManager { - return fm.flags + return fm.lc.flags } -func (fm *FracManager) determineOutsiders() []frac.Fraction { - var outsiders []frac.Fraction - - localFracs := fm.getLocalFracs() - occupiedSize := localFracs.GetTotalSize() - - var truncated int - for occupiedSize > fm.config.TotalSize { - outsider := fm.shiftFirstFrac() - if outsider == nil { - break - } - - localFracs = localFracs[1:] - outsiders = append(outsiders, outsider) - occupiedSize -= outsider.Info().FullSize() - truncated++ - } - - if len(outsiders) > 0 && !fm.flags.IsCapacityExceeded() { - if err := fm.flags.setCapacityExceeded(true); err != nil { - logger.Fatal("set capacity exceeded error", zap.Error(err)) - } - } - - maintenanceTruncateTotal.Add(float64(truncated)) - return outsiders +// 
Active returns the currently active fraction +func (fm *FracManager) Active() frac.Fraction { + return fm.lc.registry.Active().proxy } -func (fm *FracManager) cleanupFractions(cleanupWg *sync.WaitGroup) { - outsiders := fm.determineOutsiders() - if len(outsiders) == 0 { - return - } - - for _, outsider := range outsiders { - cleanupWg.Add(1) - go func() { - defer cleanupWg.Done() - - info := outsider.Info() - if !fm.config.OffloadingEnabled { - fm.fracCache.Remove(info.Name()) - outsider.Suicide() - return - } - - offloadStart := time.Now() - remote, err := fm.fracProvider.Offload(fm.ctx, outsider) +// Append writes documents and metadata to the active fraction +// Implements retry logic in case of fraction sealing during write +func (fm *FracManager) Append(ctx context.Context, docs, metas storage.DocBlock) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + // Try to append data to the currently active fraction + err := fm.lc.registry.Active().Append(docs, metas) if err != nil { - offloadingTotal.WithLabelValues("failure").Inc() - offloadingDurationSeconds.Observe(float64(time.Since(offloadStart).Seconds())) - - logger.Error( - "will call Suicide() on fraction: failed to offload fraction", - zap.String("fraction", info.Name()), - zap.Int("retry_count", fm.s3cli.MaxRetryAttempts()), - zap.Error(err), - ) - - fm.fracCache.Remove(info.Name()) - outsider.Suicide() - - return - } - - if remote == nil { - fm.fracCache.Remove(info.Name()) - outsider.Suicide() - return + logger.Info("append fail", zap.Error(err)) + if err == ErrFractionNotWritable { + continue // fraction is currently being sealed, retry the operation + } } - - offloadingTotal.WithLabelValues("success").Inc() - offloadingDurationSeconds.Observe(float64(time.Since(offloadStart).Seconds())) - - logger.Info( - "successully offloaded fraction", - zap.String("fraction", info.Name()), - zap.String("took", time.Since(offloadStart).String()), - ) - - fm.fracMu.Lock() - // 
FIXME(dkharms): We had previously shifted fraction from local fracs list (in [fm.determineOutsiders] via [fm.shiftFirstFrac]) - // and therefore excluded it from search queries. - // But now we return that fraction back (well now it's a [frac.Remote] fraction but it still points to the same data) - // so user can face incosistent search results. - fm.remoteFracs = append(fm.remoteFracs, remote) - fm.fracMu.Unlock() - - outsider.Suicide() - }() - } -} - -type FracType int - -const ( - FracTypeLocal FracType = 1 << iota - FracTypeRemote -) - -// Fractions returns a list of known fracs (local and remote). -// -// While working with this list, it may become irrelevant (factions may, for example, be deleted). -// This is a valid situation, because access to the data of these factions (search and fetch) occurs under blocking (see DataProvider). -// This way we avoid the race. -// -// Accessing the deleted faction data just will return an empty result. -func (fm *FracManager) Fractions() (fracs List) { - return append(fm.getLocalFracs(), fm.getRemoteFracs()...) 
-} - -func (fm *FracManager) getLocalFracs() List { - fm.fracMu.RLock() - defer fm.fracMu.RUnlock() - - fracs := make(List, 0, len(fm.localFracs)) - for _, f := range fm.localFracs { - fracs = append(fracs, f.instance) - } - - return fracs -} - -func (fm *FracManager) getRemoteFracs() List { - fm.fracMu.RLock() - defer fm.fracMu.RUnlock() - - fracs := make(List, 0, len(fm.remoteFracs)) - for _, f := range fm.remoteFracs { - fracs = append(fracs, f) - } - - return fracs -} - -func (fm *FracManager) processFracsStats() { - type fracStats struct { - docsTotal uint64 - docsRaw uint64 - docsDisk uint64 - index uint64 - totalSize uint64 - count int - } - - calculate := func(fracs List) (st fracStats) { - for _, f := range fracs { - info := f.Info() - st.count += 1 - st.totalSize += info.FullSize() - st.docsTotal += uint64(info.DocsTotal) - st.docsRaw += info.DocsRaw - st.docsDisk += info.DocsOnDisk - st.index += info.IndexOnDisk + info.MetaOnDisk + return err } - return } - - setMetrics := func(st string, oldest uint64, ft fracStats) { - logger.Info("fraction stats", - zap.Int("count", ft.count), - zap.String("storage_type", st), - zap.Uint64("docs_k", ft.docsTotal/1000), - util.ZapUint64AsSizeStr("total_size", ft.totalSize), - util.ZapUint64AsSizeStr("docs_raw", ft.docsRaw), - util.ZapUint64AsSizeStr("docs_comp", ft.docsDisk), - util.ZapUint64AsSizeStr("index", ft.index), - ) - - dataSizeTotal.WithLabelValues("total", st).Set(float64(ft.totalSize)) - dataSizeTotal.WithLabelValues("docs_raw", st).Set(float64(ft.docsRaw)) - dataSizeTotal.WithLabelValues("docs_on_disk", st).Set(float64(ft.docsDisk)) - dataSizeTotal.WithLabelValues("index", st).Set(float64(ft.index)) - - if oldest != 0 { - oldestFracTime.WithLabelValues(st). 
- Set((time.Duration(oldest) * time.Millisecond).Seconds()) - } - } - - setMetrics("local", fm.oldestCTLocal.Load(), calculate(fm.getLocalFracs())) - setMetrics("remote", fm.oldestCTRemote.Load(), calculate(fm.getRemoteFracs())) -} - -func (fm *FracManager) runMaintenanceLoop(ctx context.Context) { - fm.mntcWG.Add(1) - go func() { - defer fm.mntcWG.Done() - - var ( - sealWg sync.WaitGroup - cleanupWg sync.WaitGroup - ) - - util.RunEvery(ctx.Done(), fm.config.MaintenanceDelay, func() { - fm.maintenance(&sealWg, &cleanupWg) - }) - - sealWg.Wait() - cleanupWg.Wait() - }() -} - -func (fm *FracManager) runStatsLoop(ctx context.Context) { - fm.statWG.Add(1) - go func() { - defer fm.statWG.Done() - - util.RunEvery(ctx.Done(), time.Second*10, func() { - fm.processFracsStats() - }) - }() -} - -func (fm *FracManager) Start() { - var ctx context.Context - ctx, fm.stopFn = context.WithCancel(fm.ctx) - - fm.runStatsLoop(ctx) - fm.runMaintenanceLoop(ctx) - startCacheWorker(ctx, fm.config, fm.cacheMaintainer, &fm.cacheWG) } // startCacheWorker starts background cache garbage collection @@ -459,138 +149,71 @@ func startCacheWorker(ctx context.Context, cfg *Config, cache *CacheMaintainer, }() } -func (fm *FracManager) load(ctx context.Context) error { - l := NewLoader(fm.config, fm.fracProvider, fm.fracCache) - - active, locals, remotes, err := l.Load(ctx) - if err != nil { - return err - } - - for _, s := range locals { - fm.localFracs = append(fm.localFracs, &fracRef{instance: s}) - } - - for _, s := range remotes { - fm.remoteFracs = append(fm.remoteFracs, s) - } - - fm.active = fm.newActiveRef(active) - fm.localFracs = append(fm.localFracs, fm.active.ref) - - fm.updateOldestCT() - return nil -} - -func (fm *FracManager) Append(ctx context.Context, docs, metas storage.DocBlock) error { - var err error - for { - select { - case <-ctx.Done(): - return ctx.Err() - default: - if err = fm.Writer().Append(docs, metas); err == nil { - return nil - } - logger.Info("append fail", 
zap.Error(err)) // can get fail if fraction already sealed - } - } -} - -func (fm *FracManager) seal(activeRef activeRef) { - sealsTotal.Inc() - now := time.Now() - sealed, err := activeRef.frac.Seal() - if err != nil { - if errors.Is(err, ErrSealingFractionSuicided) { - // the faction is suicided, this means that it has already pushed out of the list of factions, - // so we simply skip further actions - return - } - logger.Fatal("sealing error", zap.Error(err)) - } - sealingTime := time.Since(now) - sealsDoneSeconds.Observe(sealingTime.Seconds()) - - logger.Info( - "fraction sealed", - zap.String("fraction", filepath.Dir(sealed.Info().Path)), - zap.Float64("time_spent_s", util.DurationToUnit(sealingTime, "s")), - ) - - info := sealed.Info() - fm.fracCache.Add(info) +// startStatsWorker starts periodic statistics collection and reporting +func startStatsWorker(ctx context.Context, reg *fractionRegistry, wg *sync.WaitGroup) { + wg.Add(1) + go func() { + defer wg.Done() - fm.fracMu.Lock() - activeRef.ref.instance = sealed - fm.fracMu.Unlock() + logger.Info("stats loop is started") + // Run stats collection every 10 seconds + util.RunEvery(ctx.Done(), time.Second*10, func() { + stats := reg.Stats() + stats.Log() // Log statistics + stats.SetMetrics() // Update Prometheus metrics + }) + logger.Info("stats loop is stopped") + }() } -func (fm *FracManager) rotate() activeRef { - next := fm.newActiveRef(fm.fracProvider.CreateActive()) - - fm.fracMu.Lock() - prev := fm.active - fm.active = next - fm.localFracs = append(fm.localFracs, fm.active.ref) - fm.fracMu.Unlock() - - logger.Info("new fraction created", zap.String("filepath", next.frac.active.BaseFileName)) - - return prev -} +// startMaintWorker starts periodic fraction maintenance operations +func startMaintWorker(ctx context.Context, cfg *Config, fm *FracManager, wg *sync.WaitGroup) { + wg.Add(1) + go func() { + defer wg.Done() -func (fm *FracManager) minFracSizeToSeal() uint64 { - return fm.config.FracSize * 
consts.DefaultMinSealPercent / 100 + logger.Info("maintenance loop is started") + // Run maintenance at configured interval + util.RunEvery(ctx.Done(), cfg.MaintenanceDelay, func() { + n := time.Now() + logger.Debug("maintenance iteration started") + fm.mu.Lock() + // Perform fraction maintenance (rotation, truncating, offloading, etc.) + fm.lc.Maintain(ctx, cfg, wg) + fm.mu.Unlock() + logger.Debug("maintenance iteration finished", zap.Int64("took_ms", time.Since(n).Milliseconds())) + }) + logger.Info("waiting maintenance complete background tasks") + logger.Info("maintenance loop is stopped") + }() } -func (fm *FracManager) Stop() { - fm.Writer().WaitWriteIdle() - fm.indexer.Stop() - fm.stopFn() - - fm.statWG.Wait() - fm.mntcWG.Wait() - fm.cacheWG.Wait() +// SealOnShutdown seals the active fraction on storage shutdown +func sealOnShutdown(active *frac.Active, provider *fractionProvider, minSealSize uint64) { + fracSize := active.Info().FullSize() - if err := fm.fracCache.SyncWithDisk(); err != nil { - logger.Error( - "failed to sync frac-cache on disk", - zap.Error(err), + if minSealSize == 0 { + logger.Info("sealing skipped: sealing on shutdown is disabled", + zap.String("frac", active.BaseFileName), + zap.Uint64("size_mb", uint64(util.SizeToUnit(fracSize, "mb"))), ) + return } - needSealing := false - status := "frac too small to be sealed" - - info := fm.active.frac.Info() - if info.FullSize() > fm.minFracSizeToSeal() { - needSealing = true - status = "need seal active fraction before exit" + if fracSize < minSealSize { + logger.Info("sealing skipped: fraction too small", + zap.String("frac", active.BaseFileName), + zap.Uint64("size_mb", uint64(util.SizeToUnit(fracSize, "mb"))), + ) + return } - logger.Info( - "sealing on exit", - zap.String("status", status), - zap.String("frac", info.Name()), - zap.Uint64("fill_size_mb", uint64(util.SizeToUnit(info.FullSize(), "mb"))), + logger.Info("fraction sealed before shutdown", + zap.String("frac", active.BaseFileName), 
+ zap.Uint64("fill_size_mb", uint64(util.SizeToUnit(fracSize, "mb"))), ) - if needSealing { - fm.seal(fm.active) + if _, err := provider.Seal(active); err != nil { + logger.Error("error sealing on shutdown", zap.Error(err)) } } - -func (fm *FracManager) Writer() *proxyFrac { - fm.fracMu.RLock() - defer fm.fracMu.RUnlock() - - return fm.active.frac -} - -func (fm *FracManager) Active() frac.Fraction { - fm.fracMu.RLock() - defer fm.fracMu.RUnlock() - - return fm.active.frac -} diff --git a/fracmanager/fracmanager_for_tests.go b/fracmanager/fracmanager_for_tests.go index 75d664ec..6b3e1bc5 100644 --- a/fracmanager/fracmanager_for_tests.go +++ b/fracmanager/fracmanager_for_tests.go @@ -1,12 +1,22 @@ package fracmanager +import "sync" + func (fm *FracManager) WaitIdleForTests() { - fm.Writer().WaitWriteIdle() + fm.lc.registry.Active().WaitWriteIdle() } func (fm *FracManager) SealForcedForTests() { - active := fm.rotate() - if active.frac.Info().DocsTotal > 0 { - fm.seal(active) - } + wg := sync.WaitGroup{} + fm.mu.Lock() // todo: get rid of mutex after removing SealForcedForTests method + fm.lc.Rotate(0, &wg) + fm.mu.Unlock() + + wg.Wait() + fm.lc.waitSealingForTests() // todo: get rid of waitSealingForTests method after removing SealForcedForTests method +} + +// todo: get rid of this after removing fracmanager.SealForcedForTests() +func (lc *lifecycleManager) waitSealingForTests() { + lc.sealingWg.Wait() } diff --git a/fracmanager/fracmanager_test.go b/fracmanager/fracmanager_test.go index c8e7088f..451d76a8 100644 --- a/fracmanager/fracmanager_test.go +++ b/fracmanager/fracmanager_test.go @@ -1,25 +1,21 @@ package fracmanager import ( - "context" - "fmt" - "sync" "testing" - "time" + "github.com/alecthomas/units" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/seq" - testscommon 
"github.com/ozontech/seq-db/tests/common" ) func setupDataDir(t testing.TB, cfg *Config) *Config { if cfg == nil { - cfg = &Config{} + cfg = &Config{ + Fraction: frac.Config{SkipSortDocs: true}, + } } if cfg.DataDir == "" { cfg.DataDir = t.TempDir() @@ -29,186 +25,57 @@ func setupDataDir(t testing.TB, cfg *Config) *Config { func setupFracManager(t testing.TB, cfg *Config) (*Config, *FracManager, func()) { cfg = setupDataDir(t, cfg) - fm, err := New(t.Context(), cfg, nil) + fm, stop, err := New(t.Context(), cfg, nil) assert.NoError(t, err) - fm.Start() - return cfg, fm, fm.Stop + return cfg, fm, stop } -func addDummyDoc(t *testing.T, fm *FracManager, dp *indexer.TestDocProvider, seqID seq.ID) { - doc := []byte("document") - dp.Append(doc, nil, seqID, "service:100500", "k8s_pod", "_all_:") +func appendDocsToFracManager(t testing.TB, fm *FracManager, docCount int) { + dp := indexer.NewTestDocProvider() + for i := 0; i < docCount; i++ { + doc := []byte("{\"timestamp\": 0, \"message\": \"msg\"}") + dp.Append(doc, seq.SimpleID(i), "service:100500", "k8s_pod", "_all_:") + } docs, metas := dp.Provide() - err := fm.Append(context.Background(), docs, metas) + err := fm.Append(t.Context(), docs, metas) assert.NoError(t, err) } -func MakeSomeFractions(t *testing.T, fm *FracManager) { - dp := indexer.NewTestDocProvider() - addDummyDoc(t, fm, dp, seq.SimpleID(1)) - fm.seal(fm.rotate()) - - dp.TryReset() - - addDummyDoc(t, fm, dp, seq.SimpleID(2)) - fm.seal(fm.rotate()) - - dp.TryReset() - addDummyDoc(t, fm, dp, seq.SimpleID(3)) -} - -func TestCleanUp(t *testing.T) { - cfg, fm, stop := setupFracManager(t, &Config{ - FracSize: 1000, - TotalSize: 100000, - }) +func TestSealingOnShutdown(t *testing.T) { + cfg := &Config{ + FracSize: 1 * uint64(units.MiB), // to ensure that the frac will not be sealed on maintenance + TotalSize: 1 * uint64(units.MiB), + Fraction: frac.Config{SkipSortDocs: true}, + } - MakeSomeFractions(t, fm) + // first start + cfg.MinSealFracSize = 0 // to ensure 
that the frac will not be sealed on shutdown + cfg, fm, stop := setupFracManager(t, cfg) + appendDocsToFracManager(t, fm, 10) + activeName := fm.Fractions()[0].Info().Name() + stop() - first := fm.localFracs[0].instance.(*frac.Sealed) - first.PartialSuicideMode = frac.HalfRename - first.Suicide() + // second start + cfg.MinSealFracSize = 1 // to ensure that the frac will be sealed on shutdown + cfg, fm, stop = setupFracManager(t, cfg) - second := fm.localFracs[1].instance.(*frac.Sealed) - second.PartialSuicideMode = frac.HalfRemove - second.Suicide() - info := fm.active.frac.Info() - shouldSealOnExit := info.FullSize() > fm.minFracSizeToSeal() + assert.Equal(t, 1, len(fm.Fractions()), "should have one fraction") + assert.Equal(t, activeName, fm.Fractions()[0].Info().Name(), "fraction should have the same name") + _, ok := fm.Fractions()[0].(*fractionProxy).impl.(*frac.Active) + assert.True(t, ok, "fraction should be active") stop() - if shouldSealOnExit && info.DocsTotal > 0 { - t.Error("active fraction should be empty after rotation and sealing") - } - + // third start _, fm, stop = setupFracManager(t, cfg) - defer stop() - - assert.Equal(t, 1, len(fm.localFracs), "wrong frac count") -} - -func TestCapacityExceeded(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) - - launchAndCheck := func(checkFn func(fm *FracManager)) { - fm, err := New(context.Background(), &Config{ - FracSize: 500, - TotalSize: 5000, - DataDir: dataDir, - }, nil) - assert.NoError(t, err) - - checkFn(fm) - fm.indexer.Stop() - } - - id := 1 - dp := indexer.NewTestDocProvider() - makeSealedFrac := func(fm *FracManager, docsPerFrac int) { - for i := 0; i < docsPerFrac; i++ { - addDummyDoc(t, fm, dp, seq.SimpleID(id)) - id++ - } - fm.seal(fm.rotate()) - dp.TryReset() - } + assert.Equal(t, 2, len(fm.Fractions()), "should have 2 fraction: new active and old sealed") + _, ok = 
fm.Fractions()[0].(*fractionProxy).impl.(*frac.Sealed) + assert.True(t, ok, "first fraction should be sealed") + assert.Equal(t, activeName, fm.Fractions()[0].Info().Name(), "sealed fraction should have the same name") + assert.Equal(t, uint32(0), fm.Fractions()[1].Info().DocsTotal, "active fraction should be empty") + _, ok = fm.Fractions()[1].(*fractionProxy).impl.(*frac.Active) + assert.True(t, ok, "new fraction should be active") - // first run - launchAndCheck(func(fm *FracManager) { - assert.Equal(t, false, fm.Flags().IsCapacityExceeded(), "expect data dir is empty") - makeSealedFrac(fm, 10) - assert.Equal(t, false, fm.Flags().IsCapacityExceeded(), "there should still be no fraction removal and the flag should be false") - }) - - // second run - launchAndCheck(func(fm *FracManager) { - assert.Equal(t, false, fm.Flags().IsCapacityExceeded(), "there should still be no fraction removal and the flag should be false") - for fm.Fractions().GetTotalSize() < fm.config.TotalSize { - makeSealedFrac(fm, 10) - } - assert.Equal(t, false, fm.Flags().IsCapacityExceeded(), "there should still be no fraction removal and the flag should be false") - sealWG := sync.WaitGroup{} - suicideWG := sync.WaitGroup{} - fm.maintenance(&sealWG, &suicideWG) - assert.Equal(t, true, fm.Flags().IsCapacityExceeded(), "the deletion should occur and the flag should now be true") - }) - - // third run - launchAndCheck(func(fm *FracManager) { - assert.Equal(t, true, fm.Flags().IsCapacityExceeded(), "IsCapacityExceeded must be set to true in the state file") - }) - -} - -func TestOldestCT(t *testing.T) { - const fracCount = 10 - - t.Run("local", func(t *testing.T) { - fm, err := New(context.Background(), &Config{DataDir: t.TempDir()}, nil) - assert.NoError(t, err) - - oldestLocal := time.Now() - nowOldestLocal := oldestLocal - - fm.localFracs = nil - for i := range fracCount { - fm.localFracs = append(fm.localFracs, &fracRef{instance: frac.NewSealed( - "", nil, nil, nil, &common.Info{ - Path: 
fmt.Sprintf("local-frac-%d", i), - IndexOnDisk: 1, - CreationTime: uint64(nowOldestLocal.UnixMilli()), - }, nil, - )}) - nowOldestLocal = nowOldestLocal.Add(time.Second) - } - - fm.updateOldestCT() - - require.Equal(t, uint64(0), fm.oldestCTRemote.Load()) - require.Equal(t, uint64(oldestLocal.UnixMilli()), fm.oldestCTLocal.Load()) - require.Equal(t, uint64(oldestLocal.UnixMilli()), fm.Oldest()) - }) - - t.Run("local-and-remote", func(t *testing.T) { - fm, err := New(context.Background(), &Config{DataDir: t.TempDir()}, nil) - assert.NoError(t, err) - - oldestRemote := time.Now() - nowOldestRemote := oldestRemote - - fm.localFracs = nil - for i := range fracCount { - fm.remoteFracs = append(fm.remoteFracs, frac.NewRemote( - t.Context(), "", nil, nil, nil, &common.Info{ - Path: fmt.Sprintf("remote-frac-%d", i), - IndexOnDisk: 1, - CreationTime: uint64(nowOldestRemote.UnixMilli()), - }, nil, nil, - )) - nowOldestRemote = nowOldestRemote.Add(time.Second) - } - - oldestLocal := nowOldestRemote - nowOldestLocal := oldestLocal - - for i := range fracCount { - fm.localFracs = append(fm.localFracs, &fracRef{instance: frac.NewSealed( - "", nil, nil, nil, &common.Info{ - Path: fmt.Sprintf("local-frac-%d", i), - IndexOnDisk: 1, - CreationTime: uint64(nowOldestLocal.UnixMilli()), - }, nil, - )}) - nowOldestLocal = nowOldestLocal.Add(time.Second) - } - - fm.updateOldestCT() - - require.Equal(t, uint64(oldestRemote.UnixMilli()), fm.oldestCTRemote.Load()) - require.Equal(t, uint64(oldestLocal.UnixMilli()), fm.oldestCTLocal.Load()) - require.Equal(t, uint64(oldestRemote.UnixMilli()), fm.Oldest()) - }) + stop() } diff --git a/fracmanager/fracs_stats.go b/fracmanager/fracs_stats.go new file mode 100644 index 00000000..1c2a9fa3 --- /dev/null +++ b/fracmanager/fracs_stats.go @@ -0,0 +1,85 @@ +package fracmanager + +import ( + "github.com/prometheus/client_golang/prometheus" + "go.uber.org/zap" + + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/logger" + 
"github.com/ozontech/seq-db/util" +) + +// fracsStats contains statistical information about a group of fractions +// Used to track aggregate metrics for fractions in different states +type fracsStats struct { + count int // Number of fractions in the group + docsCount uint64 // Total number of documents across all fractions + docsSizeRaw uint64 // Total raw size of documents before compression + docsSizeOnDisk uint64 // Total size of documents on disk after compression + indexSizeOnDisk uint64 // Total size of index and metadata on disk + totalSizeOnDisk uint64 // Total storage size, including documents, index and metadata +} + +// Add incorporates fraction information into the statistics +// Updates all aggregate metrics with the values from the provided fraction info +func (s *fracsStats) Add(info *common.Info) { + s.count++ + s.docsCount += uint64(info.DocsTotal) + s.docsSizeRaw += info.DocsRaw + s.docsSizeOnDisk += info.DocsOnDisk + s.indexSizeOnDisk += info.IndexOnDisk + info.MetaOnDisk + s.totalSizeOnDisk += info.FullSize() +} + +// Sub removes fraction information from the statistics +// Decrements all aggregate metrics with the values from the provided fraction info +func (s *fracsStats) Sub(info *common.Info) { + s.count-- + s.docsCount -= uint64(info.DocsTotal) + s.docsSizeRaw -= info.DocsRaw + s.docsSizeOnDisk -= info.DocsOnDisk + s.indexSizeOnDisk -= info.IndexOnDisk + info.MetaOnDisk + s.totalSizeOnDisk -= info.FullSize() +} + +func (s *fracsStats) Log(stage string) { + logger.Info("fraction stats", + zap.Int("count", s.count), + zap.String("stage", stage), + zap.Uint64("docs_k", s.docsCount/1000), + util.ZapUint64AsSizeStr("total_size", s.totalSizeOnDisk), + util.ZapUint64AsSizeStr("docs_raw", s.docsSizeRaw), + util.ZapUint64AsSizeStr("docs_comp", s.docsSizeOnDisk), + util.ZapUint64AsSizeStr("index", s.indexSizeOnDisk), + ) +} + +func (s *fracsStats) SetMetrics(metric *prometheus.GaugeVec, stage string) { + metric.WithLabelValues("total", 
stage).Set(float64(s.totalSizeOnDisk)) + metric.WithLabelValues("docs_raw", stage).Set(float64(s.docsSizeRaw)) + metric.WithLabelValues("docs_on_disk", stage).Set(float64(s.docsSizeOnDisk)) + metric.WithLabelValues("index", stage).Set(float64(s.indexSizeOnDisk)) +} + +// registryStats contains statistical data for all fraction queues +// Used for monitoring and memory management decisions +type registryStats struct { + sealing fracsStats // Statistics for fractions in the sealing process + locals fracsStats // Statistics for fractions on local disk + offloading fracsStats // Statistics for fractions in the offloading process + remotes fracsStats // Statistics for fractions in remote storage +} + +func (s *registryStats) Log() { + s.sealing.Log("sealing") + s.locals.Log("locals") + s.offloading.Log("offloading") + s.remotes.Log("remotes") +} + +func (s *registryStats) SetMetrics() { + s.sealing.SetMetrics(dataSizeTotal, "sealing") + s.locals.SetMetrics(dataSizeTotal, "locals") + s.offloading.SetMetrics(dataSizeTotal, "offloading") + s.remotes.SetMetrics(dataSizeTotal, "remotes") +} diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index 6343b3a4..e2915598 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -122,7 +122,7 @@ func (fp *fractionProvider) Seal(active *frac.Active) (*frac.Sealed, error) { // Offload uploads fraction to S3 storage and returns a remote fraction // IMPORTANT: context controls timeouts and operation cancellation -func (fp *fractionProvider) Offload(ctx context.Context, f frac.Fraction) (*frac.Remote, error) { +func (fp *fractionProvider) Offload(ctx context.Context, f *frac.Sealed) (*frac.Remote, error) { mustBeOffloaded, err := f.Offload(ctx, s3.NewUploader(fp.s3cli)) if err != nil { return nil, err diff --git a/fracmanager/fraction_provider_test.go b/fracmanager/fraction_provider_test.go index d5769bdf..f315b615 100644 --- a/fracmanager/fraction_provider_test.go +++ 
b/fracmanager/fraction_provider_test.go @@ -1,23 +1,48 @@ package fracmanager import ( + "fmt" + "math/rand" + "net/http/httptest" "testing" + "time" "github.com/alecthomas/units" + "github.com/johannesboyne/gofakes3" + "github.com/johannesboyne/gofakes3/backend/s3mem" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/storage/s3" ) +func setupS3Client(t testing.TB) (*s3.Client, func()) { + s3Backend := s3mem.New() + s3server := httptest.NewServer(gofakes3.New(s3Backend).Server()) + + bucketName := fmt.Sprintf("bucket_%s_%d_%d", t.Name(), time.Now().UnixMilli(), rand.Int()) + err := s3Backend.CreateBucket(bucketName) + require.NoError(t, err, "create bucket failed") + + s3cli, err := s3.NewClient(s3server.URL, "ACCESS_KEY", "SECRET_KEY", "eu-west-3", bucketName, 3) + require.NoError(t, err, "s3 client setup failed") + + return s3cli, s3server.Close +} + func setupFractionProvider(t testing.TB, cfg *Config) (*fractionProvider, func()) { cfg = setupDataDir(t, cfg) rl := storage.NewReadLimiter(1, nil) - idx := frac.NewActiveIndexer(1, 1) - idx.Start() + s3cli, stopS3 := setupS3Client(t) + idx, stopIdx := frac.NewActiveIndexer(1, 1) cache := NewCacheMaintainer(uint64(units.MB), uint64(units.MB), nil) - provider := newFractionProvider(cfg, nil, cache, rl, idx) - return provider, idx.Stop + provider := newFractionProvider(cfg, s3cli, cache, rl, idx) + return provider, func() { + stopIdx() + stopS3() + } } func TestFractionID(t *testing.T) { diff --git a/fracmanager/fraction_registry.go b/fracmanager/fraction_registry.go new file mode 100644 index 00000000..9178dcf1 --- /dev/null +++ b/fracmanager/fraction_registry.go @@ -0,0 +1,385 @@ +package fracmanager + +import ( + "errors" + "fmt" + "sync" + "time" + + "github.com/ozontech/seq-db/frac" +) + +// fractionRegistry manages fraction queues at different lifecycle stages. 
+// Tracks fractions through different stages: active → sealing → local → offloading → remote +// Ensures correct state transitions while maintaining chronological order. +// The entire structure is thread-safe due to internal synchronization. +// Lifecycle: Created once, persists through application lifetime. +type fractionRegistry struct { + mu sync.RWMutex // Main mutex for protecting registry state + + // Lifecycle queues (FIFO order, oldest at lower indexes) + sealing []*activeProxy // Fractions being sealed (0-5 typical) + locals []*sealedProxy // Local sealed fractions (can be thousands) + offloading []*sealedProxy // Fractions being offloaded (0-5 typical) + remotes []*remoteProxy // Offloaded fractions (can be thousands) + + stats registryStats // Size statistics for monitoring + oldestTotal uint64 // Creation time of oldest fraction + oldestLocal uint64 // Creation time of oldest fraction + + muAll sync.RWMutex // Mutex specifically for all fractions list + active *activeProxy // Currently active writable fraction + all []frac.Fraction // All fractions in creation order (read-only view) +} + +// NewFractionRegistry creates and initializes a new fraction registry instance. +// Populates the registry with existing active, local and remote fractions. +// Rebuilds the complete fractions list in chronological order. 
+func NewFractionRegistry(active *frac.Active, locals []*frac.Sealed, remotes []*frac.Remote) (*fractionRegistry, error) { + if active == nil { + return nil, errors.New("active fraction must be specified") + } + + // Set current active fraction + r := fractionRegistry{ + active: &activeProxy{ + proxy: &fractionProxy{impl: active}, + instance: active, + }, + } + + // Initialize local sealed fractions + for _, sealed := range locals { + r.stats.locals.Add(sealed.Info()) + r.locals = append(r.locals, &sealedProxy{ + proxy: &fractionProxy{impl: sealed}, + instance: sealed, + }) + } + + // Initialize remote fractions + for _, remote := range remotes { + r.stats.remotes.Add(remote.Info()) + r.remotes = append(r.remotes, &remoteProxy{ + proxy: &fractionProxy{impl: remote}, + instance: remote, + }) + } + + // Init oldest local value + r.updateOldestLocal() + + // Rebuild complete fractions list in order + r.rebuildAllFractions() + + return &r, nil +} + +// Active returns the currently active writable fraction. +func (r *fractionRegistry) Active() *activeProxy { + r.muAll.RLock() + defer r.muAll.RUnlock() + return r.active +} + +// AllFractions returns a read-only view of all fractions in creation order. +func (r *fractionRegistry) AllFractions() []frac.Fraction { + r.muAll.RLock() + defer r.muAll.RUnlock() + return r.all +} + +// Stats returns current size statistics of the registry. +func (r *fractionRegistry) Stats() registryStats { + r.mu.RLock() + defer r.mu.RUnlock() + return r.stats +} + +// OldestTotal returns the creation time of the oldest fraction in the registry. +func (r *fractionRegistry) OldestTotal() uint64 { + r.muAll.RLock() + defer r.muAll.RUnlock() + return r.oldestTotal +} + +// OldestLocal returns the creation time of the oldest local fraction in the registry. +func (r *fractionRegistry) OldestLocal() uint64 { + r.mu.RLock() + defer r.mu.RUnlock() + return r.oldestLocal +} + +// RotateIfFull completes the current active fraction and starts a new one. 
+// Moves previous active fraction to sealing queue. +// Updates statistics and maintains chronological order. +// Should be called when creating a new fraction. +func (r *fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *activeProxy) (*activeProxy, func(), error) { + r.mu.Lock() + defer r.mu.Unlock() + + if r.active.instance.Info().DocsOnDisk <= maxSize { + return nil, nil, nil + } + + old := r.active + r.sealing = append(r.sealing, old) + r.addActive(newActive()) + + if err := old.Finalize(); err != nil { + return old, nil, err + } + + curInfo := old.instance.Info() + r.stats.sealing.Add(curInfo) + + wg := sync.WaitGroup{} + wg.Add(1) + // since old.WaitWriteIdle() can take some time, we don't want to do it under the lock + // we will do it asynchronously in a goroutine. + go func() { + defer wg.Done() + + old.WaitWriteIdle() // can be long enough + finalInfo := old.instance.Info() + + r.mu.Lock() + defer r.mu.Unlock() + + // curInfo and finalInfo differ because while we are waiting for old.WaitWriteIdle(), + // the latest data is being written to the active fraction index. + r.stats.sealing.Sub(curInfo) + r.stats.sealing.Add(finalInfo) + }() + + return old, wg.Wait, nil +} + +// addActive sets a new active fraction and updates the complete fractions list. +func (r *fractionRegistry) addActive(a *activeProxy) { + r.muAll.Lock() + defer r.muAll.Unlock() + + r.active = a + r.all = append(r.all, a.proxy) +} + +// trimAll removes the oldest fractions from the complete fractions list. +// Used when fractions are evicted or deleted from the system. +func (r *fractionRegistry) trimAll(count int) { + r.muAll.Lock() + defer r.muAll.Unlock() + + r.all = r.all[count:] + r.updateOldestTotal() +} + +// EvictLocal removes oldest local fractions to free disk space. +// If shouldOffload is true, moves fractions to offloading queue instead of deleting. +// Returns evicted fractions or error if insufficient space is released. 
+func (r *fractionRegistry) EvictLocal(shouldOffload bool, sizeLimit uint64) ([]*sealedProxy, error) { + r.mu.Lock() + defer r.mu.Unlock() + + var ( + count int + releasingSize uint64 + ) + + // Calculate total used disk space + totalUsedSize := r.stats.locals.totalSizeOnDisk + + r.stats.sealing.totalSizeOnDisk + + r.active.instance.Info().FullSize() + + // Determine how many oldest fractions need to be removed to meet size limit + for _, item := range r.locals { + if totalUsedSize-releasingSize <= sizeLimit { + break + } + info := item.instance.Info() + releasingSize += info.FullSize() + r.stats.locals.Sub(info) + count++ + } + + // Check if enough space will be freed + if totalUsedSize-releasingSize > sizeLimit { + return nil, fmt.Errorf("insufficient space released: need to free %d more bytes "+ + "(total: %d, releasing: %d, limit: %d)", + (totalUsedSize-releasingSize)-sizeLimit, totalUsedSize, releasingSize, sizeLimit) + } + + // Extract fractions to evict + evicted := r.locals[:count] + r.locals = r.locals[count:] + + // Either offload or completely remove the fractions + if shouldOffload { + for _, item := range evicted { + r.offloading = append(r.offloading, item) + r.stats.offloading.Add(item.instance.Info()) + } + } else { + r.trimAll(count) // Permanently remove + r.updateOldestLocal() // Oldest local can be changed here + } + + return evicted, nil +} + +// EvictRemote removes oldest remote fractions based on retention policy. +// Fractions older than retention period are permanently deleted. +// Returns removed fractions or empty slice if nothing to remove. 
+func (r *fractionRegistry) EvictRemote(retention time.Duration) []*remoteProxy { + r.mu.Lock() + defer r.mu.Unlock() + + count := 0 + // Find fractions older than retention period + for _, item := range r.remotes { + info := item.instance.Info() + if time.Since(time.UnixMilli(int64(info.CreationTime))) <= retention { + break // Stop at first fraction within retention + } + r.stats.remotes.Sub(info) + count++ + } + + evicted := r.remotes[:count] + r.remotes = r.remotes[count:] + r.trimAll(count) // Remove from complete list + + return evicted +} + +// PromoteToLocal moves fractions from sealing to local queue when sealing completes. +// Maintains strict ordering - younger fractions wait for older ones to seal first. +func (r *fractionRegistry) PromoteToLocal(active *activeProxy, sealed *frac.Sealed) { + r.mu.Lock() + defer r.mu.Unlock() + + active.sealed = sealed + + promotedCount := 0 + // Process sealing queue in order, promoting completed fractions + for _, item := range r.sealing { + if item.sealed == nil { + break // Maintain order - wait for previous fractions to complete + } + promotedCount++ + r.locals = append(r.locals, &sealedProxy{ + proxy: item.proxy, + instance: item.sealed, + }) + r.stats.locals.Add(item.sealed.Info()) + r.stats.sealing.Sub(item.instance.Info()) + } + + // Remove promoted fractions from sealing queue + r.sealing = r.sealing[promotedCount:] +} + +// PromoteToRemote moves fractions from offloading to remote queue when offloading completes. +// Special case: Handles fractions that don't require offloading (remote == nil). +// Maintains strict ordering - younger fractions wait for older ones to offload. 
+func (r *fractionRegistry) PromoteToRemote(sealed *sealedProxy, remote *frac.Remote) { + r.mu.Lock() + defer r.mu.Unlock() + + sealed.remote = remote + + // Special case: remote == nil means fraction doesn't require offloading + if remote == nil { + r.removeFromOffloading(sealed) + } + + promotedCount := 0 + // Process offloading queue in order, promoting completed fractions + for _, item := range r.offloading { + if item.remote == nil { + break // Maintain order - wait for previous fractions to complete + } + promotedCount++ + r.remotes = append(r.remotes, &remoteProxy{ + proxy: item.proxy, + instance: item.remote, + }) + + r.stats.remotes.Add(item.remote.Info()) + r.stats.offloading.Sub(item.instance.Info()) + } + if promotedCount > 0 { + // Remove promoted fractions from offloading queue + r.offloading = r.offloading[promotedCount:] + r.updateOldestLocal() + } +} + +// removeFromOffloading removes a specific fraction from offloading queue. +// O(n) operation that rebuilds the all fractions list. +func (r *fractionRegistry) removeFromOffloading(sealed *sealedProxy) { + count := 0 + // Filter out the target fraction + for _, item := range r.offloading { + if sealed != item { + r.offloading[count] = item + count++ + } + } + r.offloading = r.offloading[:count] + r.stats.offloading.Sub(sealed.instance.Info()) + + // Oldest local can be changed here + r.updateOldestLocal() + + // Rebuild complete list since we modified the middle of the queue + r.rebuildAllFractions() +} + +// rebuildAllFractions reconstructs the all fractions list in correct chronological order. +// Order: remote (oldest) → offloading → local → sealing → active (newest) +// Expensive O(n) operation used when direct list modification is insufficient. 
+func (r *fractionRegistry) rebuildAllFractions() { + all := make([]frac.Fraction, 0, len(r.all)) + + // Collect fractions in correct chronological order: from oldest (remote) to newest (active) + for _, remote := range r.remotes { + all = append(all, remote.proxy) + } + for _, offloaded := range r.offloading { + all = append(all, offloaded.proxy) + } + for _, sealed := range r.locals { + all = append(all, sealed.proxy) + } + for _, active := range r.sealing { + all = append(all, active.proxy) + } + all = append(all, r.active.proxy) + + r.muAll.Lock() + defer r.muAll.Unlock() + + r.all = all + r.updateOldestTotal() +} + +// updateOldestTotal recalculates the creation time of the oldest fraction. +// Called after modifications of the complete fractions list. +func (r *fractionRegistry) updateOldestTotal() { + r.oldestTotal = r.all[0].Info().CreationTime +} + +// updateOldestLocal recalculates the creation time of the oldest local fraction. +// Called after modifications of the local fractions list. +func (r *fractionRegistry) updateOldestLocal() { + if len(r.offloading) > 0 { + r.oldestLocal = r.offloading[0].proxy.Info().CreationTime + } else if len(r.locals) > 0 { + r.oldestLocal = r.locals[0].proxy.Info().CreationTime + } else if len(r.sealing) > 0 { + r.oldestLocal = r.sealing[0].proxy.Info().CreationTime + } else { + r.oldestLocal = r.active.proxy.Info().CreationTime + } +} diff --git a/fracmanager/lifecycle_manager.go b/fracmanager/lifecycle_manager.go new file mode 100644 index 00000000..597a6cbf --- /dev/null +++ b/fracmanager/lifecycle_manager.go @@ -0,0 +1,209 @@ +package fracmanager + +import ( + "context" + "path/filepath" + "sync" + "time" + + "go.uber.org/zap" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/util" +) + +// lifecycleManager manages the complete lifecycle of fractions. +// Handles rotation, sealing, offloading, and cleanup operations. 
+// Lifecycle: Created once, coordinates all fraction state transitions. +type lifecycleManager struct { + infoCache *fracInfoCache // Fraction metadata cache + provider *fractionProvider // Provider for fraction operations + flags *StateManager // Storage state flags + registry *fractionRegistry // Fraction state registry + + sealingWg sync.WaitGroup +} + +func newLifecycleManager( + infoCache *fracInfoCache, + provider *fractionProvider, + flags *StateManager, + registry *fractionRegistry, +) *lifecycleManager { + return &lifecycleManager{ + infoCache: infoCache, + provider: provider, + flags: flags, + registry: registry, + } +} + +// Maintain performs periodic lifecycle management tasks. +// It is a CORE method of lifecycleManager +// Coordinates rotation, offloading, cleanup based on configuration. +func (lc *lifecycleManager) Maintain(ctx context.Context, config *Config, wg *sync.WaitGroup) { + lc.Rotate(config.FracSize, wg) + if config.OffloadingEnabled { + lc.OffloadLocal(ctx, config.TotalSize, wg) + lc.CleanRemote(config.OffloadingRetention, wg) + } else { + lc.CleanLocal(config.TotalSize, wg) + } + lc.UpdateOldestMetric() + lc.SyncInfoCache() +} + +func (lc *lifecycleManager) SyncInfoCache() { + if err := lc.infoCache.SyncWithDisk(); err != nil { + logger.Error("can't sync info-cache", zap.Error(err)) + } +} + +// Seal converts an active fraction to sealed state +// Freezes writes, waits for pending operations, then seals the fraction. 
+func (lc *lifecycleManager) Seal(active *activeProxy) error { + now := time.Now() + sealed, err := lc.provider.Seal(active.instance) + if err != nil { + return err + } + sealsTotal.Inc() + sealingTime := time.Since(now) + sealsDoneSeconds.Observe(sealingTime.Seconds()) + + logger.Info( + "fraction sealed", + zap.String("fraction", filepath.Base(sealed.BaseFileName)), + zap.Float64("time_spent_s", util.DurationToUnit(sealingTime, "s")), + ) + + lc.infoCache.Add(sealed.Info()) + lc.registry.PromoteToLocal(active, sealed) + active.proxy.Redirect(sealed) + active.instance.Release() + return nil +} + +// Rotate checks if active fraction needs rotation based on size limit +// Creates new active fraction and starts sealing the previous one. +func (lc *lifecycleManager) Rotate(maxSize uint64, wg *sync.WaitGroup) { + activeToSeal, waitBeforeSealing, err := lc.registry.RotateIfFull(maxSize, func() *activeProxy { + return newActiveProxy(lc.provider.CreateActive()) + }) + if err != nil { + logger.Fatal("active fraction rotation error", zap.Error(err)) + } + if activeToSeal == nil { + return + } + + wg.Add(1) + lc.sealingWg.Add(1) + go func() { + defer wg.Done() + defer lc.sealingWg.Done() + + waitBeforeSealing() + if err := lc.Seal(activeToSeal); err != nil { + logger.Fatal("sealing error", zap.Error(err)) + } + }() +} + +// OffloadLocal starts offloading of local fractions to remote storage +// Selects fractions based on disk space usage and retention policy. 
+func (lc *lifecycleManager) OffloadLocal(ctx context.Context, sizeLimit uint64, wg *sync.WaitGroup) { + toOffload, err := lc.registry.EvictLocal(true, sizeLimit) + if err != nil { + logger.Fatal("error releasing old fractions:", zap.Error(err)) + } + for _, sealed := range toOffload { + wg.Add(1) + go func() { + defer wg.Done() + + remote, _ := lc.TryOffload(ctx, sealed.instance) + lc.registry.PromoteToRemote(sealed, remote) + + if remote == nil { + sealed.proxy.Redirect(emptyFraction{}) + lc.infoCache.Remove(sealed.instance.Info().Name()) + } else { + sealed.proxy.Redirect(remote) + } + + // Free up local resources + sealed.instance.Suicide() + maintenanceTruncateTotal.Add(1) + }() + } +} + +// TryOffload performs a single offload attempt and records metrics +// Measures offloading duration and tracks success/failure statistics. +func (lc *lifecycleManager) TryOffload(ctx context.Context, sealed *frac.Sealed) (*frac.Remote, error) { + now := time.Now() + remote, err := lc.provider.Offload(ctx, sealed) + offloadingDuration := time.Since(now).Seconds() + + if err != nil { + offloadingTotal.WithLabelValues("failure").Inc() + offloadingDurationSeconds.Observe(float64(offloadingDuration)) + return nil, err + } + + if remote != nil { + offloadingTotal.WithLabelValues("success").Inc() + offloadingDurationSeconds.Observe(float64(offloadingDuration)) + } + + return remote, nil +} + +// CleanRemote deletes outdated remote fractions based on retention policy +func (lc *lifecycleManager) CleanRemote(retention time.Duration, wg *sync.WaitGroup) { + if retention == 0 { + return + } + toDelete := lc.registry.EvictRemote(retention) + wg.Add(1) + go func() { + defer wg.Done() + for _, remote := range toDelete { + remote.proxy.Redirect(emptyFraction{}) + lc.infoCache.Remove(remote.instance.Info().Name()) + remote.instance.Suicide() + } + }() +} + +// CleanLocal deletes outdated local fractions when offloading is disabled +func (lc *lifecycleManager) CleanLocal(sizeLimit uint64, wg 
*sync.WaitGroup) { + toDelete, err := lc.registry.EvictLocal(false, sizeLimit) + if err != nil { + logger.Fatal("error releasing old fractions:", zap.Error(err)) + } + if len(toDelete) > 0 && !lc.flags.IsCapacityExceeded() { + if err := lc.flags.setCapacityExceeded(true); err != nil { + logger.Fatal("can't set capacity_exceeded flag", zap.Error(err)) + } + } + + wg.Add(1) + go func() { + defer wg.Done() + for _, sealed := range toDelete { + sealed.proxy.Redirect(emptyFraction{}) + lc.infoCache.Remove(sealed.instance.Info().Name()) + sealed.instance.Suicide() + maintenanceTruncateTotal.Add(1) + } + }() +} + +// UpdateOldestMetric updates the prometheus metric with oldest fraction timestamp +func (lc *lifecycleManager) UpdateOldestMetric() { + oldestFracTime.WithLabelValues("remote").Set((time.Duration(lc.registry.OldestTotal()) * time.Millisecond).Seconds()) + oldestFracTime.WithLabelValues("local").Set((time.Duration(lc.registry.OldestLocal()) * time.Millisecond).Seconds()) +} diff --git a/fracmanager/lifecycle_manager_test.go b/fracmanager/lifecycle_manager_test.go new file mode 100644 index 00000000..c277c98e --- /dev/null +++ b/fracmanager/lifecycle_manager_test.go @@ -0,0 +1,160 @@ +package fracmanager + +import ( + "math/rand" + "path/filepath" + "sync" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/ozontech/seq-db/consts" +) + +func setupLifecycle(t testing.TB, cfg *Config) (*lifecycleManager, func()) { + provider, tearDown := setupFractionProvider(t, cfg) + dataDir := provider.config.DataDir + infoCache := NewFracInfoCache(filepath.Join(dataDir, consts.FracCacheFileSuffix)) + + registry, err := NewFractionRegistry(provider.CreateActive(), nil, nil) + assert.NoError(t, err) + + storageState, err := NewStateManager(dataDir, defaultStorageState) + assert.NoError(t, err) + + lifecycle := newLifecycleManager(infoCache, provider, storageState, registry) + + return lifecycle, tearDown +} + +func TestFracInfoCache(t *testing.T) { + lc, tearDown 
:= setupLifecycle(t, nil) + defer tearDown() + + var total uint64 + + fillRotateAndCheck := func(names map[string]struct{}) { + active := lc.registry.Active() + appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + + wg := sync.WaitGroup{} + lc.Rotate(0, &wg) + wg.Wait() + + info := active.proxy.Info() + _, ok := lc.infoCache.Get(info.Name()) + assert.True(t, ok) + + total += info.FullSize() + names[info.Name()] = struct{}{} + } + + first := map[string]struct{}{} + for range 10 { + fillRotateAndCheck(first) + } + halfSize := total + + second := map[string]struct{}{} + for range 10 { + fillRotateAndCheck(second) + } + + wg := sync.WaitGroup{} + lc.CleanLocal(total-halfSize, &wg) + wg.Wait() + + for n := range first { + _, ok := lc.infoCache.Get(n) + assert.False(t, ok, "expect the first part to be deleted") + } + + for n := range second { + _, ok := lc.infoCache.Get(n) + assert.True(t, ok, "expect the second part to still be present") + } +} + +func TestCapacityExceeded(t *testing.T) { + lc, tearDown := setupLifecycle(t, nil) + defer tearDown() + + const fracsCount = 10 + var total uint64 + + fillAndRotate := func() { + active := lc.registry.Active() + appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + + wg := sync.WaitGroup{} + lc.Rotate(0, &wg) + wg.Wait() + + info := active.proxy.Info() + total += info.FullSize() + } + + assert.False(t, lc.flags.IsCapacityExceeded(), "expect data dir is empty") + + // make some fracs + for range fracsCount { + fillAndRotate() + } + assert.False(t, lc.flags.IsCapacityExceeded(), "there should be no deletions and the flag is false") + + wg := sync.WaitGroup{} + lc.CleanLocal(total, &wg) + wg.Wait() + + assert.Equal(t, fracsCount, lc.registry.Stats().locals.count, "as much as was added, so much should be") + assert.False(t, lc.flags.IsCapacityExceeded(), "there should still be no deletions, and the flag is false") + + lc.CleanLocal(total-1, &wg) + wg.Wait() + + assert.Equal(t, fracsCount-1, 
lc.registry.Stats().locals.count, "expect one less") + assert.True(t, lc.flags.IsCapacityExceeded(), "the flag must be true now") +} + +func TestOldestMetrics(t *testing.T) { + lc, tearDown := setupLifecycle(t, nil) + defer tearDown() + + const fracsCount = 10 + var total uint64 + + fillAndRotate := func() { + active := lc.registry.Active() + appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + wg := sync.WaitGroup{} + lc.Rotate(0, &wg) + wg.Wait() + + info := active.proxy.Info() + total += info.FullSize() + } + + firstFracTime := lc.registry.Active().proxy.Info().CreationTime + for range fracsCount { + fillAndRotate() + } + + // Check state after initial rotations + assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should point to the very first fraction when all data is local") + assert.Equal(t, firstFracTime, lc.registry.OldestLocal(), "should point to the first fraction when nothing is offloaded") + + halfSize := total + halfwayFracTime := lc.registry.Active().proxy.Info().CreationTime + for range fracsCount { + fillAndRotate() + } + + wg := sync.WaitGroup{} + lc.OffloadLocal(t.Context(), total-halfSize, &wg) + wg.Wait() + + // Check state after offloading + assert.NotEqual(t, firstFracTime, halfwayFracTime, "expect different creation times") + assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should still reference the first fraction after offload") + assert.Equal(t, halfwayFracTime, lc.registry.OldestLocal(), "should point to the oldest remaining local fraction after offload") +} diff --git a/fracmanager/loader.go b/fracmanager/loader.go index 41322bbf..6eb788ee 100644 --- a/fracmanager/loader.go +++ b/fracmanager/loader.go @@ -38,17 +38,17 @@ func NewLoader(config *Config, provider *fractionProvider, infoCache *fracInfoCa // Load is the main method for loading all fractions // Coordinates the entire process: discovery, validation, recovery, and ordering -func (l *Loader) Load(ctx context.Context) (*frac.Active, []*frac.Sealed, 
[]*frac.Remote, error) { +func (l *Loader) Load(ctx context.Context) (*fractionRegistry, error) { // Stage 1: Discover all fractions in filesystem actives, locals, remotes, err := l.discover(ctx) if err != nil { - return nil, nil, nil, err + return nil, err } // Stage 2: Replay active fractions and seal them active, sealed, err := l.replayAndSeal(ctx, actives) if err != nil { - return nil, nil, nil, err + return nil, err } // Stage 3: Create new active fraction if no existing ones @@ -58,7 +58,7 @@ func (l *Loader) Load(ctx context.Context) (*frac.Active, []*frac.Sealed, []*fra // Stage 4: Combine all local fractions locals = append(locals, sealed...) - return active, locals, remotes, nil + return NewFractionRegistry(active, locals, remotes) } // replayAndSeal replays active fractions and seals old ones diff --git a/fracmanager/loader_test.go b/fracmanager/loader_test.go index de92ad19..57d054f6 100644 --- a/fracmanager/loader_test.go +++ b/fracmanager/loader_test.go @@ -8,7 +8,6 @@ import ( "testing" "time" - insaneJSON "github.com/ozontech/insane-json" "github.com/stretchr/testify/assert" "github.com/ozontech/seq-db/consts" @@ -20,18 +19,17 @@ import ( func setupLoaderTest(t testing.TB, cfg *Config) (*fractionProvider, *Loader, func()) { fp, tearDown := setupFractionProvider(t, cfg) + cfg = fp.config ic := NewFracInfoCache(filepath.Join(cfg.DataDir, consts.FracCacheFileSuffix)) loader := NewLoader(cfg, fp, ic) return fp, loader, tearDown } -func appendDocs(t *testing.T, active *frac.Active, docCount int) { +func appendDocsToActive(t testing.TB, active *frac.Active, docCount int) { dp := indexer.NewTestDocProvider() - for i := 0; i < docCount; i++ { + for i := 1; i <= docCount; i++ { doc := []byte("{\"timestamp\": 0, \"message\": \"msg\"}") - docRoot, err := insaneJSON.DecodeBytes(doc) - assert.NoError(t, err) - dp.Append(doc, docRoot, seq.SimpleID(i), "service:100500", "k8s_pod", "_all_:") + dp.Append(doc, seq.SimpleID(i), "service:100500", "k8s_pod", "_all_:") } 
docs, metas := dp.Provide() @@ -53,7 +51,7 @@ func TestReplayWithEmptyActive(t *testing.T) { actives := make([]*frac.Active, 0, fracCount) for i := 0; i < fracCount; i++ { active := fp.CreateActive() - appendDocs(t, active, 500+rand.Intn(100)) + appendDocsToActive(t, active, 500+rand.Intn(100)) actives = append(actives, active) } actives = append(actives, fp.CreateActive()) // last active frac is now empty @@ -85,7 +83,7 @@ func TestReplayWithMultipleEmpty(t *testing.T) { for i := 0; i < fracCount; i++ { active := fp.CreateActive() if i%3 == 0 { - appendDocs(t, active, 500+rand.Intn(100)) + appendDocsToActive(t, active, 500+rand.Intn(100)) nonEmpty = append(nonEmpty, active.Info()) } actives = append(actives, active) @@ -116,11 +114,11 @@ func TestReplayMultiple(t *testing.T) { actives := make([]*frac.Active, 0, fracCount) for i := 0; i < fracCount; i++ { active := fp.CreateActive() - appendDocs(t, active, 500+rand.Intn(100)) + appendDocsToActive(t, active, 500+rand.Intn(100)) actives = append(actives, active) } active := fp.CreateActive() - appendDocs(t, active, 5) + appendDocsToActive(t, active, 5) actives = append(actives, active) // replay and seal @@ -166,7 +164,7 @@ func TestReplayContextCancel(t *testing.T) { actives := make([]*frac.Active, 0, fracCount) for i := 0; i < fracCount; i++ { active := fp.CreateActive() - appendDocs(t, active, 500+rand.Intn(100)) + appendDocsToActive(t, active, 500+rand.Intn(100)) actives = append(actives, active) } actives = append(actives, fp.CreateActive()) @@ -189,7 +187,7 @@ func TestReplaySingleNonEmpty(t *testing.T) { // fill data actives := []*frac.Active{fp.CreateActive()} - appendDocs(t, actives[0], 500+rand.Intn(100)) + appendDocsToActive(t, actives[0], 500+rand.Intn(100)) // replay and seal active, sealed, err := loader.replayAndSeal(t.Context(), actives) @@ -199,3 +197,79 @@ func TestReplaySingleNonEmpty(t *testing.T) { assert.Equal(t, active.Info().Name(), actives[0].Info().Name(), "should have the same name") 
assert.Equal(t, active.Info().DocsTotal, actives[0].Info().DocsTotal, "should have the same doc count for replayed frac") } + +func TestDiscover(t *testing.T) { + const fracCount = 16 + + // setup + fp, loader, tearDown := setupLoaderTest(t, nil) + defer tearDown() + + // make some sealed fracs + expectedSealed := map[string]*frac.Sealed{} + for range fracCount { + a := fp.CreateActive() + appendDocsToActive(t, a, 10+rand.Intn(10)) + s, err := fp.Seal(a) + assert.NoError(t, err) + expectedSealed[s.Info().Name()] = s + } + + // make half sealed fracs remote + expectedRemote := map[string]*frac.Remote{} + for n, s := range expectedSealed { + if rand.Intn(2) != 0 { + continue + } + r, err := fp.Offload(t.Context(), s) + assert.NoError(t, err) + expectedRemote[n] = r + s.Suicide() + delete(expectedSealed, n) + } + + // make half sealed fracs deleted + for n, s := range expectedSealed { + if rand.Intn(2) != 0 { + continue + } + s.Suicide() + delete(expectedSealed, n) + } + + // make half remote fracs deleted + for n, r := range expectedRemote { + if rand.Intn(2) != 0 { + continue + } + r.Suicide() + delete(expectedRemote, n) + } + + // make active + a := fp.CreateActive() + appendDocsToActive(t, a, 10+rand.Intn(10)) + + // discover from FS + actives, locals, remotes, err := loader.discover(t.Context()) + assert.NoError(t, err) + + // checks + for _, s := range locals { + n := s.Info().Name() + _, ok := expectedSealed[n] + delete(expectedSealed, n) + assert.True(t, ok, "not deleted sealed should be discovered") + } + for _, s := range remotes { + n := s.Info().Name() + _, ok := expectedRemote[n] + delete(expectedRemote, n) + assert.True(t, ok, "not deleted remote should be discovered %s", n) + } + + assert.Equal(t, 1, len(actives), "only one active should be discovered") + assert.Equal(t, a.BaseFileName, actives[0].BaseFileName, "must be the same name") + assert.Empty(t, expectedSealed, "we don't expect any more sealed fractions") + assert.Empty(t, expectedRemote, "we 
don't expect any more remote fractions") +} diff --git a/fracmanager/proxy_frac.go b/fracmanager/proxy_frac.go index 11e9fc85..e30dc1a0 100644 --- a/fracmanager/proxy_frac.go +++ b/fracmanager/proxy_frac.go @@ -3,7 +3,7 @@ package fracmanager import ( "context" "errors" - "fmt" + "math" "sync" "time" @@ -15,222 +15,160 @@ import ( "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric" "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/util" ) -var ErrSealingFractionSuicided = errors.New("sealing fraction is suicided") - -/** - * Possible states (only 4): - * -------------------------------------------------------- - * | | f.active | f.sealed | f.readonly | - * -------------------------------------------------------- - * | Active & Writable | value | nil | false | - * -------------------------------------------------------- - * | Sealing | value | nil | true | - * -------------------------------------------------------- - * | Sealed | nil | value | true | - * -------------------------------------------------------- - * | Suicided | nil | nil | true | - * -------------------------------------------------------- - * All other states are impossible. 
- */ - -type proxyFrac struct { - fp *fractionProvider - - useMu sync.RWMutex - active *frac.Active - sealed *frac.Sealed - readonly bool - - name string - - indexWg sync.WaitGroup - sealWg sync.WaitGroup -} - -func newProxyFrac(active *frac.Active, fp *fractionProvider) *proxyFrac { - return &proxyFrac{ - fp: fp, - active: active, - name: active.BaseFileName, - } -} - -func (f *proxyFrac) cur() frac.Fraction { - f.useMu.RLock() - defer f.useMu.RUnlock() - - if f.active != nil { - return f.active - } +var ( + _ frac.Fraction = (*fractionProxy)(nil) + _ frac.Fraction = (*emptyFraction)(nil) - if f.sealed != nil { - metric.CountersTotal.WithLabelValues("use_sealed_from_active").Inc() - return f.sealed - } + ErrFractionNotWritable = errors.New("fraction is not writable") +) - metric.CountersTotal.WithLabelValues("use_empty_from_active").Inc() - return frac.EmptyFraction +// fractionProxy provides thread-safe access to a fraction with atomic replacement +// Used to switch fraction implementations (active → sealed → remote) without blocking readers. +// Lifecycle: Created for each fraction, persists through state transitions. 
+type fractionProxy struct { + mu sync.RWMutex + impl frac.Fraction // Current fraction implementation } -func (f *proxyFrac) IsIntersecting(from, to seq.MID) bool { - return f.cur().IsIntersecting(from, to) +func (p *fractionProxy) Redirect(f frac.Fraction) { + p.mu.Lock() + defer p.mu.Unlock() + p.impl = f } -func (f *proxyFrac) Contains(mid seq.MID) bool { - return f.cur().Contains(mid) +func (p *fractionProxy) Info() *common.Info { + p.mu.RLock() + defer p.mu.RUnlock() + return p.impl.Info() } -func (f *proxyFrac) Info() *common.Info { - return f.cur().Info() +func (p *fractionProxy) IsIntersecting(from, to seq.MID) bool { + p.mu.RLock() + defer p.mu.RUnlock() + return p.impl.IsIntersecting(from, to) } -func (f *proxyFrac) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - return f.cur().Fetch(ctx, ids) +func (p *fractionProxy) Contains(mid seq.MID) bool { + p.mu.RLock() + defer p.mu.RUnlock() + return p.impl.Contains(mid) } -func (f *proxyFrac) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - return f.cur().Search(ctx, params) +func (p *fractionProxy) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { + p.mu.RLock() + defer p.mu.RUnlock() + return p.impl.Fetch(ctx, ids) } -func (f *proxyFrac) Append(docs, meta []byte) error { - f.useMu.RLock() - if !f.isActiveState() { - f.useMu.RUnlock() - return errors.New("fraction is not writable") - } - active := f.active - f.indexWg.Add(1) // It's important to put wg.Add() inside a lock, otherwise we might call WaitWriteIdle() before it - f.useMu.RUnlock() - - return active.Append(docs, meta, &f.indexWg) +func (p *fractionProxy) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { + p.mu.RLock() + defer p.mu.RUnlock() + return p.impl.Search(ctx, params) } -func (f *proxyFrac) WaitWriteIdle() { - start := time.Now() - logger.Info("waiting fraction to stop write...", zap.String("name", f.name)) - f.indexWg.Wait() - waitTime := 
util.DurationToUnit(time.Since(start), "s") - logger.Info("write is stopped", zap.String("name", f.name), zap.Float64("time_wait_s", waitTime)) -} +// activeProxy manages an active (writable) fraction +// Tracks pending write operations and provides freeze capability. +// Lifecycle: Created when fraction becomes active, destroyed after sealing. +type activeProxy struct { + proxy *fractionProxy // Thread-safe fraction access + instance *frac.Active // Actual active fraction instance + sealed *frac.Sealed // Sealed version (set after sealing) -func (f *proxyFrac) Seal() (*frac.Sealed, error) { - f.useMu.Lock() - if f.isSuicidedState() { - f.useMu.Unlock() - return nil, ErrSealingFractionSuicided - } - - if !f.isActiveState() { - f.useMu.Unlock() - return nil, errors.New("sealing fraction is not active") - } + mu sync.RWMutex // Protects readonly state + wg sync.WaitGroup // Tracks pending write operations - f.readonly = true - active := f.active - - f.sealWg.Add(1) // It's important to put wg.Add() inside a lock, otherwise we might call wg.Wait() before it - f.useMu.Unlock() - - f.WaitWriteIdle() - - sealed, err := f.fp.Seal(active) - if err != nil { - return nil, err - } - - f.useMu.Lock() - f.sealed = sealed - f.active = nil - f.useMu.Unlock() - - f.sealWg.Done() - - active.Release() - - return sealed, nil + finalized bool // Whether fraction is frozen for writes } -// trySetSuicided set suicided state if possible (if not sealing right now) -func (f *proxyFrac) trySetSuicided() (*frac.Active, *frac.Sealed, bool) { - f.useMu.Lock() - defer f.useMu.Unlock() - - sealed := f.sealed - active := f.active - - // We must compute `isSealing` before - // we change fraction to read-only. 
- isSealing := f.isSealingState() - - // If the object is in active state, switch to read-only mode - if f.isActiveState() { - f.readonly = true +func newActiveProxy(active *frac.Active) *activeProxy { + return &activeProxy{ + proxy: &fractionProxy{impl: active}, + instance: active, } +} - // If sealing is not in progress, we can safely clear the state - if !isSealing { - f.sealed = nil - f.active = nil +// Append adds documents to the active fraction +func (p *activeProxy) Append(docs, meta []byte) error { + p.mu.RLock() + if p.finalized { + p.mu.RUnlock() + return ErrFractionNotWritable } + p.wg.Add(1) // Important: wg.Add() inside lock to prevent race with WaitWriteIdle() + p.mu.RUnlock() - return active, sealed, isSealing + return p.instance.Append(docs, meta, &p.wg) } -func (f *proxyFrac) Offload(ctx context.Context, u storage.Uploader) (bool, error) { - f.useMu.RLock() - - if f.isSealingState() { - f.useMu.RUnlock() - f.sealWg.Wait() - - if c := f.cur(); c != nil { - return c.Offload(ctx, u) - } +// WaitWriteIdle waits for all pending write operations to complete +// Used before sealing to ensure data consistency. +func (p *activeProxy) WaitWriteIdle() { + start := time.Now() + logger.Info("waiting fraction to stop write...", zap.String("name", p.instance.BaseFileName)) + p.wg.Wait() + waitTime := util.DurationToUnit(time.Since(start), "s") + logger.Info("write is stopped", + zap.String("name", p.instance.BaseFileName), + zap.Float64("time_wait_s", waitTime)) +} - return false, nil +// Finalize marks the fraction as read-only and prevents new writes from starting after finalize. 
+func (p *activeProxy) Finalize() error { + p.mu.Lock() + if p.finalized { + p.mu.Unlock() + return errors.New("fraction is already finalized") } + p.finalized = true + p.mu.Unlock() - f.useMu.RUnlock() - return f.cur().Offload(ctx, u) + return nil } -func (f *proxyFrac) Suicide() { - active, sealed, isSealing := f.trySetSuicided() +// sealedProxy represents a sealed fraction that may be offloaded +// Tracks both local sealed instance and remote version if offloaded. +type sealedProxy struct { + proxy *fractionProxy // Thread-safe fraction access + instance *frac.Sealed // Local sealed fraction + remote *frac.Remote // Remote version (if offloaded) +} - if isSealing { - f.sealWg.Wait() - // we can get `sealing` == true only once here - // next attempt after Wait() should be successful - active, sealed, _ = f.trySetSuicided() - } +// remoteProxy represents an offloaded fraction +type remoteProxy struct { + proxy *fractionProxy // Thread-safe fraction access + instance *frac.Remote // Remote fraction instance +} - if active != nil { - // Wait for write operations to complete before suiciding - f.WaitWriteIdle() - active.Suicide() - } +// emptyFraction represents a missing or deleted fraction +// Returns empty results for all operations. +// Used as placeholder when fraction is removed but references still exist. 
+type emptyFraction struct { +} - if sealed != nil { - sealed.Suicide() +func (emptyFraction) Info() *common.Info { + return &common.Info{ + Path: "empty", + From: math.MaxUint64, + To: 0, } } -func (f *proxyFrac) String() string { - return fmt.Sprintf("%s", f.cur()) +func (emptyFraction) IsIntersecting(_, _ seq.MID) bool { + return false } -func (f *proxyFrac) isActiveState() bool { - return f.active != nil && f.sealed == nil && !f.readonly +func (emptyFraction) Contains(mid seq.MID) bool { + return false } -func (f *proxyFrac) isSealingState() bool { - return f.active != nil && f.sealed == nil && f.readonly +func (emptyFraction) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { + return nil, nil } -func (f *proxyFrac) isSuicidedState() bool { - return f.active == nil && f.sealed == nil +func (emptyFraction) Search(_ context.Context, params processor.SearchParams) (*seq.QPR, error) { + metric.CountersTotal.WithLabelValues("empty_data_provider").Inc() + return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil } diff --git a/fracmanager/sealer_test.go b/fracmanager/sealer_test.go index d4251b16..f85c3f8f 100644 --- a/fracmanager/sealer_test.go +++ b/fracmanager/sealer_test.go @@ -13,7 +13,6 @@ import ( "time" "github.com/alecthomas/units" - insaneJSON "github.com/ozontech/insane-json" "github.com/pkg/profile" "github.com/stretchr/testify/assert" @@ -39,9 +38,6 @@ func TestMain(m *testing.M) { func fillActiveFraction(active *frac.Active) error { const muliplier = 10 - docRoot := insaneJSON.Spawn() - defer insaneJSON.Release(docRoot) - file, err := os.Open(filepath.Join(testscommon.TestDataDir, "k8s.logs")) if err != nil { return err @@ -62,12 +58,8 @@ func fillActiveFraction(active *frac.Active) error { for scanner.Scan() { k++ doc := scanner.Bytes() - if err := docRoot.DecodeBytes(doc); err != nil { - return err - } - id := seq.NewID(time.Now(), uint64(rand.Int63())) - dp.Append(doc, docRoot, id, + dp.Append(doc, id, "_all_:", 
"service:service"+strconv.Itoa(rand.Intn(200)), "k8s_pod1:"+strconv.Itoa(k%100000), diff --git a/fracmanager/searcher_test.go b/fracmanager/searcher_test.go index e584e9a1..016ec348 100644 --- a/fracmanager/searcher_test.go +++ b/fracmanager/searcher_test.go @@ -17,7 +17,6 @@ import ( ) type testFakeFrac struct { - frac.Empty info *common.Info qpr *seq.QPR searchesCount int diff --git a/indexer/test_doc_provider.go b/indexer/test_doc_provider.go index 316464d2..0af90dde 100644 --- a/indexer/test_doc_provider.go +++ b/indexer/test_doc_provider.go @@ -2,13 +2,8 @@ package indexer import ( "encoding/binary" - "math/rand" "strings" - "time" - insaneJSON "github.com/ozontech/insane-json" - - "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/tokenizer" @@ -45,14 +40,8 @@ func (dp *TestDocProvider) appendMeta(docLen int, id seq.ID, tokens []tokenizer. dp.Metas = append(dp.Metas, dp.buf...) } -func (dp *TestDocProvider) Append(doc []byte, docRoot *insaneJSON.Root, id seq.ID, tokensStr ...string) { +func (dp *TestDocProvider) Append(doc []byte, id seq.ID, tokensStr ...string) { tokens := stringsToTokens(tokensStr...) - if id.MID == 0 { - // this case runs only in the integration tests - t, _ := extractDocTimeForTest(docRoot) - id = seq.NewID(t, uint64(rand.Int63())) - } - dp.appendMeta(len(doc), id, tokens) dp.appendDoc(doc) } @@ -86,34 +75,6 @@ func encodeMeta(buf []byte, tokens []tokenizer.MetaToken, id seq.ID, size int) [ return md.MarshalBinaryTo(buf) } -// extractDocTimeForTest extracts timestamp from doc -// It searches by one of supported field name and parses by supported formats -// If no field was found or not parsable it returns time.Now() -func extractDocTimeForTest(docRoot *insaneJSON.Root) (time.Time, []string) { - var t time.Time - var f []string -top: - for _, field := range consts.TimeFields { - timeNode := docRoot.Dig(field...) 
- if timeNode == nil { - continue - } - timeVal := timeNode.AsString() - for _, format := range consts.TimeFormats { - if value, err := time.Parse(format, timeVal); err == nil { - t = value - f = field - break top - } - } - } - - if t.IsZero() { - t = time.Now() - } - return t, f -} - func stringsToTokens(tokens ...string) []tokenizer.MetaToken { r := make([]tokenizer.MetaToken, 0) for _, tokenStr := range tokens { diff --git a/storeapi/grpc_v1_test.go b/storeapi/grpc_v1_test.go index e8c69038..20688ee0 100644 --- a/storeapi/grpc_v1_test.go +++ b/storeapi/grpc_v1_test.go @@ -56,7 +56,7 @@ func makeBulkRequest(cnt int) *storeapi.BulkRequest { for i := 0; i < cnt; i++ { id := seq.SimpleID(i + 1) doc := []byte("document") - dp.Append(doc, nil, id, "_all_:", "service:100500", "k8s_pod:"+strconv.Itoa(i)) + dp.Append(doc, id, "_all_:", "service:100500", "k8s_pod:"+strconv.Itoa(i)) } req := &storeapi.BulkRequest{Count: int64(cnt)} req.Docs, req.Metas = dp.Provide() @@ -67,13 +67,12 @@ func getTestGrpc(t *testing.T) (*GrpcV1, func(), func()) { dataDir := common.GetTestTmpDir(t) common.RecreateDir(dataDir) - fm, err := fracmanager.New(t.Context(), &fracmanager.Config{ + fm, stop, err := fracmanager.New(t.Context(), &fracmanager.Config{ FracSize: 500, TotalSize: 5000, DataDir: dataDir, }, nil) assert.NoError(t, err) - fm.Start() config := APIConfig{ StoreMode: "", @@ -99,7 +98,7 @@ func getTestGrpc(t *testing.T) (*GrpcV1, func(), func()) { g := NewGrpcV1(config, fm, mappingProvider) release := func() { - fm.Stop() + stop() common.RemoveDir(dataDir) } diff --git a/storeapi/store.go b/storeapi/store.go index aa06f44f..857be4e2 100644 --- a/storeapi/store.go +++ b/storeapi/store.go @@ -26,7 +26,8 @@ type Store struct { grpcAddr string grpcServer *grpcServer - FracManager *fracmanager.FracManager + FracManager *fracmanager.FracManager + fracManagerStop func() isStopped atomic.Bool } @@ -51,19 +52,19 @@ func NewStore(ctx context.Context, c StoreConfig, s3cli *s3.Client, 
mappingProvi return nil, err } - fracManager, err := fracmanager.New(ctx, &c.FracManager, s3cli) + fracManager, stop, err := fracmanager.New(ctx, &c.FracManager, s3cli) if err != nil { return nil, fmt.Errorf("loading fractions error: %w", err) } - fracManager.Start() return &Store{ Config: c, // We will set grpcAddr later in Start() - grpcAddr: "", - grpcServer: newGRPCServer(c.API, fracManager, mappingProvider), - FracManager: fracManager, - isStopped: atomic.Bool{}, + grpcAddr: "", + grpcServer: newGRPCServer(c.API, fracManager, mappingProvider), + FracManager: fracManager, + fracManagerStop: stop, + isStopped: atomic.Bool{}, }, nil } @@ -86,8 +87,7 @@ func (s *Store) Stop() { defer cancel() s.grpcServer.Stop(ctx) - - s.FracManager.Stop() + s.fracManagerStop() logger.Info("store stopped") } From 2e2c673d8772aa6b0b69ff1545fb06261277f1b7 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Fri, 21 Nov 2025 18:19:17 +0300 Subject: [PATCH 02/28] fix: return data provider release --- frac/active.go | 12 ++++++++++-- frac/remote.go | 5 ++++- frac/sealed.go | 10 ++++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/frac/active.go b/frac/active.go index 6485c9c4..984a0e75 100644 --- a/frac/active.go +++ b/frac/active.go @@ -268,7 +268,11 @@ func (f *Active) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { if f.Info().DocsTotal == 0 { // it is empty active fraction state return nil, nil } - return f.createDataProvider(ctx).Fetch(ids) + + dp := f.createDataProvider(ctx) + defer dp.release() + + return dp.Fetch(ids) } func (f *Active) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { @@ -276,7 +280,11 @@ func (f *Active) Search(ctx context.Context, params processor.SearchParams) (*se metric.CountersTotal.WithLabelValues("empty_data_provider").Inc() return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil } - return f.createDataProvider(ctx).Search(params) + + dp := f.createDataProvider(ctx) 
+ defer dp.release() + + return dp.Search(params) } func (f *Active) createDataProvider(ctx context.Context) *activeDataProvider { diff --git a/frac/remote.go b/frac/remote.go index 13f25f17..c2088caa 100644 --- a/frac/remote.go +++ b/frac/remote.go @@ -115,8 +115,9 @@ func (f *Remote) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { dp, err := f.createDataProvider(ctx) if err != nil { return nil, err - } + defer dp.release() + return dp.Fetch(ids) } @@ -125,6 +126,8 @@ func (f *Remote) Search(ctx context.Context, params processor.SearchParams) (*se if err != nil { return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, err } + defer dp.release() + return dp.Search(params) } diff --git a/frac/sealed.go b/frac/sealed.go index 38657be0..b3de2e82 100644 --- a/frac/sealed.go +++ b/frac/sealed.go @@ -300,11 +300,17 @@ func (f *Sealed) String() string { } func (f *Sealed) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - return f.createDataProvider(ctx).Fetch(ids) + dp := f.createDataProvider(ctx) + defer dp.release() + + return dp.Fetch(ids) } func (f *Sealed) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - return f.createDataProvider(ctx).Search(params) + dp := f.createDataProvider(ctx) + defer dp.release() + + return dp.Search(params) } func (f *Sealed) createDataProvider(ctx context.Context) *sealedDataProvider { From 2730b2a73a9d80ef596e4283a2e90af67112ac55 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Sat, 6 Dec 2025 16:48:21 +0300 Subject: [PATCH 03/28] feat(frac): new active fraction implementation based in mini-indexes --- asyncsearcher/async_searcher_test.go | 7 +- cmd/distribution/main.go | 6 +- cmd/seq-db/seq-db.go | 3 +- frac/{ => active}/active.go | 59 ++-- .../docs_positions.go} | 2 +- frac/{active_ids.go => active/ids.go} | 2 +- .../ids_test.go} | 2 +- frac/{active_index.go => active/index.go} | 74 ++-- frac/{active_indexer.go => active/indexer.go} | 19 +- 
.../indexer_test.go} | 9 +- frac/{ => active}/inverser.go | 2 +- frac/{ => active}/meta_data_collector.go | 2 +- .../sealing_source.go} | 46 +-- frac/{active_lids.go => active/token_lids.go} | 2 +- .../token_list.go} | 52 +-- frac/active/writer.go | 44 +++ frac/active2/active2.go | 328 ++++++++++++++++++ frac/active2/data_provider.go | 138 ++++++++ frac/active2/indexer.go | 304 ++++++++++++++++ frac/active2/indexer_allocator.go | 131 +++++++ frac/active2/indexer_test.go | 119 +++++++ frac/active2/mem_index.go | 50 +++ frac/active2/mem_index_pool.go | 103 ++++++ frac/active2/merge.go | 209 +++++++++++ frac/active2/merge2.go | 211 +++++++++++ frac/active2/merge_iterator.go | 97 ++++++ frac/active2/merge_manager.go | 150 ++++++++ frac/active2/merge_strategy.go | 178 ++++++++++ frac/active2/metrics.go | 42 +++ frac/active2/sealing_source.go | 121 +++++++ frac/active2/tiers.go | 80 +++++ frac/active_writer.go | 44 --- frac/fraction.go | 9 +- frac/{common => }/info.go | 2 +- frac/{common => }/seal_params.go | 2 +- frac/sealed/block_info.go | 6 +- frac/{sealed_index.go => sealed/index.go} | 12 +- frac/{ => sealed}/index_cache.go | 2 +- frac/{sealed_loader.go => sealed/loader.go} | 9 +- frac/sealed/preloaded_data.go | 4 +- frac/{ => sealed}/remote.go | 21 +- frac/{ => sealed}/sealed.go | 36 +- frac/sealed/sealing/blocks_builder_test.go | 6 +- frac/sealed/sealing/index.go | 12 +- frac/sealed/sealing/sealer.go | 6 +- frac/{ => tests}/fraction_test.go | 279 +++++++++------ fracmanager/cache_maintainer.go | 6 +- fracmanager/config.go | 3 +- fracmanager/frac_info_cache.go | 10 +- fracmanager/frac_info_cache_test.go | 46 +-- fracmanager/fracmanager.go | 5 +- fracmanager/fracmanager_test.go | 8 +- fracmanager/fracs_stats.go | 6 +- fracmanager/fraction_provider.go | 32 +- fracmanager/fraction_provider_test.go | 4 +- fracmanager/fraction_registry.go | 8 +- fracmanager/lifecycle_manager.go | 4 +- fracmanager/loader.go | 19 +- fracmanager/loader_test.go | 23 +- 
fracmanager/proxy_frac.go | 21 +- fracmanager/sealer_test.go | 20 +- fracmanager/searcher_test.go | 11 +- indexer/meta_data.go | 65 +++- seq/seq.go | 8 + tests/setup/env.go | 10 +- util/fs.go | 19 + 66 files changed, 2886 insertions(+), 484 deletions(-) rename frac/{ => active}/active.go (84%) rename frac/{active_docs_positions.go => active/docs_positions.go} (98%) rename frac/{active_ids.go => active/ids.go} (97%) rename frac/{active_ids_test.go => active/ids_test.go} (98%) rename frac/{active_index.go => active/index.go} (73%) rename frac/{active_indexer.go => active/indexer.go} (90%) rename frac/{active_indexer_test.go => active/indexer_test.go} (96%) rename frac/{ => active}/inverser.go (99%) rename frac/{ => active}/meta_data_collector.go (99%) rename frac/{active_sealing_source.go => active/sealing_source.go} (89%) rename frac/{active_lids.go => active/token_lids.go} (99%) rename frac/{active_token_list.go => active/token_list.go} (79%) create mode 100644 frac/active/writer.go create mode 100644 frac/active2/active2.go create mode 100644 frac/active2/data_provider.go create mode 100644 frac/active2/indexer.go create mode 100644 frac/active2/indexer_allocator.go create mode 100644 frac/active2/indexer_test.go create mode 100644 frac/active2/mem_index.go create mode 100644 frac/active2/mem_index_pool.go create mode 100644 frac/active2/merge.go create mode 100644 frac/active2/merge2.go create mode 100644 frac/active2/merge_iterator.go create mode 100644 frac/active2/merge_manager.go create mode 100644 frac/active2/merge_strategy.go create mode 100644 frac/active2/metrics.go create mode 100644 frac/active2/sealing_source.go create mode 100644 frac/active2/tiers.go delete mode 100644 frac/active_writer.go rename frac/{common => }/info.go (99%) rename frac/{common => }/seal_params.go (95%) rename frac/{sealed_index.go => sealed/index.go} (98%) rename frac/{ => sealed}/index_cache.go (97%) rename frac/{sealed_loader.go => sealed/loader.go} (93%) rename frac/{ => 
sealed}/remote.go (95%) rename frac/{ => sealed}/sealed.go (94%) rename frac/{ => tests}/fraction_test.go (90%) diff --git a/asyncsearcher/async_searcher_test.go b/asyncsearcher/async_searcher_test.go index f403aa44..fce9afcd 100644 --- a/asyncsearcher/async_searcher_test.go +++ b/asyncsearcher/async_searcher_test.go @@ -9,7 +9,6 @@ import ( "github.com/stretchr/testify/require" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/mappingprovider" "github.com/ozontech/seq-db/seq" @@ -17,11 +16,11 @@ import ( type fakeFrac struct { frac.Fraction - info common.Info + info frac.Info dp fakeDP } -func (f *fakeFrac) Info() *common.Info { +func (f *fakeFrac) Info() *frac.Info { return &f.info } @@ -51,7 +50,7 @@ func TestAsyncSearcherMaintain(t *testing.T) { Retention: time.Hour, } fracs := []frac.Fraction{ - &fakeFrac{info: common.Info{Path: "1"}}, + &fakeFrac{info: frac.Info{Path: "1"}}, } r.NoError(as.StartSearch(req, fracs)) diff --git a/cmd/distribution/main.go b/cmd/distribution/main.go index c8caad0b..93821ffe 100644 --- a/cmd/distribution/main.go +++ b/cmd/distribution/main.go @@ -11,7 +11,7 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/fracmanager" "github.com/ozontech/seq-db/logger" @@ -59,7 +59,7 @@ func readBlock(reader storage.IndexReader, blockIndex uint32) ([]byte, error) { return data, nil } -func loadInfo(path string) *common.Info { +func loadInfo(path string) *frac.Info { indexReader, f := getReader(path) defer f.Close() @@ -87,7 +87,7 @@ func loadInfo(path string) *common.Info { return b.Info } -func buildDist(dist *seq.MIDsDistribution, path string, _ *common.Info) { +func buildDist(dist *seq.MIDsDistribution, path string, _ *frac.Info) { blocksReader, f := getReader(path) 
defer f.Close() diff --git a/cmd/seq-db/seq-db.go b/cmd/seq-db/seq-db.go index 97720478..3831356b 100644 --- a/cmd/seq-db/seq-db.go +++ b/cmd/seq-db/seq-db.go @@ -23,7 +23,6 @@ import ( "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/fracmanager" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/mappingprovider" @@ -260,7 +259,7 @@ func startStore( CacheGCDelay: 0, CacheCleanupDelay: 0, MinSealFracSize: uint64(cfg.Storage.TotalSize) * consts.DefaultMinSealPercent / 100, - SealParams: common.SealParams{ + SealParams: frac.SealParams{ IDsZstdLevel: cfg.Compression.SealedZstdCompressionLevel, LIDsZstdLevel: cfg.Compression.SealedZstdCompressionLevel, TokenListZstdLevel: cfg.Compression.SealedZstdCompressionLevel, diff --git a/frac/active.go b/frac/active/active.go similarity index 84% rename from frac/active.go rename to frac/active/active.go index 984a0e75..9db7a5d7 100644 --- a/frac/active.go +++ b/frac/active/active.go @@ -1,11 +1,10 @@ -package frac +package active import ( "context" "io" "math" "os" - "path/filepath" "sync" "time" @@ -16,7 +15,7 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric" @@ -27,23 +26,23 @@ import ( ) var ( - _ Fraction = (*Active)(nil) + _ frac.Fraction = (*Active)(nil) ) type Active struct { - Config *Config + Config *frac.Config BaseFileName string infoMu sync.RWMutex - info *common.Info + info *frac.Info MIDs *UInt64s RIDs *UInt64s DocBlocks *UInt64s - TokenList *TokenList + TokenList *tokenList DocsPositions *DocsPositions @@ -56,8 +55,8 @@ type Active struct { metaFile *os.File metaReader storage.DocBlocksReader - writer *ActiveWriter 
- indexer *ActiveIndexer + writer *Writer + indexer *Indexer } const ( @@ -70,19 +69,19 @@ var systemSeqID = seq.ID{ RID: systemRID, } -func NewActive( +func New( baseFileName string, - activeIndexer *ActiveIndexer, + activeIndexer *Indexer, readLimiter *storage.ReadLimiter, docsCache *cache.Cache[[]byte], sortCache *cache.Cache[[]byte], - cfg *Config, + cfg *frac.Config, ) *Active { - docsFile, docsStats := mustOpenFile(baseFileName+consts.DocsFileSuffix, config.SkipFsync) - metaFile, metaStats := mustOpenFile(baseFileName+consts.MetaFileSuffix, config.SkipFsync) + docsFile, docsStats := util.MustOpenFile(baseFileName+consts.DocsFileSuffix, config.SkipFsync) + metaFile, metaStats := util.MustOpenFile(baseFileName+consts.MetaFileSuffix, config.SkipFsync) f := &Active{ - TokenList: NewActiveTokenList(config.IndexWorkers), + TokenList: NewTokenList(config.IndexWorkers), DocsPositions: NewSyncDocsPositions(), MIDs: NewIDs(), RIDs: NewIDs(), @@ -98,10 +97,10 @@ func NewActive( metaReader: storage.NewDocBlocksReader(readLimiter, metaFile), indexer: activeIndexer, - writer: NewActiveWriter(docsFile, metaFile, docsStats.Size(), metaStats.Size(), config.SkipFsync), + writer: NewWriter(docsFile, metaFile, docsStats.Size(), metaStats.Size(), config.SkipFsync), BaseFileName: baseFileName, - info: common.NewInfo(baseFileName, uint64(docsStats.Size()), uint64(metaStats.Size())), + info: frac.NewInfo(baseFileName, uint64(docsStats.Size()), uint64(metaStats.Size())), Config: cfg, } @@ -114,24 +113,6 @@ func NewActive( return f } -func mustOpenFile(name string, skipFsync bool) (*os.File, os.FileInfo) { - file, err := os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0o776) - if err != nil { - logger.Fatal("can't create docs file", zap.String("file", name), zap.Error(err)) - } - - if !skipFsync { - parentDirPath := filepath.Dir(name) - util.MustSyncPath(parentDirPath) - } - - stat, err := file.Stat() - if err != nil { - logger.Fatal("can't stat docs file", zap.String("file", name), 
zap.Error(err)) - } - return file, stat -} - func (f *Active) Replay(ctx context.Context) error { logger.Info("start replaying...", zap.String("name", f.info.Name())) @@ -261,7 +242,7 @@ func (f *Active) UpdateStats(minMID, maxMID seq.MID, docCount uint32, sizeCount } func (f *Active) String() string { - return fracToString(f, "active") + return frac.FracToString(f, "active") } func (f *Active) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { @@ -287,8 +268,8 @@ func (f *Active) Search(ctx context.Context, params processor.SearchParams) (*se return dp.Search(params) } -func (f *Active) createDataProvider(ctx context.Context) *activeDataProvider { - return &activeDataProvider{ +func (f *Active) createDataProvider(ctx context.Context) *dataProvider { + return &dataProvider{ ctx: ctx, config: f.Config, info: f.Info(), @@ -303,7 +284,7 @@ func (f *Active) createDataProvider(ctx context.Context) *activeDataProvider { } } -func (f *Active) Info() *common.Info { +func (f *Active) Info() *frac.Info { f.infoMu.RLock() defer f.infoMu.RUnlock() diff --git a/frac/active_docs_positions.go b/frac/active/docs_positions.go similarity index 98% rename from frac/active_docs_positions.go rename to frac/active/docs_positions.go index 0b4c596b..2f5f8bc5 100644 --- a/frac/active_docs_positions.go +++ b/frac/active/docs_positions.go @@ -1,4 +1,4 @@ -package frac +package active import ( "sync" diff --git a/frac/active_ids.go b/frac/active/ids.go similarity index 97% rename from frac/active_ids.go rename to frac/active/ids.go index 1195c8fa..5207ecd4 100644 --- a/frac/active_ids.go +++ b/frac/active/ids.go @@ -1,4 +1,4 @@ -package frac +package active import ( "sync" diff --git a/frac/active_ids_test.go b/frac/active/ids_test.go similarity index 98% rename from frac/active_ids_test.go rename to frac/active/ids_test.go index ae2f6111..71a15131 100644 --- a/frac/active_ids_test.go +++ b/frac/active/ids_test.go @@ -1,4 +1,4 @@ -package frac +package active import ( "sync" diff 
--git a/frac/active_index.go b/frac/active/index.go similarity index 73% rename from frac/active_index.go rename to frac/active/index.go index 350a8e0d..3c1e01d1 100644 --- a/frac/active_index.go +++ b/frac/active/index.go @@ -1,9 +1,9 @@ -package frac +package active import ( "context" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/metric/stopwatch" @@ -13,24 +13,24 @@ import ( "github.com/ozontech/seq-db/storage" ) -type activeDataProvider struct { +type dataProvider struct { ctx context.Context - config *Config - info *common.Info + config *frac.Config + info *frac.Info mids *UInt64s rids *UInt64s - tokenList *TokenList + tokenList *tokenList blocksOffsets []uint64 docsPositions *DocsPositions docsReader *storage.DocsReader - idsIndex *activeIDsIndex + idsIndex *idsIndex } -func (dp *activeDataProvider) release() { +func (dp *dataProvider) release() { if dp.idsIndex != nil { dp.idsIndex.inverser.Release() } @@ -38,13 +38,13 @@ func (dp *activeDataProvider) release() { // getIDsIndex creates on demand and returns ActiveIDsIndex. // Creation of inverser for ActiveIDsIndex is expensive operation -func (dp *activeDataProvider) getIDsIndex() *activeIDsIndex { +func (dp *dataProvider) getIDsIndex() *idsIndex { if dp.idsIndex == nil { // creation order is matter mapping := dp.tokenList.GetAllTokenLIDs().GetLIDs(dp.mids, dp.rids) mids := dp.mids.GetVals() // mids and rids should be created after mapping to ensure that rids := dp.rids.GetVals() // they contain all the ids that mapping contains. 
- dp.idsIndex = &activeIDsIndex{ + dp.idsIndex = &idsIndex{ inverser: newInverser(mapping, len(mids)), mids: mids, rids: rids, @@ -53,8 +53,8 @@ func (dp *activeDataProvider) getIDsIndex() *activeIDsIndex { return dp.idsIndex } -func (dp *activeDataProvider) getTokenIndex() *activeTokenIndex { - return &activeTokenIndex{ +func (dp *dataProvider) getTokenIndex() *tokenIndex { + return &tokenIndex{ ctx: dp.ctx, mids: dp.mids, rids: dp.rids, @@ -63,17 +63,17 @@ func (dp *activeDataProvider) getTokenIndex() *activeTokenIndex { } } -func (dp *activeDataProvider) Fetch(ids []seq.ID) ([][]byte, error) { +func (dp *dataProvider) Fetch(ids []seq.ID) ([][]byte, error) { sw := stopwatch.New() defer sw.Export( - fetcherStagesSeconds, + frac.FetcherStagesSeconds, stopwatch.SetLabel("fraction_type", "active"), ) res := make([][]byte, len(ids)) - indexes := []activeFetchIndex{{ + indexes := []fetchIndex{{ blocksOffsets: dp.blocksOffsets, docsPositions: dp.docsPositions, docsReader: dp.docsReader, @@ -88,7 +88,7 @@ func (dp *activeDataProvider) Fetch(ids []seq.ID) ([][]byte, error) { return res, nil } -func (dp *activeDataProvider) Search(params processor.SearchParams) (*seq.QPR, error) { +func (dp *dataProvider) Search(params processor.SearchParams) (*seq.QPR, error) { // The index of the active fraction changes in parts and at a single moment in time may not be consistent. // So we can add new IDs to the index but update the range [from; to] with a delay. // Because of this, at the Search stage, we can get IDs that are outside the fraction range [from; to]. 
@@ -106,16 +106,16 @@ func (dp *activeDataProvider) Search(params processor.SearchParams) (*seq.QPR, e sw := stopwatch.New() defer sw.Export( - fractionSearchMetric(params), + frac.FractionSearchMetric(params), stopwatch.SetLabel("fraction_type", "active"), ) t := sw.Start("total") m := sw.Start("new_search_index") - indexes := []activeSearchIndex{{ - activeIDsIndex: dp.getIDsIndex(), - activeTokenIndex: dp.getTokenIndex(), + indexes := []searchIndex{{ + idsIndex: dp.getIDsIndex(), + tokenIndex: dp.getTokenIndex(), }} m.Stop() @@ -136,27 +136,27 @@ func (dp *activeDataProvider) Search(params processor.SearchParams) (*seq.QPR, e return res, nil } -type activeIDsIndex struct { +type idsIndex struct { mids []uint64 rids []uint64 inverser *inverser } -func (p *activeIDsIndex) GetMID(lid seq.LID) seq.MID { +func (p *idsIndex) GetMID(lid seq.LID) seq.MID { restoredLID := p.inverser.Revert(uint32(lid)) return seq.MID(p.mids[restoredLID]) } -func (p *activeIDsIndex) GetRID(lid seq.LID) seq.RID { +func (p *idsIndex) GetRID(lid seq.LID) seq.RID { restoredLID := p.inverser.Revert(uint32(lid)) return seq.RID(p.rids[restoredLID]) } -func (p *activeIDsIndex) Len() int { +func (p *idsIndex) Len() int { return p.inverser.Len() } -func (p *activeIDsIndex) LessOrEqual(lid seq.LID, id seq.ID) bool { +func (p *idsIndex) LessOrEqual(lid seq.LID, id seq.ID) bool { checkedMID := p.GetMID(lid) if checkedMID == id.MID { return p.GetRID(lid) <= id.RID @@ -164,28 +164,28 @@ func (p *activeIDsIndex) LessOrEqual(lid seq.LID, id seq.ID) bool { return checkedMID < id.MID } -type activeSearchIndex struct { - *activeIDsIndex - *activeTokenIndex +type searchIndex struct { + *idsIndex + *tokenIndex } -type activeTokenIndex struct { +type tokenIndex struct { ctx context.Context mids *UInt64s rids *UInt64s - tokenList *TokenList + tokenList *tokenList inverser *inverser } -func (si *activeTokenIndex) GetValByTID(tid uint32) []byte { +func (si *tokenIndex) GetValByTID(tid uint32) []byte { return 
si.tokenList.GetValByTID(tid) } -func (si *activeTokenIndex) GetTIDsByTokenExpr(t parser.Token) ([]uint32, error) { +func (si *tokenIndex) GetTIDsByTokenExpr(t parser.Token) ([]uint32, error) { return si.tokenList.FindPattern(si.ctx, t) } -func (si *activeTokenIndex) GetLIDsFromTIDs(tids []uint32, _ lids.Counter, minLID, maxLID uint32, order seq.DocsOrder) []node.Node { +func (si *tokenIndex) GetLIDsFromTIDs(tids []uint32, _ lids.Counter, minLID, maxLID uint32, order seq.DocsOrder) []node.Node { nodes := make([]node.Node, 0, len(tids)) for _, tid := range tids { tlids := si.tokenList.Provide(tid) @@ -209,17 +209,17 @@ func inverseLIDs(unmapped []uint32, inv *inverser, minLID, maxLID uint32) []uint return result } -type activeFetchIndex struct { +type fetchIndex struct { blocksOffsets []uint64 docsPositions *DocsPositions docsReader *storage.DocsReader } -func (di *activeFetchIndex) GetBlocksOffsets(num uint32) uint64 { +func (di *fetchIndex) GetBlocksOffsets(num uint32) uint64 { return di.blocksOffsets[num] } -func (di *activeFetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos { +func (di *fetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos { docsPos := make([]seq.DocPos, len(ids)) for i, id := range ids { docsPos[i] = di.docsPositions.GetSync(id) @@ -227,6 +227,6 @@ func (di *activeFetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos { return docsPos } -func (di *activeFetchIndex) ReadDocs(blockOffset uint64, docOffsets []uint64) ([][]byte, error) { +func (di *fetchIndex) ReadDocs(blockOffset uint64, docOffsets []uint64) ([][]byte, error) { return di.docsReader.ReadDocs(blockOffset, docOffsets) } diff --git a/frac/active_indexer.go b/frac/active/indexer.go similarity index 90% rename from frac/active_indexer.go rename to frac/active/indexer.go index f1d31a6f..ae8e2caf 100644 --- a/frac/active_indexer.go +++ b/frac/active/indexer.go @@ -1,4 +1,4 @@ -package frac +package active import ( "encoding/binary" @@ -14,7 +14,7 @@ import ( "github.com/ozontech/seq-db/storage" ) -type 
ActiveIndexer struct { +type Indexer struct { ch chan *indexTask chMerge chan *mergeTask workerCount int @@ -32,8 +32,8 @@ type mergeTask struct { tokenLIDs *TokenLIDs } -func NewActiveIndexer(workerCount, chLen int) (*ActiveIndexer, func()) { - idx := ActiveIndexer{ +func NewIndexer(workerCount, chLen int) (*Indexer, func()) { + idx := Indexer{ ch: make(chan *indexTask, chLen), chMerge: make(chan *mergeTask, chLen), workerCount: workerCount, @@ -42,7 +42,7 @@ func NewActiveIndexer(workerCount, chLen int) (*ActiveIndexer, func()) { return &idx, stopIdx } -func (ai *ActiveIndexer) Index(frac *Active, metas []byte, wg *sync.WaitGroup, sw *stopwatch.Stopwatch) { +func (ai *Indexer) Index(frac *Active, metas []byte, wg *sync.WaitGroup, sw *stopwatch.Stopwatch) { m := sw.Start("send_index_chan") ai.ch <- &indexTask{ Pos: storage.DocBlock(metas).GetExt2(), @@ -53,7 +53,7 @@ func (ai *ActiveIndexer) Index(frac *Active, metas []byte, wg *sync.WaitGroup, s m.Stop() } -func (ai *ActiveIndexer) start() func() { +func (ai *Indexer) start() func() { wg := sync.WaitGroup{} wg.Add(ai.workerCount) @@ -79,7 +79,7 @@ func (ai *ActiveIndexer) start() func() { } } -func (ai *ActiveIndexer) mergeWorker() { +func (ai *Indexer) mergeWorker() { for task := range ai.chMerge { task.tokenLIDs.GetLIDs(task.frac.MIDs, task.frac.RIDs) // GetLIDs cause sort and merge LIDs from queue } @@ -91,7 +91,7 @@ var metaDataPool = sync.Pool{ }, } -func (ai *ActiveIndexer) appendWorker(index int) { +func (ai *Indexer) appendWorker(index int) { // collector of bulk meta data collector := newMetaDataCollector() @@ -169,7 +169,7 @@ func (ai *ActiveIndexer) appendWorker(index int) { } } -func (ai *ActiveIndexer) sendTokensToMergeWorkers(frac *Active, tokens []*TokenLIDs) { +func (ai *Indexer) sendTokensToMergeWorkers(frac *Active, tokens []*TokenLIDs) { for _, tl := range tokens { task := mergeTask{ frac: frac, diff --git a/frac/active_indexer_test.go b/frac/active/indexer_test.go similarity index 96%
rename from frac/active_indexer_test.go rename to frac/active/indexer_test.go index fc2585c6..2d0c942c 100644 --- a/frac/active_indexer_test.go +++ b/frac/active/indexer_test.go @@ -1,4 +1,4 @@ -package frac +package active import ( "bytes" @@ -12,6 +12,7 @@ import ( "go.uber.org/zap/zapcore" "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric/stopwatch" @@ -76,20 +77,20 @@ func getTestProcessor() *indexer.Processor { func BenchmarkIndexer(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx, stop := NewActiveIndexer(8, 8) + idx, stop := NewIndexer(8, 8) defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) readers := splitLogsToBulks(allLogs, 1000) assert.NoError(b, err) - active := NewActive( + active := New( filepath.Join(b.TempDir(), "test"), idx, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &Config{}, + &frac.Config{}, ) processor := getTestProcessor() diff --git a/frac/inverser.go b/frac/active/inverser.go similarity index 99% rename from frac/inverser.go rename to frac/active/inverser.go index 11d854a7..a96521eb 100644 --- a/frac/inverser.go +++ b/frac/active/inverser.go @@ -1,4 +1,4 @@ -package frac +package active import ( "unsafe" diff --git a/frac/meta_data_collector.go b/frac/active/meta_data_collector.go similarity index 99% rename from frac/meta_data_collector.go rename to frac/active/meta_data_collector.go index 6047bd12..07fb59d9 100644 --- a/frac/meta_data_collector.go +++ b/frac/active/meta_data_collector.go @@ -1,4 +1,4 @@ -package frac +package active import ( "math" diff --git a/frac/active_sealing_source.go b/frac/active/sealing_source.go similarity index 89% rename from frac/active_sealing_source.go rename to frac/active/sealing_source.go index 42bde383..d4208f4c 100644 --- a/frac/active_sealing_source.go +++ 
b/frac/active/sealing_source.go @@ -1,4 +1,4 @@ -package frac +package active import ( "bytes" @@ -17,14 +17,14 @@ import ( "github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/util" ) -// ActiveSealingSource transforms data from in-memory (frac.Active) storage +// SealingSource transforms data from in-memory (frac.Active) storage // into a format suitable for disk writing during index creation. // // The main purpose of this type is to provide access to sorted data @@ -39,9 +39,9 @@ import ( // // All iterators work with pre-sorted data and return information // in an order optimal for creating disk index structures. -type ActiveSealingSource struct { - params common.SealParams // Sealing parameters - info *common.Info // fraction Info +type SealingSource struct { + params frac.SealParams // Sealing parameters + info *frac.Info // fraction Info created time.Time // Creation time of the source sortedLIDs []uint32 // Sorted LIDs (Local ID) oldToNewLIDs []uint32 // Mapping from old LIDs to new ones (after sorting) @@ -59,9 +59,9 @@ type ActiveSealingSource struct { lastErr error // Last error } -// NewActiveSealingSource creates a new data source for sealing +// NewSealingSource creates a new data source for sealing // based on an active in-memory index. 
-func NewActiveSealingSource(active *Active, params common.SealParams) (*ActiveSealingSource, error) { +func NewSealingSource(active *Active, params frac.SealParams) (*SealingSource, error) { info := *active.info // copy sortedLIDs := active.GetAllDocuments() @@ -71,7 +71,7 @@ func NewActiveSealingSource(active *Active, params common.SealParams) (*ActiveSe // Sort tokens within each field sortedTIDs := sortTokens(sortedFields, active.TokenList) - src := ActiveSealingSource{ + src := SealingSource{ params: params, info: &info, created: time.Now(), @@ -103,7 +103,7 @@ func NewActiveSealingSource(active *Active, params common.SealParams) (*ActiveSe // sortFields sorts field names and calculates maximum TIDs for each field. // Returns sorted field list and array of maximum TIDs. -func sortFields(tl *TokenList) ([]string, []uint32) { +func sortFields(tl *tokenList) ([]string, []uint32) { fields := make([]string, 0, len(tl.FieldTIDs)) for field := range tl.FieldTIDs { fields = append(fields, field) @@ -122,7 +122,7 @@ func sortFields(tl *TokenList) ([]string, []uint32) { // sortTokens sorts tokens lexicographically within each field. // Returns sorted list of TIDs. -func sortTokens(sortedFields []string, tl *TokenList) []uint32 { +func sortTokens(sortedFields []string, tl *tokenList) []uint32 { pos := 0 tids := make([]uint32, 0, len(tl.tidToVal)) for _, field := range sortedFields { @@ -139,26 +139,26 @@ func sortTokens(sortedFields []string, tl *TokenList) []uint32 { } // LastError returns the last error that occurred during processing. -func (src *ActiveSealingSource) LastError() error { +func (src *SealingSource) LastError() error { return src.lastErr } // prepareInfo prepares metadata for disk writing. -func (src *ActiveSealingSource) prepareInfo() { +func (src *SealingSource) prepareInfo() { src.info.MetaOnDisk = 0 src.info.SealingTime = uint64(src.created.UnixMilli()) src.info.BuildDistribution(src.mids.vals) } // Info returns index metadata information. 
-func (src *ActiveSealingSource) Info() *common.Info { +func (src *SealingSource) Info() *frac.Info { return src.info } // TokenBlocks returns an iterator for token blocks for disk writing. // Tokens are pre-sorted: first by fields, then lexicographically within each field. // Each block contains up to blockSize bytes of data for efficient writing. -func (src *ActiveSealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { +func (src *SealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { const tokenLengthSize = int(unsafe.Sizeof(uint32(0))) return func(yield func([][]byte) bool) { if len(src.tids) == 0 { @@ -193,7 +193,7 @@ func (src *ActiveSealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { // Fields returns an iterator for sorted fields and their maximum TIDs. // Fields are sorted lexicographically, ensuring predictable order // when building disk index structures. -func (src *ActiveSealingSource) Fields() iter.Seq2[string, uint32] { +func (src *SealingSource) Fields() iter.Seq2[string, uint32] { return func(yield func(string, uint32) bool) { for i, field := range src.fields { if !yield(field, src.fieldsMaxTIDs[i]) { @@ -206,7 +206,7 @@ func (src *ActiveSealingSource) Fields() iter.Seq2[string, uint32] { // IDsBlocks returns an iterator for document ID blocks and corresponding positions. // IDs are sorted. Block size is controlled by blockSize parameter for balance between // performance and memory usage. -func (src *ActiveSealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { +func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { return func(yield func([]seq.ID, []seq.DocPos) bool) { mids := src.mids.vals rids := src.rids.vals @@ -242,7 +242,7 @@ func (src *ActiveSealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []s } // BlocksOffsets returns document block offsets. 
-func (src *ActiveSealingSource) BlocksOffsets() []uint64 { +func (src *SealingSource) BlocksOffsets() []uint64 { return src.blocksOffsets } @@ -250,7 +250,7 @@ func (src *ActiveSealingSource) BlocksOffsets() []uint64 { // LIDs are converted to new numbering after document sorting. // Each iterator call returns a list of documents containing a specific token, // in sorted order. -func (src *ActiveSealingSource) TokenLIDs() iter.Seq[[]uint32] { +func (src *SealingSource) TokenLIDs() iter.Seq[[]uint32] { return func(yield func([]uint32) bool) { newLIDs := []uint32{} @@ -284,7 +284,7 @@ func makeInverser(sortedLIDs []uint32) []uint32 { // Docs returns an iterator for documents with their IDs. // Handles duplicate IDs (for nested indexes). -func (src *ActiveSealingSource) Docs() iter.Seq2[seq.ID, []byte] { +func (src *SealingSource) Docs() iter.Seq2[seq.ID, []byte] { src.lastErr = nil return func(yield func(seq.ID, []byte) bool) { var ( @@ -313,7 +313,7 @@ func (src *ActiveSealingSource) Docs() iter.Seq2[seq.ID, []byte] { } // doc reads a document from storage by its position. -func (src *ActiveSealingSource) doc(pos seq.DocPos) ([]byte, error) { +func (src *SealingSource) doc(pos seq.DocPos) ([]byte, error) { blockIndex, docOffset := pos.Unpack() blockOffset := src.blocksOffsets[blockIndex] @@ -330,7 +330,7 @@ func (src *ActiveSealingSource) doc(pos seq.DocPos) ([]byte, error) { // SortDocs sorts documents and writes them in compressed form to disk. // Creates a temporary file that is then renamed to the final one. -func (src *ActiveSealingSource) SortDocs() error { +func (src *SealingSource) SortDocs() error { start := time.Now() logger.Info("sorting docs...") @@ -396,7 +396,7 @@ func (src *ActiveSealingSource) SortDocs() error { // writeDocs compresses and writes document blocks, calculating new offsets // and collecting document positions. 
-func (src *ActiveSealingSource) writeDocs(blocks iter.Seq2[[]byte, []seq.DocPos], w io.Writer) ([]uint64, []seq.DocPos, error) { +func (src *SealingSource) writeDocs(blocks iter.Seq2[[]byte, []seq.DocPos], w io.Writer) ([]uint64, []seq.DocPos, error) { offset := 0 buf := make([]byte, 0) blocksOffsets := make([]uint64, 0) diff --git a/frac/active_lids.go b/frac/active/token_lids.go similarity index 99% rename from frac/active_lids.go rename to frac/active/token_lids.go index 47abe92a..a4f8e851 100644 --- a/frac/active_lids.go +++ b/frac/active/token_lids.go @@ -1,4 +1,4 @@ -package frac +package active import ( "math" diff --git a/frac/active_token_list.go b/frac/active/token_list.go similarity index 79% rename from frac/active_token_list.go rename to frac/active/token_list.go index adf94ffd..122c88d7 100644 --- a/frac/active_token_list.go +++ b/frac/active/token_list.go @@ -1,4 +1,4 @@ -package frac +package active import ( "context" @@ -28,36 +28,36 @@ type tokenTask struct { tlids []*TokenLIDs } -type activeTokenProvider struct { +type tokenProvider struct { inverseIndex []uint32 tidToVal [][]byte } -func (tp *activeTokenProvider) GetToken(tid uint32) []byte { +func (tp *tokenProvider) GetToken(tid uint32) []byte { id := tp.inverseIndex[tid-1] return tp.tidToVal[id] } -func (tp *activeTokenProvider) FirstTID() uint32 { +func (tp *tokenProvider) FirstTID() uint32 { return 1 } -func (tp *activeTokenProvider) LastTID() uint32 { +func (tp *tokenProvider) LastTID() uint32 { return uint32(len(tp.inverseIndex)) } -func (tp *activeTokenProvider) Ordered() bool { +func (tp *tokenProvider) Ordered() bool { return false } -func (tp *activeTokenProvider) inverseTIDs(tids []uint32) []uint32 { +func (tp *tokenProvider) inverseTIDs(tids []uint32) []uint32 { for i, tid := range tids { tids[i] = tp.inverseIndex[tid-1] } return tids } -type TokenList struct { +type tokenList struct { chList []chan tokenTask fieldsMu sync.RWMutex @@ -73,8 +73,8 @@ type TokenList struct { tidToLIDs 
[]*TokenLIDs } -func NewActiveTokenList(workers int) *TokenList { - tl := &TokenList{ +func NewTokenList(workers int) *tokenList { + tl := &tokenList{ chList: make([]chan tokenTask, workers), FieldTIDs: make(map[string][]uint32), fieldSizes: make(map[string]uint32), @@ -91,13 +91,13 @@ func NewActiveTokenList(workers int) *TokenList { return tl } -func (tl *TokenList) Stop() { +func (tl *tokenList) Stop() { for _, c := range tl.chList { close(c) } } -func (tl *TokenList) tokenLIDsWorker(ch chan tokenTask, tokenToLIDs map[string]*TokenLIDs) { +func (tl *tokenList) tokenLIDsWorker(ch chan tokenTask, tokenToLIDs map[string]*TokenLIDs) { nonExistent := make([]int, 0) for task := range ch { bufSize := 0 @@ -138,13 +138,13 @@ func copyAndSplit(token []byte, fLen int, dest []byte) (string, string, []byte, dest } -func (tl *TokenList) initSystemTokens() { +func (tl *tokenList) initSystemTokens() { token := []byte(seq.TokenAll + ":") tlids := tl.Append([][]byte{token}, []int{len(seq.TokenAll)}, []*TokenLIDs{nil}) tl.allTokenLIDs = tlids[0] } -func (tl *TokenList) GetValByTID(tid uint32) []byte { +func (tl *tokenList) GetValByTID(tid uint32) []byte { tl.tidMu.RLock() defer tl.tidMu.RUnlock() @@ -152,7 +152,7 @@ func (tl *TokenList) GetValByTID(tid uint32) []byte { return tl.tidToVal[tid] } -func (tl *TokenList) Provide(tid uint32) *TokenLIDs { +func (tl *tokenList) Provide(tid uint32) *TokenLIDs { tl.tidMu.RLock() defer tl.tidMu.RUnlock() @@ -160,31 +160,31 @@ func (tl *TokenList) Provide(tid uint32) *TokenLIDs { return tl.tidToLIDs[tid] } -func (tl *TokenList) GetAllTokenLIDs() *TokenLIDs { +func (tl *tokenList) GetAllTokenLIDs() *TokenLIDs { return tl.allTokenLIDs } -func (tl *TokenList) GetTIDsByField(f string) []uint32 { +func (tl *tokenList) GetTIDsByField(f string) []uint32 { tl.fieldsMu.RLock() defer tl.fieldsMu.RUnlock() return tl.FieldTIDs[f] } -func (tl *TokenList) getTokenProvider(field string) *activeTokenProvider { +func (tl *tokenList) getTokenProvider(field 
string) *tokenProvider { inverseIndex := tl.GetTIDsByField(field) tl.tidMu.RLock() tidToVal := tl.tidToVal tl.tidMu.RUnlock() - return &activeTokenProvider{ + return &tokenProvider{ tidToVal: tidToVal, inverseIndex: inverseIndex, } } -func (tl *TokenList) FindPattern(ctx context.Context, t parser.Token) ([]uint32, error) { +func (tl *tokenList) FindPattern(ctx context.Context, t parser.Token) ([]uint32, error) { field := parser.GetField(t) tp := tl.getTokenProvider(field) tids, err := pattern.Search(ctx, t, tp) @@ -198,7 +198,7 @@ func getTokenHash(token []byte) uint32 { return crc32.ChecksumIEEE(token) } -func (tl *TokenList) getTokenLIDs(tokens [][]byte, fieldsLengths []int, tlids []*TokenLIDs) []tokenData { +func (tl *tokenList) getTokenLIDs(tokens [][]byte, fieldsLengths []int, tlids []*TokenLIDs) []tokenData { n := len(tl.chList) remap := make([][]int, n) for i, token := range tokens { @@ -234,7 +234,7 @@ func (tl *TokenList) getTokenLIDs(tokens [][]byte, fieldsLengths []int, tlids [] return newTokensData } -func (tl *TokenList) Append(tokens [][]byte, fieldsLengths []int, tokenLIDsPlaces []*TokenLIDs) []*TokenLIDs { +func (tl *tokenList) Append(tokens [][]byte, fieldsLengths []int, tokenLIDsPlaces []*TokenLIDs) []*TokenLIDs { newTokensData := tl.getTokenLIDs(tokens, fieldsLengths, tokenLIDsPlaces) tl.createTIDs(newTokensData) @@ -244,7 +244,7 @@ func (tl *TokenList) Append(tokens [][]byte, fieldsLengths []int, tokenLIDsPlace return tokenLIDsPlaces } -func (tl *TokenList) createTIDs(newTokensData []tokenData) { +func (tl *tokenList) createTIDs(newTokensData []tokenData) { tl.tidMu.Lock() for i, token := range newTokensData { newTokensData[i].tid = uint32(len(tl.tidToVal)) @@ -254,7 +254,7 @@ func (tl *TokenList) createTIDs(newTokensData []tokenData) { tl.tidMu.Unlock() } -func (tl *TokenList) fillFieldTIDs(newTokensData []tokenData) { +func (tl *tokenList) fillFieldTIDs(newTokensData []tokenData) { tl.fieldsMu.Lock() for _, token := range newTokensData { field 
:= token.field @@ -263,7 +263,7 @@ func (tl *TokenList) fillFieldTIDs(newTokensData []tokenData) { tl.fieldsMu.Unlock() } -func (tl *TokenList) fillSizes(newTokensData []tokenData) { +func (tl *tokenList) fillSizes(newTokensData []tokenData) { tl.sizesMu.Lock() for _, token := range newTokensData { tl.fieldSizes[token.field] += uint32(len(token.value)) @@ -271,7 +271,7 @@ func (tl *TokenList) fillSizes(newTokensData []tokenData) { tl.sizesMu.Unlock() } -func (tl *TokenList) GetFieldSizes() map[string]uint32 { +func (tl *tokenList) GetFieldSizes() map[string]uint32 { tl.sizesMu.Lock() defer tl.sizesMu.Unlock() diff --git a/frac/active/writer.go b/frac/active/writer.go new file mode 100644 index 00000000..1aadee64 --- /dev/null +++ b/frac/active/writer.go @@ -0,0 +1,44 @@ +package active + +import ( + "os" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/metric/stopwatch" + "github.com/ozontech/seq-db/storage" +) + +type Writer struct { + docs *frac.FileWriter + meta *frac.FileWriter +} + +func NewWriter(docsFile, metaFile *os.File, docsOffset, metaOffset int64, skipFsync bool) *Writer { + return &Writer{ + docs: frac.NewFileWriter(docsFile, docsOffset, skipFsync), + meta: frac.NewFileWriter(metaFile, metaOffset, skipFsync), + } +} + +func (a *Writer) Write(docs, meta []byte, sw *stopwatch.Stopwatch) error { + m := sw.Start("write_docs") + offset, err := a.docs.Write(docs, sw) + m.Stop() + + if err != nil { + return err + } + + storage.DocBlock(meta).SetExt2(uint64(offset)) + + m = sw.Start("write_meta") + _, err = a.meta.Write(meta, sw) + m.Stop() + + return err +} + +func (a *Writer) Stop() { + a.docs.Stop() + a.meta.Stop() +} diff --git a/frac/active2/active2.go b/frac/active2/active2.go new file mode 100644 index 00000000..86e0e61b --- /dev/null +++ b/frac/active2/active2.go @@ -0,0 +1,328 @@ +package active2 + +import ( + "context" + "io" + "os" + "sync" + "time" + + "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/config" + 
"github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/metric" + "github.com/ozontech/seq-db/metric/stopwatch" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/util" + "go.uber.org/zap" +) + +type Active2 struct { + Config *frac.Config + + BaseFileName string + + indexMu sync.RWMutex + info *frac.Info + indexes *MergeManager + indexer *Indexer + + docsFile *os.File + docsReader storage.DocsReader + sortReader storage.DocsReader + docsCache *cache.Cache[[]byte] + sortCache *cache.Cache[[]byte] + + metaFile *os.File + metaReader storage.DocBlocksReader + + writer *active.Writer +} + +type indexSnapshot struct { + info *frac.Info + indexes []*memIndex +} + +func New( + baseFileName string, + cfg *frac.Config, + indexer *Indexer, + readLimiter *storage.ReadLimiter, + docsCache *cache.Cache[[]byte], + sortCache *cache.Cache[[]byte], +) *Active2 { + docsFile, docsStats := util.MustOpenFile(baseFileName+consts.DocsFileSuffix, config.SkipFsync) + metaFile, metaStats := util.MustOpenFile(baseFileName+consts.MetaFileSuffix, config.SkipFsync) + + f := &Active2{ + BaseFileName: baseFileName, + Config: cfg, + info: frac.NewInfo(baseFileName, uint64(docsStats.Size()), uint64(metaStats.Size())), + indexer: indexer, + indexes: newMergeManager(2), + + docsFile: docsFile, + docsCache: docsCache, + sortCache: sortCache, + docsReader: storage.NewDocsReader(readLimiter, docsFile, docsCache), + sortReader: storage.NewDocsReader(readLimiter, docsFile, sortCache), + + metaFile: metaFile, + metaReader: storage.NewDocBlocksReader(readLimiter, metaFile), + + writer: active.NewWriter(docsFile, metaFile, docsStats.Size(), metaStats.Size(), config.SkipFsync), + } + + logger.Info("active fraction created", zap.String("fraction", baseFileName)) + + return f +} + +func (f 
*Active2) Replay(ctx context.Context) error { + logger.Info("start replaying...", zap.String("name", f.info.Name())) + + t := time.Now() + + offset := uint64(0) + step := f.info.MetaOnDisk / 10 + wg := sync.WaitGroup{} + next := step + +out: + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + meta, metaSize, err := f.metaReader.ReadDocBlock(int64(offset)) + if err == io.EOF { + if metaSize != 0 { + logger.Warn("last meta block is partially written, skipping it") + } + break out + } + if err != nil && err != io.EOF { + return err + } + + if offset > next { + next += step + progress := float64(offset) / float64(f.info.MetaOnDisk) * 100 + logger.Info("replaying batch, meta", + zap.String("name", f.info.Name()), + zap.Uint64("from", offset), + zap.Uint64("to", offset+metaSize), + zap.Uint64("target", f.info.MetaOnDisk), + util.ZapFloat64WithPrec("progress_percentage", progress, 2), + ) + } + offset += metaSize + + wg.Add(1) + f.indexer.Index(meta, func(idx *memIndex, err error) { + if err != nil { + logger.Fatal("bulk indexing error", zap.Error(err)) + } + f.addIndex(idx) + wg.Done() + }) + } + } + + wg.Wait() + + tookSeconds := util.DurationToUnit(time.Since(t), "s") + throughputRaw := util.SizeToUnit(f.info.DocsRaw, "mb") / tookSeconds + throughputMeta := util.SizeToUnit(f.info.MetaOnDisk, "mb") / tookSeconds + logger.Info("active fraction replayed", + zap.String("name", f.info.Name()), + zap.Uint32("docs_total", f.info.DocsTotal), + util.ZapUint64AsSizeStr("docs_size", f.info.DocsOnDisk), + util.ZapFloat64WithPrec("took_s", tookSeconds, 1), + util.ZapFloat64WithPrec("throughput_raw_mb_sec", throughputRaw, 1), + util.ZapFloat64WithPrec("throughput_meta_mb_sec", throughputMeta, 1), + ) + return nil +} + +func (f *Active2) Append(docs, meta []byte, wg *sync.WaitGroup) (err error) { + sw := stopwatch.New() + ma := sw.Start("append") + if err = f.writer.Write(docs, meta, sw); err != nil { + ma.Stop() + return err + } + 
f.updateDiskStats(uint64(len(docs)), uint64(len(meta))) + + mi := sw.Start("send_to_indexer") + f.indexer.Index(meta, func(idx *memIndex, err error) { + if err != nil { + logger.Fatal("bulk indexing error", zap.Error(err)) + } + f.addIndex(idx) + wg.Done() + }) + mi.Stop() + + ma.Stop() + sw.Export(bulkStagesSeconds) + return nil +} + +func (f *Active2) updateDiskStats(docsLen, metaLen uint64) { + f.indexMu.Lock() + f.info.DocsOnDisk += docsLen + f.info.MetaOnDisk += metaLen + f.indexMu.Unlock() +} + +func (f *Active2) addIndex(index *memIndex) { + maxMID := index.ids[0].MID + minMID := index.ids[len(index.ids)-1].MID + + f.indexMu.Lock() + defer f.indexMu.Unlock() + + f.indexes.Add(index) + + if f.info.From > minMID { + f.info.From = minMID + } + if f.info.To < maxMID { + f.info.To = maxMID + } + f.info.DocsRaw += index.docsSize + f.info.DocsTotal += index.docsCount +} + +func (f *Active2) String() string { + return frac.FracToString(f, "active") +} + +func (f *Active2) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { + sw := stopwatch.New() + defer sw.Export(fetcherStagesSec) + + t := sw.Start("total") + + ss := f.indexSnapshot(ctx) + if ss.info.DocsTotal == 0 { // it is empty active fraction state + return nil, nil + } + + res := make([][]byte, len(ids)) + for _, index := range ss.indexes { + fetchIndex := fetchIndex{index: index, docsReader: &f.docsReader} + if err := processor.IndexFetch(ids, sw, &fetchIndex, res); err != nil { + return nil, err + } + } + t.Stop() + + return res, nil +} + +func (f *Active2) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { + ss := f.indexSnapshot(ctx) + + if ss.info.DocsTotal == 0 { // it is empty active fraction state + metric.CountersTotal.WithLabelValues("empty_data_provider").Inc() + return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil + } + + aggLimits := processor.AggLimits(f.Config.Search.AggLimits) + + // Limit the parameter range to data boundaries to 
prevent histogram overflow + params.From = max(params.From, ss.info.From) + params.To = min(params.To, ss.info.To) + + sw := stopwatch.New() + defer sw.Export(getActiveSearchMetric(params)) + + t := sw.Start("total") + qprs := make([]*seq.QPR, 0, len(ss.indexes)) + for _, index := range ss.indexes { + si := searchIndex{ctx: ctx, index: index} + qpr, err := processor.IndexSearch(ctx, params, &si, aggLimits, sw) + if err != nil { + return nil, err + } + qprs = append(qprs, qpr) + } + res := processor.MergeQPRs(qprs, params) + res.IDs.ApplyHint(ss.info.Name()) + t.Stop() + + return res, nil +} + +func (f *Active2) indexSnapshot(ctx context.Context) *indexSnapshot { + f.indexMu.RLock() + info := *f.info // copy + indexes := f.indexes.Indexes() + f.indexMu.RUnlock() + + return &indexSnapshot{ + info: &info, + indexes: indexes, + } +} + +func (f *Active2) Info() *frac.Info { + f.indexMu.RLock() + defer f.indexMu.RUnlock() + + cp := *f.info // copy + return &cp +} + +func (f *Active2) Contains(id seq.MID) bool { + return f.Info().IsIntersecting(id, id) +} + +func (f *Active2) IsIntersecting(from, to seq.MID) bool { + return f.Info().IsIntersecting(from, to) +} + +func (f *Active2) Release() { + f.releaseMem() + + if !f.Config.KeepMetaFile { + util.RemoveFile(f.metaFile.Name()) + } + + if !f.Config.SkipSortDocs { + // we use sorted docs in sealed fraction so we can remove original docs of active fraction + util.RemoveFile(f.docsFile.Name()) + } + +} + +func (f *Active2) Suicide() { + f.releaseMem() + + util.RemoveFile(f.metaFile.Name()) + util.RemoveFile(f.docsFile.Name()) + util.RemoveFile(f.BaseFileName + consts.SdocsFileSuffix) +} + +func (f *Active2) releaseMem() { + f.writer.Stop() + f.indexes.Stop() + + f.docsCache.Release() + f.sortCache.Release() + + if err := f.metaFile.Close(); err != nil { + logger.Error("can't close meta file", zap.String("frac", f.BaseFileName), zap.Error(err)) + } + if err := f.docsFile.Close(); err != nil { + logger.Error("can't close docs 
// --- frac/active2/data_provider.go (new file in this patch) ---

package active2

import (
	"context"
	"fmt"
	"sort"

	"github.com/ozontech/seq-db/frac/processor"
	"github.com/ozontech/seq-db/frac/sealed/lids"
	"github.com/ozontech/seq-db/node"
	"github.com/ozontech/seq-db/parser"
	"github.com/ozontech/seq-db/pattern"
	"github.com/ozontech/seq-db/seq"
	"github.com/ozontech/seq-db/storage"
	"github.com/prometheus/client_golang/prometheus"
)

// fetchIndex adapts an in-memory index plus a docs reader to the
// fetch-side interface: it resolves seq.IDs to packed document
// positions and reads raw documents from storage blocks.
type fetchIndex struct {
	index      *memIndex
	docsReader *storage.DocsReader
}

// GetBlocksOffsets returns the file offset of the docs block with the
// given in-index block number.
func (si *fetchIndex) GetBlocksOffsets(blockIndex uint32) uint64 {
	return si.index.blocksOffsets[blockIndex]
}

// GetDocPos maps each requested ID to its packed document position.
// IDs absent from the index yield seq.DocPosNotFound at the matching slot.
func (si *fetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos {
	docsPos := make([]seq.DocPos, len(ids))
	for i, id := range ids {
		if lid, ok := si.index.idToLID[id]; ok {
			docsPos[i] = si.index.positions[lid]
			continue
		}
		docsPos[i] = seq.DocPosNotFound
	}
	return docsPos
}

// ReadDocs reads the documents at docOffsets from the block that starts
// at blockOffset, delegating to the underlying docs reader.
func (si *fetchIndex) ReadDocs(blockOffset uint64, docOffsets []uint64) ([][]byte, error) {
	return si.docsReader.ReadDocs(blockOffset, docOffsets)
}

// searchIndex adapts a memIndex to the search-side interface:
// token lookup, LID enumeration and ID ordering.
type searchIndex struct {
	ctx   context.Context
	index *memIndex
}

// GetValByTID returns the token value for the given token ID.
func (si *searchIndex) GetValByTID(tid uint32) []byte {
	return si.index.tokens[tid]
}

// GetTIDsByTokenExpr resolves a parsed token expression to the matching
// token IDs within the expression's field, via pattern search.
func (si *searchIndex) GetTIDsByTokenExpr(t parser.Token) ([]uint32, error) {
	field := parser.GetField(t)
	tp := si.index.getTokenProvider(field)
	tids, err := pattern.Search(si.ctx, t, tp)
	if err != nil {
		return nil, fmt.Errorf("search error: %w field: %s, query: %s", err, field, parser.GetHint(t))
	}
	return tids, nil
}

// GetLIDsFromTIDs builds one LID-iterator node per token ID, each
// narrowed to the [minLID, maxLID] window. The lids.Counter parameter
// is unused for the in-memory index.
func (si *searchIndex) GetLIDsFromTIDs(tids []uint32, _ lids.Counter, minLID, maxLID uint32, order seq.DocsOrder) []node.Node {
	nodes := make([]node.Node, 0, len(tids))
	for _, tid := range tids {
		nodes = append(nodes, si.geTidLidsNode(tid, minLID, maxLID, order))
	}
	return nodes
}

// geTidLidsNode returns a node over the LIDs of one token; the special
// "all" token is represented as a dense range instead of a stored list.
// NOTE(review): the method name has a typo ("geTid…"); consider
// renaming to getTIDLIDsNode across the package.
func (si *searchIndex) geTidLidsNode(tid, minLID, maxLID uint32, order seq.DocsOrder) node.Node {
	if tid == si.index.allTID {
		return node.NewRange(minLID, maxLID, order.IsReverse())
	}
	tidLIDs := si.index.tokenLIDs[tid]
	return node.NewStatic(narrowDownLIDs(tidLIDs, minLID, maxLID), order.IsReverse())
}

// narrowDownLIDs returns the sub-slice of the sorted-ascending tidLIDs
// that falls inside [minLID, maxLID] (binary search on both edges).
func narrowDownLIDs(tidLIDs []uint32, minLID, maxLID uint32) []uint32 {
	n := len(tidLIDs)
	left := sort.Search(n, func(i int) bool { return tidLIDs[i] >= minLID })
	right := sort.Search(n, func(i int) bool { return tidLIDs[i] > maxLID })
	if left > right {
		return nil
	}
	return tidLIDs[left:right]
}

// LessOrEqual reports whether the ID stored at lid is <= id,
// comparing MID first and breaking ties by RID.
func (si *searchIndex) LessOrEqual(lid seq.LID, id seq.ID) bool {
	checkedMID := si.GetMID(lid)
	if checkedMID == id.MID {
		return si.GetRID(lid) <= id.RID
	}
	return checkedMID < id.MID
}

// GetMID returns the MID at the given LID. LIDs are 1-based here
// (note the lid-1 indexing), matching the lid+1 numbering used when
// the index is built.
func (si *searchIndex) GetMID(lid seq.LID) seq.MID {
	return si.index.ids[lid-1].MID
}

// GetRID returns the RID at the given LID (1-based, see GetMID).
func (si *searchIndex) GetRID(lid seq.LID) seq.RID {
	return si.index.ids[lid-1].RID
}

// Len returns the size of the 1-based LID space: len(ids) + 1.
func (si *searchIndex) Len() int {
	return len(si.index.ids) + 1
}

// getActiveSearchMetric picks the latency histogram matching the query
// shape: aggregation, histogram, or plain search.
func getActiveSearchMetric(params processor.SearchParams) *prometheus.HistogramVec {
	if params.HasAgg() {
		return searchAggSec
	}
	if params.HasHist() {
		return searchHstSec
	}
	return searchSimpleSec
}

// tokenProvider exposes one field's contiguous, sorted token-ID range
// [firstTID, lastTID] over the shared tokens table for pattern search.
// firstTID > lastTID encodes an empty range (field not indexed).
type tokenProvider struct {
	firstTID uint32
	lastTID  uint32
	tokens   [][]byte
}

// GetToken returns the token value for tid.
func (p *tokenProvider) GetToken(tid uint32) []byte {
	return p.tokens[tid]
}

// FirstTID returns the first token ID of the field's range.
func (p *tokenProvider) FirstTID() uint32 {
	return p.firstTID
}

// LastTID returns the last token ID of the field's range.
func (p *tokenProvider) LastTID() uint32 {
	return p.lastTID
}

// Ordered reports that tokens within the range are sorted ascending,
// enabling binary search in pattern matching.
func (p *tokenProvider) Ordered() bool {
	return true
}
// --- frac/active2/indexer.go ---

package active2

import (
	"cmp"
	"encoding/binary"
	"slices"
	"unsafe"

	"github.com/ozontech/seq-db/indexer"
	"github.com/ozontech/seq-db/metric/stopwatch"
	"github.com/ozontech/seq-db/seq"
	"github.com/ozontech/seq-db/storage"
	"github.com/ozontech/seq-db/tokenizer"
	"github.com/ozontech/seq-db/util"
)

// uint32Len is the byte width of the uint32 size prefix used in meta payloads.
const uint32Len = int(unsafe.Sizeof(uint32(0)))

// Indexer builds in-memory indexes from compressed meta blocks using a
// bounded number of concurrent workers.
type Indexer struct {
	sem chan struct{} // counting semaphore; capacity = worker limit
}

// NewIndexer returns an Indexer that runs at most workersCount
// indexing goroutines concurrently.
func NewIndexer(workersCount int) *Indexer {
	return &Indexer{
		sem: make(chan struct{}, workersCount),
	}
}

// Index builds a memIndex from meta asynchronously and hands the result
// (or error) to applyFn. The send on sem blocks the caller when all
// workers are busy, providing backpressure.
func (s *Indexer) Index(meta storage.DocBlock, applyFn func(index *memIndex, err error)) {
	s.sem <- struct{}{}
	go func() {
		applyFn(NewMemIndex(meta))
		<-s.sem
	}()
}

// NewMemIndex decompresses and decodes one meta block and builds a
// fully-populated in-memory index from it: IDs (sorted descending),
// document positions, tokens grouped by field, and per-token LID lists.
func NewMemIndex(metaBlock storage.DocBlock) (*memIndex, error) {
	sw := stopwatch.New()

	res := newIndexerResources()
	defer res.releaseAll()

	payload, err := decompressionMeta(metaBlock, res, sw)
	if err != nil {
		return nil, err
	}

	meta, err := decodeMeta(payload, res, sw)
	if err != nil {
		return nil, err
	}

	idx := &memIndex{
		idToLID:       make(map[seq.ID]uint32, len(meta)),
		docsCount:     uint32(len(meta)),
		fieldsTokens:  make(map[string]tokensRange),
		blocksOffsets: []uint64{metaBlock.GetExt2()}, // only one block per bulk
	}

	tids, lids, tokens, err := extractTokensFromMetadata(meta, idx, res)
	if err != nil {
		return nil, err
	}

	tokenLIDs := groupLIDsByToken(res, tids, lids, len(tokens))

	organizeTokensAndFields(idx, tokens, tokenLIDs)

	// the "all" token is the first token of the seq.TokenAll pseudo-field
	idx.allTID = uint32(idx.fieldsTokens[seq.TokenAll].start)

	return idx, nil
}

// tokenKey is a comparable (field, value) pair used to deduplicate tokens.
type tokenKey struct {
	v, k string
}

// convertMetaToken views a tokenizer token as a tokenKey without copying;
// the unsafe string views are only valid while the source bytes live.
func convertMetaToken(t tokenizer.MetaToken) tokenKey {
	return tokenKey{
		k: util.ByteToStringUnsafe(t.Key),
		v: util.ByteToStringUnsafe(t.Value),
	}
}

// extractTokensFromMetadata fills idx.ids / idx.positions / idx.idToLID
// (IDs sorted descending, LIDs starting at 1) and returns three parallel
// results: the flat tid and lid streams (one entry per token occurrence)
// and the tid -> tokenKey table.
func extractTokensFromMetadata(
	meta []indexer.MetaData,
	idx *memIndex,
	res *indexerResources,
) ([]uint32, []uint32, []tokenKey, error) {
	var lidsSize uint32
	var docOffset uint64

	localRes := newIndexerResources()
	defer localRes.releaseAll()

	// scan in original order to compute each document's position and the
	// total number of token occurrences
	positions := localRes.newDocPos(len(meta))
	prev := seq.PackDocPos(0, docOffset)
	positions = positions[:len(meta)] // fix the length so bounds checks are elided below
	for i, docMeta := range meta {
		if docMeta.Size > 0 {
			prev = seq.PackDocPos(0, docOffset) // block index is always 0: single block per bulk
			docOffset += uint64(docMeta.Size) + uint64(uint32Len)
		}
		// NOTE(review): docs with Size == 0 reuse the previous document's
		// position — presumably zero-size entries share storage; confirm.
		positions[i] = prev
		lidsSize += docMeta.TokensCount()
	}

	lids := res.newUint32s(int(lidsSize))
	tids := res.newUint32s(int(lidsSize))

	// order: permutation of meta indexes sorted by ID descending
	// (seq.Compare args are swapped on purpose)
	order := localRes.newUint32s(len(meta))
	for i := range order {
		order[i] = uint32(i)
	}
	slices.SortFunc(order, func(a, b uint32) int { return seq.Compare(meta[b].ID, meta[a].ID) })

	ids := make([]seq.ID, len(order))
	pos := make([]seq.DocPos, len(order))

	for lid, i := range order {
		docMeta := meta[i]
		ids[lid] = docMeta.ID
		idx.docsSize += uint64(docMeta.Size)
		idx.idToLID[docMeta.ID] = uint32(lid)
		pos[lid] = positions[i]
	}

	idx.ids = ids
	idx.positions = pos

	var err error
	var mt tokenKey

	tids = tids[:0]
	lids = lids[:0]

	// deduplicate tokens: each distinct (field, value) pair gets one tid
	tokenToTID := localRes.newMetaTokenMap(1000)
	tokens, release := localRes.newTokenizerMetaTokens(1000)

	for lid, i := range order {
		docMeta := meta[i]
		if tokens, err = docMeta.DecodeTokens(tokens[:0]); err != nil {
			return nil, nil, nil, err
		}
		for _, t := range tokens {
			mt = convertMetaToken(t)
			tid, ok := tokenToTID[mt]
			if !ok {
				tid = uint32(len(tokenToTID))
				tokenToTID[mt] = tid
			}
			tids = append(tids, tid)
			lids = append(lids, uint32(lid+1)) // LIDs are 1-based
		}
	}

	// write the possibly-grown tokens slice back into its pool slot
	release(tokens)

	// invert the map into a dense tid -> token table
	tidToToken := res.newMetaTokens(len(tokenToTID))
	for mt, tid := range tokenToTID {
		tidToToken[tid] = mt
	}

	return tids, lids, tidToToken, nil
}

// groupLIDsByToken turns the flat (tid, lid) streams into one sorted LID
// list per token, carved out of a single shared buffer.
func groupLIDsByToken(res *indexerResources, tids, lids []uint32, tokensCnt int) [][]uint32 {
	// count the size of each token's LID list
	localRes := newIndexerResources()
	defer localRes.releaseAll()

	lens := localRes.newUint32s(tokensCnt)
	clear(lens)
	for _, tid := range tids {
		lens[tid]++
	}

	// carve per-token sub-slices out of one shared buffer
	tokenLIDs := res.newUint32Slices(tokensCnt)
	lidsBuffer := make([]uint32, len(lids))
	for tid, cnt := range lens {
		tokenLIDs[tid] = lidsBuffer[:cnt][:0]
		lidsBuffer = lidsBuffer[cnt:]
	}

	// fill the per-token LID lists
	lids = lids[:len(tids)] // equal lengths by construction; helps elide bounds checks
	for i, tid := range tids {
		tokenLIDs[tid] = append(tokenLIDs[tid], lids[i])
	}
	return tokenLIDs
}

// organizeTokensAndFields sorts tokens by (field, value), copies token
// and field bytes into compact owned buffers, and records each field's
// contiguous tid range in idx.fieldsTokens.
func organizeTokensAndFields(idx *memIndex, tokens []tokenKey, tokenLIDs [][]uint32) {
	localRes := newIndexerResources()
	defer localRes.releaseAll()

	order := localRes.newUint32s(len(tokens))
	for i := range order {
		order[i] = uint32(i)
	}

	slices.SortFunc(order, func(a, b uint32) int {
		aToken, bToken := tokens[a], tokens[b]
		return cmp.Or(
			cmp.Compare(aToken.k, bToken.k),
			cmp.Compare(aToken.v, bToken.v),
		)
	})

	tokensSize := 0
	for _, t := range tokens {
		tokensSize += len(t.v)
	}

	prevField := ""
	fieldsSize := 0
	fields := localRes.newStrings(100)[:0]
	fieldsTIDs := localRes.newUint32s(100)[:0]

	bufferTokens := make([]byte, 0, tokensSize)

	orderedTokens := make([][]byte, len(order))
	orderedTokenLIDs := make([][]uint32, len(order))

	for tid, i := range order {
		mt := tokens[i]
		if mt.k != prevField || prevField == "" {
			// collect unique field values with the tid where each begins
			fieldsSize += len(mt.k)
			fields = append(fields, mt.k)
			fieldsTIDs = append(fieldsTIDs, uint32(tid))
		}
		prevField = mt.k

		// copy the token value into the owned buffer
		p := len(bufferTokens)
		bufferTokens = append(bufferTokens, mt.v...)

		// fill tokens in sorted (tid) order
		orderedTokens[tid] = bufferTokens[p:]
		orderedTokenLIDs[tid] = tokenLIDs[i]
	}

	idx.tokens = orderedTokens
	idx.tokenLIDs = orderedTokenLIDs

	// sentinel entry so the last field's range has an end bound
	fieldsTIDs = append(fieldsTIDs, uint32(len(tokens)))

	bufferFields := make([]byte, 0, fieldsSize)
	idx.fields = make([][]byte, len(fields))
	for i, field := range fields {
		// copy the field name into the owned buffer
		p := len(bufferFields)
		bufferFields = append(bufferFields, field...)
		idx.fields[i] = bufferFields[p:]

		// record the field's [start, start+count) tid range
		tid1 := fieldsTIDs[i]
		tid2 := fieldsTIDs[i+1]
		idx.fieldsTokens[util.ByteToStringUnsafe(bufferFields[p:])] = tokensRange{
			start: tid1,
			count: tid2 - tid1,
		}
	}
}

// decompressionMeta decompresses the meta block into a pooled buffer,
// timing the step under "decompress_meta".
func decompressionMeta(meta storage.DocBlock, ia *indexerResources, sw *stopwatch.Stopwatch) ([]byte, error) {
	m := sw.Start("decompress_meta")
	defer m.Stop()

	payload, err := meta.DecompressTo(ia.newBytes(int(meta.RawLen())))
	if err != nil {
		return nil, err
	}
	return payload, nil
}

// decodeMeta splits the payload of length-prefixed records and lazily
// unmarshals each one, timing the step under "decode_meta".
func decodeMeta(payload []byte, ia *indexerResources, sw *stopwatch.Stopwatch) ([]indexer.MetaData, error) {
	m := sw.Start("decode_meta")
	defer m.Stop()

	// first pass: collect record sizes (the variable holds sizes, not
	// offsets, despite its name) to learn the record count
	offset := 0
	offsets := ia.newInts(1000)[:0]
	for offset < len(payload) {
		size := binary.LittleEndian.Uint32(payload[offset:])
		offset += uint32Len + int(size)
		offsets = append(offsets, int(size))
	}

	// second pass: decode each record, advancing payload past it
	meta := ia.newMetaData(len(offsets))
	for i, size := range offsets {
		bin := payload[uint32Len : size+uint32Len]
		if err := meta[i].UnmarshalBinaryLazy(bin); err != nil {
			return nil, err
		}
		payload = payload[size+uint32Len:]
	}

	return meta, nil
}

// --- frac/active2/indexer_allocator.go ---

// indexerResources tracks all pooled buffers acquired during one
// indexing pass; releaseAll returns every buffer to its pool and the
// tracker itself to poolAllocator.
type indexerResources struct {
	releasers []func()
}

var (
	poolAllocator          = sync.Pool{}
	poolMetaData           = sync.Pool{}
	poolMetaToken          = sync.Pool{}
	poolUint32Slices       = sync.Pool{}
	poolTokenizerMetaToken = sync.Pool{}
	poolMetaTokenMap       = sync.Pool{}
	poolStrings            = sync.Pool{}
)

// newIndexerResources takes a tracker from the pool (resetting its
// releaser list) or allocates a fresh one.
func newIndexerResources() *indexerResources {
	ai, ok := poolAllocator.Get().(*indexerResources)
	if ok {
		ai.releasers = ai.releasers[:0]
	} else {
		ai = &indexerResources{releasers: make([]func(), 0, 64)}
	}
	return ai
}
poolAllocator.Get().(*indexerResources) + if ok { + ai.releasers = ai.releasers[:0] + } else { + ai = &indexerResources{releasers: make([]func(), 0, 64)} + } + return ai +} + +func (r *indexerResources) newUint32s(size int) []uint32 { + buf, free := acquireSlice[uint32](size) + r.releasers = append(r.releasers, free) + return buf +} + +func (r *indexerResources) newInts(size int) []int { + buf, free := acquireSlice[int](size) + r.releasers = append(r.releasers, free) + return buf +} + +func (r *indexerResources) newBytes(size int) []byte { + buf := bytespool.AcquireLen(size) + r.releasers = append(r.releasers, func() { bytespool.Release(buf) }) + return buf.B +} + +func (r *indexerResources) newDocPos(size int) []seq.DocPos { + buf, free := acquireSlice[seq.DocPos](size) + r.releasers = append(r.releasers, free) + return buf +} + +func (r *indexerResources) newMetaTokenMap(size int) map[tokenKey]uint32 { + buf, ok := poolMetaTokenMap.Get().(map[tokenKey]uint32) + if !ok { + buf = make(map[tokenKey]uint32, size) + } else { + clear(buf) + } + r.releasers = append(r.releasers, func() { poolMetaTokenMap.Put(buf) }) + return buf +} + +func (r *indexerResources) newUint32Slices(size int) [][]uint32 { + bufPtr, free := acquireFromPoolPtr[[]uint32](&poolUint32Slices, size) + r.releasers = append(r.releasers, free) + return *bufPtr +} + +func (r *indexerResources) newMetaTokens(size int) []tokenKey { + bufPtr, free := acquireFromPoolPtr[tokenKey](&poolMetaToken, size) + r.releasers = append(r.releasers, free) + return *bufPtr +} + +func (r *indexerResources) newTokenizerMetaTokensPtr(size int) *[]tokenizer.MetaToken { + bufPtr, free := acquireFromPoolPtr[tokenizer.MetaToken](&poolTokenizerMetaToken, size) + r.releasers = append(r.releasers, free) + return bufPtr +} + +func (r *indexerResources) newStrings(size int) []string { + bufPtr, free := acquireFromPoolPtr[string](&poolStrings, size) + r.releasers = append(r.releasers, free) + return *bufPtr +} + +func (a 
*indexerResources) newTokenizerMetaTokens(size int) ([]tokenizer.MetaToken, func([]tokenizer.MetaToken)) { + bufPtr := a.newTokenizerMetaTokensPtr(size) + return *bufPtr, func(mt []tokenizer.MetaToken) { *bufPtr = mt } +} + +func (r *indexerResources) newMetaData(size int) []indexer.MetaData { + bufPtr, free := acquireFromPoolPtr[indexer.MetaData](&poolMetaData, size) + r.releasers = append(r.releasers, free) + return *bufPtr +} + +func (r *indexerResources) releaseAll() { + for _, r := range r.releasers { + r() + } + poolAllocator.Put(r) +} + +func acquireSlice[T any](size int) ([]T, func()) { + var tmp T + itemSize := int(unsafe.Sizeof(tmp)) + buf := bytespool.AcquireLen(int(size) * itemSize) + res := unsafe.Slice((*T)(unsafe.Pointer(unsafe.SliceData(buf.B))), size) + return res, func() { bytespool.Release(buf) } +} + +func acquireFromPoolPtr[T any](pool *sync.Pool, size int) (*[]T, func()) { + buf, ok := pool.Get().([]T) + if !ok { + buf = make([]T, size) + } else { + buf = slices.Grow(buf[:0], size)[:size] + } + return &buf, func() { pool.Put(buf) } +} diff --git a/frac/active2/indexer_test.go b/frac/active2/indexer_test.go new file mode 100644 index 00000000..21aec724 --- /dev/null +++ b/frac/active2/indexer_test.go @@ -0,0 +1,119 @@ +package active2 + +import ( + "bytes" + "os" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/indexer" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/tests/common" + "github.com/ozontech/seq-db/tokenizer" + "github.com/stretchr/testify/assert" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" +) + +func BenchmarkIndexer(b *testing.B) { + logger.SetLevel(zapcore.FatalLevel) + idx := NewIndexer(8) + + allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) + readers := splitLogsToBulks(allLogs, 2000) + 
assert.NoError(b, err) + + active := New( + filepath.Join(b.TempDir(), "test"), + &frac.Config{}, + idx, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + ) + + processor := getTestProcessor() + + for i := 0; i < b.N; i++ { + b.StopTimer() + bulks := make([][]byte, 0, len(readers)) + for _, readNext := range readers { + _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + bulks = append(bulks, storage.CompressDocBlock(meta, nil, 3)) + } + b.StartTimer() + + wg := sync.WaitGroup{} + for _, meta := range bulks { + wg.Add(1) + idx.Index(meta, func(index *memIndex, err error) { + if err != nil { + logger.Fatal("bulk indexing error", zap.Error(err)) + } + active.addIndex(index) + wg.Done() + }) + } + // runtime.GC() + wg.Wait() + } +} + +func readFileAllAtOnce(filename string) ([][]byte, error) { + content, err := os.ReadFile(filename) + if err != nil { + return nil, err + } + lines := bytes.Split(content, []byte{'\n'}) + if len(lines) > 0 && len(lines[len(lines)-1]) == 0 { + lines = lines[:len(lines)-1] + } + return lines, nil +} + +func splitLogsToBulks(data [][]byte, bulkSize int) []func() ([]byte, error) { + funcs := []func() ([]byte, error){} + for len(data) > 0 { + size := min(len(data), bulkSize) + funcs = append(funcs, testBufReader(data[0:size])) + data = data[size:] + } + return funcs +} + +func testBufReader(data [][]byte) func() ([]byte, error) { + orig := data + return func() ([]byte, error) { + if len(data) == 0 { + data = orig + return nil, nil + } + line := data[0] + data = data[1:] + return line, nil + } +} + +func getTestProcessor() *indexer.Processor { + mapping := seq.Mapping{ + "clientip": seq.NewSingleType(seq.TokenizerTypeKeyword, "clientip", 1024), + "request": seq.NewSingleType(seq.TokenizerTypeText, "request", 1024), + "status": seq.NewSingleType(seq.TokenizerTypeKeyword, "status", 1024), + "size": seq.NewSingleType(seq.TokenizerTypeKeyword, "size", 1024), + } + + 
tokenizers := map[seq.TokenizerType]tokenizer.Tokenizer{ + seq.TokenizerTypeText: tokenizer.NewTextTokenizer(1024, false, true, 8192), + seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(1024, false, true), + seq.TokenizerTypePath: tokenizer.NewPathTokenizer(1024, false, true), + seq.TokenizerTypeExists: tokenizer.NewExistsTokenizer(), + } + + return indexer.NewProcessor(mapping, tokenizers, 0, 0, 0) +} diff --git a/frac/active2/mem_index.go b/frac/active2/mem_index.go new file mode 100644 index 00000000..674ca92a --- /dev/null +++ b/frac/active2/mem_index.go @@ -0,0 +1,50 @@ +package active2 + +import ( + "github.com/ozontech/seq-db/seq" +) + +type tokensRange struct { + start uint32 + count uint32 +} + +type memIndex struct { + ids []seq.ID // IDs ordered DESC + tokens [][]byte // tokens ordered ASC by field:token + tokenLIDs [][]uint32 // LIDs list for each token from `tokens` + fieldsTokens map[string]tokensRange // tokens locator for each field + fields [][]byte // fields ordered ASC + blocksOffsets []uint64 // blocks offsets ordered by offset + idToLID map[seq.ID]uint32 + positions []seq.DocPos + allTID uint32 + + docsSize uint64 + docsCount uint32 +} + +func (index *memIndex) getTokenProvider(field string) *tokenProvider { + if r, ok := index.fieldsTokens[field]; ok { + return &tokenProvider{ + firstTID: r.start, + lastTID: r.start + r.count - 1, + tokens: index.tokens, + } + } + // Field is not indexed - return empty token provider + return &tokenProvider{ + firstTID: 1, + lastTID: 0, // firstTID > lastTID = no tokens available + tokens: index.tokens, + } +} + +func (index *memIndex) IsIntersecting(from, to seq.MID) bool { + maxMID := index.ids[0].MID + minMID := index.ids[len(index.ids)-1].MID + if to < minMID || maxMID < from { + return false + } + return true +} diff --git a/frac/active2/mem_index_pool.go b/frac/active2/mem_index_pool.go new file mode 100644 index 00000000..105316ed --- /dev/null +++ b/frac/active2/mem_index_pool.go @@ -0,0 +1,103 
@@ +package active2 + +import ( + "slices" + "sync" + "sync/atomic" +) + +// memIndexExt contains index metadata for merge management +type memIndexExt struct { + id uint64 // unique runtime ID + index *memIndex // actual index + tier int // size tier of the index +} + +type memIndexPool struct { + mu sync.RWMutex + indexes []*memIndex + readyToMerge map[uint64]memIndexExt + underMerging map[uint64]memIndexExt + + tiers sizeTiers // index size tier classifier + counter atomic.Uint64 // atomic counter for generating index IDs +} + +func newIndexPool() *memIndexPool { + return &memIndexPool{ + readyToMerge: make(map[uint64]memIndexExt), + underMerging: make(map[uint64]memIndexExt), + + tiers: newSizeTiers(firstTierMaxSize, maxTierCount, tierSizeDeltaPercent), + } +} + +func (p *memIndexPool) Indexes() []*memIndex { + p.mu.RLock() + defer p.mu.RUnlock() + + return p.indexes +} + +func (p *memIndexPool) Add(index *memIndex) { + metaIndex := p.wrapIndex(index) + + p.mu.Lock() + defer p.mu.Unlock() + + p.readyToMerge[metaIndex.id] = metaIndex + p.indexes = append(p.indexes, index) +} + +func (p *memIndexPool) ReadyToMerge() []memIndexExt { + p.mu.RLock() + defer p.mu.RUnlock() + + items := make([]memIndexExt, 0, len(p.readyToMerge)) + for _, item := range p.readyToMerge { + items = append(items, item) + } + return items +} + +// markAsMerging moves indexes from "ready" to "merging" state +func (p *memIndexPool) markAsMerging(items []memIndexExt) { + p.mu.Lock() + defer p.mu.Unlock() + + for _, item := range items { + delete(p.readyToMerge, item.id) + p.underMerging[item.id] = item + } +} + +func (p *memIndexPool) replace(processed []memIndexExt, merged *memIndex) { + mergedMeta := p.wrapIndex(merged) + + p.mu.Lock() + defer p.mu.Unlock() + + for _, metaIndex := range processed { + delete(p.underMerging, metaIndex.id) + } + p.readyToMerge[mergedMeta.id] = mergedMeta + + // Rebuild the index list for reading + p.indexes = p.indexes[:0] + p.indexes = slices.Grow(p.indexes, 
len(p.readyToMerge)+len(p.underMerging)) + + for _, metaIndex := range p.readyToMerge { + p.indexes = append(p.indexes, metaIndex.index) // add all ready indexes + } + for _, metaIndex := range p.underMerging { + p.indexes = append(p.indexes, metaIndex.index) // add indexes currently being merged + } +} + +func (p *memIndexPool) wrapIndex(index *memIndex) memIndexExt { + return memIndexExt{ + id: p.counter.Add(1), // atomically increment counter + tier: p.tiers.Calc(index.docsCount), // determine size tier + index: index, + } +} diff --git a/frac/active2/merge.go b/frac/active2/merge.go new file mode 100644 index 00000000..85e30da4 --- /dev/null +++ b/frac/active2/merge.go @@ -0,0 +1,209 @@ +package active2 + +import ( + "bytes" + + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/tokenizer" + "go.uber.org/zap" +) + +func mergeIndexes(indexes []*memIndex) *memIndex { + docsCount := 0 + blocksCount := 0 + fieldsCount := 0 + docsSize := uint64(0) + iterators := make([]mergeIterator, 0, len(indexes)) + for _, index := range indexes { + docsSize += index.docsSize + docsCount += len(index.ids) + fieldsCount += len(index.fields) + blocksCount += len(index.blocksOffsets) + iterators = append(iterators, newIndexIterator(index)) + } + + dst := &memIndex{ + ids: make([]seq.ID, 0, docsCount), + positions: make([]seq.DocPos, docsCount), + idToLID: make(map[seq.ID]uint32, docsCount), + fieldsTokens: make(map[string]tokensRange, fieldsCount), + blocksOffsets: make([]uint64, 0, blocksCount), + docsSize: docsSize, + } + + doubles := mergeIDs(dst, iterators) + mergeTokens(dst, iterators) + mergePositions(dst, iterators) + + dst.docsCount = uint32(len(dst.ids)) + dst.allTID = dst.fieldsTokens[seq.TokenAll].start + + if len(doubles) > 0 { + logger.Warn("there are duplicate IDs when compaction", zap.Int("doubles", len(doubles))) + } + + return dst +} + +func mergeIDs(dst *memIndex, orig []mergeIterator) []seq.ID { + doubles := 
[]seq.ID{} + iterators := append([]mergeIterator{}, orig...) // make copy + + for len(iterators) > 0 { + // try select first + selected := []int{0} + maxID := iterators[0].CurrentID() + + for i := 1; i < len(iterators); i++ { + if cur := iterators[i].CurrentID(); cur == maxID { + selected = append(selected, i) + } else if seq.Less(maxID, cur) { + maxID = cur + selected = []int{i} + } + } + + lid := uint32(len(dst.ids)) + for _, i := range selected { + iterators[i].AddNewLID(lid) + if !iterators[i].ShiftID() { + removeItem(iterators, i) + } + } + dst.ids = append(dst.ids, maxID) + + if len(selected) > 1 { + doubles = append(doubles, maxID) + } + } + return doubles +} + +func mergeTokens(dst *memIndex, orig []mergeIterator) { + // todo copy tokens to compact mem usage + // todo allocate for all lids at once to optimize allocations + var prevField []byte + iterators := append([]mergeIterator{}, orig...) // make copy + for len(iterators) > 0 { + // try select first + selected := []int{0} + minToken := iterators[0].CurrentToken() + + for i := 1; i < len(iterators); i++ { + cur := iterators[i].CurrentToken() + if cmp := compareMetaToken(cur, minToken); cmp < 0 { + minToken = cur + selected = []int{i} + } else if cmp == 0 { + selected = append(selected, i) + } + } + + lids := make([][]uint32, 0, len(selected)) + for _, i := range selected { + lids = append(lids, iterators[i].CurrentTokenLIDs()) + if !iterators[i].ShiftToken() { + removeItem(iterators, i) + } + } + + if !bytes.Equal(prevField, minToken.Key) { // new field + if tr, ok := dst.fieldsTokens[string(prevField)]; ok { + tr.count = uint32(len(dst.tokens)) - tr.start + dst.fieldsTokens[string(prevField)] = tr + } + dst.fields = append(dst.fields, minToken.Key) + dst.fieldsTokens[string(minToken.Key)] = tokensRange{start: uint32(len(dst.tokens))} + prevField = minToken.Key + } + + dst.tokens = append(dst.tokens, minToken.Value) + dst.tokenLIDs = append(dst.tokenLIDs, mergeLIDs(lids)) + } + if tr, ok := 
dst.fieldsTokens[string(prevField)]; ok { + tr.count = uint32(len(dst.tokens)) - tr.start + dst.fieldsTokens[string(prevField)] = tr + } +} + +func mergePositions(dst *memIndex, orig []mergeIterator) { + iterators := append([]mergeIterator{}, orig...) // make copy + for len(iterators) > 0 { + // try select first + selected := []int{0} + minOffset := iterators[0].CurrentBlocksOffset() + + for i := 1; i < len(iterators); i++ { + if cur := iterators[i].CurrentBlocksOffset(); cur == minOffset { + selected = append(selected, i) + } else if cur < minOffset { + minOffset = cur + selected = []int{i} + } + } + + newBlockIndex := len(dst.blocksOffsets) + dst.blocksOffsets = append(dst.blocksOffsets, minOffset) + + for _, i := range selected { + iterators[i].AddNewBlockIndex(newBlockIndex) + if !iterators[i].ShiftBlocksOffset() { + removeItem(iterators, i) + } + } + } + + for _, iterator := range orig { + iterator.RepackDocPositions(dst.positions) + } +} + +func compareMetaToken(mt1, mt2 tokenizer.MetaToken) int { + res := bytes.Compare(mt1.Key, mt2.Key) + if res == 0 { + return bytes.Compare(mt1.Value, mt2.Value) + } + return res +} + +func mergeLIDs(lids [][]uint32) []uint32 { + size := 0 + for i := range lids { + size += len(lids[i]) + } + res := make([]uint32, 0, size) + + for len(lids) > 0 { + // try select first + selected := []int{0} + minLID := lids[0][0] + + for i := 1; i < len(lids); i++ { + cur := lids[i][0] + if cur == minLID { // can be doubles + selected = append(selected, i) + } else if cur < minLID { + selected = []int{i} + minLID = cur + } + } + + res = append(res, minLID) + + for _, i := range selected { + if lids[i] = lids[i][1:]; len(lids[i]) == 0 { + removeItem(lids, i) + } + } + } + + return res +} + +func removeItem[V any](items []V, i int) []V { + last := len(items) - 1 + items[i] = items[last] + items = items[:last] + return items +} diff --git a/frac/active2/merge2.go b/frac/active2/merge2.go new file mode 100644 index 00000000..5f447fee --- 
/dev/null +++ b/frac/active2/merge2.go @@ -0,0 +1,211 @@ +package active2 + +/* +import ( + "bytes" + + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/tokenizer" + "go.uber.org/zap" +) + +func mergeIndexes2(src1, src2 *memIndex) *memIndex { + docsCount := src1.docsCount + src2.docsCount + blocksCount := len(src1.blocksOffsets) + len(src2.blocksOffsets) + docsSize := src1.docsSize + src2.docsSize + fieldsCount := len(src1.fields) + len(src2.fields) + + dst := &memIndex{ + ids: make([]seq.ID, 0, docsCount), + positions: make(map[seq.ID]seq.DocPos, docsCount), + fieldsTokens: make(map[string]tokensRange, fieldsCount), + blocksOffsets: make([]uint64, 0, blocksCount), + docsSize: docsSize, + } + + lids1, lids2, doubles := mergeIDs1(src1, src2, dst) + + mergeTokens(dst, iterators) + mergePositions(dst, iterators) + + dst.docsCount = uint32(len(dst.ids)) + dst.allTID = dst.fieldsTokens[seq.TokenAll].start + + if len(doubles) > 0 { + logger.Warn("there are duplicate IDs when compaction", zap.Int("doubles", len(doubles))) + } + + return dst +} + +func mergeIDs1(src1, src2, dst *memIndex) ([]uint32, []uint32, []seq.ID) { + doubles := []seq.ID{} + + newLIDs1 := []uint32{} + newLIDs2 := []uint32{} + + var i1, i2 int + for i1 < len(src1.ids) && i2 < len(src2.ids) { + lid := uint32(len(dst.ids)) + id1 := src1.ids[i1] + id2 := src2.ids[i2] + + if seq.Less(id2, id1) { + dst.ids = append(dst.ids, id1) + newLIDs1 = append(newLIDs1, lid) + i1++ + } else if seq.Less(id1, id2) { + dst.ids = append(dst.ids, id2) + newLIDs2 = append(newLIDs2, lid) + i2++ + } else { // id2 == id1 + dst.ids = append(dst.ids, id1) + doubles = append(doubles, id1) + newLIDs1 = append(newLIDs1, lid) + newLIDs2 = append(newLIDs2, lid) + i1++ + i2++ + } + } + + for ; i1 < len(src1.ids); i1++ { + newLIDs1 = append(newLIDs1, uint32(len(dst.ids))) + dst.ids = append(dst.ids, src1.ids[i1]) + } + for ; i2 < len(src1.ids); i2++ { + newLIDs2 = append(newLIDs2, 
uint32(len(dst.ids))) + dst.ids = append(dst.ids, src2.ids[i2]) + } + + return newLIDs1, newLIDs2, doubles +} + +func mergeTokens(dst *memIndex, orig []mergeIterator) { + // todo copy tokens to compact mem usage + // todo allocate for all lids at once to optimize allocations + var prevField []byte + iterators := append([]mergeIterator{}, orig...) // make copy + for len(iterators) > 0 { + // try select first + selected := []int{0} + minToken := iterators[0].CurrentToken() + + for i := 1; i < len(iterators); i++ { + cur := iterators[i].CurrentToken() + if cmp := compareMetaToken(cur, minToken); cmp < 0 { + minToken = cur + selected = []int{i} + } else if cmp == 0 { + selected = append(selected, i) + } + } + + lids := make([][]uint32, 0, len(selected)) + for _, i := range selected { + lids = append(lids, iterators[i].CurrentTokenLIDs()) + if !iterators[i].ShiftToken() { + removeItem(iterators, i) + } + } + + if !bytes.Equal(prevField, minToken.Key) { // new field + if tr, ok := dst.fieldsTokens[string(prevField)]; ok { + tr.count = uint32(len(dst.tokens)) - tr.start + dst.fieldsTokens[string(prevField)] = tr + } + dst.fields = append(dst.fields, minToken.Key) + dst.fieldsTokens[string(minToken.Key)] = tokensRange{start: uint32(len(dst.tokens))} + prevField = minToken.Key + } + + dst.tokens = append(dst.tokens, minToken.Value) + dst.tokenLIDs = append(dst.tokenLIDs, mergeLIDs(lids)) + } + if tr, ok := dst.fieldsTokens[string(prevField)]; ok { + tr.count = uint32(len(dst.tokens)) - tr.start + dst.fieldsTokens[string(prevField)] = tr + } +} + +func mergePositions(dst *memIndex, orig []mergeIterator) { + iterators := append([]mergeIterator{}, orig...) 
// make copy + for len(iterators) > 0 { + // try select first + selected := []int{0} + minOffset := iterators[0].CurrentBlocksOffset() + + for i := 1; i < len(iterators); i++ { + if cur := iterators[i].CurrentBlocksOffset(); cur == minOffset { + selected = append(selected, i) + } else if cur < minOffset { + minOffset = cur + selected = []int{i} + } + } + + newBlockIndex := len(dst.blocksOffsets) + dst.blocksOffsets = append(dst.blocksOffsets, minOffset) + + for _, i := range selected { + iterators[i].AddNewBlockIndex(newBlockIndex) + if !iterators[i].ShiftBlocksOffset() { + removeItem(iterators, i) + } + } + } + + for _, iterator := range orig { + iterator.RepackDocPositions(dst.positions) + } +} + +func compareMetaToken(mt1, mt2 tokenizer.MetaToken) int { + res := bytes.Compare(mt1.Key, mt2.Key) + if res == 0 { + return bytes.Compare(mt1.Value, mt2.Value) + } + return res +} + +func mergeLIDs(lids [][]uint32) []uint32 { + size := 0 + for i := range lids { + size += len(lids[i]) + } + res := make([]uint32, 0, size) + + for len(lids) > 0 { + // try select first + selected := []int{0} + minLID := lids[0][0] + + for i := 1; i < len(lids); i++ { + cur := lids[i][0] + if cur == minLID { // can be doubles + selected = append(selected, i) + } else if cur < minLID { + selected = []int{i} + minLID = cur + } + } + + res = append(res, minLID) + + for _, i := range selected { + if lids[i] = lids[i][1:]; len(lids[i]) == 0 { + removeItem(lids, i) + } + } + } + + return res +} + +func removeItem[V any](items []V, i int) []V { + last := len(items) - 1 + items[i] = items[last] + items = items[:last] + return items +} +*/ diff --git a/frac/active2/merge_iterator.go b/frac/active2/merge_iterator.go new file mode 100644 index 00000000..904735bd --- /dev/null +++ b/frac/active2/merge_iterator.go @@ -0,0 +1,97 @@ +package active2 + +import ( + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/tokenizer" +) + +// For compaction +type mergeIterator struct { + index *memIndex 
+ posIDs int + posField int + posToken int + posBlocks int + lastFieldToken int + newLIDs []uint32 + newBlocks []int +} + +func newIndexIterator(index *memIndex) mergeIterator { + return mergeIterator{ + index: index, + newLIDs: make([]uint32, len(index.ids)), + newBlocks: make([]int, len(index.blocksOffsets)), + lastFieldToken: int(index.fieldsTokens[string(index.fields[0])].count) - 1, + } +} + +func (iq *mergeIterator) ShiftID() bool { + iq.posIDs++ + if iq.posIDs == len(iq.index.ids) { + return false + } + return true +} + +func (iq *mergeIterator) CurrentID() seq.ID { + return iq.index.ids[iq.posIDs] +} + +func (iq *mergeIterator) ShiftToken() bool { + iq.posToken++ + if iq.posToken == len(iq.index.tokens) { + return false + } + if iq.posToken > iq.lastFieldToken { // need shift field + iq.posField++ + field := iq.index.fields[iq.posField] + r := iq.index.fieldsTokens[string(field)] + iq.lastFieldToken += int(r.count) - 1 + } + return true +} + +func (iq *mergeIterator) CurrentToken() tokenizer.MetaToken { + return tokenizer.MetaToken{ + Key: iq.index.fields[iq.posField], + Value: iq.index.tokens[iq.posToken], + } +} + +func (iq *mergeIterator) CurrentTokenLIDs() []uint32 { + src := iq.index.tokenLIDs[iq.posToken] + dst := make([]uint32, 0, len(src)) + for _, oldLid := range src { + dst = append(dst, iq.newLIDs[oldLid-1]+1) + } + return dst +} + +func (iq *mergeIterator) ShiftBlocksOffset() bool { + iq.posBlocks++ + if iq.posBlocks == len(iq.index.blocksOffsets) { + return false + } + return true +} + +func (iq *mergeIterator) CurrentBlocksOffset() uint64 { + return iq.index.blocksOffsets[iq.posBlocks] +} + +func (iq *mergeIterator) AddNewLID(lid uint32) { + iq.newLIDs = append(iq.newLIDs, lid) +} + +func (iq *mergeIterator) AddNewBlockIndex(blockIndex int) { + iq.newBlocks = append(iq.newBlocks, blockIndex) +} + +func (iq *mergeIterator) RepackDocPositions(dst []seq.DocPos) { + for lid, docPos := range iq.index.positions { + oldBlockIndex, docOffset := 
docPos.Unpack() + newBlockIndex := uint32(iq.newBlocks[oldBlockIndex]) + dst[lid] = seq.PackDocPos(newBlockIndex, docOffset) + } +} diff --git a/frac/active2/merge_manager.go b/frac/active2/merge_manager.go new file mode 100644 index 00000000..10d480d1 --- /dev/null +++ b/frac/active2/merge_manager.go @@ -0,0 +1,150 @@ +package active2 + +import ( + "sync" +) + +const ( + minIndexesToMerge = 4 // minimum number of indexes to trigger merge + forceMergeThreshold = 100 // index count threshold for forced merge + tierSizeDeltaPercent = 10 // percentage difference between size tiers + firstTierMaxSize = 100 // maximum size of the first tier + maxTierCount = 1000 // maximum number of size tiers allowed + bucketSizePercent = 50 // percentage difference between size buckets +) + +// MergeManager manages in-memory index collection and merging +type MergeManager struct { + mu sync.Mutex + wg sync.WaitGroup + + stopped bool + indexes *memIndexPool + + workers chan struct{} // semaphore to limit concurrent merge operations + mergeCh chan struct{} // channel to trigger merge process +} + +// newMergeManager creates a new index manager +func newMergeManager(maxConcurrentMerges int) *MergeManager { + m := MergeManager{ + workers: make(chan struct{}, maxConcurrentMerges), + mergeCh: make(chan struct{}, 1), + indexes: newIndexPool(), + } + + // todo + // Start background goroutine for merge scheduling + // go m.mergeScheduler() + return &m +} + +// Stop shuts down the index manager and waits for current operations to complete +func (m *MergeManager) Stop() { + m.mu.Lock() + defer m.mu.Unlock() + + m.stopped = true + + // Wait for all current merge operations to complete + m.wg.Wait() + close(m.mergeCh) +} + +// MergeAll performs full merge of all available indexes +func (m *MergeManager) MergeAll() *memIndex { + m.mu.Lock() + defer m.mu.Unlock() + + m.wg.Wait() + + indexesToMerge := m.indexes.ReadyToMerge() + mergedIndex := mergeIndexes(extractIndexes(indexesToMerge)) + 
m.indexes.replace(indexesToMerge, mergedIndex) + + return mergedIndex +} + +func extractIndexes(metadataList []memIndexExt) []*memIndex { + result := make([]*memIndex, 0, len(metadataList)) + for _, metadata := range metadataList { + result = append(result, metadata.index) + } + return result +} + +func (m *MergeManager) Indexes() []*memIndex { + return m.indexes.Indexes() +} + +func (m *MergeManager) Add(index *memIndex) { + m.indexes.Add(index) + m.triggerMerge() +} + +// prepareForMerging prepares index groups for merging +func (m *MergeManager) prepareForMerging() [][]memIndexExt { + m.mu.Lock() + defer m.mu.Unlock() + + if m.stopped { + return nil + } + + mergeCandidates := selectForMerge(m.indexes.ReadyToMerge(), minIndexesToMerge) + + for i, candidateGroup := range mergeCandidates { + if !m.acquireWorker() { // no free workers + mergeCandidates = mergeCandidates[:i] // truncate unprocessable tail + break + } + m.indexes.markAsMerging(candidateGroup) + } + + // Important: call Add() inside lock to prevent races during shutdown + m.wg.Add(len(mergeCandidates)) + + return mergeCandidates +} + +func (m *MergeManager) mergeScheduler() { + for range m.mergeCh { + for { + preparedGroups := m.prepareForMerging() + if len(preparedGroups) == 0 { + break + } + + for _, groupToMerge := range preparedGroups { + go func() { + mergedIndex := mergeIndexes(extractIndexes(groupToMerge)) + m.indexes.replace(groupToMerge, mergedIndex) + m.releaseWorker() + m.triggerMerge() // check if new merge is needed + m.wg.Done() + }() + } + } + } +} + +func (m *MergeManager) acquireWorker() bool { + select { + case m.workers <- struct{}{}: + return true + default: + return false + } +} + +func (m *MergeManager) releaseWorker() { + <-m.workers +} + +func (m *MergeManager) triggerMerge() { + select { + case m.mergeCh <- struct{}{}: + default: + // Trigger already set, no need for additional notification + } +} diff --git a/frac/active2/merge_strategy.go b/frac/active2/merge_strategy.go new 
file mode 100644 index 00000000..70da7f36 --- /dev/null +++ b/frac/active2/merge_strategy.go @@ -0,0 +1,178 @@ +package active2 + +import ( + "math" +) + +/* +ПРИНЦИП ВЫБОРА КАНДИДАТОВ ДЛЯ СЛИЯНИЯ + +1. ИСХОДНЫЕ ДАННЫЕ: + items (индексы) → сгруппированы по ТИРАМ (tiers) + + Пример: 10 индексов распределены по 7 тирам + + │ Tier 0 │ Tier 1 │ Tier 2 │ Tier 3 │ Tier 4 │ Tier 5 │ Tier 6 │ + ├────────┼────────┼────────┼────────┼────────┼────────┼────────┤ + │ 1 │ 2 │ 0 │ 3 │ 1 │ 2 │ 1 │ + └────────┴────────┴────────┴────────┴────────┴────────┴────────┘ + +2. ПОСТРОЕНИЕ РАСПРЕДЕЛЕНИЯ (buildTiersDistribution): + Считаем количество индексов в каждом тире + +3. ПОИСК ОКНА (mostPopulatedTiersRange): + Скользящее окно размером winSize (по умолчанию 2 тира) + + winSize = round(bucketSizePercent / tierSizeDeltaPercent) + Пример: 50% / 25% = 2 тира + + ┌─────────────────────────────────────────────────────┐ + │ Скользящее окно (размер = 2 тира) │ + ├─────────────────────────────────────────────────────┤ + │ Окно 1: │ Tier 0 + Tier 1 │ = 1 + 2 = 3 элементов │ + │ Окно 2: │ Tier 1 + Tier 2 │ = 2 + 0 = 2 элементов │ + │ Окно 3: │ Tier 2 + Tier 3 │ = 0 + 3 = 3 элементов │ + │ Окно 4: │ Tier 3 + Tier 4 │ = 3 + 1 = 4 элементов | ← max! + │ Окно 5: │ Tier 4 + Tier 5 │ = 1 + 2 = 3 элементов │ + │ Окно 6: │ Tier 5 + Tier 6 │ = 2 + 1 = 3 элементов │ + └─────────────────────────────────────────────────────┘ + + Найденное окно: Tier 3-4 с 4 элементами + Если элементов ≥ minToMerge → успех! + +4. ПРАВИЛА ВЫБОРА: + ┌─────────────────────────────────────────────────────┐ + │ Условие 1: элементов в окне ≥ minToMerge? │ + │ Да → берём это окно │ + │ Нет → переходим к условию 2 │ + ├─────────────────────────────────────────────────────┤ + │ Условие 2: findAtAnyCost = true? │ + │ (len(items) >= forceMergeThreshold) │ + │ Да → увеличиваем winSize в 2 раза │ + │ и ищем снова │ + │ Нет → возвращаем пустой результат │ + └─────────────────────────────────────────────────────┘ + +5. 
ВЫДЕЛЕНИЕ КАНДИДАТОВ (extractIndexesInRange): + Берём все индексы из найденного диапазона тиров + + Пример для окна Tier 3-4: + ┌─────────────────────────────────────────┐ + │ До: [1, 2, 0, 3, 1, 2, 1] │ + │ Выбор: ██ ██ │ + │ Результат: 3 элемента из Tier 3 │ + │ + 1 элемент из Tier 4 │ + │ = 4 элемента всего │ + └─────────────────────────────────────────┘ + +6. ПОВТОРЕНИЕ ПРОЦЕССА: + Удаляем выбранные элементы из распределения + Повторяем поиск, пока не останется окон + с достаточным количеством элементов + + ┌─────────────────────────────────────────┐ + │ 1-я итерация: выбрали Tier 3-4 (4 elem) │ + │ 2-я итерация: │ + │ Распределение: [1, 2, 0, 0, 0, 2, 1] │ + │ Находим новое окно... │ + └─────────────────────────────────────────┘ + +*/ + +// selectForMerge selects merge candidates based on their size. +// It groups items into sets within which the sizes of the items do not differ +// by more than a specified limit in percent (e.g. 50%) +func selectForMerge(items []memIndexExt, minToMerge int) [][]memIndexExt { + if len(items) < minToMerge { + return nil + } + + tiersDist := buildTiersDistribution(items) + findAtAnyCost := len(items) >= forceMergeThreshold + winSize := int(math.Round(float64(bucketSizePercent) / tierSizeDeltaPercent)) + + var res [][]memIndexExt + for { + countInRange, firstTier, lastTier := mostPopulatedTiersRange(tiersDist, minToMerge, winSize, findAtAnyCost) + if countInRange == 0 { + break + } + buf := make([]memIndexExt, 0, countInRange) + res = append(res, extractIndexesInRange(items, buf, firstTier, lastTier, tiersDist)) + } + return res +} + +func buildTiersDistribution(items []memIndexExt) []int { + lastTier := 0 + tiersDist := make([]int, maxTierCount) + for _, index := range items { + tiersDist[index.tier]++ + if index.tier > lastTier { + lastTier = index.tier + } + } + return tiersDist[:lastTier] +} + +func extractIndexesInRange(items, buf []memIndexExt, firstTier, lastTier int, tiersDist []int) []memIndexExt { + for _, index := 
range items { + if firstTier <= index.tier && index.tier <= lastTier { + buf = append(buf, index) + tiersDist[index.tier]-- + } + } + return buf +} + +func mostPopulatedTiersRange(tiersDist []int, minToMerge, winSize int, findAtAnyCost bool) (int, int, int) { + var lastWinTier, maxWinSum int + for { + lastWinTier, maxWinSum = findMaxSumWindow(tiersDist, winSize) + if maxWinSum >= minToMerge { // got it! + break + } + if findAtAnyCost { // expand window size and find again + // todo добавить логирования! + winSize *= 2 + continue + } + return 0, 0, 0 + } + + firstTier := max(0, lastWinTier-winSize) + lastTier := lastWinTier + + return maxWinSum, firstTier, lastTier +} + +// sliding window sum +type winSum struct { + buf []int + sum int + pos int +} + +func (w *winSum) Add(v int) { + w.sum += v - w.buf[w.pos] + w.buf[w.pos] = v + w.pos++ + if w.pos == len(w.buf) { + w.pos = 0 + } +} + +func findMaxSumWindow(tiersDist []int, winSize int) (int, int) { + maxWinSum := 0 + lastWinTier := 0 + win := winSum{buf: make([]int, winSize)} + + for tier, size := range tiersDist { + win.Add(size) + if win.sum >= maxWinSum { + lastWinTier = tier + maxWinSum = win.sum + } + } + return lastWinTier, maxWinSum +} diff --git a/frac/active2/metrics.go b/frac/active2/metrics.go new file mode 100644 index 00000000..dac3fd68 --- /dev/null +++ b/frac/active2/metrics.go @@ -0,0 +1,42 @@ +package active2 + +import ( + "github.com/ozontech/seq-db/metric" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + fetcherStagesSec = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "fetcher", + Name: "active_stages_seconds", + Buckets: metric.SecondsBuckets, + }, []string{"stage"}) + + searchAggSec = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "search", + Name: "tracer_active_agg_search_sec", + Buckets: metric.SecondsBuckets, + }, 
[]string{"stage"}) + searchHstSec = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "search", + Name: "tracer_active_hist_search_sec", + Buckets: metric.SecondsBuckets, + }, []string{"stage"}) + searchSimpleSec = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "search", + Name: "tracer_active_reg_search_sec", + Buckets: metric.SecondsBuckets, + }, []string{"stage"}) + + bulkStagesSeconds = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "bulk", + Name: "stages_seconds2", + Buckets: metric.SecondsBuckets, + }, []string{"stage"}) +) diff --git a/frac/active2/sealing_source.go b/frac/active2/sealing_source.go new file mode 100644 index 00000000..fb9d09a3 --- /dev/null +++ b/frac/active2/sealing_source.go @@ -0,0 +1,121 @@ +package active2 + +/* +type SealingSource struct { + info *common.Info + created time.Time + index *memIndex + lastErr error +} + +func NewSealingSource(active *Active2, params common.SealParams) (sealing.Source, error) { + info := *active.info // copy + src := SealingSource{ + info: &info, + created: time.Now(), + index: active.indexes.MergeAll(), + } + src.prepareInfo() + + if !active.Config.SkipSortDocs { + sortedSrc, err := frac.NewSortedSealingSource(&src, &active.sortReader, params) + if err != nil { + return nil, err + } + return sortedSrc, nil + } + + return &src, nil +} + +func (src *SealingSource) prepareInfo() { + src.info.MetaOnDisk = 0 + src.info.SealingTime = uint64(src.created.UnixMilli()) + src.info.BuildDistribution(func(yield func(seq.ID) bool) { + for _, id := range src.index.ids { + if !yield(id) { + return + } + } + }) +} + +func (src *SealingSource) Info() *common.Info { + return src.info +} + +func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { + return func(yield func([]seq.ID, []seq.DocPos) bool) { + + ids := make([]seq.ID, 0, blockSize) + pos := 
make([]seq.DocPos, 0, blockSize) + + // first + ids = append(ids, frac.SystemSeqID) // todo; get rid of SystemSeqID in index format + pos = append(pos, 0) + + for _, id := range src.index.ids { + if len(ids) == blockSize { + if !yield(ids, pos) { + return + } + ids = ids[:0] + pos = pos[:0] + } + ids = append(ids, id) + pos = append(pos, src.index.positions[id]) + } + yield(ids, pos) + } +} + +func (src *SealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { + const uint32Size = int(unsafe.Sizeof(uint32(0))) + return func(yield func([][]byte) bool) { + actualSize := 0 + block := make([][]byte, 0, blockSize) + for _, token := range src.index.tokens { + if actualSize >= blockSize { + if !yield(block) { + return + } + actualSize = 0 + block = block[:0] + } + actualSize += len(token) + uint32Size + block = append(block, token) + } + yield(block) + } +} + +func (src *SealingSource) Fields() iter.Seq2[string, uint32] { + return func(yield func(string, uint32) bool) { + for _, field := range src.index.fields { + f := util.ByteToStringUnsafe(field) + r := src.index.fieldsTokens[f] + if !yield(f, r.start+r.count) { + return + } + } + } +} + +func (src *SealingSource) TokenLIDs() iter.Seq[[]uint32] { + return func(yield func([]uint32) bool) { + for _, tokenLIDs := range src.index.tokenLIDs { + if !yield(tokenLIDs) { + return + } + } + } +} + +func (src *SealingSource) BlocksOffsets() []uint64 { + return src.index.blocksOffsets +} + +func (ss *SealingSource) LastError() error { + return ss.lastErr +} +*/ diff --git a/frac/active2/tiers.go b/frac/active2/tiers.go new file mode 100644 index 00000000..b0fa6f17 --- /dev/null +++ b/frac/active2/tiers.go @@ -0,0 +1,80 @@ +package active2 + +import ( + "math" +) + +// sizeTiers splits the entire space of integers into successive ranges [A(n) ; B(n)] where: +// - A(n+1) = B(n) + 1 +// - (B(n) - A(n)) / A(n) ~ deltaPercent +// +// Example: for newSizeTiers(100, 200, 10) we will have: +// +// Tier 0: [ 0; 100 ] delta: 0.00% +// 
Tier 1: [ 101; 106 ] delta: 6.00% +// Tier 2: [ 107; 117 ] delta: 10.00% +// Tier 3: [ 118; 129 ] delta: 10.00% +// Tier 4: [ 130; 142 ] delta: 10.00% +// Tier 5: [ 143; 156 ] delta: 10.00% +// Tier 6: [ 157; 171 ] delta: 10.00% +// Tier 7: [ 172; 189 ] delta: 10.00% +// Tier 8: [ 190; 207 ] delta: 9.00% +// Tier 9: [ 208; 228 ] delta: 10.00% +// Tier 10: [ 229; 251 ] delta: 10.00% +// Tier 11: [ 252; 276 ] delta: 10.00% +// Tier 12: [ 277; 304 ] delta: 10.00% +// Tier 13: [ 305; 334 ] delta: 10.00% +// Tier 14: [ 335; 368 ] delta: 10.00% +// Tier 15: [ 369; 405 ] delta: 10.00% +// Tier 16: [ 406; 445 ] delta: 10.00% +// Tier 17: [ 446; 490 ] delta: 10.00% +// Tier 18: [ 491; 539 ] delta: 10.00% +// Tier 19: [ 540; 593 ] delta: 10.00% +// Tier 20: [ 594; 652 ] delta: 10.00% +// Tier 21: [ 653; 717 ] delta: 10.00% +// Tier 22: [ 718; 789 ] delta: 10.00% +// Tier 23: [ 790; 868 ] delta: 10.00% +// Tier 24: [ 869; 955 ] delta: 10.00% +// Tier 25: [ 956; 1051 ] delta: 10.00% +// Tier 26: [ 1052; 1156 ] delta: 10.00% +// +// etc. +// +// So, sizeTiers returns us the tier (the number of range) for any integer value. +type sizeTiers struct { + firstMax uint32 + maxTier int + deltaK float64 + offset float64 +} + +// newSizeTiers creates a calculator that determines the ordinal number of the range (size tier) a given integer value falls into. +// +// Parameters: +// +// firstMax - The upper bound of the initial (first) range. Range #0 is always [0, firstMax]. +// maxTier - The maximum number of ranges (tiers) to create. For the last range (#maxTier), +// the upper bound is considered to be positive infinity (+inf). +// deltaPercent - The defined growth percentage. The ratio of the difference between the upper bounds +// of two adjacent ranges to the lower bound of the next range is approximately equal +// to this value. 
Formula for tier #n: (B_n - B_{n-1}) / B_{n-1} ≈ deltaPercent / 100 +func newSizeTiers(firstMax uint32, maxTier, deltaPercent int) sizeTiers { + deltaK := 1 / math.Log(1+float64(deltaPercent)/100) + return sizeTiers{ + maxTier: maxTier, + firstMax: firstMax, + deltaK: deltaK, + offset: math.Floor(deltaK*(math.Log(float64(firstMax)))) - 1, + } +} + +func (t sizeTiers) Calc(size uint32) int { + if size <= t.firstMax { + return 0 + } + tier := int(math.Floor(t.deltaK*(math.Log(float64(size)))) - t.offset) + if tier > t.maxTier { + return t.maxTier + } + return tier +} diff --git a/frac/active_writer.go b/frac/active_writer.go deleted file mode 100644 index 95cf2fdd..00000000 --- a/frac/active_writer.go +++ /dev/null @@ -1,44 +0,0 @@ -package frac - -import ( - "os" - - "github.com/ozontech/seq-db/metric/stopwatch" - "github.com/ozontech/seq-db/storage" -) - -type ActiveWriter struct { - docs *FileWriter - meta *FileWriter -} - -func NewActiveWriter(docsFile, metaFile *os.File, docsOffset, metaOffset int64, skipFsync bool) *ActiveWriter { - return &ActiveWriter{ - docs: NewFileWriter(docsFile, docsOffset, skipFsync), - meta: NewFileWriter(metaFile, metaOffset, skipFsync), - } -} - -func (a *ActiveWriter) Write(docs, meta []byte, sw *stopwatch.Stopwatch) error { - m := sw.Start("write_docs") - offset, err := a.docs.Write(docs, sw) - m.Stop() - - if err != nil { - return err - } - - storage.DocBlock(meta).SetExt1(uint64(len(docs))) - storage.DocBlock(meta).SetExt2(uint64(offset)) - - m = sw.Start("write_meta") - _, err = a.meta.Write(meta, sw) - m.Stop() - - return err -} - -func (a *ActiveWriter) Stop() { - a.docs.Stop() - a.meta.Stop() -} diff --git a/frac/fraction.go b/frac/fraction.go index 9e80c838..1cf74f16 100644 --- a/frac/fraction.go +++ b/frac/fraction.go @@ -9,14 +9,13 @@ import ( "github.com/prometheus/client_golang/prometheus/promauto" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" 
"github.com/ozontech/seq-db/metric" "github.com/ozontech/seq-db/seq" ) type Fraction interface { - Info() *common.Info + Info() *Info IsIntersecting(from seq.MID, to seq.MID) bool Contains(mid seq.MID) bool Fetch(context.Context, []seq.ID) ([][]byte, error) @@ -24,7 +23,7 @@ type Fraction interface { } var ( - fetcherStagesSeconds = promauto.NewHistogramVec(prometheus.HistogramOpts{ + FetcherStagesSeconds = promauto.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "seq_db_store", Subsystem: "fetcher", Name: "fraction_stages_seconds", @@ -50,7 +49,7 @@ var ( }, []string{"stage", "fraction_type"}) ) -func fractionSearchMetric( +func FractionSearchMetric( params processor.SearchParams, ) *prometheus.HistogramVec { if params.HasAgg() { @@ -62,7 +61,7 @@ func fractionSearchMetric( return fractionRegSearchSec } -func fracToString(f Fraction, fracType string) string { +func FracToString(f Fraction, fracType string) string { info := f.Info() s := fmt.Sprintf( "%s fraction name=%s, creation time=%s, from=%s, to=%s, %s", diff --git a/frac/common/info.go b/frac/info.go similarity index 99% rename from frac/common/info.go rename to frac/info.go index ad3c3a57..093b6e1c 100644 --- a/frac/common/info.go +++ b/frac/info.go @@ -1,4 +1,4 @@ -package common +package frac import ( "fmt" diff --git a/frac/common/seal_params.go b/frac/seal_params.go similarity index 95% rename from frac/common/seal_params.go rename to frac/seal_params.go index c19365f9..b0f9d462 100644 --- a/frac/common/seal_params.go +++ b/frac/seal_params.go @@ -1,4 +1,4 @@ -package common +package frac type SealParams struct { IDsZstdLevel int diff --git a/frac/sealed/block_info.go b/frac/sealed/block_info.go index 8436f91e..a79fdffa 100644 --- a/frac/sealed/block_info.go +++ b/frac/sealed/block_info.go @@ -6,14 +6,14 @@ import ( "go.uber.org/zap" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/logger" ) const seqDBMagic = "SEQM" type BlockInfo struct { 
- Info *common.Info + Info *frac.Info } func (b *BlockInfo) Pack(buf []byte) []byte { @@ -33,7 +33,7 @@ func (b *BlockInfo) Unpack(data []byte) error { return errors.New("seq-db index file header corrupted") } - b.Info = &common.Info{} + b.Info = &frac.Info{} if err := json.Unmarshal(data[4:], b.Info); err != nil { return errors.New("stats unmarshaling error") } diff --git a/frac/sealed_index.go b/frac/sealed/index.go similarity index 98% rename from frac/sealed_index.go rename to frac/sealed/index.go index f97c6e84..8c6c1777 100644 --- a/frac/sealed_index.go +++ b/frac/sealed/index.go @@ -1,4 +1,4 @@ -package frac +package sealed import ( "context" @@ -7,7 +7,7 @@ import ( "go.uber.org/zap" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" @@ -24,8 +24,8 @@ import ( type sealedDataProvider struct { ctx context.Context - info *common.Info - config *Config + info *frac.Info + config *frac.Config idsTable *seqids.Table idsProvider *seqids.Provider @@ -85,7 +85,7 @@ func (dp *sealedDataProvider) Fetch(ids []seq.ID) ([][]byte, error) { sw := stopwatch.New() defer sw.Export( - fetcherStagesSeconds, + frac.FetcherStagesSeconds, stopwatch.SetLabel("fraction_type", dp.fractionTypeLabel), ) @@ -107,7 +107,7 @@ func (dp *sealedDataProvider) Search(params processor.SearchParams) (*seq.QPR, e sw := stopwatch.New() defer sw.Export( - fractionSearchMetric(params), + frac.FractionSearchMetric(params), stopwatch.SetLabel("fraction_type", dp.fractionTypeLabel), ) diff --git a/frac/index_cache.go b/frac/sealed/index_cache.go similarity index 97% rename from frac/index_cache.go rename to frac/sealed/index_cache.go index 3dc5b3a4..19adeddb 100644 --- a/frac/index_cache.go +++ b/frac/sealed/index_cache.go @@ -1,4 +1,4 @@ -package frac +package sealed import ( "github.com/ozontech/seq-db/cache" diff --git 
a/frac/sealed_loader.go b/frac/sealed/loader.go similarity index 93% rename from frac/sealed_loader.go rename to frac/sealed/loader.go index 83a7f060..1b332a44 100644 --- a/frac/sealed_loader.go +++ b/frac/sealed/loader.go @@ -1,12 +1,11 @@ -package frac +package sealed import ( "time" "go.uber.org/zap" - "github.com/ozontech/seq-db/frac/common" - "github.com/ozontech/seq-db/frac/sealed" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/logger" @@ -21,7 +20,7 @@ type Loader struct { blockBuf []byte } -func (l *Loader) Load(blocksData *sealed.BlocksData, info *common.Info, indexReader *storage.IndexReader) { +func (l *Loader) Load(blocksData *BlocksData, info *frac.Info, indexReader *storage.IndexReader) { t := time.Now() l.reader = indexReader @@ -78,7 +77,7 @@ func (l *Loader) loadIDs() (idsTable seqids.Table, blocksOffsets []uint64, err e return idsTable, nil, err } - b := sealed.BlockOffsets{} + b := BlockOffsets{} if err := b.Unpack(result); err != nil { return idsTable, nil, err } diff --git a/frac/sealed/preloaded_data.go b/frac/sealed/preloaded_data.go index 1b43b865..76442c94 100644 --- a/frac/sealed/preloaded_data.go +++ b/frac/sealed/preloaded_data.go @@ -1,14 +1,14 @@ package sealed import ( - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" ) type PreloadedData struct { - Info *common.Info + Info *frac.Info BlocksData BlocksData TokenTable token.Table } diff --git a/frac/remote.go b/frac/sealed/remote.go similarity index 95% rename from frac/remote.go rename to frac/sealed/remote.go index c2088caa..e4ad8584 100644 --- a/frac/remote.go +++ b/frac/sealed/remote.go @@ -1,4 +1,4 @@ -package frac +package sealed import ( "context" @@ -10,9 +10,8 @@ import ( 
"github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" @@ -24,7 +23,7 @@ import ( ) var ( - _ Fraction = (*Remote)(nil) + _ frac.Fraction = (*Remote)(nil) ) // Remote fraction is a fraction that is backed by remote storage. @@ -35,11 +34,11 @@ var ( type Remote struct { ctx context.Context - Config *Config + Config *frac.Config BaseFileName string - info *common.Info + info *frac.Info docsFile storage.ImmutableFile docsCache *cache.Cache[[]byte] @@ -51,7 +50,7 @@ type Remote struct { loadMu *sync.RWMutex isLoaded bool - blocksData sealed.BlocksData + blocksData BlocksData s3cli *s3.Client readLimiter *storage.ReadLimiter @@ -63,8 +62,8 @@ func NewRemote( readLimiter *storage.ReadLimiter, indexCache *IndexCache, docsCache *cache.Cache[[]byte], - info *common.Info, - config *Config, + info *frac.Info, + config *frac.Config, s3cli *s3.Client, ) *Remote { f := &Remote{ @@ -163,7 +162,7 @@ func (f *Remote) createDataProvider(ctx context.Context) (*sealedDataProvider, e }, nil } -func (f *Remote) Info() *common.Info { +func (f *Remote) Info() *frac.Info { return f.info } @@ -194,7 +193,7 @@ func (f *Remote) Suicide() { } func (f *Remote) String() string { - return fracToString(f, "remote") + return frac.FracToString(f, "remote") } func (f *Remote) load() error { diff --git a/frac/sealed.go b/frac/sealed/sealed.go similarity index 94% rename from frac/sealed.go rename to frac/sealed/sealed.go index b3de2e82..af4f328f 100644 --- a/frac/sealed.go +++ b/frac/sealed/sealed.go @@ -1,4 +1,4 @@ -package frac +package sealed import ( "context" @@ -12,9 +12,8 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/consts" - 
"github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" @@ -25,15 +24,15 @@ import ( ) var ( - _ Fraction = (*Sealed)(nil) + _ frac.Fraction = (*Sealed)(nil) ) type Sealed struct { - Config *Config + Config *frac.Config BaseFileName string - info *common.Info + info *frac.Info docsFile *os.File docsCache *cache.Cache[[]byte] @@ -45,7 +44,7 @@ type Sealed struct { loadMu *sync.RWMutex isLoaded bool - blocksData sealed.BlocksData + blocksData BlocksData readLimiter *storage.ReadLimiter @@ -61,13 +60,13 @@ const ( HalfRemove ) -func NewSealed( +func New( baseFile string, readLimiter *storage.ReadLimiter, indexCache *IndexCache, docsCache *cache.Cache[[]byte], - info *common.Info, - config *Config, + info *frac.Info, + config *frac.Config, ) *Sealed { f := &Sealed{ loadMu: &sync.RWMutex{}, @@ -123,13 +122,13 @@ func (f *Sealed) openDocs() { } } -func NewSealedPreloaded( +func NewPreloaded( baseFile string, - preloaded *sealed.PreloadedData, + preloaded *PreloadedData, rl *storage.ReadLimiter, indexCache *IndexCache, docsCache *cache.Cache[[]byte], - config *Config, + config *frac.Config, ) *Sealed { f := &Sealed{ blocksData: preloaded.BlocksData, @@ -296,7 +295,7 @@ func (f *Sealed) Suicide() { } func (f *Sealed) String() string { - return fracToString(f, "sealed") + return frac.FracToString(f, "sealed") } func (f *Sealed) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { @@ -340,7 +339,7 @@ func (f *Sealed) createDataProvider(ctx context.Context) *sealedDataProvider { } } -func (f *Sealed) Info() *common.Info { +func (f *Sealed) Info() *frac.Info { return f.info } @@ -352,10 +351,7 @@ func (f *Sealed) IsIntersecting(from, to seq.MID) bool { return f.info.IsIntersecting(from, to) } -func loadHeader( - indexFile 
storage.ImmutableFile, - indexReader storage.IndexReader, -) *common.Info { +func loadHeader(indexFile storage.ImmutableFile, indexReader storage.IndexReader) *frac.Info { block, _, err := indexReader.ReadIndexBlock(0, nil) if err != nil { logger.Fatal( @@ -365,7 +361,7 @@ func loadHeader( ) } - var bi sealed.BlockInfo + var bi BlockInfo if err := bi.Unpack(block); err != nil { logger.Fatal( "error unpacking info block", diff --git a/frac/sealed/sealing/blocks_builder_test.go b/frac/sealed/sealing/blocks_builder_test.go index 80892ca2..8205f707 100644 --- a/frac/sealed/sealing/blocks_builder_test.go +++ b/frac/sealed/sealing/blocks_builder_test.go @@ -7,14 +7,14 @@ import ( "github.com/stretchr/testify/assert" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/seq" ) type mockSource struct { - info common.Info + info frac.Info tokens [][]byte fields []string fieldMaxTIDs []uint32 @@ -25,7 +25,7 @@ type mockSource struct { lastError error } -func (m *mockSource) Info() common.Info { return m.info } +func (m *mockSource) Info() frac.Info { return m.info } func (m *mockSource) Fields() iter.Seq2[string, uint32] { return func(yield func(string, uint32) bool) { diff --git a/frac/sealed/sealing/index.go b/frac/sealed/sealing/index.go index 491c7233..871de9ef 100644 --- a/frac/sealed/sealing/index.go +++ b/frac/sealed/sealing/index.go @@ -11,7 +11,7 @@ import ( "github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" @@ -29,10 +29,10 @@ import ( // - Registry for quick access to block locations // - PreloadedData structures for fast initialization instance of sealed fraction type IndexSealer struct 
{ - lastErr error // Last error encountered during processing - buf1 []byte // Reusable buffer for packing raw data before compression - buf2 []byte // Reusable buffer for compressed data - params common.SealParams // Configuration parameters for sealing process + lastErr error // Last error encountered during processing + buf1 []byte // Reusable buffer for packing raw data before compression + buf2 []byte // Reusable buffer for compressed data + params frac.SealParams // Configuration parameters for sealing process // PreloadedData structures built during sealing for fast initialization of sealed fraction idsTable seqids.Table // Table mapping document IDs to blocks @@ -41,7 +41,7 @@ type IndexSealer struct { } // NewIndexSealer creates a new IndexSealer instance with the given parameters. -func NewIndexSealer(params common.SealParams) *IndexSealer { +func NewIndexSealer(params frac.SealParams) *IndexSealer { return &IndexSealer{ params: params, buf1: make([]byte, 0, consts.RegularBlockSize), diff --git a/frac/sealed/sealing/sealer.go b/frac/sealed/sealing/sealer.go index 3eb00761..b7444ab2 100644 --- a/frac/sealed/sealing/sealer.go +++ b/frac/sealed/sealing/sealer.go @@ -7,7 +7,7 @@ import ( "path/filepath" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/util" @@ -16,7 +16,7 @@ import ( // Source interface defines the contract for data sources that can be sealed. // Provides access to all necessary data components for index creation. 
type Source interface { - Info() *common.Info // Fraction metadata information + Info() *frac.Info // Fraction metadata information IDsBlocks(size int) iter.Seq2[[]seq.ID, []seq.DocPos] // Ordered sequence of document IDs and their positions, divided into blocks TokenBlocks(size int) iter.Seq[[][]byte] // Ordered sequence of tokens divided into blocks Fields() iter.Seq2[string, uint32] // Ordered sequence of fields with their max field's TID value @@ -39,7 +39,7 @@ type Source interface { // Returns: // - *sealed.PreloadedData: Preloaded data structures for initialization of sealed fraction // - error: Any error encountered during the sealing process -func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { +func Seal(src Source, params frac.SealParams) (*sealed.PreloadedData, error) { info := src.Info() // Validate that we're not sealing an empty fraction diff --git a/frac/fraction_test.go b/frac/tests/fraction_test.go similarity index 90% rename from frac/fraction_test.go rename to frac/tests/fraction_test.go index f1091192..cee4d8c9 100644 --- a/frac/fraction_test.go +++ b/frac/tests/fraction_test.go @@ -1,4 +1,4 @@ -package frac +package tests import ( "context" @@ -20,8 +20,11 @@ import ( "github.com/stretchr/testify/suite" "github.com/ozontech/seq-db/cache" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active2" "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/frac/sealed/seqids" @@ -36,21 +39,23 @@ import ( type FractionTestSuite struct { suite.Suite - tmpDir string - config *Config - mapping seq.Mapping - tokenizers map[seq.TokenizerType]tokenizer.Tokenizer - activeIndexer *ActiveIndexer - stopIndexer func() - sealParams common.SealParams + tmpDir string + config 
*frac.Config + mapping seq.Mapping + tokenizers map[seq.TokenizerType]tokenizer.Tokenizer + activeIndexer *active.Indexer + activeIndexer2 *active2.Indexer + stopIndexer func() + sealParams frac.SealParams - fraction Fraction + fraction frac.Fraction insertDocuments func(docs ...[]string) } func (s *FractionTestSuite) SetupSuiteCommon() { - s.activeIndexer, s.stopIndexer = NewActiveIndexer(4, 10) + s.activeIndexer, s.stopIndexer = active.NewIndexer(4, 10) + s.activeIndexer2 = active2.NewIndexer(4) } func (s *FractionTestSuite) TearDownSuiteCommon() { @@ -58,7 +63,7 @@ func (s *FractionTestSuite) TearDownSuiteCommon() { } func (s *FractionTestSuite) SetupTestCommon() { - s.config = &Config{} + s.config = &frac.Config{} s.tokenizers = map[seq.TokenizerType]tokenizer.Tokenizer{ seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(20, false, true), seq.TokenizerTypeText: tokenizer.NewTextTokenizer(20, false, true, 100), @@ -80,7 +85,7 @@ func (s *FractionTestSuite) SetupTestCommon() { "spans.span_id": seq.NewSingleType(seq.TokenizerTypeKeyword, "", 0), "v": seq.NewSingleType(seq.TokenizerTypeKeyword, "", 0), } - s.sealParams = common.SealParams{ + s.sealParams = frac.SealParams{ IDsZstdLevel: 1, LIDsZstdLevel: 1, TokenListZstdLevel: 1, @@ -1045,23 +1050,36 @@ func (s *FractionTestSuite) TestFractionInfo() { // these checks should not break without a reason // but if compression/marshalling has changed, expected values can be updated accordingly s.Require().Equal(uint32(5), info.DocsTotal, "doc total doesn't match") - // it varies depending on params and docs shuffled - s.Require().True(info.DocsOnDisk > uint64(200) && info.DocsOnDisk < uint64(300), - "doc on disk doesn't match. 
actual value: %d", info.DocsOnDisk) s.Require().Equal(uint64(583), info.DocsRaw, "doc raw doesn't match") s.Require().Equal(seq.MID(946731625000), info.From, "from doesn't match") s.Require().Equal(seq.MID(946731654000), info.To, "to doesn't match") switch s.fraction.(type) { - case *Active: - s.Require().True(info.MetaOnDisk >= uint64(250) && info.MetaOnDisk <= uint64(350), + case *active.Active: + // it varies depending on params and docs shuffled + s.Require().True(info.DocsOnDisk > uint64(350) && info.DocsOnDisk < uint64(400), + "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) + s.Require().True(info.MetaOnDisk >= uint64(350) && info.MetaOnDisk <= uint64(450), "meta on disk doesn't match. actual value: %d", info.MetaOnDisk) s.Require().Equal(uint64(0), info.IndexOnDisk, "index on disk doesn't match") - case *Sealed: + case *active2.Active2: + // it varies depending on params and docs shuffled + s.Require().True(info.DocsOnDisk > uint64(350) && info.DocsOnDisk < uint64(400), + "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) + s.Require().True(info.MetaOnDisk >= uint64(350) && info.MetaOnDisk <= uint64(450), + "meta on disk doesn't match. actual value: %d", info.MetaOnDisk) + s.Require().Equal(uint64(0), info.IndexOnDisk, "index on disk doesn't match") + case *sealed.Sealed: + // it varies depending on params and docs shuffled and docs sorting + s.Require().True(info.DocsOnDisk > uint64(200) && info.DocsOnDisk < uint64(300), + "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") s.Require().True(info.IndexOnDisk > uint64(1400) && info.IndexOnDisk < uint64(1600), "index on disk doesn't match. 
actual value: %d", info.MetaOnDisk) - case *Remote: + case *sealed.Remote: + // it varies depending on params and docs shuffled and docs sorting + s.Require().True(info.DocsOnDisk > uint64(200) && info.DocsOnDisk < uint64(300), + "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") s.Require().True(info.IndexOnDisk > uint64(1400) && info.IndexOnDisk < uint64(1500), "index on disk doesn't match. actual value: %d", info.MetaOnDisk) @@ -1193,7 +1211,7 @@ func (s *FractionTestSuite) AssertSearchWithSearchParams( for i, fetchedDoc := range fetchedDocs { expectedDoc := originalDocs[expectedIndexes[i]] - s.Require().Equal(expectedDoc, fetchedDoc, "doc at index %d doesn't match", i) + s.Require().Equal(expectedDoc, fetchedDoc, "doc at index %d doesn't match (order %d)", i, order) } } } @@ -1252,9 +1270,9 @@ func (s *FractionTestSuite) AssertHist( } } -func (s *FractionTestSuite) newActive(bulks ...[]string) *Active { +func (s *FractionTestSuite) newActive(bulks ...[]string) *active.Active { baseName := filepath.Join(s.tmpDir, "test_fraction") - active := NewActive( + a := active.New( baseName, s.activeIndexer, storage.NewReadLimiter(1, nil), @@ -1263,50 +1281,66 @@ func (s *FractionTestSuite) newActive(bulks ...[]string) *Active { s.config, ) - var wg sync.WaitGroup + s.AppendBulks(a, bulks...) 
- for _, docs := range bulks { - docsCopy := slices.Clone(docs) - rand.Shuffle(len(docsCopy), func(i, j int) { - docsCopy[i], docsCopy[j] = docsCopy[j], docsCopy[i] - }) + return a +} + +type appender interface { + Append([]byte, []byte, *sync.WaitGroup) error +} - idx := 0 - readNext := func() ([]byte, error) { - if idx >= len(docsCopy) { - return nil, nil +func (s *FractionTestSuite) AppendBulks(a appender, bulks ...[]string) { + var wg sync.WaitGroup + + for _, bulk := range bulks { + bulkSize := (len(bulk)-1)/2 + 1 + for len(bulk) > 0 { + l := min(bulkSize, len(bulk)) + docs := bulk[:l] + bulk = bulk[l:] + + docsCopy := slices.Clone(docs) + rand.Shuffle(len(docsCopy), func(i, j int) { + docsCopy[i], docsCopy[j] = docsCopy[j], docsCopy[i] + }) + + idx := 0 + readNext := func() ([]byte, error) { + if idx >= len(docsCopy) { + return nil, nil + } + d := []byte(docsCopy[idx]) + idx++ + return d, nil } - d := []byte(docsCopy[idx]) - idx++ - return d, nil - } - proc := indexer.NewProcessor(s.mapping, s.tokenizers, 0, 0, 0) - compressor := indexer.GetDocsMetasCompressor(3, 3) - _, binaryDocs, binaryMeta, err := proc.ProcessBulk(time.Now(), nil, nil, readNext) - s.Require().NoError(err, "processing bulk failed") + proc := indexer.NewProcessor(s.mapping, s.tokenizers, 0, 0, 0) + compressor := indexer.GetDocsMetasCompressor(3, 3) + _, binaryDocs, binaryMeta, err := proc.ProcessBulk(time.Now(), nil, nil, readNext) + s.Require().NoError(err, "processing bulk failed") - compressor.CompressDocsAndMetas(binaryDocs, binaryMeta) - docsBlock, metasBlock := compressor.DocsMetas() + compressor.CompressDocsAndMetas(binaryDocs, binaryMeta) + docsBlock, metasBlock := compressor.DocsMetas() - wg.Add(1) - err = active.Append(docsBlock, metasBlock, &wg) - s.Require().NoError(err, "append to active failed") + wg.Add(1) + err = a.Append(docsBlock, metasBlock, &wg) + s.Require().NoError(err, "append to active failed") + } } wg.Wait() - return active } -func (s *FractionTestSuite) 
newSealed(bulks ...[]string) *Sealed { - active := s.newActive(bulks...) +func (s *FractionTestSuite) newSealed(bulks ...[]string) *sealed.Sealed { + a := s.newActive(bulks...) - activeSealingSource, err := NewActiveSealingSource(active, s.sealParams) + activeSealingSource, err := active.NewSealingSource(a, s.sealParams) s.Require().NoError(err, "Sealing source creation failed") preloaded, err := sealing.Seal(activeSealingSource, s.sealParams) s.Require().NoError(err, "Sealing failed") - indexCache := &IndexCache{ + indexCache := &sealed.IndexCache{ MIDs: cache.NewCache[[]byte](nil, nil), RIDs: cache.NewCache[[]byte](nil, nil), Params: cache.NewCache[seqids.BlockParams](nil, nil), @@ -1316,21 +1350,19 @@ func (s *FractionTestSuite) newSealed(bulks ...[]string) *Sealed { Registry: cache.NewCache[[]byte](nil, nil), } - sealed := NewSealedPreloaded( - active.BaseFileName, + f := sealed.NewPreloaded( + a.BaseFileName, preloaded, storage.NewReadLimiter(1, nil), indexCache, cache.NewCache[[]byte](nil, nil), s.config, ) - active.Release() - return sealed + a.Release() + return f } -/* -ActiveFractionTestSuite run tests for active fraction -*/ +// ActiveFractionTestSuite run tests for active fraction type ActiveFractionTestSuite struct { FractionTestSuite } @@ -1351,8 +1383,8 @@ func (s *ActiveFractionTestSuite) SetupTest() { } func (s *ActiveFractionTestSuite) TearDownTest() { - if active, ok := s.fraction.(*Active); ok { - active.Release() + if a, ok := s.fraction.(*active.Active); ok { + a.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Active type") } @@ -1364,9 +1396,7 @@ func (s *ActiveFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -/* -ActiveReplayedFractionTestSuite run tests for active fraction which was replayed from meta and docs file on disk -*/ +// ActiveReplayedFractionTestSuite run tests for active fraction which was replayed from meta and docs file on disk type ActiveReplayedFractionTestSuite struct { FractionTestSuite } 
@@ -1389,24 +1419,24 @@ func (s *ActiveReplayedFractionTestSuite) SetupTest() { } } -func (s *ActiveReplayedFractionTestSuite) Replay(frac *Active) Fraction { - fracFileName := frac.BaseFileName - frac.Release() - replayedFrac := NewActive( +func (s *ActiveReplayedFractionTestSuite) Replay(f *active.Active) frac.Fraction { + fracFileName := f.BaseFileName + f.Release() + replayedFrac := active.New( fracFileName, s.activeIndexer, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &Config{}) + &frac.Config{}) err := replayedFrac.Replay(context.Background()) s.Require().NoError(err, "replay failed") return replayedFrac } func (s *ActiveReplayedFractionTestSuite) TearDownTest() { - if active, ok := s.fraction.(*Active); ok { - active.Release() + if f, ok := s.fraction.(*active.Active); ok { + f.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Active type") } @@ -1417,9 +1447,7 @@ func (s *ActiveReplayedFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -/* -SealedFractionTestSuite run tests for sealed fraction. Active fraction is created first and then sealed. -*/ +// SealedFractionTestSuite run tests for sealed fraction. Active fraction is created first and then sealed. type SealedFractionTestSuite struct { FractionTestSuite } @@ -1440,8 +1468,8 @@ func (s *SealedFractionTestSuite) SetupTest() { } func (s *SealedFractionTestSuite) TearDownTest() { - if sealed, ok := s.fraction.(*Sealed); ok { - sealed.Release() + if f, ok := s.fraction.(*sealed.Sealed); ok { + f.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Sealed type") } @@ -1452,10 +1480,8 @@ func (s *SealedFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -/* -SealedLoadedFractionTestSuite run tests for sealed fraction. Active fraction is created first and then sealed. -Sealed fraction is then loaded with sealed.NewSealed call -*/ +// SealedLoadedFractionTestSuite run tests for sealed fraction. 
Active fraction is created first and then sealed. +// Sealed fraction is then loaded with sealed.NewSealed call type SealedLoadedFractionTestSuite struct { FractionTestSuite } @@ -1476,8 +1502,8 @@ func (s *SealedLoadedFractionTestSuite) SetupTest() { } func (s *SealedLoadedFractionTestSuite) TearDownTest() { - if sealed, ok := s.fraction.(*Sealed); ok { - sealed.Release() + if f, ok := s.fraction.(*sealed.Sealed); ok { + f.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Sealed type") } @@ -1488,11 +1514,11 @@ func (s *SealedLoadedFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *Sealed { - sealed := s.newSealed(bulks...) - sealed.Release() +func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *sealed.Sealed { + f := s.newSealed(bulks...) + f.Release() - indexCache := &IndexCache{ + indexCache := &sealed.IndexCache{ MIDs: cache.NewCache[[]byte](nil, nil), RIDs: cache.NewCache[[]byte](nil, nil), Params: cache.NewCache[seqids.BlockParams](nil, nil), @@ -1502,21 +1528,19 @@ func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *Seal Registry: cache.NewCache[[]byte](nil, nil), } - sealed = NewSealed( - sealed.BaseFileName, + f = sealed.New( + f.BaseFileName, storage.NewReadLimiter(1, nil), indexCache, cache.NewCache[[]byte](nil, nil), nil, s.config) - s.fraction = sealed - return sealed + s.fraction = f + return f } -/* -RemoteFractionTestSuite runs tests for remote fraction. Fraction is first sealed, then uploaded -to fakes3 backend. -*/ +// RemoteFractionTestSuite runs tests for remote fraction. Fraction is first sealed, then uploaded +// to fakes3 backend. type RemoteFractionTestSuite struct { FractionTestSuite @@ -1542,8 +1566,8 @@ func (s *RemoteFractionTestSuite) SetupTest() { if s.fraction != nil { s.Require().Fail("can insert docs only once") } - sealed := s.newSealed(bulks...) 
- defer sealed.Suicide() + f := s.newSealed(bulks...) + defer f.Suicide() s3cli, err := s3.NewClient( s.s3server.URL, @@ -1555,11 +1579,11 @@ func (s *RemoteFractionTestSuite) SetupTest() { ) s.Require().NoError(err, "s3 client setup failed") - offloaded, err := sealed.Offload(context.Background(), s3.NewUploader(s3cli)) + offloaded, err := f.Offload(context.Background(), s3.NewUploader(s3cli)) s.Require().NoError(err, "offload failed") s.Require().True(offloaded, "didn't offload frac") - indexCache := &IndexCache{ + indexCache := &sealed.IndexCache{ MIDs: cache.NewCache[[]byte](nil, nil), RIDs: cache.NewCache[[]byte](nil, nil), Params: cache.NewCache[seqids.BlockParams](nil, nil), @@ -1569,13 +1593,13 @@ func (s *RemoteFractionTestSuite) SetupTest() { Registry: cache.NewCache[[]byte](nil, nil), } - remoteFrac := NewRemote( + remoteFrac := sealed.NewRemote( context.Background(), - sealed.BaseFileName, + f.BaseFileName, storage.NewReadLimiter(1, nil), indexCache, cache.NewCache[[]byte](nil, nil), - sealed.info, + f.Info(), s.config, s3cli) s.fraction = remoteFrac @@ -1583,7 +1607,7 @@ func (s *RemoteFractionTestSuite) SetupTest() { } func (s *RemoteFractionTestSuite) TearDownTest() { - if remote, ok := s.fraction.(*Remote); ok { + if remote, ok := s.fraction.(*sealed.Remote); ok { remote.Suicide() } else { s.Require().Nil(s.fraction, "fraction is not of Remote type") @@ -1616,3 +1640,56 @@ func TestSealedLoadedFractionTestSuite(t *testing.T) { func TestRemoteFractionTestSuite(t *testing.T) { suite.Run(t, new(RemoteFractionTestSuite)) } + +func TestActive2FractionTestSuite(t *testing.T) { + suite.Run(t, new(Active2FractionTestSuite)) +} + +type Active2FractionTestSuite struct { + FractionTestSuite +} + +func (s *Active2FractionTestSuite) SetupSuite() { + s.SetupSuiteCommon() +} + +func (s *Active2FractionTestSuite) SetupTest() { + s.SetupTestCommon() + + s.insertDocuments = func(bulks ...[]string) { + if s.fraction != nil { + s.Require().Fail("can insert docs only 
once") + } + s.fraction = s.newActive2(bulks...) + } +} + +func (s *Active2FractionTestSuite) newActive2(bulks ...[]string) *active2.Active2 { + baseName := filepath.Join(s.tmpDir, "test_fraction") + a := active2.New( + baseName, + s.config, + s.activeIndexer2, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + ) + + s.AppendBulks(a, bulks...) + + return a +} + +func (s *Active2FractionTestSuite) TearDownTest() { + if f, ok := s.fraction.(*active2.Active2); ok { + f.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Active type") + } + + s.TearDownTestCommon() +} + +func (s *Active2FractionTestSuite) TearDownSuite() { + s.TearDownSuiteCommon() +} diff --git a/fracmanager/cache_maintainer.go b/fracmanager/cache_maintainer.go index 5139b4ca..c5aeb3ba 100644 --- a/fracmanager/cache_maintainer.go +++ b/fracmanager/cache_maintainer.go @@ -7,7 +7,7 @@ import ( "go.uber.org/zap" "github.com/ozontech/seq-db/cache" - "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" @@ -141,8 +141,8 @@ func (cm *CacheMaintainer) CreateSortDocsCache() *cache.Cache[[]byte] { return newCache[[]byte](cm, sortName) } -func (cm *CacheMaintainer) CreateIndexCache() *frac.IndexCache { - return &frac.IndexCache{ +func (cm *CacheMaintainer) CreateIndexCache() *sealed.IndexCache { + return &sealed.IndexCache{ MIDs: newCache[[]byte](cm, midsName), RIDs: newCache[[]byte](cm, ridsName), Params: newCache[seqids.BlockParams](cm, paramsName), diff --git a/fracmanager/config.go b/fracmanager/config.go index de96c957..d6e22fc1 100644 --- a/fracmanager/config.go +++ b/fracmanager/config.go @@ -7,7 +7,6 @@ import ( "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/logger" 
"github.com/ozontech/seq-db/util" ) @@ -23,7 +22,7 @@ type Config struct { MaintenanceDelay time.Duration CacheCleanupDelay time.Duration CacheGCDelay time.Duration - SealParams common.SealParams + SealParams frac.SealParams SortCacheSize uint64 // size for docs cache for active fraction Fraction frac.Config MinSealFracSize uint64 diff --git a/fracmanager/frac_info_cache.go b/fracmanager/frac_info_cache.go index 7356b1bb..46b29296 100644 --- a/fracmanager/frac_info_cache.go +++ b/fracmanager/frac_info_cache.go @@ -10,7 +10,7 @@ import ( "go.uber.org/zap" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/logger" ) @@ -22,7 +22,7 @@ type fracInfoCache struct { fileName string mu sync.RWMutex - cache map[string]*common.Info + cache map[string]*frac.Info version uint64 // if we increment the counter every second it will take 31 billion years (quite enough) saveMu sync.Mutex @@ -31,7 +31,7 @@ type fracInfoCache struct { func NewFracInfoCache(filePath string) *fracInfoCache { fc := &fracInfoCache{ - cache: make(map[string]*common.Info), + cache: make(map[string]*frac.Info), mu: sync.RWMutex{}, fullPath: filePath, fileName: filepath.Base(filePath), @@ -74,7 +74,7 @@ func (fc *fracInfoCache) LoadFromDisk(fileName string) { } // Add adds a new entry to the in-memory [sealedFracCache]. -func (fc *fracInfoCache) Add(info *common.Info) { +func (fc *fracInfoCache) Add(info *frac.Info) { name := info.Name() fc.mu.Lock() @@ -96,7 +96,7 @@ func (fc *fracInfoCache) Remove(name string) { // Get returns fraction info and a flag that indicates // whether the data is present in the map. 
-func (fc *fracInfoCache) Get(name string) (*common.Info, bool) { +func (fc *fracInfoCache) Get(name string) (*frac.Info, bool) { fc.mu.RLock() defer fc.mu.RUnlock() diff --git a/fracmanager/frac_info_cache_test.go b/fracmanager/frac_info_cache_test.go index fbc64ae9..ee4b2052 100644 --- a/fracmanager/frac_info_cache_test.go +++ b/fracmanager/frac_info_cache_test.go @@ -9,8 +9,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" - testscommon "github.com/ozontech/seq-db/tests/common" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/tests/common" ) const dummyFracFixture = `{"a":{"name":"a","ver":"1.1","docs_total":1,"docs_on_disk":363,"docs_raw":450,"meta_on_disk":0,"index_on_disk":1284,"const_regular_block_size":16384,"const_ids_per_block":4096,"const_lid_block_cap":65536,"from":1666193255114,"to":1666193255114,"creation_time":1666193044479},"b":{"name":"b","ver":"1.2","docs_total":1,"docs_on_disk":363,"docs_raw":450,"meta_on_disk":0,"index_on_disk":1276,"const_regular_block_size":16384,"const_ids_per_block":4096,"const_lid_block_cap":65536,"from":1666193602304,"to":1666193602304,"creation_time":1666193598979}}` @@ -21,13 +21,13 @@ func loadFracCacheContent(dataDir string) ([]byte, error) { return content, err } -func loadFracCache(dataDir string) (map[string]*common.Info, error) { +func loadFracCache(dataDir string) (map[string]*frac.Info, error) { content, err := loadFracCacheContent(dataDir) if err != nil { return nil, err } - fracCache := make(map[string]*common.Info) + fracCache := make(map[string]*frac.Info) err = json.Unmarshal(content, &fracCache) if err != nil { return nil, err @@ -43,10 +43,10 @@ func writeToFracCache(dataDir, fname, data string) error { } func TestEmpty(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) + dataDir := common.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) + 
common.RecreateDir(dataDir) + defer common.RemoveDir(dataDir) f := NewFracInfoCache(filepath.Join(dataDir, consts.FracCacheFileSuffix)) err := f.SyncWithDisk() @@ -67,10 +67,10 @@ func TestEmpty(t *testing.T) { } func TestLoadFromDisk(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) + dataDir := common.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) + common.RecreateDir(dataDir) + defer common.RemoveDir(dataDir) err := writeToFracCache(dataDir, consts.FracCacheFileSuffix, dummyFracFixture) assert.NoError(t, err) @@ -97,9 +97,9 @@ func TestLoadFromDisk(t *testing.T) { } func TestRemoveFraction(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) + dataDir := common.GetTestTmpDir(t) + common.RecreateDir(dataDir) + defer common.RemoveDir(dataDir) err := writeToFracCache(dataDir, consts.FracCacheFileSuffix, dummyFracFixture) assert.NoError(t, err) @@ -117,7 +117,7 @@ func TestRemoveFraction(t *testing.T) { assert.NoError(t, err) assert.Equal(t, contents, []byte("{}")) - newInfo := &common.Info{ + newInfo := &frac.Info{ Path: "/data/c", Ver: "1.3", DocsTotal: 0, @@ -138,7 +138,7 @@ func TestRemoveFraction(t *testing.T) { m, err := loadFracCache(dataDir) assert.NoError(t, err) - expected := map[string]*common.Info{"c": newInfo} + expected := map[string]*frac.Info{"c": newInfo} assert.Equal(t, expected, m) f.Remove("c") @@ -151,10 +151,10 @@ func TestRemoveFraction(t *testing.T) { } func TestWriteToDisk(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) + dataDir := common.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) + common.RecreateDir(dataDir) + defer common.RemoveDir(dataDir) err := writeToFracCache(dataDir, consts.FracCacheFileSuffix, dummyFracFixture) assert.NoError(t, err) @@ -162,7 +162,7 @@ func TestWriteToDisk(t *testing.T) { f := NewFracInfoCache(filepath.Join(dataDir, 
consts.FracCacheFileSuffix)) f.LoadFromDisk(filepath.Join(dataDir, consts.FracCacheFileSuffix)) - newInfo := &common.Info{ + newInfo := &frac.Info{ Path: "/data/c", Ver: "1.3", DocsTotal: 0, @@ -221,15 +221,15 @@ func TestWriteToDisk(t *testing.T) { } func TestUnusedFractionsCleanup(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) + dataDir := common.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) + common.RecreateDir(dataDir) + defer common.RemoveDir(dataDir) err := writeToFracCache(dataDir, consts.FracCacheFileSuffix, dummyFracFixture) assert.NoError(t, err) - expected := map[string]*common.Info{} + expected := map[string]*frac.Info{} cacheFile := filepath.Join(dataDir, consts.FracCacheFileSuffix) diskFracCache := NewFracInfoCacheFromDisk(cacheFile) diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index 62c1faf0..ad5163f7 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -11,6 +11,7 @@ import ( "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" @@ -39,7 +40,7 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client) (*FracManager, func FillConfigWithDefault(cfg) readLimiter := storage.NewReadLimiter(config.ReaderWorkers, storeBytesRead) - idx, stopIdx := frac.NewActiveIndexer(config.IndexWorkers, config.IndexWorkers) + idx, stopIdx := active.NewIndexer(config.IndexWorkers, config.IndexWorkers) cache := NewCacheMaintainer(cfg.CacheSize, cfg.SortCacheSize, newDefaultCacheMetrics()) provider := newFractionProvider(cfg, s3cli, cache, readLimiter, idx) infoCache := NewFracInfoCache(filepath.Join(cfg.DataDir, consts.FracCacheFileSuffix)) @@ -189,7 +190,7 @@ func startMaintWorker(ctx context.Context, cfg *Config, fm *FracManager, wg *syn } // SealOnShutdown 
seals the active fraction on storage shutdown -func sealOnShutdown(active *frac.Active, provider *fractionProvider, minSealSize uint64) { +func sealOnShutdown(active *active.Active, provider *fractionProvider, minSealSize uint64) { fracSize := active.Info().FullSize() if minSealSize == 0 { diff --git a/fracmanager/fracmanager_test.go b/fracmanager/fracmanager_test.go index 451d76a8..13925cca 100644 --- a/fracmanager/fracmanager_test.go +++ b/fracmanager/fracmanager_test.go @@ -7,6 +7,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/seq" ) @@ -61,7 +63,7 @@ func TestSealingOnShutdown(t *testing.T) { assert.Equal(t, 1, len(fm.Fractions()), "should have one fraction") assert.Equal(t, activeName, fm.Fractions()[0].Info().Name(), "fraction should have the same name") - _, ok := fm.Fractions()[0].(*fractionProxy).impl.(*frac.Active) + _, ok := fm.Fractions()[0].(*fractionProxy).impl.(*active.Active) assert.True(t, ok, "fraction should be active") stop() @@ -70,11 +72,11 @@ func TestSealingOnShutdown(t *testing.T) { _, fm, stop = setupFracManager(t, cfg) assert.Equal(t, 2, len(fm.Fractions()), "should have 2 fraction: new active and old sealed") - _, ok = fm.Fractions()[0].(*fractionProxy).impl.(*frac.Sealed) + _, ok = fm.Fractions()[0].(*fractionProxy).impl.(*sealed.Sealed) assert.True(t, ok, "first fraction should be sealed") assert.Equal(t, activeName, fm.Fractions()[0].Info().Name(), "sealed fraction should have the same name") assert.Equal(t, uint32(0), fm.Fractions()[1].Info().DocsTotal, "active fraction should be empty") - _, ok = fm.Fractions()[1].(*fractionProxy).impl.(*frac.Active) + _, ok = fm.Fractions()[1].(*fractionProxy).impl.(*active.Active) assert.True(t, ok, "new fraction should be active") stop() diff --git a/fracmanager/fracs_stats.go b/fracmanager/fracs_stats.go index 
1c2a9fa3..1e9d18d8 100644 --- a/fracmanager/fracs_stats.go +++ b/fracmanager/fracs_stats.go @@ -4,7 +4,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/util" ) @@ -22,7 +22,7 @@ type fracsStats struct { // Add incorporates fraction information into the statistics // Updates all aggregate metrics with the values from the provided fraction info -func (s *fracsStats) Add(info *common.Info) { +func (s *fracsStats) Add(info *frac.Info) { s.count++ s.docsCount += uint64(info.DocsTotal) s.docsSizeRaw += info.DocsRaw @@ -33,7 +33,7 @@ func (s *fracsStats) Add(info *common.Info) { // Sub removes fraction information from the statistics // Decrements all aggregate metrics with the values from the provided fraction info -func (s *fracsStats) Sub(info *common.Info) { +func (s *fracsStats) Sub(info *frac.Info) { s.count-- s.docsCount -= uint64(info.DocsTotal) s.docsSizeRaw -= info.DocsRaw diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index e2915598..412a2d4b 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -10,7 +10,7 @@ import ( "github.com/oklog/ulid/v2" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/storage" @@ -25,14 +25,14 @@ type fractionProvider struct { s3cli *s3.Client // Client for S3 storage operations config *Config // Fraction manager configuration cacheProvider *CacheMaintainer // Cache provider for data access optimization - activeIndexer *frac.ActiveIndexer // Indexer for active fractions + activeIndexer *active.Indexer // Indexer for active fractions readLimiter *storage.ReadLimiter // Read rate limiter ulidEntropy io.Reader // 
Entropy source for ULID generation } func newFractionProvider( cfg *Config, s3cli *s3.Client, cp *CacheMaintainer, - readLimiter *storage.ReadLimiter, indexer *frac.ActiveIndexer, + readLimiter *storage.ReadLimiter, indexer *active.Indexer, ) *fractionProvider { return &fractionProvider{ s3cli: s3cli, @@ -44,8 +44,8 @@ func newFractionProvider( } } -func (fp *fractionProvider) NewActive(name string) *frac.Active { - return frac.NewActive( +func (fp *fractionProvider) NewActive(name string) *active.Active { + return active.New( name, fp.activeIndexer, fp.readLimiter, @@ -55,8 +55,8 @@ func (fp *fractionProvider) NewActive(name string) *frac.Active { ) } -func (fp *fractionProvider) NewSealed(name string, cachedInfo *common.Info) *frac.Sealed { - return frac.NewSealed( +func (fp *fractionProvider) NewSealed(name string, cachedInfo *frac.Info) *sealed.Sealed { + return sealed.New( name, fp.readLimiter, fp.cacheProvider.CreateIndexCache(), @@ -66,8 +66,8 @@ func (fp *fractionProvider) NewSealed(name string, cachedInfo *common.Info) *fra ) } -func (fp *fractionProvider) NewSealedPreloaded(name string, preloadedData *sealed.PreloadedData) *frac.Sealed { - return frac.NewSealedPreloaded( +func (fp *fractionProvider) NewSealedPreloaded(name string, preloadedData *sealed.PreloadedData) *sealed.Sealed { + return sealed.NewPreloaded( name, preloadedData, // Data already loaded into memory fp.readLimiter, @@ -77,8 +77,8 @@ func (fp *fractionProvider) NewSealedPreloaded(name string, preloadedData *seale ) } -func (fp *fractionProvider) NewRemote(ctx context.Context, name string, cachedInfo *common.Info) *frac.Remote { - return frac.NewRemote( +func (fp *fractionProvider) NewRemote(ctx context.Context, name string, cachedInfo *frac.Info) *sealed.Remote { + return sealed.NewRemote( ctx, name, fp.readLimiter, @@ -99,7 +99,7 @@ func (fp *fractionProvider) nextFractionID() string { // CreateActive creates a new active fraction with auto-generated filename // Filename pattern: 
base_pattern + ULID -func (fp *fractionProvider) CreateActive() *frac.Active { +func (fp *fractionProvider) CreateActive() *active.Active { filePath := fileBasePattern + fp.nextFractionID() baseFilePath := filepath.Join(fp.config.DataDir, filePath) return fp.NewActive(baseFilePath) @@ -107,8 +107,8 @@ func (fp *fractionProvider) CreateActive() *frac.Active { // Seal converts an active fraction to a sealed one // Process includes sorting, indexing, and data optimization for reading -func (fp *fractionProvider) Seal(active *frac.Active) (*frac.Sealed, error) { - src, err := frac.NewActiveSealingSource(active, fp.config.SealParams) +func (fp *fractionProvider) Seal(a *active.Active) (*sealed.Sealed, error) { + src, err := active.NewSealingSource(a, fp.config.SealParams) if err != nil { return nil, err } @@ -117,12 +117,12 @@ func (fp *fractionProvider) Seal(active *frac.Active) (*frac.Sealed, error) { return nil, err } - return fp.NewSealedPreloaded(active.BaseFileName, preloaded), nil + return fp.NewSealedPreloaded(a.BaseFileName, preloaded), nil } // Offload uploads fraction to S3 storage and returns a remote fraction // IMPORTANT: context controls timeouts and operation cancellation -func (fp *fractionProvider) Offload(ctx context.Context, f *frac.Sealed) (*frac.Remote, error) { +func (fp *fractionProvider) Offload(ctx context.Context, f *sealed.Sealed) (*sealed.Remote, error) { mustBeOffloaded, err := f.Offload(ctx, s3.NewUploader(fp.s3cli)) if err != nil { return nil, err diff --git a/fracmanager/fraction_provider_test.go b/fracmanager/fraction_provider_test.go index f315b615..893f7e84 100644 --- a/fracmanager/fraction_provider_test.go +++ b/fracmanager/fraction_provider_test.go @@ -13,7 +13,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" ) @@ -36,7 +36,7 @@ func 
setupFractionProvider(t testing.TB, cfg *Config) (*fractionProvider, func() cfg = setupDataDir(t, cfg) rl := storage.NewReadLimiter(1, nil) s3cli, stopS3 := setupS3Client(t) - idx, stopIdx := frac.NewActiveIndexer(1, 1) + idx, stopIdx := active.NewIndexer(1, 1) cache := NewCacheMaintainer(uint64(units.MB), uint64(units.MB), nil) provider := newFractionProvider(cfg, s3cli, cache, rl, idx) return provider, func() { diff --git a/fracmanager/fraction_registry.go b/fracmanager/fraction_registry.go index 9178dcf1..7d36ec0a 100644 --- a/fracmanager/fraction_registry.go +++ b/fracmanager/fraction_registry.go @@ -7,6 +7,8 @@ import ( "time" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/sealed" ) // fractionRegistry manages fraction queues at different lifecycle stages. @@ -35,7 +37,7 @@ type fractionRegistry struct { // NewFractionRegistry creates and initializes a new fraction registry instance. // Populates the registry with existing active, local and remote fractions. // Rebuilds the complete fractions list in chronological order. -func NewFractionRegistry(active *frac.Active, locals []*frac.Sealed, remotes []*frac.Remote) (*fractionRegistry, error) { +func NewFractionRegistry(active *active.Active, locals []*sealed.Sealed, remotes []*sealed.Remote) (*fractionRegistry, error) { if active == nil { return nil, errors.New("active fraction must be specified") } @@ -254,7 +256,7 @@ func (r *fractionRegistry) EvictRemote(retention time.Duration) []*remoteProxy { // PromoteToLocal moves fractions from sealing to local queue when sealing completes. // Maintains strict ordering - younger fractions wait for older ones to seal first. 
-func (r *fractionRegistry) PromoteToLocal(active *activeProxy, sealed *frac.Sealed) { +func (r *fractionRegistry) PromoteToLocal(active *activeProxy, sealed *sealed.Sealed) { r.mu.Lock() defer r.mu.Unlock() @@ -282,7 +284,7 @@ func (r *fractionRegistry) PromoteToLocal(active *activeProxy, sealed *frac.Seal // PromoteToRemote moves fractions from offloading to remote queue when offloading completes. // Special case: Handles fractions that don't require offloading (remote == nil). // Maintains strict ordering - younger fractions wait for older ones to offload. -func (r *fractionRegistry) PromoteToRemote(sealed *sealedProxy, remote *frac.Remote) { +func (r *fractionRegistry) PromoteToRemote(sealed *sealedProxy, remote *sealed.Remote) { r.mu.Lock() defer r.mu.Unlock() diff --git a/fracmanager/lifecycle_manager.go b/fracmanager/lifecycle_manager.go index 597a6cbf..345c5a83 100644 --- a/fracmanager/lifecycle_manager.go +++ b/fracmanager/lifecycle_manager.go @@ -8,7 +8,7 @@ import ( "go.uber.org/zap" - "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/util" ) @@ -142,7 +142,7 @@ func (lc *lifecycleManager) OffloadLocal(ctx context.Context, sizeLimit uint64, // TryOffload performs a single offload attempt and records metrics // Measures offloading duration and tracks success/failure statistics. 
-func (lc *lifecycleManager) TryOffload(ctx context.Context, sealed *frac.Sealed) (*frac.Remote, error) { +func (lc *lifecycleManager) TryOffload(ctx context.Context, sealed *sealed.Sealed) (*sealed.Remote, error) { now := time.Now() remote, err := lc.provider.Offload(ctx, sealed) offloadingDuration := time.Since(now).Seconds() diff --git a/fracmanager/loader.go b/fracmanager/loader.go index 6eb788ee..5229c040 100644 --- a/fracmanager/loader.go +++ b/fracmanager/loader.go @@ -9,7 +9,8 @@ import ( "go.uber.org/zap" "golang.org/x/sync/errgroup" - "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/logger" ) @@ -63,7 +64,7 @@ func (l *Loader) Load(ctx context.Context) (*fractionRegistry, error) { // replayAndSeal replays active fractions and seals old ones // Key method for ensuring data consistency during restart -func (l *Loader) replayAndSeal(ctx context.Context, actives []*frac.Active) (*frac.Active, []*frac.Sealed, error) { +func (l *Loader) replayAndSeal(ctx context.Context, actives []*active.Active) (*active.Active, []*sealed.Sealed, error) { if len(actives) == 0 { return nil, nil, nil } @@ -71,7 +72,7 @@ func (l *Loader) replayAndSeal(ctx context.Context, actives []*frac.Active) (*fr g, ctx := errgroup.WithContext(ctx) g.SetLimit(l.config.ReplayWorkers) - sealed := make([]*frac.Sealed, len(actives)-1) + sealed := make([]*sealed.Sealed, len(actives)-1) for i, a := range actives[:len(actives)-1] { g.Go(func() error { @@ -115,7 +116,7 @@ func (l *Loader) replayAndSeal(ctx context.Context, actives []*frac.Active) (*fr // discover discovers all fractions in filesystem // Returns fractions separated by type: active, local, remote -func (l *Loader) discover(ctx context.Context) ([]*frac.Active, []*frac.Sealed, []*frac.Remote, error) { +func (l *Loader) discover(ctx context.Context) ([]*active.Active, []*sealed.Sealed, []*sealed.Remote, error) { // Scan and analyze fraction 
files. Filter valid fractions manifests, err := analyzeFiles(l.scanFiles()) if err != nil { @@ -125,9 +126,9 @@ func (l *Loader) discover(ctx context.Context) ([]*frac.Active, []*frac.Sealed, total := len(manifests) logProgress := progressLogger(time.Millisecond * 500) - actives := make([]*frac.Active, 0) - locals := make([]*frac.Sealed, 0, total) - remotes := make([]*frac.Remote, 0, total) + actives := make([]*active.Active, 0) + locals := make([]*sealed.Sealed, 0, total) + remotes := make([]*sealed.Remote, 0, total) loadedInfoCache := NewFracInfoCacheFromDisk(l.infoCache.fullPath) @@ -153,7 +154,7 @@ func (l *Loader) discover(ctx context.Context) ([]*frac.Active, []*frac.Sealed, } // loadSealed loads a sealed fraction using cache -func (l *Loader) loadSealed(basePath string, loadedInfoCache *fracInfoCache) *frac.Sealed { +func (l *Loader) loadSealed(basePath string, loadedInfoCache *fracInfoCache) *sealed.Sealed { info, found := loadedInfoCache.Get(filepath.Base(basePath)) l.updateStats(found) @@ -163,7 +164,7 @@ func (l *Loader) loadSealed(basePath string, loadedInfoCache *fracInfoCache) *fr } // loadRemote loads a remote fraction -func (l *Loader) loadRemote(ctx context.Context, basePath string, loadedInfoCache *fracInfoCache) *frac.Remote { +func (l *Loader) loadRemote(ctx context.Context, basePath string, loadedInfoCache *fracInfoCache) *sealed.Remote { info, found := loadedInfoCache.Get(filepath.Base(basePath)) l.updateStats(found) diff --git a/fracmanager/loader_test.go b/fracmanager/loader_test.go index 57d054f6..472e2282 100644 --- a/fracmanager/loader_test.go +++ b/fracmanager/loader_test.go @@ -12,7 +12,8 @@ import ( "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/seq" ) @@ -25,7 +26,7 @@ func setupLoaderTest(t testing.TB, cfg *Config) 
(*fractionProvider, *Loader, fun return fp, loader, tearDown } -func appendDocsToActive(t testing.TB, active *frac.Active, docCount int) { +func appendDocsToActive(t testing.TB, active *active.Active, docCount int) { dp := indexer.NewTestDocProvider() for i := 1; i <= docCount; i++ { doc := []byte("{\"timestamp\": 0, \"message\": \"msg\"}") @@ -48,7 +49,7 @@ func TestReplayWithEmptyActive(t *testing.T) { defer tearDown() // fill data - actives := make([]*frac.Active, 0, fracCount) + actives := make([]*active.Active, 0, fracCount) for i := 0; i < fracCount; i++ { active := fp.CreateActive() appendDocsToActive(t, active, 500+rand.Intn(100)) @@ -78,8 +79,8 @@ func TestReplayWithMultipleEmpty(t *testing.T) { defer tearDown() // fill data - nonEmpty := make([]*common.Info, 0) - actives := make([]*frac.Active, 0, fracCount) + nonEmpty := make([]*frac.Info, 0) + actives := make([]*active.Active, 0, fracCount) for i := 0; i < fracCount; i++ { active := fp.CreateActive() if i%3 == 0 { @@ -111,7 +112,7 @@ func TestReplayMultiple(t *testing.T) { defer tearDown() // fill data - actives := make([]*frac.Active, 0, fracCount) + actives := make([]*active.Active, 0, fracCount) for i := 0; i < fracCount; i++ { active := fp.CreateActive() appendDocsToActive(t, active, 500+rand.Intn(100)) @@ -141,7 +142,7 @@ func TestReplaySingleEmpty(t *testing.T) { defer tearDown() // fill data: one empty fraction - actives := []*frac.Active{fp.CreateActive()} + actives := []*active.Active{fp.CreateActive()} // replay and seal active, sealed, err := loader.replayAndSeal(t.Context(), actives) @@ -161,7 +162,7 @@ func TestReplayContextCancel(t *testing.T) { defer tearDown() // fill data - actives := make([]*frac.Active, 0, fracCount) + actives := make([]*active.Active, 0, fracCount) for i := 0; i < fracCount; i++ { active := fp.CreateActive() appendDocsToActive(t, active, 500+rand.Intn(100)) @@ -186,7 +187,7 @@ func TestReplaySingleNonEmpty(t *testing.T) { defer tearDown() // fill data - actives := 
[]*frac.Active{fp.CreateActive()} + actives := []*active.Active{fp.CreateActive()} appendDocsToActive(t, actives[0], 500+rand.Intn(100)) // replay and seal @@ -206,7 +207,7 @@ func TestDiscover(t *testing.T) { defer tearDown() // make some sealed fracs - expectedSealed := map[string]*frac.Sealed{} + expectedSealed := map[string]*sealed.Sealed{} for range fracCount { a := fp.CreateActive() appendDocsToActive(t, a, 10+rand.Intn(10)) @@ -216,7 +217,7 @@ func TestDiscover(t *testing.T) { } // make half sealed fracs remote - expectedRemote := map[string]*frac.Remote{} + expectedRemote := map[string]*sealed.Remote{} for n, s := range expectedSealed { if rand.Intn(2) != 0 { continue diff --git a/fracmanager/proxy_frac.go b/fracmanager/proxy_frac.go index e30dc1a0..349c054a 100644 --- a/fracmanager/proxy_frac.go +++ b/fracmanager/proxy_frac.go @@ -10,8 +10,9 @@ import ( "go.uber.org/zap" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric" "github.com/ozontech/seq-db/seq" @@ -39,7 +40,7 @@ func (p *fractionProxy) Redirect(f frac.Fraction) { p.impl = f } -func (p *fractionProxy) Info() *common.Info { +func (p *fractionProxy) Info() *frac.Info { p.mu.RLock() defer p.mu.RUnlock() return p.impl.Info() @@ -74,8 +75,8 @@ func (p *fractionProxy) Search(ctx context.Context, params processor.SearchParam // Lifecycle: Created when fraction becomes active, destroyed after sealing. 
type activeProxy struct { proxy *fractionProxy // Thread-safe fraction access - instance *frac.Active // Actual active fraction instance - sealed *frac.Sealed // Sealed version (set after sealing) + instance *active.Active // Actual active fraction instance + sealed *sealed.Sealed // Sealed version (set after sealing) mu sync.RWMutex // Protects readonly state wg sync.WaitGroup // Tracks pending write operations @@ -83,7 +84,7 @@ type activeProxy struct { finalized bool // Whether fraction is frozen for writes } -func newActiveProxy(active *frac.Active) *activeProxy { +func newActiveProxy(active *active.Active) *activeProxy { return &activeProxy{ proxy: &fractionProxy{impl: active}, instance: active, @@ -132,14 +133,14 @@ func (p *activeProxy) Finalize() error { // Tracks both local sealed instance and remote version if offloaded. type sealedProxy struct { proxy *fractionProxy // Thread-safe fraction access - instance *frac.Sealed // Local sealed fraction - remote *frac.Remote // Remote version (if offloaded) + instance *sealed.Sealed // Local sealed fraction + remote *sealed.Remote // Remote version (if offloaded) } // remoteProxy represents an offloaded fraction type remoteProxy struct { proxy *fractionProxy // Thread-safe fraction access - instance *frac.Remote // Remote fraction instance + instance *sealed.Remote // Remote fraction instance } // emptyFraction represents a missing or deleted fraction @@ -148,8 +149,8 @@ type remoteProxy struct { type emptyFraction struct { } -func (emptyFraction) Info() *common.Info { - return &common.Info{ +func (emptyFraction) Info() *frac.Info { + return &frac.Info{ Path: "empty", From: math.MaxUint64, To: 0, diff --git a/fracmanager/sealer_test.go b/fracmanager/sealer_test.go index f85c3f8f..57db7884 100644 --- a/fracmanager/sealer_test.go +++ b/fracmanager/sealer_test.go @@ -17,7 +17,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" + 
"github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" @@ -35,7 +35,7 @@ func TestMain(m *testing.M) { m.Run() } -func fillActiveFraction(active *frac.Active) error { +func fillActiveFraction(active *active.Active) error { const muliplier = 10 file, err := os.Open(filepath.Join(testscommon.TestDataDir, "k8s.logs")) @@ -77,9 +77,9 @@ func fillActiveFraction(active *frac.Active) error { return nil } -func defaultSealingParams() common.SealParams { +func defaultSealingParams() frac.SealParams { const minZstdLevel = 1 - return common.SealParams{ + return frac.SealParams{ IDsZstdLevel: minZstdLevel, LIDsZstdLevel: minZstdLevel, TokenListZstdLevel: minZstdLevel, @@ -102,19 +102,19 @@ func runSealingBench(b *testing.B, cfg *frac.Config) { fp, tearDown := setupFractionProvider(b, &Config{Fraction: *cfg}) defer tearDown() - active := fp.CreateActive() - err := fillActiveFraction(active) + a := fp.CreateActive() + err := fillActiveFraction(a) assert.NoError(b, err) - seal := func(active *frac.Active, params common.SealParams) (*sealed.PreloadedData, error) { - src, err := frac.NewActiveSealingSource(active, params) + seal := func(a *active.Active, params frac.SealParams) (*sealed.PreloadedData, error) { + src, err := active.NewSealingSource(a, params) assert.NoError(b, err) return sealing.Seal(src, params) } params := defaultSealingParams() // The first sealing will sort all the LIDs, so we take this load out of the measurement range - _, err = seal(active, params) + _, err = seal(a, params) assert.NoError(b, err) b.ReportAllocs() @@ -134,7 +134,7 @@ func runSealingBench(b *testing.B, cfg *frac.Config) { } for b.Loop() { - _, err = seal(active, params) + _, err = seal(a, params) assert.NoError(b, err) } } diff --git a/fracmanager/searcher_test.go b/fracmanager/searcher_test.go index 016ec348..c9261a7e 100644 --- a/fracmanager/searcher_test.go +++ 
b/fracmanager/searcher_test.go @@ -10,14 +10,13 @@ import ( "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/parser" "github.com/ozontech/seq-db/seq" ) type testFakeFrac struct { - info *common.Info + info *frac.Info qpr *seq.QPR searchesCount int fetchCount int @@ -25,7 +24,7 @@ type testFakeFrac struct { fetchError error } -func (f *testFakeFrac) Info() *common.Info { +func (f *testFakeFrac) Info() *frac.Info { return f.info } @@ -64,7 +63,7 @@ func (f *testFakeFrac) Search(context.Context, processor.SearchParams) (*seq.QPR func newFakeFrac(from, to seq.MID, qpr *seq.QPR) *testFakeFrac { return &testFakeFrac{ - info: &common.Info{From: from, To: to, DocsTotal: 1}, + info: &frac.Info{From: from, To: to, DocsTotal: 1}, qpr: qpr, documents: make(map[seq.ID][]byte), } @@ -72,14 +71,14 @@ func newFakeFrac(from, to seq.MID, qpr *seq.QPR) *testFakeFrac { func newFakeFracWithDocs(from, to seq.MID, documents map[seq.ID][]byte) *testFakeFrac { return &testFakeFrac{ - info: &common.Info{From: from, To: to, DocsTotal: uint32(len(documents))}, + info: &frac.Info{From: from, To: to, DocsTotal: uint32(len(documents))}, documents: documents, } } func newFakeFracWithFetchError(from, to seq.MID, fetchError error) *testFakeFrac { return &testFakeFrac{ - info: &common.Info{From: from, To: to, DocsTotal: 1}, + info: &frac.Info{From: from, To: to, DocsTotal: 1}, documents: make(map[seq.ID][]byte), fetchError: fetchError, } diff --git a/indexer/meta_data.go b/indexer/meta_data.go index 241f219f..5d33f7b5 100644 --- a/indexer/meta_data.go +++ b/indexer/meta_data.go @@ -10,10 +10,11 @@ import ( ) type MetaData struct { - ID seq.ID - // Size of an uncompressed document in bytes. - Size uint32 - Tokens []tokenizer.MetaToken + ID seq.ID + Size uint32 // Size of an uncompressed document in bytes. 
+ Tokens []tokenizer.MetaToken + tokensCount uint32 + tokensBin []byte } // String used in tests for human-readable output. @@ -72,6 +73,23 @@ func (m *MetaData) UnmarshalBinary(b []byte) error { } } +func (m *MetaData) UnmarshalBinaryLazy(b []byte) error { + if !IsItBinaryEncodedMetaData(b) { + return fmt.Errorf("invalid metadata magic bytes") + } + b = b[2:] + + version := binary.LittleEndian.Uint16(b) + b = b[2:] + + switch version { + case 1: + return m.unmarshalVersion1Lazy(b) + default: + return fmt.Errorf("unimplemented metadata version: %d", version) + } +} + func (m *MetaData) unmarshalVersion1(b []byte) error { // Decode seq.ID. m.ID.MID = seq.MID(binary.LittleEndian.Uint64(b)) @@ -101,3 +119,42 @@ func (m *MetaData) unmarshalVersion1(b []byte) error { } return nil } + +func (m *MetaData) unmarshalVersion1Lazy(b []byte) error { + // Decode seq.ID. + m.ID.MID = seq.MID(binary.LittleEndian.Uint64(b)) + b = b[8:] + m.ID.RID = seq.RID(binary.LittleEndian.Uint64(b)) + b = b[8:] + + // Decode uncompressed document size. + m.Size = binary.LittleEndian.Uint32(b) + b = b[4:] + + m.tokensCount = binary.LittleEndian.Uint32(b) + b = b[4:] + + m.tokensBin = b + + return nil +} + +func (m *MetaData) DecodeTokens(tokens []tokenizer.MetaToken) ([]tokenizer.MetaToken, error) { + b := m.tokensBin + + // Decode tokens. 
+ tokens = tokens[:0] + tokens = slices.Grow(tokens, int(m.tokensCount))[:m.tokensCount] + + for i := range tokens { + var err error + if b, err = tokens[i].UnmarshalBinary(b); err != nil { + return nil, err + } + } + return tokens, nil +} + +func (m *MetaData) TokensCount() uint32 { + return m.tokensCount +} diff --git a/seq/seq.go b/seq/seq.go index 8c56cab7..9a378196 100644 --- a/seq/seq.go +++ b/seq/seq.go @@ -1,6 +1,7 @@ package seq import ( + "cmp" "encoding/binary" "encoding/hex" "fmt" @@ -59,6 +60,13 @@ func LessOrEqual(a, b ID) bool { return a.MID < b.MID } +func Compare(a, b ID) int { + return cmp.Or( + cmp.Compare(a.MID, b.MID), + cmp.Compare(a.RID, b.RID), + ) +} + func Less(a, b ID) bool { if a.MID == b.MID { return a.RID < b.RID diff --git a/tests/setup/env.go b/tests/setup/env.go index d7af955a..5f3172ef 100644 --- a/tests/setup/env.go +++ b/tests/setup/env.go @@ -22,7 +22,7 @@ import ( "github.com/ozontech/seq-db/buildinfo" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/fracmanager" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/mappingprovider" @@ -34,7 +34,7 @@ import ( "github.com/ozontech/seq-db/seq" seqs3 "github.com/ozontech/seq-db/storage/s3" "github.com/ozontech/seq-db/storeapi" - testscommon "github.com/ozontech/seq-db/tests/common" + "github.com/ozontech/seq-db/tests/common" ) type TestingEnvConfig struct { @@ -91,7 +91,7 @@ func (cfg *TestingEnvConfig) GetFracManagerConfig(replicaID string) fracmanager. 
c = fracmanager.FillConfigWithDefault(&fracmanager.Config{ FracSize: 256 * uint64(units.MiB), TotalSize: 1 * uint64(units.GiB), - SealParams: common.SealParams{ + SealParams: frac.SealParams{ IDsZstdLevel: fastestZstdLevel, LIDsZstdLevel: fastestZstdLevel, TokenListZstdLevel: fastestZstdLevel, @@ -264,7 +264,7 @@ func (cfg *TestingEnvConfig) MakeStores( for i := range confs { k := i / replicas - testscommon.CreateDir(confs[i].FracManager.DataDir) + common.CreateDir(confs[i].FracManager.DataDir) mappingProvider, err := mappingprovider.New( "", @@ -429,7 +429,7 @@ func (t *TestingEnv) IngestorFetchAddr() string { } func randomListener() (lis net.Listener) { - lis, err := net.Listen("tcp", fmt.Sprintf("%s:0", testscommon.Localhost)) + lis, err := net.Listen("tcp", fmt.Sprintf("%s:0", common.Localhost)) if err != nil { panic(err) } diff --git a/util/fs.go b/util/fs.go index 57fd7b17..1e1c94cc 100644 --- a/util/fs.go +++ b/util/fs.go @@ -6,6 +6,7 @@ package util import ( "errors" "os" + "path/filepath" "go.uber.org/zap" @@ -53,3 +54,21 @@ func RemoveFile(file string) { logger.Error("file removing error", zap.Error(err)) } } + +func MustOpenFile(name string, skipFsync bool) (*os.File, os.FileInfo) { + file, err := os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0o664) + if err != nil { + logger.Fatal("can't create docs file", zap.String("file", name), zap.Error(err)) + } + + if !skipFsync { + parentDirPath := filepath.Dir(name) + MustSyncPath(parentDirPath) + } + + stat, err := file.Stat() + if err != nil { + logger.Fatal("can't stat docs file", zap.String("file", name), zap.Error(err)) + } + return file, stat +} From 113b9c3b48771f0d8ea282fc241b2030b546fa48 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Sun, 7 Dec 2025 20:15:16 +0300 Subject: [PATCH 04/28] reuse slices --- frac/active2/data_provider.go | 4 +- frac/active2/indexer.go | 352 +++++++++++++++++------------- frac/active2/indexer_allocator.go | 131 ----------- frac/active2/mem_index.go | 26 ++- 
frac/active2/merge.go | 4 +- frac/active2/resources.go | 125 +++++++++++ frac/active2/sealing_source.go | 13 +- resources/call_stack.go | 16 ++ resources/global_pools.go | 7 + resources/object_allocator.go | 47 ++++ resources/sized_pool.go | 71 ++++++ resources/slice_allocator.go | 31 +++ resources/slice_on_bytes.go | 38 ++++ 13 files changed, 566 insertions(+), 299 deletions(-) delete mode 100644 frac/active2/indexer_allocator.go create mode 100644 frac/active2/resources.go create mode 100644 resources/call_stack.go create mode 100644 resources/global_pools.go create mode 100644 resources/object_allocator.go create mode 100644 resources/sized_pool.go create mode 100644 resources/slice_allocator.go create mode 100644 resources/slice_on_bytes.go diff --git a/frac/active2/data_provider.go b/frac/active2/data_provider.go index 78771216..13197e70 100644 --- a/frac/active2/data_provider.go +++ b/frac/active2/data_provider.go @@ -27,8 +27,8 @@ func (si *fetchIndex) GetBlocksOffsets(blockIndex uint32) uint64 { func (si *fetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos { docsPos := make([]seq.DocPos, len(ids)) for i, id := range ids { - if lid, ok := si.index.idToLID[id]; ok { - docsPos[i] = si.index.positions[lid] + if lid, ok := si.index.GetLIDByID(id); ok { + docsPos[i] = si.index.positions[lid-1] continue } docsPos[i] = seq.DocPosNotFound diff --git a/frac/active2/indexer.go b/frac/active2/indexer.go index bc878b18..c7386223 100644 --- a/frac/active2/indexer.go +++ b/frac/active2/indexer.go @@ -14,290 +14,330 @@ import ( "github.com/ozontech/seq-db/util" ) -const uint32Len = int(unsafe.Sizeof(uint32(0))) +const uint32Size = uint32(unsafe.Sizeof(uint32(0))) +// Indexer indexes documents with concurrency limitation type Indexer struct { sem chan struct{} } -func NewIndexer(workersCount int) *Indexer { +// NewIndexer creates a new indexer with specified number of workers +func NewIndexer(workerCount int) *Indexer { return &Indexer{ - sem: make(chan struct{}, 
workersCount), + sem: make(chan struct{}, workerCount), } } -func (s *Indexer) Index(meta storage.DocBlock, applyFn func(index *memIndex, err error)) { - s.sem <- struct{}{} +// Index starts asynchronous document indexing +func (idxr *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, err error)) { + idxr.sem <- struct{}{} go func() { - applyFn(NewMemIndex(meta)) - <-s.sem + apply(NewMemIndex(block)) + <-idxr.sem }() } -func NewMemIndex(metaBlock storage.DocBlock) (*memIndex, error) { +// NewMemIndex creates an in-memory index from a document block +func NewMemIndex(block storage.DocBlock) (*memIndex, error) { sw := stopwatch.New() - res := newIndexerResources() - defer res.releaseAll() + res, cleanup := NewResources() + defer cleanup() - payload, err := decompressionMeta(metaBlock, res, sw) + // Decompress metadata + payload, err := decompressMeta(res, block, sw) if err != nil { return nil, err } - meta, err := decodeMeta(payload, res, sw) + buf := res.Buffer() + + // Decode metadata + meta, err := decodeMetadata(res, buf, payload, sw) if err != nil { return nil, err } - + // Initialize index idx := &memIndex{ idToLID: make(map[seq.ID]uint32, len(meta)), docsCount: uint32(len(meta)), - fieldsTokens: make(map[string]tokensRange), - blocksOffsets: []uint64{metaBlock.GetExt2()}, // only one block per bulk + blocksOffsets: []uint64{block.GetExt2()}, // Only one block per bulk } - tids, lids, tokens, err := extractTokensFromMetadata(meta, idx, res) + // Extract tokens from metadata + tids, lids, tokens, err := extractTokens(res, buf, meta, idx) if err != nil { return nil, err } - tokenLIDs := groupLIDsByToken(res, tids, lids, len(tokens)) + // Group documents by token + tokenDocGroups := groupLIDsByTID(res, tids, lids, len(tokens)) - organizeTokensAndFields(idx, tokens, tokenLIDs) + // Organize tokens and fields + organizeTokens(res, buf, idx, tokens, tokenDocGroups) + // Set special "all" token idx.allTID = uint32(idx.fieldsTokens[seq.TokenAll].start) 
return idx, nil } -type tokenKey struct { - v, k string +// token represents a unique token as a (field, value) pair. +// Used as a map key during token deduplication. +type token struct { + value string + field string } -func convertMetaToken(t tokenizer.MetaToken) tokenKey { - return tokenKey{ - k: util.ByteToStringUnsafe(t.Key), - v: util.ByteToStringUnsafe(t.Value), +func toToken(t tokenizer.MetaToken) token { + return token{ + value: util.ByteToStringUnsafe(t.Value), + field: util.ByteToStringUnsafe(t.Key), } } -func extractTokensFromMetadata( +// extractTokens extracts tokens from document metadata +func extractTokens( + res *Resources, + buf *indexBuffer, meta []indexer.MetaData, idx *memIndex, - res *indexerResources, -) ([]uint32, []uint32, []tokenKey, error) { - var lidsSize uint32 +) ([]uint32, []uint32, []token, error) { + var totalTokens uint32 var docOffset uint64 - localRes := newIndexerResources() - defer localRes.releaseAll() - - // scan in orig order to calc offsets and size - positions := localRes.newDocPos(len(meta)) + // Calculate document positions in the original block + // Each document is stored as: [size: uint32][data: size bytes] + positions := res.Uint64s().AllocSlice(len(meta)) prev := seq.PackDocPos(0, docOffset) - positions = positions[:len(meta)] // inBoubds - for i, docMeta := range meta { + + for i := range meta { + docMeta := meta[i] if docMeta.Size > 0 { + // Start new document group prev = seq.PackDocPos(0, docOffset) - docOffset += uint64(docMeta.Size) + uint64(uint32Len) + docOffset += uint64(docMeta.Size) + uint64(uint32Size) } - positions[i] = prev - lidsSize += docMeta.TokensCount() + positions[i] = uint64(prev) + totalTokens += docMeta.TokensCount() } - lids := res.newUint32s(int(lidsSize)) - tids := res.newUint32s(int(lidsSize)) - - order := localRes.newUint32s(len(meta)) + // Create ordering by document ID (descending) + // We need to map global document IDs to local IDs (LIDs) + order := 
res.Uint32s().AllocSlice(len(meta)) for i := range order { order[i] = uint32(i) } - slices.SortFunc(order, func(a, b uint32) int { return seq.Compare(meta[b].ID, meta[a].ID) }) + slices.SortFunc(order, func(a, b uint32) int { + return seq.Compare(meta[b].ID, meta[a].ID) + }) + // Fill index structures with sorted documents ids := make([]seq.ID, len(order)) pos := make([]seq.DocPos, len(order)) - for lid, i := range order { - docMeta := meta[i] + for lid, origIdx := range order { + docMeta := meta[origIdx] ids[lid] = docMeta.ID idx.docsSize += uint64(docMeta.Size) - idx.idToLID[docMeta.ID] = uint32(lid) - pos[lid] = positions[i] + idx.idToLID[docMeta.ID] = uint32(lid) + 1 // store lid+1 (1-based indexing for internal use) + pos[lid] = seq.DocPos(positions[origIdx]) } idx.ids = ids idx.positions = pos + // Extract and process tokens from all documents var err error - var mt tokenKey + var token token - tids = tids[:0] - lids = lids[:0] + // Allocate slices for token-document relationships + lids := res.Uint32s().AllocSlice(int(totalTokens))[:0] // Local document ID for each token occurrence + tids := res.Uint32s().AllocSlice(int(totalTokens))[:0] // Token ID for each occurrence - tokenToTID := localRes.newMetaTokenMap(1000) - tokens, release := localRes.newTokenizerMetaTokens(1000) + // Map tokenKey -> tokenID (global token identifier) + tokenMap := res.TokenMap().Alloc(1000) - for lid, i := range order { - docMeta := meta[i] - if tokens, err = docMeta.DecodeTokens(tokens[:0]); err != nil { + // Process documents in ID-sorted order + for lid, origIdx := range order { + docMeta := meta[origIdx] + + // Decode tokens for this document + if buf.tokens, err = docMeta.DecodeTokens(buf.tokens[:0]); err != nil { return nil, nil, nil, err } - for _, t := range tokens { - mt = convertMetaToken(t) - tid, ok := tokenToTID[mt] - if !ok { - tid = uint32(len(tokenToTID)) - tokenToTID[mt] = tid + + // Process each token in the document + for _, t := range buf.tokens { + token = 
toToken(t) + tid, exists := tokenMap[token] + if !exists { + tid = uint32(len(tokenMap)) // assign new token ID + tokenMap[token] = tid } tids = append(tids, tid) - lids = append(lids, uint32(lid+1)) + lids = append(lids, uint32(lid)+1) // store lid+1 (1-based indexing for internal use) } } - release(tokens) - - tidToToken := res.newMetaTokens(len(tokenToTID)) - for mt, tid := range tokenToTID { - tidToToken[tid] = mt + // Create reverse mapping: tokenID -> tokenKey + tokens := res.Tokens().AllocSlice(len(tokenMap)) + for key, tokenID := range tokenMap { + tokens[tokenID] = key } - return tids, lids, tidToToken, nil + return tids, lids, tokens, nil } -func groupLIDsByToken(res *indexerResources, tids, lids []uint32, tokensCnt int) [][]uint32 { - // считаем размеры токенлидсов - localRes := newIndexerResources() - defer localRes.releaseAll() - - lens := localRes.newUint32s(tokensCnt) - clear(lens) +// groupLIDsByTID groups document IDs by token +// Input: flat arrays of (tid, lid) pairs +// Output: 2D array where tokenLIDs[tid] = []lid +func groupLIDsByTID(res *Resources, tids, lids []uint32, tokenCount int) [][]uint32 { + // Phase 1: Count documents per token + counts := res.Uint32s().AllocSlice(tokenCount) + clear(counts) for _, tid := range tids { - lens[tid]++ + counts[tid]++ } - // нарезаем токенлидсы - tokenLIDs := res.newUint32Slices(tokensCnt) - lidsBuffer := make([]uint32, len(lids)) - for tid, cnt := range lens { - tokenLIDs[tid] = lidsBuffer[:cnt][:0] - lidsBuffer = lidsBuffer[cnt:] + // Phase 2: Allocate slices for each token group + // We use a single large buffer and slice it for efficiency + tokenLIDs := res.Uint32Slices().AllocSlice(tokenCount) + buffer := make([]uint32, len(lids)) + + tokenLIDs = tokenLIDs[:len(counts)] + for tid, count := range counts { + tokenLIDs[tid] = buffer[:count][:0] + buffer = buffer[count:] } - // заполняем токенлидсы - lids = lids[:len(tids)] // isInBounds + // Phase 3: Populate groups with document IDs + // We reuse 
docIDs slice bounds for safety + lids = lids[:len(tids)] for i, tid := range tids { tokenLIDs[tid] = append(tokenLIDs[tid], lids[i]) } + return tokenLIDs } -func organizeTokensAndFields(idx *memIndex, tokens []tokenKey, tokenLIDs [][]uint32) { - localRes := newIndexerResources() - defer localRes.releaseAll() - - order := localRes.newUint32s(len(tokens)) - for i := range order { +// organizeTokens organizes tokens and fields in the index with proper sorting +func organizeTokens(res *Resources, buf *indexBuffer, idx *memIndex, tokens []token, tokenLIDs [][]uint32) { + tokenSize := 0 + order := res.Uint32s().AllocSlice(len(tokens)) + order = order[:len(tokens)] + for i, t := range tokens { order[i] = uint32(i) + tokenSize += len(t.value) } + // Create ordering for sorting tokens + // We'll sort by (field, value) to group tokens by field slices.SortFunc(order, func(a, b uint32) int { - aToken, bToken := tokens[a], tokens[b] + tokenA, tokenB := tokens[a], tokens[b] return cmp.Or( - cmp.Compare(aToken.k, bToken.k), - cmp.Compare(aToken.v, bToken.v), + cmp.Compare(tokenA.field, tokenB.field), + cmp.Compare(tokenA.value, tokenB.value), ) }) - tokensSize := 0 - for _, t := range tokens { - tokensSize += len(t.v) - } - + fieldSize := 0 prevField := "" - fieldsSize := 0 - fields := localRes.newStrings(100)[:0] - fieldsTIDs := localRes.newUint32s(100)[:0] - - bufferTokens := make([]byte, 0, tokensSize) - - orderedTokens := make([][]byte, len(order)) - orderedTokenLIDs := make([][]uint32, len(order)) - - for tid, i := range order { - mt := tokens[i] - if mt.k != prevField || prevField == "" { - // collect uniq fields values - fieldsSize += len(mt.k) - fields = append(fields, mt.k) - fieldsTIDs = append(fieldsTIDs, uint32(tid)) + + // Prepare buffers for sorted data + tokenBuffer := make([]byte, 0, tokenSize) + sortedTokens := make([][]byte, len(order)) + sortedTokenLIDs := make([][]uint32, len(order)) + + // Process tokens in sorted order + for tid, origIdx := range order { + 
token := tokens[origIdx] + + // Detect field boundaries + // When field name changes, record the field and its first token position + if token.field != prevField || prevField == "" { + fieldSize += len(token.field) + buf.fields = append(buf.fields, token.field) + buf.fieldTIDs = append(buf.fieldTIDs, uint32(tid)) } - prevField = mt.k + prevField = token.field - // copy tokens - p := len(bufferTokens) - bufferTokens = append(bufferTokens, mt.v...) + // Copy token value to buffer and keep reference + start := len(tokenBuffer) + tokenBuffer = append(tokenBuffer, token.value...) - // fill tokens ordered - orderedTokens[tid] = bufferTokens[p:] - orderedTokenLIDs[tid] = tokenLIDs[i] + // Store in sorted arrays + // Note: We use original tokenID as index to preserve tokenID->data mapping + sortedTokens[tid] = tokenBuffer[start:] + sortedTokenLIDs[tid] = tokenLIDs[origIdx] } - - idx.tokens = orderedTokens - idx.tokenLIDs = orderedTokenLIDs - - fieldsTIDs = append(fieldsTIDs, uint32(len(tokens))) - - bufferFields := make([]byte, 0, fieldsSize) - idx.fields = make([][]byte, len(fields)) - for i, field := range fields { - // copy fields - p := len(bufferFields) - bufferFields = append(bufferFields, field...) - idx.fields[i] = bufferFields[p:] - - // fill field range - tid1 := fieldsTIDs[i] - tid2 := fieldsTIDs[i+1] - idx.fieldsTokens[util.ByteToStringUnsafe(bufferFields[p:])] = tokensRange{ - start: tid1, - count: tid2 - tid1, + // Add sentinel value for easier range calculation + buf.fieldTIDs = append(buf.fieldTIDs, uint32(len(tokens))) + + // Store in index + idx.tokens = sortedTokens + idx.tokenLIDs = sortedTokenLIDs + + // Organize fields + fieldBuffer := make([]byte, 0, fieldSize) + idx.fields = make([][]byte, len(buf.fields)) + idx.fieldsTokens = make(map[string]tokenRange, len(buf.fields)) + + for i, field := range buf.fields { + // Copy field name to buffer + start := len(fieldBuffer) + fieldBuffer = append(fieldBuffer, field...) 
+ idx.fields[i] = fieldBuffer[start:] + + // Calculate token range for this field + // Each field has continuous range of token IDs in sorted order + startTID := buf.fieldTIDs[i] + endTID := buf.fieldTIDs[i+1] + idx.fieldsTokens[util.ByteToStringUnsafe(fieldBuffer[start:])] = tokenRange{ + start: startTID, + count: endTID - startTID, } } } -func decompressionMeta(meta storage.DocBlock, ia *indexerResources, sw *stopwatch.Stopwatch) ([]byte, error) { +// decompressMeta decompresses metadata from block +func decompressMeta(res *Resources, block storage.DocBlock, sw *stopwatch.Stopwatch) ([]byte, error) { m := sw.Start("decompress_meta") defer m.Stop() - payload, err := meta.DecompressTo(ia.newBytes(int(meta.RawLen()))) + // Allocate exact size needed for compressed data + buffer := res.Bytes().AllocSlice(int(block.RawLen())) + payload, err := block.DecompressTo(buffer) if err != nil { return nil, err } return payload, nil } -func decodeMeta(payload []byte, ia *indexerResources, sw *stopwatch.Stopwatch) ([]indexer.MetaData, error) { +// decodeMetadata decodes document metadata from binary format +// Format: [size: uint32][data: size bytes][size: uint32][data: size bytes]... 
+func decodeMetadata(res *Resources, buf *indexBuffer, payload []byte, sw *stopwatch.Stopwatch) ([]indexer.MetaData, error) { m := sw.Start("decode_meta") defer m.Stop() - // scan to get length - offset := 0 - offsets := ia.newInts(1000)[:0] - for offset < len(payload) { + // First pass: scan to determine sizes of each metadata entry + var offset uint32 + for offset < uint32(len(payload)) { size := binary.LittleEndian.Uint32(payload[offset:]) - offset += uint32Len + int(size) - offsets = append(offsets, int(size)) + offset += uint32Size + size + buf.sizes = append(buf.sizes, size) } - // decode - meta := ia.newMetaData(len(offsets)) - for i, size := range offsets { - bin := payload[uint32Len : size+uint32Len] - if err := meta[i].UnmarshalBinaryLazy(bin); err != nil { + // Second pass: decode each metadata entry + meta := res.Metadata().AllocSlice(len(buf.sizes)) + for i, size := range buf.sizes { + // Skip size field to get to actual data + data := payload[uint32Size : size+uint32(uint32Size)] + if err := meta[i].UnmarshalBinaryLazy(data); err != nil { return nil, err } - payload = payload[size+uint32Len:] + // Move to next entry + payload = payload[size+uint32(uint32Size):] } return meta, nil diff --git a/frac/active2/indexer_allocator.go b/frac/active2/indexer_allocator.go deleted file mode 100644 index 2864ed7e..00000000 --- a/frac/active2/indexer_allocator.go +++ /dev/null @@ -1,131 +0,0 @@ -package active2 - -import ( - "slices" - "sync" - "unsafe" - - "github.com/ozontech/seq-db/bytespool" - "github.com/ozontech/seq-db/indexer" - "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/tokenizer" -) - -type indexerResources struct { - releasers []func() -} - -var ( - poolAllocator = sync.Pool{} - poolMetaData = sync.Pool{} - poolMetaToken = sync.Pool{} - poolUint32Slices = sync.Pool{} - poolTokenizerMetaToken = sync.Pool{} - poolMetaTokenMap = sync.Pool{} - poolStrings = sync.Pool{} -) - -func newIndexerResources() *indexerResources { - ai, ok := 
poolAllocator.Get().(*indexerResources) - if ok { - ai.releasers = ai.releasers[:0] - } else { - ai = &indexerResources{releasers: make([]func(), 0, 64)} - } - return ai -} - -func (r *indexerResources) newUint32s(size int) []uint32 { - buf, free := acquireSlice[uint32](size) - r.releasers = append(r.releasers, free) - return buf -} - -func (r *indexerResources) newInts(size int) []int { - buf, free := acquireSlice[int](size) - r.releasers = append(r.releasers, free) - return buf -} - -func (r *indexerResources) newBytes(size int) []byte { - buf := bytespool.AcquireLen(size) - r.releasers = append(r.releasers, func() { bytespool.Release(buf) }) - return buf.B -} - -func (r *indexerResources) newDocPos(size int) []seq.DocPos { - buf, free := acquireSlice[seq.DocPos](size) - r.releasers = append(r.releasers, free) - return buf -} - -func (r *indexerResources) newMetaTokenMap(size int) map[tokenKey]uint32 { - buf, ok := poolMetaTokenMap.Get().(map[tokenKey]uint32) - if !ok { - buf = make(map[tokenKey]uint32, size) - } else { - clear(buf) - } - r.releasers = append(r.releasers, func() { poolMetaTokenMap.Put(buf) }) - return buf -} - -func (r *indexerResources) newUint32Slices(size int) [][]uint32 { - bufPtr, free := acquireFromPoolPtr[[]uint32](&poolUint32Slices, size) - r.releasers = append(r.releasers, free) - return *bufPtr -} - -func (r *indexerResources) newMetaTokens(size int) []tokenKey { - bufPtr, free := acquireFromPoolPtr[tokenKey](&poolMetaToken, size) - r.releasers = append(r.releasers, free) - return *bufPtr -} - -func (r *indexerResources) newTokenizerMetaTokensPtr(size int) *[]tokenizer.MetaToken { - bufPtr, free := acquireFromPoolPtr[tokenizer.MetaToken](&poolTokenizerMetaToken, size) - r.releasers = append(r.releasers, free) - return bufPtr -} - -func (r *indexerResources) newStrings(size int) []string { - bufPtr, free := acquireFromPoolPtr[string](&poolStrings, size) - r.releasers = append(r.releasers, free) - return *bufPtr -} - -func (a 
*indexerResources) newTokenizerMetaTokens(size int) ([]tokenizer.MetaToken, func([]tokenizer.MetaToken)) { - bufPtr := a.newTokenizerMetaTokensPtr(size) - return *bufPtr, func(mt []tokenizer.MetaToken) { *bufPtr = mt } -} - -func (r *indexerResources) newMetaData(size int) []indexer.MetaData { - bufPtr, free := acquireFromPoolPtr[indexer.MetaData](&poolMetaData, size) - r.releasers = append(r.releasers, free) - return *bufPtr -} - -func (r *indexerResources) releaseAll() { - for _, r := range r.releasers { - r() - } - poolAllocator.Put(r) -} - -func acquireSlice[T any](size int) ([]T, func()) { - var tmp T - itemSize := int(unsafe.Sizeof(tmp)) - buf := bytespool.AcquireLen(int(size) * itemSize) - res := unsafe.Slice((*T)(unsafe.Pointer(unsafe.SliceData(buf.B))), size) - return res, func() { bytespool.Release(buf) } -} - -func acquireFromPoolPtr[T any](pool *sync.Pool, size int) (*[]T, func()) { - buf, ok := pool.Get().([]T) - if !ok { - buf = make([]T, size) - } else { - buf = slices.Grow(buf[:0], size)[:size] - } - return &buf, func() { pool.Put(buf) } -} diff --git a/frac/active2/mem_index.go b/frac/active2/mem_index.go index 674ca92a..3aebdce8 100644 --- a/frac/active2/mem_index.go +++ b/frac/active2/mem_index.go @@ -1,21 +1,23 @@ package active2 import ( + "sort" + "github.com/ozontech/seq-db/seq" ) -type tokensRange struct { +type tokenRange struct { start uint32 count uint32 } type memIndex struct { - ids []seq.ID // IDs ordered DESC - tokens [][]byte // tokens ordered ASC by field:token - tokenLIDs [][]uint32 // LIDs list for each token from `tokens` - fieldsTokens map[string]tokensRange // tokens locator for each field - fields [][]byte // fields ordered ASC - blocksOffsets []uint64 // blocks offsets ordered by offset + ids []seq.ID // IDs ordered DESC + tokens [][]byte // tokens ordered ASC by field:token + tokenLIDs [][]uint32 // LIDs list for each token from `tokens` + fieldsTokens map[string]tokenRange // tokens locator for each field + fields [][]byte 
// fields ordered ASC + blocksOffsets []uint64 // blocks offsets ordered by offset idToLID map[seq.ID]uint32 positions []seq.DocPos allTID uint32 @@ -48,3 +50,13 @@ func (index *memIndex) IsIntersecting(from, to seq.MID) bool { } return true } + +func (index *memIndex) GetLIDByID(id seq.ID) (uint32, bool) { + lid, ok := index.idToLID[id] + return lid, ok + + // alternative + // todo check to use 1-based lids + i, ok := sort.Find(len(index.ids), func(i int) int { return seq.Compare(index.ids[i], id) }) + return uint32(i), ok +} diff --git a/frac/active2/merge.go b/frac/active2/merge.go index 85e30da4..55bc49d9 100644 --- a/frac/active2/merge.go +++ b/frac/active2/merge.go @@ -27,7 +27,7 @@ func mergeIndexes(indexes []*memIndex) *memIndex { ids: make([]seq.ID, 0, docsCount), positions: make([]seq.DocPos, docsCount), idToLID: make(map[seq.ID]uint32, docsCount), - fieldsTokens: make(map[string]tokensRange, fieldsCount), + fieldsTokens: make(map[string]tokenRange, fieldsCount), blocksOffsets: make([]uint64, 0, blocksCount), docsSize: docsSize, } @@ -114,7 +114,7 @@ func mergeTokens(dst *memIndex, orig []mergeIterator) { dst.fieldsTokens[string(prevField)] = tr } dst.fields = append(dst.fields, minToken.Key) - dst.fieldsTokens[string(minToken.Key)] = tokensRange{start: uint32(len(dst.tokens))} + dst.fieldsTokens[string(minToken.Key)] = tokenRange{start: uint32(len(dst.tokens))} prevField = minToken.Key } diff --git a/frac/active2/resources.go b/frac/active2/resources.go new file mode 100644 index 00000000..8bf839ba --- /dev/null +++ b/frac/active2/resources.go @@ -0,0 +1,125 @@ +package active2 + +import ( + "github.com/ozontech/seq-db/indexer" + "github.com/ozontech/seq-db/resources" + "github.com/ozontech/seq-db/tokenizer" +) + +var ( + indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](24) + tokenizerMetaTokenPool = resources.NewSizedPool[tokenizer.MetaToken](24) + tokenKeyPool = resources.NewSizedPool[token](24) + tokenMapPool = 
resources.TypedPool[map[token]uint32]{} + resourcesPool = resources.TypedPool[*Resources]{} + bufPool = resources.TypedPool[*indexBuffer]{} +) + +// Resources provides pooled memory allocation for index construction. +// It manages reusable buffers to avoid GC pressure during indexing. +type Resources struct { + releases *resources.CallStack + + uint32s resources.SliceOnBytes[uint32] + uint64s resources.SliceOnBytes[uint64] + bytes resources.SliceAllocator[byte] + strings resources.SliceAllocator[string] + uint32Slices resources.SliceAllocator[[]uint32] + + indexerMetaData resources.SliceAllocator[indexer.MetaData] + tokenizerMetaTokens resources.SliceAllocator[tokenizer.MetaToken] + tokenKeys resources.SliceAllocator[token] + + tokenMap resources.MapAllocator[token, uint32] + + buf resources.ObjectAllocator[indexBuffer] +} + +func NewResources() (*Resources, func()) { + r, ok := resourcesPool.Get() + if !ok { + s := resources.CallStack{} + r = &Resources{ + releases: &s, + uint32s: resources.NewUint32s(&s), + uint64s: resources.NewUint64s(&s), + bytes: resources.NewBytes(&s), + strings: resources.NewStrings(&s), + uint32Slices: resources.NewUint32Slices(&s), + + indexerMetaData: resources.NewSliceAllocator(&indexerMetaDataPool, &s), + tokenizerMetaTokens: resources.NewSliceAllocator(&tokenizerMetaTokenPool, &s), + tokenKeys: resources.NewSliceAllocator(&tokenKeyPool, &s), + + tokenMap: resources.NewMapAllocator(&tokenMapPool, &s), + + buf: resources.NewObjectAllocator(&bufPool, &s), + } + } + return r, func() { + r.releases.CallAll() + resourcesPool.Put(r) + } +} + +func (r *Resources) Bytes() resources.SliceAllocator[byte] { + return r.bytes +} + +func (r *Resources) Uint32s() resources.SliceOnBytes[uint32] { + return r.uint32s +} + +func (r *Resources) Uint64s() resources.SliceOnBytes[uint64] { + return r.uint64s +} + +func (r *Resources) Uint32Slices() resources.SliceAllocator[[]uint32] { + return r.uint32Slices +} + +func (r *Resources) Strings() 
resources.SliceAllocator[string] { + return r.strings +} + +func (r *Resources) Metadata() resources.SliceAllocator[indexer.MetaData] { + return r.indexerMetaData +} + +func (r *Resources) MetaTokens() resources.SliceAllocator[tokenizer.MetaToken] { + return r.tokenizerMetaTokens +} + +func (r *Resources) Tokens() resources.SliceAllocator[token] { + return r.tokenKeys +} + +func (r *Resources) TokenMap() resources.MapAllocator[token, uint32] { + return r.tokenMap +} + +func (r *Resources) Buffer() *indexBuffer { + return r.buf.Alloc(func() *indexBuffer { + return &indexBuffer{ + sizes: make([]uint32, 0, 1000), + fields: make([]string, 0, 100), + fieldTIDs: make([]uint32, 0, 100), + tokens: make([]tokenizer.MetaToken, 0, 1000), + } + }, func(b *indexBuffer) { + b.fields = b.fields[:0] + b.tokens = b.tokens[:0] + b.fieldTIDs = b.fieldTIDs[:0] + b.sizes = b.sizes[:0] + }) +} + +// indexBuffer is a temporary buffer used during index construction to avoid allocations. +// It holds intermediate data structures that are needed during processing but not in the final index. +// All fields are reused across different processing stages to minimize memory allocations. 
+type indexBuffer struct { + sizes []uint32 + fields []string + fieldTIDs []uint32 + tokens []tokenizer.MetaToken +} diff --git a/frac/active2/sealing_source.go b/frac/active2/sealing_source.go index fb9d09a3..030ee7af 100644 --- a/frac/active2/sealing_source.go +++ b/frac/active2/sealing_source.go @@ -1,8 +1,19 @@ package active2 /* +import ( + "iter" + "time" + "unsafe" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/util" +) + type SealingSource struct { - info *common.Info + info *frac.Info created time.Time index *memIndex lastErr error diff --git a/resources/call_stack.go b/resources/call_stack.go new file mode 100644 index 00000000..50df25d9 --- /dev/null +++ b/resources/call_stack.go @@ -0,0 +1,16 @@ +package resources + +type CallStack struct { + stack []func() +} + +func (s *CallStack) Defer(f func()) { + s.stack = append(s.stack, f) +} + +func (s *CallStack) CallAll() { + for i := len(s.stack) - 1; i >= 0; i-- { + s.stack[i]() + } + s.stack = s.stack[:0] +} diff --git a/resources/global_pools.go b/resources/global_pools.go new file mode 100644 index 00000000..4be6f014 --- /dev/null +++ b/resources/global_pools.go @@ -0,0 +1,7 @@ +package resources + +var ( + BytesPool = NewSizedPool[byte](24) + StringsPool = NewSizedPool[string](24) + Uint32SlicesPool = NewSizedPool[[]uint32](24) +) diff --git a/resources/object_allocator.go b/resources/object_allocator.go new file mode 100644 index 00000000..2603f3d5 --- /dev/null +++ b/resources/object_allocator.go @@ -0,0 +1,47 @@ +package resources + +type MapAllocator[K comparable, V any] struct { + pool *TypedPool[map[K]V] + releases *CallStack +} + +func NewMapAllocator[K comparable, V any](pool *TypedPool[map[K]V], releases *CallStack) MapAllocator[K, V] { + return MapAllocator[K, V]{ + pool: pool, + releases: releases, + } +} + +func (a MapAllocator[K, V]) Alloc(size int) map[K]V { + obj, ok := a.pool.Get() 
+ if ok { + clear(obj) + } else { + obj = make(map[K]V, size) + } + a.releases.Defer(func() { a.pool.Put(obj) }) + return obj +} + +type ObjectAllocator[T any] struct { + pool *TypedPool[*T] + releases *CallStack +} + +func NewObjectAllocator[T any](pool *TypedPool[*T], releases *CallStack) ObjectAllocator[T] { + return ObjectAllocator[T]{ + pool: pool, + releases: releases, + } +} + +func (a ObjectAllocator[T]) Alloc(newFn func() *T, resetFn func(*T)) *T { + obj, ok := a.pool.Get() + if ok { + resetFn(obj) + } else { + obj = newFn() + } + a.releases.Defer(func() { a.pool.Put(obj) }) + return obj +} diff --git a/resources/sized_pool.go b/resources/sized_pool.go new file mode 100644 index 00000000..834771d4 --- /dev/null +++ b/resources/sized_pool.go @@ -0,0 +1,71 @@ +package resources + +import ( + "math/bits" + "sync" +) + +type TypedPool[T any] struct { + pool sync.Pool +} + +func (p *TypedPool[T]) Get() (T, bool) { + item := p.pool.Get() + var val T + if item == nil { + return val, false + } + val, ok := item.(T) + return val, ok +} + +func (p *TypedPool[T]) Put(item T) { + p.pool.Put(item) +} + +type SizedPool[T any] struct { + pools []TypedPool[[]T] +} + +func NewSizedPool[T any](buckets int) SizedPool[T] { + return SizedPool[T]{ + pools: make([]TypedPool[[]T], buckets), + } +} + +func index(size uint) (idx, leftBorder int) { + idx = bits.Len((size - 1) >> 8) + return idx, 1 << (idx + 8) +} + +func (p SizedPool[T]) Get(size int) []T { + idx, poolCapacity := index(uint(size)) + + if idx < len(p.pools) { + if data, ok := p.pools[idx].Get(); ok { + return data[:size] + } + } + + idx++ + if idx < len(p.pools) { + if data, ok := p.pools[idx].Get(); ok { + return data[:size] + } + } + + return make([]T, size, poolCapacity) +} + +func (p SizedPool[T]) Put(item []T) { + capacity := cap(item) + idx, leftBorder := index(uint(capacity)) + + if idx > 0 && capacity < leftBorder { + idx-- + } + + if idx < len(p.pools) { + p.pools[idx].Put(item) + } +} diff --git 
a/resources/slice_allocator.go b/resources/slice_allocator.go new file mode 100644 index 00000000..fb9fabc4 --- /dev/null +++ b/resources/slice_allocator.go @@ -0,0 +1,31 @@ +package resources + +func NewBytes(releases *CallStack) SliceAllocator[byte] { + return NewSliceAllocator(&BytesPool, releases) +} + +func NewStrings(releases *CallStack) SliceAllocator[string] { + return NewSliceAllocator(&StringsPool, releases) +} + +func NewUint32Slices(releases *CallStack) SliceAllocator[[]uint32] { + return NewSliceAllocator(&Uint32SlicesPool, releases) +} + +type SliceAllocator[T any] struct { + pool *SizedPool[T] + releases *CallStack +} + +func NewSliceAllocator[T any](pool *SizedPool[T], releases *CallStack) SliceAllocator[T] { + return SliceAllocator[T]{ + pool: pool, + releases: releases, + } +} + +func (a SliceAllocator[T]) AllocSlice(size int) []T { + data := a.pool.Get(size) + a.releases.Defer(func() { a.pool.Put(data) }) + return data[:size] +} diff --git a/resources/slice_on_bytes.go b/resources/slice_on_bytes.go new file mode 100644 index 00000000..f2a18ffe --- /dev/null +++ b/resources/slice_on_bytes.go @@ -0,0 +1,38 @@ +package resources + +import "unsafe" + +func NewUint32s(releases *CallStack) SliceOnBytes[uint32] { + return NewSliceOnBytes[uint32](releases) +} + +func NewUint64s(releases *CallStack) SliceOnBytes[uint64] { + return NewSliceOnBytes[uint64](releases) +} + +type SliceOnBytes[T any] struct { + pool *SizedPool[byte] + releases *CallStack +} + +func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { + return SliceOnBytes[T]{ + pool: &BytesPool, + releases: releases, + } +} + +func (a SliceOnBytes[T]) AllocSlice(size int) []T { + data, buf := a.getBuf(size) + a.releases.Defer(func() { a.pool.Put(buf) }) + return data +} + +func (a SliceOnBytes[T]) getBuf(size int) ([]T, []byte) { + var tmp T + itemSize := int(unsafe.Sizeof(tmp)) + buf := a.pool.Get(size * itemSize) + capacity := cap(buf) / itemSize + data := 
unsafe.Slice((*T)(unsafe.Pointer(unsafe.SliceData(buf))), capacity)[:size] + return data, buf +} From d84e7e595390a334040e1fb6eec29db7951aab55 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Wed, 10 Dec 2025 18:14:58 +0300 Subject: [PATCH 05/28] sealing and merging --- frac/active/docs_source.go | 80 +++++++++++++ frac/{ => active}/file_writer.go | 2 +- frac/{ => active}/file_writer_test.go | 2 +- frac/active/sealing_source.go | 48 ++++---- frac/active/writer.go | 9 +- frac/active2/indexer.go | 6 +- frac/active2/mem_index_pool.go | 6 +- frac/active2/merge.go | 96 +++++++-------- frac/active2/merge_iterator.go | 35 +++--- frac/active2/merge_manager.go | 7 +- frac/active2/resources.go | 66 ++++------- frac/active2/sealing_source.go | 71 +++++------ frac/info.go | 9 ++ frac/sealed/sealing/sort_docs.go | 164 ++++++++++++++++++++++++++ frac/tests/fraction_test.go | 92 +++++++++++++-- fracmanager/fracmanager.go | 2 +- fracmanager/fraction_provider.go | 48 ++++++-- fracmanager/fraction_provider_test.go | 6 +- fracmanager/sealer_test.go | 10 +- 19 files changed, 548 insertions(+), 211 deletions(-) create mode 100644 frac/active/docs_source.go rename frac/{ => active}/file_writer.go (99%) rename frac/{ => active}/file_writer_test.go (99%) create mode 100644 frac/sealed/sealing/sort_docs.go diff --git a/frac/active/docs_source.go b/frac/active/docs_source.go new file mode 100644 index 00000000..c70c5ddb --- /dev/null +++ b/frac/active/docs_source.go @@ -0,0 +1,80 @@ +package active + +import ( + "iter" + + "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" +) + +var _ sealing.DocsSource = (*DocsSource)(nil) + +type DocsSource struct { + src sealing.Source + blocksOffsets []uint64 + docsReader *storage.DocsReader + lastErr error +} + +func NewDocsSource(src sealing.Source, blocksOffsets []uint64, docsReader *storage.DocsReader) *DocsSource { + return 
&DocsSource{ + src: src, + blocksOffsets: blocksOffsets, + docsReader: docsReader, + } +} + +// Docs returns an iterator for documents with their IDs. +// Handles duplicate IDs (for nested indexes). +func (ds *DocsSource) Docs() iter.Seq2[seq.ID, []byte] { + ds.lastErr = nil + return func(yield func(seq.ID, []byte) bool) { + var ( + prev seq.ID + curDoc []byte + ) + + // iterate through ID and position blocks + for ids, pos := range ds.src.IDsBlocks(consts.IDsPerBlock) { + for i, id := range ids { + if id == systemSeqID { + curDoc = nil // reserved system document (no payload) + } else if id != prev { + // if ID changed, read new document + if curDoc, ds.lastErr = ds.doc(pos[i]); ds.lastErr != nil { + return + } + } + prev = id + if !yield(id, curDoc) { + return + } + } + } + } +} + +// doc reads a document from storage by its position. +func (ds *DocsSource) doc(pos seq.DocPos) ([]byte, error) { + blockIndex, docOffset := pos.Unpack() + blockOffset := ds.blocksOffsets[blockIndex] + + var doc []byte + err := ds.docsReader.ReadDocsFunc(blockOffset, []uint64{docOffset}, func(b []byte) error { + doc = b + return nil + }) + if err != nil { + return nil, err + } + return doc, nil +} + +func (ds *DocsSource) LastError() error { + if ds.lastErr != nil { + return ds.lastErr + } + return ds.src.LastError() +} diff --git a/frac/file_writer.go b/frac/active/file_writer.go similarity index 99% rename from frac/file_writer.go rename to frac/active/file_writer.go index 9ac5ab9c..0b1c2a86 100644 --- a/frac/file_writer.go +++ b/frac/active/file_writer.go @@ -1,4 +1,4 @@ -package frac +package active import ( "io" diff --git a/frac/file_writer_test.go b/frac/active/file_writer_test.go similarity index 99% rename from frac/file_writer_test.go rename to frac/active/file_writer_test.go index b72c011b..84fe9e02 100644 --- a/frac/file_writer_test.go +++ b/frac/active/file_writer_test.go @@ -1,4 +1,4 @@ -package frac +package active import ( "errors" diff --git 
a/frac/active/sealing_source.go b/frac/active/sealing_source.go index d4208f4c..3bc5e972 100644 --- a/frac/active/sealing_source.go +++ b/frac/active/sealing_source.go @@ -18,6 +18,7 @@ import ( "github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" @@ -40,23 +41,23 @@ import ( // All iterators work with pre-sorted data and return information // in an order optimal for creating disk index structures. type SealingSource struct { - params frac.SealParams // Sealing parameters - info *frac.Info // fraction Info - created time.Time // Creation time of the source - sortedLIDs []uint32 // Sorted LIDs (Local ID) - oldToNewLIDs []uint32 // Mapping from old LIDs to new ones (after sorting) - mids *UInt64s // MIDs - rids *UInt64s // RIDs - fields []string // Sorted field names - fieldsMaxTIDs []uint32 // Maximum TIDs for each field - tids []uint32 // Sorted TIDs (Token ID) - tokens [][]byte // Tokens (values) by TID - lids []*TokenLIDs // LID lists for each token - docPosOrig []seq.DocPos // Original document positions - docPosSorted []seq.DocPos // Document positions after sorting - blocksOffsets []uint64 // Document block offsets - docsReader *storage.DocsReader // Document storage reader - lastErr error // Last error + params frac.SealParams // Sealing parameters + info *frac.Info // fraction Info + created time.Time // Creation time of the source + sortedLIDs []uint32 // Sorted LIDs (Local ID) + oldToNewLIDs []uint32 // Mapping from old LIDs to new ones (after sorting) + mids *UInt64s // MIDs + rids *UInt64s // RIDs + fields []string // Sorted field names + fieldsMaxTIDs []uint32 // Maximum TIDs for each field + tids []uint32 // Sorted TIDs (Token ID) + tokens [][]byte // Tokens (values) by TID + lids []*TokenLIDs // LID lists for each token + docPosOrig 
map[seq.ID]seq.DocPos // Original document positions + docPosSorted []seq.DocPos // Document positions after sorting + blocksOffsets []uint64 // Document block offsets + docsReader *storage.DocsReader // Document storage reader + lastErr error // Last error } // NewSealingSource creates a new data source for sealing @@ -84,7 +85,7 @@ func NewSealingSource(active *Active, params frac.SealParams) (*SealingSource, e fieldsMaxTIDs: fieldsMaxTIDs, tokens: active.TokenList.tidToVal, lids: active.TokenList.tidToLIDs, - docPosOrig: active.DocsPositions.lidToPos, + docPosOrig: active.DocsPositions.idToPos, blocksOffsets: active.DocBlocks.vals, docsReader: &active.sortReader, } @@ -93,9 +94,14 @@ func NewSealingSource(active *Active, params frac.SealParams) (*SealingSource, e // Sort documents if not skipped in configuration if !active.Config.SkipSortDocs { - if err := src.SortDocs(); err != nil { + ds := NewDocsSource(&src, src.blocksOffsets, &active.sortReader) + blocksOffsets, positions, onDiskSize, err := sealing.SortDocs(info.Path, params, ds) + if err != nil { return nil, err } + src.docPosSorted = positions[1:] + src.blocksOffsets = blocksOffsets + src.info.DocsOnDisk = uint64(onDiskSize) } return &src, nil @@ -232,9 +238,9 @@ func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.Doc // Use sorted or original positions if len(src.docPosSorted) == 0 { - pos = append(pos, src.docPosOrig[lid]) + pos = append(pos, src.docPosOrig[id]) } else { - pos = append(pos, src.docPosSorted[i+1]) // +1 for system document + pos = append(pos, src.docPosSorted[i]) // +1 for system document } } yield(ids, pos) diff --git a/frac/active/writer.go b/frac/active/writer.go index 1aadee64..6528b92d 100644 --- a/frac/active/writer.go +++ b/frac/active/writer.go @@ -3,20 +3,19 @@ package active import ( "os" - "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/metric/stopwatch" "github.com/ozontech/seq-db/storage" ) type Writer struct { - docs *frac.FileWriter 
- meta *frac.FileWriter + docs *FileWriter + meta *FileWriter } func NewWriter(docsFile, metaFile *os.File, docsOffset, metaOffset int64, skipFsync bool) *Writer { return &Writer{ - docs: frac.NewFileWriter(docsFile, docsOffset, skipFsync), - meta: frac.NewFileWriter(metaFile, metaOffset, skipFsync), + docs: NewFileWriter(docsFile, docsOffset, skipFsync), + meta: NewFileWriter(metaFile, metaOffset, skipFsync), } } diff --git a/frac/active2/indexer.go b/frac/active2/indexer.go index c7386223..494e30c1 100644 --- a/frac/active2/indexer.go +++ b/frac/active2/indexer.go @@ -29,11 +29,11 @@ func NewIndexer(workerCount int) *Indexer { } // Index starts asynchronous document indexing -func (idxr *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, err error)) { - idxr.sem <- struct{}{} +func (idx *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, err error)) { + idx.sem <- struct{}{} go func() { apply(NewMemIndex(block)) - <-idxr.sem + <-idx.sem }() } diff --git a/frac/active2/mem_index_pool.go b/frac/active2/mem_index_pool.go index 105316ed..c64a730d 100644 --- a/frac/active2/mem_index_pool.go +++ b/frac/active2/mem_index_pool.go @@ -71,13 +71,13 @@ func (p *memIndexPool) markAsMerging(items []memIndexExt) { } } -func (p *memIndexPool) replace(processed []memIndexExt, merged *memIndex) { - mergedMeta := p.wrapIndex(merged) +func (p *memIndexPool) replace(oldIndexes []memIndexExt, newIndex *memIndex) { + mergedMeta := p.wrapIndex(newIndex) p.mu.Lock() defer p.mu.Unlock() - for _, metaIndex := range processed { + for _, metaIndex := range oldIndexes { delete(p.underMerging, metaIndex.id) } p.readyToMerge[mergedMeta.id] = mergedMeta diff --git a/frac/active2/merge.go b/frac/active2/merge.go index 55bc49d9..b4aacd38 100644 --- a/frac/active2/merge.go +++ b/frac/active2/merge.go @@ -14,7 +14,7 @@ func mergeIndexes(indexes []*memIndex) *memIndex { blocksCount := 0 fieldsCount := 0 docsSize := uint64(0) - iterators := make([]mergeIterator, 0, 
len(indexes)) + iterators := make([]*mergeIterator, 0, len(indexes)) for _, index := range indexes { docsSize += index.docsSize docsCount += len(index.ids) @@ -25,30 +25,32 @@ func mergeIndexes(indexes []*memIndex) *memIndex { dst := &memIndex{ ids: make([]seq.ID, 0, docsCount), - positions: make([]seq.DocPos, docsCount), + positions: make([]seq.DocPos, 0, docsCount), idToLID: make(map[seq.ID]uint32, docsCount), fieldsTokens: make(map[string]tokenRange, fieldsCount), blocksOffsets: make([]uint64, 0, blocksCount), docsSize: docsSize, + docsCount: uint32(docsCount), } + mergeBlocksOffsets(dst, iterators) + doubles := mergeIDs(dst, iterators) mergeTokens(dst, iterators) - mergePositions(dst, iterators) - dst.docsCount = uint32(len(dst.ids)) dst.allTID = dst.fieldsTokens[seq.TokenAll].start if len(doubles) > 0 { + dst.docsCount = uint32(len(doubles)) logger.Warn("there are duplicate IDs when compaction", zap.Int("doubles", len(doubles))) } return dst } -func mergeIDs(dst *memIndex, orig []mergeIterator) []seq.ID { +func mergeIDs(dst *memIndex, orig []*mergeIterator) []seq.ID { doubles := []seq.ID{} - iterators := append([]mergeIterator{}, orig...) // make copy + iterators := append([]*mergeIterator{}, orig...) 
// make copy for len(iterators) > 0 { // try select first @@ -64,14 +66,19 @@ func mergeIDs(dst *memIndex, orig []mergeIterator) []seq.ID { } } - lid := uint32(len(dst.ids)) + lid := uint32(len(dst.ids)) + 1 + dst.ids = append(dst.ids, maxID) + dst.idToLID[maxID] = lid + dst.positions = append(dst.positions, iterators[selected[0]].CurrentPos()) + + k := 0 for _, i := range selected { - iterators[i].AddNewLID(lid) - if !iterators[i].ShiftID() { - removeItem(iterators, i) + iterators[i-k].AddNewLID(lid) + if !iterators[i-k].ShiftID() { + iterators = removeItem(iterators, i-k) + k++ } } - dst.ids = append(dst.ids, maxID) if len(selected) > 1 { doubles = append(doubles, maxID) @@ -80,11 +87,11 @@ func mergeIDs(dst *memIndex, orig []mergeIterator) []seq.ID { return doubles } -func mergeTokens(dst *memIndex, orig []mergeIterator) { +func mergeTokens(dst *memIndex, orig []*mergeIterator) { // todo copy tokens to compact mem usage // todo allocate for all lids at once to optimize allocations var prevField []byte - iterators := append([]mergeIterator{}, orig...) // make copy + iterators := append([]*mergeIterator{}, orig...) // make copy for len(iterators) > 0 { // try select first selected := []int{0} @@ -100,11 +107,13 @@ func mergeTokens(dst *memIndex, orig []mergeIterator) { } } + k := 0 lids := make([][]uint32, 0, len(selected)) for _, i := range selected { - lids = append(lids, iterators[i].CurrentTokenLIDs()) - if !iterators[i].ShiftToken() { - removeItem(iterators, i) + lids = append(lids, iterators[i-k].CurrentTokenLIDs()) + if !iterators[i-k].ShiftToken() { + iterators = removeItem(iterators, i-k) + k++ } } @@ -127,35 +136,17 @@ func mergeTokens(dst *memIndex, orig []mergeIterator) { } } -func mergePositions(dst *memIndex, orig []mergeIterator) { - iterators := append([]mergeIterator{}, orig...) 
// make copy - for len(iterators) > 0 { - // try select first - selected := []int{0} - minOffset := iterators[0].CurrentBlocksOffset() - - for i := 1; i < len(iterators); i++ { - if cur := iterators[i].CurrentBlocksOffset(); cur == minOffset { - selected = append(selected, i) - } else if cur < minOffset { - minOffset = cur - selected = []int{i} - } +func mergeBlocksOffsets(dst *memIndex, src []*mergeIterator) { + var offset uint32 + for _, it := range src { + for _, offset := range it.index.blocksOffsets { + dst.blocksOffsets = append(dst.blocksOffsets, offset) } - - newBlockIndex := len(dst.blocksOffsets) - dst.blocksOffsets = append(dst.blocksOffsets, minOffset) - - for _, i := range selected { - iterators[i].AddNewBlockIndex(newBlockIndex) - if !iterators[i].ShiftBlocksOffset() { - removeItem(iterators, i) - } + for _, p := range it.index.positions { + oldIdx, docOffset := p.Unpack() + it.AddPos(seq.PackDocPos(oldIdx+offset, docOffset)) } - } - - for _, iterator := range orig { - iterator.RepackDocPositions(dst.positions) + offset += uint32(len(it.index.blocksOffsets)) } } @@ -191,9 +182,12 @@ func mergeLIDs(lids [][]uint32) []uint32 { res = append(res, minLID) + k := 0 for _, i := range selected { - if lids[i] = lids[i][1:]; len(lids[i]) == 0 { - removeItem(lids, i) + lids[i-k] = lids[i-k][1:] + if len(lids[i-k]) == 0 { + lids = removeItem(lids, i-k) + k++ } } } @@ -202,8 +196,14 @@ func mergeLIDs(lids [][]uint32) []uint32 { } func removeItem[V any](items []V, i int) []V { - last := len(items) - 1 - items[i] = items[last] - items = items[:last] + k := 0 + for j, v := range items { + if i == j { + continue + } + items[k] = v + k++ + } + items = items[:k] return items } diff --git a/frac/active2/merge_iterator.go b/frac/active2/merge_iterator.go index 904735bd..6fee9fa6 100644 --- a/frac/active2/merge_iterator.go +++ b/frac/active2/merge_iterator.go @@ -14,16 +14,23 @@ type mergeIterator struct { posBlocks int lastFieldToken int newLIDs []uint32 + newPositions 
[]seq.DocPos newBlocks []int } -func newIndexIterator(index *memIndex) mergeIterator { - return mergeIterator{ +func newIndexIterator(index *memIndex) *mergeIterator { + x := &mergeIterator{ index: index, - newLIDs: make([]uint32, len(index.ids)), - newBlocks: make([]int, len(index.blocksOffsets)), + newLIDs: make([]uint32, 0, len(index.ids)), + newBlocks: make([]int, 0, len(index.blocksOffsets)), lastFieldToken: int(index.fieldsTokens[string(index.fields[0])].count) - 1, } + + field := x.index.fields[x.posField] + r := x.index.fieldsTokens[string(field)] + x.lastFieldToken += int(r.count) - 1 + + return x } func (iq *mergeIterator) ShiftID() bool { @@ -38,6 +45,10 @@ func (iq *mergeIterator) CurrentID() seq.ID { return iq.index.ids[iq.posIDs] } +func (iq *mergeIterator) CurrentPos() seq.DocPos { + return iq.newPositions[iq.posIDs] +} + func (iq *mergeIterator) ShiftToken() bool { iq.posToken++ if iq.posToken == len(iq.index.tokens) { @@ -47,7 +58,7 @@ func (iq *mergeIterator) ShiftToken() bool { iq.posField++ field := iq.index.fields[iq.posField] r := iq.index.fieldsTokens[string(field)] - iq.lastFieldToken += int(r.count) - 1 + iq.lastFieldToken += int(r.count) } return true } @@ -63,7 +74,7 @@ func (iq *mergeIterator) CurrentTokenLIDs() []uint32 { src := iq.index.tokenLIDs[iq.posToken] dst := make([]uint32, 0, len(src)) for _, oldLid := range src { - dst = append(dst, iq.newLIDs[oldLid-1]+1) + dst = append(dst, iq.newLIDs[oldLid-1]) } return dst } @@ -80,6 +91,10 @@ func (iq *mergeIterator) CurrentBlocksOffset() uint64 { return iq.index.blocksOffsets[iq.posBlocks] } +func (iq *mergeIterator) AddPos(p seq.DocPos) { + iq.newPositions = append(iq.newPositions, p) +} + func (iq *mergeIterator) AddNewLID(lid uint32) { iq.newLIDs = append(iq.newLIDs, lid) } @@ -87,11 +102,3 @@ func (iq *mergeIterator) AddNewLID(lid uint32) { func (iq *mergeIterator) AddNewBlockIndex(blockIndex int) { iq.newBlocks = append(iq.newBlocks, blockIndex) } - -func (iq *mergeIterator) 
RepackDocPositions(dst []seq.DocPos) { - for lid, docPos := range iq.index.positions { - oldBlockIndex, docOffset := docPos.Unpack() - newBlockIndex := uint32(iq.newBlocks[oldBlockIndex]) - dst[lid] = seq.PackDocPos(newBlockIndex, docOffset) - } -} diff --git a/frac/active2/merge_manager.go b/frac/active2/merge_manager.go index 10d480d1..d3355811 100644 --- a/frac/active2/merge_manager.go +++ b/frac/active2/merge_manager.go @@ -59,6 +59,7 @@ func (m *MergeManager) MergeAll() *memIndex { m.wg.Wait() indexesToMerge := m.indexes.ReadyToMerge() + m.indexes.markAsMerging(indexesToMerge) mergedIndex := mergeIndexes(extractIndexes(indexesToMerge)) m.indexes.replace(indexesToMerge, mergedIndex) @@ -115,10 +116,10 @@ func (m *MergeManager) mergeScheduler() { break } - for _, groupToMerge := range preparedGroups { + for _, toMerge := range preparedGroups { go func() { - mergedIndex := mergeIndexes(extractIndexes(groupToMerge)) - m.indexes.replace(groupToMerge, mergedIndex) + mergedIndex := mergeIndexes(extractIndexes(toMerge)) + m.indexes.replace(toMerge, mergedIndex) m.releaseWorker() m.triggerMerge() // check if new merge is needed m.wg.Done() diff --git a/frac/active2/resources.go b/frac/active2/resources.go index 8bf839ba..3d684e12 100644 --- a/frac/active2/resources.go +++ b/frac/active2/resources.go @@ -7,12 +7,11 @@ import ( ) var ( - indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](24) - tokenizerMetaTokenPool = resources.NewSizedPool[tokenizer.MetaToken](24) - tokenKeyPool = resources.NewSizedPool[token](24) - tokenMapPool = resources.TypedPool[map[token]uint32]{} - resourcesPool = resources.TypedPool[*Resources]{} - bufPool = resources.TypedPool[*indexBuffer]{} + tokenKeyPool = resources.NewSizedPool[token](24) + indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](24) + tokenMapPool = resources.TypedPool[map[token]uint32]{} + resourcesPool = resources.TypedPool[*Resources]{} + bufPool = resources.TypedPool[*indexBuffer]{} ) // Resources 
provides pooled memory allocation for index construction. @@ -20,19 +19,14 @@ var ( type Resources struct { releases *resources.CallStack - uint32s resources.SliceOnBytes[uint32] - uint64s resources.SliceOnBytes[uint64] - bytes resources.SliceAllocator[byte] - strings resources.SliceAllocator[string] - uint32Slices resources.SliceAllocator[[]uint32] - - indexerMetaData resources.SliceAllocator[indexer.MetaData] - tokenizerMetaTokens resources.SliceAllocator[tokenizer.MetaToken] - tokenKeys resources.SliceAllocator[token] - - tokenMap resources.MapAllocator[token, uint32] - - buf resources.ObjectAllocator[indexBuffer] + uint32s resources.SliceOnBytes[uint32] + uint64s resources.SliceOnBytes[uint64] + bytes resources.SliceAllocator[byte] + uint32Slices resources.SliceAllocator[[]uint32] + tokenKeys resources.SliceAllocator[token] + indexerMetaData resources.SliceAllocator[indexer.MetaData] + tokenMap resources.MapAllocator[token, uint32] + buf resources.ObjectAllocator[indexBuffer] } func NewResources() (*Resources, func()) { @@ -40,20 +34,15 @@ func NewResources() (*Resources, func()) { if !ok { s := resources.CallStack{} r = &Resources{ - releases: &s, - uint32s: resources.NewUint32s(&s), - uint64s: resources.NewUint64s(&s), - bytes: resources.NewBytes(&s), - strings: resources.NewStrings(&s), - uint32Slices: resources.NewUint32Slices(&s), - - indexerMetaData: resources.NewSliceAllocator(&indexerMetaDataPool, &s), - tokenizerMetaTokens: resources.NewSliceAllocator(&tokenizerMetaTokenPool, &s), - tokenKeys: resources.NewSliceAllocator(&tokenKeyPool, &s), - - tokenMap: resources.NewMapAllocator(&tokenMapPool, &s), - - buf: resources.NewObjectAllocator(&bufPool, &s), + releases: &s, + uint32s: resources.NewUint32s(&s), + uint64s: resources.NewUint64s(&s), + bytes: resources.NewBytes(&s), + uint32Slices: resources.NewUint32Slices(&s), + indexerMetaData: resources.NewSliceAllocator(&indexerMetaDataPool, &s), + tokenKeys: resources.NewSliceAllocator(&tokenKeyPool, &s), + 
tokenMap: resources.NewMapAllocator(&tokenMapPool, &s), + buf: resources.NewObjectAllocator(&bufPool, &s), } } return r, func() { @@ -78,18 +67,10 @@ func (r *Resources) Uint32Slices() resources.SliceAllocator[[]uint32] { return r.uint32Slices } -func (r *Resources) Strings() resources.SliceAllocator[string] { - return r.strings -} - func (r *Resources) Metadata() resources.SliceAllocator[indexer.MetaData] { return r.indexerMetaData } -func (r *Resources) MetaTokens() resources.SliceAllocator[tokenizer.MetaToken] { - return r.tokenizerMetaTokens -} - func (r *Resources) Tokens() resources.SliceAllocator[token] { return r.tokenKeys } @@ -104,7 +85,7 @@ func (r *Resources) Buffer() *indexBuffer { sizes: make([]uint32, 0, 1000), fields: make([]string, 0, 100), fieldTIDs: make([]uint32, 0, 100), - tokens: make([]tokenizer.MetaToken, 0, 1000), + tokens: make([]tokenizer.MetaToken, 0, 100), } }, func(b *indexBuffer) { b.fields = b.fields[:0] @@ -116,7 +97,6 @@ func (r *Resources) Buffer() *indexBuffer { // indexBuffer is a temporary buffer used during index construction to avoid allocations. // It holds intermediate data structures that are needed during processing but not in the final index. -// All fields are reused across different processing stages to minimize memory allocations. 
type indexBuffer struct { sizes []uint32 fields []string diff --git a/frac/active2/sealing_source.go b/frac/active2/sealing_source.go index 030ee7af..359d91c4 100644 --- a/frac/active2/sealing_source.go +++ b/frac/active2/sealing_source.go @@ -1,71 +1,74 @@ package active2 -/* import ( "iter" + "math" "time" - "unsafe" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/util" ) +var ( + _ sealing.Source = (*SealingSource)(nil) + + systemSeqID = seq.ID{ + MID: math.MaxUint64, + RID: math.MaxUint64, + } +) + type SealingSource struct { info *frac.Info - created time.Time index *memIndex lastErr error } -func NewSealingSource(active *Active2, params common.SealParams) (sealing.Source, error) { - info := *active.info // copy - src := SealingSource{ - info: &info, - created: time.Now(), - index: active.indexes.MergeAll(), +func NewSealingSource(a *Active2, params frac.SealParams) (sealing.Source, error) { + info := *a.info // copy + index := *a.indexes.MergeAll() // copy + + ss := &SealingSource{ + info: &info, + index: &index, } - src.prepareInfo() - if !active.Config.SkipSortDocs { - sortedSrc, err := frac.NewSortedSealingSource(&src, &active.sortReader, params) + // Sort documents if not skipped in configuration + if !a.Config.SkipSortDocs { + ds := active.NewDocsSource(ss, index.blocksOffsets, &a.sortReader) + blocksOffsets, positions, onDiskSize, err := sealing.SortDocs(info.Path, params, ds) if err != nil { return nil, err } - return sortedSrc, nil + ss.index.positions = positions[1:] + ss.index.blocksOffsets = blocksOffsets + ss.info.DocsOnDisk = uint64(onDiskSize) } - return &src, nil -} + ss.info.MetaOnDisk = 0 + ss.info.SealingTime = uint64(time.Now().UnixMilli()) + ss.info.BuildDistributionWithIDs(index.ids) -func (src *SealingSource) prepareInfo() { - src.info.MetaOnDisk = 0 - src.info.SealingTime = 
uint64(src.created.UnixMilli()) - src.info.BuildDistribution(func(yield func(seq.ID) bool) { - for _, id := range src.index.ids { - if !yield(id) { - return - } - } - }) + return ss, nil } -func (src *SealingSource) Info() *common.Info { +func (src *SealingSource) Info() *frac.Info { return src.info } func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { return func(yield func([]seq.ID, []seq.DocPos) bool) { - ids := make([]seq.ID, 0, blockSize) pos := make([]seq.DocPos, 0, blockSize) // first - ids = append(ids, frac.SystemSeqID) // todo; get rid of SystemSeqID in index format + ids = append(ids, systemSeqID) // todo get rid of systemSeqID in index format pos = append(pos, 0) - for _, id := range src.index.ids { + for i, id := range src.index.ids { if len(ids) == blockSize { if !yield(ids, pos) { return @@ -74,14 +77,13 @@ func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.Doc pos = pos[:0] } ids = append(ids, id) - pos = append(pos, src.index.positions[id]) + pos = append(pos, src.index.positions[i]) } yield(ids, pos) } } func (src *SealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { - const uint32Size = int(unsafe.Sizeof(uint32(0))) return func(yield func([][]byte) bool) { actualSize := 0 block := make([][]byte, 0, blockSize) @@ -93,7 +95,7 @@ func (src *SealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { actualSize = 0 block = block[:0] } - actualSize += len(token) + uint32Size + actualSize += len(token) + int(uint32Size) block = append(block, token) } yield(block) @@ -126,7 +128,6 @@ func (src *SealingSource) BlocksOffsets() []uint64 { return src.index.blocksOffsets } -func (ss *SealingSource) LastError() error { - return ss.lastErr +func (src *SealingSource) LastError() error { + return src.lastErr } -*/ diff --git a/frac/info.go b/frac/info.go index 093b6e1c..cb9ba3c2 100644 --- a/frac/info.go +++ b/frac/info.go @@ -79,6 +79,15 @@ func (s *Info) BuildDistribution(mids []uint64) 
{ } } +func (s *Info) BuildDistributionWithIDs(ids []seq.ID) { + if !s.InitEmptyDistribution() { + return + } + for _, id := range ids { + s.Distribution.Add(id.MID) + } +} + func (s *Info) InitEmptyDistribution() bool { from := time.UnixMilli(int64(s.From)) creationTime := time.UnixMilli(int64(s.CreationTime)) diff --git a/frac/sealed/sealing/sort_docs.go b/frac/sealed/sealing/sort_docs.go new file mode 100644 index 00000000..0f0f8ac2 --- /dev/null +++ b/frac/sealed/sealing/sort_docs.go @@ -0,0 +1,164 @@ +package sealing + +import ( + "encoding/binary" + "io" + "iter" + "os" + "path/filepath" + "time" + + "github.com/alecthomas/units" + "github.com/ozontech/seq-db/bytespool" + "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/util" + "go.uber.org/zap" +) + +type DocsSource interface { + Docs() iter.Seq2[seq.ID, []byte] + LastError() error +} + +// SortDocs sorts documents and writes them in compressed form to disk. +// Creates a temporary file that is then renamed to the final one. 
+func SortDocs(name string, params frac.SealParams, ds DocsSource) ([]uint64, []seq.DocPos, int, error) { + start := time.Now() + logger.Info("sorting docs...") + + // Create temporary file for sorted documents + sdocsFile, err := os.Create(name + consts.SdocsTmpFileSuffix) + if err != nil { + return nil, nil, 0, err + } + + bw := bytespool.AcquireWriterSize(sdocsFile, int(units.MB)) + defer bytespool.ReleaseWriter(bw) + + // Group documents into blocks + blocks := docBlocks(ds.Docs(), params.DocBlockSize) + + // Write blocks and get new offsets and positions + blocksOffsets, positions, rawSize, onDiskSize, err := writeDocs(blocks, bw, params) + + if err := util.CollapseErrors([]error{ds.LastError(), err}); err != nil { + return nil, nil, 0, err + } + if err := bw.Flush(); err != nil { + return nil, nil, 0, err + } + + // Synchronize and rename file + if err := sdocsFile.Sync(); err != nil { + return nil, nil, 0, err + } + if err := sdocsFile.Close(); err != nil { + return nil, nil, 0, err + } + if err := os.Rename(sdocsFile.Name(), name+consts.SdocsFileSuffix); err != nil { + return nil, nil, 0, err + } + if err := util.SyncPath(filepath.Dir(name)); err != nil { + return nil, nil, 0, err + } + + // Log compression statistics + ratio := float64(rawSize) / float64(onDiskSize) + logger.Info("docs sorting stat", + util.ZapUint64AsSizeStr("raw", uint64(rawSize)), + util.ZapUint64AsSizeStr("compressed", uint64(onDiskSize)), + util.ZapFloat64WithPrec("ratio", ratio, 2), + zap.Int("blocks_count", len(blocksOffsets)), + zap.Int("docs_total", len(positions)), + util.ZapDurationWithPrec("write_duration_ms", time.Since(start), "ms", 0), + ) + + return blocksOffsets, positions, onDiskSize, nil +} + +// writeDocs compresses and writes document blocks, calculating new offsets +// and collecting document positions. 
+func writeDocs( + blocks iter.Seq2[[]byte, []seq.DocPos], + w io.Writer, + params frac.SealParams, +) ([]uint64, []seq.DocPos, int, int, error) { + offset := 0 + buf := make([]byte, 0) + blocksOffsets := make([]uint64, 0) + allPositions := make([]seq.DocPos, 0) + + rawSize := 0 + diskSize := 0 + + // Process each document block + for block, positions := range blocks { + allPositions = append(allPositions, positions...) + blocksOffsets = append(blocksOffsets, uint64(offset)) + + // Compress document block + buf = storage.CompressDocBlock(block, buf[:0], params.DocBlocksZstdLevel) + + rawSize += len(block) + diskSize += len(buf) + + if _, err := w.Write(buf); err != nil { + return nil, nil, 0, 0, err + } + offset += len(buf) + } + + return blocksOffsets, allPositions, rawSize, diskSize, nil +} + +// docBlocks groups documents into fixed-size blocks. +// Returns an iterator for blocks and corresponding document positions. +func docBlocks(docs iter.Seq2[seq.ID, []byte], blockSize int) iter.Seq2[[]byte, []seq.DocPos] { + return func(yield func([]byte, []seq.DocPos) bool) { + const defaultBlockSize = 128 * units.KiB + if blockSize <= 0 { + blockSize = int(defaultBlockSize) + logger.Warn("document block size not specified", zap.Int("default_size", blockSize)) + } + + var ( + prev seq.ID + index uint32 // Current block index + ) + pos := make([]seq.DocPos, 0) + buf := make([]byte, 0, blockSize) + + // Iterate through documents + for id, doc := range docs { + if id == prev { + // Duplicate IDs (for nested indexes) - store document once, + // but create positions for each LID + pos = append(pos, seq.PackDocPos(index, uint64(len(buf)))) + continue + } + prev = id + + // If block is full, yield it + if len(buf) >= blockSize { + if !yield(buf, pos) { + return + } + index++ + buf = buf[:0] + pos = pos[:0] + } + + // Add document position + pos = append(pos, seq.PackDocPos(index, uint64(len(buf)))) + + // Write document size and the document itself + buf = 
binary.LittleEndian.AppendUint32(buf, uint32(len(doc))) + buf = append(buf, doc...) + } + yield(buf, pos) + } +} diff --git a/frac/tests/fraction_test.go b/frac/tests/fraction_test.go index cee4d8c9..8586c4a7 100644 --- a/frac/tests/fraction_test.go +++ b/frac/tests/fraction_test.go @@ -14,7 +14,6 @@ import ( "testing" "time" - "github.com/alecthomas/units" "github.com/johannesboyne/gofakes3" "github.com/johannesboyne/gofakes3/backend/s3mem" "github.com/stretchr/testify/suite" @@ -37,6 +36,7 @@ import ( "github.com/ozontech/seq-db/tokenizer" ) +// TODO сделать разные тесты для сортированных и не сортированных доков type FractionTestSuite struct { suite.Suite tmpDir string @@ -63,7 +63,9 @@ func (s *FractionTestSuite) TearDownSuiteCommon() { } func (s *FractionTestSuite) SetupTestCommon() { - s.config = &frac.Config{} + s.config = &frac.Config{ + SkipSortDocs: true, + } s.tokenizers = map[seq.TokenizerType]tokenizer.Tokenizer{ seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(20, false, true), seq.TokenizerTypeText: tokenizer.NewTextTokenizer(20, false, true, 100), @@ -92,7 +94,7 @@ func (s *FractionTestSuite) SetupTestCommon() { DocsPositionsZstdLevel: 1, TokenTableZstdLevel: 1, DocBlocksZstdLevel: 1, - DocBlockSize: 128 * int(units.KiB), + DocBlockSize: 128, // Using a small block size to test multi-block sorting output. } var err error @@ -1057,28 +1059,28 @@ func (s *FractionTestSuite) TestFractionInfo() { switch s.fraction.(type) { case *active.Active: // it varies depending on params and docs shuffled - s.Require().True(info.DocsOnDisk > uint64(350) && info.DocsOnDisk < uint64(400), + s.Require().True(info.DocsOnDisk > uint64(450) && info.DocsOnDisk < uint64(500), "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) - s.Require().True(info.MetaOnDisk >= uint64(350) && info.MetaOnDisk <= uint64(450), + s.Require().True(info.MetaOnDisk >= uint64(450) && info.MetaOnDisk <= uint64(550), "meta on disk doesn't match. 
actual value: %d", info.MetaOnDisk) s.Require().Equal(uint64(0), info.IndexOnDisk, "index on disk doesn't match") case *active2.Active2: // it varies depending on params and docs shuffled - s.Require().True(info.DocsOnDisk > uint64(350) && info.DocsOnDisk < uint64(400), + s.Require().True(info.DocsOnDisk > uint64(450) && info.DocsOnDisk < uint64(500), "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) - s.Require().True(info.MetaOnDisk >= uint64(350) && info.MetaOnDisk <= uint64(450), + s.Require().True(info.MetaOnDisk >= uint64(450) && info.MetaOnDisk <= uint64(550), "meta on disk doesn't match. actual value: %d", info.MetaOnDisk) s.Require().Equal(uint64(0), info.IndexOnDisk, "index on disk doesn't match") case *sealed.Sealed: // it varies depending on params and docs shuffled and docs sorting - s.Require().True(info.DocsOnDisk > uint64(200) && info.DocsOnDisk < uint64(300), + s.Require().True(info.DocsOnDisk > uint64(460) && info.DocsOnDisk < uint64(540), "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") s.Require().True(info.IndexOnDisk > uint64(1400) && info.IndexOnDisk < uint64(1600), "index on disk doesn't match. actual value: %d", info.MetaOnDisk) case *sealed.Remote: // it varies depending on params and docs shuffled and docs sorting - s.Require().True(info.DocsOnDisk > uint64(200) && info.DocsOnDisk < uint64(300), + s.Require().True(info.DocsOnDisk > uint64(460) && info.DocsOnDisk < uint64(540), "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. 
actual value") s.Require().True(info.IndexOnDisk > uint64(1400) && info.IndexOnDisk < uint64(1500), @@ -1294,7 +1296,7 @@ func (s *FractionTestSuite) AppendBulks(a appender, bulks ...[]string) { var wg sync.WaitGroup for _, bulk := range bulks { - bulkSize := (len(bulk)-1)/2 + 1 + bulkSize := (len(bulk)-1)/3 + 1 for len(bulk) > 0 { l := min(bulkSize, len(bulk)) docs := bulk[:l] @@ -1408,8 +1410,6 @@ func (s *ActiveReplayedFractionTestSuite) SetupSuite() { func (s *ActiveReplayedFractionTestSuite) SetupTest() { s.SetupTestCommon() // Setting this flags allows to keep meta and docs files on disk after Active.Release() is called - s.config.SkipSortDocs = true - s.config.KeepMetaFile = true s.insertDocuments = func(bulks ...[]string) { if s.fraction != nil { @@ -1421,7 +1421,6 @@ func (s *ActiveReplayedFractionTestSuite) SetupTest() { func (s *ActiveReplayedFractionTestSuite) Replay(f *active.Active) frac.Fraction { fracFileName := f.BaseFileName - f.Release() replayedFrac := active.New( fracFileName, s.activeIndexer, @@ -1645,6 +1644,10 @@ func TestActive2FractionTestSuite(t *testing.T) { suite.Run(t, new(Active2FractionTestSuite)) } +func TestSealed2FractionTestSuite(t *testing.T) { + suite.Run(t, new(Sealed2FractionTestSuite)) +} + type Active2FractionTestSuite struct { FractionTestSuite } @@ -1693,3 +1696,66 @@ func (s *Active2FractionTestSuite) TearDownTest() { func (s *Active2FractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } + +type Sealed2FractionTestSuite struct { + Active2FractionTestSuite +} + +func (s *Sealed2FractionTestSuite) SetupSuite() { + s.SetupSuiteCommon() +} + +func (s *Sealed2FractionTestSuite) SetupTest() { + s.SetupTestCommon() + + s.insertDocuments = func(docs ...[]string) { + if s.fraction != nil { + s.Require().Fail("can insert docs only once") + } + s.fraction = s.newSealed2(docs...) 
+ } +} + +func (s *Sealed2FractionTestSuite) TearDownTest() { + if f, ok := s.fraction.(*sealed.Sealed); ok { + f.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Sealed type") + } + s.TearDownTestCommon() +} + +func (s *Sealed2FractionTestSuite) TearDownSuite() { + s.TearDownSuiteCommon() +} + +func (s *Sealed2FractionTestSuite) newSealed2(bulks ...[]string) *sealed.Sealed { + a := s.newActive2(bulks...) + + activeSealingSource, err := active2.NewSealingSource(a, s.sealParams) + s.Require().NoError(err, "Sealing source creation failed") + + preloaded, err := sealing.Seal(activeSealingSource, s.sealParams) + s.Require().NoError(err, "Sealing failed") + + indexCache := &sealed.IndexCache{ + MIDs: cache.NewCache[[]byte](nil, nil), + RIDs: cache.NewCache[[]byte](nil, nil), + Params: cache.NewCache[seqids.BlockParams](nil, nil), + LIDs: cache.NewCache[*lids.Block](nil, nil), + Tokens: cache.NewCache[*token.Block](nil, nil), + TokenTable: cache.NewCache[token.Table](nil, nil), + Registry: cache.NewCache[[]byte](nil, nil), + } + + f := sealed.NewPreloaded( + a.BaseFileName, + preloaded, + storage.NewReadLimiter(1, nil), + indexCache, + cache.NewCache[[]byte](nil, nil), + s.config, + ) + a.Release() + return f +} diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index ad5163f7..92023857 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -42,7 +42,7 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client) (*FracManager, func readLimiter := storage.NewReadLimiter(config.ReaderWorkers, storeBytesRead) idx, stopIdx := active.NewIndexer(config.IndexWorkers, config.IndexWorkers) cache := NewCacheMaintainer(cfg.CacheSize, cfg.SortCacheSize, newDefaultCacheMetrics()) - provider := newFractionProvider(cfg, s3cli, cache, readLimiter, idx) + provider := newFractionProvider(cfg, s3cli, cache, readLimiter, idx, nil) infoCache := NewFracInfoCache(filepath.Join(cfg.DataDir, consts.FracCacheFileSuffix)) // Load 
existing fractions into registry diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index 412a2d4b..d7bdc0c6 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -11,6 +11,7 @@ import ( "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active2" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/storage" @@ -22,25 +23,27 @@ const fileBasePattern = "seq-db-" // fractionProvider is a factory for creating different types of fractions // Contains all necessary dependencies for creating and managing fractions type fractionProvider struct { - s3cli *s3.Client // Client for S3 storage operations - config *Config // Fraction manager configuration - cacheProvider *CacheMaintainer // Cache provider for data access optimization - activeIndexer *active.Indexer // Indexer for active fractions - readLimiter *storage.ReadLimiter // Read rate limiter - ulidEntropy io.Reader // Entropy source for ULID generation + s3cli *s3.Client // Client for S3 storage operations + config *Config // Fraction manager configuration + cacheProvider *CacheMaintainer // Cache provider for data access optimization + activeIndexer *active.Indexer // Indexer for active fractions + activeIndexer2 *active2.Indexer + readLimiter *storage.ReadLimiter // Read rate limiter + ulidEntropy io.Reader // Entropy source for ULID generation } func newFractionProvider( cfg *Config, s3cli *s3.Client, cp *CacheMaintainer, - readLimiter *storage.ReadLimiter, indexer *active.Indexer, + readLimiter *storage.ReadLimiter, indexer *active.Indexer, indexer2 *active2.Indexer, ) *fractionProvider { return &fractionProvider{ - s3cli: s3cli, - config: cfg, - cacheProvider: cp, - activeIndexer: indexer, - readLimiter: readLimiter, - ulidEntropy: ulid.Monotonic(rand.New(rand.NewSource(time.Now().UnixNano())), 0), + s3cli: s3cli, + config: cfg, 
+ cacheProvider: cp, + activeIndexer: indexer, + activeIndexer2: indexer2, + readLimiter: readLimiter, + ulidEntropy: ulid.Monotonic(rand.New(rand.NewSource(time.Now().UnixNano())), 0), } } @@ -55,6 +58,17 @@ func (fp *fractionProvider) NewActive(name string) *active.Active { ) } +func (fp *fractionProvider) NewActive2(name string) *active2.Active2 { + return active2.New( + name, + &fp.config.Fraction, + fp.activeIndexer2, + fp.readLimiter, + fp.cacheProvider.CreateDocBlockCache(), + fp.cacheProvider.CreateSortDocsCache(), + ) +} + func (fp *fractionProvider) NewSealed(name string, cachedInfo *frac.Info) *sealed.Sealed { return sealed.New( name, @@ -105,6 +119,14 @@ func (fp *fractionProvider) CreateActive() *active.Active { return fp.NewActive(baseFilePath) } +// CreateActive creates a new active fraction with auto-generated filename +// Filename pattern: base_pattern + ULID +func (fp *fractionProvider) CreateActive2() *active2.Active2 { + filePath := fileBasePattern + fp.nextFractionID() + baseFilePath := filepath.Join(fp.config.DataDir, filePath) + return fp.NewActive2(baseFilePath) +} + // Seal converts an active fraction to a sealed one // Process includes sorting, indexing, and data optimization for reading func (fp *fractionProvider) Seal(a *active.Active) (*sealed.Sealed, error) { diff --git a/fracmanager/fraction_provider_test.go b/fracmanager/fraction_provider_test.go index 893f7e84..b4317136 100644 --- a/fracmanager/fraction_provider_test.go +++ b/fracmanager/fraction_provider_test.go @@ -14,6 +14,7 @@ import ( "github.com/stretchr/testify/require" "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active2" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" ) @@ -37,8 +38,9 @@ func setupFractionProvider(t testing.TB, cfg *Config) (*fractionProvider, func() rl := storage.NewReadLimiter(1, nil) s3cli, stopS3 := setupS3Client(t) idx, stopIdx := active.NewIndexer(1, 1) + idx2 := active2.NewIndexer(1) cache 
:= NewCacheMaintainer(uint64(units.MB), uint64(units.MB), nil) - provider := newFractionProvider(cfg, s3cli, cache, rl, idx) + provider := newFractionProvider(cfg, s3cli, cache, rl, idx, idx2) return provider, func() { stopIdx() stopS3() @@ -46,7 +48,7 @@ func setupFractionProvider(t testing.TB, cfg *Config) (*fractionProvider, func() } func TestFractionID(t *testing.T) { - fp := newFractionProvider(nil, nil, nil, nil, nil) + fp := newFractionProvider(nil, nil, nil, nil, nil, nil) ulid1 := fp.nextFractionID() ulid2 := fp.nextFractionID() assert.NotEqual(t, ulid1, ulid2, "ULIDs should be different") diff --git a/fracmanager/sealer_test.go b/fracmanager/sealer_test.go index 57db7884..7689b989 100644 --- a/fracmanager/sealer_test.go +++ b/fracmanager/sealer_test.go @@ -17,7 +17,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active2" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" @@ -35,7 +35,7 @@ func TestMain(m *testing.M) { m.Run() } -func fillActiveFraction(active *active.Active) error { +func fillActiveFraction(active *active2.Active2) error { const muliplier = 10 file, err := os.Open(filepath.Join(testscommon.TestDataDir, "k8s.logs")) @@ -102,12 +102,12 @@ func runSealingBench(b *testing.B, cfg *frac.Config) { fp, tearDown := setupFractionProvider(b, &Config{Fraction: *cfg}) defer tearDown() - a := fp.CreateActive() + a := fp.CreateActive2() err := fillActiveFraction(a) assert.NoError(b, err) - seal := func(a *active.Active, params frac.SealParams) (*sealed.PreloadedData, error) { - src, err := active.NewSealingSource(a, params) + seal := func(a *active2.Active2, params frac.SealParams) (*sealed.PreloadedData, error) { + src, err := active2.NewSealingSource(a, params) assert.NoError(b, err) return sealing.Seal(src, params) } From 92e0f4576d4fb51c3ee8f91d23a58a22b40a7261 
Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Fri, 12 Dec 2025 12:21:40 +0300 Subject: [PATCH 06/28] merging optimizations 1 step --- frac/active2/indexer.go | 2 - frac/active2/indexer_test.go | 49 ++++++++++- frac/active2/mem_index.go | 8 +- frac/active2/merge.go | 113 +++++++++++++++---------- frac/active2/merge_iterator.go | 1 + frac/active2/merge_manager.go | 6 ++ frac/active2/merge_stream.go | 149 +++++++++++++++++++++++++++++++++ 7 files changed, 272 insertions(+), 56 deletions(-) create mode 100644 frac/active2/merge_stream.go diff --git a/frac/active2/indexer.go b/frac/active2/indexer.go index 494e30c1..09aee47a 100644 --- a/frac/active2/indexer.go +++ b/frac/active2/indexer.go @@ -59,7 +59,6 @@ func NewMemIndex(block storage.DocBlock) (*memIndex, error) { } // Initialize index idx := &memIndex{ - idToLID: make(map[seq.ID]uint32, len(meta)), docsCount: uint32(len(meta)), blocksOffsets: []uint64{block.GetExt2()}, // Only one block per bulk } @@ -140,7 +139,6 @@ func extractTokens( docMeta := meta[origIdx] ids[lid] = docMeta.ID idx.docsSize += uint64(docMeta.Size) - idx.idToLID[docMeta.ID] = uint32(lid) + 1 // store lid+1 (1-based indexing for internal use) pos[lid] = seq.DocPos(positions[origIdx]) } diff --git a/frac/active2/indexer_test.go b/frac/active2/indexer_test.go index 21aec724..ab981597 100644 --- a/frac/active2/indexer_test.go +++ b/frac/active2/indexer_test.go @@ -60,11 +60,58 @@ func BenchmarkIndexer(b *testing.B) { wg.Done() }) } - // runtime.GC() wg.Wait() } } +func BenchmarkMerge(b *testing.B) { + logger.SetLevel(zapcore.FatalLevel) + idx := NewIndexer(8) + + allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) + readers := splitLogsToBulks(allLogs, 2000) + assert.NoError(b, err) + + processor := getTestProcessor() + + b.StopTimer() + b.ResetTimer() + for i := 0; i < b.N; i++ { + + active := New( + filepath.Join(b.TempDir(), "test"), + &frac.Config{}, + idx, + storage.NewReadLimiter(1, nil), + 
cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + ) + + bulks := make([][]byte, 0, len(readers)) + for _, readNext := range readers { + _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + bulks = append(bulks, storage.CompressDocBlock(meta, nil, 3)) + } + + wg := sync.WaitGroup{} + for _, meta := range bulks { + wg.Add(1) + idx.Index(meta, func(index *memIndex, err error) { + if err != nil { + logger.Fatal("bulk indexing error", zap.Error(err)) + } + active.addIndex(index) + wg.Done() + }) + } + wg.Wait() + + b.StartTimer() + active.indexes.MergeAll() + b.StopTimer() + } +} + func readFileAllAtOnce(filename string) ([][]byte, error) { content, err := os.ReadFile(filename) if err != nil { diff --git a/frac/active2/mem_index.go b/frac/active2/mem_index.go index 3aebdce8..3ec3d3a7 100644 --- a/frac/active2/mem_index.go +++ b/frac/active2/mem_index.go @@ -18,7 +18,6 @@ type memIndex struct { fieldsTokens map[string]tokenRange // tokens locator for each field fields [][]byte // fields ordered ASC blocksOffsets []uint64 // blocks offsets ordered by offset - idToLID map[seq.ID]uint32 positions []seq.DocPos allTID uint32 @@ -52,11 +51,6 @@ func (index *memIndex) IsIntersecting(from, to seq.MID) bool { } func (index *memIndex) GetLIDByID(id seq.ID) (uint32, bool) { - lid, ok := index.idToLID[id] - return lid, ok - - // alternative - // todo check to use 1-based lids i, ok := sort.Find(len(index.ids), func(i int) int { return seq.Compare(index.ids[i], id) }) - return uint32(i), ok + return uint32(i + 1), ok } diff --git a/frac/active2/merge.go b/frac/active2/merge.go index b4aacd38..fd1771cb 100644 --- a/frac/active2/merge.go +++ b/frac/active2/merge.go @@ -2,6 +2,7 @@ package active2 import ( "bytes" + "slices" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/seq" @@ -26,7 +27,6 @@ func mergeIndexes(indexes []*memIndex) *memIndex { dst := &memIndex{ ids: make([]seq.ID, 0, docsCount), positions: make([]seq.DocPos, 0, 
docsCount), - idToLID: make(map[seq.ID]uint32, docsCount), fieldsTokens: make(map[string]tokenRange, fieldsCount), blocksOffsets: make([]uint64, 0, blocksCount), docsSize: docsSize, @@ -52,9 +52,11 @@ func mergeIDs(dst *memIndex, orig []*mergeIterator) []seq.ID { doubles := []seq.ID{} iterators := append([]*mergeIterator{}, orig...) // make copy + selected := make([]int, 0, len(iterators)) + for len(iterators) > 0 { // try select first - selected := []int{0} + selected = append(selected[:0], 0) maxID := iterators[0].CurrentID() for i := 1; i < len(iterators); i++ { @@ -62,13 +64,12 @@ func mergeIDs(dst *memIndex, orig []*mergeIterator) []seq.ID { selected = append(selected, i) } else if seq.Less(maxID, cur) { maxID = cur - selected = []int{i} + selected = append(selected[:0], i) } } lid := uint32(len(dst.ids)) + 1 dst.ids = append(dst.ids, maxID) - dst.idToLID[maxID] = lid dst.positions = append(dst.positions, iterators[selected[0]].CurrentPos()) k := 0 @@ -92,16 +93,29 @@ func mergeTokens(dst *memIndex, orig []*mergeIterator) { // todo allocate for all lids at once to optimize allocations var prevField []byte iterators := append([]*mergeIterator{}, orig...) 
// make copy + + selected := make([]int, 0, len(iterators)) + + s := 0 + for _, it := range iterators { + for _, l := range it.index.tokenLIDs { + s += len(l) + } + } + allTokenLIDs := make([]uint32, 0, s) + + p := &streamsPool[uint32]{} + for len(iterators) > 0 { // try select first - selected := []int{0} + selected = append(selected[:0], 0) minToken := iterators[0].CurrentToken() for i := 1; i < len(iterators); i++ { cur := iterators[i].CurrentToken() if cmp := compareMetaToken(cur, minToken); cmp < 0 { minToken = cur - selected = []int{i} + selected = append(selected[:0], i) } else if cmp == 0 { selected = append(selected, i) } @@ -110,7 +124,7 @@ func mergeTokens(dst *memIndex, orig []*mergeIterator) { k := 0 lids := make([][]uint32, 0, len(selected)) for _, i := range selected { - lids = append(lids, iterators[i-k].CurrentTokenLIDs()) + lids = append(lids, iterators[i-k].CurrentTokenLIDs()) // todo переиспольовать CurrentTokenLIDs / lids if !iterators[i-k].ShiftToken() { iterators = removeItem(iterators, i-k) k++ @@ -128,8 +142,16 @@ func mergeTokens(dst *memIndex, orig []*mergeIterator) { } dst.tokens = append(dst.tokens, minToken.Value) - dst.tokenLIDs = append(dst.tokenLIDs, mergeLIDs(lids)) + + start := len(allTokenLIDs) + if string(minToken.Key) == "_all_" { + allTokenLIDs = fillAllLIDs(allTokenLIDs, len(dst.ids)) + } else { + allTokenLIDs = mergeLIDs(lids, allTokenLIDs, p) + } + dst.tokenLIDs = append(dst.tokenLIDs, allTokenLIDs[start:]) } + if tr, ok := dst.fieldsTokens[string(prevField)]; ok { tr.count = uint32(len(dst.tokens)) - tr.start dst.fieldsTokens[string(prevField)] = tr @@ -144,7 +166,7 @@ func mergeBlocksOffsets(dst *memIndex, src []*mergeIterator) { } for _, p := range it.index.positions { oldIdx, docOffset := p.Unpack() - it.AddPos(seq.PackDocPos(oldIdx+offset, docOffset)) + it.AddPos(seq.PackDocPos(oldIdx+offset, docOffset)) // todo - много аллокаций space } offset += uint32(len(it.index.blocksOffsets)) } @@ -158,43 +180,6 @@ func 
compareMetaToken(mt1, mt2 tokenizer.MetaToken) int { return res } -func mergeLIDs(lids [][]uint32) []uint32 { - size := 0 - for i := range lids { - size += len(lids[i]) - } - res := make([]uint32, 0, size) - - for len(lids) > 0 { - // try select first - selected := []int{0} - minLID := lids[0][0] - - for i := 1; i < len(lids); i++ { - cur := lids[i][0] - if cur == minLID { // can be doubles - selected = append(selected, i) - } else if cur < minLID { - selected = []int{i} - minLID = cur - } - } - - res = append(res, minLID) - - k := 0 - for _, i := range selected { - lids[i-k] = lids[i-k][1:] - if len(lids[i-k]) == 0 { - lids = removeItem(lids, i-k) - k++ - } - } - } - - return res -} - func removeItem[V any](items []V, i int) []V { k := 0 for j, v := range items { @@ -207,3 +192,39 @@ func removeItem[V any](items []V, i int) []V { items = items[:k] return items } + +//////////////////////// + +func fillAllLIDs(buf []uint32, cnt int) []uint32 { + cnt++ + for lid := 1; lid < cnt; lid++ { + buf = append(buf, uint32(lid)) + } + return buf +} + +func mergeLIDs(lids [][]uint32, buf []uint32, p *streamsPool[uint32]) []uint32 { + return mergeLIDsSort(lids, buf, p) + return mergeLIDsTree(lids, buf, p) +} + +func mergeLIDsSort(lids [][]uint32, buf []uint32, p *streamsPool[uint32]) []uint32 { + start := len(buf) + for _, l := range lids { + buf = append(buf, l...) 
// Ordered is the set of integer types the stream mergers operate on.
type Ordered interface {
	~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64
}

// IOrderedStream yields values in ascending order; Next reports false once
// the stream is exhausted.
type IOrderedStream[T Ordered] interface {
	Next() (T, bool)
}

// streamsPool arena-allocates the nodes of the merge tree built by
// MergeSortNSlices so that repeated merges avoid per-node allocations;
// Reset recycles all nodes at once. The zero value is ready to use.
//
// NOTE(review): GetXxx return pointers into the pool's slices; growing a
// slice relocates earlier elements, so previously returned pointers keep
// referencing the old backing array. Functionally harmless (stale copies
// are never read again), but it defeats reuse — consider pre-sizing. TODO confirm.
type streamsPool[T Ordered] struct {
	ts  []TwoStreams[T]
	oss []OneSliceStream[T]
	tss []TwoSliceStream[T]
}

// GetTwoStreams allocates a TwoStreams node merging two sub-streams and
// primes it with their head elements.
func (s *streamsPool[T]) GetTwoStreams(src1, src2 IOrderedStream[T]) *TwoStreams[T] {
	s.ts = append(s.ts, TwoStreams[T]{src1: src1, src2: src2})
	r := &s.ts[len(s.ts)-1]
	r.Init()
	return r
}

// GetTwoSliceStream allocates a leaf node merging two sorted slices.
func (s *streamsPool[T]) GetTwoSliceStream(src1, src2 []T) *TwoSliceStream[T] {
	s.tss = append(s.tss, TwoSliceStream[T]{src1: src1, src2: src2})
	return &s.tss[len(s.tss)-1]
}

// GetOneSliceStream allocates a leaf node streaming a single sorted slice.
func (s *streamsPool[T]) GetOneSliceStream(src []T) *OneSliceStream[T] {
	s.oss = append(s.oss, OneSliceStream[T]{src: src})
	return &s.oss[len(s.oss)-1]
}

// Reset recycles every node handed out since the previous Reset, keeping
// the backing arrays for reuse.
func (s *streamsPool[T]) Reset() {
	s.ts = s.ts[:0]
	s.oss = s.oss[:0]
	s.tss = s.tss[:0]
}

// MergeSortNSlices builds a balanced binary merge tree over the given sorted
// slices and returns a stream producing their union in ascending order
// (duplicates are preserved). An empty src yields an empty stream — without
// that guard the recursion below would never terminate.
func MergeSortNSlices[T Ordered](src [][]T, p *streamsPool[T]) IOrderedStream[T] {
	switch len(src) {
	case 0:
		return p.GetOneSliceStream(nil)
	case 1:
		return p.GetOneSliceStream(src[0])
	case 2:
		return p.GetTwoSliceStream(src[0], src[1])
	}
	h := len(src) / 2
	return p.GetTwoStreams(MergeSortNSlices(src[:h], p), MergeSortNSlices(src[h:], p))
}

// TwoStreams merges two ordered sub-streams; on ties the element from src2
// is emitted first (src1 wins only on a strict v1 < v2).
type TwoStreams[T Ordered] struct {
	src1, src2 IOrderedStream[T]
	v1, v2     T
	has1, has2 bool
}

// Init primes the merge with the head element of each source stream.
func (s *TwoStreams[T]) Init() {
	s.v1, s.has1 = s.src1.Next()
	s.v2, s.has2 = s.src2.Next()
}

// Next returns the smallest pending element and refills from the source it
// came from.
func (s *TwoStreams[T]) Next() (v T, has bool) {
	switch {
	case s.has1 && s.has2:
		if s.v1 < s.v2 {
			v = s.v1
			s.v1, s.has1 = s.src1.Next()
		} else {
			v = s.v2
			s.v2, s.has2 = s.src2.Next()
		}
		return v, true
	case s.has1:
		v = s.v1
		s.v1, s.has1 = s.src1.Next()
		return v, true
	case s.has2:
		v = s.v2
		s.v2, s.has2 = s.src2.Next()
		return v, true
	}
	return v, false
}

// OneSliceStream streams a single sorted slice.
// NOTE(review): constrained by cmp.Ordered while the sibling types use the
// package-local Ordered; harmless (Ordered's members all satisfy
// cmp.Ordered) but worth unifying.
type OneSliceStream[T cmp.Ordered] struct {
	p   int
	src []T
}

// Next returns the next element of the slice, or false when exhausted.
func (s *OneSliceStream[T]) Next() (v T, has bool) {
	if s.p >= len(s.src) {
		return v, false
	}
	v = s.src[s.p]
	s.p++
	return v, true
}

// TwoSliceStream merges two sorted slices; on ties src2 is emitted first.
type TwoSliceStream[T Ordered] struct {
	p1, p2     int
	src1, src2 []T
}

// Next returns the smallest pending element across the two slices, or
// false when both are exhausted.
func (s *TwoSliceStream[T]) Next() (v T, has bool) {
	has1 := s.p1 < len(s.src1)
	has2 := s.p2 < len(s.src2)
	switch {
	case has1 && has2:
		v1, v2 := s.src1[s.p1], s.src2[s.p2]
		if v1 < v2 {
			s.p1++
			return v1, true
		}
		s.p2++
		return v2, true
	case has1:
		v = s.src1[s.p1]
		s.p1++
		return v, true
	case has2:
		v = s.src2[s.p2]
		s.p2++
		return v, true
	}
	return v, false
}
// IOrderedIterator yields elements in the order defined by the comparator
// used to build it; Next reports false once the iterator is exhausted.
type IOrderedIterator[T any] interface {
	Next() (T, bool)
}

// emptyIterator is an always-exhausted iterator, returned for empty inputs.
type emptyIterator[T any] struct{}

func (emptyIterator[T]) Next() (v T, has bool) { return }

// MergeKSortIterators combines k already-ordered iterators into a single
// ordered iterator by building a balanced binary merge tree, so each element
// costs O(log k) comparisons. cmp must return a negative, zero, or positive
// value like cmp.Compare; duplicates are preserved. An empty src yields an
// empty iterator — without that guard the recursion below never terminates
// (the "no indexes at all" case noted as a TODO in MergeAll).
func MergeKSortIterators[T any](src []IOrderedIterator[T], cmp func(T, T) int) IOrderedIterator[T] {
	switch len(src) {
	case 0:
		return emptyIterator[T]{}
	case 1:
		return src[0]
	}
	h := len(src) / 2
	left := MergeKSortIterators(src[:h], cmp)
	right := MergeKSortIterators(src[h:], cmp)
	return NewMergeIterator(left, right, cmp)
}

// MergeIterator merges two ordered sources. On ties (cmp == 0) the element
// from src2 is emitted first; src1 wins only on a strictly negative cmp.
type MergeIterator[T any] struct {
	v1, v2     T
	has1, has2 bool
	src1, src2 IOrderedIterator[T]
	cmp        func(T, T) int
}

// NewMergeIterator returns a MergeIterator over src1 and src2, primed with
// the head element of each source.
func NewMergeIterator[T any](src1, src2 IOrderedIterator[T], cmp func(T, T) int) *MergeIterator[T] {
	r := MergeIterator[T]{src1: src1, src2: src2, cmp: cmp}
	r.Init()
	return &r
}

// Init primes the iterator with the head element of each source.
func (s *MergeIterator[T]) Init() {
	s.v1, s.has1 = s.src1.Next()
	s.v2, s.has2 = s.src2.Next()
}

// Next returns the smallest pending element (per cmp) and refills from the
// source it came from; false once both sources are exhausted.
func (s *MergeIterator[T]) Next() (v T, has bool) {
	switch {
	case s.has1 && s.has2:
		if s.cmp(s.v1, s.v2) < 0 {
			v = s.v1
			s.v1, s.has1 = s.src1.Next()
		} else {
			v = s.v2
			s.v2, s.has2 = s.src2.Next()
		}
		return v, true
	case s.has1:
		v = s.v1
		s.v1, s.has1 = s.src1.Next()
		return v, true
	case s.has2:
		v = s.v2
		s.v2, s.has2 = s.src2.Next()
		return v, true
	}
	return v, false
}
+ if s.cmp(s.v1, s.v2) < 0 { + v = s.v1 + s.v1, s.has1 = s.src1.Next() + } else { + v = s.v2 + s.v2, s.has2 = s.src2.Next() + } + return v, true + } + if s.has1 { + v = s.v1 + s.v1, s.has1 = s.src1.Next() + return v, true + } + if s.has2 { + v = s.v2 + s.v2, s.has2 = s.src2.Next() + return v, true + } + return v, false +} diff --git a/frac/active2/merge.go b/frac/active2/merge.go index fd1771cb..9496f209 100644 --- a/frac/active2/merge.go +++ b/frac/active2/merge.go @@ -4,10 +4,9 @@ import ( "bytes" "slices" - "github.com/ozontech/seq-db/logger" + "github.com/RoaringBitmap/roaring/v2" "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/tokenizer" - "go.uber.org/zap" + "github.com/ozontech/seq-db/util" ) func mergeIndexes(indexes []*memIndex) *memIndex { @@ -15,13 +14,11 @@ func mergeIndexes(indexes []*memIndex) *memIndex { blocksCount := 0 fieldsCount := 0 docsSize := uint64(0) - iterators := make([]*mergeIterator, 0, len(indexes)) for _, index := range indexes { docsSize += index.docsSize docsCount += len(index.ids) fieldsCount += len(index.fields) blocksCount += len(index.blocksOffsets) - iterators = append(iterators, newIndexIterator(index)) } dst := &memIndex{ @@ -33,198 +30,312 @@ func mergeIndexes(indexes []*memIndex) *memIndex { docsCount: uint32(docsCount), } - mergeBlocksOffsets(dst, iterators) + newPositions := mergeBlocksOffsets(dst, indexes) - doubles := mergeIDs(dst, iterators) - mergeTokens(dst, iterators) + reLIDs := mergeIDs(dst, indexes, newPositions) + mergeTokens(dst, indexes, reLIDs) dst.allTID = dst.fieldsTokens[seq.TokenAll].start - if len(doubles) > 0 { - dst.docsCount = uint32(len(doubles)) - logger.Warn("there are duplicate IDs when compaction", zap.Int("doubles", len(doubles))) - } + // todo + // if len(doubles) > 0 { + // dst.docsCount = uint32(len(doubles)) + // logger.Warn("there are duplicate IDs when compaction", zap.Int("doubles", len(doubles))) + // } return dst } -func mergeIDs(dst *memIndex, orig []*mergeIterator) 
[]seq.ID { - doubles := []seq.ID{} - iterators := append([]*mergeIterator{}, orig...) // make copy +type IDIteratorItem struct { + i int + id seq.ID +} - selected := make([]int, 0, len(iterators)) +type IDIterator struct { + i int + offset int + idx *memIndex +} - for len(iterators) > 0 { - // try select first - selected = append(selected[:0], 0) - maxID := iterators[0].CurrentID() +func (i *IDIterator) Next() (v IDIteratorItem, has bool) { + if i.offset < len(i.idx.ids) { + v.i = i.i + v.id = i.idx.ids[i.offset] + has = true + i.offset++ + } + return v, has +} - for i := 1; i < len(iterators); i++ { - if cur := iterators[i].CurrentID(); cur == maxID { - selected = append(selected, i) - } else if seq.Less(maxID, cur) { - maxID = cur - selected = append(selected[:0], i) - } - } +func mergeIDs(dst *memIndex, indexes []*memIndex, newPositions [][]seq.DocPos) [][]uint32 { + // todo doubles := []seq.ID{} - lid := uint32(len(dst.ids)) + 1 - dst.ids = append(dst.ids, maxID) - dst.positions = append(dst.positions, iterators[selected[0]].CurrentPos()) + newLIDs := make([][]uint32, len(indexes)) + iters := make([]IOrderedIterator[IDIteratorItem], len(indexes)) + for i, idx := range indexes { + iters[i] = &IDIterator{idx: idx, i: i} + newLIDs[i] = make([]uint32, 0, len(idx.ids)) + } - k := 0 - for _, i := range selected { - iterators[i-k].AddNewLID(lid) - if !iterators[i-k].ShiftID() { - iterators = removeItem(iterators, i-k) - k++ - } - } + orderedIDs := MergeKSortIterators(iters, func(a, b IDIteratorItem) int { return seq.Compare(b.id, a.id) }) - if len(selected) > 1 { - doubles = append(doubles, maxID) - } + cur, has := orderedIDs.Next() + + for has { + dst.ids = append(dst.ids, cur.id) + dst.positions = append(dst.positions, newPositions[cur.i][len(newLIDs[cur.i])]) + lid := uint32(len(dst.ids)) + newLIDs[cur.i] = append(newLIDs[cur.i], lid) + cur, has = orderedIDs.Next() } - return doubles + return newLIDs } -func mergeTokens(dst *memIndex, orig []*mergeIterator) { - // 
todo copy tokens to compact mem usage - // todo allocate for all lids at once to optimize allocations - var prevField []byte - iterators := append([]*mergeIterator{}, orig...) // make copy +type TokenIteratorPayload struct { + idx *memIndex + newLIDs []uint32 +} - selected := make([]int, 0, len(iterators)) +type TokenIteratorItem struct { + tid uint32 + fid uint32 + payload *TokenIteratorPayload +} + +func (i *TokenIteratorItem) Field() []byte { + return i.payload.idx.fields[i.fid] +} + +func (i *TokenIteratorItem) Token() []byte { + return i.payload.idx.tokens[i.tid] +} - s := 0 - for _, it := range iterators { - for _, l := range it.index.tokenLIDs { - s += len(l) +func (i *TokenIteratorItem) LIDs() []uint32 { + return i.payload.idx.tokenLIDs[i.tid] +} + +func (i *TokenIteratorItem) NewLIDs() []uint32 { + return i.payload.newLIDs +} + +type TokenIterator struct { + tid uint32 + fid uint32 + fieldLastTID uint32 + payload TokenIteratorPayload +} + +func NewTokenIterator(idx *memIndex, newLIDs []uint32) *TokenIterator { + return &TokenIterator{ + fieldLastTID: idx.fieldsTokens[string(idx.fields[0])].count - 1, + payload: TokenIteratorPayload{ + idx: idx, + newLIDs: newLIDs, + }, + } +} + +func (it *TokenIterator) Next() (v TokenIteratorItem, has bool) { + if int(it.tid) < len(it.payload.idx.tokens) { + v.tid = uint32(it.tid) + v.fid = uint32(it.fid) + v.payload = &it.payload + has = true + it.tid++ + + if it.tid > it.fieldLastTID { + it.fid++ + if int(it.fid) < len(it.payload.idx.fields) { + it.fieldLastTID += it.payload.idx.fieldsTokens[string(it.payload.idx.fields[it.fid])].count + } } } - allTokenLIDs := make([]uint32, 0, s) + return v, has +} - p := &streamsPool[uint32]{} +func mergeTokens(dst *memIndex, indexes []*memIndex, reLIDs [][]uint32) { + allCount := 0 + totalTokens := 0 + totalLIDsSize := 0 + TokensIterators := make([]IOrderedIterator[TokenIteratorItem], len(indexes)) + for i, index := range indexes { + allCount += len(index.ids) + TokensIterators[i] = 
NewTokenIterator(index, reLIDs[i]) + totalTokens += len(index.tokens) + for _, lids := range index.tokenLIDs { + totalLIDsSize += len(lids) + } + } - for len(iterators) > 0 { - // try select first - selected = append(selected[:0], 0) - minToken := iterators[0].CurrentToken() + cmpToken := func(a, b TokenIteratorItem) int { + r := bytes.Compare(a.Field(), b.Field()) + if r == 0 { + return bytes.Compare(a.Token(), b.Token()) + } + return r + } - for i := 1; i < len(iterators); i++ { - cur := iterators[i].CurrentToken() - if cmp := compareMetaToken(cur, minToken); cmp < 0 { - minToken = cur - selected = append(selected[:0], i) - } else if cmp == 0 { - selected = append(selected, i) + orderedTokens := MergeKSortIterators(TokensIterators, cmpToken) + + uniqTokensSize := 0 + uniqTokensCount := 0 + + uniqFieldsSize := 0 + uniqFieldsCount := 0 + + prv := TokenIteratorItem{} + var prevField []byte + cur, has := orderedTokens.Next() + items := make([]TokenIteratorItem, 0, totalTokens) + borders := make([]uint8, 0, totalTokens) + for has { + var border uint8 + + if prv.payload == nil || cmpToken(prv, cur) != 0 { + uniqTokensCount++ + uniqTokensSize += len(cur.Token()) + border++ + + if !bytes.Equal(prevField, cur.Field()) { + uniqFieldsCount++ + uniqFieldsSize += len(cur.Field()) + border++ + prevField = cur.Field() } } - k := 0 - lids := make([][]uint32, 0, len(selected)) - for _, i := range selected { - lids = append(lids, iterators[i-k].CurrentTokenLIDs()) // todo переиспольовать CurrentTokenLIDs / lids - if !iterators[i-k].ShiftToken() { - iterators = removeItem(iterators, i-k) - k++ + prv = cur + items = append(items, cur) + borders = append(borders, border) + cur, has = orderedTokens.Next() + } + + dst.fields = make([][]byte, 0, uniqFieldsCount) + dst.tokens = make([][]byte, 0, uniqTokensCount) + dst.tokenLIDs = make([][]uint32, 0, uniqTokensCount) + + allTokens := make([]byte, 0, uniqTokensSize) + allFields := make([]byte, 0, uniqFieldsSize) + tokenRanges := 
make([]tokenRange, 0, uniqFieldsCount) + + var all bool + lidsSorter := NewLIDsSorter(totalLIDsSize, allCount) + + for i, item := range items { + if borders[i] > 0 { + + if i > 0 { + dst.tokenLIDs = append(dst.tokenLIDs, lidsSorter.Get()) } - } - if !bytes.Equal(prevField, minToken.Key) { // new field - if tr, ok := dst.fieldsTokens[string(prevField)]; ok { - tr.count = uint32(len(dst.tokens)) - tr.start - dst.fieldsTokens[string(prevField)] = tr + if borders[i] > 1 { + + tid := uint32(len(dst.tokens)) + + if i > 0 { + fieldStr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) + tr := dst.fieldsTokens[fieldStr] + tr.count = tid - tr.start + dst.fieldsTokens[fieldStr] = tr + } + + start := len(allFields) + allFields = append(allFields, item.Field()...) + field := allFields[start:] + dst.fields = append(dst.fields, field) + + tokenRanges = append(tokenRanges, tokenRange{start: tid}) + fieldStr := util.ByteToStringUnsafe(field) + tr := tokenRanges[len(tokenRanges)-1] + dst.fieldsTokens[fieldStr] = tr + + all = fieldStr == "_all_" } - dst.fields = append(dst.fields, minToken.Key) - dst.fieldsTokens[string(minToken.Key)] = tokenRange{start: uint32(len(dst.tokens))} - prevField = minToken.Key - } - dst.tokens = append(dst.tokens, minToken.Value) + start := len(allTokens) + allTokens = append(allTokens, item.Token()...) 
+ dst.tokens = append(dst.tokens, allTokens[start:]) + } - start := len(allTokenLIDs) - if string(minToken.Key) == "_all_" { - allTokenLIDs = fillAllLIDs(allTokenLIDs, len(dst.ids)) + if all { + for range item.LIDs() { + lidsSorter.Add(0) + } } else { - allTokenLIDs = mergeLIDs(lids, allTokenLIDs, p) + newLIDs := item.NewLIDs() + for _, oldLID := range item.LIDs() { + newLID := newLIDs[oldLID-1] + lidsSorter.Add(newLID) + } } - dst.tokenLIDs = append(dst.tokenLIDs, allTokenLIDs[start:]) } - if tr, ok := dst.fieldsTokens[string(prevField)]; ok { - tr.count = uint32(len(dst.tokens)) - tr.start - dst.fieldsTokens[string(prevField)] = tr - } + dst.tokenLIDs = append(dst.tokenLIDs, lidsSorter.Get()) + + tid := uint32(len(dst.tokens)) - 1 + fstr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) + tr := dst.fieldsTokens[fstr] + tr.count = tid - tr.start + 1 + dst.fieldsTokens[fstr] = tr + } -func mergeBlocksOffsets(dst *memIndex, src []*mergeIterator) { - var offset uint32 - for _, it := range src { - for _, offset := range it.index.blocksOffsets { - dst.blocksOffsets = append(dst.blocksOffsets, offset) - } - for _, p := range it.index.positions { - oldIdx, docOffset := p.Unpack() - it.AddPos(seq.PackDocPos(oldIdx+offset, docOffset)) // todo - много аллокаций space - } - offset += uint32(len(it.index.blocksOffsets)) - } +type LIDsSorter struct { + all []uint32 + lids []uint32 + offset int + bitmap *roaring.Bitmap } -func compareMetaToken(mt1, mt2 tokenizer.MetaToken) int { - res := bytes.Compare(mt1.Key, mt2.Key) - if res == 0 { - return bytes.Compare(mt1.Value, mt2.Value) +func NewLIDsSorter(size, allCount int) *LIDsSorter { + ls := &LIDsSorter{ + lids: make([]uint32, 0, size), + all: make([]uint32, allCount), + bitmap: roaring.New(), + } + for i := range allCount { + ls.all[i] = uint32(i) + 1 } - return res + return ls } -func removeItem[V any](items []V, i int) []V { - k := 0 - for j, v := range items { - if i == j { - continue - } - items[k] = v - k++ - } - 
items = items[:k] - return items +func (s *LIDsSorter) Add(lid uint32) { + s.lids = append(s.lids, lid) } -//////////////////////// +func (s *LIDsSorter) Get() (dst []uint32) { + dst = s.lids[s.offset:] -func fillAllLIDs(buf []uint32, cnt int) []uint32 { - cnt++ - for lid := 1; lid < cnt; lid++ { - buf = append(buf, uint32(lid)) + if len(dst) == len(s.all) { + dst = s.all + s.lids = s.lids[:s.offset] + return dst } - return buf -} -func mergeLIDs(lids [][]uint32, buf []uint32, p *streamsPool[uint32]) []uint32 { - return mergeLIDsSort(lids, buf, p) - return mergeLIDsTree(lids, buf, p) -} - -func mergeLIDsSort(lids [][]uint32, buf []uint32, p *streamsPool[uint32]) []uint32 { - start := len(buf) - for _, l := range lids { - buf = append(buf, l...) + if len(dst) > 64_000 { + s.bitmap.AddMany(dst) + s.bitmap.ToExistingArray(&dst) + s.bitmap.Clear() + s.offset = len(s.lids) + return dst } - slices.Sort(buf[start:]) - return buf -} -func mergeLIDsTree(lids [][]uint32, buf []uint32, p *streamsPool[uint32]) []uint32 { - orderedLIDs := MergeSortNSlices(lids, p) - defer p.Reset() + slices.Sort(dst) + s.offset = len(s.lids) + return dst +} - lid, has := orderedLIDs.Next() - for has { - buf = append(buf, lid) - lid, has = orderedLIDs.Next() +func mergeBlocksOffsets(dst *memIndex, indexes []*memIndex) [][]seq.DocPos { + var offset uint32 + positions := make([][]seq.DocPos, len(indexes)) + for i, index := range indexes { + dst.blocksOffsets = append(dst.blocksOffsets, index.blocksOffsets...) 
+ positions[i] = make([]seq.DocPos, 0, len(index.positions)) + for _, p := range index.positions { + oldIdx, docOffset := p.Unpack() + positions[i] = append(positions[i], seq.PackDocPos(oldIdx+offset, docOffset)) + } + offset += uint32(len(index.blocksOffsets)) } - return buf + return positions } diff --git a/frac/active2/merge2.go b/frac/active2/merge2.go deleted file mode 100644 index 5f447fee..00000000 --- a/frac/active2/merge2.go +++ /dev/null @@ -1,211 +0,0 @@ -package active2 - -/* -import ( - "bytes" - - "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/tokenizer" - "go.uber.org/zap" -) - -func mergeIndexes2(src1, src2 *memIndex) *memIndex { - docsCount := src1.docsCount + src2.docsCount - blocksCount := len(src1.blocksOffsets) + len(src2.blocksOffsets) - docsSize := src1.docsSize + src2.docsSize - fieldsCount := len(src1.fields) + len(src2.fields) - - dst := &memIndex{ - ids: make([]seq.ID, 0, docsCount), - positions: make(map[seq.ID]seq.DocPos, docsCount), - fieldsTokens: make(map[string]tokensRange, fieldsCount), - blocksOffsets: make([]uint64, 0, blocksCount), - docsSize: docsSize, - } - - lids1, lids2, doubles := mergeIDs1(src1, src2, dst) - - mergeTokens(dst, iterators) - mergePositions(dst, iterators) - - dst.docsCount = uint32(len(dst.ids)) - dst.allTID = dst.fieldsTokens[seq.TokenAll].start - - if len(doubles) > 0 { - logger.Warn("there are duplicate IDs when compaction", zap.Int("doubles", len(doubles))) - } - - return dst -} - -func mergeIDs1(src1, src2, dst *memIndex) ([]uint32, []uint32, []seq.ID) { - doubles := []seq.ID{} - - newLIDs1 := []uint32{} - newLIDs2 := []uint32{} - - var i1, i2 int - for i1 < len(src1.ids) && i2 < len(src2.ids) { - lid := uint32(len(dst.ids)) - id1 := src1.ids[i1] - id2 := src2.ids[i2] - - if seq.Less(id2, id1) { - dst.ids = append(dst.ids, id1) - newLIDs1 = append(newLIDs1, lid) - i1++ - } else if seq.Less(id1, id2) { - dst.ids = append(dst.ids, id2) - newLIDs2 = 
append(newLIDs2, lid) - i2++ - } else { // id2 == id1 - dst.ids = append(dst.ids, id1) - doubles = append(doubles, id1) - newLIDs1 = append(newLIDs1, lid) - newLIDs2 = append(newLIDs2, lid) - i1++ - i2++ - } - } - - for ; i1 < len(src1.ids); i1++ { - newLIDs1 = append(newLIDs1, uint32(len(dst.ids))) - dst.ids = append(dst.ids, src1.ids[i1]) - } - for ; i2 < len(src1.ids); i2++ { - newLIDs2 = append(newLIDs2, uint32(len(dst.ids))) - dst.ids = append(dst.ids, src2.ids[i2]) - } - - return newLIDs1, newLIDs2, doubles -} - -func mergeTokens(dst *memIndex, orig []mergeIterator) { - // todo copy tokens to compact mem usage - // todo allocate for all lids at once to optimize allocations - var prevField []byte - iterators := append([]mergeIterator{}, orig...) // make copy - for len(iterators) > 0 { - // try select first - selected := []int{0} - minToken := iterators[0].CurrentToken() - - for i := 1; i < len(iterators); i++ { - cur := iterators[i].CurrentToken() - if cmp := compareMetaToken(cur, minToken); cmp < 0 { - minToken = cur - selected = []int{i} - } else if cmp == 0 { - selected = append(selected, i) - } - } - - lids := make([][]uint32, 0, len(selected)) - for _, i := range selected { - lids = append(lids, iterators[i].CurrentTokenLIDs()) - if !iterators[i].ShiftToken() { - removeItem(iterators, i) - } - } - - if !bytes.Equal(prevField, minToken.Key) { // new field - if tr, ok := dst.fieldsTokens[string(prevField)]; ok { - tr.count = uint32(len(dst.tokens)) - tr.start - dst.fieldsTokens[string(prevField)] = tr - } - dst.fields = append(dst.fields, minToken.Key) - dst.fieldsTokens[string(minToken.Key)] = tokensRange{start: uint32(len(dst.tokens))} - prevField = minToken.Key - } - - dst.tokens = append(dst.tokens, minToken.Value) - dst.tokenLIDs = append(dst.tokenLIDs, mergeLIDs(lids)) - } - if tr, ok := dst.fieldsTokens[string(prevField)]; ok { - tr.count = uint32(len(dst.tokens)) - tr.start - dst.fieldsTokens[string(prevField)] = tr - } -} - -func mergePositions(dst 
*memIndex, orig []mergeIterator) { - iterators := append([]mergeIterator{}, orig...) // make copy - for len(iterators) > 0 { - // try select first - selected := []int{0} - minOffset := iterators[0].CurrentBlocksOffset() - - for i := 1; i < len(iterators); i++ { - if cur := iterators[i].CurrentBlocksOffset(); cur == minOffset { - selected = append(selected, i) - } else if cur < minOffset { - minOffset = cur - selected = []int{i} - } - } - - newBlockIndex := len(dst.blocksOffsets) - dst.blocksOffsets = append(dst.blocksOffsets, minOffset) - - for _, i := range selected { - iterators[i].AddNewBlockIndex(newBlockIndex) - if !iterators[i].ShiftBlocksOffset() { - removeItem(iterators, i) - } - } - } - - for _, iterator := range orig { - iterator.RepackDocPositions(dst.positions) - } -} - -func compareMetaToken(mt1, mt2 tokenizer.MetaToken) int { - res := bytes.Compare(mt1.Key, mt2.Key) - if res == 0 { - return bytes.Compare(mt1.Value, mt2.Value) - } - return res -} - -func mergeLIDs(lids [][]uint32) []uint32 { - size := 0 - for i := range lids { - size += len(lids[i]) - } - res := make([]uint32, 0, size) - - for len(lids) > 0 { - // try select first - selected := []int{0} - minLID := lids[0][0] - - for i := 1; i < len(lids); i++ { - cur := lids[i][0] - if cur == minLID { // can be doubles - selected = append(selected, i) - } else if cur < minLID { - selected = []int{i} - minLID = cur - } - } - - res = append(res, minLID) - - for _, i := range selected { - if lids[i] = lids[i][1:]; len(lids[i]) == 0 { - removeItem(lids, i) - } - } - } - - return res -} - -func removeItem[V any](items []V, i int) []V { - last := len(items) - 1 - items[i] = items[last] - items = items[:last] - return items -} -*/ diff --git a/frac/active2/merge_iterator.go b/frac/active2/merge_iterator.go deleted file mode 100644 index b9d745d4..00000000 --- a/frac/active2/merge_iterator.go +++ /dev/null @@ -1,105 +0,0 @@ -package active2 - -import ( - "github.com/ozontech/seq-db/seq" - 
"github.com/ozontech/seq-db/tokenizer" -) - -// For compaction -type mergeIterator struct { - index *memIndex - posIDs int - posField int - posToken int - posBlocks int - lastFieldToken int - newLIDs []uint32 - newPositions []seq.DocPos - newBlocks []int -} - -func newIndexIterator(index *memIndex) *mergeIterator { - x := &mergeIterator{ - index: index, - newLIDs: make([]uint32, 0, len(index.ids)), - newBlocks: make([]int, 0, len(index.blocksOffsets)), - newPositions: make([]seq.DocPos, 0, len(index.positions)), - lastFieldToken: int(index.fieldsTokens[string(index.fields[0])].count) - 1, - } - - field := x.index.fields[x.posField] - r := x.index.fieldsTokens[string(field)] - x.lastFieldToken += int(r.count) - 1 - - return x -} - -func (iq *mergeIterator) ShiftID() bool { - iq.posIDs++ - if iq.posIDs == len(iq.index.ids) { - return false - } - return true -} - -func (iq *mergeIterator) CurrentID() seq.ID { - return iq.index.ids[iq.posIDs] -} - -func (iq *mergeIterator) CurrentPos() seq.DocPos { - return iq.newPositions[iq.posIDs] -} - -func (iq *mergeIterator) ShiftToken() bool { - iq.posToken++ - if iq.posToken == len(iq.index.tokens) { - return false - } - if iq.posToken > iq.lastFieldToken { // need shift field - iq.posField++ - field := iq.index.fields[iq.posField] - r := iq.index.fieldsTokens[string(field)] - iq.lastFieldToken += int(r.count) - } - return true -} - -func (iq *mergeIterator) CurrentToken() tokenizer.MetaToken { - return tokenizer.MetaToken{ - Key: iq.index.fields[iq.posField], - Value: iq.index.tokens[iq.posToken], - } -} - -func (iq *mergeIterator) CurrentTokenLIDs() []uint32 { - src := iq.index.tokenLIDs[iq.posToken] - dst := make([]uint32, 0, len(src)) - for _, oldLid := range src { - dst = append(dst, iq.newLIDs[oldLid-1]) - } - return dst -} - -func (iq *mergeIterator) ShiftBlocksOffset() bool { - iq.posBlocks++ - if iq.posBlocks == len(iq.index.blocksOffsets) { - return false - } - return true -} - -func (iq *mergeIterator) 
CurrentBlocksOffset() uint64 { - return iq.index.blocksOffsets[iq.posBlocks] -} - -func (iq *mergeIterator) AddPos(p seq.DocPos) { - iq.newPositions = append(iq.newPositions, p) -} - -func (iq *mergeIterator) AddNewLID(lid uint32) { - iq.newLIDs = append(iq.newLIDs, lid) -} - -func (iq *mergeIterator) AddNewBlockIndex(blockIndex int) { - iq.newBlocks = append(iq.newBlocks, blockIndex) -} diff --git a/frac/active2/merge_stream.go b/frac/active2/merge_stream.go deleted file mode 100644 index 3e949d7b..00000000 --- a/frac/active2/merge_stream.go +++ /dev/null @@ -1,149 +0,0 @@ -package active2 - -import ( - "cmp" -) - -type Ordered interface { - ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 -} - -type streamsPool[T Ordered] struct { - ts []TwoStreams[T] - oss []OneSliceStream[T] - tss []TwoSliceStream[T] -} - -func (s *streamsPool[T]) GetTwoStreams(src1, src2 IOrderedStream[T]) (r *TwoStreams[T]) { - s.ts = append(s.ts, TwoStreams[T]{}) - r = &s.ts[len(s.ts)-1] - r.src1 = src1 - r.src2 = src2 - r.Init() - - return r -} - -func (s *streamsPool[T]) GetTwoSliceStream(src1, src2 []T) (r *TwoSliceStream[T]) { - s.tss = append(s.tss, TwoSliceStream[T]{}) - r = &s.tss[len(s.tss)-1] - r.src1 = src1 - r.src2 = src2 - return r -} - -func (s *streamsPool[T]) GetOneSliceStream(src []T) (r *OneSliceStream[T]) { - s.oss = append(s.oss, OneSliceStream[T]{}) - r = &s.oss[len(s.oss)-1] - r.src = src - return r -} - -func (s *streamsPool[T]) Reset() { - s.ts = s.ts[:0] - s.oss = s.oss[:0] - s.tss = s.tss[:0] -} - -func MergeSortNSlices[T Ordered](src [][]T, p *streamsPool[T]) IOrderedStream[T] { - n := len(src) - if n == 1 { - return p.GetOneSliceStream(src[0]) - } - if n == 2 { - return p.GetTwoSliceStream(src[0], src[1]) - } - h := n / 2 - src1 := MergeSortNSlices(src[:h], p) - src2 := MergeSortNSlices(src[h:], p) - return p.GetTwoStreams(src1, src2) -} - -type IOrderedStream[T Ordered] interface { - Next() (T, bool) -} - -type TwoStreams[T 
Ordered] struct { - src1, src2 IOrderedStream[T] - v1, v2 T - has1, has2 bool -} - -func (s *TwoStreams[T]) Init() { - s.v1, s.has1 = s.src1.Next() - s.v2, s.has2 = s.src2.Next() -} - -func (s *TwoStreams[T]) Next() (v T, has bool) { - if s.has1 && s.has2 { - if s.v1 < s.v2 { - v = s.v1 - s.v1, s.has1 = s.src1.Next() - } else { - v = s.v2 - s.v2, s.has2 = s.src2.Next() - } - return v, true - } - if s.has1 { - v = s.v1 - s.v1, s.has1 = s.src1.Next() - return v, true - } - if s.has2 { - v = s.v2 - s.v2, s.has2 = s.src2.Next() - return v, true - } - return v, false -} - -type OneSliceStream[T cmp.Ordered] struct { - p int - src []T -} - -func (s *OneSliceStream[T]) Next() (v T, has bool) { - if s.p < len(s.src) { - has = true - v = s.src[s.p] - s.p++ - } - return v, has -} - -type TwoSliceStream[T Ordered] struct { - p1, p2 int - src1, src2 []T -} - -func (s *TwoSliceStream[T]) Next() (v T, has bool) { - n1, n2 := len(s.src1), len(s.src2) - has1 := s.p1 < n1 - has2 := s.p2 < n2 - if has1 && has2 { - - v1 := s.src1[s.p1] - v2 := s.src2[s.p2] - if v1 < v2 { - s.p1++ - return v1, true - } - s.p2++ - return v2, true - } - - if has1 { - v = s.src1[s.p1] - s.p1++ - return v, true - } - - if has2 { - v = s.src2[s.p2] - s.p2++ - return v, true - } - - return v, false -} diff --git a/go.mod b/go.mod index 6b9439ec..ca86ec24 100644 --- a/go.mod +++ b/go.mod @@ -45,6 +45,7 @@ require ( ) require ( + github.com/RoaringBitmap/roaring/v2 v2.14.4 // indirect github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 // indirect @@ -61,11 +62,13 @@ require ( github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 // indirect github.com/aws/smithy-go v1.23.2 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/bits-and-blooms/bitset v1.24.2 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew 
v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/google/pprof v0.0.0-20250422154841-e1f9c1950416 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/mschoch/smat v0.2.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect diff --git a/go.sum b/go.sum index 6989060d..8803c552 100644 --- a/go.sum +++ b/go.sum @@ -27,6 +27,8 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/KimMachineGun/automemlimit v0.7.5 h1:RkbaC0MwhjL1ZuBKunGDjE/ggwAX43DwZrJqVwyveTk= github.com/KimMachineGun/automemlimit v0.7.5/go.mod h1:QZxpHaGOQoYvFhv/r4u3U0JTC2ZcOwbSr11UZF46UBM= +github.com/RoaringBitmap/roaring/v2 v2.14.4 h1:4aKySrrg9G/5oRtJ3TrZLObVqxgQ9f1znCRBwEwjuVw= +github.com/RoaringBitmap/roaring/v2 v2.14.4/go.mod h1:oMvV6omPWr+2ifRdeZvVJyaz+aoEUopyv5iH0u/+wbY= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafoB+tBA3gMyHYHrpOtNuDiK/uB5uXxq5wM= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b h1:mimo19zliBX/vSQ6PWWSL9lK8qwHozUj03+zLoEB8O0= @@ -71,6 +73,8 @@ github.com/aws/smithy-go v1.23.2 h1:Crv0eatJUQhaManss33hS5r40CG3ZFH+21XSkqMrIUM= github.com/aws/smithy-go v1.23.2/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bits-and-blooms/bitset v1.24.2 h1:M7/NzVbsytmtfHbumG+K2bremQPMJuqv1JD3vOaFxp0= 
+github.com/bits-and-blooms/bitset v1.24.2/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/c2h5oh/datasize v0.0.0-20200112174442-28bbd4740fee h1:BnPxIde0gjtTnc9Er7cxvBk8DHLWhEux0SxayC8dP6I= github.com/c2h5oh/datasize v0.0.0-20200112174442-28bbd4740fee/go.mod h1:S/7n9copUssQ56c7aAgHqftWO4LTf4xY6CGWt8Bc+3M= github.com/cactus/go-statsd-client v3.1.1+incompatible/go.mod h1:cMRcwZDklk7hXp+Law83urTHUiHMzCev/r4JMYr/zU0= @@ -196,6 +200,8 @@ github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= +github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/oklog/ulid/v2 v2.1.1 h1:suPZ4ARWLOJLegGFiZZ1dFAkqzhMjL3J1TzI+5wHz8s= From d2dd334938de122eea65add6f0b71326fc1f1c71 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Tue, 16 Dec 2025 12:02:43 +0300 Subject: [PATCH 08/28] full frac write bench --- frac/active/indexer_test.go | 62 ++++++++++++ frac/active/sealing_source.go | 149 ----------------------------- frac/active2/indexer_test.go | 95 ++++++++++++++---- frac/active2/iterators.go | 88 +++++++++++++++++ frac/active2/merge.go | 175 ++++++++++------------------------ frac/tests/fraction_test.go | 4 +- storage/docs_reader.go | 2 +- 7 files changed, 278 insertions(+), 297 deletions(-) diff --git a/frac/active/indexer_test.go b/frac/active/indexer_test.go index 2d0c942c..8aac3e41 100644 --- 
a/frac/active/indexer_test.go +++ b/frac/active/indexer_test.go @@ -8,11 +8,14 @@ import ( "testing" "time" + "github.com/alecthomas/units" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.uber.org/zap/zapcore" "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric/stopwatch" @@ -112,3 +115,62 @@ func BenchmarkIndexer(b *testing.B) { wg.Wait() } } + +func defaultSealingParams() frac.SealParams { + const minZstdLevel = 1 + return frac.SealParams{ + IDsZstdLevel: minZstdLevel, + LIDsZstdLevel: minZstdLevel, + TokenListZstdLevel: minZstdLevel, + DocsPositionsZstdLevel: minZstdLevel, + TokenTableZstdLevel: minZstdLevel, + DocBlocksZstdLevel: minZstdLevel, + DocBlockSize: 128 * int(units.KiB), + } +} + +func BenchmarkFullWrite(b *testing.B) { + logger.SetLevel(zapcore.FatalLevel) + idx, stop := NewIndexer(8, 8) + defer stop() + + allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) + readers := splitLogsToBulks(allLogs, 1000) + assert.NoError(b, err) + + params := defaultSealingParams() + + processor := getTestProcessor() + allDocs := make([][]byte, 0, len(readers)) + allMeta := make([][]byte, 0, len(readers)) + for _, readNext := range readers { + _, docs, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + allDocs = append(allDocs, storage.CompressDocBlock(docs, nil, 1)) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) + } + + for b.Loop() { + active := New( + filepath.Join(b.TempDir(), "test"), + idx, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + &frac.Config{}, + ) + + wg := sync.WaitGroup{} + for i, meta := range allMeta { + wg.Add(1) + err := active.Append(allDocs[i], meta, &wg) + assert.NoError(b, err) + } + wg.Wait() + + src, err := 
NewSealingSource(active, params) + require.NoError(b, err) + sealed, err := sealing.Seal(src, params) + require.NoError(b, err) + assert.Greater(b, int(sealed.Info.DocsTotal), 0) + } +} diff --git a/frac/active/sealing_source.go b/frac/active/sealing_source.go index 3bc5e972..9ed23253 100644 --- a/frac/active/sealing_source.go +++ b/frac/active/sealing_source.go @@ -2,27 +2,17 @@ package active import ( "bytes" - "encoding/binary" "errors" - "io" "iter" - "os" - "path/filepath" "slices" "time" "unsafe" - "github.com/alecthomas/units" - "go.uber.org/zap" - - "github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/sealing" - "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" - "github.com/ozontech/seq-db/util" ) // SealingSource transforms data from in-memory (frac.Active) storage @@ -41,7 +31,6 @@ import ( // All iterators work with pre-sorted data and return information // in an order optimal for creating disk index structures. type SealingSource struct { - params frac.SealParams // Sealing parameters info *frac.Info // fraction Info created time.Time // Creation time of the source sortedLIDs []uint32 // Sorted LIDs (Local ID) @@ -73,7 +62,6 @@ func NewSealingSource(active *Active, params frac.SealParams) (*SealingSource, e sortedTIDs := sortTokens(sortedFields, active.TokenList) src := SealingSource{ - params: params, info: &info, created: time.Now(), sortedLIDs: sortedLIDs, @@ -333,140 +321,3 @@ func (src *SealingSource) doc(pos seq.DocPos) ([]byte, error) { } return doc, nil } - -// SortDocs sorts documents and writes them in compressed form to disk. -// Creates a temporary file that is then renamed to the final one. 
-func (src *SealingSource) SortDocs() error { - start := time.Now() - logger.Info("sorting docs...") - - // Create temporary file for sorted documents - sdocsFile, err := os.Create(src.info.Path + consts.SdocsTmpFileSuffix) - if err != nil { - return err - } - - bw := bytespool.AcquireWriterSize(sdocsFile, int(units.MiB)) - defer bytespool.ReleaseWriter(bw) - - // Group documents into blocks - blocks := docBlocks(src.Docs(), src.params.DocBlockSize) - - // Write blocks and get new offsets and positions - blocksOffsets, positions, err := src.writeDocs(blocks, bw) - - if err := util.CollapseErrors([]error{src.lastErr, err}); err != nil { - return err - } - if err := bw.Flush(); err != nil { - return err - } - - src.docPosSorted = positions - src.blocksOffsets = blocksOffsets - - // Get file statistics - stat, err := sdocsFile.Stat() - if err != nil { - return err - } - src.info.DocsOnDisk = uint64(stat.Size()) - - // Synchronize and rename file - if err := sdocsFile.Sync(); err != nil { - return err - } - if err := sdocsFile.Close(); err != nil { - return err - } - if err := os.Rename(sdocsFile.Name(), src.info.Path+consts.SdocsFileSuffix); err != nil { - return err - } - if err := util.SyncPath(filepath.Dir(src.info.Path)); err != nil { - return err - } - - // Log compression statistics - ratio := float64(src.info.DocsRaw) / float64(src.info.DocsOnDisk) - logger.Info("docs sorting stat", - util.ZapUint64AsSizeStr("raw", src.info.DocsRaw), - util.ZapUint64AsSizeStr("compressed", src.info.DocsOnDisk), - util.ZapFloat64WithPrec("ratio", ratio, 2), - zap.Int("blocks_count", len(blocksOffsets)), - zap.Int("docs_total", len(positions)), - util.ZapDurationWithPrec("write_duration_ms", time.Since(start), "ms", 0), - ) - - return nil -} - -// writeDocs compresses and writes document blocks, calculating new offsets -// and collecting document positions. 
-func (src *SealingSource) writeDocs(blocks iter.Seq2[[]byte, []seq.DocPos], w io.Writer) ([]uint64, []seq.DocPos, error) { - offset := 0 - buf := make([]byte, 0) - blocksOffsets := make([]uint64, 0) - allPositions := make([]seq.DocPos, 0, len(src.mids.vals)) - - // Process each document block - for block, positions := range blocks { - allPositions = append(allPositions, positions...) - blocksOffsets = append(blocksOffsets, uint64(offset)) - - // Compress document block - buf = storage.CompressDocBlock(block, buf[:0], src.params.DocBlocksZstdLevel) - if _, err := w.Write(buf); err != nil { - return nil, nil, err - } - offset += len(buf) - } - return blocksOffsets, allPositions, nil -} - -// docBlocks groups documents into fixed-size blocks. -// Returns an iterator for blocks and corresponding document positions. -func docBlocks(docs iter.Seq2[seq.ID, []byte], blockSize int) iter.Seq2[[]byte, []seq.DocPos] { - return func(yield func([]byte, []seq.DocPos) bool) { - const defaultBlockSize = 128 * units.KiB - if blockSize <= 0 { - blockSize = int(defaultBlockSize) - logger.Warn("document block size not specified", zap.Int("default_size", blockSize)) - } - - var ( - prev seq.ID - index uint32 // Current block index - ) - pos := make([]seq.DocPos, 0) - buf := make([]byte, 0, blockSize) - - // Iterate through documents - for id, doc := range docs { - if id == prev { - // Duplicate IDs (for nested indexes) - store document once, - // but create positions for each LID - pos = append(pos, seq.PackDocPos(index, uint64(len(buf)))) - continue - } - prev = id - - // If block is full, yield it - if len(buf) >= blockSize { - if !yield(buf, pos) { - return - } - index++ - buf = buf[:0] - pos = pos[:0] - } - - // Add document position - pos = append(pos, seq.PackDocPos(index, uint64(len(buf)))) - - // Write document size and the document itself - buf = binary.LittleEndian.AppendUint32(buf, uint32(len(doc))) - buf = append(buf, doc...) 
- } - yield(buf, pos) - } -} diff --git a/frac/active2/indexer_test.go b/frac/active2/indexer_test.go index ab981597..0e2c9cb3 100644 --- a/frac/active2/indexer_test.go +++ b/frac/active2/indexer_test.go @@ -8,8 +8,10 @@ import ( "testing" "time" + "github.com/alecthomas/units" "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/seq" @@ -17,6 +19,7 @@ import ( "github.com/ozontech/seq-db/tests/common" "github.com/ozontech/seq-db/tokenizer" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.uber.org/zap" "go.uber.org/zap/zapcore" ) @@ -29,17 +32,9 @@ func BenchmarkIndexer(b *testing.B) { readers := splitLogsToBulks(allLogs, 2000) assert.NoError(b, err) - active := New( - filepath.Join(b.TempDir(), "test"), - &frac.Config{}, - idx, - storage.NewReadLimiter(1, nil), - cache.NewCache[[]byte](nil, nil), - cache.NewCache[[]byte](nil, nil), - ) - processor := getTestProcessor() + b.ResetTimer() for i := 0; i < b.N; i++ { b.StopTimer() bulks := make([][]byte, 0, len(readers)) @@ -47,6 +42,14 @@ func BenchmarkIndexer(b *testing.B) { _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) bulks = append(bulks, storage.CompressDocBlock(meta, nil, 3)) } + active := New( + filepath.Join(b.TempDir(), "test"), + &frac.Config{}, + idx, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + ) b.StartTimer() wg := sync.WaitGroup{} @@ -74,9 +77,14 @@ func BenchmarkMerge(b *testing.B) { processor := getTestProcessor() - b.StopTimer() b.ResetTimer() for i := 0; i < b.N; i++ { + b.StopTimer() + bulks := make([][]byte, 0, len(readers)) + for _, readNext := range readers { + _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + bulks = append(bulks, storage.CompressDocBlock(meta, nil, 3)) + } active := New( 
filepath.Join(b.TempDir(), "test"), @@ -87,12 +95,6 @@ func BenchmarkMerge(b *testing.B) { cache.NewCache[[]byte](nil, nil), ) - bulks := make([][]byte, 0, len(readers)) - for _, readNext := range readers { - _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) - bulks = append(bulks, storage.CompressDocBlock(meta, nil, 3)) - } - wg := sync.WaitGroup{} for _, meta := range bulks { wg.Add(1) @@ -105,10 +107,67 @@ func BenchmarkMerge(b *testing.B) { }) } wg.Wait() - b.StartTimer() + active.indexes.MergeAll() - b.StopTimer() + } +} + +func defaultSealingParams() frac.SealParams { + const minZstdLevel = 1 + return frac.SealParams{ + IDsZstdLevel: minZstdLevel, + LIDsZstdLevel: minZstdLevel, + TokenListZstdLevel: minZstdLevel, + DocsPositionsZstdLevel: minZstdLevel, + TokenTableZstdLevel: minZstdLevel, + DocBlocksZstdLevel: minZstdLevel, + DocBlockSize: 128 * int(units.KiB), + } +} + +func BenchmarkFullWrite(b *testing.B) { + logger.SetLevel(zapcore.FatalLevel) + idx := NewIndexer(8) + + allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) + readers := splitLogsToBulks(allLogs, 2000) + assert.NoError(b, err) + + params := defaultSealingParams() + + processor := getTestProcessor() + allDocs := make([][]byte, 0, len(readers)) + allMeta := make([][]byte, 0, len(readers)) + for _, readNext := range readers { + _, docs, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + allDocs = append(allDocs, storage.CompressDocBlock(docs, nil, 1)) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) + } + + for b.Loop() { + active := New( + filepath.Join(b.TempDir(), "test"), + &frac.Config{}, + idx, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + ) + + wg := sync.WaitGroup{} + for i, meta := range allMeta { + wg.Add(1) + err := active.Append(allDocs[i], meta, &wg) + assert.NoError(b, err) + } + wg.Wait() + + src, err := NewSealingSource(active, params) 
+ require.NoError(b, err) + sealed, err := sealing.Seal(src, params) + require.NoError(b, err) + assert.Greater(b, int(sealed.Info.DocsTotal), 0) } } diff --git a/frac/active2/iterators.go b/frac/active2/iterators.go index 8e7d47d7..1c6cf863 100644 --- a/frac/active2/iterators.go +++ b/frac/active2/iterators.go @@ -1,5 +1,7 @@ package active2 +import "github.com/ozontech/seq-db/seq" + type IOrderedIterator[T any] interface { Next() (T, bool) } @@ -60,3 +62,89 @@ func (s *MergeIterator[T]) Next() (v T, has bool) { } return v, false } + +type IDIteratorItem struct { + i int + id seq.ID + pos seq.DocPos +} + +type IDIterator struct { + i int + offset int + idx *memIndex + posMap []seq.DocPos +} + +func (it *IDIterator) Next() (v IDIteratorItem, has bool) { + if it.offset < len(it.idx.ids) { + v.i = it.i + v.id = it.idx.ids[it.offset] + v.pos = it.posMap[it.offset] + has = true + it.offset++ + } + return v, has +} + +type TokenIteratorPayload struct { + idx *memIndex + lidsMap []uint32 +} + +type TokenIteratorItem struct { + tid uint32 + fid uint32 + payload *TokenIteratorPayload +} + +func (i *TokenIteratorItem) Field() []byte { + return i.payload.idx.fields[i.fid] +} + +func (i *TokenIteratorItem) Token() []byte { + return i.payload.idx.tokens[i.tid] +} + +func (i *TokenIteratorItem) LIDs() []uint32 { + return i.payload.idx.tokenLIDs[i.tid] +} + +func (i *TokenIteratorItem) lidsMap() []uint32 { + return i.payload.lidsMap +} + +type TokenIterator struct { + tid uint32 + fid uint32 + fieldLastTID uint32 + payload TokenIteratorPayload +} + +func NewTokenIterator(idx *memIndex, lidsMap []uint32) *TokenIterator { + return &TokenIterator{ + fieldLastTID: idx.fieldsTokens[string(idx.fields[0])].count - 1, + payload: TokenIteratorPayload{ + idx: idx, + lidsMap: lidsMap, + }, + } +} + +func (it *TokenIterator) Next() (v TokenIteratorItem, has bool) { + if int(it.tid) < len(it.payload.idx.tokens) { + v.tid = uint32(it.tid) + v.fid = uint32(it.fid) + v.payload = &it.payload + 
has = true + it.tid++ + + if it.tid > it.fieldLastTID { + it.fid++ + if int(it.fid) < len(it.payload.idx.fields) { + it.fieldLastTID += it.payload.idx.fieldsTokens[string(it.payload.idx.fields[it.fid])].count + } + } + } + return v, has +} diff --git a/frac/active2/merge.go b/frac/active2/merge.go index 9496f209..9b5b1c67 100644 --- a/frac/active2/merge.go +++ b/frac/active2/merge.go @@ -30,10 +30,9 @@ func mergeIndexes(indexes []*memIndex) *memIndex { docsCount: uint32(docsCount), } - newPositions := mergeBlocksOffsets(dst, indexes) - - reLIDs := mergeIDs(dst, indexes, newPositions) - mergeTokens(dst, indexes, reLIDs) + posMap := mergeBlocksOffsets(dst, indexes) + lidsMap := mergeIDs(dst, indexes, posMap) + mergeTokens(dst, indexes, lidsMap) dst.allTID = dst.fieldsTokens[seq.TokenAll].start @@ -46,35 +45,18 @@ func mergeIndexes(indexes []*memIndex) *memIndex { return dst } -type IDIteratorItem struct { - i int - id seq.ID -} - -type IDIterator struct { - i int - offset int - idx *memIndex -} - -func (i *IDIterator) Next() (v IDIteratorItem, has bool) { - if i.offset < len(i.idx.ids) { - v.i = i.i - v.id = i.idx.ids[i.offset] - has = true - i.offset++ - } - return v, has -} - -func mergeIDs(dst *memIndex, indexes []*memIndex, newPositions [][]seq.DocPos) [][]uint32 { +func mergeIDs(dst *memIndex, indexes []*memIndex, posMap [][]seq.DocPos) [][]uint32 { // todo doubles := []seq.ID{} - newLIDs := make([][]uint32, len(indexes)) + lidsMap := make([][]uint32, len(indexes)) iters := make([]IOrderedIterator[IDIteratorItem], len(indexes)) for i, idx := range indexes { - iters[i] = &IDIterator{idx: idx, i: i} - newLIDs[i] = make([]uint32, 0, len(idx.ids)) + iters[i] = &IDIterator{ + i: i, + idx: idx, + posMap: posMap[i], + } + lidsMap[i] = make([]uint32, 0, len(idx.ids)) } orderedIDs := MergeKSortIterators(iters, func(a, b IDIteratorItem) int { return seq.Compare(b.id, a.id) }) @@ -83,84 +65,22 @@ func mergeIDs(dst *memIndex, indexes []*memIndex, newPositions 
[][]seq.DocPos) [ for has { dst.ids = append(dst.ids, cur.id) - dst.positions = append(dst.positions, newPositions[cur.i][len(newLIDs[cur.i])]) + dst.positions = append(dst.positions, cur.pos) lid := uint32(len(dst.ids)) - newLIDs[cur.i] = append(newLIDs[cur.i], lid) + lidsMap[cur.i] = append(lidsMap[cur.i], lid) cur, has = orderedIDs.Next() } - return newLIDs -} - -type TokenIteratorPayload struct { - idx *memIndex - newLIDs []uint32 -} - -type TokenIteratorItem struct { - tid uint32 - fid uint32 - payload *TokenIteratorPayload -} - -func (i *TokenIteratorItem) Field() []byte { - return i.payload.idx.fields[i.fid] -} - -func (i *TokenIteratorItem) Token() []byte { - return i.payload.idx.tokens[i.tid] -} - -func (i *TokenIteratorItem) LIDs() []uint32 { - return i.payload.idx.tokenLIDs[i.tid] + return lidsMap } -func (i *TokenIteratorItem) NewLIDs() []uint32 { - return i.payload.newLIDs -} - -type TokenIterator struct { - tid uint32 - fid uint32 - fieldLastTID uint32 - payload TokenIteratorPayload -} - -func NewTokenIterator(idx *memIndex, newLIDs []uint32) *TokenIterator { - return &TokenIterator{ - fieldLastTID: idx.fieldsTokens[string(idx.fields[0])].count - 1, - payload: TokenIteratorPayload{ - idx: idx, - newLIDs: newLIDs, - }, - } -} - -func (it *TokenIterator) Next() (v TokenIteratorItem, has bool) { - if int(it.tid) < len(it.payload.idx.tokens) { - v.tid = uint32(it.tid) - v.fid = uint32(it.fid) - v.payload = &it.payload - has = true - it.tid++ - - if it.tid > it.fieldLastTID { - it.fid++ - if int(it.fid) < len(it.payload.idx.fields) { - it.fieldLastTID += it.payload.idx.fieldsTokens[string(it.payload.idx.fields[it.fid])].count - } - } - } - return v, has -} - -func mergeTokens(dst *memIndex, indexes []*memIndex, reLIDs [][]uint32) { +func mergeTokens(dst *memIndex, indexes []*memIndex, lidsMap [][]uint32) { allCount := 0 totalTokens := 0 totalLIDsSize := 0 TokensIterators := make([]IOrderedIterator[TokenIteratorItem], len(indexes)) for i, index := range 
indexes { allCount += len(index.ids) - TokensIterators[i] = NewTokenIterator(index, reLIDs[i]) + TokensIterators[i] = NewTokenIterator(index, lidsMap[i]) totalTokens += len(index.tokens) for _, lids := range index.tokenLIDs { totalLIDsSize += len(lids) @@ -183,31 +103,34 @@ func mergeTokens(dst *memIndex, indexes []*memIndex, reLIDs [][]uint32) { uniqFieldsSize := 0 uniqFieldsCount := 0 - prv := TokenIteratorItem{} - var prevField []byte - cur, has := orderedTokens.Next() - items := make([]TokenIteratorItem, 0, totalTokens) + var ( + prevField []byte + prevToken TokenIteratorItem + ) + borders := make([]uint8, 0, totalTokens) - for has { + items := make([]TokenIteratorItem, 0, totalTokens) + + for cur, has := orderedTokens.Next(); has; cur, has = orderedTokens.Next() { var border uint8 - if prv.payload == nil || cmpToken(prv, cur) != 0 { + if prevToken.payload == nil || cmpToken(prevToken, cur) != 0 { uniqTokensCount++ uniqTokensSize += len(cur.Token()) border++ - if !bytes.Equal(prevField, cur.Field()) { + field := cur.Field() + if !bytes.Equal(prevField, field) { uniqFieldsCount++ - uniqFieldsSize += len(cur.Field()) + uniqFieldsSize += len(field) border++ - prevField = cur.Field() + prevField = field } } - prv = cur - items = append(items, cur) borders = append(borders, border) - cur, has = orderedTokens.Next() + items = append(items, cur) + prevToken = cur } dst.fields = make([][]byte, 0, uniqFieldsCount) @@ -218,14 +141,14 @@ func mergeTokens(dst *memIndex, indexes []*memIndex, reLIDs [][]uint32) { allFields := make([]byte, 0, uniqFieldsSize) tokenRanges := make([]tokenRange, 0, uniqFieldsCount) - var all bool - lidsSorter := NewLIDsSorter(totalLIDsSize, allCount) + var isAllToken bool + lidsCollector := NewLIDsCollector(totalLIDsSize, allCount) for i, item := range items { if borders[i] > 0 { if i > 0 { - dst.tokenLIDs = append(dst.tokenLIDs, lidsSorter.Get()) + dst.tokenLIDs = append(dst.tokenLIDs, lidsCollector.GetSorted()) } if borders[i] > 1 { @@ -249,7 
+172,7 @@ func mergeTokens(dst *memIndex, indexes []*memIndex, reLIDs [][]uint32) { tr := tokenRanges[len(tokenRanges)-1] dst.fieldsTokens[fieldStr] = tr - all = fieldStr == "_all_" + isAllToken = fieldStr == seq.TokenAll } start := len(allTokens) @@ -257,38 +180,38 @@ func mergeTokens(dst *memIndex, indexes []*memIndex, reLIDs [][]uint32) { dst.tokens = append(dst.tokens, allTokens[start:]) } - if all { + if isAllToken { for range item.LIDs() { - lidsSorter.Add(0) + lidsCollector.Add(0) } } else { - newLIDs := item.NewLIDs() + lidsMap := item.lidsMap() for _, oldLID := range item.LIDs() { - newLID := newLIDs[oldLID-1] - lidsSorter.Add(newLID) + newLID := lidsMap[oldLID-1] + lidsCollector.Add(newLID) } } } - dst.tokenLIDs = append(dst.tokenLIDs, lidsSorter.Get()) + dst.tokenLIDs = append(dst.tokenLIDs, lidsCollector.GetSorted()) tid := uint32(len(dst.tokens)) - 1 - fstr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) - tr := dst.fieldsTokens[fstr] + fieldStr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) + tr := dst.fieldsTokens[fieldStr] tr.count = tid - tr.start + 1 - dst.fieldsTokens[fstr] = tr + dst.fieldsTokens[fieldStr] = tr } -type LIDsSorter struct { +type LIDsCollector struct { all []uint32 lids []uint32 offset int bitmap *roaring.Bitmap } -func NewLIDsSorter(size, allCount int) *LIDsSorter { - ls := &LIDsSorter{ +func NewLIDsCollector(size, allCount int) *LIDsCollector { + ls := &LIDsCollector{ lids: make([]uint32, 0, size), all: make([]uint32, allCount), bitmap: roaring.New(), @@ -299,11 +222,11 @@ func NewLIDsSorter(size, allCount int) *LIDsSorter { return ls } -func (s *LIDsSorter) Add(lid uint32) { +func (s *LIDsCollector) Add(lid uint32) { s.lids = append(s.lids, lid) } -func (s *LIDsSorter) Get() (dst []uint32) { +func (s *LIDsCollector) GetSorted() (dst []uint32) { dst = s.lids[s.offset:] if len(dst) == len(s.all) { diff --git a/frac/tests/fraction_test.go b/frac/tests/fraction_test.go index 8586c4a7..4d178011 100644 --- 
a/frac/tests/fraction_test.go +++ b/frac/tests/fraction_test.go @@ -63,9 +63,7 @@ func (s *FractionTestSuite) TearDownSuiteCommon() { } func (s *FractionTestSuite) SetupTestCommon() { - s.config = &frac.Config{ - SkipSortDocs: true, - } + s.config = &frac.Config{} s.tokenizers = map[seq.TokenizerType]tokenizer.Tokenizer{ seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(20, false, true), seq.TokenizerTypeText: tokenizer.NewTextTokenizer(20, false, true, 100), diff --git a/storage/docs_reader.go b/storage/docs_reader.go index ad5edbd8..b0429ae8 100644 --- a/storage/docs_reader.go +++ b/storage/docs_reader.go @@ -45,7 +45,7 @@ func (r *DocsReader) ReadDocsFunc(blockOffset uint64, docOffsets []uint64, cb fu block, err := r.cache.GetWithError(uint32(blockOffset), func() ([]byte, int, error) { block, _, err := r.reader.ReadDocBlockPayload(int64(blockOffset)) if err != nil { - return nil, 0, fmt.Errorf("can't fetch doc at pos %d: %w", blockOffset, err) + return nil, 0, fmt.Errorf("can't fetch doc block at pos %d: %w", blockOffset, err) } return block, cap(block), nil }) From e015add9e69c72491359bd66d097607e8ea524d4 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Thu, 18 Dec 2025 22:06:03 +0300 Subject: [PATCH 09/28] enable background merging --- frac/active2/iterators.go | 8 ++------ frac/active2/mem_index.go | 15 +++++++++++++++ frac/active2/mem_index_pool.go | 12 +++++++++--- frac/active2/merge_manager.go | 10 +++++----- frac/active2/merge_strategy.go | 10 +++++----- frac/active2/resources.go | 19 +++++++++++++++++++ resources/global_pools.go | 1 + resources/slice_allocator.go | 4 ++++ 8 files changed, 60 insertions(+), 19 deletions(-) diff --git a/frac/active2/iterators.go b/frac/active2/iterators.go index 1c6cf863..7874d523 100644 --- a/frac/active2/iterators.go +++ b/frac/active2/iterators.go @@ -30,15 +30,11 @@ func NewMergeIterator[T any](src1, src2 IOrderedIterator[T], cmp func(T, T) int) src2: src2, cmp: cmp, } - r.Init() + r.v1, r.has1 = 
r.src1.Next() + r.v2, r.has2 = r.src2.Next() return &r } -func (s *MergeIterator[T]) Init() { - s.v1, s.has1 = s.src1.Next() - s.v2, s.has2 = s.src2.Next() -} - func (s *MergeIterator[T]) Next() (v T, has bool) { if s.has1 && s.has2 { if s.cmp(s.v1, s.v2) < 0 { diff --git a/frac/active2/mem_index.go b/frac/active2/mem_index.go index 3ec3d3a7..36442f2f 100644 --- a/frac/active2/mem_index.go +++ b/frac/active2/mem_index.go @@ -23,6 +23,17 @@ type memIndex struct { docsSize uint64 docsCount uint32 + + res *Resources + release func() +} + +func newMemIndex() *memIndex { + res, release := NewResources() + return &memIndex{ + res: res, + release: release, + } } func (index *memIndex) getTokenProvider(field string) *tokenProvider { @@ -54,3 +65,7 @@ func (index *memIndex) GetLIDByID(id seq.ID) (uint32, bool) { i, ok := sort.Find(len(index.ids), func(i int) int { return seq.Compare(index.ids[i], id) }) return uint32(i + 1), ok } + +func (index *memIndex) Release() { + // index.release() +} diff --git a/frac/active2/mem_index_pool.go b/frac/active2/mem_index_pool.go index c64a730d..13be7073 100644 --- a/frac/active2/mem_index_pool.go +++ b/frac/active2/mem_index_pool.go @@ -4,6 +4,8 @@ import ( "slices" "sync" "sync/atomic" + + "github.com/alecthomas/units" ) // memIndexExt contains index metadata for merge management @@ -28,7 +30,7 @@ func newIndexPool() *memIndexPool { readyToMerge: make(map[uint64]memIndexExt), underMerging: make(map[uint64]memIndexExt), - tiers: newSizeTiers(firstTierMaxSize, maxTierCount, tierSizeDeltaPercent), + tiers: newSizeTiers(firstTierMaxSizeKb, maxTierCount, tierSizeDeltaPercent), } } @@ -92,12 +94,16 @@ func (p *memIndexPool) replace(oldIndexes []memIndexExt, newIndex *memIndex) { for _, metaIndex := range p.underMerging { p.indexes = append(p.indexes, metaIndex.index) // add indexes currently being merged } + + for _, metaIndex := range oldIndexes { + metaIndex.index.Release() + } } func (p *memIndexPool) wrapIndex(index *memIndex) 
memIndexExt { return memIndexExt{ - id: p.counter.Add(1), // atomically increment counter - tier: p.tiers.Calc(index.docsCount), // determine size tier + id: p.counter.Add(1), // atomically increment counter + tier: p.tiers.Calc(index.docsCount / uint32(units.KiB)), // determine size tier index: index, } } diff --git a/frac/active2/merge_manager.go b/frac/active2/merge_manager.go index 1a124798..6b96f78b 100644 --- a/frac/active2/merge_manager.go +++ b/frac/active2/merge_manager.go @@ -6,10 +6,10 @@ import ( const ( minIndexesToMerge = 4 // minimum number of indexes to trigger merge - forceMergeThreshold = 100 // index count threshold for forced merge + forceMergeThreshold = 64 // index count threshold for forced merge tierSizeDeltaPercent = 10 // percentage difference between size tiers - firstTierMaxSize = 100 // maximum size of the first tier - maxTierCount = 1000 // maximum number of size tiers allowed + firstTierMaxSizeKb = 8 // maximum size of the first tier + maxTierCount = 1000 // todo maximum number of size tiers allowed bucketSizePercent = 50 // percentage difference between size buckets ) @@ -33,9 +33,9 @@ func newMergeManager(maxConcurrentMerges int) *MergeManager { indexes: newIndexPool(), } - // todo // Start background goroutine for merge scheduling - // go m.mergeScheduler() + go m.mergeScheduler() + return &m } diff --git a/frac/active2/merge_strategy.go b/frac/active2/merge_strategy.go index 70da7f36..3fd2f80f 100644 --- a/frac/active2/merge_strategy.go +++ b/frac/active2/merge_strategy.go @@ -59,7 +59,7 @@ import ( Пример для окна Tier 3-4: ┌─────────────────────────────────────────┐ │ До: [1, 2, 0, 3, 1, 2, 1] │ - │ Выбор: ██ ██ │ + │ Выбор: ██ ██ │ │ Результат: 3 элемента из Tier 3 │ │ + 1 элемент из Tier 4 │ │ = 4 элемента всего │ @@ -104,15 +104,15 @@ func selectForMerge(items []memIndexExt, minToMerge int) [][]memIndexExt { } func buildTiersDistribution(items []memIndexExt) []int { - lastTier := 0 + maxTier := 0 tiersDist := make([]int, 
maxTierCount) for _, index := range items { tiersDist[index.tier]++ - if index.tier > lastTier { - lastTier = index.tier + if index.tier > maxTier { + maxTier = index.tier } } - return tiersDist[:lastTier] + return tiersDist[:maxTier+1] } func extractIndexesInRange(items, buf []memIndexExt, firstTier, lastTier int, tiersDist []int) []memIndexExt { diff --git a/frac/active2/resources.go b/frac/active2/resources.go index 3d684e12..7dde6a5c 100644 --- a/frac/active2/resources.go +++ b/frac/active2/resources.go @@ -3,6 +3,7 @@ package active2 import ( "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/resources" + "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/tokenizer" ) @@ -22,11 +23,14 @@ type Resources struct { uint32s resources.SliceOnBytes[uint32] uint64s resources.SliceOnBytes[uint64] bytes resources.SliceAllocator[byte] + bytesSlices resources.SliceAllocator[[]byte] uint32Slices resources.SliceAllocator[[]uint32] tokenKeys resources.SliceAllocator[token] indexerMetaData resources.SliceAllocator[indexer.MetaData] tokenMap resources.MapAllocator[token, uint32] buf resources.ObjectAllocator[indexBuffer] + ids resources.SliceOnBytes[seq.ID] + docPos resources.SliceOnBytes[seq.DocPos] } func NewResources() (*Resources, func()) { @@ -39,10 +43,13 @@ func NewResources() (*Resources, func()) { uint64s: resources.NewUint64s(&s), bytes: resources.NewBytes(&s), uint32Slices: resources.NewUint32Slices(&s), + bytesSlices: resources.NewBytesSlices(&s), indexerMetaData: resources.NewSliceAllocator(&indexerMetaDataPool, &s), tokenKeys: resources.NewSliceAllocator(&tokenKeyPool, &s), tokenMap: resources.NewMapAllocator(&tokenMapPool, &s), buf: resources.NewObjectAllocator(&bufPool, &s), + ids: resources.NewSliceOnBytes[seq.ID](&s), + docPos: resources.NewSliceOnBytes[seq.DocPos](&s), } } return r, func() { @@ -51,6 +58,10 @@ func NewResources() (*Resources, func()) { } } +func (r *Resources) BytesSlices() resources.SliceAllocator[[]byte] { + return 
r.bytesSlices +} + func (r *Resources) Bytes() resources.SliceAllocator[byte] { return r.bytes } @@ -59,6 +70,14 @@ func (r *Resources) Uint32s() resources.SliceOnBytes[uint32] { return r.uint32s } +func (r *Resources) IDs() resources.SliceOnBytes[seq.ID] { + return r.ids +} + +func (r *Resources) DocPos() resources.SliceOnBytes[seq.DocPos] { + return r.docPos +} + func (r *Resources) Uint64s() resources.SliceOnBytes[uint64] { return r.uint64s } diff --git a/resources/global_pools.go b/resources/global_pools.go index 4be6f014..97d69e0b 100644 --- a/resources/global_pools.go +++ b/resources/global_pools.go @@ -4,4 +4,5 @@ var ( BytesPool = NewSizedPool[byte](24) StringsPool = NewSizedPool[string](24) Uint32SlicesPool = NewSizedPool[[]uint32](24) + BytesSlicesPool = NewSizedPool[[]byte](24) ) diff --git a/resources/slice_allocator.go b/resources/slice_allocator.go index fb9fabc4..cd76f2b3 100644 --- a/resources/slice_allocator.go +++ b/resources/slice_allocator.go @@ -12,6 +12,10 @@ func NewUint32Slices(releases *CallStack) SliceAllocator[[]uint32] { return NewSliceAllocator(&Uint32SlicesPool, releases) } +func NewBytesSlices(releases *CallStack) SliceAllocator[[]byte] { + return NewSliceAllocator(&BytesSlicesPool, releases) +} + type SliceAllocator[T any] struct { pool *SizedPool[T] releases *CallStack From 070a63f44aa3c5ef6ecd8c153d8a98f09f262b44 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Fri, 19 Dec 2025 23:29:28 +0300 Subject: [PATCH 10/28] tune merge strategy --- config/shared.go | 2 +- frac/active/indexer.go | 1 - frac/active/indexer_test.go | 30 ++-- frac/active2/active2.go | 103 +++++--------- frac/active2/indexer.go | 94 ++++++------- frac/active2/indexer_test.go | 42 +++--- frac/active2/mem_index.go | 27 ++-- frac/active2/mem_index_pool.go | 82 +++++++++-- frac/active2/merge.go | 95 ++++++------- frac/active2/merge_manager.go | 54 +++----- frac/active2/merge_strategy.go | 244 +++++++++++++++------------------ frac/active2/resources.go | 66 
++++----- frac/active2/sealing_source.go | 21 ++- 13 files changed, 432 insertions(+), 429 deletions(-) diff --git a/config/shared.go b/config/shared.go index 83abec44..696ead8f 100644 --- a/config/shared.go +++ b/config/shared.go @@ -8,7 +8,7 @@ var ( ReaderWorkers int CaseSensitive = false - SkipFsync = false + SkipFsync = true MaxFetchSizeBytes = 4 * units.MiB diff --git a/frac/active/indexer.go b/frac/active/indexer.go index ae8e2caf..284b0218 100644 --- a/frac/active/indexer.go +++ b/frac/active/indexer.go @@ -170,7 +170,6 @@ func (ai *Indexer) appendWorker(index int) { } func (ai *Indexer) sendTokensToMergeWorkers(frac *Active, tokens []*TokenLIDs) { - return for _, tl := range tokens { task := mergeTask{ frac: frac, diff --git a/frac/active/indexer_test.go b/frac/active/indexer_test.go index 8aac3e41..509f5878 100644 --- a/frac/active/indexer_test.go +++ b/frac/active/indexer_test.go @@ -138,17 +138,22 @@ func BenchmarkFullWrite(b *testing.B) { readers := splitLogsToBulks(allLogs, 1000) assert.NoError(b, err) - params := defaultSealingParams() - processor := getTestProcessor() - allDocs := make([][]byte, 0, len(readers)) - allMeta := make([][]byte, 0, len(readers)) - for _, readNext := range readers { - _, docs, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) - allDocs = append(allDocs, storage.CompressDocBlock(docs, nil, 1)) - allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) + + n := 2 + allDocs := make([][]byte, 0, len(readers)*n) + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { + for _, readNext := range readers { + _, docs, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + allDocs = append(allDocs, storage.CompressDocBlock(docs, nil, 1)) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) + } } + params := defaultSealingParams() + for b.Loop() { active := New( filepath.Join(b.TempDir(), "test"), @@ -156,14 +161,16 @@ func BenchmarkFullWrite(b *testing.B) { 
storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &frac.Config{}, + &frac.Config{SkipSortDocs: true}, ) wg := sync.WaitGroup{} for i, meta := range allMeta { wg.Add(1) - err := active.Append(allDocs[i], meta, &wg) - assert.NoError(b, err) + go func() { + err := active.Append(allDocs[i], meta, &wg) + assert.NoError(b, err) + }() } wg.Wait() @@ -172,5 +179,6 @@ func BenchmarkFullWrite(b *testing.B) { sealed, err := sealing.Seal(src, params) require.NoError(b, err) assert.Greater(b, int(sealed.Info.DocsTotal), 0) + active.Release() } } diff --git a/frac/active2/active2.go b/frac/active2/active2.go index 86e0e61b..5a43743a 100644 --- a/frac/active2/active2.go +++ b/frac/active2/active2.go @@ -27,11 +27,11 @@ type Active2 struct { BaseFileName string - indexMu sync.RWMutex - info *frac.Info - indexes *MergeManager indexer *Indexer + indexes *memIndexPool + merger *MergeManager + docsFile *os.File docsReader storage.DocsReader sortReader storage.DocsReader @@ -44,10 +44,7 @@ type Active2 struct { writer *active.Writer } -type indexSnapshot struct { - info *frac.Info - indexes []*memIndex -} +const MergerWorkers = 2 func New( baseFileName string, @@ -60,12 +57,16 @@ func New( docsFile, docsStats := util.MustOpenFile(baseFileName+consts.DocsFileSuffix, config.SkipFsync) metaFile, metaStats := util.MustOpenFile(baseFileName+consts.MetaFileSuffix, config.SkipFsync) + info := frac.NewInfo(baseFileName, uint64(docsStats.Size()), uint64(metaStats.Size())) + indexes := NewIndexPool(info) + merger := NewMergeManager(indexes, MergerWorkers) + f := &Active2{ BaseFileName: baseFileName, Config: cfg, - info: frac.NewInfo(baseFileName, uint64(docsStats.Size()), uint64(metaStats.Size())), indexer: indexer, - indexes: newMergeManager(2), + indexes: indexes, + merger: merger, docsFile: docsFile, docsCache: docsCache, @@ -85,12 +86,15 @@ func New( } func (f *Active2) Replay(ctx context.Context) error { - logger.Info("start replaying...", 
zap.String("name", f.info.Name())) + + info := f.indexes.info + + logger.Info("start replaying...", zap.String("name", info.Name())) t := time.Now() offset := uint64(0) - step := f.info.MetaOnDisk / 10 + step := info.MetaOnDisk / 10 wg := sync.WaitGroup{} next := step @@ -113,12 +117,12 @@ out: if offset > next { next += step - progress := float64(offset) / float64(f.info.MetaOnDisk) * 100 + progress := float64(offset) / float64(info.MetaOnDisk) * 100 logger.Info("replaying batch, meta", - zap.String("name", f.info.Name()), + zap.String("name", info.Name()), zap.Uint64("from", offset), zap.Uint64("to", offset+metaSize), - zap.Uint64("target", f.info.MetaOnDisk), + zap.Uint64("target", info.MetaOnDisk), util.ZapFloat64WithPrec("progress_percentage", progress, 2), ) } @@ -129,7 +133,8 @@ out: if err != nil { logger.Fatal("bulk indexing error", zap.Error(err)) } - f.addIndex(idx) + f.indexes.Add(idx, 0, 0) + f.merger.triggerMerge() wg.Done() }) } @@ -138,12 +143,12 @@ out: wg.Wait() tookSeconds := util.DurationToUnit(time.Since(t), "s") - throughputRaw := util.SizeToUnit(f.info.DocsRaw, "mb") / tookSeconds - throughputMeta := util.SizeToUnit(f.info.MetaOnDisk, "mb") / tookSeconds + throughputRaw := util.SizeToUnit(info.DocsRaw, "mb") / tookSeconds + throughputMeta := util.SizeToUnit(info.MetaOnDisk, "mb") / tookSeconds logger.Info("active fraction replayed", - zap.String("name", f.info.Name()), - zap.Uint32("docs_total", f.info.DocsTotal), - util.ZapUint64AsSizeStr("docs_size", f.info.DocsOnDisk), + zap.String("name", info.Name()), + zap.Uint32("docs_total", info.DocsTotal), + util.ZapUint64AsSizeStr("docs_size", info.DocsOnDisk), util.ZapFloat64WithPrec("took_s", tookSeconds, 1), util.ZapFloat64WithPrec("throughput_raw_mb_sec", throughputRaw, 1), util.ZapFloat64WithPrec("throughput_meta_mb_sec", throughputMeta, 1), @@ -158,14 +163,14 @@ func (f *Active2) Append(docs, meta []byte, wg *sync.WaitGroup) (err error) { ma.Stop() return err } - 
f.updateDiskStats(uint64(len(docs)), uint64(len(meta))) mi := sw.Start("send_to_indexer") f.indexer.Index(meta, func(idx *memIndex, err error) { if err != nil { logger.Fatal("bulk indexing error", zap.Error(err)) } - f.addIndex(idx) + f.indexes.Add(idx, uint64(len(docs)), uint64(len(meta))) + f.merger.triggerMerge() wg.Done() }) mi.Stop() @@ -175,32 +180,6 @@ func (f *Active2) Append(docs, meta []byte, wg *sync.WaitGroup) (err error) { return nil } -func (f *Active2) updateDiskStats(docsLen, metaLen uint64) { - f.indexMu.Lock() - f.info.DocsOnDisk += docsLen - f.info.MetaOnDisk += metaLen - f.indexMu.Unlock() -} - -func (f *Active2) addIndex(index *memIndex) { - maxMID := index.ids[0].MID - minMID := index.ids[len(index.ids)-1].MID - - f.indexMu.Lock() - defer f.indexMu.Unlock() - - f.indexes.Add(index) - - if f.info.From > minMID { - f.info.From = minMID - } - if f.info.To < maxMID { - f.info.To = maxMID - } - f.info.DocsRaw += index.docsSize - f.info.DocsTotal += index.docsCount -} - func (f *Active2) String() string { return frac.FracToString(f, "active") } @@ -211,7 +190,9 @@ func (f *Active2) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { t := sw.Start("total") - ss := f.indexSnapshot(ctx) + ss, release := f.indexes.Snapshot() + defer release() + if ss.info.DocsTotal == 0 { // it is empty active fraction state return nil, nil } @@ -229,7 +210,8 @@ func (f *Active2) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { } func (f *Active2) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - ss := f.indexSnapshot(ctx) + ss, release := f.indexes.Snapshot() + defer release() if ss.info.DocsTotal == 0 { // it is empty active fraction state metric.CountersTotal.WithLabelValues("empty_data_provider").Inc() @@ -262,24 +244,8 @@ func (f *Active2) Search(ctx context.Context, params processor.SearchParams) (*s return res, nil } -func (f *Active2) indexSnapshot(ctx context.Context) *indexSnapshot { - f.indexMu.RLock() - 
info := *f.info // copy - indexes := f.indexes.Indexes() - f.indexMu.RUnlock() - - return &indexSnapshot{ - info: &info, - indexes: indexes, - } -} - func (f *Active2) Info() *frac.Info { - f.indexMu.RLock() - defer f.indexMu.RUnlock() - - cp := *f.info // copy - return &cp + return f.indexes.Info() } func (f *Active2) Contains(id seq.MID) bool { @@ -314,7 +280,8 @@ func (f *Active2) Suicide() { func (f *Active2) releaseMem() { f.writer.Stop() - f.indexes.Stop() + f.merger.Stop() + f.indexes.Release() f.docsCache.Release() f.sortCache.Release() diff --git a/frac/active2/indexer.go b/frac/active2/indexer.go index 09aee47a..9fcbb43a 100644 --- a/frac/active2/indexer.go +++ b/frac/active2/indexer.go @@ -41,7 +41,7 @@ func (idx *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, er func NewMemIndex(block storage.DocBlock) (*memIndex, error) { sw := stopwatch.New() - res, cleanup := NewResources() + res, cleanup := AcquireResources() defer cleanup() // Decompress metadata @@ -58,22 +58,24 @@ func NewMemIndex(block storage.DocBlock) (*memIndex, error) { return nil, err } // Initialize index - idx := &memIndex{ - docsCount: uint32(len(meta)), - blocksOffsets: []uint64{block.GetExt2()}, // Only one block per bulk - } + idx := newMemIndex() + idx.docsCount = uint32(len(meta)) + idx.ids = idx.res.AllocIDs(len(meta)) + idx.positions = idx.res.AllocDocPos(len(meta)) + idx.blocksOffsets = idx.res.AllocUint64s(1) // Only one block per bulk + idx.blocksOffsets[0] = block.GetExt2() // Extract tokens from metadata - tids, lids, tokens, err := extractTokens(res, buf, meta, idx) + tids, lids, tokens, err := extractTokens(idx, res, buf, meta) if err != nil { return nil, err } // Group documents by token - tokenDocGroups := groupLIDsByTID(res, tids, lids, len(tokens)) + tokenDocGroups := groupLIDsByTID(idx, res, tids, lids, len(tokens)) // Organize tokens and fields - organizeTokens(res, buf, idx, tokens, tokenDocGroups) + organizeTokens(idx, res, buf, tokens, 
tokenDocGroups) // Set special "all" token idx.allTID = uint32(idx.fieldsTokens[seq.TokenAll].start) @@ -97,17 +99,17 @@ func toToken(t tokenizer.MetaToken) token { // extractTokens extracts tokens from document metadata func extractTokens( + idx *memIndex, res *Resources, buf *indexBuffer, meta []indexer.MetaData, - idx *memIndex, ) ([]uint32, []uint32, []token, error) { - var totalTokens uint32 var docOffset uint64 + var totalTokens uint32 // Calculate document positions in the original block // Each document is stored as: [size: uint32][data: size bytes] - positions := res.Uint64s().AllocSlice(len(meta)) + positions := res.AllocDocPos(len(meta)) prev := seq.PackDocPos(0, docOffset) for i := range meta { @@ -117,13 +119,13 @@ func extractTokens( prev = seq.PackDocPos(0, docOffset) docOffset += uint64(docMeta.Size) + uint64(uint32Size) } - positions[i] = uint64(prev) + positions[i] = prev totalTokens += docMeta.TokensCount() } // Create ordering by document ID (descending) // We need to map global document IDs to local IDs (LIDs) - order := res.Uint32s().AllocSlice(len(meta)) + order := res.AllocUint32s(len(meta)) for i := range order { order[i] = uint32(i) } @@ -132,29 +134,20 @@ func extractTokens( }) // Fill index structures with sorted documents - ids := make([]seq.ID, len(order)) - pos := make([]seq.DocPos, len(order)) - for lid, origIdx := range order { docMeta := meta[origIdx] - ids[lid] = docMeta.ID + idx.ids[lid] = docMeta.ID + idx.positions[lid] = positions[origIdx] idx.docsSize += uint64(docMeta.Size) - pos[lid] = seq.DocPos(positions[origIdx]) } - idx.ids = ids - idx.positions = pos - // Extract and process tokens from all documents var err error var token token // Allocate slices for token-document relationships - lids := res.Uint32s().AllocSlice(int(totalTokens))[:0] // Local document ID for each token occurrence - tids := res.Uint32s().AllocSlice(int(totalTokens))[:0] // Token ID for each occurrence - - // Map tokenKey -> tokenID (global token 
identifier) - tokenMap := res.TokenMap().Alloc(1000) + lids := res.AllocUint32s(int(totalTokens))[:0] // Local document ID for each token occurrence + tids := res.AllocUint32s(int(totalTokens))[:0] // Token ID for each occurrence // Process documents in ID-sorted order for lid, origIdx := range order { @@ -168,10 +161,10 @@ func extractTokens( // Process each token in the document for _, t := range buf.tokens { token = toToken(t) - tid, exists := tokenMap[token] + tid, exists := buf.tokenMap[token] if !exists { - tid = uint32(len(tokenMap)) // assign new token ID - tokenMap[token] = tid + tid = uint32(len(buf.tokenMap)) // assign new token ID + buf.tokenMap[token] = tid } tids = append(tids, tid) lids = append(lids, uint32(lid)+1) // store lid+1 (1-based indexing for internal use) @@ -179,8 +172,8 @@ func extractTokens( } // Create reverse mapping: tokenID -> tokenKey - tokens := res.Tokens().AllocSlice(len(tokenMap)) - for key, tokenID := range tokenMap { + tokens := res.AllocTokens(len(buf.tokenMap)) + for key, tokenID := range buf.tokenMap { tokens[tokenID] = key } @@ -190,9 +183,9 @@ func extractTokens( // groupLIDsByTID groups document IDs by token // Input: flat arrays of (tid, lid) pairs // Output: 2D array where tokenLIDs[tid] = []lid -func groupLIDsByTID(res *Resources, tids, lids []uint32, tokenCount int) [][]uint32 { +func groupLIDsByTID(idx *memIndex, res *Resources, tids, lids []uint32, tokenCount int) [][]uint32 { // Phase 1: Count documents per token - counts := res.Uint32s().AllocSlice(tokenCount) + counts := res.AllocUint32s(tokenCount) clear(counts) for _, tid := range tids { counts[tid]++ @@ -200,13 +193,13 @@ func groupLIDsByTID(res *Resources, tids, lids []uint32, tokenCount int) [][]uin // Phase 2: Allocate slices for each token group // We use a single large buffer and slice it for efficiency - tokenLIDs := res.Uint32Slices().AllocSlice(tokenCount) - buffer := make([]uint32, len(lids)) + tokenLIDs := res.AllocUint32Slices(tokenCount) + 
allTokenLIDs := idx.res.AllocUint32s(len(lids)) tokenLIDs = tokenLIDs[:len(counts)] for tid, count := range counts { - tokenLIDs[tid] = buffer[:count][:0] - buffer = buffer[count:] + tokenLIDs[tid] = allTokenLIDs[:count][:0] + allTokenLIDs = allTokenLIDs[count:] } // Phase 3: Populate groups with document IDs @@ -220,9 +213,9 @@ func groupLIDsByTID(res *Resources, tids, lids []uint32, tokenCount int) [][]uin } // organizeTokens organizes tokens and fields in the index with proper sorting -func organizeTokens(res *Resources, buf *indexBuffer, idx *memIndex, tokens []token, tokenLIDs [][]uint32) { +func organizeTokens(idx *memIndex, res *Resources, buf *indexBuffer, tokens []token, tokenLIDs [][]uint32) { tokenSize := 0 - order := res.Uint32s().AllocSlice(len(tokens)) + order := res.AllocUint32s(len(tokens)) order = order[:len(tokens)] for i, t := range tokens { order[i] = uint32(i) @@ -243,9 +236,9 @@ func organizeTokens(res *Resources, buf *indexBuffer, idx *memIndex, tokens []to prevField := "" // Prepare buffers for sorted data - tokenBuffer := make([]byte, 0, tokenSize) - sortedTokens := make([][]byte, len(order)) - sortedTokenLIDs := make([][]uint32, len(order)) + tokenBuffer := idx.res.AllocBytes(tokenSize)[:0] + idx.tokenLIDs = idx.res.AllocUint32Slices(len(order)) + idx.tokens = idx.res.AllocBytesSlices(len(order)) // Process tokens in sorted order for tid, origIdx := range order { @@ -266,19 +259,16 @@ func organizeTokens(res *Resources, buf *indexBuffer, idx *memIndex, tokens []to // Store in sorted arrays // Note: We use original tokenID as index to preserve tokenID->data mapping - sortedTokens[tid] = tokenBuffer[start:] - sortedTokenLIDs[tid] = tokenLIDs[origIdx] + idx.tokens[tid] = tokenBuffer[start:] + idx.tokenLIDs[tid] = tokenLIDs[origIdx] } // Add sentinel value for easier range calculation buf.fieldTIDs = append(buf.fieldTIDs, uint32(len(tokens))) - // Store in index - idx.tokens = sortedTokens - idx.tokenLIDs = sortedTokenLIDs - // Organize fields 
- fieldBuffer := make([]byte, 0, fieldSize) - idx.fields = make([][]byte, len(buf.fields)) + fieldBuffer := idx.res.AllocBytes(fieldSize)[:0] + idx.fields = idx.res.AllocBytesSlices(len(buf.fields)) + idx.fieldsTokens = make(map[string]tokenRange, len(buf.fields)) for i, field := range buf.fields { @@ -304,7 +294,7 @@ func decompressMeta(res *Resources, block storage.DocBlock, sw *stopwatch.Stopwa defer m.Stop() // Allocate exact size needed for compressed data - buffer := res.Bytes().AllocSlice(int(block.RawLen())) + buffer := res.AllocBytes(int(block.RawLen())) payload, err := block.DecompressTo(buffer) if err != nil { return nil, err @@ -327,7 +317,7 @@ func decodeMetadata(res *Resources, buf *indexBuffer, payload []byte, sw *stopwa } // Second pass: decode each metadata entry - meta := res.Metadata().AllocSlice(len(buf.sizes)) + meta := res.AllocMetadata(len(buf.sizes)) for i, size := range buf.sizes { // Skip size field to get to actual data data := payload[uint32Size : size+uint32(uint32Size)] diff --git a/frac/active2/indexer_test.go b/frac/active2/indexer_test.go index 0e2c9cb3..ee092d01 100644 --- a/frac/active2/indexer_test.go +++ b/frac/active2/indexer_test.go @@ -55,11 +55,11 @@ func BenchmarkIndexer(b *testing.B) { wg := sync.WaitGroup{} for _, meta := range bulks { wg.Add(1) - idx.Index(meta, func(index *memIndex, err error) { + idx.Index(meta, func(idx *memIndex, err error) { if err != nil { logger.Fatal("bulk indexing error", zap.Error(err)) } - active.addIndex(index) + active.indexes.Add(idx, 0, 0) wg.Done() }) } @@ -98,18 +98,18 @@ func BenchmarkMerge(b *testing.B) { wg := sync.WaitGroup{} for _, meta := range bulks { wg.Add(1) - idx.Index(meta, func(index *memIndex, err error) { + idx.Index(meta, func(idx *memIndex, err error) { if err != nil { logger.Fatal("bulk indexing error", zap.Error(err)) } - active.addIndex(index) + active.indexes.Add(idx, 0, 0) wg.Done() }) } wg.Wait() b.StartTimer() - active.indexes.MergeAll() + active.merger.MergeAll() 
} } @@ -131,24 +131,29 @@ func BenchmarkFullWrite(b *testing.B) { idx := NewIndexer(8) allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) - readers := splitLogsToBulks(allLogs, 2000) + readers := splitLogsToBulks(allLogs, 1000) assert.NoError(b, err) - params := defaultSealingParams() - processor := getTestProcessor() - allDocs := make([][]byte, 0, len(readers)) - allMeta := make([][]byte, 0, len(readers)) - for _, readNext := range readers { - _, docs, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) - allDocs = append(allDocs, storage.CompressDocBlock(docs, nil, 1)) - allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) + + n := 2 + allDocs := make([][]byte, 0, len(readers)*n) + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { + for _, readNext := range readers { + _, docs, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + allDocs = append(allDocs, storage.CompressDocBlock(docs, nil, 1)) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) + } } + params := defaultSealingParams() + for b.Loop() { active := New( filepath.Join(b.TempDir(), "test"), - &frac.Config{}, + &frac.Config{SkipSortDocs: true}, idx, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), @@ -158,8 +163,10 @@ func BenchmarkFullWrite(b *testing.B) { wg := sync.WaitGroup{} for i, meta := range allMeta { wg.Add(1) - err := active.Append(allDocs[i], meta, &wg) - assert.NoError(b, err) + go func() { + err := active.Append(allDocs[i], meta, &wg) + assert.NoError(b, err) + }() } wg.Wait() @@ -168,6 +175,7 @@ func BenchmarkFullWrite(b *testing.B) { sealed, err := sealing.Seal(src, params) require.NoError(b, err) assert.Greater(b, int(sealed.Info.DocsTotal), 0) + active.Release() } } diff --git a/frac/active2/mem_index.go b/frac/active2/mem_index.go index 36442f2f..f53c9ed8 100644 --- a/frac/active2/mem_index.go +++ b/frac/active2/mem_index.go @@ -2,6 +2,7 @@ package active2 import ( 
"sort" + "sync" "github.com/ozontech/seq-db/seq" ) @@ -24,48 +25,50 @@ type memIndex struct { docsSize uint64 docsCount uint32 + wg sync.WaitGroup res *Resources release func() } func newMemIndex() *memIndex { - res, release := NewResources() + res, release := AcquireResources() return &memIndex{ res: res, release: release, } } -func (index *memIndex) getTokenProvider(field string) *tokenProvider { - if r, ok := index.fieldsTokens[field]; ok { +func (idx *memIndex) getTokenProvider(field string) *tokenProvider { + if r, ok := idx.fieldsTokens[field]; ok { return &tokenProvider{ firstTID: r.start, lastTID: r.start + r.count - 1, - tokens: index.tokens, + tokens: idx.tokens, } } // Field is not indexed - return empty token provider return &tokenProvider{ firstTID: 1, lastTID: 0, // firstTID > lastTID = no tokens available - tokens: index.tokens, + tokens: idx.tokens, } } -func (index *memIndex) IsIntersecting(from, to seq.MID) bool { - maxMID := index.ids[0].MID - minMID := index.ids[len(index.ids)-1].MID +func (idx *memIndex) IsIntersecting(from, to seq.MID) bool { + maxMID := idx.ids[0].MID + minMID := idx.ids[len(idx.ids)-1].MID if to < minMID || maxMID < from { return false } return true } -func (index *memIndex) GetLIDByID(id seq.ID) (uint32, bool) { - i, ok := sort.Find(len(index.ids), func(i int) int { return seq.Compare(index.ids[i], id) }) +func (idx *memIndex) GetLIDByID(id seq.ID) (uint32, bool) { + i, ok := sort.Find(len(idx.ids), func(i int) int { return seq.Compare(idx.ids[i], id) }) return uint32(i + 1), ok } -func (index *memIndex) Release() { - // index.release() +func (idx *memIndex) Release() { + idx.wg.Wait() + idx.release() } diff --git a/frac/active2/mem_index_pool.go b/frac/active2/mem_index_pool.go index 13be7073..9d43a0de 100644 --- a/frac/active2/mem_index_pool.go +++ b/frac/active2/mem_index_pool.go @@ -6,6 +6,7 @@ import ( "sync/atomic" "github.com/alecthomas/units" + "github.com/ozontech/seq-db/frac" ) // memIndexExt contains index 
metadata for merge management @@ -17,6 +18,7 @@ type memIndexExt struct { type memIndexPool struct { mu sync.RWMutex + info *frac.Info indexes []*memIndex readyToMerge map[uint64]memIndexExt underMerging map[uint64]memIndexExt @@ -25,8 +27,9 @@ type memIndexPool struct { counter atomic.Uint64 // atomic counter for generating index IDs } -func newIndexPool() *memIndexPool { +func NewIndexPool(info *frac.Info) *memIndexPool { return &memIndexPool{ + info: info, readyToMerge: make(map[uint64]memIndexExt), underMerging: make(map[uint64]memIndexExt), @@ -34,21 +37,62 @@ func newIndexPool() *memIndexPool { } } -func (p *memIndexPool) Indexes() []*memIndex { +type indexSnapshot struct { + info *frac.Info + indexes []*memIndex +} + +func (p *memIndexPool) Snapshot() (*indexSnapshot, func()) { + p.mu.RLock() + defer p.mu.RUnlock() + + info := *p.info // copy + iss := indexSnapshot{ + info: &info, + indexes: make([]*memIndex, len(p.indexes)), + } + for i, idx := range p.indexes { + iss.indexes[i] = idx + idx.wg.Add(1) + } + + return &iss, func() { + for _, idx := range iss.indexes { + idx.wg.Done() + } + } +} + +func (p *memIndexPool) Info() *frac.Info { p.mu.RLock() defer p.mu.RUnlock() - return p.indexes + info := *p.info // copy + return &info } -func (p *memIndexPool) Add(index *memIndex) { - metaIndex := p.wrapIndex(index) +func (p *memIndexPool) Add(idx *memIndex, docsLen, metaLen uint64) { + maxMID := idx.ids[0].MID + minMID := idx.ids[len(idx.ids)-1].MID + idxExt := p.wrapIndex(idx) p.mu.Lock() defer p.mu.Unlock() - p.readyToMerge[metaIndex.id] = metaIndex - p.indexes = append(p.indexes, index) + if p.info.From > minMID { + p.info.From = minMID + } + if p.info.To < maxMID { + p.info.To = maxMID + } + p.info.DocsRaw += idx.docsSize + p.info.DocsTotal += idx.docsCount + + p.info.DocsOnDisk += docsLen + p.info.MetaOnDisk += metaLen + + p.readyToMerge[idxExt.id] = idxExt + p.indexes = append(p.indexes, idx) } func (p *memIndexPool) ReadyToMerge() []memIndexExt { @@ 
-88,15 +132,27 @@ func (p *memIndexPool) replace(oldIndexes []memIndexExt, newIndex *memIndex) { p.indexes = p.indexes[:0] p.indexes = slices.Grow(p.indexes, len(p.readyToMerge)+len(p.underMerging)) - for _, metaIndex := range p.readyToMerge { - p.indexes = append(p.indexes, metaIndex.index) // add all ready indexes + for _, idxExt := range p.readyToMerge { + p.indexes = append(p.indexes, idxExt.index) // add all ready indexes } - for _, metaIndex := range p.underMerging { - p.indexes = append(p.indexes, metaIndex.index) // add indexes currently being merged + for _, idxExt := range p.underMerging { + p.indexes = append(p.indexes, idxExt.index) // add indexes currently being merged } - for _, metaIndex := range oldIndexes { - metaIndex.index.Release() + go func() { + for _, idxExt := range oldIndexes { + idxExt.index.Release() + } + }() +} + +func (p *memIndexPool) Release() { + p.mu.RLock() + indexes := p.indexes + p.mu.RUnlock() + + for _, idx := range indexes { + idx.Release() } } diff --git a/frac/active2/merge.go b/frac/active2/merge.go index 9b5b1c67..13702261 100644 --- a/frac/active2/merge.go +++ b/frac/active2/merge.go @@ -12,43 +12,34 @@ import ( func mergeIndexes(indexes []*memIndex) *memIndex { docsCount := 0 blocksCount := 0 - fieldsCount := 0 docsSize := uint64(0) for _, index := range indexes { docsSize += index.docsSize docsCount += len(index.ids) - fieldsCount += len(index.fields) blocksCount += len(index.blocksOffsets) } - dst := &memIndex{ - ids: make([]seq.ID, 0, docsCount), - positions: make([]seq.DocPos, 0, docsCount), - fieldsTokens: make(map[string]tokenRange, fieldsCount), - blocksOffsets: make([]uint64, 0, blocksCount), - docsSize: docsSize, - docsCount: uint32(docsCount), - } + res, release := AcquireResources() + defer release() - posMap := mergeBlocksOffsets(dst, indexes) - lidsMap := mergeIDs(dst, indexes, posMap) - mergeTokens(dst, indexes, lidsMap) + dst := newMemIndex() + dst.docsCount = uint32(docsCount) + dst.ids = 
dst.res.AllocIDs(docsCount)[:0] + dst.positions = dst.res.AllocDocPos(docsCount)[:0] + dst.blocksOffsets = dst.res.AllocUint64s(blocksCount)[:0] + dst.docsSize = docsSize - dst.allTID = dst.fieldsTokens[seq.TokenAll].start + posMap := mergeBlocksOffsets(dst, res, indexes) + lidsMap := mergeIDs(dst, res, indexes, posMap) + mergeTokens(dst, res, indexes, lidsMap) - // todo - // if len(doubles) > 0 { - // dst.docsCount = uint32(len(doubles)) - // logger.Warn("there are duplicate IDs when compaction", zap.Int("doubles", len(doubles))) - // } + dst.allTID = dst.fieldsTokens[seq.TokenAll].start return dst } -func mergeIDs(dst *memIndex, indexes []*memIndex, posMap [][]seq.DocPos) [][]uint32 { - // todo doubles := []seq.ID{} - - lidsMap := make([][]uint32, len(indexes)) +func mergeIDs(dst *memIndex, res *Resources, indexes []*memIndex, posMap [][]seq.DocPos) [][]uint32 { + lidsMap := res.AllocUint32Slices(len(indexes)) iters := make([]IOrderedIterator[IDIteratorItem], len(indexes)) for i, idx := range indexes { iters[i] = &IDIterator{ @@ -56,13 +47,12 @@ func mergeIDs(dst *memIndex, indexes []*memIndex, posMap [][]seq.DocPos) [][]uin idx: idx, posMap: posMap[i], } - lidsMap[i] = make([]uint32, 0, len(idx.ids)) + lidsMap[i] = res.uint32s.AllocSlice(int(idx.docsCount))[:0] } orderedIDs := MergeKSortIterators(iters, func(a, b IDIteratorItem) int { return seq.Compare(b.id, a.id) }) cur, has := orderedIDs.Next() - for has { dst.ids = append(dst.ids, cur.id) dst.positions = append(dst.positions, cur.pos) @@ -73,7 +63,7 @@ func mergeIDs(dst *memIndex, indexes []*memIndex, posMap [][]seq.DocPos) [][]uin return lidsMap } -func mergeTokens(dst *memIndex, indexes []*memIndex, lidsMap [][]uint32) { +func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][]uint32) { allCount := 0 totalTokens := 0 totalLIDsSize := 0 @@ -108,7 +98,7 @@ func mergeTokens(dst *memIndex, indexes []*memIndex, lidsMap [][]uint32) { prevToken TokenIteratorItem ) - borders := 
make([]uint8, 0, totalTokens) + borders := res.AllocBytes(totalTokens)[:0] items := make([]TokenIteratorItem, 0, totalTokens) for cur, has := orderedTokens.Next(); has; cur, has = orderedTokens.Next() { @@ -133,17 +123,18 @@ func mergeTokens(dst *memIndex, indexes []*memIndex, lidsMap [][]uint32) { prevToken = cur } - dst.fields = make([][]byte, 0, uniqFieldsCount) - dst.tokens = make([][]byte, 0, uniqTokensCount) - dst.tokenLIDs = make([][]uint32, 0, uniqTokensCount) + dst.fieldsTokens = make(map[string]tokenRange, uniqFieldsCount) + dst.fields = dst.res.AllocBytesSlices(uniqFieldsCount)[:0] + dst.tokens = dst.res.AllocBytesSlices(uniqTokensCount)[:0] + dst.tokenLIDs = dst.res.AllocUint32Slices(uniqTokensCount)[:0] - allTokens := make([]byte, 0, uniqTokensSize) - allFields := make([]byte, 0, uniqFieldsSize) - tokenRanges := make([]tokenRange, 0, uniqFieldsCount) + allTokens := dst.res.AllocBytes(uniqTokensSize)[:0] + allFields := dst.res.AllocBytes(uniqFieldsSize)[:0] + allTokenLIDs := dst.res.AllocUint32s(totalLIDsSize)[:0] - var isAllToken bool - lidsCollector := NewLIDsCollector(totalLIDsSize, allCount) + lidsCollector := NewLIDsCollector(allTokenLIDs, genAllLIDs(res, allCount)) + var isAllToken bool for i, item := range items { if borders[i] > 0 { @@ -167,10 +158,8 @@ func mergeTokens(dst *memIndex, indexes []*memIndex, lidsMap [][]uint32) { field := allFields[start:] dst.fields = append(dst.fields, field) - tokenRanges = append(tokenRanges, tokenRange{start: tid}) fieldStr := util.ByteToStringUnsafe(field) - tr := tokenRanges[len(tokenRanges)-1] - dst.fieldsTokens[fieldStr] = tr + dst.fieldsTokens[fieldStr] = tokenRange{start: tid} isAllToken = fieldStr == seq.TokenAll } @@ -210,16 +199,20 @@ type LIDsCollector struct { bitmap *roaring.Bitmap } -func NewLIDsCollector(size, allCount int) *LIDsCollector { - ls := &LIDsCollector{ - lids: make([]uint32, 0, size), - all: make([]uint32, allCount), - bitmap: roaring.New(), +func genAllLIDs(res *Resources, s int) 
[]uint32 { + all := res.AllocUint32s(s) + for i := range all { + all[i] = uint32(i) + 1 } - for i := range allCount { - ls.all[i] = uint32(i) + 1 + return all +} + +func NewLIDsCollector(allTokenLIDs, all []uint32) *LIDsCollector { + return &LIDsCollector{ + lids: allTokenLIDs[:0], + all: all, + bitmap: roaring.New(), } - return ls } func (s *LIDsCollector) Add(lid uint32) { @@ -230,8 +223,8 @@ func (s *LIDsCollector) GetSorted() (dst []uint32) { dst = s.lids[s.offset:] if len(dst) == len(s.all) { - dst = s.all - s.lids = s.lids[:s.offset] + s.lids = append(s.lids[:s.offset], s.all...) + s.offset = len(s.lids) return dst } @@ -248,12 +241,12 @@ func (s *LIDsCollector) GetSorted() (dst []uint32) { return dst } -func mergeBlocksOffsets(dst *memIndex, indexes []*memIndex) [][]seq.DocPos { +func mergeBlocksOffsets(dst *memIndex, res *Resources, indexes []*memIndex) [][]seq.DocPos { var offset uint32 - positions := make([][]seq.DocPos, len(indexes)) + positions := res.AllocDocPosSlices(len(indexes)) for i, index := range indexes { dst.blocksOffsets = append(dst.blocksOffsets, index.blocksOffsets...) 
- positions[i] = make([]seq.DocPos, 0, len(index.positions)) + positions[i] = res.AllocDocPos(len(index.positions))[:0] for _, p := range index.positions { oldIdx, docOffset := p.Unpack() positions[i] = append(positions[i], seq.PackDocPos(oldIdx+offset, docOffset)) diff --git a/frac/active2/merge_manager.go b/frac/active2/merge_manager.go index 6b96f78b..d5cc9b39 100644 --- a/frac/active2/merge_manager.go +++ b/frac/active2/merge_manager.go @@ -5,12 +5,12 @@ import ( ) const ( - minIndexesToMerge = 4 // minimum number of indexes to trigger merge - forceMergeThreshold = 64 // index count threshold for forced merge - tierSizeDeltaPercent = 10 // percentage difference between size tiers - firstTierMaxSizeKb = 8 // maximum size of the first tier - maxTierCount = 1000 // todo maximum number of size tiers allowed - bucketSizePercent = 50 // percentage difference between size buckets + minIndexesToMerge = 4 // minimum number of indexes to trigger merge + forceMergeThreshold = 64 // index count threshold for forced merge + firstTierMaxSizeKb = 8 // maximum size of the first tier + maxTierCount = 64 // maximum number of size tiers allowed + tierSizeDeltaPercent = 25 // percentage difference between size tiers + bucketSizePercent = 50 // percentage difference between size buckets ) // MergeManager manages in-memory index collection and merging @@ -25,12 +25,12 @@ type MergeManager struct { mergeCh chan struct{} // channel to trigger merge process } -// newMergeManager creates a new index manager -func newMergeManager(maxConcurrentMerges int) *MergeManager { +// NewMergeManager creates a new index manager +func NewMergeManager(indexes *memIndexPool, maxConcurrentMerges int) *MergeManager { m := MergeManager{ + indexes: indexes, workers: make(chan struct{}, maxConcurrentMerges), mergeCh: make(chan struct{}, 1), - indexes: newIndexPool(), } // Start background goroutine for merge scheduling @@ -52,43 +52,27 @@ func (m *MergeManager) Stop() { } // MergeAll performs full merge of 
all available indexes -func (m *MergeManager) MergeAll() *memIndex { +func (m *MergeManager) MergeAll() { m.mu.Lock() defer m.mu.Unlock() m.wg.Wait() - if len(m.indexes.indexes) == 1 { - return m.indexes.indexes[0] + if toMerge := m.indexes.ReadyToMerge(); len(toMerge) > 1 { + m.indexes.markAsMerging(toMerge) + merged := mergeIndexes(extractIndexes(toMerge)) + m.indexes.replace(toMerge, merged) } - - // todo обработать случай когда нет индексов вообще - - indexesToMerge := m.indexes.ReadyToMerge() - m.indexes.markAsMerging(indexesToMerge) - mergedIndex := mergeIndexes(extractIndexes(indexesToMerge)) - m.indexes.replace(indexesToMerge, mergedIndex) - - return mergedIndex } -func extractIndexes(metadataList []memIndexExt) []*memIndex { - result := make([]*memIndex, 0, len(metadataList)) - for _, metadata := range metadataList { - result = append(result, metadata.index) +func extractIndexes(indexesExt []memIndexExt) []*memIndex { + result := make([]*memIndex, 0, len(indexesExt)) + for _, eIdx := range indexesExt { + result = append(result, eIdx.index) } return result } -func (m *MergeManager) Indexes() []*memIndex { - return m.indexes.Indexes() -} - -func (m *MergeManager) Add(index *memIndex) { - m.indexes.Add(index) - m.triggerMerge() -} - // prepareForMerging prepares index groups for merging func (m *MergeManager) prepareForMerging() [][]memIndexExt { m.mu.Lock() @@ -98,7 +82,7 @@ func (m *MergeManager) prepareForMerging() [][]memIndexExt { return nil } - mergeCandidates := selectForMerge(m.indexes.ReadyToMerge(), minIndexesToMerge) + mergeCandidates := pickMergeCandidates(m.indexes.ReadyToMerge(), minIndexesToMerge) for i, candidateGroup := range mergeCandidates { if !m.acquireWorker() { // no free workers diff --git a/frac/active2/merge_strategy.go b/frac/active2/merge_strategy.go index 3fd2f80f..5b4eb181 100644 --- a/frac/active2/merge_strategy.go +++ b/frac/active2/merge_strategy.go @@ -2,157 +2,152 @@ package active2 import ( "math" + + 
"github.com/ozontech/seq-db/logger" + "go.uber.org/zap" ) -/* -ПРИНЦИП ВЫБОРА КАНДИДАТОВ ДЛЯ СЛИЯНИЯ - -1. ИСХОДНЫЕ ДАННЫЕ: - items (индексы) → сгруппированы по ТИРАМ (tiers) - - Пример: 10 индексов распределены по 7 тирам - - │ Tier 0 │ Tier 1 │ Tier 2 │ Tier 3 │ Tier 4 │ Tier 5 │ Tier 6 │ - ├────────┼────────┼────────┼────────┼────────┼────────┼────────┤ - │ 1 │ 2 │ 0 │ 3 │ 1 │ 2 │ 1 │ - └────────┴────────┴────────┴────────┴────────┴────────┴────────┘ - -2. ПОСТРОЕНИЕ РАСПРЕДЕЛЕНИЯ (buildTiersDistribution): - Считаем количество индексов в каждом тире - -3. ПОИСК ОКНА (mostPopulatedTiersRange): - Скользящее окно размером winSize (по умолчанию 2 тира) - - winSize = round(bucketSizePercent / tierSizeDeltaPercent) - Пример: 50% / 25% = 2 тира - - ┌─────────────────────────────────────────────────────┐ - │ Скользящее окно (размер = 2 тира) │ - ├─────────────────────────────────────────────────────┤ - │ Окно 1: │ Tier 0 + Tier 1 │ = 1 + 2 = 3 элементов │ - │ Окно 2: │ Tier 1 + Tier 2 │ = 2 + 0 = 2 элементов │ - │ Окно 3: │ Tier 2 + Tier 3 │ = 0 + 3 = 3 элементов │ - │ Окно 4: │ Tier 3 + Tier 4 │ = 3 + 1 = 4 элементов | ← max! - │ Окно 5: │ Tier 4 + Tier 5 │ = 1 + 2 = 3 элементов │ - │ Окно 6: │ Tier 5 + Tier 6 │ = 2 + 1 = 3 элементов │ - └─────────────────────────────────────────────────────┘ - - Найденное окно: Tier 3-4 с 4 элементами - Если элементов ≥ minToMerge → успех! - -4. ПРАВИЛА ВЫБОРА: - ┌─────────────────────────────────────────────────────┐ - │ Условие 1: элементов в окне ≥ minToMerge? │ - │ Да → берём это окно │ - │ Нет → переходим к условию 2 │ - ├─────────────────────────────────────────────────────┤ - │ Условие 2: findAtAnyCost = true? │ - │ (len(items) >= forceMergeThreshold) │ - │ Да → увеличиваем winSize в 2 раза │ - │ и ищем снова │ - │ Нет → возвращаем пустой результат │ - └─────────────────────────────────────────────────────┘ - -5. 
ВЫДЕЛЕНИЕ КАНДИДАТОВ (extractIndexesInRange): - Берём все индексы из найденного диапазона тиров - - Пример для окна Tier 3-4: - ┌─────────────────────────────────────────┐ - │ До: [1, 2, 0, 3, 1, 2, 1] │ - │ Выбор: ██ ██ │ - │ Результат: 3 элемента из Tier 3 │ - │ + 1 элемент из Tier 4 │ - │ = 4 элемента всего │ - └─────────────────────────────────────────┘ - -6. ПОВТОРЕНИЕ ПРОЦЕССА: - Удаляем выбранные элементы из распределения - Повторяем поиск, пока не останется окон - с достаточным количеством элементов - - ┌─────────────────────────────────────────┐ - │ 1-я итерация: выбрали Tier 3-4 (4 elem) │ - │ 2-я итерация: │ - │ Распределение: [1, 2, 0, 0, 0, 2, 1] │ - │ Находим новое окно... │ - └─────────────────────────────────────────┘ - -*/ - -// selectForMerge selects merge candidates based on their size. -// It groups items into sets within which the sizes of the items do not differ -// by more than a specified limit in percent (e.g. 50%) -func selectForMerge(items []memIndexExt, minToMerge int) [][]memIndexExt { - if len(items) < minToMerge { +// Algorithm for selecting indexes for merging (merge): +// +// General concept: +// Indexes are grouped into "tiers" - levels based on their size. +// Merging is performed for indexes from adjacent tiers to minimize +// the size of the resulting index and avoid frequent rebuilds. + +// pickMergeCandidates selects groups of indexes for merging based on their tier. +// items - slice of indexes to analyze. +// minMerge - minimum number of indexes that can be merged. +// Returns a slice of index slices - groups for merging. 
+func pickMergeCandidates(items []memIndexExt, minMerge int) [][]memIndexExt { + if len(items) < minMerge { return nil } - tiersDist := buildTiersDistribution(items) - findAtAnyCost := len(items) >= forceMergeThreshold - winSize := int(math.Round(float64(bucketSizePercent) / tierSizeDeltaPercent)) + remains := len(items) - var res [][]memIndexExt - for { - countInRange, firstTier, lastTier := mostPopulatedTiersRange(tiersDist, minToMerge, winSize, findAtAnyCost) - if countInRange == 0 { + dist := groupByTier(items) + + // win - size of the "sliding window" in number of tiers. + // bucketSizePercent/tierSizeDeltaPercent determines how many tiers + // to consider as one group when searching for merge candidates. + win := int(math.Round(float64(bucketSizePercent) / tierSizeDeltaPercent)) + + var batches [][]memIndexExt + + for remains > 1 { + // forceMerge - flag for forced merging, activated when there are too many indexes. + forceMerge := remains >= forceMergeThreshold + + // Find the most populated range of tiers. + // batchSize - number of indexes in the found range. + // first, last - boundaries of the tier range. + batchSize, first, last := findBestRange(dist, minMerge, win, forceMerge) + + if batchSize == 0 { break } - buf := make([]memIndexExt, 0, countInRange) - res = append(res, extractIndexesInRange(items, buf, firstTier, lastTier, tiersDist)) + + remains -= batchSize + buf := make([]memIndexExt, 0, batchSize) + batches = append(batches, takeFromTiers(buf, first, last, dist)) } - return res + return batches } -func buildTiersDistribution(items []memIndexExt) []int { +// groupByTier builds a distribution of indexes by their tiers. +// items - input indexes to distribute. +// Returns a slice of slices, where the outer slice index is the tier number, +// and the value is all indexes of that tier. 
+func groupByTier(items []memIndexExt) [][]memIndexExt { maxTier := 0 - tiersDist := make([]int, maxTierCount) + dist := make([][]memIndexExt, maxTierCount) for _, index := range items { - tiersDist[index.tier]++ + dist[index.tier] = append(dist[index.tier], index) if index.tier > maxTier { maxTier = index.tier } } - return tiersDist[:maxTier+1] + return dist[:maxTier+1] } -func extractIndexesInRange(items, buf []memIndexExt, firstTier, lastTier int, tiersDist []int) []memIndexExt { - for _, index := range items { - if firstTier <= index.tier && index.tier <= lastTier { - buf = append(buf, index) - tiersDist[index.tier]-- - } +// takeFromTiers extracts indexes from the specified range of tiers. +// buf - buffer for collecting indexes (pre-allocated with the required capacity). +// first, last - boundaries of the tier range (inclusive). +// dist - distribution of indexes by tiers. +// Returns a slice of indexes from the specified range. +func takeFromTiers(buf []memIndexExt, first, last int, dist [][]memIndexExt) []memIndexExt { + for tier := first; tier <= last; tier++ { + buf = append(buf, dist[tier]...) + dist[tier] = nil // Clear the distribution cell so these indexes don't participate in subsequent iterations. } return buf } -func mostPopulatedTiersRange(tiersDist []int, minToMerge, winSize int, findAtAnyCost bool) (int, int, int) { - var lastWinTier, maxWinSum int +// findBestRange searches for the most populated range of tiers. +// dist - distribution of indexes by tiers. +// minMerge - minimum number of indexes required for merging. +// win - window size (number of tiers in the range). +// forceMerge - flag for forced search (expands the window if unsuccessful). +// Returns: number of indexes, first tier, last tier. +func findBestRange(dist [][]memIndexExt, minMerge, win int, forceMerge bool) (int, int, int) { + var bestEnd, bestSum int for { - lastWinTier, maxWinSum = findMaxSumWindow(tiersDist, winSize) - if maxWinSum >= minToMerge { // got it! 
- break + if bestEnd, bestSum = locateBestWindow(dist, win); bestSum == 0 { // Find the window with the maximum sum of indexes. + return 0, 0, 0 + } + + if bestSum >= minMerge { + first := max(0, bestEnd-win) + last := bestEnd + return bestSum, first, last } - if findAtAnyCost { // expand window size and find again - // todo добавить логирования! - winSize *= 2 - continue + + if !forceMerge { + return 0, 0, 0 } - return 0, 0, 0 + + logger.Warn("insufficient indexes for merge, expanding window", + zap.Int("win_before", win), + zap.Int("win_after", win*2), + zap.Int("found", bestSum), + zap.Int("required", minMerge), + ) + win *= 2 } +} + +// locateBestWindow finds the window (range of tiers) with the maximum number of indexes. +// dist - distribution of indexes by tiers. +// winSize - window size (number of tiers). +// Returns: the tier where the window with the maximum sum ends, +// and the maximum sum itself. +func locateBestWindow(dist [][]memIndexExt, winSize int) (int, int) { + maxCount := 0 + bestEnd := 0 - firstTier := max(0, lastWinTier-winSize) - lastTier := lastWinTier + win := winSum{buf: make([]int, winSize)} - return maxWinSum, firstTier, lastTier + for tier, items := range dist { + win.Add(len(items)) + if win.Total() >= maxCount { + bestEnd = tier + maxCount = win.sum + } + } + return bestEnd, maxCount } -// sliding window sum +// winSum - structure for implementing a sliding window sum calculation. +// Used for efficiently calculating the sum within a fixed-size window. type winSum struct { - buf []int - sum int - pos int + buf []int // buffer to store values in the window. + sum int // current sum of values in the window. + pos int // current position in the ring buffer. } +// Add adds a new value to the sliding window. +// v - new value to add. +// The method updates the sum: removes the oldest value and adds the new one. 
func (w *winSum) Add(v int) { w.sum += v - w.buf[w.pos] w.buf[w.pos] = v @@ -162,17 +157,6 @@ func (w *winSum) Add(v int) { } } -func findMaxSumWindow(tiersDist []int, winSize int) (int, int) { - maxWinSum := 0 - lastWinTier := 0 - win := winSum{buf: make([]int, winSize)} - - for tier, size := range tiersDist { - win.Add(size) - if win.sum >= maxWinSum { - lastWinTier = tier - maxWinSum = win.sum - } - } - return lastWinTier, maxWinSum +func (w *winSum) Total() int { + return w.sum } diff --git a/frac/active2/resources.go b/frac/active2/resources.go index 7dde6a5c..113ac6c6 100644 --- a/frac/active2/resources.go +++ b/frac/active2/resources.go @@ -10,9 +10,9 @@ import ( var ( tokenKeyPool = resources.NewSizedPool[token](24) indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](24) - tokenMapPool = resources.TypedPool[map[token]uint32]{} - resourcesPool = resources.TypedPool[*Resources]{} + docPosSlicesPool = resources.NewSizedPool[[]seq.DocPos](24) bufPool = resources.TypedPool[*indexBuffer]{} + resPool = resources.TypedPool[*Resources]{} ) // Resources provides pooled memory allocation for index construction. 
@@ -27,75 +27,76 @@ type Resources struct { uint32Slices resources.SliceAllocator[[]uint32] tokenKeys resources.SliceAllocator[token] indexerMetaData resources.SliceAllocator[indexer.MetaData] - tokenMap resources.MapAllocator[token, uint32] buf resources.ObjectAllocator[indexBuffer] ids resources.SliceOnBytes[seq.ID] docPos resources.SliceOnBytes[seq.DocPos] + docPosSlices resources.SliceAllocator[[]seq.DocPos] } -func NewResources() (*Resources, func()) { - r, ok := resourcesPool.Get() +func AcquireResources() (*Resources, func()) { + r, ok := resPool.Get() if !ok { s := resources.CallStack{} r = &Resources{ - releases: &s, + releases: &s, + + bytes: resources.NewBytes(&s), uint32s: resources.NewUint32s(&s), uint64s: resources.NewUint64s(&s), - bytes: resources.NewBytes(&s), uint32Slices: resources.NewUint32Slices(&s), bytesSlices: resources.NewBytesSlices(&s), + ids: resources.NewSliceOnBytes[seq.ID](&s), + docPos: resources.NewSliceOnBytes[seq.DocPos](&s), + docPosSlices: resources.NewSliceAllocator(&docPosSlicesPool, &s), indexerMetaData: resources.NewSliceAllocator(&indexerMetaDataPool, &s), tokenKeys: resources.NewSliceAllocator(&tokenKeyPool, &s), - tokenMap: resources.NewMapAllocator(&tokenMapPool, &s), buf: resources.NewObjectAllocator(&bufPool, &s), - ids: resources.NewSliceOnBytes[seq.ID](&s), - docPos: resources.NewSliceOnBytes[seq.DocPos](&s), } } return r, func() { r.releases.CallAll() - resourcesPool.Put(r) + resPool.Put(r) } } -func (r *Resources) BytesSlices() resources.SliceAllocator[[]byte] { - return r.bytesSlices +func (r *Resources) AllocBytesSlices(s int) [][]byte { + return r.bytesSlices.AllocSlice(s) } -func (r *Resources) Bytes() resources.SliceAllocator[byte] { - return r.bytes +func (r *Resources) AllocBytes(s int) []byte { + return r.bytes.AllocSlice(s) } -func (r *Resources) Uint32s() resources.SliceOnBytes[uint32] { - return r.uint32s +func (r *Resources) AllocUint32s(s int) []uint32 { + return r.uint32s.AllocSlice(s) } -func (r 
*Resources) IDs() resources.SliceOnBytes[seq.ID] { - return r.ids +func (r *Resources) AllocIDs(s int) []seq.ID { + return r.ids.AllocSlice(s) } -func (r *Resources) DocPos() resources.SliceOnBytes[seq.DocPos] { - return r.docPos +func (r *Resources) AllocDocPos(s int) []seq.DocPos { + return r.docPos.AllocSlice(s) } -func (r *Resources) Uint64s() resources.SliceOnBytes[uint64] { - return r.uint64s +func (r *Resources) AllocDocPosSlices(s int) [][]seq.DocPos { + return r.docPosSlices.AllocSlice(s) } -func (r *Resources) Uint32Slices() resources.SliceAllocator[[]uint32] { - return r.uint32Slices +func (r *Resources) AllocUint64s(s int) []uint64 { + return r.uint64s.AllocSlice(s) } -func (r *Resources) Metadata() resources.SliceAllocator[indexer.MetaData] { - return r.indexerMetaData +func (r *Resources) AllocUint32Slices(s int) [][]uint32 { + return r.uint32Slices.AllocSlice(s) } -func (r *Resources) Tokens() resources.SliceAllocator[token] { - return r.tokenKeys +func (r *Resources) AllocMetadata(s int) []indexer.MetaData { + return r.indexerMetaData.AllocSlice(s) } -func (r *Resources) TokenMap() resources.MapAllocator[token, uint32] { - return r.tokenMap +func (r *Resources) AllocTokens(s int) []token { + return r.tokenKeys.AllocSlice(s) } func (r *Resources) Buffer() *indexBuffer { @@ -105,12 +106,14 @@ func (r *Resources) Buffer() *indexBuffer { fields: make([]string, 0, 100), fieldTIDs: make([]uint32, 0, 100), tokens: make([]tokenizer.MetaToken, 0, 100), + tokenMap: make(map[token]uint32, 1000), } }, func(b *indexBuffer) { b.fields = b.fields[:0] b.tokens = b.tokens[:0] b.fieldTIDs = b.fieldTIDs[:0] b.sizes = b.sizes[:0] + clear(b.tokenMap) }) } @@ -121,4 +124,5 @@ type indexBuffer struct { fields []string fieldTIDs []uint32 tokens []tokenizer.MetaToken + tokenMap map[token]uint32 } diff --git a/frac/active2/sealing_source.go b/frac/active2/sealing_source.go index 359d91c4..b5f9a685 100644 --- a/frac/active2/sealing_source.go +++ 
b/frac/active2/sealing_source.go @@ -1,6 +1,7 @@ package active2 import ( + "errors" "iter" "math" "time" @@ -28,18 +29,24 @@ type SealingSource struct { } func NewSealingSource(a *Active2, params frac.SealParams) (sealing.Source, error) { - info := *a.info // copy - index := *a.indexes.MergeAll() // copy + a.merger.MergeAll() + + iss, release := a.indexes.Snapshot() + defer release() + + if len(iss.indexes) != 1 { + return nil, errors.New("wrong count of fraction memIndexes") + } ss := &SealingSource{ - info: &info, - index: &index, + info: iss.info, + index: iss.indexes[0], } // Sort documents if not skipped in configuration if !a.Config.SkipSortDocs { - ds := active.NewDocsSource(ss, index.blocksOffsets, &a.sortReader) - blocksOffsets, positions, onDiskSize, err := sealing.SortDocs(info.Path, params, ds) + ds := active.NewDocsSource(ss, ss.index.blocksOffsets, &a.sortReader) + blocksOffsets, positions, onDiskSize, err := sealing.SortDocs(ss.info.Path, params, ds) if err != nil { return nil, err } @@ -50,7 +57,7 @@ func NewSealingSource(a *Active2, params frac.SealParams) (sealing.Source, error ss.info.MetaOnDisk = 0 ss.info.SealingTime = uint64(time.Now().UnixMilli()) - ss.info.BuildDistributionWithIDs(index.ids) + ss.info.BuildDistributionWithIDs(ss.index.ids) return ss, nil } From 84a087977e6e945e9d16e688f97a43c0bde329ca Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Sun, 21 Dec 2025 17:29:23 +0300 Subject: [PATCH 11/28] tune merge strategy: remove tiers and use generations --- frac/active/indexer_test.go | 7 +- frac/active2/active2.go | 31 +++-- frac/active2/indexer.go | 44 ++++--- frac/active2/indexer_test.go | 61 +++++----- frac/active2/iterators.go | 2 +- frac/active2/mem_index.go | 5 +- frac/active2/mem_index_pool.go | 43 +++---- frac/active2/merge.go | 133 ++++++++++----------- frac/active2/merge_manager.go | 138 ++++++++++++---------- frac/active2/merge_strategy.go | 162 -------------------------- frac/active2/resources.go | 10 +-
frac/active2/tiers.go | 80 ------------- frac/tests/fraction_test.go | 18 ++- fracmanager/fracmanager.go | 2 +- fracmanager/fraction_provider.go | 31 +++-- fracmanager/fraction_provider_test.go | 6 +- indexer/processor.go | 2 + util/semaphore.go | 36 ++++++ 18 files changed, 314 insertions(+), 497 deletions(-) delete mode 100644 frac/active2/merge_strategy.go delete mode 100644 frac/active2/tiers.go create mode 100644 util/semaphore.go diff --git a/frac/active/indexer_test.go b/frac/active/indexer_test.go index 509f5878..b483bcd1 100644 --- a/frac/active/indexer_test.go +++ b/frac/active/indexer_test.go @@ -14,6 +14,7 @@ import ( "go.uber.org/zap/zapcore" "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" @@ -80,7 +81,7 @@ func getTestProcessor() *indexer.Processor { func BenchmarkIndexer(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx, stop := NewIndexer(8, 8) + idx, stop := NewIndexer(config.NumCPU, config.NumCPU) defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) @@ -131,7 +132,7 @@ func defaultSealingParams() frac.SealParams { func BenchmarkFullWrite(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx, stop := NewIndexer(8, 8) + idx, stop := NewIndexer(config.NumCPU, config.NumCPU) defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) @@ -140,7 +141,7 @@ func BenchmarkFullWrite(b *testing.B) { processor := getTestProcessor() - n := 2 + n := 10 allDocs := make([][]byte, 0, len(readers)*n) allMeta := make([][]byte, 0, len(readers)*n) diff --git a/frac/active2/active2.go b/frac/active2/active2.go index 5a43743a..d8e841d4 100644 --- a/frac/active2/active2.go +++ b/frac/active2/active2.go @@ -44,12 +44,10 @@ type Active2 struct { writer *active.Writer } -const MergerWorkers = 2 - func New( baseFileName string, cfg *frac.Config, - 
indexer *Indexer, + workers int, readLimiter *storage.ReadLimiter, docsCache *cache.Cache[[]byte], sortCache *cache.Cache[[]byte], @@ -59,14 +57,13 @@ func New( info := frac.NewInfo(baseFileName, uint64(docsStats.Size()), uint64(metaStats.Size())) indexes := NewIndexPool(info) - merger := NewMergeManager(indexes, MergerWorkers) f := &Active2{ BaseFileName: baseFileName, Config: cfg, - indexer: indexer, + indexer: NewIndexer(util.NewSemaphore(workers)), + merger: NewMergeManager(indexes, util.NewSemaphore(workers)), indexes: indexes, - merger: merger, docsFile: docsFile, docsCache: docsCache, @@ -130,11 +127,7 @@ out: wg.Add(1) f.indexer.Index(meta, func(idx *memIndex, err error) { - if err != nil { - logger.Fatal("bulk indexing error", zap.Error(err)) - } - f.indexes.Add(idx, 0, 0) - f.merger.triggerMerge() + f.AddIndex(idx, 0, 0, err) wg.Done() }) } @@ -165,14 +158,12 @@ func (f *Active2) Append(docs, meta []byte, wg *sync.WaitGroup) (err error) { } mi := sw.Start("send_to_indexer") + f.indexer.Index(meta, func(idx *memIndex, err error) { - if err != nil { - logger.Fatal("bulk indexing error", zap.Error(err)) - } - f.indexes.Add(idx, uint64(len(docs)), uint64(len(meta))) - f.merger.triggerMerge() + f.AddIndex(idx, uint64(len(docs)), uint64(len(meta)), err) wg.Done() }) + mi.Stop() ma.Stop() @@ -180,6 +171,14 @@ func (f *Active2) Append(docs, meta []byte, wg *sync.WaitGroup) (err error) { return nil } +func (f *Active2) AddIndex(idx *memIndex, docsLen, metaLen uint64, err error) { + if err != nil { + logger.Fatal("bulk indexing error", zap.Error(err)) + } + f.indexes.Add(idx, docsLen, metaLen) + f.merger.triggerMerge() +} + func (f *Active2) String() string { return frac.FracToString(f, "active") } diff --git a/frac/active2/indexer.go b/frac/active2/indexer.go index 9fcbb43a..6eebd55c 100644 --- a/frac/active2/indexer.go +++ b/frac/active2/indexer.go @@ -18,22 +18,22 @@ const uint32Size = uint32(unsafe.Sizeof(uint32(0))) // Indexer indexes documents with 
concurrency limitation type Indexer struct { - sem chan struct{} + workerPool Semaphore } // NewIndexer creates a new indexer with specified number of workers -func NewIndexer(workerCount int) *Indexer { +func NewIndexer(workerPool Semaphore) *Indexer { return &Indexer{ - sem: make(chan struct{}, workerCount), + workerPool: workerPool, } } // Index starts asynchronous document indexing func (idx *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, err error)) { - idx.sem <- struct{}{} + idx.workerPool.Acquire() go func() { apply(NewMemIndex(block)) - <-idx.sem + idx.workerPool.Release() }() } @@ -41,8 +41,8 @@ func (idx *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, er func NewMemIndex(block storage.DocBlock) (*memIndex, error) { sw := stopwatch.New() - res, cleanup := AcquireResources() - defer cleanup() + res, release := AcquireResources() + defer release() // Decompress metadata payload, err := decompressMeta(res, block, sw) @@ -72,10 +72,10 @@ func NewMemIndex(block storage.DocBlock) (*memIndex, error) { } // Group documents by token - tokenDocGroups := groupLIDsByTID(idx, res, tids, lids, len(tokens)) + tokenLIDs := groupLIDsByTID(idx, res, tids, lids, len(tokens)) // Organize tokens and fields - organizeTokens(idx, res, buf, tokens, tokenDocGroups) + organizeTokens(idx, res, buf, tokens, tokenLIDs) // Set special "all" token idx.allTID = uint32(idx.fieldsTokens[seq.TokenAll].start) @@ -83,15 +83,15 @@ func NewMemIndex(block storage.DocBlock) (*memIndex, error) { return idx, nil } -// token represents a unique token as a (field, value) pair. +// tokenStr represents a unique token as a (field, value) pair. // Used as a map key during token deduplication. 
-type token struct { +type tokenStr struct { value string field string } -func toToken(t tokenizer.MetaToken) token { - return token{ +func toToken(t tokenizer.MetaToken) tokenStr { + return tokenStr{ value: util.ByteToStringUnsafe(t.Value), field: util.ByteToStringUnsafe(t.Key), } @@ -103,7 +103,7 @@ func extractTokens( res *Resources, buf *indexBuffer, meta []indexer.MetaData, -) ([]uint32, []uint32, []token, error) { +) ([]uint32, []uint32, []tokenStr, error) { var docOffset uint64 var totalTokens uint32 @@ -115,7 +115,6 @@ func extractTokens( for i := range meta { docMeta := meta[i] if docMeta.Size > 0 { - // Start new document group prev = seq.PackDocPos(0, docOffset) docOffset += uint64(docMeta.Size) + uint64(uint32Size) } @@ -143,7 +142,7 @@ func extractTokens( // Extract and process tokens from all documents var err error - var token token + var token tokenStr // Allocate slices for token-document relationships lids := res.AllocUint32s(int(totalTokens))[:0] // Local document ID for each token occurrence @@ -195,6 +194,7 @@ func groupLIDsByTID(idx *memIndex, res *Resources, tids, lids []uint32, tokenCou // We use a single large buffer and slice it for efficiency tokenLIDs := res.AllocUint32Slices(tokenCount) allTokenLIDs := idx.res.AllocUint32s(len(lids)) + idx.allTokenLIDsCount = len(lids) tokenLIDs = tokenLIDs[:len(counts)] for tid, count := range counts { @@ -206,14 +206,24 @@ func groupLIDsByTID(idx *memIndex, res *Resources, tids, lids []uint32, tokenCou // We reuse docIDs slice bounds for safety lids = lids[:len(tids)] for i, tid := range tids { + if len(tokenLIDs[tid]) > 0 { + if lids[i] == lastLID(tokenLIDs[tid]) { // deduplication + idx.allTokenLIDsCount-- + continue + } + } tokenLIDs[tid] = append(tokenLIDs[tid], lids[i]) } return tokenLIDs } +func lastLID(s []uint32) uint32 { + return s[len(s)-1] +} + // organizeTokens organizes tokens and fields in the index with proper sorting -func organizeTokens(idx *memIndex, res *Resources, buf *indexBuffer, 
tokens []token, tokenLIDs [][]uint32) { +func organizeTokens(idx *memIndex, res *Resources, buf *indexBuffer, tokens []tokenStr, tokenLIDs [][]uint32) { tokenSize := 0 order := res.AllocUint32s(len(tokens)) order = order[:len(tokens)] diff --git a/frac/active2/indexer_test.go b/frac/active2/indexer_test.go index ee092d01..d8566e9a 100644 --- a/frac/active2/indexer_test.go +++ b/frac/active2/indexer_test.go @@ -10,6 +10,7 @@ import ( "github.com/alecthomas/units" "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" @@ -20,32 +21,35 @@ import ( "github.com/ozontech/seq-db/tokenizer" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "go.uber.org/zap" "go.uber.org/zap/zapcore" ) func BenchmarkIndexer(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx := NewIndexer(8) allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) - readers := splitLogsToBulks(allLogs, 2000) + readers := splitLogsToBulks(allLogs, 1000) assert.NoError(b, err) processor := getTestProcessor() - b.ResetTimer() - for i := 0; i < b.N; i++ { - b.StopTimer() - bulks := make([][]byte, 0, len(readers)) + n := 2 + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { for _, readNext := range readers { _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) - bulks = append(bulks, storage.CompressDocBlock(meta, nil, 3)) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() active := New( filepath.Join(b.TempDir(), "test"), &frac.Config{}, - idx, + config.NumCPU, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), @@ -53,12 +57,9 @@ func BenchmarkIndexer(b *testing.B) { b.StartTimer() wg := sync.WaitGroup{} - for _, meta := range bulks { + for _, meta := 
range allMeta { wg.Add(1) - idx.Index(meta, func(idx *memIndex, err error) { - if err != nil { - logger.Fatal("bulk indexing error", zap.Error(err)) - } + active.indexer.Index(meta, func(idx *memIndex, err error) { active.indexes.Add(idx, 0, 0) wg.Done() }) @@ -69,39 +70,40 @@ func BenchmarkIndexer(b *testing.B) { func BenchmarkMerge(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx := NewIndexer(8) allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) - readers := splitLogsToBulks(allLogs, 2000) + readers := splitLogsToBulks(allLogs, 1000) assert.NoError(b, err) processor := getTestProcessor() - b.ResetTimer() - for i := 0; i < b.N; i++ { - b.StopTimer() - bulks := make([][]byte, 0, len(readers)) + n := 2 + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { for _, readNext := range readers { _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) - bulks = append(bulks, storage.CompressDocBlock(meta, nil, 3)) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() active := New( filepath.Join(b.TempDir(), "test"), &frac.Config{}, - idx, + config.NumCPU, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), ) wg := sync.WaitGroup{} - for _, meta := range bulks { + for _, meta := range allMeta { wg.Add(1) - idx.Index(meta, func(idx *memIndex, err error) { - if err != nil { - logger.Fatal("bulk indexing error", zap.Error(err)) - } + active.indexer.Index(meta, func(idx *memIndex, err error) { active.indexes.Add(idx, 0, 0) wg.Done() }) @@ -128,7 +130,6 @@ func defaultSealingParams() frac.SealParams { func BenchmarkFullWrite(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx := NewIndexer(8) allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) readers := splitLogsToBulks(allLogs, 1000) @@ -136,7 +137,7 @@ func BenchmarkFullWrite(b *testing.B) { 
processor := getTestProcessor() - n := 2 + n := 10 allDocs := make([][]byte, 0, len(readers)*n) allMeta := make([][]byte, 0, len(readers)*n) @@ -154,7 +155,7 @@ func BenchmarkFullWrite(b *testing.B) { active := New( filepath.Join(b.TempDir(), "test"), &frac.Config{SkipSortDocs: true}, - idx, + config.NumCPU, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), diff --git a/frac/active2/iterators.go b/frac/active2/iterators.go index 7874d523..025b2393 100644 --- a/frac/active2/iterators.go +++ b/frac/active2/iterators.go @@ -98,7 +98,7 @@ func (i *TokenIteratorItem) Field() []byte { return i.payload.idx.fields[i.fid] } -func (i *TokenIteratorItem) Token() []byte { +func (i *TokenIteratorItem) Value() []byte { return i.payload.idx.tokens[i.tid] } diff --git a/frac/active2/mem_index.go b/frac/active2/mem_index.go index f53c9ed8..02ec7144 100644 --- a/frac/active2/mem_index.go +++ b/frac/active2/mem_index.go @@ -22,8 +22,9 @@ type memIndex struct { positions []seq.DocPos allTID uint32 - docsSize uint64 - docsCount uint32 + docsSize uint64 + docsCount uint32 + allTokenLIDsCount int wg sync.WaitGroup res *Resources diff --git a/frac/active2/mem_index_pool.go b/frac/active2/mem_index_pool.go index 9d43a0de..0bf04586 100644 --- a/frac/active2/mem_index_pool.go +++ b/frac/active2/mem_index_pool.go @@ -5,7 +5,6 @@ import ( "sync" "sync/atomic" - "github.com/alecthomas/units" "github.com/ozontech/seq-db/frac" ) @@ -13,7 +12,7 @@ import ( type memIndexExt struct { id uint64 // unique runtime ID index *memIndex // actual index - tier int // size tier of the index + gen int } type memIndexPool struct { @@ -23,7 +22,6 @@ type memIndexPool struct { readyToMerge map[uint64]memIndexExt underMerging map[uint64]memIndexExt - tiers sizeTiers // index size tier classifier counter atomic.Uint64 // atomic counter for generating index IDs } @@ -32,8 +30,6 @@ func NewIndexPool(info *frac.Info) *memIndexPool { info: info, readyToMerge: 
make(map[uint64]memIndexExt), underMerging: make(map[uint64]memIndexExt), - - tiers: newSizeTiers(firstTierMaxSizeKb, maxTierCount, tierSizeDeltaPercent), } } @@ -74,7 +70,7 @@ func (p *memIndexPool) Info() *frac.Info { func (p *memIndexPool) Add(idx *memIndex, docsLen, metaLen uint64) { maxMID := idx.ids[0].MID minMID := idx.ids[len(idx.ids)-1].MID - idxExt := p.wrapIndex(idx) + item := p.wrapIndex(idx, 0) p.mu.Lock() defer p.mu.Unlock() @@ -91,7 +87,7 @@ func (p *memIndexPool) Add(idx *memIndex, docsLen, metaLen uint64) { p.info.DocsOnDisk += docsLen p.info.MetaOnDisk += metaLen - p.readyToMerge[idxExt.id] = idxExt + p.readyToMerge[item.id] = item p.indexes = append(p.indexes, idx) } @@ -117,31 +113,36 @@ func (p *memIndexPool) markAsMerging(items []memIndexExt) { } } -func (p *memIndexPool) replace(oldIndexes []memIndexExt, newIndex *memIndex) { - mergedMeta := p.wrapIndex(newIndex) +func (p *memIndexPool) replace(old []memIndexExt, newIndex *memIndex) { + gen := 0 + for _, item := range old { + gen += item.gen + } + merged := p.wrapIndex(newIndex, gen/len(old)+1) p.mu.Lock() defer p.mu.Unlock() - for _, metaIndex := range oldIndexes { - delete(p.underMerging, metaIndex.id) + for _, eIndex := range old { + delete(p.underMerging, eIndex.id) } - p.readyToMerge[mergedMeta.id] = mergedMeta + p.readyToMerge[merged.id] = merged // Rebuild the index list for reading p.indexes = p.indexes[:0] p.indexes = slices.Grow(p.indexes, len(p.readyToMerge)+len(p.underMerging)) - for _, idxExt := range p.readyToMerge { - p.indexes = append(p.indexes, idxExt.index) // add all ready indexes + for _, item := range p.readyToMerge { + p.indexes = append(p.indexes, item.index) // add all ready indexes } - for _, idxExt := range p.underMerging { - p.indexes = append(p.indexes, idxExt.index) // add indexes currently being merged + for _, item := range p.underMerging { + p.indexes = append(p.indexes, item.index) // add indexes currently being merged } go func() { - for _, idxExt := range 
oldIndexes { - idxExt.index.Release() + // todo do we need wg here? + for _, item := range old { + item.index.Release() } }() } @@ -156,10 +157,10 @@ func (p *memIndexPool) Release() { } } -func (p *memIndexPool) wrapIndex(index *memIndex) memIndexExt { +func (p *memIndexPool) wrapIndex(index *memIndex, gen int) memIndexExt { return memIndexExt{ - id: p.counter.Add(1), // atomically increment counter - tier: p.tiers.Calc(index.docsCount / uint32(units.KiB)), // determine size tier + id: p.counter.Add(1), + gen: gen, index: index, } } diff --git a/frac/active2/merge.go b/frac/active2/merge.go index 13702261..71fa81df 100644 --- a/frac/active2/merge.go +++ b/frac/active2/merge.go @@ -4,30 +4,26 @@ import ( "bytes" "slices" - "github.com/RoaringBitmap/roaring/v2" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/util" ) func mergeIndexes(indexes []*memIndex) *memIndex { - docsCount := 0 blocksCount := 0 - docsSize := uint64(0) - for _, index := range indexes { - docsSize += index.docsSize - docsCount += len(index.ids) - blocksCount += len(index.blocksOffsets) + dst := newMemIndex() + for _, idx := range indexes { + dst.docsSize += idx.docsSize + dst.docsCount += idx.docsCount + dst.allTokenLIDsCount += idx.allTokenLIDsCount + blocksCount += len(idx.blocksOffsets) } res, release := AcquireResources() defer release() - dst := newMemIndex() - dst.docsCount = uint32(docsCount) - dst.ids = dst.res.AllocIDs(docsCount)[:0] - dst.positions = dst.res.AllocDocPos(docsCount)[:0] + dst.ids = dst.res.AllocIDs(int(dst.docsCount))[:0] + dst.positions = dst.res.AllocDocPos(int(dst.docsCount))[:0] dst.blocksOffsets = dst.res.AllocUint64s(blocksCount)[:0] - dst.docsSize = docsSize posMap := mergeBlocksOffsets(dst, res, indexes) lidsMap := mergeIDs(dst, res, indexes, posMap) @@ -47,7 +43,7 @@ func mergeIDs(dst *memIndex, res *Resources, indexes []*memIndex, posMap [][]seq idx: idx, posMap: posMap[i], } - lidsMap[i] = res.uint32s.AllocSlice(int(idx.docsCount))[:0] + lidsMap[i] 
= res.uint32s.AllocSlice(int(idx.docsCount) + 1)[:1] // 1-based } orderedIDs := MergeKSortIterators(iters, func(a, b IDIteratorItem) int { return seq.Compare(b.id, a.id) }) @@ -64,28 +60,22 @@ func mergeIDs(dst *memIndex, res *Resources, indexes []*memIndex, posMap [][]seq } func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][]uint32) { - allCount := 0 totalTokens := 0 - totalLIDsSize := 0 - TokensIterators := make([]IOrderedIterator[TokenIteratorItem], len(indexes)) - for i, index := range indexes { - allCount += len(index.ids) - TokensIterators[i] = NewTokenIterator(index, lidsMap[i]) - totalTokens += len(index.tokens) - for _, lids := range index.tokenLIDs { - totalLIDsSize += len(lids) - } + tokensIterators := make([]IOrderedIterator[TokenIteratorItem], len(indexes)) + for i, idx := range indexes { + totalTokens += len(idx.tokens) + tokensIterators[i] = NewTokenIterator(idx, lidsMap[i]) } cmpToken := func(a, b TokenIteratorItem) int { r := bytes.Compare(a.Field(), b.Field()) if r == 0 { - return bytes.Compare(a.Token(), b.Token()) + return bytes.Compare(a.Value(), b.Value()) } return r } - orderedTokens := MergeKSortIterators(TokensIterators, cmpToken) + orderedTokens := MergeKSortIterators(tokensIterators, cmpToken) uniqTokensSize := 0 uniqTokensCount := 0 @@ -99,14 +89,14 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ ) borders := res.AllocBytes(totalTokens)[:0] - items := make([]TokenIteratorItem, 0, totalTokens) + tokens := make([]TokenIteratorItem, 0, totalTokens) for cur, has := orderedTokens.Next(); has; cur, has = orderedTokens.Next() { var border uint8 if prevToken.payload == nil || cmpToken(prevToken, cur) != 0 { uniqTokensCount++ - uniqTokensSize += len(cur.Token()) + uniqTokensSize += len(cur.Value()) border++ field := cur.Field() @@ -119,7 +109,7 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ } borders = append(borders, border) - items = 
append(items, cur) + tokens = append(tokens, cur) prevToken = cur } @@ -130,12 +120,17 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ allTokens := dst.res.AllocBytes(uniqTokensSize)[:0] allFields := dst.res.AllocBytes(uniqFieldsSize)[:0] - allTokenLIDs := dst.res.AllocUint32s(totalLIDsSize)[:0] - lidsCollector := NewLIDsCollector(allTokenLIDs, genAllLIDs(res, allCount)) + lidsCollector := NewLIDsCollector( + res.AllocUint32s(int(dst.docsCount)), // tmp buf + dst.res.AllocUint32s(dst.allTokenLIDsCount - int(dst.docsCount))[:0], // all token LIDs + dst.res.AllocUint32s(int(dst.docsCount)), // ALL LIDs for token _all_ + res.AllocBytes((int(dst.docsCount) + 1)), // sort buffer + ) var isAllToken bool - for i, item := range items { + for i, token := range tokens { + token := token if borders[i] > 0 { if i > 0 { @@ -154,7 +149,7 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ } start := len(allFields) - allFields = append(allFields, item.Field()...) + allFields = append(allFields, token.Field()...) field := allFields[start:] dst.fields = append(dst.fields, field) @@ -165,19 +160,18 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ } start := len(allTokens) - allTokens = append(allTokens, item.Token()...) + allTokens = append(allTokens, token.Value()...) 
dst.tokens = append(dst.tokens, allTokens[start:]) } if isAllToken { - for range item.LIDs() { + for range token.LIDs() { lidsCollector.Add(0) } } else { - lidsMap := item.lidsMap() - for _, oldLID := range item.LIDs() { - newLID := lidsMap[oldLID-1] - lidsCollector.Add(newLID) + newLIDsMap := token.lidsMap() + for _, oldLID := range token.LIDs() { + lidsCollector.Add(newLIDsMap[oldLID]) } } } @@ -193,52 +187,59 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ } type LIDsCollector struct { - all []uint32 - lids []uint32 - offset int - bitmap *roaring.Bitmap + tmp []uint32 + lids []uint32 + all []uint32 + buf []uint8 } -func genAllLIDs(res *Resources, s int) []uint32 { - all := res.AllocUint32s(s) +func NewLIDsCollector(tmp, lids, all []uint32, buf []uint8) *LIDsCollector { + clear(buf) for i := range all { all[i] = uint32(i) + 1 } - return all -} - -func NewLIDsCollector(allTokenLIDs, all []uint32) *LIDsCollector { return &LIDsCollector{ - lids: allTokenLIDs[:0], - all: all, - bitmap: roaring.New(), + tmp: tmp[:0], + lids: lids[:0], + all: all, + buf: buf, } } func (s *LIDsCollector) Add(lid uint32) { - s.lids = append(s.lids, lid) + s.tmp = append(s.tmp, lid) } func (s *LIDsCollector) GetSorted() (dst []uint32) { - dst = s.lids[s.offset:] + n := len(s.tmp) - if len(dst) == len(s.all) { - s.lids = append(s.lids[:s.offset], s.all...) 
- s.offset = len(s.lids) - return dst + if n == len(s.all) { + s.tmp = s.tmp[:0] + return s.all } - if len(dst) > 64_000 { - s.bitmap.AddMany(dst) - s.bitmap.ToExistingArray(&dst) - s.bitmap.Clear() - s.offset = len(s.lids) - return dst + if n > 16_000 { + for _, v := range s.tmp { + s.buf[v] = 1 + } + start := len(s.lids) + for lid, ok := range s.buf { + if ok == 1 { + s.buf[lid] = 0 + s.lids = append(s.lids, uint32(lid)) + } + } + s.tmp = s.tmp[:0] + return s.lids[start:] } - slices.Sort(dst) - s.offset = len(s.lids) - return dst + if n > 1 { + slices.Sort(s.tmp) + } + start := len(s.lids) + s.lids = append(s.lids, s.tmp...) + s.tmp = s.tmp[:0] + return s.lids[start:] } func mergeBlocksOffsets(dst *memIndex, res *Resources, indexes []*memIndex) [][]seq.DocPos { diff --git a/frac/active2/merge_manager.go b/frac/active2/merge_manager.go index d5cc9b39..4ecf8ce9 100644 --- a/frac/active2/merge_manager.go +++ b/frac/active2/merge_manager.go @@ -2,17 +2,23 @@ package active2 import ( "sync" + + "github.com/ozontech/seq-db/logger" + "go.uber.org/zap" ) const ( - minIndexesToMerge = 4 // minimum number of indexes to trigger merge - forceMergeThreshold = 64 // index count threshold for forced merge - firstTierMaxSizeKb = 8 // maximum size of the first tier - maxTierCount = 64 // maximum number of size tiers allowed - tierSizeDeltaPercent = 25 // percentage difference between size tiers - bucketSizePercent = 50 // percentage difference between size buckets + maxGenerations = 32 + minIndexesToMerge = 16 // minimum number of indexes to trigger merge + forceMergeThreshold = 4096 // index count threshold for forced merge ) +type Semaphore interface { + Acquire() + Release() + Capacity() int +} + // MergeManager manages in-memory index collection and merging type MergeManager struct { mu sync.Mutex @@ -21,16 +27,16 @@ type MergeManager struct { stopped bool indexes *memIndexPool - workers chan struct{} // semaphore to limit concurrent merge operations - mergeCh chan struct{} 
// channel to trigger merge process + workerPool Semaphore + mergeCh chan struct{} // channel to trigger merge process } // NewMergeManager creates a new index manager -func NewMergeManager(indexes *memIndexPool, maxConcurrentMerges int) *MergeManager { +func NewMergeManager(indexes *memIndexPool, workerPool Semaphore) *MergeManager { m := MergeManager{ - indexes: indexes, - workers: make(chan struct{}, maxConcurrentMerges), - mergeCh: make(chan struct{}, 1), + indexes: indexes, + workerPool: workerPool, + mergeCh: make(chan struct{}, 1), } // Start background goroutine for merge scheduling @@ -59,83 +65,89 @@ func (m *MergeManager) MergeAll() { m.wg.Wait() if toMerge := m.indexes.ReadyToMerge(); len(toMerge) > 1 { + logger.Debug("merge all mini-indexes", zap.Int("batch", len(toMerge))) m.indexes.markAsMerging(toMerge) merged := mergeIndexes(extractIndexes(toMerge)) m.indexes.replace(toMerge, merged) } } -func extractIndexes(indexesExt []memIndexExt) []*memIndex { - result := make([]*memIndex, 0, len(indexesExt)) - for _, eIdx := range indexesExt { - result = append(result, eIdx.index) +func extractIndexes(items []memIndexExt) []*memIndex { + result := make([]*memIndex, 0, len(items)) + for _, item := range items { + result = append(result, item.index) } return result } -// prepareForMerging prepares index groups for merging -func (m *MergeManager) prepareForMerging() [][]memIndexExt { - m.mu.Lock() - defer m.mu.Unlock() +func (m *MergeManager) mergeScheduler() { + for range m.mergeCh { + m.workerPool.Acquire() // wait for a free worker - if m.stopped { - return nil - } + m.mu.Lock() - mergeCandidates := pickMergeCandidates(m.indexes.ReadyToMerge(), minIndexesToMerge) + if m.stopped { + m.mu.Unlock() + m.workerPool.Release() + continue + } - for i, candidateGroup := range mergeCandidates { - if !m.acquireWorker() { // no free workers - mergeCandidates = mergeCandidates[:i] // truncate unprocessable tail - break + batch := pickToMerge(m.indexes.ReadyToMerge(), 
minIndexesToMerge) + if len(batch) == 0 { + m.mu.Unlock() + m.workerPool.Release() + continue } - m.indexes.markAsMerging(candidateGroup) - } - // Important: call Add() inside lock to prevent races during shutdown - m.wg.Add(len(mergeCandidates)) + m.indexes.markAsMerging(batch) + m.wg.Add(1) // important to inc wg inside the lock + m.mu.Unlock() - return mergeCandidates -} + logger.Debug("merge indexes", zap.Int("gen", batch[0].gen), zap.Int("batch", len(batch))) -func (m *MergeManager) mergeScheduler() { - for range m.mergeCh { - for { - preparedGroups := m.prepareForMerging() - if len(preparedGroups) == 0 { - break - } - - for _, toMerge := range preparedGroups { - go func() { - mergedIndex := mergeIndexes(extractIndexes(toMerge)) - m.indexes.replace(toMerge, mergedIndex) - m.releaseWorker() - m.triggerMerge() // check if new merge is needed - m.wg.Done() - }() - } - } + go func() { + merged := mergeIndexes(extractIndexes(batch)) + m.workerPool.Release() + m.indexes.replace(batch, merged) + m.triggerMerge() // check if new merge is needed + m.wg.Done() + }() } } -func (m *MergeManager) acquireWorker() bool { +func (m *MergeManager) triggerMerge() { select { - case m.workers <- struct{}{}: - return true + case m.mergeCh <- struct{}{}: default: - return false + // Trigger already set, no need for additional notification } } -func (m *MergeManager) releaseWorker() { - <-m.workers +func pickToMerge(items []memIndexExt, minBatchSize int) []memIndexExt { + if len(items) < minBatchSize { + return nil + } + + if len(items) > forceMergeThreshold { + return items + } + + batch := largestBatch(items) + if len(batch) < minBatchSize { + return nil + } + return batch } -func (m *MergeManager) triggerMerge() { - select { - case m.mergeCh <- struct{}{}: - default: - // Trigger already set, no need for additional notification +func largestBatch(items []memIndexExt) []memIndexExt { + maxGen := 0 + batches := make([][]memIndexExt, maxGenerations) + for _, item := range items { + 
gen := min(maxGenerations, item.gen) + batches[gen] = append(batches[gen], item) + if len(batches[gen]) > len(batches[maxGen]) || len(batches[gen]) == len(batches[maxGen]) && gen > maxGen { + maxGen = gen + } } + return batches[maxGen] } diff --git a/frac/active2/merge_strategy.go b/frac/active2/merge_strategy.go deleted file mode 100644 index 5b4eb181..00000000 --- a/frac/active2/merge_strategy.go +++ /dev/null @@ -1,162 +0,0 @@ -package active2 - -import ( - "math" - - "github.com/ozontech/seq-db/logger" - "go.uber.org/zap" -) - -// Algorithm for selecting indexes for merging (merge): -// -// General concept: -// Indexes are grouped into "tiers" - levels based on their size. -// Merging is performed for indexes from adjacent tiers to minimize -// the size of the resulting index and avoid frequent rebuilds. - -// pickMergeCandidates selects groups of indexes for merging based on their tier. -// items - slice of indexes to analyze. -// minMerge - minimum number of indexes that can be merged. -// Returns a slice of index slices - groups for merging. -func pickMergeCandidates(items []memIndexExt, minMerge int) [][]memIndexExt { - if len(items) < minMerge { - return nil - } - - remains := len(items) - - dist := groupByTier(items) - - // win - size of the "sliding window" in number of tiers. - // bucketSizePercent/tierSizeDeltaPercent determines how many tiers - // to consider as one group when searching for merge candidates. - win := int(math.Round(float64(bucketSizePercent) / tierSizeDeltaPercent)) - - var batches [][]memIndexExt - - for remains > 1 { - // forceMerge - flag for forced merging, activated when there are too many indexes. - forceMerge := remains >= forceMergeThreshold - - // Find the most populated range of tiers. - // batchSize - number of indexes in the found range. - // first, last - boundaries of the tier range. 
- batchSize, first, last := findBestRange(dist, minMerge, win, forceMerge) - - if batchSize == 0 { - break - } - - remains -= batchSize - buf := make([]memIndexExt, 0, batchSize) - batches = append(batches, takeFromTiers(buf, first, last, dist)) - } - return batches -} - -// groupByTier builds a distribution of indexes by their tiers. -// items - input indexes to distribute. -// Returns a slice of slices, where the outer slice index is the tier number, -// and the value is all indexes of that tier. -func groupByTier(items []memIndexExt) [][]memIndexExt { - maxTier := 0 - dist := make([][]memIndexExt, maxTierCount) - for _, index := range items { - dist[index.tier] = append(dist[index.tier], index) - if index.tier > maxTier { - maxTier = index.tier - } - } - return dist[:maxTier+1] -} - -// takeFromTiers extracts indexes from the specified range of tiers. -// buf - buffer for collecting indexes (pre-allocated with the required capacity). -// first, last - boundaries of the tier range (inclusive). -// dist - distribution of indexes by tiers. -// Returns a slice of indexes from the specified range. -func takeFromTiers(buf []memIndexExt, first, last int, dist [][]memIndexExt) []memIndexExt { - for tier := first; tier <= last; tier++ { - buf = append(buf, dist[tier]...) - dist[tier] = nil // Clear the distribution cell so these indexes don't participate in subsequent iterations. - } - return buf -} - -// findBestRange searches for the most populated range of tiers. -// dist - distribution of indexes by tiers. -// minMerge - minimum number of indexes required for merging. -// win - window size (number of tiers in the range). -// forceMerge - flag for forced search (expands the window if unsuccessful). -// Returns: number of indexes, first tier, last tier. 
-func findBestRange(dist [][]memIndexExt, minMerge, win int, forceMerge bool) (int, int, int) { - var bestEnd, bestSum int - for { - if bestEnd, bestSum = locateBestWindow(dist, win); bestSum == 0 { // Find the window with the maximum sum of indexes. - return 0, 0, 0 - } - - if bestSum >= minMerge { - first := max(0, bestEnd-win) - last := bestEnd - return bestSum, first, last - } - - if !forceMerge { - return 0, 0, 0 - } - - logger.Warn("insufficient indexes for merge, expanding window", - zap.Int("win_before", win), - zap.Int("win_after", win*2), - zap.Int("found", bestSum), - zap.Int("required", minMerge), - ) - win *= 2 - } -} - -// locateBestWindow finds the window (range of tiers) with the maximum number of indexes. -// dist - distribution of indexes by tiers. -// winSize - window size (number of tiers). -// Returns: the tier where the window with the maximum sum ends, -// and the maximum sum itself. -func locateBestWindow(dist [][]memIndexExt, winSize int) (int, int) { - maxCount := 0 - bestEnd := 0 - - win := winSum{buf: make([]int, winSize)} - - for tier, items := range dist { - win.Add(len(items)) - if win.Total() >= maxCount { - bestEnd = tier - maxCount = win.sum - } - } - return bestEnd, maxCount -} - -// winSum - structure for implementing a sliding window sum calculation. -// Used for efficiently calculating the sum within a fixed-size window. -type winSum struct { - buf []int // buffer to store values in the window. - sum int // current sum of values in the window. - pos int // current position in the ring buffer. -} - -// Add adds a new value to the sliding window. -// v - new value to add. -// The method updates the sum: removes the oldest value and adds the new one. 
-func (w *winSum) Add(v int) { - w.sum += v - w.buf[w.pos] - w.buf[w.pos] = v - w.pos++ - if w.pos == len(w.buf) { - w.pos = 0 - } -} - -func (w *winSum) Total() int { - return w.sum -} diff --git a/frac/active2/resources.go b/frac/active2/resources.go index 113ac6c6..9d587b1c 100644 --- a/frac/active2/resources.go +++ b/frac/active2/resources.go @@ -8,7 +8,7 @@ import ( ) var ( - tokenKeyPool = resources.NewSizedPool[token](24) + tokenKeyPool = resources.NewSizedPool[tokenStr](24) indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](24) docPosSlicesPool = resources.NewSizedPool[[]seq.DocPos](24) bufPool = resources.TypedPool[*indexBuffer]{} @@ -25,7 +25,7 @@ type Resources struct { bytes resources.SliceAllocator[byte] bytesSlices resources.SliceAllocator[[]byte] uint32Slices resources.SliceAllocator[[]uint32] - tokenKeys resources.SliceAllocator[token] + tokenKeys resources.SliceAllocator[tokenStr] indexerMetaData resources.SliceAllocator[indexer.MetaData] buf resources.ObjectAllocator[indexBuffer] ids resources.SliceOnBytes[seq.ID] @@ -95,7 +95,7 @@ func (r *Resources) AllocMetadata(s int) []indexer.MetaData { return r.indexerMetaData.AllocSlice(s) } -func (r *Resources) AllocTokens(s int) []token { +func (r *Resources) AllocTokens(s int) []tokenStr { return r.tokenKeys.AllocSlice(s) } @@ -106,7 +106,7 @@ func (r *Resources) Buffer() *indexBuffer { fields: make([]string, 0, 100), fieldTIDs: make([]uint32, 0, 100), tokens: make([]tokenizer.MetaToken, 0, 100), - tokenMap: make(map[token]uint32, 1000), + tokenMap: make(map[tokenStr]uint32, 1000), } }, func(b *indexBuffer) { b.fields = b.fields[:0] @@ -124,5 +124,5 @@ type indexBuffer struct { fields []string fieldTIDs []uint32 tokens []tokenizer.MetaToken - tokenMap map[token]uint32 + tokenMap map[tokenStr]uint32 } diff --git a/frac/active2/tiers.go b/frac/active2/tiers.go deleted file mode 100644 index b0fa6f17..00000000 --- a/frac/active2/tiers.go +++ /dev/null @@ -1,80 +0,0 @@ -package active2 - -import 
( - "math" -) - -// sizeTiers splits the entire space of integers into successive ranges [A(n) ; B(n)] where: -// - A(n+1) = B(n) + 1 -// - (B(n) - A(n)) / A(n) ~ deltaPercent -// -// Example: for newSizeTiers(100, 200, 10) we will have: -// -// Tier 0: [ 0; 100 ] delta: 0.00% -// Tier 1: [ 101; 106 ] delta: 6.00% -// Tier 2: [ 107; 117 ] delta: 10.00% -// Tier 3: [ 118; 129 ] delta: 10.00% -// Tier 4: [ 130; 142 ] delta: 10.00% -// Tier 5: [ 143; 156 ] delta: 10.00% -// Tier 6: [ 157; 171 ] delta: 10.00% -// Tier 7: [ 172; 189 ] delta: 10.00% -// Tier 8: [ 190; 207 ] delta: 9.00% -// Tier 9: [ 208; 228 ] delta: 10.00% -// Tier 10: [ 229; 251 ] delta: 10.00% -// Tier 11: [ 252; 276 ] delta: 10.00% -// Tier 12: [ 277; 304 ] delta: 10.00% -// Tier 13: [ 305; 334 ] delta: 10.00% -// Tier 14: [ 335; 368 ] delta: 10.00% -// Tier 15: [ 369; 405 ] delta: 10.00% -// Tier 16: [ 406; 445 ] delta: 10.00% -// Tier 17: [ 446; 490 ] delta: 10.00% -// Tier 18: [ 491; 539 ] delta: 10.00% -// Tier 19: [ 540; 593 ] delta: 10.00% -// Tier 20: [ 594; 652 ] delta: 10.00% -// Tier 21: [ 653; 717 ] delta: 10.00% -// Tier 22: [ 718; 789 ] delta: 10.00% -// Tier 23: [ 790; 868 ] delta: 10.00% -// Tier 24: [ 869; 955 ] delta: 10.00% -// Tier 25: [ 956; 1051 ] delta: 10.00% -// Tier 26: [ 1052; 1156 ] delta: 10.00% -// -// etc. -// -// So, sizeTiers returns us the tier (the number of range) for any integer value. -type sizeTiers struct { - firstMax uint32 - maxTier int - deltaK float64 - offset float64 -} - -// newSizeTiers creates a calculator that determines the ordinal number of the range (size tier) a given integer value falls into. -// -// Parameters: -// -// firstMax - The upper bound of the initial (first) range. Range #0 is always [0, firstMax]. -// maxTier - The maximum number of ranges (tiers) to create. For the last range (#maxTier), -// the upper bound is considered to be positive infinity (+inf). -// deltaPercent - The defined growth percentage. 
The ratio of the difference between the upper bounds -// of two adjacent ranges to the lower bound of the next range is approximately equal -// to this value. Formula for tier #n: (B_n - B_{n-1}) / B_{n-1} ≈ deltaPercent / 100 -func newSizeTiers(firstMax uint32, maxTier, deltaPercent int) sizeTiers { - deltaK := 1 / math.Log(1+float64(deltaPercent)/100) - return sizeTiers{ - maxTier: maxTier, - firstMax: firstMax, - deltaK: deltaK, - offset: math.Floor(deltaK*(math.Log(float64(firstMax)))) - 1, - } -} - -func (t sizeTiers) Calc(size uint32) int { - if size <= t.firstMax { - return 0 - } - tier := int(math.Floor(t.deltaK*(math.Log(float64(size)))) - t.offset) - if tier > t.maxTier { - return t.maxTier - } - return tier -} diff --git a/frac/tests/fraction_test.go b/frac/tests/fraction_test.go index 4d178011..82d6a6cb 100644 --- a/frac/tests/fraction_test.go +++ b/frac/tests/fraction_test.go @@ -39,14 +39,13 @@ import ( // TODO сделать разные тесты для сортированных и не сортированных доков type FractionTestSuite struct { suite.Suite - tmpDir string - config *frac.Config - mapping seq.Mapping - tokenizers map[seq.TokenizerType]tokenizer.Tokenizer - activeIndexer *active.Indexer - activeIndexer2 *active2.Indexer - stopIndexer func() - sealParams frac.SealParams + tmpDir string + config *frac.Config + mapping seq.Mapping + tokenizers map[seq.TokenizerType]tokenizer.Tokenizer + activeIndexer *active.Indexer + stopIndexer func() + sealParams frac.SealParams fraction frac.Fraction @@ -55,7 +54,6 @@ type FractionTestSuite struct { func (s *FractionTestSuite) SetupSuiteCommon() { s.activeIndexer, s.stopIndexer = active.NewIndexer(4, 10) - s.activeIndexer2 = active2.NewIndexer(4) } func (s *FractionTestSuite) TearDownSuiteCommon() { @@ -1670,7 +1668,7 @@ func (s *Active2FractionTestSuite) newActive2(bulks ...[]string) *active2.Active a := active2.New( baseName, s.config, - s.activeIndexer2, + 4, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), 
cache.NewCache[[]byte](nil, nil), diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index 92023857..ad5163f7 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -42,7 +42,7 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client) (*FracManager, func readLimiter := storage.NewReadLimiter(config.ReaderWorkers, storeBytesRead) idx, stopIdx := active.NewIndexer(config.IndexWorkers, config.IndexWorkers) cache := NewCacheMaintainer(cfg.CacheSize, cfg.SortCacheSize, newDefaultCacheMetrics()) - provider := newFractionProvider(cfg, s3cli, cache, readLimiter, idx, nil) + provider := newFractionProvider(cfg, s3cli, cache, readLimiter, idx) infoCache := NewFracInfoCache(filepath.Join(cfg.DataDir, consts.FracCacheFileSuffix)) // Load existing fractions into registry diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index d7bdc0c6..5eaf13ec 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -9,6 +9,7 @@ import ( "github.com/oklog/ulid/v2" + "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/frac/active2" @@ -23,27 +24,25 @@ const fileBasePattern = "seq-db-" // fractionProvider is a factory for creating different types of fractions // Contains all necessary dependencies for creating and managing fractions type fractionProvider struct { - s3cli *s3.Client // Client for S3 storage operations - config *Config // Fraction manager configuration - cacheProvider *CacheMaintainer // Cache provider for data access optimization - activeIndexer *active.Indexer // Indexer for active fractions - activeIndexer2 *active2.Indexer - readLimiter *storage.ReadLimiter // Read rate limiter - ulidEntropy io.Reader // Entropy source for ULID generation + s3cli *s3.Client // Client for S3 storage operations + config *Config // Fraction manager configuration + cacheProvider *CacheMaintainer // Cache 
provider for data access optimization + activeIndexer *active.Indexer // Indexer for active fractions + readLimiter *storage.ReadLimiter // Read rate limiter + ulidEntropy io.Reader // Entropy source for ULID generation } func newFractionProvider( cfg *Config, s3cli *s3.Client, cp *CacheMaintainer, - readLimiter *storage.ReadLimiter, indexer *active.Indexer, indexer2 *active2.Indexer, + readLimiter *storage.ReadLimiter, indexer *active.Indexer, ) *fractionProvider { return &fractionProvider{ - s3cli: s3cli, - config: cfg, - cacheProvider: cp, - activeIndexer: indexer, - activeIndexer2: indexer2, - readLimiter: readLimiter, - ulidEntropy: ulid.Monotonic(rand.New(rand.NewSource(time.Now().UnixNano())), 0), + s3cli: s3cli, + config: cfg, + cacheProvider: cp, + activeIndexer: indexer, + readLimiter: readLimiter, + ulidEntropy: ulid.Monotonic(rand.New(rand.NewSource(time.Now().UnixNano())), 0), } } @@ -62,7 +61,7 @@ func (fp *fractionProvider) NewActive2(name string) *active2.Active2 { return active2.New( name, &fp.config.Fraction, - fp.activeIndexer2, + config.NumCPU, fp.readLimiter, fp.cacheProvider.CreateDocBlockCache(), fp.cacheProvider.CreateSortDocsCache(), diff --git a/fracmanager/fraction_provider_test.go b/fracmanager/fraction_provider_test.go index b4317136..893f7e84 100644 --- a/fracmanager/fraction_provider_test.go +++ b/fracmanager/fraction_provider_test.go @@ -14,7 +14,6 @@ import ( "github.com/stretchr/testify/require" "github.com/ozontech/seq-db/frac/active" - "github.com/ozontech/seq-db/frac/active2" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" ) @@ -38,9 +37,8 @@ func setupFractionProvider(t testing.TB, cfg *Config) (*fractionProvider, func() rl := storage.NewReadLimiter(1, nil) s3cli, stopS3 := setupS3Client(t) idx, stopIdx := active.NewIndexer(1, 1) - idx2 := active2.NewIndexer(1) cache := NewCacheMaintainer(uint64(units.MB), uint64(units.MB), nil) - provider := newFractionProvider(cfg, s3cli, cache, rl, idx, idx2) + 
provider := newFractionProvider(cfg, s3cli, cache, rl, idx) return provider, func() { stopIdx() stopS3() @@ -48,7 +46,7 @@ func setupFractionProvider(t testing.TB, cfg *Config) (*fractionProvider, func() } func TestFractionID(t *testing.T) { - fp := newFractionProvider(nil, nil, nil, nil, nil, nil) + fp := newFractionProvider(nil, nil, nil, nil, nil) ulid1 := fp.nextFractionID() ulid2 := fp.nextFractionID() assert.NotEqual(t, ulid1, ulid2, "ULIDs should be different") diff --git a/indexer/processor.go b/indexer/processor.go index dbf7c106..589b67b5 100644 --- a/indexer/processor.go +++ b/indexer/processor.go @@ -210,6 +210,8 @@ func (p *Processor) ProcessBulk( dstDocs = binary.LittleEndian.AppendUint32(dstDocs, uint32(len(doc))) dstDocs = append(dstDocs, doc...) for _, m := range meta { + // todo: it is possible to have a few equal tokens here + // todo: probably we need deduplicate it here dstMeta = marshalAppendMeta(dstMeta, m) } } diff --git a/util/semaphore.go b/util/semaphore.go new file mode 100644 index 00000000..77b936ff --- /dev/null +++ b/util/semaphore.go @@ -0,0 +1,36 @@ +package util + +type Semaphore struct { + b chan struct{} +} + +func NewSemaphore(capacity int) *Semaphore { + return &Semaphore{ + b: make(chan struct{}, capacity), + } +} + +func (s *Semaphore) Capacity() int { + return cap(s.b) +} + +func (s *Semaphore) InProgress() int { + return len(s.b) +} + +func (s *Semaphore) TryToAcquire() bool { + select { + case s.b <- struct{}{}: + return true + default: + return false + } +} + +func (s *Semaphore) Acquire() { + s.b <- struct{}{} +} + +func (s *Semaphore) Release() { + <-s.b +} From b32f934ac7cffdb4b4ee6a5eb5f3937ea25c2eec Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Mon, 22 Dec 2025 02:17:02 +0300 Subject: [PATCH 12/28] naming + comments --- frac/active/indexer_test.go | 35 ++++--- frac/active2/active2.go | 6 +- frac/active2/indexer.go | 62 +++++++------ frac/active2/indexer_test.go | 4 +- frac/active2/iterators.go | 130 
++++++++++++++++---------- frac/active2/mem_index.go | 8 +- frac/active2/mem_index_pool.go | 163 ++++++++++++++++++++------------- frac/active2/merge.go | 59 ++++++------ frac/active2/merge_manager.go | 124 +++++++++++++------------ frac/active2/resources.go | 84 ++++++++--------- frac/active2/sealing_source.go | 2 +- resources/object_allocator.go | 16 ++-- resources/slice_allocator.go | 24 ++--- resources/slice_on_bytes.go | 2 +- 14 files changed, 404 insertions(+), 315 deletions(-) diff --git a/frac/active/indexer_test.go b/frac/active/indexer_test.go index b483bcd1..d86df4ba 100644 --- a/frac/active/indexer_test.go +++ b/frac/active/indexer_test.go @@ -88,28 +88,33 @@ func BenchmarkIndexer(b *testing.B) { readers := splitLogsToBulks(allLogs, 1000) assert.NoError(b, err) - active := New( - filepath.Join(b.TempDir(), "test"), - idx, - storage.NewReadLimiter(1, nil), - cache.NewCache[[]byte](nil, nil), - cache.NewCache[[]byte](nil, nil), - &frac.Config{}, - ) - processor := getTestProcessor() - for i := 0; i < b.N; i++ { - b.StopTimer() - bulks := make([][]byte, 0, len(readers)) + n := 2 + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { for _, readNext := range readers { _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) - bulks = append(bulks, storage.CompressDocBlock(meta, nil, 3)) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() + active := New( + filepath.Join(b.TempDir(), "test"), + idx, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + &frac.Config{}, + ) b.StartTimer() wg := sync.WaitGroup{} - for _, meta := range bulks { + for _, meta := range allMeta { wg.Add(1) idx.Index(active, meta, &wg, stopwatch.New()) } @@ -141,7 +146,7 @@ func BenchmarkFullWrite(b *testing.B) { processor := getTestProcessor() - n := 10 + n := 2 allDocs := make([][]byte, 0, len(readers)*n) allMeta := 
make([][]byte, 0, len(readers)*n) diff --git a/frac/active2/active2.go b/frac/active2/active2.go index d8e841d4..a2bdac09 100644 --- a/frac/active2/active2.go +++ b/frac/active2/active2.go @@ -30,7 +30,7 @@ type Active2 struct { indexer *Indexer indexes *memIndexPool - merger *MergeManager + merger *mergeManager docsFile *os.File docsReader storage.DocsReader @@ -62,7 +62,7 @@ func New( BaseFileName: baseFileName, Config: cfg, indexer: NewIndexer(util.NewSemaphore(workers)), - merger: NewMergeManager(indexes, util.NewSemaphore(workers)), + merger: newMergeManager(indexes, util.NewSemaphore(workers)), indexes: indexes, docsFile: docsFile, @@ -176,7 +176,7 @@ func (f *Active2) AddIndex(idx *memIndex, docsLen, metaLen uint64, err error) { logger.Fatal("bulk indexing error", zap.Error(err)) } f.indexes.Add(idx, docsLen, metaLen) - f.merger.triggerMerge() + f.merger.requestMerge() } func (f *Active2) String() string { diff --git a/frac/active2/indexer.go b/frac/active2/indexer.go index 6eebd55c..c8ff105d 100644 --- a/frac/active2/indexer.go +++ b/frac/active2/indexer.go @@ -18,16 +18,26 @@ const uint32Size = uint32(unsafe.Sizeof(uint32(0))) // Indexer indexes documents with concurrency limitation type Indexer struct { - workerPool Semaphore + workerPool WorkerLimiter } // NewIndexer creates a new indexer with specified number of workers -func NewIndexer(workerPool Semaphore) *Indexer { +func NewIndexer(workerPool WorkerLimiter) *Indexer { return &Indexer{ workerPool: workerPool, } } +// indexerBuffer is a temporary reusable buffer used during index construction to avoid allocations. +// It holds intermediate data structures that are needed during processing but not in the final index. 
+type indexerBuffer struct { + sizes []uint32 + fields []string + fieldTIDs []uint32 + tokens []tokenizer.MetaToken + tokenMap map[tokenStr]uint32 +} + // Index starts asynchronous document indexing func (idx *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, err error)) { idx.workerPool.Acquire() @@ -41,7 +51,7 @@ func (idx *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, er func NewMemIndex(block storage.DocBlock) (*memIndex, error) { sw := stopwatch.New() - res, release := AcquireResources() + res, release := NewResources() defer release() // Decompress metadata @@ -50,7 +60,7 @@ func NewMemIndex(block storage.DocBlock) (*memIndex, error) { return nil, err } - buf := res.Buffer() + buf := res.GetBuffer() // Decode metadata meta, err := decodeMetadata(res, buf, payload, sw) @@ -60,9 +70,9 @@ func NewMemIndex(block storage.DocBlock) (*memIndex, error) { // Initialize index idx := newMemIndex() idx.docsCount = uint32(len(meta)) - idx.ids = idx.res.AllocIDs(len(meta)) - idx.positions = idx.res.AllocDocPos(len(meta)) - idx.blocksOffsets = idx.res.AllocUint64s(1) // Only one block per bulk + idx.ids = idx.res.GetIDs(len(meta)) + idx.positions = idx.res.GetDocPos(len(meta)) + idx.blocksOffsets = idx.res.GetUint64s(1) // Only one block per bulk idx.blocksOffsets[0] = block.GetExt2() // Extract tokens from metadata @@ -101,7 +111,7 @@ func toToken(t tokenizer.MetaToken) tokenStr { func extractTokens( idx *memIndex, res *Resources, - buf *indexBuffer, + buf *indexerBuffer, meta []indexer.MetaData, ) ([]uint32, []uint32, []tokenStr, error) { var docOffset uint64 @@ -109,7 +119,7 @@ func extractTokens( // Calculate document positions in the original block // Each document is stored as: [size: uint32][data: size bytes] - positions := res.AllocDocPos(len(meta)) + positions := res.GetDocPos(len(meta)) prev := seq.PackDocPos(0, docOffset) for i := range meta { @@ -124,7 +134,7 @@ func extractTokens( // Create ordering by document ID 
(descending) // We need to map global document IDs to local IDs (LIDs) - order := res.AllocUint32s(len(meta)) + order := res.GetUint32s(len(meta)) for i := range order { order[i] = uint32(i) } @@ -145,8 +155,8 @@ func extractTokens( var token tokenStr // Allocate slices for token-document relationships - lids := res.AllocUint32s(int(totalTokens))[:0] // Local document ID for each token occurrence - tids := res.AllocUint32s(int(totalTokens))[:0] // Token ID for each occurrence + lids := res.GetUint32s(int(totalTokens))[:0] // Local document ID for each token occurrence + tids := res.GetUint32s(int(totalTokens))[:0] // Token ID for each occurrence // Process documents in ID-sorted order for lid, origIdx := range order { @@ -171,7 +181,7 @@ func extractTokens( } // Create reverse mapping: tokenID -> tokenKey - tokens := res.AllocTokens(len(buf.tokenMap)) + tokens := res.GetTokens(len(buf.tokenMap)) for key, tokenID := range buf.tokenMap { tokens[tokenID] = key } @@ -184,7 +194,7 @@ func extractTokens( // Output: 2D array where tokenLIDs[tid] = []lid func groupLIDsByTID(idx *memIndex, res *Resources, tids, lids []uint32, tokenCount int) [][]uint32 { // Phase 1: Count documents per token - counts := res.AllocUint32s(tokenCount) + counts := res.GetUint32s(tokenCount) clear(counts) for _, tid := range tids { counts[tid]++ @@ -192,8 +202,8 @@ func groupLIDsByTID(idx *memIndex, res *Resources, tids, lids []uint32, tokenCou // Phase 2: Allocate slices for each token group // We use a single large buffer and slice it for efficiency - tokenLIDs := res.AllocUint32Slices(tokenCount) - allTokenLIDs := idx.res.AllocUint32s(len(lids)) + tokenLIDs := res.GetUint32Slices(tokenCount) + allTokenLIDs := idx.res.GetUint32s(len(lids)) idx.allTokenLIDsCount = len(lids) tokenLIDs = tokenLIDs[:len(counts)] @@ -223,9 +233,9 @@ func lastLID(s []uint32) uint32 { } // organizeTokens organizes tokens and fields in the index with proper sorting -func organizeTokens(idx *memIndex, res *Resources, 
buf *indexBuffer, tokens []tokenStr, tokenLIDs [][]uint32) { +func organizeTokens(idx *memIndex, res *Resources, buf *indexerBuffer, tokens []tokenStr, tokenLIDs [][]uint32) { tokenSize := 0 - order := res.AllocUint32s(len(tokens)) + order := res.GetUint32s(len(tokens)) order = order[:len(tokens)] for i, t := range tokens { order[i] = uint32(i) @@ -246,9 +256,9 @@ func organizeTokens(idx *memIndex, res *Resources, buf *indexBuffer, tokens []to prevField := "" // Prepare buffers for sorted data - tokenBuffer := idx.res.AllocBytes(tokenSize)[:0] - idx.tokenLIDs = idx.res.AllocUint32Slices(len(order)) - idx.tokens = idx.res.AllocBytesSlices(len(order)) + tokenBuffer := idx.res.GetBytes(tokenSize)[:0] + idx.tokenLIDs = idx.res.GetUint32Slices(len(order)) + idx.tokens = idx.res.GetBytesSlices(len(order)) // Process tokens in sorted order for tid, origIdx := range order { @@ -276,8 +286,8 @@ func organizeTokens(idx *memIndex, res *Resources, buf *indexBuffer, tokens []to buf.fieldTIDs = append(buf.fieldTIDs, uint32(len(tokens))) // Organize fields - fieldBuffer := idx.res.AllocBytes(fieldSize)[:0] - idx.fields = idx.res.AllocBytesSlices(len(buf.fields)) + fieldBuffer := idx.res.GetBytes(fieldSize)[:0] + idx.fields = idx.res.GetBytesSlices(len(buf.fields)) idx.fieldsTokens = make(map[string]tokenRange, len(buf.fields)) @@ -304,7 +314,7 @@ func decompressMeta(res *Resources, block storage.DocBlock, sw *stopwatch.Stopwa defer m.Stop() // Allocate exact size needed for compressed data - buffer := res.AllocBytes(int(block.RawLen())) + buffer := res.GetBytes(int(block.RawLen())) payload, err := block.DecompressTo(buffer) if err != nil { return nil, err @@ -314,7 +324,7 @@ func decompressMeta(res *Resources, block storage.DocBlock, sw *stopwatch.Stopwa // decodeMetadata decodes document metadata from binary format // Format: [size: uint32][data: size bytes][size: uint32][data: size bytes]... 
-func decodeMetadata(res *Resources, buf *indexBuffer, payload []byte, sw *stopwatch.Stopwatch) ([]indexer.MetaData, error) { +func decodeMetadata(res *Resources, buf *indexerBuffer, payload []byte, sw *stopwatch.Stopwatch) ([]indexer.MetaData, error) { m := sw.Start("decode_meta") defer m.Stop() @@ -327,7 +337,7 @@ func decodeMetadata(res *Resources, buf *indexBuffer, payload []byte, sw *stopwa } // Second pass: decode each metadata entry - meta := res.AllocMetadata(len(buf.sizes)) + meta := res.GetMetadata(len(buf.sizes)) for i, size := range buf.sizes { // Skip size field to get to actual data data := payload[uint32Size : size+uint32(uint32Size)] diff --git a/frac/active2/indexer_test.go b/frac/active2/indexer_test.go index d8566e9a..d2b69c4e 100644 --- a/frac/active2/indexer_test.go +++ b/frac/active2/indexer_test.go @@ -111,7 +111,7 @@ func BenchmarkMerge(b *testing.B) { wg.Wait() b.StartTimer() - active.merger.MergeAll() + active.merger.ForceMergeAll() } } @@ -137,7 +137,7 @@ func BenchmarkFullWrite(b *testing.B) { processor := getTestProcessor() - n := 10 + n := 2 allDocs := make([][]byte, 0, len(readers)*n) allMeta := make([][]byte, 0, len(readers)*n) diff --git a/frac/active2/iterators.go b/frac/active2/iterators.go index 025b2393..22291eba 100644 --- a/frac/active2/iterators.go +++ b/frac/active2/iterators.go @@ -2,40 +2,52 @@ package active2 import "github.com/ozontech/seq-db/seq" -type IOrderedIterator[T any] interface { - Next() (T, bool) +// OrderedStream - interface for iterators with ordered elements +type OrderedStream[T any] interface { + Next() (T, bool) // Returns the next element and a flag indicating if an element exists } -func MergeKSortIterators[T any](src []IOrderedIterator[T], cmp func(T, T) int) IOrderedIterator[T] { +// MergeSortedStreams - performs K-way merging of sorted iterators (merge sort at iterator level) +// Uses a "divide and conquer" strategy for efficient merging +func MergeSortedStreams[T any](src []OrderedStream[T], cmp 
func(T, T) int) OrderedStream[T] { n := len(src) + // Base case of recursion: if there's only one iterator if n == 1 { return src[0] } + // Recursively split the iterator array in half h := n / 2 - src1 := MergeKSortIterators(src[:h], cmp) - src2 := MergeKSortIterators(src[h:], cmp) - return NewMergeIterator(src1, src2, cmp) + src1 := MergeSortedStreams(src[:h], cmp) // Left half + src2 := MergeSortedStreams(src[h:], cmp) // Right half + // Merge the two sorted halves + return NewTwoWayMergeStream(src1, src2, cmp) } -type MergeIterator[T any] struct { - v1, v2 T - has1, has2 bool - src1, src2 IOrderedIterator[T] - cmp func(T, T) int +// TwoWayMergeStream - implementation of an iterator for merging two sorted streams +type TwoWayMergeStream[T any] struct { + v1, v2 T // Current values from each source + has1, has2 bool // Flags indicating the presence of current values + src1, src2 OrderedStream[T] // Source iterators + cmp func(T, T) int // Comparison function for sorting } -func NewMergeIterator[T any](src1, src2 IOrderedIterator[T], cmp func(T, T) int) *MergeIterator[T] { - r := MergeIterator[T]{ +// NewTwoWayMergeStream - constructor for MergeIterator +// Initializes the iterator and prefetches the first values from both sources +func NewTwoWayMergeStream[T any](src1, src2 OrderedStream[T], cmp func(T, T) int) *TwoWayMergeStream[T] { + r := TwoWayMergeStream[T]{ src1: src1, src2: src2, cmp: cmp, } + // Prefetch the first values to enable comparison r.v1, r.has1 = r.src1.Next() r.v2, r.has2 = r.src2.Next() return &r } -func (s *MergeIterator[T]) Next() (v T, has bool) { +// Next - returns the next element when merging two sorted streams +// Algorithm is similar to merging in mergesort, but works with streams +func (s *TwoWayMergeStream[T]) Next() (v T, has bool) { if s.has1 && s.has2 { if s.cmp(s.v1, s.v2) < 0 { v = s.v1 @@ -59,85 +71,109 @@ func (s *MergeIterator[T]) Next() (v T, has bool) { return v, false } -type IDIteratorItem struct { - i int - id seq.ID - 
pos seq.DocPos +// DocRef - item of the document identifier iterator +// Contains information about the document's position in the index +type DocRef struct { + i int // Stream index (for identifying the source) + id seq.ID // Document identifier + pos seq.DocPos // Document position } -type IDIterator struct { - i int - offset int - idx *memIndex - posMap []seq.DocPos +// DocStream - iterator over the array of document identifiers in memIndex +type DocStream struct { + i int // Stream index (source identifier) + offset int // Current position in the ids array + idx *memIndex // Reference to the in-memory index + posMap []seq.DocPos // Map of document positions } -func (it *IDIterator) Next() (v IDIteratorItem, has bool) { +// Next - returns the next document ID from memIndex +func (it *DocStream) Next() (v DocRef, has bool) { + // Check if we haven't exceeded the bounds of the identifiers array if it.offset < len(it.idx.ids) { v.i = it.i v.id = it.idx.ids[it.offset] v.pos = it.posMap[it.offset] has = true - it.offset++ + it.offset++ // Move pointer for the next call } return v, has } -type TokenIteratorPayload struct { - idx *memIndex - lidsMap []uint32 +// TokenContext - shared data for the token iterator +// Contains a reference to the index and local identifier mapping +type TokenContext struct { + idx *memIndex // In-memory index + lidsMap []uint32 // Local identifiers map } -type TokenIteratorItem struct { - tid uint32 - fid uint32 - payload *TokenIteratorPayload +// TokenRef - item of the token iterator +// Represents a single token with metadata +type TokenRef struct { + tid uint32 // Token identifier + fid uint32 // Field identifier + payload *TokenContext // Shared data } -func (i *TokenIteratorItem) Field() []byte { +// Field - returns the field name by its identifier +func (i *TokenRef) Field() []byte { return i.payload.idx.fields[i.fid] } -func (i *TokenIteratorItem) Value() []byte { +// Value - returns the token value by its identifier +func (i 
*TokenRef) Value() []byte { return i.payload.idx.tokens[i.tid] } -func (i *TokenIteratorItem) LIDs() []uint32 { +// LIDs - returns the list of local identifiers for the token +func (i *TokenRef) LIDs() []uint32 { return i.payload.idx.tokenLIDs[i.tid] } -func (i *TokenIteratorItem) lidsMap() []uint32 { +// lidsMap - returns the local identifiers map +func (i *TokenRef) lidsMap() []uint32 { return i.payload.lidsMap } -type TokenIterator struct { - tid uint32 - fid uint32 - fieldLastTID uint32 - payload TokenIteratorPayload +// TokenStream - iterator over tokens in the index +// Iterates through tokens grouped by fields +type TokenStream struct { + tid uint32 // Current token identifier + fid uint32 // Current field identifier + fieldLastTID uint32 // Last TID of the current field + payload TokenContext // Iterator shared data } -func NewTokenIterator(idx *memIndex, lidsMap []uint32) *TokenIterator { - return &TokenIterator{ +// NewTokenStream - constructor for TokenIterator +// Initializes the iterator with starting values +func NewTokenStream(idx *memIndex, lidsMap []uint32) *TokenStream { + return &TokenStream{ + // Calculate the last TID for the first field fieldLastTID: idx.fieldsTokens[string(idx.fields[0])].count - 1, - payload: TokenIteratorPayload{ + payload: TokenContext{ idx: idx, lidsMap: lidsMap, }, } } -func (it *TokenIterator) Next() (v TokenIteratorItem, has bool) { +// Next - returns the next token from the index +// Sequentially iterates through tokens, switching between fields +func (it *TokenStream) Next() (v TokenRef, has bool) { + // Check if we haven't exceeded the bounds of the tokens array if int(it.tid) < len(it.payload.idx.tokens) { v.tid = uint32(it.tid) v.fid = uint32(it.fid) v.payload = &it.payload has = true - it.tid++ + it.tid++ // Move to the next token + // Check if we've reached the end of the current field if it.tid > it.fieldLastTID { - it.fid++ + it.fid++ // Move to the next field + // If there's a next field, update the boundary 
for the new field if int(it.fid) < len(it.payload.idx.fields) { + // Sum the token counts of fields to get the new boundary it.fieldLastTID += it.payload.idx.fieldsTokens[string(it.payload.idx.fields[it.fid])].count } } diff --git a/frac/active2/mem_index.go b/frac/active2/mem_index.go index 02ec7144..dfb04d3a 100644 --- a/frac/active2/mem_index.go +++ b/frac/active2/mem_index.go @@ -32,7 +32,7 @@ type memIndex struct { } func newMemIndex() *memIndex { - res, release := AcquireResources() + res, release := NewResources() return &memIndex{ res: res, release: release, @@ -70,6 +70,8 @@ func (idx *memIndex) GetLIDByID(id seq.ID) (uint32, bool) { } func (idx *memIndex) Release() { - idx.wg.Wait() - idx.release() + go func() { // non blocking call + idx.wg.Wait() + idx.release() + }() } diff --git a/frac/active2/mem_index_pool.go b/frac/active2/mem_index_pool.go index 0bf04586..15b9ac30 100644 --- a/frac/active2/mem_index_pool.go +++ b/frac/active2/mem_index_pool.go @@ -8,57 +8,77 @@ import ( "github.com/ozontech/seq-db/frac" ) -// memIndexExt contains index metadata for merge management -type memIndexExt struct { - id uint64 // unique runtime ID - index *memIndex // actual index - gen int +// indexEntry is an internal structure that describes a memIndex +// inside the pool and its state during merge operations. 
+type indexEntry struct { + id uint64 // unique runtime ID of the index + index *memIndex // pointer to the actual index + gen int // generation, used for merge management } +// memIndexPool manages the lifecycle of in-memory indexes: +// - keeps indexes ready for use +// - tracks indexes currently participating in merge +// - provides consistent snapshots for readers type memIndexPool struct { - mu sync.RWMutex - info *frac.Info - indexes []*memIndex - readyToMerge map[uint64]memIndexExt - underMerging map[uint64]memIndexExt + mu sync.RWMutex // protects all fields below + info *frac.Info // aggregated information for all indexes - counter atomic.Uint64 // atomic counter for generating index IDs + ready map[uint64]indexEntry // indexes ready to be merged + merging map[uint64]indexEntry // indexes currently being merged + + // readable is a flat list of indexes available for reading. + // It contains both ready and merging indexes. + readable []*memIndex + + nextID atomic.Uint64 // atomic counter for generating index IDs } +// NewIndexPool creates a new index pool func NewIndexPool(info *frac.Info) *memIndexPool { return &memIndexPool{ - info: info, - readyToMerge: make(map[uint64]memIndexExt), - underMerging: make(map[uint64]memIndexExt), + info: info, + ready: make(map[uint64]indexEntry), + merging: make(map[uint64]indexEntry), } } +// indexSnapshot represents a consistent snapshot of the pool state. +// It is used to safely read indexes without holding the pool lock. type indexSnapshot struct { - info *frac.Info - indexes []*memIndex + info *frac.Info // copy of aggregated info + indexes []*memIndex // indexes available for reading } +// Snapshot returns a snapshot and a release function. +// While the snapshot is alive, indexes are protected from being released via wg. 
func (p *memIndexPool) Snapshot() (*indexSnapshot, func()) { p.mu.RLock() defer p.mu.RUnlock() - info := *p.info // copy + // Copy info so the snapshot is immutable + info := *p.info + iss := indexSnapshot{ info: &info, - indexes: make([]*memIndex, len(p.indexes)), + indexes: make([]*memIndex, len(p.readable)), } - for i, idx := range p.indexes { + + // Increment usage counter for each index + for i, idx := range p.readable { iss.indexes[i] = idx idx.wg.Add(1) } return &iss, func() { + // release function decrements wg counters for _, idx := range iss.indexes { idx.wg.Done() } } } +// Info returns a copy of the aggregated pool information func (p *memIndexPool) Info() *frac.Info { p.mu.RLock() defer p.mu.RUnlock() @@ -67,10 +87,12 @@ func (p *memIndexPool) Info() *frac.Info { return &info } +// Add adds a new memIndex to the pool and updates aggregated statistics func (p *memIndexPool) Add(idx *memIndex, docsLen, metaLen uint64) { maxMID := idx.ids[0].MID minMID := idx.ids[len(idx.ids)-1].MID - item := p.wrapIndex(idx, 0) + + entry := p.newEntry(idx, 0) p.mu.Lock() defer p.mu.Unlock() @@ -81,85 +103,102 @@ func (p *memIndexPool) Add(idx *memIndex, docsLen, metaLen uint64) { if p.info.To < maxMID { p.info.To = maxMID } + p.info.DocsRaw += idx.docsSize p.info.DocsTotal += idx.docsCount p.info.DocsOnDisk += docsLen p.info.MetaOnDisk += metaLen - p.readyToMerge[item.id] = item - p.indexes = append(p.indexes, idx) + p.ready[entry.id] = entry + p.readable = append(p.readable, idx) } -func (p *memIndexPool) ReadyToMerge() []memIndexExt { +// ReadyToMerge returns indexes that can be taken for merge (returns a copy without modifying the pool state) +func (p *memIndexPool) ReadyToMerge() []indexEntry { p.mu.RLock() defer p.mu.RUnlock() - items := make([]memIndexExt, 0, len(p.readyToMerge)) - for _, item := range p.readyToMerge { - items = append(items, item) + entries := make([]indexEntry, 0, len(p.ready)) + for _, entry := range p.ready { + entries = append(entries, entry) 
} - return items + return entries } -// markAsMerging moves indexes from "ready" to "merging" state -func (p *memIndexPool) markAsMerging(items []memIndexExt) { +// takeForMerge moves indexes from the "ready" state to the "merging" state +func (p *memIndexPool) takeForMerge(entries []indexEntry) { p.mu.Lock() defer p.mu.Unlock() - for _, item := range items { - delete(p.readyToMerge, item.id) - p.underMerging[item.id] = item + for _, entry := range entries { + delete(p.ready, entry.id) + p.merging[entry.id] = entry } } -func (p *memIndexPool) replace(old []memIndexExt, newIndex *memIndex) { - gen := 0 - for _, item := range old { - gen += item.gen - } - merged := p.wrapIndex(newIndex, gen/len(old)+1) +// replace replaces several old indexes with a single merged index +func (p *memIndexPool) replace(old []indexEntry, merged *memIndex) { + newEntry := p.newEntry(merged, avgGeneration(old)+1) + + defer func() { + for _, entry := range old { + entry.index.Release() + } + }() p.mu.Lock() defer p.mu.Unlock() - for _, eIndex := range old { - delete(p.underMerging, eIndex.id) + for _, entry := range old { + delete(p.merging, entry.id) } - p.readyToMerge[merged.id] = merged + p.ready[newEntry.id] = newEntry - // Rebuild the index list for reading - p.indexes = p.indexes[:0] - p.indexes = slices.Grow(p.indexes, len(p.readyToMerge)+len(p.underMerging)) + p.rebuildReadable() +} - for _, item := range p.readyToMerge { - p.indexes = append(p.indexes, item.index) // add all ready indexes - } - for _, item := range p.underMerging { - p.indexes = append(p.indexes, item.index) // add indexes currently being merged +// avgGeneration calculates the average generation of indexes +func avgGeneration(entries []indexEntry) int { + gen := 0 + for _, entry := range entries { + gen += entry.gen } + return gen / len(entries) +} - go func() { - // todo do we need wg here? 
- for _, item := range old { - item.index.Release() - } - }() +// rebuildReadable rebuilds the list of indexes available for reading (ready + merging) +func (p *memIndexPool) rebuildReadable() { + p.readable = p.readable[:0] + p.readable = slices.Grow(p.readable, len(p.ready)+len(p.merging)) + + for _, entry := range p.ready { + p.readable = append(p.readable, entry.index) + } + for _, entry := range p.merging { + p.readable = append(p.readable, entry.index) + } } +// Release fully releases the pool and all contained indexes func (p *memIndexPool) Release() { - p.mu.RLock() - indexes := p.indexes - p.mu.RUnlock() + p.mu.Lock() + defer p.mu.Unlock() - for _, idx := range indexes { + for _, idx := range p.readable { idx.Release() } + + p.info = nil + p.readable = nil + p.ready = nil + p.merging = nil } -func (p *memIndexPool) wrapIndex(index *memIndex, gen int) memIndexExt { - return memIndexExt{ - id: p.counter.Add(1), +// newEntry creates a new indexEntry with a unique ID +func (p *memIndexPool) newEntry(index *memIndex, gen int) indexEntry { + return indexEntry{ + id: p.nextID.Add(1), gen: gen, index: index, } diff --git a/frac/active2/merge.go b/frac/active2/merge.go index 71fa81df..4a33899a 100644 --- a/frac/active2/merge.go +++ b/frac/active2/merge.go @@ -18,12 +18,12 @@ func mergeIndexes(indexes []*memIndex) *memIndex { blocksCount += len(idx.blocksOffsets) } - res, release := AcquireResources() + res, release := NewResources() defer release() - dst.ids = dst.res.AllocIDs(int(dst.docsCount))[:0] - dst.positions = dst.res.AllocDocPos(int(dst.docsCount))[:0] - dst.blocksOffsets = dst.res.AllocUint64s(blocksCount)[:0] + dst.ids = dst.res.GetIDs(int(dst.docsCount))[:0] + dst.positions = dst.res.GetDocPos(int(dst.docsCount))[:0] + dst.blocksOffsets = dst.res.GetUint64s(blocksCount)[:0] posMap := mergeBlocksOffsets(dst, res, indexes) lidsMap := mergeIDs(dst, res, indexes, posMap) @@ -35,18 +35,18 @@ func mergeIndexes(indexes []*memIndex) *memIndex { } func 
mergeIDs(dst *memIndex, res *Resources, indexes []*memIndex, posMap [][]seq.DocPos) [][]uint32 { - lidsMap := res.AllocUint32Slices(len(indexes)) - iters := make([]IOrderedIterator[IDIteratorItem], len(indexes)) + lidsMap := res.GetUint32Slices(len(indexes)) + iters := make([]OrderedStream[DocRef], len(indexes)) for i, idx := range indexes { - iters[i] = &IDIterator{ + iters[i] = &DocStream{ i: i, idx: idx, posMap: posMap[i], } - lidsMap[i] = res.uint32s.AllocSlice(int(idx.docsCount) + 1)[:1] // 1-based + lidsMap[i] = res.GetUint32s(int(idx.docsCount) + 1)[:1] // 1-based } - orderedIDs := MergeKSortIterators(iters, func(a, b IDIteratorItem) int { return seq.Compare(b.id, a.id) }) + orderedIDs := MergeSortedStreams(iters, func(a, b DocRef) int { return seq.Compare(b.id, a.id) }) cur, has := orderedIDs.Next() for has { @@ -61,13 +61,13 @@ func mergeIDs(dst *memIndex, res *Resources, indexes []*memIndex, posMap [][]seq func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][]uint32) { totalTokens := 0 - tokensIterators := make([]IOrderedIterator[TokenIteratorItem], len(indexes)) + tokensIterators := make([]OrderedStream[TokenRef], len(indexes)) for i, idx := range indexes { totalTokens += len(idx.tokens) - tokensIterators[i] = NewTokenIterator(idx, lidsMap[i]) + tokensIterators[i] = NewTokenStream(idx, lidsMap[i]) } - cmpToken := func(a, b TokenIteratorItem) int { + cmpToken := func(a, b TokenRef) int { r := bytes.Compare(a.Field(), b.Field()) if r == 0 { return bytes.Compare(a.Value(), b.Value()) @@ -75,7 +75,7 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ return r } - orderedTokens := MergeKSortIterators(tokensIterators, cmpToken) + orderedTokens := MergeSortedStreams(tokensIterators, cmpToken) uniqTokensSize := 0 uniqTokensCount := 0 @@ -85,11 +85,11 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ var ( prevField []byte - prevToken TokenIteratorItem + prevToken 
TokenRef ) - borders := res.AllocBytes(totalTokens)[:0] - tokens := make([]TokenIteratorItem, 0, totalTokens) + borders := res.GetBytes(totalTokens)[:0] + tokens := make([]TokenRef, 0, totalTokens) for cur, has := orderedTokens.Next(); has; cur, has = orderedTokens.Next() { var border uint8 @@ -114,18 +114,18 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ } dst.fieldsTokens = make(map[string]tokenRange, uniqFieldsCount) - dst.fields = dst.res.AllocBytesSlices(uniqFieldsCount)[:0] - dst.tokens = dst.res.AllocBytesSlices(uniqTokensCount)[:0] - dst.tokenLIDs = dst.res.AllocUint32Slices(uniqTokensCount)[:0] + dst.fields = dst.res.GetBytesSlices(uniqFieldsCount)[:0] + dst.tokens = dst.res.GetBytesSlices(uniqTokensCount)[:0] + dst.tokenLIDs = dst.res.GetUint32Slices(uniqTokensCount)[:0] - allTokens := dst.res.AllocBytes(uniqTokensSize)[:0] - allFields := dst.res.AllocBytes(uniqFieldsSize)[:0] + allTokens := dst.res.GetBytes(uniqTokensSize)[:0] + allFields := dst.res.GetBytes(uniqFieldsSize)[:0] lidsCollector := NewLIDsCollector( - res.AllocUint32s(int(dst.docsCount)), // tmp buf - dst.res.AllocUint32s(dst.allTokenLIDsCount - int(dst.docsCount))[:0], // all token LIDs - dst.res.AllocUint32s(int(dst.docsCount)), // ALL LIDs for token _all_ - res.AllocBytes((int(dst.docsCount) + 1)), // sort buffer + res.GetUint32s(int(dst.docsCount)), // tmp buf + dst.res.GetUint32s(dst.allTokenLIDsCount - int(dst.docsCount))[:0], // all token LIDs + dst.res.GetUint32s(int(dst.docsCount)), // ALL LIDs for token _all_ + res.GetBytes((int(dst.docsCount) + 1)), // sort buffer ) var isAllToken bool @@ -166,7 +166,7 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ if isAllToken { for range token.LIDs() { - lidsCollector.Add(0) + lidsCollector.Add(0) // stub } } else { newLIDsMap := token.lidsMap() @@ -183,7 +183,6 @@ func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][ tr := 
dst.fieldsTokens[fieldStr] tr.count = tid - tr.start + 1 dst.fieldsTokens[fieldStr] = tr - } type LIDsCollector struct { @@ -218,7 +217,7 @@ func (s *LIDsCollector) GetSorted() (dst []uint32) { return s.all } - if n > 16_000 { + if 100*n/len(s.all) > 50 { for _, v := range s.tmp { s.buf[v] = 1 } @@ -244,10 +243,10 @@ func (s *LIDsCollector) GetSorted() (dst []uint32) { func mergeBlocksOffsets(dst *memIndex, res *Resources, indexes []*memIndex) [][]seq.DocPos { var offset uint32 - positions := res.AllocDocPosSlices(len(indexes)) + positions := res.GetDocPosSlices(len(indexes)) for i, index := range indexes { dst.blocksOffsets = append(dst.blocksOffsets, index.blocksOffsets...) - positions[i] = res.AllocDocPos(len(index.positions))[:0] + positions[i] = res.GetDocPos(len(index.positions))[:0] for _, p := range index.positions { oldIdx, docOffset := p.Unpack() positions[i] = append(positions[i], seq.PackDocPos(oldIdx+offset, docOffset)) diff --git a/frac/active2/merge_manager.go b/frac/active2/merge_manager.go index 4ecf8ce9..0addef5c 100644 --- a/frac/active2/merge_manager.go +++ b/frac/active2/merge_manager.go @@ -7,72 +7,74 @@ import ( "go.uber.org/zap" ) +// Tuning parameters for index merge strategy const ( - maxGenerations = 32 - minIndexesToMerge = 16 // minimum number of indexes to trigger merge - forceMergeThreshold = 4096 // index count threshold for forced merge + maxGenerationBuckets = 32 // Maximum number of generation buckets used for grouping + minMergeBatchSize = 16 // Minimum batch size required to start a merge + forceMergeThreshold = 4096 // Merge all indexes if total count exceeds this limit ) -type Semaphore interface { - Acquire() - Release() - Capacity() int +type WorkerLimiter interface { + Acquire() // Blocks until a worker slot is available + Release() // Frees a previously acquired slot } -// MergeManager manages in-memory index collection and merging -type MergeManager struct { - mu sync.Mutex - wg sync.WaitGroup +// mergeManager coordinates 
background merging of in-memory indexes +type mergeManager struct { + mu sync.Mutex // Protects internal state + wg sync.WaitGroup // Tracks active merge jobs - stopped bool - indexes *memIndexPool + stopped bool // Indicates shutdown state + indexPool *memIndexPool // Source of indexes to be merged - workerPool Semaphore - mergeCh chan struct{} // channel to trigger merge process + mergeWorkers WorkerLimiter // Limits parallel merge execution + mergeSignal chan struct{} // Coalesced signal to trigger merge evaluation } -// NewMergeManager creates a new index manager -func NewMergeManager(indexes *memIndexPool, workerPool Semaphore) *MergeManager { - m := MergeManager{ - indexes: indexes, - workerPool: workerPool, - mergeCh: make(chan struct{}, 1), +// newMergeManager initializes merge manager and starts merge loop +func newMergeManager(indexes *memIndexPool, workerPool WorkerLimiter) *mergeManager { + m := mergeManager{ + indexPool: indexes, + mergeWorkers: workerPool, + mergeSignal: make(chan struct{}, 1), } - // Start background goroutine for merge scheduling - go m.mergeScheduler() + // Background goroutine responsible for scheduling merges + go m.mergeLoop() return &m } -// Stop shuts down the index manager and waits for current operations to complete -func (m *MergeManager) Stop() { +// Stop gracefully stops the manager and waits for ongoing merges +func (m *mergeManager) Stop() { m.mu.Lock() defer m.mu.Unlock() m.stopped = true - // Wait for all current merge operations to complete + // Ensure all in-flight merges are completed m.wg.Wait() - close(m.mergeCh) + close(m.mergeSignal) } -// MergeAll performs full merge of all available indexes -func (m *MergeManager) MergeAll() { +// ForceMergeAll performs full merge of all available indexes +func (m *mergeManager) ForceMergeAll() { m.mu.Lock() defer m.mu.Unlock() + // Ensure no background merges are running m.wg.Wait() - if toMerge := m.indexes.ReadyToMerge(); len(toMerge) > 1 { - logger.Debug("merge all 
mini-indexes", zap.Int("batch", len(toMerge))) - m.indexes.markAsMerging(toMerge) - merged := mergeIndexes(extractIndexes(toMerge)) - m.indexes.replace(toMerge, merged) + if batch := m.indexPool.ReadyToMerge(); len(batch) > 1 { + logger.Debug("force merge all indexes", zap.Int("batch", len(batch))) + m.indexPool.takeForMerge(batch) + merged := mergeIndexes(unwrapIndexes(batch)) + m.indexPool.replace(batch, merged) } } -func extractIndexes(items []memIndexExt) []*memIndex { +// unwrapIndexes extracts raw memIndex pointers from wrappers +func unwrapIndexes(items []indexEntry) []*memIndex { result := make([]*memIndex, 0, len(items)) for _, item := range items { result = append(result, item.index) @@ -80,50 +82,55 @@ func extractIndexes(items []memIndexExt) []*memIndex { return result } -func (m *MergeManager) mergeScheduler() { - for range m.mergeCh { - m.workerPool.Acquire() // wait for a free worker +// mergeLoop continuously reacts to merge signals and schedules work +func (m *mergeManager) mergeLoop() { + for range m.mergeSignal { + m.mergeWorkers.Acquire() // wait for a free worker m.mu.Lock() if m.stopped { m.mu.Unlock() - m.workerPool.Release() + m.mergeWorkers.Release() continue } - batch := pickToMerge(m.indexes.ReadyToMerge(), minIndexesToMerge) + // Decide which indexes are worth merging right now + batch := selectMergeBatch(m.indexPool.ReadyToMerge(), minMergeBatchSize) if len(batch) == 0 { m.mu.Unlock() - m.workerPool.Release() + m.mergeWorkers.Release() continue } - m.indexes.markAsMerging(batch) + m.indexPool.takeForMerge(batch) m.wg.Add(1) // important to inc wg inside the lock m.mu.Unlock() - logger.Debug("merge indexes", zap.Int("gen", batch[0].gen), zap.Int("batch", len(batch))) + logger.Debug("merge indexes", zap.Int("generation", batch[0].gen), zap.Int("size", len(batch))) - go func() { - merged := mergeIndexes(extractIndexes(batch)) - m.workerPool.Release() - m.indexes.replace(batch, merged) - m.triggerMerge() // check if new merge is needed - 
m.wg.Done() - }() + go func(batch []indexEntry) { + defer m.wg.Done() + defer m.mergeWorkers.Release() + + merged := mergeIndexes(unwrapIndexes(batch)) + m.indexPool.replace(batch, merged) + m.requestMerge() // re-check if further merges are possible + }(batch) } } -func (m *MergeManager) triggerMerge() { +// requestMerge schedules a merge check if one is not already pending +func (m *mergeManager) requestMerge() { select { - case m.mergeCh <- struct{}{}: + case m.mergeSignal <- struct{}{}: default: - // Trigger already set, no need for additional notification + // Merge signal already pending; avoid redundant wakeups } } -func pickToMerge(items []memIndexExt, minBatchSize int) []memIndexExt { +// selectMergeBatch chooses an optimal merge candidate batch +func selectMergeBatch(items []indexEntry, minBatchSize int) []indexEntry { if len(items) < minBatchSize { return nil } @@ -132,18 +139,19 @@ func pickToMerge(items []memIndexExt, minBatchSize int) []memIndexExt { return items } - batch := largestBatch(items) + batch := largestGenerationGroup(items) if len(batch) < minBatchSize { return nil } return batch } -func largestBatch(items []memIndexExt) []memIndexExt { +// largestGenerationGroup returns the biggest generation-aligned batch +func largestGenerationGroup(items []indexEntry) []indexEntry { maxGen := 0 - batches := make([][]memIndexExt, maxGenerations) + batches := make([][]indexEntry, maxGenerationBuckets) for _, item := range items { - gen := min(maxGenerations, item.gen) + gen := min(maxGenerationBuckets, item.gen) batches[gen] = append(batches[gen], item) if len(batches[gen]) > len(batches[maxGen]) || len(batches[gen]) == len(batches[maxGen]) && gen > maxGen { maxGen = gen diff --git a/frac/active2/resources.go b/frac/active2/resources.go index 9d587b1c..b87e73b9 100644 --- a/frac/active2/resources.go +++ b/frac/active2/resources.go @@ -11,7 +11,7 @@ var ( tokenKeyPool = resources.NewSizedPool[tokenStr](24) indexerMetaDataPool = 
resources.NewSizedPool[indexer.MetaData](24) docPosSlicesPool = resources.NewSizedPool[[]seq.DocPos](24) - bufPool = resources.TypedPool[*indexBuffer]{} + bufPool = resources.TypedPool[*indexerBuffer]{} resPool = resources.TypedPool[*Resources]{} ) @@ -22,18 +22,18 @@ type Resources struct { uint32s resources.SliceOnBytes[uint32] uint64s resources.SliceOnBytes[uint64] - bytes resources.SliceAllocator[byte] - bytesSlices resources.SliceAllocator[[]byte] - uint32Slices resources.SliceAllocator[[]uint32] - tokenKeys resources.SliceAllocator[tokenStr] - indexerMetaData resources.SliceAllocator[indexer.MetaData] - buf resources.ObjectAllocator[indexBuffer] + bytes resources.SlicesPool[byte] + bytesSlices resources.SlicesPool[[]byte] + uint32Slices resources.SlicesPool[[]uint32] + tokenKeys resources.SlicesPool[tokenStr] + indexerMetaData resources.SlicesPool[indexer.MetaData] + buf resources.ObjectsPool[indexerBuffer] ids resources.SliceOnBytes[seq.ID] docPos resources.SliceOnBytes[seq.DocPos] - docPosSlices resources.SliceAllocator[[]seq.DocPos] + docPosSlices resources.SlicesPool[[]seq.DocPos] } -func AcquireResources() (*Resources, func()) { +func NewResources() (*Resources, func()) { r, ok := resPool.Get() if !ok { s := resources.CallStack{} @@ -47,10 +47,10 @@ func AcquireResources() (*Resources, func()) { bytesSlices: resources.NewBytesSlices(&s), ids: resources.NewSliceOnBytes[seq.ID](&s), docPos: resources.NewSliceOnBytes[seq.DocPos](&s), - docPosSlices: resources.NewSliceAllocator(&docPosSlicesPool, &s), - indexerMetaData: resources.NewSliceAllocator(&indexerMetaDataPool, &s), - tokenKeys: resources.NewSliceAllocator(&tokenKeyPool, &s), - buf: resources.NewObjectAllocator(&bufPool, &s), + docPosSlices: resources.NewSlicesPool(&docPosSlicesPool, &s), + indexerMetaData: resources.NewSlicesPool(&indexerMetaDataPool, &s), + tokenKeys: resources.NewSlicesPool(&tokenKeyPool, &s), + buf: resources.NewObjectsPool(&bufPool, &s), } } return r, func() { @@ -59,56 +59,56 
@@ func AcquireResources() (*Resources, func()) { } } -func (r *Resources) AllocBytesSlices(s int) [][]byte { - return r.bytesSlices.AllocSlice(s) +func (r *Resources) GetBytesSlices(s int) [][]byte { + return r.bytesSlices.GetSlice(s) } -func (r *Resources) AllocBytes(s int) []byte { - return r.bytes.AllocSlice(s) +func (r *Resources) GetBytes(s int) []byte { + return r.bytes.GetSlice(s) } -func (r *Resources) AllocUint32s(s int) []uint32 { - return r.uint32s.AllocSlice(s) +func (r *Resources) GetUint32s(s int) []uint32 { + return r.uint32s.GetSlice(s) } -func (r *Resources) AllocIDs(s int) []seq.ID { - return r.ids.AllocSlice(s) +func (r *Resources) GetIDs(s int) []seq.ID { + return r.ids.GetSlice(s) } -func (r *Resources) AllocDocPos(s int) []seq.DocPos { - return r.docPos.AllocSlice(s) +func (r *Resources) GetDocPos(s int) []seq.DocPos { + return r.docPos.GetSlice(s) } -func (r *Resources) AllocDocPosSlices(s int) [][]seq.DocPos { - return r.docPosSlices.AllocSlice(s) +func (r *Resources) GetDocPosSlices(s int) [][]seq.DocPos { + return r.docPosSlices.GetSlice(s) } -func (r *Resources) AllocUint64s(s int) []uint64 { - return r.uint64s.AllocSlice(s) +func (r *Resources) GetUint64s(s int) []uint64 { + return r.uint64s.GetSlice(s) } -func (r *Resources) AllocUint32Slices(s int) [][]uint32 { - return r.uint32Slices.AllocSlice(s) +func (r *Resources) GetUint32Slices(s int) [][]uint32 { + return r.uint32Slices.GetSlice(s) } -func (r *Resources) AllocMetadata(s int) []indexer.MetaData { - return r.indexerMetaData.AllocSlice(s) +func (r *Resources) GetMetadata(s int) []indexer.MetaData { + return r.indexerMetaData.GetSlice(s) } -func (r *Resources) AllocTokens(s int) []tokenStr { - return r.tokenKeys.AllocSlice(s) +func (r *Resources) GetTokens(s int) []tokenStr { + return r.tokenKeys.GetSlice(s) } -func (r *Resources) Buffer() *indexBuffer { - return r.buf.Alloc(func() *indexBuffer { - return &indexBuffer{ +func (r *Resources) GetBuffer() *indexerBuffer { + return 
r.buf.Get(func() *indexerBuffer { + return &indexerBuffer{ sizes: make([]uint32, 0, 1000), fields: make([]string, 0, 100), fieldTIDs: make([]uint32, 0, 100), tokens: make([]tokenizer.MetaToken, 0, 100), tokenMap: make(map[tokenStr]uint32, 1000), } - }, func(b *indexBuffer) { + }, func(b *indexerBuffer) { b.fields = b.fields[:0] b.tokens = b.tokens[:0] b.fieldTIDs = b.fieldTIDs[:0] @@ -116,13 +116,3 @@ func (r *Resources) Buffer() *indexBuffer { clear(b.tokenMap) }) } - -// indexBuffer is a temporary buffer used during index construction to avoid allocations. -// It holds intermediate data structures that are needed during processing but not in the final index. -type indexBuffer struct { - sizes []uint32 - fields []string - fieldTIDs []uint32 - tokens []tokenizer.MetaToken - tokenMap map[tokenStr]uint32 -} diff --git a/frac/active2/sealing_source.go b/frac/active2/sealing_source.go index b5f9a685..14004eb9 100644 --- a/frac/active2/sealing_source.go +++ b/frac/active2/sealing_source.go @@ -29,7 +29,7 @@ type SealingSource struct { } func NewSealingSource(a *Active2, params frac.SealParams) (sealing.Source, error) { - a.merger.MergeAll() + a.merger.ForceMergeAll() iss, release := a.indexes.Snapshot() defer release() diff --git a/resources/object_allocator.go b/resources/object_allocator.go index 2603f3d5..66262c0e 100644 --- a/resources/object_allocator.go +++ b/resources/object_allocator.go @@ -1,18 +1,18 @@ package resources -type MapAllocator[K comparable, V any] struct { +type MapsPool[K comparable, V any] struct { pool *TypedPool[map[K]V] releases *CallStack } -func NewMapAllocator[K comparable, V any](pool *TypedPool[map[K]V], releases *CallStack) MapAllocator[K, V] { - return MapAllocator[K, V]{ +func NewMapsPool[K comparable, V any](pool *TypedPool[map[K]V], releases *CallStack) MapsPool[K, V] { + return MapsPool[K, V]{ pool: pool, releases: releases, } } -func (a MapAllocator[K, V]) Alloc(size int) map[K]V { +func (a MapsPool[K, V]) Alloc(size int) map[K]V { 
obj, ok := a.pool.Get() if ok { clear(obj) @@ -23,19 +23,19 @@ func (a MapAllocator[K, V]) Alloc(size int) map[K]V { return obj } -type ObjectAllocator[T any] struct { +type ObjectsPool[T any] struct { pool *TypedPool[*T] releases *CallStack } -func NewObjectAllocator[T any](pool *TypedPool[*T], releases *CallStack) ObjectAllocator[T] { - return ObjectAllocator[T]{ +func NewObjectsPool[T any](pool *TypedPool[*T], releases *CallStack) ObjectsPool[T] { + return ObjectsPool[T]{ pool: pool, releases: releases, } } -func (a ObjectAllocator[T]) Alloc(newFn func() *T, resetFn func(*T)) *T { +func (a ObjectsPool[T]) Get(newFn func() *T, resetFn func(*T)) *T { obj, ok := a.pool.Get() if ok { resetFn(obj) diff --git a/resources/slice_allocator.go b/resources/slice_allocator.go index cd76f2b3..67485688 100644 --- a/resources/slice_allocator.go +++ b/resources/slice_allocator.go @@ -1,34 +1,34 @@ package resources -func NewBytes(releases *CallStack) SliceAllocator[byte] { - return NewSliceAllocator(&BytesPool, releases) +func NewBytes(releases *CallStack) SlicesPool[byte] { + return NewSlicesPool(&BytesPool, releases) } -func NewStrings(releases *CallStack) SliceAllocator[string] { - return NewSliceAllocator(&StringsPool, releases) +func NewStrings(releases *CallStack) SlicesPool[string] { + return NewSlicesPool(&StringsPool, releases) } -func NewUint32Slices(releases *CallStack) SliceAllocator[[]uint32] { - return NewSliceAllocator(&Uint32SlicesPool, releases) +func NewUint32Slices(releases *CallStack) SlicesPool[[]uint32] { + return NewSlicesPool(&Uint32SlicesPool, releases) } -func NewBytesSlices(releases *CallStack) SliceAllocator[[]byte] { - return NewSliceAllocator(&BytesSlicesPool, releases) +func NewBytesSlices(releases *CallStack) SlicesPool[[]byte] { + return NewSlicesPool(&BytesSlicesPool, releases) } -type SliceAllocator[T any] struct { +type SlicesPool[T any] struct { pool *SizedPool[T] releases *CallStack } -func NewSliceAllocator[T any](pool *SizedPool[T], 
releases *CallStack) SliceAllocator[T] { - return SliceAllocator[T]{ +func NewSlicesPool[T any](pool *SizedPool[T], releases *CallStack) SlicesPool[T] { + return SlicesPool[T]{ pool: pool, releases: releases, } } -func (a SliceAllocator[T]) AllocSlice(size int) []T { +func (a SlicesPool[T]) GetSlice(size int) []T { data := a.pool.Get(size) a.releases.Defer(func() { a.pool.Put(data) }) return data[:size] diff --git a/resources/slice_on_bytes.go b/resources/slice_on_bytes.go index f2a18ffe..8965d46a 100644 --- a/resources/slice_on_bytes.go +++ b/resources/slice_on_bytes.go @@ -22,7 +22,7 @@ func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { } } -func (a SliceOnBytes[T]) AllocSlice(size int) []T { +func (a SliceOnBytes[T]) GetSlice(size int) []T { data, buf := a.getBuf(size) a.releases.Defer(func() { a.pool.Put(buf) }) return data From b858ddb32a0d69a9302fce0a3293ce653d8de657 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Wed, 24 Dec 2025 20:46:40 +0300 Subject: [PATCH 13/28] replace old active with a new one --- frac/{active2 => active}/active2.go | 38 +- frac/{active2 => active}/data_provider.go | 60 ++- frac/active/indexer.go | 437 ++++++++++++------ frac/active/indexer_test.go | 164 ++++--- frac/{active2 => active}/iterators.go | 2 +- frac/active/mem_index.go | 99 ++++ frac/{active2 => active}/mem_index_pool.go | 2 +- frac/active/merge.go | 333 +++++++++++++ frac/{active2 => active}/merge_manager.go | 2 +- frac/{active2 => active}/metrics.go | 14 +- frac/{active2 => active}/resources.go | 2 +- frac/active/sealing_source.go | 347 ++++---------- frac/active2/indexer.go | 352 -------------- frac/active2/mem_index.go | 77 --- frac/active2/merge.go | 257 ---------- frac/active2/sealing_source.go | 140 ------ frac/{active => active_old}/active.go | 2 +- frac/{active => active_old}/docs_positions.go | 2 +- frac/{active => active_old}/docs_source.go | 2 +- frac/{active => active_old}/file_writer.go | 2 +- .../file_writer_test.go | 2 +- frac/{active 
=> active_old}/ids.go | 2 +- frac/{active => active_old}/ids_test.go | 2 +- frac/{active => active_old}/index.go | 2 +- frac/active_old/indexer.go | 195 ++++++++ frac/{active2 => active_old}/indexer_test.go | 166 +++---- frac/{active => active_old}/inverser.go | 2 +- .../meta_data_collector.go | 2 +- frac/active_old/sealing_source.go | 323 +++++++++++++ frac/{active => active_old}/token_lids.go | 2 +- frac/{active => active_old}/token_list.go | 2 +- frac/{active => active_old}/writer.go | 2 +- frac/tests/fraction_test.go | 37 +- fracmanager/fracmanager.go | 3 +- fracmanager/fraction_provider.go | 23 +- fracmanager/fraction_provider_test.go | 4 +- fracmanager/sealer_test.go | 8 +- 37 files changed, 1641 insertions(+), 1470 deletions(-) rename frac/{active2 => active}/active2.go (87%) rename frac/{active2 => active}/data_provider.go (60%) rename frac/{active2 => active}/iterators.go (99%) create mode 100644 frac/active/mem_index.go rename frac/{active2 => active}/mem_index_pool.go (99%) create mode 100644 frac/active/merge.go rename frac/{active2 => active}/merge_manager.go (99%) rename frac/{active2 => active}/metrics.go (78%) rename frac/{active2 => active}/resources.go (99%) delete mode 100644 frac/active2/indexer.go delete mode 100644 frac/active2/mem_index.go delete mode 100644 frac/active2/merge.go delete mode 100644 frac/active2/sealing_source.go rename frac/{active => active_old}/active.go (99%) rename frac/{active => active_old}/docs_positions.go (98%) rename frac/{active => active_old}/docs_source.go (98%) rename frac/{active => active_old}/file_writer.go (99%) rename frac/{active => active_old}/file_writer_test.go (99%) rename frac/{active => active_old}/ids.go (97%) rename frac/{active => active_old}/ids_test.go (98%) rename frac/{active => active_old}/index.go (99%) create mode 100644 frac/active_old/indexer.go rename frac/{active2 => active_old}/indexer_test.go (79%) rename frac/{active => active_old}/inverser.go (98%) rename frac/{active => 
active_old}/meta_data_collector.go (99%) create mode 100644 frac/active_old/sealing_source.go rename frac/{active => active_old}/token_lids.go (99%) rename frac/{active => active_old}/token_list.go (99%) rename frac/{active => active_old}/writer.go (97%) diff --git a/frac/active2/active2.go b/frac/active/active2.go similarity index 87% rename from frac/active2/active2.go rename to frac/active/active2.go index a2bdac09..401cc3d7 100644 --- a/frac/active2/active2.go +++ b/frac/active/active2.go @@ -1,4 +1,4 @@ -package active2 +package active import ( "context" @@ -11,7 +11,7 @@ import ( "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric" @@ -22,7 +22,7 @@ import ( "go.uber.org/zap" ) -type Active2 struct { +type Active struct { Config *frac.Config BaseFileName string @@ -41,7 +41,7 @@ type Active2 struct { metaFile *os.File metaReader storage.DocBlocksReader - writer *active.Writer + writer *active_old.Writer } func New( @@ -51,14 +51,14 @@ func New( readLimiter *storage.ReadLimiter, docsCache *cache.Cache[[]byte], sortCache *cache.Cache[[]byte], -) *Active2 { +) *Active { docsFile, docsStats := util.MustOpenFile(baseFileName+consts.DocsFileSuffix, config.SkipFsync) metaFile, metaStats := util.MustOpenFile(baseFileName+consts.MetaFileSuffix, config.SkipFsync) info := frac.NewInfo(baseFileName, uint64(docsStats.Size()), uint64(metaStats.Size())) indexes := NewIndexPool(info) - f := &Active2{ + f := &Active{ BaseFileName: baseFileName, Config: cfg, indexer: NewIndexer(util.NewSemaphore(workers)), @@ -74,7 +74,7 @@ func New( metaFile: metaFile, metaReader: storage.NewDocBlocksReader(readLimiter, metaFile), - writer: active.NewWriter(docsFile, metaFile, docsStats.Size(), metaStats.Size(), config.SkipFsync), + 
writer: active_old.NewWriter(docsFile, metaFile, docsStats.Size(), metaStats.Size(), config.SkipFsync), } logger.Info("active fraction created", zap.String("fraction", baseFileName)) @@ -82,7 +82,7 @@ func New( return f } -func (f *Active2) Replay(ctx context.Context) error { +func (f *Active) Replay(ctx context.Context) error { info := f.indexes.info @@ -149,7 +149,7 @@ out: return nil } -func (f *Active2) Append(docs, meta []byte, wg *sync.WaitGroup) (err error) { +func (f *Active) Append(docs, meta []byte, wg *sync.WaitGroup) (err error) { sw := stopwatch.New() ma := sw.Start("append") if err = f.writer.Write(docs, meta, sw); err != nil { @@ -171,7 +171,7 @@ func (f *Active2) Append(docs, meta []byte, wg *sync.WaitGroup) (err error) { return nil } -func (f *Active2) AddIndex(idx *memIndex, docsLen, metaLen uint64, err error) { +func (f *Active) AddIndex(idx *memIndex, docsLen, metaLen uint64, err error) { if err != nil { logger.Fatal("bulk indexing error", zap.Error(err)) } @@ -179,11 +179,11 @@ func (f *Active2) AddIndex(idx *memIndex, docsLen, metaLen uint64, err error) { f.merger.requestMerge() } -func (f *Active2) String() string { +func (f *Active) String() string { return frac.FracToString(f, "active") } -func (f *Active2) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { +func (f *Active) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { sw := stopwatch.New() defer sw.Export(fetcherStagesSec) @@ -208,7 +208,7 @@ func (f *Active2) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { return res, nil } -func (f *Active2) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { +func (f *Active) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { ss, release := f.indexes.Snapshot() defer release() @@ -243,19 +243,19 @@ func (f *Active2) Search(ctx context.Context, params processor.SearchParams) (*s return res, nil } -func (f *Active2) Info() *frac.Info { +func (f *Active) Info() 
*frac.Info { return f.indexes.Info() } -func (f *Active2) Contains(id seq.MID) bool { +func (f *Active) Contains(id seq.MID) bool { return f.Info().IsIntersecting(id, id) } -func (f *Active2) IsIntersecting(from, to seq.MID) bool { +func (f *Active) IsIntersecting(from, to seq.MID) bool { return f.Info().IsIntersecting(from, to) } -func (f *Active2) Release() { +func (f *Active) Release() { f.releaseMem() if !f.Config.KeepMetaFile { @@ -269,7 +269,7 @@ func (f *Active2) Release() { } -func (f *Active2) Suicide() { +func (f *Active) Suicide() { f.releaseMem() util.RemoveFile(f.metaFile.Name()) @@ -277,7 +277,7 @@ func (f *Active2) Suicide() { util.RemoveFile(f.BaseFileName + consts.SdocsFileSuffix) } -func (f *Active2) releaseMem() { +func (f *Active) releaseMem() { f.writer.Stop() f.merger.Stop() f.indexes.Release() diff --git a/frac/active2/data_provider.go b/frac/active/data_provider.go similarity index 60% rename from frac/active2/data_provider.go rename to frac/active/data_provider.go index 13197e70..0f1730d2 100644 --- a/frac/active2/data_provider.go +++ b/frac/active/data_provider.go @@ -1,29 +1,32 @@ -package active2 +package active import ( "context" "fmt" "sort" - "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/node" "github.com/ozontech/seq-db/parser" "github.com/ozontech/seq-db/pattern" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" - "github.com/prometheus/client_golang/prometheus" ) +// fetchIndex is used during the fetch phase: +// reading data and document positions. type fetchIndex struct { index *memIndex docsReader *storage.DocsReader } +// GetBlocksOffsets returns the offset of a block by its index. func (si *fetchIndex) GetBlocksOffsets(blockIndex uint32) uint64 { return si.index.blocksOffsets[blockIndex] } +// GetDocPos returns document positions for the given IDs. +// If a document is not found, DocPosNotFound is returned. 
func (si *fetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos { docsPos := make([]seq.DocPos, len(ids)) for i, id := range ids { @@ -36,22 +39,29 @@ func (si *fetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos { return docsPos } +// ReadDocs reads documents from storage +// using the block offset and document offsets inside the block. func (si *fetchIndex) ReadDocs(blockOffset uint64, docOffsets []uint64) ([][]byte, error) { return si.docsReader.ReadDocs(blockOffset, docOffsets) } +// searchIndex is used during the search phase: +// matching tokens, documents, and query conditions. type searchIndex struct { ctx context.Context index *memIndex } +// GetValByTID returns the token value by its TID. func (si *searchIndex) GetValByTID(tid uint32) []byte { return si.index.tokens[tid] } +// GetTIDsByTokenExpr finds TIDs by a token expression from the query. func (si *searchIndex) GetTIDsByTokenExpr(t parser.Token) ([]uint32, error) { field := parser.GetField(t) tp := si.index.getTokenProvider(field) + tids, err := pattern.Search(si.ctx, t, tp) if err != nil { return nil, fmt.Errorf("search error: %w field: %s, query: %s", err, field, parser.GetHint(t)) @@ -59,32 +69,45 @@ func (si *searchIndex) GetTIDsByTokenExpr(t parser.Token) ([]uint32, error) { return tids, nil } +// GetLIDsFromTIDs converts a list of TIDs into a list of nodes (Node), +// each representing a set of local document identifiers (LIDs) +// that satisfy the token. func (si *searchIndex) GetLIDsFromTIDs(tids []uint32, _ lids.Counter, minLID, maxLID uint32, order seq.DocsOrder) []node.Node { nodes := make([]node.Node, 0, len(tids)) for _, tid := range tids { - nodes = append(nodes, si.geTidLidsNode(tid, minLID, maxLID, order)) + nodes = append(nodes, si.getTIDLIDsNode(tid, minLID, maxLID, order)) } return nodes } -func (si *searchIndex) geTidLidsNode(tid, minLID, maxLID uint32, order seq.DocsOrder) node.Node { - if tid == si.index.allTID { +// getTIDLIDsNode creates a node.Node for a single TID. 
+func (si *searchIndex) getTIDLIDsNode(tid, minLID, maxLID uint32, order seq.DocsOrder) node.Node { + tidLIDs := si.index.tokenLIDs[tid] + if len(tidLIDs) == 0 { // empty list means ALL documents return node.NewRange(minLID, maxLID, order.IsReverse()) } - tidLIDs := si.index.tokenLIDs[tid] + // Regular token — static list of LIDs return node.NewStatic(narrowDownLIDs(tidLIDs, minLID, maxLID), order.IsReverse()) } +// narrowDownLIDs restricts a sorted list of LIDs to the range [minLID, maxLID]. func narrowDownLIDs(tidLIDs []uint32, minLID, maxLID uint32) []uint32 { n := len(tidLIDs) - left := sort.Search(n, func(i int) bool { return tidLIDs[i] >= minLID }) - right := sort.Search(n, func(i int) bool { return tidLIDs[i] > maxLID }) + + left := sort.Search(n, func(i int) bool { + return tidLIDs[i] >= minLID + }) + right := sort.Search(n, func(i int) bool { + return tidLIDs[i] > maxLID + }) + if left > right { return nil } return tidLIDs[left:right] } +// LessOrEqual compares a document by LID with the given ID. func (si *searchIndex) LessOrEqual(lid seq.LID, id seq.ID) bool { checkedMID := si.GetMID(lid) if checkedMID == id.MID { @@ -93,46 +116,45 @@ func (si *searchIndex) LessOrEqual(lid seq.LID, id seq.ID) bool { return checkedMID < id.MID } +// GetMID returns the document MID by LID. func (si *searchIndex) GetMID(lid seq.LID) seq.MID { return si.index.ids[lid-1].MID } +// GetRID returns the document RID by LID. func (si *searchIndex) GetRID(lid seq.LID) seq.RID { return si.index.ids[lid-1].RID } +// Len returns the number of documents + 1 (LID starts from 1). func (si *searchIndex) Len() int { return len(si.index.ids) + 1 } -func getActiveSearchMetric(params processor.SearchParams) *prometheus.HistogramVec { - if params.HasAgg() { - return searchAggSec - } - if params.HasHist() { - return searchHstSec - } - return searchSimpleSec -} - +// tokenProvider is an adapter for pattern.Search. +// It provides access to tokens in the specified TID range. 
type tokenProvider struct { firstTID uint32 lastTID uint32 tokens [][]byte } +// GetToken returns a token by TID. func (p *tokenProvider) GetToken(tid uint32) []byte { return p.tokens[tid] } +// FirstTID returns the minimum TID. func (p *tokenProvider) FirstTID() uint32 { return p.firstTID } +// LastTID returns the maximum TID. func (p *tokenProvider) LastTID() uint32 { return p.lastTID } +// Ordered reports that tokens are sorted. func (p *tokenProvider) Ordered() bool { return true } diff --git a/frac/active/indexer.go b/frac/active/indexer.go index 284b0218..8e86e17f 100644 --- a/frac/active/indexer.go +++ b/frac/active/indexer.go @@ -1,195 +1,354 @@ package active import ( + "bytes" + "cmp" "encoding/binary" - "sync" + "slices" + "unsafe" - "go.uber.org/zap" - - "github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/indexer" - "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/metric" "github.com/ozontech/seq-db/metric/stopwatch" + "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/tokenizer" + "github.com/ozontech/seq-db/util" ) +const uint32Size = uint32(unsafe.Sizeof(uint32(0))) + +// Indexer indexes documents with concurrency limitation type Indexer struct { - ch chan *indexTask - chMerge chan *mergeTask - workerCount int + workerPool WorkerLimiter } -type indexTask struct { - Frac *Active - Metas storage.DocBlock - Pos uint64 - Wg *sync.WaitGroup +// NewIndexer creates a new indexer with specified number of workers +func NewIndexer(workerPool WorkerLimiter) *Indexer { + return &Indexer{ + workerPool: workerPool, + } } -type mergeTask struct { - frac *Active - tokenLIDs *TokenLIDs +// indexerBuffer is a temporary reusable buffer used during index construction to avoid allocations. +// It holds intermediate data structures that are needed during processing but not in the final index. 
+type indexerBuffer struct { + sizes []uint32 + fields []string + fieldTIDs []uint32 + tokens []tokenizer.MetaToken + tokenMap map[tokenStr]uint32 } -func NewIndexer(workerCount, chLen int) (*Indexer, func()) { - idx := Indexer{ - ch: make(chan *indexTask, chLen), - chMerge: make(chan *mergeTask, chLen), - workerCount: workerCount, - } - stopIdx := idx.start() - return &idx, stopIdx +// Index starts asynchronous document indexing +func (idx *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, err error)) { + idx.workerPool.Acquire() + go func() { + apply(NewMemIndex(block)) + idx.workerPool.Release() + }() } -func (ai *Indexer) Index(frac *Active, metas []byte, wg *sync.WaitGroup, sw *stopwatch.Stopwatch) { - m := sw.Start("send_index_chan") - ai.ch <- &indexTask{ - Pos: storage.DocBlock(metas).GetExt2(), - Metas: metas, - Frac: frac, - Wg: wg, - } - m.Stop() -} +// NewMemIndex creates an in-memory index from a document block +func NewMemIndex(block storage.DocBlock) (*memIndex, error) { + sw := stopwatch.New() -func (ai *Indexer) start() func() { - wg := sync.WaitGroup{} - wg.Add(ai.workerCount) + res, release := NewResources() + defer release() - for i := 0; i < ai.workerCount; i++ { - go func(index int) { - defer wg.Done() - ai.appendWorker(index) - }(i) + // Decompress metadata + payload, err := decompressMeta(res, block, sw) + if err != nil { + return nil, err } - wg.Add(ai.workerCount) - for i := 0; i < ai.workerCount; i++ { - go func() { - defer wg.Done() - ai.mergeWorker() - }() - } + buf := res.GetBuffer() - return func() { - close(ai.ch) - close(ai.chMerge) - wg.Wait() + // Decode metadata + meta, err := decodeMetadata(res, buf, payload, sw) + if err != nil { + return nil, err + } + // Initialize index + idx := newMemIndex() + idx.docsCount = uint32(len(meta)) + idx.ids = idx.res.GetIDs(len(meta)) + idx.positions = idx.res.GetDocPos(len(meta)) + idx.blocksOffsets = idx.res.GetUint64s(1) // Only one block per bulk + idx.blocksOffsets[0] = 
block.GetExt2() + + // Extract tokens from metadata + tids, lids, tokens, err := extractTokens(idx, res, buf, meta) + if err != nil { + return nil, err } + + // Group documents by token + tokenLIDs := groupLIDsByTID(idx, res, tids, lids, len(tokens)) + + // Organize tokens and fields + organizeTokens(idx, res, buf, tokens, tokenLIDs) + + return idx, nil } -func (ai *Indexer) mergeWorker() { - for task := range ai.chMerge { - task.tokenLIDs.GetLIDs(task.frac.MIDs, task.frac.RIDs) // GetLIDs cause sort and merge LIDs from queue - } +// tokenStr represents a unique token as a (field, value) pair. +// Used as a map key during token deduplication. +type tokenStr struct { + value string + field string } -var metaDataPool = sync.Pool{ - New: func() any { - return new(indexer.MetaData) - }, +func toToken(t tokenizer.MetaToken) tokenStr { + return tokenStr{ + value: util.ByteToStringUnsafe(t.Value), + field: util.ByteToStringUnsafe(t.Key), + } } -func (ai *Indexer) appendWorker(index int) { - // collector of bulk meta data - collector := newMetaDataCollector() +// extractTokens extracts tokens from document metadata +func extractTokens( + idx *memIndex, + res *Resources, + buf *indexerBuffer, + meta []indexer.MetaData, +) ([]uint32, []uint32, []tokenStr, error) { + var docOffset uint64 + var totalTokens uint32 + + // Calculate document positions in the original block + // Each document is stored as: [size: uint32][data: size bytes] + positions := res.GetDocPos(len(meta)) + prev := seq.PackDocPos(0, docOffset) + + for i := range meta { + docMeta := meta[i] + if docMeta.Size > 0 { + prev = seq.PackDocPos(0, docOffset) + docOffset += uint64(docMeta.Size) + uint64(uint32Size) + } + positions[i] = prev + totalTokens += docMeta.TokensCount() + } - for task := range ai.ch { - var err error + // Create ordering by document ID (descending) + // We need to map global document IDs to local IDs (LIDs) + order := res.GetUint32s(len(meta)) + for i := range order { + order[i] = uint32(i) 
+ } + slices.SortFunc(order, func(a, b uint32) int { + return seq.Compare(meta[b].ID, meta[a].ID) + }) + + // Fill index structures with sorted documents + for lid, origIdx := range order { + docMeta := meta[origIdx] + idx.ids[lid] = docMeta.ID + idx.positions[lid] = positions[origIdx] + idx.docsSize += uint64(docMeta.Size) + } - sw := stopwatch.New() - total := sw.Start("total_indexing") + // Extract and process tokens from all documents + var err error + var token tokenStr - metaBuf := bytespool.Acquire(int(task.Metas.RawLen())) + // Allocate slices for token-document relationships + lids := res.GetUint32s(int(totalTokens))[:0] // Local document ID for each token occurrence + tids := res.GetUint32s(int(totalTokens))[:0] // Token ID for each occurrence - if metaBuf.B, err = task.Metas.DecompressTo(metaBuf.B); err != nil { - logger.Panic("error decompressing meta", zap.Error(err)) // TODO: error handling + // Process documents in ID-sorted order + for lid, origIdx := range order { + docMeta := meta[origIdx] + + // Decode tokens for this document + if buf.tokens, err = docMeta.DecodeTokens(buf.tokens[:0]); err != nil { + return nil, nil, nil, err } - metasPayload := metaBuf.B - - active := task.Frac - blockIndex := active.DocBlocks.Append(task.Pos) - collector.Init(blockIndex) - - parsingMetric := sw.Start("metas_parsing") - meta := metaDataPool.Get().(*indexer.MetaData) - for len(metasPayload) > 0 { - n := binary.LittleEndian.Uint32(metasPayload) - metasPayload = metasPayload[4:] - documentMetadata := metasPayload[:n] - metasPayload = metasPayload[n:] - - if err := meta.UnmarshalBinary(documentMetadata); err != nil { - logger.Panic("BUG: can't unmarshal meta", zap.Error(err)) + + buf.tokenMap[tokenStr{field: seq.TokenAll}] = 0 // reserve ALL token (just for proper sealing) + + // Process each token in the document + for _, t := range buf.tokens { + if bytes.Equal(t.Key, seq.AllTokenName) { + continue } - collector.AppendMeta(*meta) - } - metaDataPool.Put(meta) - 
bytespool.Release(metaBuf) - parsingMetric.Stop() - - m := sw.Start("doc_params_set") - appendedIDs := active.DocsPositions.SetMultiple(collector.IDs, collector.Positions) - if len(appendedIDs) != len(collector.IDs) { - // There are duplicates in the active fraction. - // It is possible in case we retry same bulk requests. - // So we need to remove duplicates from collector. - doublesCnt := len(collector.IDs) - len(appendedIDs) - metric.BulkDuplicateDocsTotal.Observe(float64(doublesCnt)) - logger.Warn("found duplicates", zap.Int("batch", doublesCnt), zap.Int("worker", index)) - collector.Filter(appendedIDs) + token = toToken(t) + tid, exists := buf.tokenMap[token] + if !exists { + tid = uint32(len(buf.tokenMap)) // assign new token ID + buf.tokenMap[token] = tid + } + tids = append(tids, tid) + lids = append(lids, uint32(lid)+1) // store lid+1 (1-based indexing for internal use) } - m.Stop() - - m = sw.Start("append_ids") - lids := active.AppendIDs(collector.IDs) - m.Stop() + } - m = sw.Start("token_list_append") - tokenLIDsPlaces := collector.PrepareTokenLIDsPlaces() - active.TokenList.Append(collector.TokensValues, collector.FieldsLengths, tokenLIDsPlaces) - m.Stop() + // Create reverse mapping: tokenID -> tokenKey + tokens := res.GetTokens(len(buf.tokenMap)) + for key, tokenID := range buf.tokenMap { + tokens[tokenID] = key + } - m = sw.Start("group_lids") - groups := collector.GroupLIDsByToken(lids) - m.Stop() + return tids, lids, tokens, nil +} - m = sw.Start("put_lids_queue") - tokensToMerge := addLIDsToTokens(tokenLIDsPlaces, groups) - ai.sendTokensToMergeWorkers(active, tokensToMerge) - m.Stop() +// groupLIDsByTID groups document IDs by token +// Input: flat arrays of (tid, lid) pairs +// Output: 2D array where tokenLIDs[tid] = []lid +func groupLIDsByTID(idx *memIndex, res *Resources, tids, lids []uint32, tokenCount int) [][]uint32 { + // Phase 1: Count documents per token + counts := res.GetUint32s(tokenCount) + clear(counts) + for _, tid := range tids { + 
counts[tid]++ + } - active.UpdateStats(collector.MinMID, collector.MaxMID, collector.DocsCounter, collector.SizeCounter) + // Phase 2: Allocate slices for each token group + // We use a single large buffer and slice it for efficiency + tokenLIDs := res.GetUint32Slices(tokenCount) + allTokenLIDs := idx.res.GetUint32s(len(lids)) + idx.allTokenLIDsCount = len(lids) - task.Wg.Done() + tokenLIDs = tokenLIDs[:len(counts)] + for tid, count := range counts { + tokenLIDs[tid] = allTokenLIDs[:count][:0] + allTokenLIDs = allTokenLIDs[count:] + } - total.Stop() - sw.Export(bulkStagesSeconds) + // Phase 3: Populate groups with LIDs + lids = lids[:len(tids)] + for i, tid := range tids { + if len(tokenLIDs[tid]) > 0 { + if lids[i] == lastLID(tokenLIDs[tid]) { // deduplication + idx.allTokenLIDsCount-- + continue + } + } + tokenLIDs[tid] = append(tokenLIDs[tid], lids[i]) } + + return tokenLIDs +} + +func lastLID(s []uint32) uint32 { + return s[len(s)-1] } -func (ai *Indexer) sendTokensToMergeWorkers(frac *Active, tokens []*TokenLIDs) { - for _, tl := range tokens { - task := mergeTask{ - frac: frac, - tokenLIDs: tl, +// organizeTokens organizes tokens and fields in the index with proper sorting +func organizeTokens(idx *memIndex, res *Resources, buf *indexerBuffer, tokens []tokenStr, tokenLIDs [][]uint32) { + tokenSize := 0 + order := res.GetUint32s(len(tokens)) + order = order[:len(tokens)] + for i, t := range tokens { + order[i] = uint32(i) + tokenSize += len(t.value) + } + + // Create ordering for sorting tokens + // We'll sort by (field, value) to group tokens by field + slices.SortFunc(order, func(a, b uint32) int { + tokenA, tokenB := tokens[a], tokens[b] + return cmp.Or( + cmp.Compare(tokenA.field, tokenB.field), + cmp.Compare(tokenA.value, tokenB.value), + ) + }) + + fieldSize := 0 + prevField := "" + + // Prepare buffers for sorted data + tokenBuffer := idx.res.GetBytes(tokenSize)[:0] + idx.tokenLIDs = idx.res.GetUint32Slices(len(order)) + idx.tokens = 
idx.res.GetBytesSlices(len(order)) + + // Process tokens in sorted order + for tid, origIdx := range order { + token := tokens[origIdx] + + // Detect field boundaries + // When field name changes, record the field and its first token position + if token.field != prevField || prevField == "" { + fieldSize += len(token.field) + buf.fields = append(buf.fields, token.field) + buf.fieldTIDs = append(buf.fieldTIDs, uint32(tid)) } - select { - case ai.chMerge <- &task: - default: // skip background merge if workers are busy + prevField = token.field + + // Copy token value to buffer and keep reference + start := len(tokenBuffer) + tokenBuffer = append(tokenBuffer, token.value...) + + // Store in sorted arrays + // Note: We use original tokenID as index to preserve tokenID->data mapping + idx.tokens[tid] = tokenBuffer[start:] + idx.tokenLIDs[tid] = tokenLIDs[origIdx] + } + // Add sentinel value for easier range calculation + buf.fieldTIDs = append(buf.fieldTIDs, uint32(len(tokens))) + + // Organize fields + fieldBuffer := idx.res.GetBytes(fieldSize)[:0] + idx.fields = idx.res.GetBytesSlices(len(buf.fields)) + + idx.fieldsTokens = make(map[string]tokenRange, len(buf.fields)) + + for i, field := range buf.fields { + // Copy field name to buffer + start := len(fieldBuffer) + fieldBuffer = append(fieldBuffer, field...) 
+ idx.fields[i] = fieldBuffer[start:] + + // Calculate token range for this field + // Each field has continuous range of token IDs in sorted order + startTID := buf.fieldTIDs[i] + endTID := buf.fieldTIDs[i+1] + idx.fieldsTokens[util.ByteToStringUnsafe(fieldBuffer[start:])] = tokenRange{ + start: startTID, + count: endTID - startTID, } } } -func addLIDsToTokens(tlids []*TokenLIDs, lids [][]uint32) []*TokenLIDs { - const minMergeQueue = 10000 +// decompressMeta decompresses metadata from block +func decompressMeta(res *Resources, block storage.DocBlock, sw *stopwatch.Stopwatch) ([]byte, error) { + m := sw.Start("decompress_meta") + defer m.Stop() - needMerge := make([]*TokenLIDs, 0, len(tlids)) - for i, tl := range tlids { - if l := tl.PutLIDsInQueue(lids[i]); l > minMergeQueue { - needMerge = append(needMerge, tl) + // Allocate exact size needed for compressed data + buffer := res.GetBytes(int(block.RawLen())) + payload, err := block.DecompressTo(buffer) + if err != nil { + return nil, err + } + return payload, nil +} + +// decodeMetadata decodes document metadata from binary format +// Format: [size: uint32][data: size bytes][size: uint32][data: size bytes]... 
+func decodeMetadata(res *Resources, buf *indexerBuffer, payload []byte, sw *stopwatch.Stopwatch) ([]indexer.MetaData, error) { + m := sw.Start("decode_meta") + defer m.Stop() + + // First pass: scan to determine sizes of each metadata entry + var offset uint32 + for offset < uint32(len(payload)) { + size := binary.LittleEndian.Uint32(payload[offset:]) + offset += uint32Size + size + buf.sizes = append(buf.sizes, size) + } + + // Second pass: decode each metadata entry + meta := res.GetMetadata(len(buf.sizes)) + for i, size := range buf.sizes { + // Skip size field to get to actual data + data := payload[uint32Size : size+uint32(uint32Size)] + if err := meta[i].UnmarshalBinaryLazy(data); err != nil { + return nil, err } + // Move to next entry + payload = payload[size+uint32(uint32Size):] } - return needMerge + + return meta, nil } diff --git a/frac/active/indexer_test.go b/frac/active/indexer_test.go index d86df4ba..3938a8b8 100644 --- a/frac/active/indexer_test.go +++ b/frac/active/indexer_test.go @@ -9,80 +9,67 @@ import ( "time" "github.com/alecthomas/units" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "go.uber.org/zap/zapcore" - "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/metric/stopwatch" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/tests/common" "github.com/ozontech/seq-db/tokenizer" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zapcore" ) -func readFileAllAtOnce(filename string) ([][]byte, error) { - content, err := os.ReadFile(filename) - if err != nil { - return nil, err - } - lines := bytes.Split(content, []byte{'\n'}) - if len(lines) > 0 && len(lines[len(lines)-1]) == 0 { - lines = lines[:len(lines)-1] - } 
- return lines, nil -} +func BenchmarkIndexer(b *testing.B) { + logger.SetLevel(zapcore.FatalLevel) -func splitLogsToBulks(data [][]byte, bulkSize int) []func() ([]byte, error) { - funcs := []func() ([]byte, error){} - for len(data) > 0 { - size := min(len(data), bulkSize) - funcs = append(funcs, testBufReader(data[0:size])) - data = data[size:] - } - return funcs -} + allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) + readers := splitLogsToBulks(allLogs, 1000) + assert.NoError(b, err) -func testBufReader(data [][]byte) func() ([]byte, error) { - orig := data - return func() ([]byte, error) { - if len(data) == 0 { - data = orig - return nil, nil + processor := getTestProcessor() + + n := 2 + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { + for _, readNext := range readers { + _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) } - line := data[0] - data = data[1:] - return line, nil } -} -func getTestProcessor() *indexer.Processor { - mapping := seq.Mapping{ - "clientip": seq.NewSingleType(seq.TokenizerTypeKeyword, "clientip", 1024), - "request": seq.NewSingleType(seq.TokenizerTypeText, "request", 1024), - "status": seq.NewSingleType(seq.TokenizerTypeKeyword, "status", 1024), - "size": seq.NewSingleType(seq.TokenizerTypeKeyword, "size", 1024), - } + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() + active := New( + filepath.Join(b.TempDir(), "test"), + &frac.Config{}, + config.NumCPU, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + ) + b.StartTimer() - tokenizers := map[seq.TokenizerType]tokenizer.Tokenizer{ - seq.TokenizerTypeText: tokenizer.NewTextTokenizer(1024, false, true, 8192), - seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(1024, false, true), - seq.TokenizerTypePath: tokenizer.NewPathTokenizer(1024, false, true), - seq.TokenizerTypeExists: 
tokenizer.NewExistsTokenizer(), + wg := sync.WaitGroup{} + for _, meta := range allMeta { + wg.Add(1) + active.indexer.Index(meta, func(idx *memIndex, err error) { + active.indexes.Add(idx, 0, 0) + wg.Done() + }) + } + wg.Wait() } - - return indexer.NewProcessor(mapping, tokenizers, 0, 0, 0) } -func BenchmarkIndexer(b *testing.B) { +func BenchmarkMerge(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx, stop := NewIndexer(config.NumCPU, config.NumCPU) - defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) readers := splitLogsToBulks(allLogs, 1000) @@ -103,22 +90,28 @@ func BenchmarkIndexer(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { b.StopTimer() + active := New( filepath.Join(b.TempDir(), "test"), - idx, + &frac.Config{}, + config.NumCPU, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &frac.Config{}, ) - b.StartTimer() wg := sync.WaitGroup{} for _, meta := range allMeta { wg.Add(1) - idx.Index(active, meta, &wg, stopwatch.New()) + active.indexer.Index(meta, func(idx *memIndex, err error) { + active.indexes.Add(idx, 0, 0) + wg.Done() + }) } wg.Wait() + b.StartTimer() + + active.merger.ForceMergeAll() } } @@ -137,8 +130,6 @@ func defaultSealingParams() frac.SealParams { func BenchmarkFullWrite(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx, stop := NewIndexer(config.NumCPU, config.NumCPU) - defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) readers := splitLogsToBulks(allLogs, 1000) @@ -163,11 +154,11 @@ func BenchmarkFullWrite(b *testing.B) { for b.Loop() { active := New( filepath.Join(b.TempDir(), "test"), - idx, + &frac.Config{SkipSortDocs: true}, + config.NumCPU, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &frac.Config{SkipSortDocs: true}, ) wg := sync.WaitGroup{} @@ -188,3 +179,56 @@ func BenchmarkFullWrite(b *testing.B) { 
active.Release() } } + +func readFileAllAtOnce(filename string) ([][]byte, error) { + content, err := os.ReadFile(filename) + if err != nil { + return nil, err + } + lines := bytes.Split(content, []byte{'\n'}) + if len(lines) > 0 && len(lines[len(lines)-1]) == 0 { + lines = lines[:len(lines)-1] + } + return lines, nil +} + +func splitLogsToBulks(data [][]byte, bulkSize int) []func() ([]byte, error) { + funcs := []func() ([]byte, error){} + for len(data) > 0 { + size := min(len(data), bulkSize) + funcs = append(funcs, testBufReader(data[0:size])) + data = data[size:] + } + return funcs +} + +func testBufReader(data [][]byte) func() ([]byte, error) { + orig := data + return func() ([]byte, error) { + if len(data) == 0 { + data = orig + return nil, nil + } + line := data[0] + data = data[1:] + return line, nil + } +} + +func getTestProcessor() *indexer.Processor { + mapping := seq.Mapping{ + "clientip": seq.NewSingleType(seq.TokenizerTypeKeyword, "clientip", 1024), + "request": seq.NewSingleType(seq.TokenizerTypeText, "request", 1024), + "status": seq.NewSingleType(seq.TokenizerTypeKeyword, "status", 1024), + "size": seq.NewSingleType(seq.TokenizerTypeKeyword, "size", 1024), + } + + tokenizers := map[seq.TokenizerType]tokenizer.Tokenizer{ + seq.TokenizerTypeText: tokenizer.NewTextTokenizer(1024, false, true, 8192), + seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(1024, false, true), + seq.TokenizerTypePath: tokenizer.NewPathTokenizer(1024, false, true), + seq.TokenizerTypeExists: tokenizer.NewExistsTokenizer(), + } + + return indexer.NewProcessor(mapping, tokenizers, 0, 0, 0) +} diff --git a/frac/active2/iterators.go b/frac/active/iterators.go similarity index 99% rename from frac/active2/iterators.go rename to frac/active/iterators.go index 22291eba..db98a755 100644 --- a/frac/active2/iterators.go +++ b/frac/active/iterators.go @@ -1,4 +1,4 @@ -package active2 +package active import "github.com/ozontech/seq-db/seq" diff --git a/frac/active/mem_index.go 
b/frac/active/mem_index.go new file mode 100644 index 00000000..74eadd4e --- /dev/null +++ b/frac/active/mem_index.go @@ -0,0 +1,99 @@ +package active + +import ( + "sort" + "sync" + + "github.com/ozontech/seq-db/seq" +) + +// tokenRange describes a range of tokens belonging to a specific field. +type tokenRange struct { + start uint32 // first TID of the field + count uint32 // number of tokens in the field +} + +// memIndex is an in-memory index of an active segment. +// It is used for searching, mapping tokens to documents, and retrieving document positions. +type memIndex struct { + // Important: + // - index in ids array + 1 = LID (local document id) + // - index in positions array + 1 = LID also + // - index in tokens array = TID (token id) + // - index in fieldsTokens array = TID + + ids []seq.ID // list of document IDs sorted in descending order (DESC) + tokens [][]byte // list of all tokens sorted in ascending order (ASC) by key field:token + tokenLIDs [][]uint32 // for each TID stores a sorted list of LIDs of documents containing this token + fieldsTokens map[string]tokenRange // mapping field → token range (TID) belonging to this field + fields [][]byte // list of all fields sorted in ascending order (ASC) + blocksOffsets []uint64 // offsets of document blocks in storage, sorted in ascending order + positions []seq.DocPos // position of each document inside a block; index corresponds to LID-1 + + docsSize uint64 // total size of documents in bytes + docsCount uint32 // number of documents in the index + allTokenLIDsCount int // total number of tokenLIDs (for fast calc allocation size in merging) + + wg sync.WaitGroup // used to wait for background operations to finish before releasing resources + res *Resources // shared resource pool (memory, buffers, etc.) + release func() // function to release resources +} + +// newMemIndex creates a new in-memory index and initializes resources. 
+func newMemIndex() *memIndex { + res, release := NewResources() + return &memIndex{ + res: res, + release: release, + } +} + +// getTokenProvider returns a tokenProvider for the specified field. +// It restricts the TID range to tokens belonging only to this field. +func (idx *memIndex) getTokenProvider(field string) *tokenProvider { + if r, ok := idx.fieldsTokens[field]; ok { + return &tokenProvider{ + firstTID: r.start, + lastTID: r.start + r.count - 1, + tokens: idx.tokens, + } + } + + // Field is not indexed — return an empty provider with firstTID > lastTID. + return &tokenProvider{ + firstTID: 1, + lastTID: 0, + tokens: idx.tokens, + } +} + +// IsIntersecting checks whether the MID range [from, to] intersects +// with the range of documents stored in the index. +func (idx *memIndex) IsIntersecting(from, to seq.MID) bool { + maxMID := idx.ids[0].MID + minMID := idx.ids[len(idx.ids)-1].MID + + if to < minMID || maxMID < from { + return false + } + return true +} + +// GetLIDByID searches for the local document ID (LID) by global ID (MID + RID). +// Returns the LID (starting from 1) and a flag indicating whether it was found. +func (idx *memIndex) GetLIDByID(id seq.ID) (uint32, bool) { + i, ok := sort.Find(len(idx.ids), func(i int) int { + return seq.Compare(idx.ids[i], id) + }) + return uint32(i + 1), ok +} + +// Release frees index resources. +// The call is non-blocking: actual release happens in a separate goroutine +// after all ongoing operations are completed. 
+func (idx *memIndex) Release() { + go func() { + idx.wg.Wait() + idx.release() + }() +} diff --git a/frac/active2/mem_index_pool.go b/frac/active/mem_index_pool.go similarity index 99% rename from frac/active2/mem_index_pool.go rename to frac/active/mem_index_pool.go index 15b9ac30..1da469f1 100644 --- a/frac/active2/mem_index_pool.go +++ b/frac/active/mem_index_pool.go @@ -1,4 +1,4 @@ -package active2 +package active import ( "slices" diff --git a/frac/active/merge.go b/frac/active/merge.go new file mode 100644 index 00000000..f16fbac8 --- /dev/null +++ b/frac/active/merge.go @@ -0,0 +1,333 @@ +package active + +import ( + "bytes" + "slices" + + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/util" +) + +// mergeIndexes merges several in-memory indexes (memIndex) +// into a single resulting index. +func mergeIndexes(indexes []*memIndex) *memIndex { + // Count the total number of blocks, documents, and tokens to preallocate memory. + blocksCount := 0 + dst := newMemIndex() + + for _, idx := range indexes { + dst.docsSize += idx.docsSize + dst.docsCount += idx.docsCount + dst.allTokenLIDsCount += idx.allTokenLIDsCount + blocksCount += len(idx.blocksOffsets) + } + + // Shared temporary resources for merging + res, release := NewResources() + defer release() + + // Preallocate memory for final structures + dst.ids = dst.res.GetIDs(int(dst.docsCount))[:0] + dst.positions = dst.res.GetDocPos(int(dst.docsCount))[:0] + dst.blocksOffsets = dst.res.GetUint64s(blocksCount)[:0] + + // 1. Merge block offsets and recalculate document positions + posMap := mergeBlocksOffsets(dst, res, indexes) + + // 2. Merge documents (IDs), get old LID → new LID mapping + lidsMap := mergeIDs(dst, res, indexes, posMap) + + // 3. Merge tokens using the new document LIDs + mergeTokens(dst, res, indexes, lidsMap) + + return dst +} + +// mergeIDs merges documents from all indexes into a single ordered stream. +// Returns a mapping of oldLID → newLID for each input index. 
+func mergeIDs( + dst *memIndex, + res *Resources, + indexes []*memIndex, + posMap [][]seq.DocPos, +) [][]uint32 { + + // Store old LID → new LID mapping for each index + lidsMap := res.GetUint32Slices(len(indexes)) + + // Iterators over documents of each index + docStreams := make([]OrderedStream[DocRef], len(indexes)) + + for i, idx := range indexes { + docStreams[i] = &DocStream{ + i: i, // index number + idx: idx, // the index itself + posMap: posMap[i], // recalculated document positions + } + + // LIDs start from 1, so add a "dummy" element immediately + lidsMap[i] = res.GetUint32s(int(idx.docsCount) + 1)[:1] + } + + // Merge all document streams into one, + // sorting by ID (in reverse order) + mergedDocStream := MergeSortedStreams( + docStreams, + func(a, b DocRef) int { + return seq.Compare(b.id, a.id) + }, + ) + + // Iterate over the merged stream + docRef, has := mergedDocStream.Next() + for has { + // Add document to the resulting index + dst.ids = append(dst.ids, docRef.id) + dst.positions = append(dst.positions, docRef.pos) + + // New LID is the position in dst.ids (1-based) + lid := uint32(len(dst.ids)) + + // Record oldLID → newLID mapping + lidsMap[docRef.i] = append(lidsMap[docRef.i], lid) + + docRef, has = mergedDocStream.Next() + } + + return lidsMap +} + +// mergeTokens merges tokens from all indexes, +// reusing the new document LIDs. 
+func mergeTokens( + dst *memIndex, + res *Resources, + indexes []*memIndex, + lidsMap [][]uint32, +) { + totalTokens := 0 + tokenStreams := make([]OrderedStream[TokenRef], len(indexes)) + + // create iterators over tokens + for i, idx := range indexes { + totalTokens += len(idx.tokens) + tokenStreams[i] = NewTokenStream(idx, lidsMap[i]) + } + + cmpToken := func(a, b TokenRef) int { // token comparison: first by field, then by value + r := bytes.Compare(a.Field(), b.Field()) + if r == 0 { + return bytes.Compare(a.Value(), b.Value()) + } + return r + } + + // merged and sorted token stream + mergedTokenStream := MergeSortedStreams(tokenStreams, cmpToken) + + // statistics for unique values + uniqTokensSize := 0 + uniqTokensCount := 0 + uniqFieldsSize := 0 + uniqFieldsCount := 0 + + var ( + prevField []byte + prevToken TokenRef + ) + + // borders[i] indicates: + const ( + borderSame = 0b00 // tokensRef[i] is the same token as in tokensRef[i-1] (but other index) + borderToken = 0b01 // tokensRef[i] is new token + borderField = 0b10 // tokensRef[i] is new token and new field + ) + + borders := res.GetBytes(totalTokens)[:0] + tokensRef := make([]TokenRef, 0, totalTokens) + + // First pass: count unique tokens and fields + for tokenRef, has := mergedTokenStream.Next(); has; tokenRef, has = mergedTokenStream.Next() { + var border uint8 = borderSame + + // New token + if prevToken.payload == nil || cmpToken(prevToken, tokenRef) != 0 { + uniqTokensCount++ + uniqTokensSize += len(tokenRef.Value()) + border |= borderToken + + // New field + field := tokenRef.Field() + if !bytes.Equal(prevField, field) { + uniqFieldsCount++ + uniqFieldsSize += len(field) + border |= borderField + prevField = field + } + } + + borders = append(borders, border) + tokensRef = append(tokensRef, tokenRef) + prevToken = tokenRef + } + + // Initialize resulting index structures + dst.fieldsTokens = make(map[string]tokenRange, uniqFieldsCount) + dst.fields = dst.res.GetBytesSlices(uniqFieldsCount)[:0] 
+ dst.tokens = dst.res.GetBytesSlices(uniqTokensCount)[:0] + dst.tokenLIDs = dst.res.GetUint32Slices(uniqTokensCount)[:0] + + allTokens := dst.res.GetBytes(uniqTokensSize)[:0] + allFields := dst.res.GetBytes(uniqFieldsSize)[:0] + + // Collector for document LIDs for each token + lidsCollector := NewLIDsCollector( + res.GetUint32s(int(dst.docsCount)), // temporary buffer + dst.res.GetUint32s(dst.allTokenLIDsCount)[:0], // all token LIDs + dst.res.GetUint32s(int(dst.docsCount)), // LIDs for _all_ + res.GetBytes((int(dst.docsCount) + 1)), // buffer for sorting + ) + + // Second pass: fill structures + for i, tokenRef := range tokensRef { + if borders[i]&borderToken == borderToken { // new token value + + if i > 0 { // finish previous token + dst.tokenLIDs = append(dst.tokenLIDs, lidsCollector.GetSorted()) + } + + if borders[i]&borderField == borderField { // new field + tid := uint32(len(dst.tokens)) + + if i > 0 { // finish previous field + fieldStr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) + tr := dst.fieldsTokens[fieldStr] + tr.count = tid - tr.start + dst.fieldsTokens[fieldStr] = tr + } + + start := len(allFields) + allFields = append(allFields, tokenRef.Field()...) + field := allFields[start:] + dst.fields = append(dst.fields, field) + + fieldStr := util.ByteToStringUnsafe(field) + dst.fieldsTokens[fieldStr] = tokenRange{start: tid} + } + start := len(allTokens) + allTokens = append(allTokens, tokenRef.Value()...) 
+ dst.tokens = append(dst.tokens, allTokens[start:]) + } + + // Add document LIDs for the token + newLIDsMap := tokenRef.lidsMap() + for _, oldLID := range tokenRef.LIDs() { + lidsCollector.Add(newLIDsMap[oldLID]) + } + } + + // Final token + dst.tokenLIDs = append(dst.tokenLIDs, lidsCollector.GetSorted()) + + // Close the last field + tid := uint32(len(dst.tokens)) - 1 + fieldStr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) + tr := dst.fieldsTokens[fieldStr] + tr.count = tid - tr.start + 1 + dst.fieldsTokens[fieldStr] = tr +} + +// LIDsCollector collects and efficiently sorts document LIDs for a token. +type LIDsCollector struct { + tmp []uint32 // temporary accumulation + lids []uint32 // overall array + all []uint32 // full set of LIDs (1..N) + buf []uint8 // bitmap +} + +// Initialize collector +func NewLIDsCollector(tmp, lids, all []uint32, buf []uint8) *LIDsCollector { + clear(buf) + for i := range all { + all[i] = uint32(i) + 1 + } + return &LIDsCollector{ + tmp: tmp[:0], + lids: lids[:0], + all: all, + buf: buf, + } +} + +// Add a single LID +func (s *LIDsCollector) Add(lid uint32) { + s.tmp = append(s.tmp, lid) +} + +// Returns sorted LID list, +// choosing the optimal algorithm depending on density. +func (s *LIDsCollector) GetSorted() (dst []uint32) { + n := len(s.tmp) + + // If all documents are covered — return all + if n == len(s.all) { + s.tmp = s.tmp[:0] + return s.all + } + + // If density is high — use bitmap + if 100*n/len(s.all) > 50 { + for _, v := range s.tmp { + s.buf[v] = 1 + } + start := len(s.lids) + for lid, ok := range s.buf { + if ok == 1 { + s.buf[lid] = 0 + s.lids = append(s.lids, uint32(lid)) + } + } + s.tmp = s.tmp[:0] + return s.lids[start:] + } + + // Otherwise, normal sorting + if n > 1 { + slices.Sort(s.tmp) + } + start := len(s.lids) + s.lids = append(s.lids, s.tmp...) 
+ s.tmp = s.tmp[:0] + return s.lids[start:] +} + +// mergeBlocksOffsets merges block offsets +// and recalculates document positions considering the offset. +func mergeBlocksOffsets( + dst *memIndex, + res *Resources, + indexes []*memIndex, +) [][]seq.DocPos { + + var offset uint32 + positions := res.GetDocPosSlices(len(indexes)) + + for i, index := range indexes { + // Copy block offsets + dst.blocksOffsets = append(dst.blocksOffsets, index.blocksOffsets...) + + // Recalculate document positions + positions[i] = res.GetDocPos(len(index.positions))[:0] + for _, p := range index.positions { + oldIdx, docOffset := p.Unpack() + positions[i] = append( + positions[i], + seq.PackDocPos(oldIdx+offset, docOffset), + ) + } + + offset += uint32(len(index.blocksOffsets)) + } + + return positions +} diff --git a/frac/active2/merge_manager.go b/frac/active/merge_manager.go similarity index 99% rename from frac/active2/merge_manager.go rename to frac/active/merge_manager.go index 0addef5c..6b3f0ade 100644 --- a/frac/active2/merge_manager.go +++ b/frac/active/merge_manager.go @@ -1,4 +1,4 @@ -package active2 +package active import ( "sync" diff --git a/frac/active2/metrics.go b/frac/active/metrics.go similarity index 78% rename from frac/active2/metrics.go rename to frac/active/metrics.go index dac3fd68..dc526640 100644 --- a/frac/active2/metrics.go +++ b/frac/active/metrics.go @@ -1,6 +1,7 @@ -package active2 +package active import ( + "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/metric" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" @@ -40,3 +41,14 @@ var ( Buckets: metric.SecondsBuckets, }, []string{"stage"}) ) + +// getActiveSearchMetric selects a Prometheus metric depending on the type of search query. 
+func getActiveSearchMetric(params processor.SearchParams) *prometheus.HistogramVec { + if params.HasAgg() { + return searchAggSec + } + if params.HasHist() { + return searchHstSec + } + return searchSimpleSec +} diff --git a/frac/active2/resources.go b/frac/active/resources.go similarity index 99% rename from frac/active2/resources.go rename to frac/active/resources.go index b87e73b9..0d7235f5 100644 --- a/frac/active2/resources.go +++ b/frac/active/resources.go @@ -1,4 +1,4 @@ -package active2 +package active import ( "github.com/ozontech/seq-db/indexer" diff --git a/frac/active/sealing_source.go b/frac/active/sealing_source.go index 9ed23253..ce6c5e76 100644 --- a/frac/active/sealing_source.go +++ b/frac/active/sealing_source.go @@ -1,173 +1,117 @@ package active import ( - "bytes" - "errors" "iter" - "slices" + "math" "time" - "unsafe" - "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/util" ) -// SealingSource transforms data from in-memory (frac.Active) storage -// into a format suitable for disk writing during index creation. -// -// The main purpose of this type is to provide access to sorted data -// through a set of iterators that allow sequential processing of -// data in sized blocks for disk writing: -// -// - TokenBlocks() - iterator for token blocks, sorted by fields and values -// - Fields() - iterator for sorted fields with maximum TIDs -// - IDsBlocks() - iterator for document ID blocks and their positions -// - TokenLIDs() - iterator for LID lists for each token -// - Docs() - iterator for documents themselves with duplicate handling -// -// All iterators work with pre-sorted data and return information -// in an order optimal for creating disk index structures. 
+var ( + _ sealing.Source = (*SealingSource)(nil) + + // Special system ID added as the first entry + systemSeqID = seq.ID{ + MID: math.MaxUint64, + RID: math.MaxUint64, + } +) + +// SealingSource provides data from a single memIndex in the format required by the sealing stage. type SealingSource struct { - info *frac.Info // fraction Info - created time.Time // Creation time of the source - sortedLIDs []uint32 // Sorted LIDs (Local ID) - oldToNewLIDs []uint32 // Mapping from old LIDs to new ones (after sorting) - mids *UInt64s // MIDs - rids *UInt64s // RIDs - fields []string // Sorted field names - fieldsMaxTIDs []uint32 // Maximum TIDs for each field - tids []uint32 // Sorted TIDs (Token ID) - tokens [][]byte // Tokens (values) by TID - lids []*TokenLIDs // LID lists for each token - docPosOrig map[seq.ID]seq.DocPos // Original document positions - docPosSorted []seq.DocPos // Document positions after sorting - blocksOffsets []uint64 // Document block offsets - docsReader *storage.DocsReader // Document storage reader - lastErr error // Last error + info *frac.Info + index *memIndex + lastErr error } -// NewSealingSource creates a new data source for sealing -// based on an active in-memory index. -func NewSealingSource(active *Active, params frac.SealParams) (*SealingSource, error) { - info := *active.info // copy - sortedLIDs := active.GetAllDocuments() +// NewSealingSource prepares a sealing source from Active2 state. 
+func NewSealingSource(a *Active, params frac.SealParams) (sealing.Source, error) { + a.merger.ForceMergeAll() - // Sort fields and get maximum TIDs for each field - sortedFields, fieldsMaxTIDs := sortFields(active.TokenList) + iss, release := a.indexes.Snapshot() + defer release() - // Sort tokens within each field - sortedTIDs := sortTokens(sortedFields, active.TokenList) + if len(iss.indexes) != 1 { + logger.Panic("invalid state: sealing requires a single memIndex") + } - src := SealingSource{ - info: &info, - created: time.Now(), - sortedLIDs: sortedLIDs, - oldToNewLIDs: makeInverser(sortedLIDs), // Create LID mapping - mids: active.MIDs, - rids: active.RIDs, - fields: sortedFields, - tids: sortedTIDs, - fieldsMaxTIDs: fieldsMaxTIDs, - tokens: active.TokenList.tidToVal, - lids: active.TokenList.tidToLIDs, - docPosOrig: active.DocsPositions.idToPos, - blocksOffsets: active.DocBlocks.vals, - docsReader: &active.sortReader, + ss := &SealingSource{ + info: iss.info, + index: iss.indexes[0], } - src.prepareInfo() + // Sort documents unless explicitly disabled + if !a.Config.SkipSortDocs { + ds := active_old.NewDocsSource(ss, ss.index.blocksOffsets, &a.sortReader) - // Sort documents if not skipped in configuration - if !active.Config.SkipSortDocs { - ds := NewDocsSource(&src, src.blocksOffsets, &active.sortReader) - blocksOffsets, positions, onDiskSize, err := sealing.SortDocs(info.Path, params, ds) + blocksOffsets, positions, onDiskSize, err := sealing.SortDocs( + ss.info.Path, + params, + ds, + ) if err != nil { return nil, err } - src.docPosSorted = positions[1:] - src.blocksOffsets = blocksOffsets - src.info.DocsOnDisk = uint64(onDiskSize) - } - - return &src, nil -} -// sortFields sorts field names and calculates maximum TIDs for each field. -// Returns sorted field list and array of maximum TIDs. 
-func sortFields(tl *tokenList) ([]string, []uint32) { - fields := make([]string, 0, len(tl.FieldTIDs)) - for field := range tl.FieldTIDs { - fields = append(fields, field) + // Skip system document position + ss.index.positions = positions[1:] + ss.index.blocksOffsets = blocksOffsets + ss.info.DocsOnDisk = uint64(onDiskSize) } - slices.Sort(fields) - pos := 0 - maxTIDs := make([]uint32, 0, len(fields)) - for _, field := range fields { - pos += len(tl.FieldTIDs[field]) - maxTIDs = append(maxTIDs, uint32(pos)) - } + ss.info.MetaOnDisk = 0 + ss.info.SealingTime = uint64(time.Now().UnixMilli()) - return fields, maxTIDs -} + ss.info.BuildDistributionWithIDs(ss.index.ids) -// sortTokens sorts tokens lexicographically within each field. -// Returns sorted list of TIDs. -func sortTokens(sortedFields []string, tl *tokenList) []uint32 { - pos := 0 - tids := make([]uint32, 0, len(tl.tidToVal)) - for _, field := range sortedFields { - tids = append(tids, tl.FieldTIDs[field]...) - chunk := tids[pos:] - slices.SortFunc(chunk, func(i, j uint32) int { - a := tl.tidToVal[i] - b := tl.tidToVal[j] - return bytes.Compare(a, b) // Sort by token value - }) - pos = len(tids) - } - return tids + return ss, nil } -// LastError returns the last error that occurred during processing. -func (src *SealingSource) LastError() error { - return src.lastErr +// Info returns fraction metadata. +func (src *SealingSource) Info() *frac.Info { + return src.info } -// prepareInfo prepares metadata for disk writing. -func (src *SealingSource) prepareInfo() { - src.info.MetaOnDisk = 0 - src.info.SealingTime = uint64(src.created.UnixMilli()) - src.info.BuildDistribution(src.mids.vals) -} +// IDsBlocks yields document IDs and their positions in fixed-size blocks. 
+func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { + return func(yield func([]seq.ID, []seq.DocPos) bool) { + ids := make([]seq.ID, 0, blockSize) + pos := make([]seq.DocPos, 0, blockSize) -// Info returns index metadata information. -func (src *SealingSource) Info() *frac.Info { - return src.info + // Add system entry + ids = append(ids, systemSeqID) + pos = append(pos, 0) + + for i, id := range src.index.ids { + if len(ids) == blockSize { + if !yield(ids, pos) { + return + } + ids = ids[:0] + pos = pos[:0] + } + + ids = append(ids, id) + pos = append(pos, src.index.positions[i]) + } + + yield(ids, pos) + } } -// TokenBlocks returns an iterator for token blocks for disk writing. -// Tokens are pre-sorted: first by fields, then lexicographically within each field. -// Each block contains up to blockSize bytes of data for efficient writing. +// TokenBlocks yields tokens grouped by total byte size. func (src *SealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { - const tokenLengthSize = int(unsafe.Sizeof(uint32(0))) return func(yield func([][]byte) bool) { - if len(src.tids) == 0 { - return - } - if blockSize <= 0 { - src.lastErr = errors.New("sealing: token block size must be > 0") - return - } - actualSize := 0 block := make([][]byte, 0, blockSize) - // Iterate through all sorted TIDs - for _, tid := range src.tids { + for _, token := range src.index.tokens { if actualSize >= blockSize { if !yield(block) { return @@ -175,149 +119,54 @@ func (src *SealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { actualSize = 0 block = block[:0] } - token := src.tokens[tid] - actualSize += tokenLengthSize // Add the size of the token length field - actualSize += len(token) // Add the size of the token itself + + actualSize += len(token) + int(uint32Size) block = append(block, token) } + yield(block) } } -// Fields returns an iterator for sorted fields and their maximum TIDs. 
-// Fields are sorted lexicographically, ensuring predictable order -// when building disk index structures. +// Fields yields field names with their token upper bounds. func (src *SealingSource) Fields() iter.Seq2[string, uint32] { return func(yield func(string, uint32) bool) { - for i, field := range src.fields { - if !yield(field, src.fieldsMaxTIDs[i]) { + for _, field := range src.index.fields { + f := util.ByteToStringUnsafe(field) + r := src.index.fieldsTokens[f] + + if !yield(f, r.start+r.count) { return } } } } -// IDsBlocks returns an iterator for document ID blocks and corresponding positions. -// IDs are sorted. Block size is controlled by blockSize parameter for balance between -// performance and memory usage. -func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { - return func(yield func([]seq.ID, []seq.DocPos) bool) { - mids := src.mids.vals - rids := src.rids.vals - - ids := make([]seq.ID, 0, blockSize) - pos := make([]seq.DocPos, 0, blockSize) - - // First reserved ID (system). This position is not used because Local IDs (LIDs) use 1-based indexing. - ids = append(ids, seq.ID{MID: seq.MID(mids[0]), RID: seq.RID(rids[0])}) - pos = append(pos, 0) - - // Iterate through sorted LIDs - for i, lid := range src.sortedLIDs { - if len(ids) == blockSize { - if !yield(ids, pos) { - return - } - ids = ids[:0] - pos = pos[:0] - } - id := seq.ID{MID: seq.MID(mids[lid]), RID: seq.RID(rids[lid])} - ids = append(ids, id) - - // Use sorted or original positions - if len(src.docPosSorted) == 0 { - pos = append(pos, src.docPosOrig[id]) - } else { - pos = append(pos, src.docPosSorted[i]) // +1 for system document - } - } - yield(ids, pos) +// TokenLIDs yields document LIDs for each token. +func (src *SealingSource) TokenLIDs() iter.Seq[[]uint32] { + all := make([]uint32, src.index.docsCount) + for i := range all { + all[i] = uint32(i) + 1 } -} -// BlocksOffsets returns document block offsets. 
-func (src *SealingSource) BlocksOffsets() []uint64 { - return src.blocksOffsets -} - -// TokenLIDs returns an iterator for LID lists for each token. -// LIDs are converted to new numbering after document sorting. -// Each iterator call returns a list of documents containing a specific token, -// in sorted order. -func (src *SealingSource) TokenLIDs() iter.Seq[[]uint32] { return func(yield func([]uint32) bool) { - newLIDs := []uint32{} - - // For each sorted TID - for _, tid := range src.tids { - // Get original LIDs for this token - oldLIDs := src.lids[tid].GetLIDs(src.mids, src.rids) - newLIDs = slices.Grow(newLIDs[:0], len(oldLIDs)) - - // Convert old LIDs to new through mapping - for _, lid := range oldLIDs { - newLIDs = append(newLIDs, src.oldToNewLIDs[lid]) + for _, tokenLIDs := range src.index.tokenLIDs { + if len(tokenLIDs) == 0 { + tokenLIDs = all } - - if !yield(newLIDs) { + if !yield(tokenLIDs) { return } } } } -// makeInverser creates an array for converting old LIDs to new ones. -// sortedLIDs[i] = oldLID -> inverser[oldLID] = i+1 -func makeInverser(sortedLIDs []uint32) []uint32 { - inverser := make([]uint32, len(sortedLIDs)+1) - for i, lid := range sortedLIDs { - inverser[lid] = uint32(i + 1) // +1 because 0 position is reserved and unused - } - return inverser -} - -// Docs returns an iterator for documents with their IDs. -// Handles duplicate IDs (for nested indexes). 
-func (src *SealingSource) Docs() iter.Seq2[seq.ID, []byte] { - src.lastErr = nil - return func(yield func(seq.ID, []byte) bool) { - var ( - prev seq.ID - curDoc []byte - ) - - // Iterate through ID and position blocks - for ids, pos := range src.IDsBlocks(consts.IDsPerBlock) { - for i, id := range ids { - if id == systemSeqID { - curDoc = nil // reserved system document (no payload) - } else if id != prev { - // If ID changed, read new document - if curDoc, src.lastErr = src.doc(pos[i]); src.lastErr != nil { - return - } - } - prev = id - if !yield(id, curDoc) { - return - } - } - } - } +// BlocksOffsets returns document block offsets. +func (src *SealingSource) BlocksOffsets() []uint64 { + return src.index.blocksOffsets } -// doc reads a document from storage by its position. -func (src *SealingSource) doc(pos seq.DocPos) ([]byte, error) { - blockIndex, docOffset := pos.Unpack() - blockOffset := src.blocksOffsets[blockIndex] - - var doc []byte - err := src.docsReader.ReadDocsFunc(blockOffset, []uint64{docOffset}, func(b []byte) error { - doc = b - return nil - }) - if err != nil { - return nil, err - } - return doc, nil +// LastError returns the last recorded source error. 
+func (src *SealingSource) LastError() error { + return src.lastErr } diff --git a/frac/active2/indexer.go b/frac/active2/indexer.go deleted file mode 100644 index c8ff105d..00000000 --- a/frac/active2/indexer.go +++ /dev/null @@ -1,352 +0,0 @@ -package active2 - -import ( - "cmp" - "encoding/binary" - "slices" - "unsafe" - - "github.com/ozontech/seq-db/indexer" - "github.com/ozontech/seq-db/metric/stopwatch" - "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/storage" - "github.com/ozontech/seq-db/tokenizer" - "github.com/ozontech/seq-db/util" -) - -const uint32Size = uint32(unsafe.Sizeof(uint32(0))) - -// Indexer indexes documents with concurrency limitation -type Indexer struct { - workerPool WorkerLimiter -} - -// NewIndexer creates a new indexer with specified number of workers -func NewIndexer(workerPool WorkerLimiter) *Indexer { - return &Indexer{ - workerPool: workerPool, - } -} - -// indexerBuffer is a temporary reusable buffer used during index construction to avoid allocations. -// It holds intermediate data structures that are needed during processing but not in the final index. 
-type indexerBuffer struct { - sizes []uint32 - fields []string - fieldTIDs []uint32 - tokens []tokenizer.MetaToken - tokenMap map[tokenStr]uint32 -} - -// Index starts asynchronous document indexing -func (idx *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, err error)) { - idx.workerPool.Acquire() - go func() { - apply(NewMemIndex(block)) - idx.workerPool.Release() - }() -} - -// NewMemIndex creates an in-memory index from a document block -func NewMemIndex(block storage.DocBlock) (*memIndex, error) { - sw := stopwatch.New() - - res, release := NewResources() - defer release() - - // Decompress metadata - payload, err := decompressMeta(res, block, sw) - if err != nil { - return nil, err - } - - buf := res.GetBuffer() - - // Decode metadata - meta, err := decodeMetadata(res, buf, payload, sw) - if err != nil { - return nil, err - } - // Initialize index - idx := newMemIndex() - idx.docsCount = uint32(len(meta)) - idx.ids = idx.res.GetIDs(len(meta)) - idx.positions = idx.res.GetDocPos(len(meta)) - idx.blocksOffsets = idx.res.GetUint64s(1) // Only one block per bulk - idx.blocksOffsets[0] = block.GetExt2() - - // Extract tokens from metadata - tids, lids, tokens, err := extractTokens(idx, res, buf, meta) - if err != nil { - return nil, err - } - - // Group documents by token - tokenLIDs := groupLIDsByTID(idx, res, tids, lids, len(tokens)) - - // Organize tokens and fields - organizeTokens(idx, res, buf, tokens, tokenLIDs) - - // Set special "all" token - idx.allTID = uint32(idx.fieldsTokens[seq.TokenAll].start) - - return idx, nil -} - -// tokenStr represents a unique token as a (field, value) pair. -// Used as a map key during token deduplication. 
-type tokenStr struct { - value string - field string -} - -func toToken(t tokenizer.MetaToken) tokenStr { - return tokenStr{ - value: util.ByteToStringUnsafe(t.Value), - field: util.ByteToStringUnsafe(t.Key), - } -} - -// extractTokens extracts tokens from document metadata -func extractTokens( - idx *memIndex, - res *Resources, - buf *indexerBuffer, - meta []indexer.MetaData, -) ([]uint32, []uint32, []tokenStr, error) { - var docOffset uint64 - var totalTokens uint32 - - // Calculate document positions in the original block - // Each document is stored as: [size: uint32][data: size bytes] - positions := res.GetDocPos(len(meta)) - prev := seq.PackDocPos(0, docOffset) - - for i := range meta { - docMeta := meta[i] - if docMeta.Size > 0 { - prev = seq.PackDocPos(0, docOffset) - docOffset += uint64(docMeta.Size) + uint64(uint32Size) - } - positions[i] = prev - totalTokens += docMeta.TokensCount() - } - - // Create ordering by document ID (descending) - // We need to map global document IDs to local IDs (LIDs) - order := res.GetUint32s(len(meta)) - for i := range order { - order[i] = uint32(i) - } - slices.SortFunc(order, func(a, b uint32) int { - return seq.Compare(meta[b].ID, meta[a].ID) - }) - - // Fill index structures with sorted documents - for lid, origIdx := range order { - docMeta := meta[origIdx] - idx.ids[lid] = docMeta.ID - idx.positions[lid] = positions[origIdx] - idx.docsSize += uint64(docMeta.Size) - } - - // Extract and process tokens from all documents - var err error - var token tokenStr - - // Allocate slices for token-document relationships - lids := res.GetUint32s(int(totalTokens))[:0] // Local document ID for each token occurrence - tids := res.GetUint32s(int(totalTokens))[:0] // Token ID for each occurrence - - // Process documents in ID-sorted order - for lid, origIdx := range order { - docMeta := meta[origIdx] - - // Decode tokens for this document - if buf.tokens, err = docMeta.DecodeTokens(buf.tokens[:0]); err != nil { - return nil, nil, 
nil, err - } - - // Process each token in the document - for _, t := range buf.tokens { - token = toToken(t) - tid, exists := buf.tokenMap[token] - if !exists { - tid = uint32(len(buf.tokenMap)) // assign new token ID - buf.tokenMap[token] = tid - } - tids = append(tids, tid) - lids = append(lids, uint32(lid)+1) // store lid+1 (1-based indexing for internal use) - } - } - - // Create reverse mapping: tokenID -> tokenKey - tokens := res.GetTokens(len(buf.tokenMap)) - for key, tokenID := range buf.tokenMap { - tokens[tokenID] = key - } - - return tids, lids, tokens, nil -} - -// groupLIDsByTID groups document IDs by token -// Input: flat arrays of (tid, lid) pairs -// Output: 2D array where tokenLIDs[tid] = []lid -func groupLIDsByTID(idx *memIndex, res *Resources, tids, lids []uint32, tokenCount int) [][]uint32 { - // Phase 1: Count documents per token - counts := res.GetUint32s(tokenCount) - clear(counts) - for _, tid := range tids { - counts[tid]++ - } - - // Phase 2: Allocate slices for each token group - // We use a single large buffer and slice it for efficiency - tokenLIDs := res.GetUint32Slices(tokenCount) - allTokenLIDs := idx.res.GetUint32s(len(lids)) - idx.allTokenLIDsCount = len(lids) - - tokenLIDs = tokenLIDs[:len(counts)] - for tid, count := range counts { - tokenLIDs[tid] = allTokenLIDs[:count][:0] - allTokenLIDs = allTokenLIDs[count:] - } - - // Phase 3: Populate groups with document IDs - // We reuse docIDs slice bounds for safety - lids = lids[:len(tids)] - for i, tid := range tids { - if len(tokenLIDs[tid]) > 0 { - if lids[i] == lastLID(tokenLIDs[tid]) { // deduplication - idx.allTokenLIDsCount-- - continue - } - } - tokenLIDs[tid] = append(tokenLIDs[tid], lids[i]) - } - - return tokenLIDs -} - -func lastLID(s []uint32) uint32 { - return s[len(s)-1] -} - -// organizeTokens organizes tokens and fields in the index with proper sorting -func organizeTokens(idx *memIndex, res *Resources, buf *indexerBuffer, tokens []tokenStr, tokenLIDs [][]uint32) { - 
tokenSize := 0 - order := res.GetUint32s(len(tokens)) - order = order[:len(tokens)] - for i, t := range tokens { - order[i] = uint32(i) - tokenSize += len(t.value) - } - - // Create ordering for sorting tokens - // We'll sort by (field, value) to group tokens by field - slices.SortFunc(order, func(a, b uint32) int { - tokenA, tokenB := tokens[a], tokens[b] - return cmp.Or( - cmp.Compare(tokenA.field, tokenB.field), - cmp.Compare(tokenA.value, tokenB.value), - ) - }) - - fieldSize := 0 - prevField := "" - - // Prepare buffers for sorted data - tokenBuffer := idx.res.GetBytes(tokenSize)[:0] - idx.tokenLIDs = idx.res.GetUint32Slices(len(order)) - idx.tokens = idx.res.GetBytesSlices(len(order)) - - // Process tokens in sorted order - for tid, origIdx := range order { - token := tokens[origIdx] - - // Detect field boundaries - // When field name changes, record the field and its first token position - if token.field != prevField || prevField == "" { - fieldSize += len(token.field) - buf.fields = append(buf.fields, token.field) - buf.fieldTIDs = append(buf.fieldTIDs, uint32(tid)) - } - prevField = token.field - - // Copy token value to buffer and keep reference - start := len(tokenBuffer) - tokenBuffer = append(tokenBuffer, token.value...) - - // Store in sorted arrays - // Note: We use original tokenID as index to preserve tokenID->data mapping - idx.tokens[tid] = tokenBuffer[start:] - idx.tokenLIDs[tid] = tokenLIDs[origIdx] - } - // Add sentinel value for easier range calculation - buf.fieldTIDs = append(buf.fieldTIDs, uint32(len(tokens))) - - // Organize fields - fieldBuffer := idx.res.GetBytes(fieldSize)[:0] - idx.fields = idx.res.GetBytesSlices(len(buf.fields)) - - idx.fieldsTokens = make(map[string]tokenRange, len(buf.fields)) - - for i, field := range buf.fields { - // Copy field name to buffer - start := len(fieldBuffer) - fieldBuffer = append(fieldBuffer, field...) 
- idx.fields[i] = fieldBuffer[start:] - - // Calculate token range for this field - // Each field has continuous range of token IDs in sorted order - startTID := buf.fieldTIDs[i] - endTID := buf.fieldTIDs[i+1] - idx.fieldsTokens[util.ByteToStringUnsafe(fieldBuffer[start:])] = tokenRange{ - start: startTID, - count: endTID - startTID, - } - } -} - -// decompressMeta decompresses metadata from block -func decompressMeta(res *Resources, block storage.DocBlock, sw *stopwatch.Stopwatch) ([]byte, error) { - m := sw.Start("decompress_meta") - defer m.Stop() - - // Allocate exact size needed for compressed data - buffer := res.GetBytes(int(block.RawLen())) - payload, err := block.DecompressTo(buffer) - if err != nil { - return nil, err - } - return payload, nil -} - -// decodeMetadata decodes document metadata from binary format -// Format: [size: uint32][data: size bytes][size: uint32][data: size bytes]... -func decodeMetadata(res *Resources, buf *indexerBuffer, payload []byte, sw *stopwatch.Stopwatch) ([]indexer.MetaData, error) { - m := sw.Start("decode_meta") - defer m.Stop() - - // First pass: scan to determine sizes of each metadata entry - var offset uint32 - for offset < uint32(len(payload)) { - size := binary.LittleEndian.Uint32(payload[offset:]) - offset += uint32Size + size - buf.sizes = append(buf.sizes, size) - } - - // Second pass: decode each metadata entry - meta := res.GetMetadata(len(buf.sizes)) - for i, size := range buf.sizes { - // Skip size field to get to actual data - data := payload[uint32Size : size+uint32(uint32Size)] - if err := meta[i].UnmarshalBinaryLazy(data); err != nil { - return nil, err - } - // Move to next entry - payload = payload[size+uint32(uint32Size):] - } - - return meta, nil -} diff --git a/frac/active2/mem_index.go b/frac/active2/mem_index.go deleted file mode 100644 index dfb04d3a..00000000 --- a/frac/active2/mem_index.go +++ /dev/null @@ -1,77 +0,0 @@ -package active2 - -import ( - "sort" - "sync" - - 
"github.com/ozontech/seq-db/seq" -) - -type tokenRange struct { - start uint32 - count uint32 -} - -type memIndex struct { - ids []seq.ID // IDs ordered DESC - tokens [][]byte // tokens ordered ASC by field:token - tokenLIDs [][]uint32 // LIDs list for each token from `tokens` - fieldsTokens map[string]tokenRange // tokens locator for each field - fields [][]byte // fields ordered ASC - blocksOffsets []uint64 // blocks offsets ordered by offset - positions []seq.DocPos - allTID uint32 - - docsSize uint64 - docsCount uint32 - allTokenLIDsCount int - - wg sync.WaitGroup - res *Resources - release func() -} - -func newMemIndex() *memIndex { - res, release := NewResources() - return &memIndex{ - res: res, - release: release, - } -} - -func (idx *memIndex) getTokenProvider(field string) *tokenProvider { - if r, ok := idx.fieldsTokens[field]; ok { - return &tokenProvider{ - firstTID: r.start, - lastTID: r.start + r.count - 1, - tokens: idx.tokens, - } - } - // Field is not indexed - return empty token provider - return &tokenProvider{ - firstTID: 1, - lastTID: 0, // firstTID > lastTID = no tokens available - tokens: idx.tokens, - } -} - -func (idx *memIndex) IsIntersecting(from, to seq.MID) bool { - maxMID := idx.ids[0].MID - minMID := idx.ids[len(idx.ids)-1].MID - if to < minMID || maxMID < from { - return false - } - return true -} - -func (idx *memIndex) GetLIDByID(id seq.ID) (uint32, bool) { - i, ok := sort.Find(len(idx.ids), func(i int) int { return seq.Compare(idx.ids[i], id) }) - return uint32(i + 1), ok -} - -func (idx *memIndex) Release() { - go func() { // non blocking call - idx.wg.Wait() - idx.release() - }() -} diff --git a/frac/active2/merge.go b/frac/active2/merge.go deleted file mode 100644 index 4a33899a..00000000 --- a/frac/active2/merge.go +++ /dev/null @@ -1,257 +0,0 @@ -package active2 - -import ( - "bytes" - "slices" - - "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/util" -) - -func mergeIndexes(indexes []*memIndex) *memIndex { - 
blocksCount := 0 - dst := newMemIndex() - for _, idx := range indexes { - dst.docsSize += idx.docsSize - dst.docsCount += idx.docsCount - dst.allTokenLIDsCount += idx.allTokenLIDsCount - blocksCount += len(idx.blocksOffsets) - } - - res, release := NewResources() - defer release() - - dst.ids = dst.res.GetIDs(int(dst.docsCount))[:0] - dst.positions = dst.res.GetDocPos(int(dst.docsCount))[:0] - dst.blocksOffsets = dst.res.GetUint64s(blocksCount)[:0] - - posMap := mergeBlocksOffsets(dst, res, indexes) - lidsMap := mergeIDs(dst, res, indexes, posMap) - mergeTokens(dst, res, indexes, lidsMap) - - dst.allTID = dst.fieldsTokens[seq.TokenAll].start - - return dst -} - -func mergeIDs(dst *memIndex, res *Resources, indexes []*memIndex, posMap [][]seq.DocPos) [][]uint32 { - lidsMap := res.GetUint32Slices(len(indexes)) - iters := make([]OrderedStream[DocRef], len(indexes)) - for i, idx := range indexes { - iters[i] = &DocStream{ - i: i, - idx: idx, - posMap: posMap[i], - } - lidsMap[i] = res.GetUint32s(int(idx.docsCount) + 1)[:1] // 1-based - } - - orderedIDs := MergeSortedStreams(iters, func(a, b DocRef) int { return seq.Compare(b.id, a.id) }) - - cur, has := orderedIDs.Next() - for has { - dst.ids = append(dst.ids, cur.id) - dst.positions = append(dst.positions, cur.pos) - lid := uint32(len(dst.ids)) - lidsMap[cur.i] = append(lidsMap[cur.i], lid) - cur, has = orderedIDs.Next() - } - return lidsMap -} - -func mergeTokens(dst *memIndex, res *Resources, indexes []*memIndex, lidsMap [][]uint32) { - totalTokens := 0 - tokensIterators := make([]OrderedStream[TokenRef], len(indexes)) - for i, idx := range indexes { - totalTokens += len(idx.tokens) - tokensIterators[i] = NewTokenStream(idx, lidsMap[i]) - } - - cmpToken := func(a, b TokenRef) int { - r := bytes.Compare(a.Field(), b.Field()) - if r == 0 { - return bytes.Compare(a.Value(), b.Value()) - } - return r - } - - orderedTokens := MergeSortedStreams(tokensIterators, cmpToken) - - uniqTokensSize := 0 - uniqTokensCount := 0 - - 
uniqFieldsSize := 0 - uniqFieldsCount := 0 - - var ( - prevField []byte - prevToken TokenRef - ) - - borders := res.GetBytes(totalTokens)[:0] - tokens := make([]TokenRef, 0, totalTokens) - - for cur, has := orderedTokens.Next(); has; cur, has = orderedTokens.Next() { - var border uint8 - - if prevToken.payload == nil || cmpToken(prevToken, cur) != 0 { - uniqTokensCount++ - uniqTokensSize += len(cur.Value()) - border++ - - field := cur.Field() - if !bytes.Equal(prevField, field) { - uniqFieldsCount++ - uniqFieldsSize += len(field) - border++ - prevField = field - } - } - - borders = append(borders, border) - tokens = append(tokens, cur) - prevToken = cur - } - - dst.fieldsTokens = make(map[string]tokenRange, uniqFieldsCount) - dst.fields = dst.res.GetBytesSlices(uniqFieldsCount)[:0] - dst.tokens = dst.res.GetBytesSlices(uniqTokensCount)[:0] - dst.tokenLIDs = dst.res.GetUint32Slices(uniqTokensCount)[:0] - - allTokens := dst.res.GetBytes(uniqTokensSize)[:0] - allFields := dst.res.GetBytes(uniqFieldsSize)[:0] - - lidsCollector := NewLIDsCollector( - res.GetUint32s(int(dst.docsCount)), // tmp buf - dst.res.GetUint32s(dst.allTokenLIDsCount - int(dst.docsCount))[:0], // all token LIDs - dst.res.GetUint32s(int(dst.docsCount)), // ALL LIDs for token _all_ - res.GetBytes((int(dst.docsCount) + 1)), // sort buffer - ) - - var isAllToken bool - for i, token := range tokens { - token := token - if borders[i] > 0 { - - if i > 0 { - dst.tokenLIDs = append(dst.tokenLIDs, lidsCollector.GetSorted()) - } - - if borders[i] > 1 { - - tid := uint32(len(dst.tokens)) - - if i > 0 { - fieldStr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) - tr := dst.fieldsTokens[fieldStr] - tr.count = tid - tr.start - dst.fieldsTokens[fieldStr] = tr - } - - start := len(allFields) - allFields = append(allFields, token.Field()...) 
- field := allFields[start:] - dst.fields = append(dst.fields, field) - - fieldStr := util.ByteToStringUnsafe(field) - dst.fieldsTokens[fieldStr] = tokenRange{start: tid} - - isAllToken = fieldStr == seq.TokenAll - } - - start := len(allTokens) - allTokens = append(allTokens, token.Value()...) - dst.tokens = append(dst.tokens, allTokens[start:]) - } - - if isAllToken { - for range token.LIDs() { - lidsCollector.Add(0) // stub - } - } else { - newLIDsMap := token.lidsMap() - for _, oldLID := range token.LIDs() { - lidsCollector.Add(newLIDsMap[oldLID]) - } - } - } - - dst.tokenLIDs = append(dst.tokenLIDs, lidsCollector.GetSorted()) - - tid := uint32(len(dst.tokens)) - 1 - fieldStr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) - tr := dst.fieldsTokens[fieldStr] - tr.count = tid - tr.start + 1 - dst.fieldsTokens[fieldStr] = tr -} - -type LIDsCollector struct { - tmp []uint32 - lids []uint32 - all []uint32 - buf []uint8 -} - -func NewLIDsCollector(tmp, lids, all []uint32, buf []uint8) *LIDsCollector { - clear(buf) - for i := range all { - all[i] = uint32(i) + 1 - } - return &LIDsCollector{ - tmp: tmp[:0], - lids: lids[:0], - all: all, - buf: buf, - } -} - -func (s *LIDsCollector) Add(lid uint32) { - s.tmp = append(s.tmp, lid) -} - -func (s *LIDsCollector) GetSorted() (dst []uint32) { - n := len(s.tmp) - - if n == len(s.all) { - s.tmp = s.tmp[:0] - return s.all - } - - if 100*n/len(s.all) > 50 { - for _, v := range s.tmp { - s.buf[v] = 1 - } - start := len(s.lids) - for lid, ok := range s.buf { - if ok == 1 { - s.buf[lid] = 0 - s.lids = append(s.lids, uint32(lid)) - } - } - s.tmp = s.tmp[:0] - return s.lids[start:] - } - - if n > 1 { - slices.Sort(s.tmp) - } - start := len(s.lids) - s.lids = append(s.lids, s.tmp...) 
- s.tmp = s.tmp[:0] - return s.lids[start:] -} - -func mergeBlocksOffsets(dst *memIndex, res *Resources, indexes []*memIndex) [][]seq.DocPos { - var offset uint32 - positions := res.GetDocPosSlices(len(indexes)) - for i, index := range indexes { - dst.blocksOffsets = append(dst.blocksOffsets, index.blocksOffsets...) - positions[i] = res.GetDocPos(len(index.positions))[:0] - for _, p := range index.positions { - oldIdx, docOffset := p.Unpack() - positions[i] = append(positions[i], seq.PackDocPos(oldIdx+offset, docOffset)) - } - offset += uint32(len(index.blocksOffsets)) - } - return positions -} diff --git a/frac/active2/sealing_source.go b/frac/active2/sealing_source.go deleted file mode 100644 index 14004eb9..00000000 --- a/frac/active2/sealing_source.go +++ /dev/null @@ -1,140 +0,0 @@ -package active2 - -import ( - "errors" - "iter" - "math" - "time" - - "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/active" - "github.com/ozontech/seq-db/frac/sealed/sealing" - "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/util" -) - -var ( - _ sealing.Source = (*SealingSource)(nil) - - systemSeqID = seq.ID{ - MID: math.MaxUint64, - RID: math.MaxUint64, - } -) - -type SealingSource struct { - info *frac.Info - index *memIndex - lastErr error -} - -func NewSealingSource(a *Active2, params frac.SealParams) (sealing.Source, error) { - a.merger.ForceMergeAll() - - iss, release := a.indexes.Snapshot() - defer release() - - if len(iss.indexes) != 1 { - return nil, errors.New("wrong count of fraction memIndexes") - } - - ss := &SealingSource{ - info: iss.info, - index: iss.indexes[0], - } - - // Sort documents if not skipped in configuration - if !a.Config.SkipSortDocs { - ds := active.NewDocsSource(ss, ss.index.blocksOffsets, &a.sortReader) - blocksOffsets, positions, onDiskSize, err := sealing.SortDocs(ss.info.Path, params, ds) - if err != nil { - return nil, err - } - ss.index.positions = positions[1:] - ss.index.blocksOffsets = blocksOffsets - 
ss.info.DocsOnDisk = uint64(onDiskSize) - } - - ss.info.MetaOnDisk = 0 - ss.info.SealingTime = uint64(time.Now().UnixMilli()) - ss.info.BuildDistributionWithIDs(ss.index.ids) - - return ss, nil -} - -func (src *SealingSource) Info() *frac.Info { - return src.info -} - -func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { - return func(yield func([]seq.ID, []seq.DocPos) bool) { - ids := make([]seq.ID, 0, blockSize) - pos := make([]seq.DocPos, 0, blockSize) - - // first - ids = append(ids, systemSeqID) // todo get rid of systemSeqID in index format - pos = append(pos, 0) - - for i, id := range src.index.ids { - if len(ids) == blockSize { - if !yield(ids, pos) { - return - } - ids = ids[:0] - pos = pos[:0] - } - ids = append(ids, id) - pos = append(pos, src.index.positions[i]) - } - yield(ids, pos) - } -} - -func (src *SealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { - return func(yield func([][]byte) bool) { - actualSize := 0 - block := make([][]byte, 0, blockSize) - for _, token := range src.index.tokens { - if actualSize >= blockSize { - if !yield(block) { - return - } - actualSize = 0 - block = block[:0] - } - actualSize += len(token) + int(uint32Size) - block = append(block, token) - } - yield(block) - } -} - -func (src *SealingSource) Fields() iter.Seq2[string, uint32] { - return func(yield func(string, uint32) bool) { - for _, field := range src.index.fields { - f := util.ByteToStringUnsafe(field) - r := src.index.fieldsTokens[f] - if !yield(f, r.start+r.count) { - return - } - } - } -} - -func (src *SealingSource) TokenLIDs() iter.Seq[[]uint32] { - return func(yield func([]uint32) bool) { - for _, tokenLIDs := range src.index.tokenLIDs { - if !yield(tokenLIDs) { - return - } - } - } -} - -func (src *SealingSource) BlocksOffsets() []uint64 { - return src.index.blocksOffsets -} - -func (src *SealingSource) LastError() error { - return src.lastErr -} diff --git a/frac/active/active.go b/frac/active_old/active.go 
similarity index 99% rename from frac/active/active.go rename to frac/active_old/active.go index 9db7a5d7..506c615e 100644 --- a/frac/active/active.go +++ b/frac/active_old/active.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "context" diff --git a/frac/active/docs_positions.go b/frac/active_old/docs_positions.go similarity index 98% rename from frac/active/docs_positions.go rename to frac/active_old/docs_positions.go index 2f5f8bc5..3acb7c2b 100644 --- a/frac/active/docs_positions.go +++ b/frac/active_old/docs_positions.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "sync" diff --git a/frac/active/docs_source.go b/frac/active_old/docs_source.go similarity index 98% rename from frac/active/docs_source.go rename to frac/active_old/docs_source.go index c70c5ddb..0dae2120 100644 --- a/frac/active/docs_source.go +++ b/frac/active_old/docs_source.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "iter" diff --git a/frac/active/file_writer.go b/frac/active_old/file_writer.go similarity index 99% rename from frac/active/file_writer.go rename to frac/active_old/file_writer.go index 0b1c2a86..8525c462 100644 --- a/frac/active/file_writer.go +++ b/frac/active_old/file_writer.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "io" diff --git a/frac/active/file_writer_test.go b/frac/active_old/file_writer_test.go similarity index 99% rename from frac/active/file_writer_test.go rename to frac/active_old/file_writer_test.go index 84fe9e02..66a5e0b0 100644 --- a/frac/active/file_writer_test.go +++ b/frac/active_old/file_writer_test.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "errors" diff --git a/frac/active/ids.go b/frac/active_old/ids.go similarity index 97% rename from frac/active/ids.go rename to frac/active_old/ids.go index 5207ecd4..8d0508f8 100644 --- a/frac/active/ids.go +++ b/frac/active_old/ids.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "sync" diff --git 
a/frac/active/ids_test.go b/frac/active_old/ids_test.go similarity index 98% rename from frac/active/ids_test.go rename to frac/active_old/ids_test.go index 71a15131..9995ae82 100644 --- a/frac/active/ids_test.go +++ b/frac/active_old/ids_test.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "sync" diff --git a/frac/active/index.go b/frac/active_old/index.go similarity index 99% rename from frac/active/index.go rename to frac/active_old/index.go index 3c1e01d1..fe49a22a 100644 --- a/frac/active/index.go +++ b/frac/active_old/index.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "context" diff --git a/frac/active_old/indexer.go b/frac/active_old/indexer.go new file mode 100644 index 00000000..91672462 --- /dev/null +++ b/frac/active_old/indexer.go @@ -0,0 +1,195 @@ +package active_old + +import ( + "encoding/binary" + "sync" + + "go.uber.org/zap" + + "github.com/ozontech/seq-db/bytespool" + "github.com/ozontech/seq-db/indexer" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/metric" + "github.com/ozontech/seq-db/metric/stopwatch" + "github.com/ozontech/seq-db/storage" +) + +type Indexer struct { + ch chan *indexTask + chMerge chan *mergeTask + workerCount int +} + +type indexTask struct { + Frac *Active + Metas storage.DocBlock + Pos uint64 + Wg *sync.WaitGroup +} + +type mergeTask struct { + frac *Active + tokenLIDs *TokenLIDs +} + +func NewIndexer(workerCount, chLen int) (*Indexer, func()) { + idx := Indexer{ + ch: make(chan *indexTask, chLen), + chMerge: make(chan *mergeTask, chLen), + workerCount: workerCount, + } + stopIdx := idx.start() + return &idx, stopIdx +} + +func (ai *Indexer) Index(frac *Active, metas []byte, wg *sync.WaitGroup, sw *stopwatch.Stopwatch) { + m := sw.Start("send_index_chan") + ai.ch <- &indexTask{ + Pos: storage.DocBlock(metas).GetExt2(), + Metas: metas, + Frac: frac, + Wg: wg, + } + m.Stop() +} + +func (ai *Indexer) start() func() { + wg := sync.WaitGroup{} + wg.Add(ai.workerCount) + + 
for i := 0; i < ai.workerCount; i++ { + go func(index int) { + defer wg.Done() + ai.appendWorker(index) + }(i) + } + + wg.Add(ai.workerCount) + for i := 0; i < ai.workerCount; i++ { + go func() { + defer wg.Done() + ai.mergeWorker() + }() + } + + return func() { + close(ai.ch) + close(ai.chMerge) + wg.Wait() + } +} + +func (ai *Indexer) mergeWorker() { + for task := range ai.chMerge { + task.tokenLIDs.GetLIDs(task.frac.MIDs, task.frac.RIDs) // GetLIDs cause sort and merge LIDs from queue + } +} + +var metaDataPool = sync.Pool{ + New: func() any { + return new(indexer.MetaData) + }, +} + +func (ai *Indexer) appendWorker(index int) { + // collector of bulk meta data + collector := newMetaDataCollector() + + for task := range ai.ch { + var err error + + sw := stopwatch.New() + total := sw.Start("total_indexing") + + metaBuf := bytespool.Acquire(int(task.Metas.RawLen())) + + if metaBuf.B, err = task.Metas.DecompressTo(metaBuf.B); err != nil { + logger.Panic("error decompressing meta", zap.Error(err)) // TODO: error handling + } + metasPayload := metaBuf.B + + active := task.Frac + blockIndex := active.DocBlocks.Append(task.Pos) + collector.Init(blockIndex) + + parsingMetric := sw.Start("metas_parsing") + meta := metaDataPool.Get().(*indexer.MetaData) + for len(metasPayload) > 0 { + n := binary.LittleEndian.Uint32(metasPayload) + metasPayload = metasPayload[4:] + documentMetadata := metasPayload[:n] + metasPayload = metasPayload[n:] + + if err := meta.UnmarshalBinary(documentMetadata); err != nil { + logger.Panic("BUG: can't unmarshal meta", zap.Error(err)) + } + collector.AppendMeta(*meta) + } + metaDataPool.Put(meta) + bytespool.Release(metaBuf) + parsingMetric.Stop() + + m := sw.Start("doc_params_set") + appendedIDs := active.DocsPositions.SetMultiple(collector.IDs, collector.Positions) + if len(appendedIDs) != len(collector.IDs) { + // There are duplicates in the active fraction. + // It is possible in case we retry same bulk requests. 
+ // So we need to remove duplicates from collector. + doublesCnt := len(collector.IDs) - len(appendedIDs) + metric.BulkDuplicateDocsTotal.Observe(float64(doublesCnt)) + logger.Warn("found duplicates", zap.Int("batch", doublesCnt), zap.Int("worker", index)) + collector.Filter(appendedIDs) + } + m.Stop() + + m = sw.Start("append_ids") + lids := active.AppendIDs(collector.IDs) + m.Stop() + + m = sw.Start("token_list_append") + tokenLIDsPlaces := collector.PrepareTokenLIDsPlaces() + active.TokenList.Append(collector.TokensValues, collector.FieldsLengths, tokenLIDsPlaces) + m.Stop() + + m = sw.Start("group_lids") + groups := collector.GroupLIDsByToken(lids) + m.Stop() + + m = sw.Start("put_lids_queue") + tokensToMerge := addLIDsToTokens(tokenLIDsPlaces, groups) + ai.sendTokensToMergeWorkers(active, tokensToMerge) + m.Stop() + + active.UpdateStats(collector.MinMID, collector.MaxMID, collector.DocsCounter, collector.SizeCounter) + + task.Wg.Done() + + total.Stop() + sw.Export(bulkStagesSeconds) + } +} + +func (ai *Indexer) sendTokensToMergeWorkers(frac *Active, tokens []*TokenLIDs) { + for _, tl := range tokens { + task := mergeTask{ + frac: frac, + tokenLIDs: tl, + } + select { + case ai.chMerge <- &task: + default: // skip background merge if workers are busy + } + } +} + +func addLIDsToTokens(tlids []*TokenLIDs, lids [][]uint32) []*TokenLIDs { + const minMergeQueue = 10000 + + needMerge := make([]*TokenLIDs, 0, len(tlids)) + for i, tl := range tlids { + if l := tl.PutLIDsInQueue(lids[i]); l > minMergeQueue { + needMerge = append(needMerge, tl) + } + } + return needMerge +} diff --git a/frac/active2/indexer_test.go b/frac/active_old/indexer_test.go similarity index 79% rename from frac/active2/indexer_test.go rename to frac/active_old/indexer_test.go index d2b69c4e..02b794fd 100644 --- a/frac/active2/indexer_test.go +++ b/frac/active_old/indexer_test.go @@ -1,4 +1,4 @@ -package active2 +package active_old import ( "bytes" @@ -9,67 +9,80 @@ import ( "time" 
"github.com/alecthomas/units" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zapcore" + "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/metric/stopwatch" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/tests/common" "github.com/ozontech/seq-db/tokenizer" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "go.uber.org/zap/zapcore" ) -func BenchmarkIndexer(b *testing.B) { - logger.SetLevel(zapcore.FatalLevel) - - allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) - readers := splitLogsToBulks(allLogs, 1000) - assert.NoError(b, err) - - processor := getTestProcessor() +func readFileAllAtOnce(filename string) ([][]byte, error) { + content, err := os.ReadFile(filename) + if err != nil { + return nil, err + } + lines := bytes.Split(content, []byte{'\n'}) + if len(lines) > 0 && len(lines[len(lines)-1]) == 0 { + lines = lines[:len(lines)-1] + } + return lines, nil +} - n := 2 - allMeta := make([][]byte, 0, len(readers)*n) +func splitLogsToBulks(data [][]byte, bulkSize int) []func() ([]byte, error) { + funcs := []func() ([]byte, error){} + for len(data) > 0 { + size := min(len(data), bulkSize) + funcs = append(funcs, testBufReader(data[0:size])) + data = data[size:] + } + return funcs +} - for range n { - for _, readNext := range readers { - _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) - allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) +func testBufReader(data [][]byte) func() ([]byte, error) { + orig := data + return func() ([]byte, error) { + if len(data) == 0 { + data = orig + return nil, nil } + line := data[0] + data = data[1:] + return line, nil } +} - 
b.ResetTimer() - for i := 0; i < b.N; i++ { - b.StopTimer() - active := New( - filepath.Join(b.TempDir(), "test"), - &frac.Config{}, - config.NumCPU, - storage.NewReadLimiter(1, nil), - cache.NewCache[[]byte](nil, nil), - cache.NewCache[[]byte](nil, nil), - ) - b.StartTimer() +func getTestProcessor() *indexer.Processor { + mapping := seq.Mapping{ + "clientip": seq.NewSingleType(seq.TokenizerTypeKeyword, "clientip", 1024), + "request": seq.NewSingleType(seq.TokenizerTypeText, "request", 1024), + "status": seq.NewSingleType(seq.TokenizerTypeKeyword, "status", 1024), + "size": seq.NewSingleType(seq.TokenizerTypeKeyword, "size", 1024), + } - wg := sync.WaitGroup{} - for _, meta := range allMeta { - wg.Add(1) - active.indexer.Index(meta, func(idx *memIndex, err error) { - active.indexes.Add(idx, 0, 0) - wg.Done() - }) - } - wg.Wait() + tokenizers := map[seq.TokenizerType]tokenizer.Tokenizer{ + seq.TokenizerTypeText: tokenizer.NewTextTokenizer(1024, false, true, 8192), + seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(1024, false, true), + seq.TokenizerTypePath: tokenizer.NewPathTokenizer(1024, false, true), + seq.TokenizerTypeExists: tokenizer.NewExistsTokenizer(), } + + return indexer.NewProcessor(mapping, tokenizers, 0, 0, 0) } -func BenchmarkMerge(b *testing.B) { +func BenchmarkIndexer(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) + idx, stop := NewIndexer(config.NumCPU, config.NumCPU) + defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) readers := splitLogsToBulks(allLogs, 1000) @@ -90,28 +103,22 @@ func BenchmarkMerge(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { b.StopTimer() - active := New( filepath.Join(b.TempDir(), "test"), - &frac.Config{}, - config.NumCPU, + idx, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), + &frac.Config{}, ) + b.StartTimer() wg := sync.WaitGroup{} for _, meta := range allMeta { wg.Add(1) - active.indexer.Index(meta, 
func(idx *memIndex, err error) { - active.indexes.Add(idx, 0, 0) - wg.Done() - }) + idx.Index(active, meta, &wg, stopwatch.New()) } wg.Wait() - b.StartTimer() - - active.merger.ForceMergeAll() } } @@ -130,6 +137,8 @@ func defaultSealingParams() frac.SealParams { func BenchmarkFullWrite(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) + idx, stop := NewIndexer(config.NumCPU, config.NumCPU) + defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) readers := splitLogsToBulks(allLogs, 1000) @@ -154,11 +163,11 @@ func BenchmarkFullWrite(b *testing.B) { for b.Loop() { active := New( filepath.Join(b.TempDir(), "test"), - &frac.Config{SkipSortDocs: true}, - config.NumCPU, + idx, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), + &frac.Config{SkipSortDocs: true}, ) wg := sync.WaitGroup{} @@ -179,56 +188,3 @@ func BenchmarkFullWrite(b *testing.B) { active.Release() } } - -func readFileAllAtOnce(filename string) ([][]byte, error) { - content, err := os.ReadFile(filename) - if err != nil { - return nil, err - } - lines := bytes.Split(content, []byte{'\n'}) - if len(lines) > 0 && len(lines[len(lines)-1]) == 0 { - lines = lines[:len(lines)-1] - } - return lines, nil -} - -func splitLogsToBulks(data [][]byte, bulkSize int) []func() ([]byte, error) { - funcs := []func() ([]byte, error){} - for len(data) > 0 { - size := min(len(data), bulkSize) - funcs = append(funcs, testBufReader(data[0:size])) - data = data[size:] - } - return funcs -} - -func testBufReader(data [][]byte) func() ([]byte, error) { - orig := data - return func() ([]byte, error) { - if len(data) == 0 { - data = orig - return nil, nil - } - line := data[0] - data = data[1:] - return line, nil - } -} - -func getTestProcessor() *indexer.Processor { - mapping := seq.Mapping{ - "clientip": seq.NewSingleType(seq.TokenizerTypeKeyword, "clientip", 1024), - "request": seq.NewSingleType(seq.TokenizerTypeText, "request", 1024), - 
"status": seq.NewSingleType(seq.TokenizerTypeKeyword, "status", 1024), - "size": seq.NewSingleType(seq.TokenizerTypeKeyword, "size", 1024), - } - - tokenizers := map[seq.TokenizerType]tokenizer.Tokenizer{ - seq.TokenizerTypeText: tokenizer.NewTextTokenizer(1024, false, true, 8192), - seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(1024, false, true), - seq.TokenizerTypePath: tokenizer.NewPathTokenizer(1024, false, true), - seq.TokenizerTypeExists: tokenizer.NewExistsTokenizer(), - } - - return indexer.NewProcessor(mapping, tokenizers, 0, 0, 0) -} diff --git a/frac/active/inverser.go b/frac/active_old/inverser.go similarity index 98% rename from frac/active/inverser.go rename to frac/active_old/inverser.go index a96521eb..e4f6eefd 100644 --- a/frac/active/inverser.go +++ b/frac/active_old/inverser.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "unsafe" diff --git a/frac/active/meta_data_collector.go b/frac/active_old/meta_data_collector.go similarity index 99% rename from frac/active/meta_data_collector.go rename to frac/active_old/meta_data_collector.go index 07fb59d9..a8dfaa15 100644 --- a/frac/active/meta_data_collector.go +++ b/frac/active_old/meta_data_collector.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "math" diff --git a/frac/active_old/sealing_source.go b/frac/active_old/sealing_source.go new file mode 100644 index 00000000..9f93aeb7 --- /dev/null +++ b/frac/active_old/sealing_source.go @@ -0,0 +1,323 @@ +package active_old + +import ( + "bytes" + "errors" + "iter" + "slices" + "time" + "unsafe" + + "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" +) + +// SealingSource transforms data from in-memory (frac.Active) storage +// into a format suitable for disk writing during index creation. 
+// +// The main purpose of this type is to provide access to sorted data +// through a set of iterators that allow sequential processing of +// data in sized blocks for disk writing: +// +// - TokenBlocks() - iterator for token blocks, sorted by fields and values +// - Fields() - iterator for sorted fields with maximum TIDs +// - IDsBlocks() - iterator for document ID blocks and their positions +// - TokenLIDs() - iterator for LID lists for each token +// - Docs() - iterator for documents themselves with duplicate handling +// +// All iterators work with pre-sorted data and return information +// in an order optimal for creating disk index structures. +type SealingSource struct { + info *frac.Info // fraction Info + created time.Time // Creation time of the source + sortedLIDs []uint32 // Sorted LIDs (Local ID) + oldToNewLIDs []uint32 // Mapping from old LIDs to new ones (after sorting) + mids *UInt64s // MIDs + rids *UInt64s // RIDs + fields []string // Sorted field names + fieldsMaxTIDs []uint32 // Maximum TIDs for each field + tids []uint32 // Sorted TIDs (Token ID) + tokens [][]byte // Tokens (values) by TID + lids []*TokenLIDs // LID lists for each token + docPosOrig map[seq.ID]seq.DocPos // Original document positions + docPosSorted []seq.DocPos // Document positions after sorting + blocksOffsets []uint64 // Document block offsets + docsReader *storage.DocsReader // Document storage reader + lastErr error // Last error +} + +// NewSealingSource creates a new data source for sealing +// based on an active in-memory index. 
+func NewSealingSource(active *Active, params frac.SealParams) (*SealingSource, error) { + info := *active.info // copy + sortedLIDs := active.GetAllDocuments() + + // Sort fields and get maximum TIDs for each field + sortedFields, fieldsMaxTIDs := sortFields(active.TokenList) + + // Sort tokens within each field + sortedTIDs := sortTokens(sortedFields, active.TokenList) + + src := SealingSource{ + info: &info, + created: time.Now(), + sortedLIDs: sortedLIDs, + oldToNewLIDs: makeInverser(sortedLIDs), // Create LID mapping + mids: active.MIDs, + rids: active.RIDs, + fields: sortedFields, + tids: sortedTIDs, + fieldsMaxTIDs: fieldsMaxTIDs, + tokens: active.TokenList.tidToVal, + lids: active.TokenList.tidToLIDs, + docPosOrig: active.DocsPositions.idToPos, + blocksOffsets: active.DocBlocks.vals, + docsReader: &active.sortReader, + } + + src.prepareInfo() + + // Sort documents if not skipped in configuration + if !active.Config.SkipSortDocs { + ds := NewDocsSource(&src, src.blocksOffsets, &active.sortReader) + blocksOffsets, positions, onDiskSize, err := sealing.SortDocs(info.Path, params, ds) + if err != nil { + return nil, err + } + src.docPosSorted = positions[1:] + src.blocksOffsets = blocksOffsets + src.info.DocsOnDisk = uint64(onDiskSize) + } + + return &src, nil +} + +// sortFields sorts field names and calculates maximum TIDs for each field. +// Returns sorted field list and array of maximum TIDs. +func sortFields(tl *tokenList) ([]string, []uint32) { + fields := make([]string, 0, len(tl.FieldTIDs)) + for field := range tl.FieldTIDs { + fields = append(fields, field) + } + slices.Sort(fields) + + pos := 0 + maxTIDs := make([]uint32, 0, len(fields)) + for _, field := range fields { + pos += len(tl.FieldTIDs[field]) + maxTIDs = append(maxTIDs, uint32(pos)) + } + + return fields, maxTIDs +} + +// sortTokens sorts tokens lexicographically within each field. +// Returns sorted list of TIDs. 
+func sortTokens(sortedFields []string, tl *tokenList) []uint32 { + pos := 0 + tids := make([]uint32, 0, len(tl.tidToVal)) + for _, field := range sortedFields { + tids = append(tids, tl.FieldTIDs[field]...) + chunk := tids[pos:] + slices.SortFunc(chunk, func(i, j uint32) int { + a := tl.tidToVal[i] + b := tl.tidToVal[j] + return bytes.Compare(a, b) // Sort by token value + }) + pos = len(tids) + } + return tids +} + +// LastError returns the last error that occurred during processing. +func (src *SealingSource) LastError() error { + return src.lastErr +} + +// prepareInfo prepares metadata for disk writing. +func (src *SealingSource) prepareInfo() { + src.info.MetaOnDisk = 0 + src.info.SealingTime = uint64(src.created.UnixMilli()) + src.info.BuildDistribution(src.mids.vals) +} + +// Info returns index metadata information. +func (src *SealingSource) Info() *frac.Info { + return src.info +} + +// TokenBlocks returns an iterator for token blocks for disk writing. +// Tokens are pre-sorted: first by fields, then lexicographically within each field. +// Each block contains up to blockSize bytes of data for efficient writing. +func (src *SealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { + const tokenLengthSize = int(unsafe.Sizeof(uint32(0))) + return func(yield func([][]byte) bool) { + if len(src.tids) == 0 { + return + } + if blockSize <= 0 { + src.lastErr = errors.New("sealing: token block size must be > 0") + return + } + + actualSize := 0 + block := make([][]byte, 0, blockSize) + + // Iterate through all sorted TIDs + for _, tid := range src.tids { + if actualSize >= blockSize { + if !yield(block) { + return + } + actualSize = 0 + block = block[:0] + } + token := src.tokens[tid] + actualSize += tokenLengthSize // Add the size of the token length field + actualSize += len(token) // Add the size of the token itself + block = append(block, token) + } + yield(block) + } +} + +// Fields returns an iterator for sorted fields and their maximum TIDs. 
+// Fields are sorted lexicographically, ensuring predictable order +// when building disk index structures. +func (src *SealingSource) Fields() iter.Seq2[string, uint32] { + return func(yield func(string, uint32) bool) { + for i, field := range src.fields { + if !yield(field, src.fieldsMaxTIDs[i]) { + return + } + } + } +} + +// IDsBlocks returns an iterator for document ID blocks and corresponding positions. +// IDs are sorted. Block size is controlled by blockSize parameter for balance between +// performance and memory usage. +func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { + return func(yield func([]seq.ID, []seq.DocPos) bool) { + mids := src.mids.vals + rids := src.rids.vals + + ids := make([]seq.ID, 0, blockSize) + pos := make([]seq.DocPos, 0, blockSize) + + // First reserved ID (system). This position is not used because Local IDs (LIDs) use 1-based indexing. + ids = append(ids, seq.ID{MID: seq.MID(mids[0]), RID: seq.RID(rids[0])}) + pos = append(pos, 0) + + // Iterate through sorted LIDs + for i, lid := range src.sortedLIDs { + if len(ids) == blockSize { + if !yield(ids, pos) { + return + } + ids = ids[:0] + pos = pos[:0] + } + id := seq.ID{MID: seq.MID(mids[lid]), RID: seq.RID(rids[lid])} + ids = append(ids, id) + + // Use sorted or original positions + if len(src.docPosSorted) == 0 { + pos = append(pos, src.docPosOrig[id]) + } else { + pos = append(pos, src.docPosSorted[i]) // +1 for system document + } + } + yield(ids, pos) + } +} + +// BlocksOffsets returns document block offsets. +func (src *SealingSource) BlocksOffsets() []uint64 { + return src.blocksOffsets +} + +// TokenLIDs returns an iterator for LID lists for each token. +// LIDs are converted to new numbering after document sorting. +// Each iterator call returns a list of documents containing a specific token, +// in sorted order. 
+func (src *SealingSource) TokenLIDs() iter.Seq[[]uint32] { + return func(yield func([]uint32) bool) { + newLIDs := []uint32{} + + // For each sorted TID + for _, tid := range src.tids { + // Get original LIDs for this token + oldLIDs := src.lids[tid].GetLIDs(src.mids, src.rids) + newLIDs = slices.Grow(newLIDs[:0], len(oldLIDs)) + + // Convert old LIDs to new through mapping + for _, lid := range oldLIDs { + newLIDs = append(newLIDs, src.oldToNewLIDs[lid]) + } + + if !yield(newLIDs) { + return + } + } + } +} + +// makeInverser creates an array for converting old LIDs to new ones. +// sortedLIDs[i] = oldLID -> inverser[oldLID] = i+1 +func makeInverser(sortedLIDs []uint32) []uint32 { + inverser := make([]uint32, len(sortedLIDs)+1) + for i, lid := range sortedLIDs { + inverser[lid] = uint32(i + 1) // +1 because 0 position is reserved and unused + } + return inverser +} + +// Docs returns an iterator for documents with their IDs. +// Handles duplicate IDs (for nested indexes). +func (src *SealingSource) Docs() iter.Seq2[seq.ID, []byte] { + src.lastErr = nil + return func(yield func(seq.ID, []byte) bool) { + var ( + prev seq.ID + curDoc []byte + ) + + // Iterate through ID and position blocks + for ids, pos := range src.IDsBlocks(consts.IDsPerBlock) { + for i, id := range ids { + if id == systemSeqID { + curDoc = nil // reserved system document (no payload) + } else if id != prev { + // If ID changed, read new document + if curDoc, src.lastErr = src.doc(pos[i]); src.lastErr != nil { + return + } + } + prev = id + if !yield(id, curDoc) { + return + } + } + } + } +} + +// doc reads a document from storage by its position. 
+func (src *SealingSource) doc(pos seq.DocPos) ([]byte, error) { + blockIndex, docOffset := pos.Unpack() + blockOffset := src.blocksOffsets[blockIndex] + + var doc []byte + err := src.docsReader.ReadDocsFunc(blockOffset, []uint64{docOffset}, func(b []byte) error { + doc = b + return nil + }) + if err != nil { + return nil, err + } + return doc, nil +} diff --git a/frac/active/token_lids.go b/frac/active_old/token_lids.go similarity index 99% rename from frac/active/token_lids.go rename to frac/active_old/token_lids.go index a4f8e851..1d1dafb6 100644 --- a/frac/active/token_lids.go +++ b/frac/active_old/token_lids.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "math" diff --git a/frac/active/token_list.go b/frac/active_old/token_list.go similarity index 99% rename from frac/active/token_list.go rename to frac/active_old/token_list.go index 122c88d7..94a66031 100644 --- a/frac/active/token_list.go +++ b/frac/active_old/token_list.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "context" diff --git a/frac/active/writer.go b/frac/active_old/writer.go similarity index 97% rename from frac/active/writer.go rename to frac/active_old/writer.go index 6528b92d..96dfabe2 100644 --- a/frac/active/writer.go +++ b/frac/active_old/writer.go @@ -1,4 +1,4 @@ -package active +package active_old import ( "os" diff --git a/frac/tests/fraction_test.go b/frac/tests/fraction_test.go index 82d6a6cb..2dac417c 100644 --- a/frac/tests/fraction_test.go +++ b/frac/tests/fraction_test.go @@ -21,7 +21,8 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/active" - "github.com/ozontech/seq-db/frac/active2" + active1 "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" @@ -43,7 +44,7 @@ type FractionTestSuite struct { config *frac.Config 
mapping seq.Mapping tokenizers map[seq.TokenizerType]tokenizer.Tokenizer - activeIndexer *active.Indexer + activeIndexer *active_old.Indexer stopIndexer func() sealParams frac.SealParams @@ -53,7 +54,7 @@ type FractionTestSuite struct { } func (s *FractionTestSuite) SetupSuiteCommon() { - s.activeIndexer, s.stopIndexer = active.NewIndexer(4, 10) + s.activeIndexer, s.stopIndexer = active_old.NewIndexer(4, 10) } func (s *FractionTestSuite) TearDownSuiteCommon() { @@ -1053,14 +1054,14 @@ func (s *FractionTestSuite) TestFractionInfo() { s.Require().Equal(seq.MID(946731654000), info.To, "to doesn't match") switch s.fraction.(type) { - case *active.Active: + case *active_old.Active: // it varies depending on params and docs shuffled s.Require().True(info.DocsOnDisk > uint64(450) && info.DocsOnDisk < uint64(500), "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) s.Require().True(info.MetaOnDisk >= uint64(450) && info.MetaOnDisk <= uint64(550), "meta on disk doesn't match. actual value: %d", info.MetaOnDisk) s.Require().Equal(uint64(0), info.IndexOnDisk, "index on disk doesn't match") - case *active2.Active2: + case *active.Active: // it varies depending on params and docs shuffled s.Require().True(info.DocsOnDisk > uint64(450) && info.DocsOnDisk < uint64(500), "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) @@ -1270,13 +1271,13 @@ func (s *FractionTestSuite) AssertHist( func (s *FractionTestSuite) newActive(bulks ...[]string) *active.Active { baseName := filepath.Join(s.tmpDir, "test_fraction") - a := active.New( + a := active1.New( baseName, - s.activeIndexer, + s.config, + 4, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - s.config, ) s.AppendBulks(a, bulks...) 
@@ -1419,11 +1420,12 @@ func (s *ActiveReplayedFractionTestSuite) Replay(f *active.Active) frac.Fraction fracFileName := f.BaseFileName replayedFrac := active.New( fracFileName, - s.activeIndexer, + s.config, + 4, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), - cache.NewCache[[]byte](nil, nil), - &frac.Config{}) + cache.NewCache[[]byte](nil, nil)) + err := replayedFrac.Replay(context.Background()) s.Require().NoError(err, "replay failed") return replayedFrac @@ -1663,15 +1665,16 @@ func (s *Active2FractionTestSuite) SetupTest() { } } -func (s *Active2FractionTestSuite) newActive2(bulks ...[]string) *active2.Active2 { +func (s *Active2FractionTestSuite) newActive2(bulks ...[]string) *active_old.Active { + baseName := filepath.Join(s.tmpDir, "test_fraction") - a := active2.New( + a := active_old.New( baseName, - s.config, - 4, + s.activeIndexer, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), + s.config, ) s.AppendBulks(a, bulks...) @@ -1680,7 +1683,7 @@ func (s *Active2FractionTestSuite) newActive2(bulks ...[]string) *active2.Active } func (s *Active2FractionTestSuite) TearDownTest() { - if f, ok := s.fraction.(*active2.Active2); ok { + if f, ok := s.fraction.(*active1.Active); ok { f.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Active type") @@ -1728,7 +1731,7 @@ func (s *Sealed2FractionTestSuite) TearDownSuite() { func (s *Sealed2FractionTestSuite) newSealed2(bulks ...[]string) *sealed.Sealed { a := s.newActive2(bulks...) 
- activeSealingSource, err := active2.NewSealingSource(a, s.sealParams) + activeSealingSource, err := active_old.NewSealingSource(a, s.sealParams) s.Require().NoError(err, "Sealing source creation failed") preloaded, err := sealing.Seal(activeSealingSource, s.sealParams) diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index ad5163f7..c7c44764 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -12,6 +12,7 @@ import ( "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" @@ -40,7 +41,7 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client) (*FracManager, func FillConfigWithDefault(cfg) readLimiter := storage.NewReadLimiter(config.ReaderWorkers, storeBytesRead) - idx, stopIdx := active.NewIndexer(config.IndexWorkers, config.IndexWorkers) + idx, stopIdx := active_old.NewIndexer(config.IndexWorkers, config.IndexWorkers) cache := NewCacheMaintainer(cfg.CacheSize, cfg.SortCacheSize, newDefaultCacheMetrics()) provider := newFractionProvider(cfg, s3cli, cache, readLimiter, idx) infoCache := NewFracInfoCache(filepath.Join(cfg.DataDir, consts.FracCacheFileSuffix)) diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index 5eaf13ec..178711e0 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -12,7 +12,8 @@ import ( "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/active" - "github.com/ozontech/seq-db/frac/active2" + active1 "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/storage" @@ -27,14 +28,14 @@ type fractionProvider struct { s3cli 
*s3.Client // Client for S3 storage operations config *Config // Fraction manager configuration cacheProvider *CacheMaintainer // Cache provider for data access optimization - activeIndexer *active.Indexer // Indexer for active fractions + activeIndexer *active_old.Indexer // Indexer for active fractions readLimiter *storage.ReadLimiter // Read rate limiter ulidEntropy io.Reader // Entropy source for ULID generation } func newFractionProvider( cfg *Config, s3cli *s3.Client, cp *CacheMaintainer, - readLimiter *storage.ReadLimiter, indexer *active.Indexer, + readLimiter *storage.ReadLimiter, indexer *active_old.Indexer, ) *fractionProvider { return &fractionProvider{ s3cli: s3cli, @@ -49,22 +50,22 @@ func newFractionProvider( func (fp *fractionProvider) NewActive(name string) *active.Active { return active.New( name, - fp.activeIndexer, + &fp.config.Fraction, + config.NumCPU, fp.readLimiter, fp.cacheProvider.CreateDocBlockCache(), fp.cacheProvider.CreateSortDocsCache(), - &fp.config.Fraction, ) } -func (fp *fractionProvider) NewActive2(name string) *active2.Active2 { - return active2.New( +func (fp *fractionProvider) NewActiveOld(name string) *active_old.Active { + return active_old.New( name, - &fp.config.Fraction, - config.NumCPU, + fp.activeIndexer, fp.readLimiter, fp.cacheProvider.CreateDocBlockCache(), fp.cacheProvider.CreateSortDocsCache(), + &fp.config.Fraction, ) } @@ -120,10 +121,10 @@ func (fp *fractionProvider) CreateActive() *active.Active { // CreateActive creates a new active fraction with auto-generated filename // Filename pattern: base_pattern + ULID -func (fp *fractionProvider) CreateActive2() *active2.Active2 { +func (fp *fractionProvider) CreateActive2() *active1.Active { filePath := fileBasePattern + fp.nextFractionID() baseFilePath := filepath.Join(fp.config.DataDir, filePath) - return fp.NewActive2(baseFilePath) + return fp.NewActive(baseFilePath) } // Seal converts an active fraction to a sealed one diff --git 
a/fracmanager/fraction_provider_test.go b/fracmanager/fraction_provider_test.go index 893f7e84..9e175178 100644 --- a/fracmanager/fraction_provider_test.go +++ b/fracmanager/fraction_provider_test.go @@ -13,7 +13,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" ) @@ -36,7 +36,7 @@ func setupFractionProvider(t testing.TB, cfg *Config) (*fractionProvider, func() cfg = setupDataDir(t, cfg) rl := storage.NewReadLimiter(1, nil) s3cli, stopS3 := setupS3Client(t) - idx, stopIdx := active.NewIndexer(1, 1) + idx, stopIdx := active_old.NewIndexer(1, 1) cache := NewCacheMaintainer(uint64(units.MB), uint64(units.MB), nil) provider := newFractionProvider(cfg, s3cli, cache, rl, idx) return provider, func() { diff --git a/fracmanager/sealer_test.go b/fracmanager/sealer_test.go index 7689b989..ca369959 100644 --- a/fracmanager/sealer_test.go +++ b/fracmanager/sealer_test.go @@ -17,7 +17,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/active2" + "github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" @@ -35,7 +35,7 @@ func TestMain(m *testing.M) { m.Run() } -func fillActiveFraction(active *active2.Active2) error { +func fillActiveFraction(active *active.Active) error { const muliplier = 10 file, err := os.Open(filepath.Join(testscommon.TestDataDir, "k8s.logs")) @@ -106,8 +106,8 @@ func runSealingBench(b *testing.B, cfg *frac.Config) { err := fillActiveFraction(a) assert.NoError(b, err) - seal := func(a *active2.Active2, params frac.SealParams) (*sealed.PreloadedData, error) { - src, err := active2.NewSealingSource(a, params) + seal := func(a *active.Active, params frac.SealParams) 
(*sealed.PreloadedData, error) { + src, err := active.NewSealingSource(a, params) assert.NoError(b, err) return sealing.Seal(src, params) } From f7f33055302c7db58af1fadbc3128f5e51a2d2f2 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Fri, 26 Dec 2025 16:27:31 +0300 Subject: [PATCH 14/28] add deduplication, fix mid.time overflow, replace aold active with new one --- frac/active/{active2.go => active.go} | 0 frac/active/indexer.go | 25 +++++++++----- frac/active/mem_index.go | 1 + frac/active/mem_index_pool.go | 25 ++++++++++++-- frac/active/merge.go | 43 ++++++++++++++++++----- frac/active/resources.go | 2 +- frac/active/sealing_source.go | 24 +++++++------ frac/sealed/sealing/sealer.go | 9 +++-- frac/sealed/token/table.go | 20 +++++++++++ frac/sealed/token/table_entry.go | 9 +++++ frac/tests/fraction_test.go | 49 ++++++++++++++------------- fracmanager/loader_test.go | 28 ++++++++------- seq/seq.go | 16 ++++++++- 13 files changed, 181 insertions(+), 70 deletions(-) rename frac/active/{active2.go => active.go} (100%) diff --git a/frac/active/active2.go b/frac/active/active.go similarity index 100% rename from frac/active/active2.go rename to frac/active/active.go diff --git a/frac/active/indexer.go b/frac/active/indexer.go index 8e86e17f..a058402b 100644 --- a/frac/active/indexer.go +++ b/frac/active/indexer.go @@ -4,6 +4,7 @@ import ( "bytes" "cmp" "encoding/binary" + "hash/fnv" "slices" "unsafe" @@ -72,7 +73,7 @@ func NewMemIndex(block storage.DocBlock) (*memIndex, error) { idx := newMemIndex() idx.docsCount = uint32(len(meta)) idx.ids = idx.res.GetIDs(len(meta)) - idx.positions = idx.res.GetDocPos(len(meta)) + idx.positions = idx.res.GetDocPosSlice(len(meta)) idx.blocksOffsets = idx.res.GetUint64s(1) // Only one block per bulk idx.blocksOffsets[0] = block.GetExt2() @@ -117,7 +118,7 @@ func extractTokens( // Calculate document positions in the original block // Each document is stored as: [size: uint32][data: size bytes] - positions := 
res.GetDocPos(len(meta)) + positions := res.GetDocPosSlice(len(meta)) prev := seq.PackDocPos(0, docOffset) for i := range meta { @@ -140,13 +141,18 @@ func extractTokens( return seq.Compare(meta[b].ID, meta[a].ID) }) + hash := fnv.New64a() + var idBinary [16]byte + // Fill index structures with sorted documents - for lid, origIdx := range order { + for i, origIdx := range order { docMeta := meta[origIdx] - idx.ids[lid] = docMeta.ID - idx.positions[lid] = positions[origIdx] + idx.ids[i] = docMeta.ID + idx.positions[i] = positions[origIdx] idx.docsSize += uint64(docMeta.Size) + hash.Write(docMeta.ID.AppendBinary(idBinary[:0])) } + idx.hash = hash.Sum64() // Extract and process tokens from all documents var err error @@ -156,8 +162,10 @@ func extractTokens( lids := res.GetUint32s(int(totalTokens))[:0] // Local document ID for each token occurrence tids := res.GetUint32s(int(totalTokens))[:0] // Token ID for each occurrence + buf.tokenMap[tokenStr{field: seq.TokenAll}] = 0 // reserve ALL token (just for proper sealing) + // Process documents in ID-sorted order - for lid, origIdx := range order { + for i, origIdx := range order { docMeta := meta[origIdx] // Decode tokens for this document @@ -165,9 +173,8 @@ func extractTokens( return nil, nil, nil, err } - buf.tokenMap[tokenStr{field: seq.TokenAll}] = 0 // reserve ALL token (just for proper sealing) - // Process each token in the document + lid := uint32(i + 1) for _, t := range buf.tokens { if bytes.Equal(t.Key, seq.AllTokenName) { continue @@ -179,7 +186,7 @@ func extractTokens( buf.tokenMap[token] = tid } tids = append(tids, tid) - lids = append(lids, uint32(lid)+1) // store lid+1 (1-based indexing for internal use) + lids = append(lids, lid) // store lid+1 (1-based indexing for internal use) } } diff --git a/frac/active/mem_index.go b/frac/active/mem_index.go index 74eadd4e..1a5a46b1 100644 --- a/frac/active/mem_index.go +++ b/frac/active/mem_index.go @@ -30,6 +30,7 @@ type memIndex struct { blocksOffsets []uint64 
// offsets of document blocks in storage, sorted in ascending order positions []seq.DocPos // position of each document inside a block; index corresponds to LID-1 + hash uint64 docsSize uint64 // total size of documents in bytes docsCount uint32 // number of documents in the index allTokenLIDsCount int // total number of tokenLIDs (for fast calc allocation size in merging) diff --git a/frac/active/mem_index_pool.go b/frac/active/mem_index_pool.go index 1da469f1..c5ee276e 100644 --- a/frac/active/mem_index_pool.go +++ b/frac/active/mem_index_pool.go @@ -6,6 +6,7 @@ import ( "sync/atomic" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/logger" ) // indexEntry is an internal structure that describes a memIndex @@ -21,8 +22,9 @@ type indexEntry struct { // - tracks indexes currently participating in merge // - provides consistent snapshots for readers type memIndexPool struct { - mu sync.RWMutex // protects all fields below - info *frac.Info // aggregated information for all indexes + mu sync.RWMutex // protects all fields below + info *frac.Info // aggregated information for all indexes + hashes map[uint64]struct{} ready map[uint64]indexEntry // indexes ready to be merged merging map[uint64]indexEntry // indexes currently being merged @@ -38,6 +40,7 @@ type memIndexPool struct { func NewIndexPool(info *frac.Info) *memIndexPool { return &memIndexPool{ info: info, + hashes: make(map[uint64]struct{}, 1000), ready: make(map[uint64]indexEntry), merging: make(map[uint64]indexEntry), } @@ -97,6 +100,14 @@ func (p *memIndexPool) Add(idx *memIndex, docsLen, metaLen uint64) { p.mu.Lock() defer p.mu.Unlock() + if idx.hash > 0 { + if _, ok := p.hashes[idx.hash]; ok { + logger.Warn("a duplicate index (bulk) has been detected") + return + } + p.hashes[idx.hash] = struct{}{} + } + if p.info.From > minMID { p.info.From = minMID } @@ -150,11 +161,21 @@ func (p *memIndexPool) replace(old []indexEntry, merged *memIndex) { p.mu.Lock() defer p.mu.Unlock() + var 
docsCountToRemove uint32 for _, entry := range old { + docsCountToRemove += entry.index.docsCount delete(p.merging, entry.id) } p.ready[newEntry.id] = newEntry + // update info: the number of documents to be deleted may be greater + // than the number to be added due to deduplication + if docsCountToRemove > p.info.DocsTotal { + panic("inconsistent state of index pool") + } + p.info.DocsTotal -= uint32(docsCountToRemove) + p.info.DocsTotal += newEntry.index.docsCount + p.rebuildReadable() } diff --git a/frac/active/merge.go b/frac/active/merge.go index f16fbac8..8f9e0664 100644 --- a/frac/active/merge.go +++ b/frac/active/merge.go @@ -2,10 +2,13 @@ package active import ( "bytes" + "cmp" "slices" + "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/util" + "go.uber.org/zap" ) // mergeIndexes merges several in-memory indexes (memIndex) @@ -28,7 +31,7 @@ func mergeIndexes(indexes []*memIndex) *memIndex { // Preallocate memory for final structures dst.ids = dst.res.GetIDs(int(dst.docsCount))[:0] - dst.positions = dst.res.GetDocPos(int(dst.docsCount))[:0] + dst.positions = dst.res.GetDocPosSlice(int(dst.docsCount))[:0] dst.blocksOffsets = dst.res.GetUint64s(blocksCount)[:0] // 1. 
Merge block offsets and recalculate document positions @@ -74,13 +77,28 @@ func mergeIDs( mergedDocStream := MergeSortedStreams( docStreams, func(a, b DocRef) int { - return seq.Compare(b.id, a.id) + r := seq.Compare(b.id, a.id) + if r == 0 { + r = cmp.Compare(a.i, b.i) + } + return r }, ) + var ( + doubles int + prevRef DocRef + ) + // Iterate over the merged stream - docRef, has := mergedDocStream.Next() - for has { + for docRef, has := mergedDocStream.Next(); has; docRef, has = mergedDocStream.Next() { + if docRef.id == prevRef.id && docRef.i != prevRef.i { + doubles++ + lidsMap[docRef.i] = append(lidsMap[docRef.i], 0) // add zero LID for consistent mapping + continue + } + prevRef = docRef + // Add document to the resulting index dst.ids = append(dst.ids, docRef.id) dst.positions = append(dst.positions, docRef.pos) @@ -90,8 +108,11 @@ func mergeIDs( // Record oldLID → newLID mapping lidsMap[docRef.i] = append(lidsMap[docRef.i], lid) + } - docRef, has = mergedDocStream.Next() + if doubles > 0 { + dst.docsCount -= uint32(doubles) + logger.Warn("doubles in index", zap.Int("count", doubles)) } return lidsMap @@ -138,7 +159,7 @@ func mergeTokens( // borders[i] indicates: const ( - borderSame = 0b00 // tokensRef[i] is the same token as in tokensRef[i-1] (but other index) + borderNone = 0b00 // tokensRef[i] is the same token as in tokensRef[i-1] (but other index) borderToken = 0b01 // tokensRef[i] is new token borderField = 0b10 // tokensRef[i] is new token and new field ) @@ -148,7 +169,7 @@ func mergeTokens( // First pass: count unique tokens and fields for tokenRef, has := mergedTokenStream.Next(); has; tokenRef, has = mergedTokenStream.Next() { - var border uint8 = borderSame + var border uint8 = borderNone // New token if prevToken.payload == nil || cmpToken(prevToken, tokenRef) != 0 { @@ -280,6 +301,7 @@ func (s *LIDsCollector) GetSorted() (dst []uint32) { for _, v := range s.tmp { s.buf[v] = 1 } + s.buf[0] = 0 // avoiding a zero LID caused by duplicates start 
:= len(s.lids) for lid, ok := range s.buf { if ok == 1 { @@ -295,6 +317,11 @@ func (s *LIDsCollector) GetSorted() (dst []uint32) { if n > 1 { slices.Sort(s.tmp) } + i := 0 + for i < len(s.tmp) && s.tmp[i] == 0 { // skipping zero LIDs caused by duplicates + i++ + } + s.tmp = s.tmp[i:] start := len(s.lids) s.lids = append(s.lids, s.tmp...) s.tmp = s.tmp[:0] @@ -317,7 +344,7 @@ func mergeBlocksOffsets( dst.blocksOffsets = append(dst.blocksOffsets, index.blocksOffsets...) // Recalculate document positions - positions[i] = res.GetDocPos(len(index.positions))[:0] + positions[i] = res.GetDocPosSlice(len(index.positions))[:0] for _, p := range index.positions { oldIdx, docOffset := p.Unpack() positions[i] = append( diff --git a/frac/active/resources.go b/frac/active/resources.go index 0d7235f5..85966083 100644 --- a/frac/active/resources.go +++ b/frac/active/resources.go @@ -75,7 +75,7 @@ func (r *Resources) GetIDs(s int) []seq.ID { return r.ids.GetSlice(s) } -func (r *Resources) GetDocPos(s int) []seq.DocPos { +func (r *Resources) GetDocPosSlice(s int) []seq.DocPos { return r.docPos.GetSlice(s) } diff --git a/frac/active/sealing_source.go b/frac/active/sealing_source.go index ce6c5e76..d1f553d7 100644 --- a/frac/active/sealing_source.go +++ b/frac/active/sealing_source.go @@ -25,9 +25,11 @@ var ( // SealingSource provides data from a single memIndex in the format required by the sealing stage. type SealingSource struct { - info *frac.Info - index *memIndex - lastErr error + info *frac.Info + index *memIndex + blocksOffsets []uint64 + positions []seq.DocPos + lastErr error } // NewSealingSource prepares a sealing source from Active2 state. 
@@ -41,9 +43,12 @@ func NewSealingSource(a *Active, params frac.SealParams) (sealing.Source, error) logger.Panic("invalid state: sealing requires a single memIndex") } + index := iss.indexes[0] ss := &SealingSource{ - info: iss.info, - index: iss.indexes[0], + info: iss.info, + index: index, + positions: index.positions, + blocksOffsets: index.blocksOffsets, } // Sort documents unless explicitly disabled @@ -59,9 +64,8 @@ func NewSealingSource(a *Active, params frac.SealParams) (sealing.Source, error) return nil, err } - // Skip system document position - ss.index.positions = positions[1:] - ss.index.blocksOffsets = blocksOffsets + ss.positions = positions[1:] // skip system document position + ss.blocksOffsets = blocksOffsets ss.info.DocsOnDisk = uint64(onDiskSize) } @@ -98,7 +102,7 @@ func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.Doc } ids = append(ids, id) - pos = append(pos, src.index.positions[i]) + pos = append(pos, src.positions[i]) } yield(ids, pos) @@ -163,7 +167,7 @@ func (src *SealingSource) TokenLIDs() iter.Seq[[]uint32] { // BlocksOffsets returns document block offsets. func (src *SealingSource) BlocksOffsets() []uint64 { - return src.index.blocksOffsets + return src.blocksOffsets } // LastError returns the last recorded source error. 
diff --git a/frac/sealed/sealing/sealer.go b/frac/sealed/sealing/sealer.go index b7444ab2..259955e4 100644 --- a/frac/sealed/sealing/sealer.go +++ b/frac/sealed/sealing/sealer.go @@ -5,6 +5,7 @@ import ( "iter" "os" "path/filepath" + "slices" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" @@ -84,15 +85,19 @@ func Seal(src Source, params frac.SealParams) (*sealed.PreloadedData, error) { // Ensure directory metadata is synced to disk util.MustSyncPath(filepath.Dir(info.Path)) + // copy this because it uses active fraction structures under the hood that must be released + tokenTable := indexSealer.TokenTable().Clone() + blocksOffsets := slices.Clone(src.BlocksOffsets()) + // Build preloaded data structure for fast query access lidsTable := indexSealer.LIDsTable() preloaded := sealed.PreloadedData{ Info: info, - TokenTable: indexSealer.TokenTable(), + TokenTable: tokenTable, BlocksData: sealed.BlocksData{ IDsTable: indexSealer.IDsTable(), LIDsTable: &lidsTable, - BlocksOffsets: src.BlocksOffsets(), + BlocksOffsets: blocksOffsets, }, } diff --git a/frac/sealed/token/table.go b/frac/sealed/token/table.go index 92c6102d..adf46b7d 100644 --- a/frac/sealed/token/table.go +++ b/frac/sealed/token/table.go @@ -2,6 +2,7 @@ package token import ( "sort" + "strings" "unsafe" "go.uber.org/zap" @@ -99,3 +100,22 @@ func (t Table) Size() int { } return size } + +func (t Table) Clone() Table { + res := make(Table, len(t)) + for k, v := range t { + res[strings.Clone(k)] = v.Clone() + } + return res +} + +func (fd *FieldData) Clone() *FieldData { + res := FieldData{ + MinVal: strings.Clone(fd.MinVal), + Entries: make([]*TableEntry, 0, len(fd.Entries)), + } + for _, e := range fd.Entries { + res.Entries = append(res.Entries, e.Clone()) + } + return &res +} diff --git a/frac/sealed/token/table_entry.go b/frac/sealed/token/table_entry.go index a16b9a55..5d77d9e4 100644 --- a/frac/sealed/token/table_entry.go +++ b/frac/sealed/token/table_entry.go @@ -1,5 +1,7 @@ 
package token +import "strings" + // TableEntry is part of token.Table and points to a fragment of token.Block type TableEntry struct { StartIndex uint32 // offset from the beginning of the block to the first token pointed to by the TableEntry @@ -30,3 +32,10 @@ func (t *TableEntry) checkTIDInBlock(tid uint32) bool { return true } + +func (t *TableEntry) Clone() *TableEntry { + res := *t + res.MaxVal = strings.Clone(t.MaxVal) + res.MinVal = strings.Clone(t.MinVal) + return &res +} diff --git a/frac/tests/fraction_test.go b/frac/tests/fraction_test.go index 2dac417c..4b6d06d2 100644 --- a/frac/tests/fraction_test.go +++ b/frac/tests/fraction_test.go @@ -21,7 +21,6 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/active" - active1 "github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/frac/sealed" @@ -62,7 +61,9 @@ func (s *FractionTestSuite) TearDownSuiteCommon() { } func (s *FractionTestSuite) SetupTestCommon() { - s.config = &frac.Config{} + s.config = &frac.Config{ + // SkipSortDocs: true, + } s.tokenizers = map[seq.TokenizerType]tokenizer.Tokenizer{ seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(20, false, true), seq.TokenizerTypeText: tokenizer.NewTextTokenizer(20, false, true, 100), @@ -1271,7 +1272,7 @@ func (s *FractionTestSuite) AssertHist( func (s *FractionTestSuite) newActive(bulks ...[]string) *active.Active { baseName := filepath.Join(s.tmpDir, "test_fraction") - a := active1.New( + a := active.New( baseName, s.config, 4, @@ -1638,34 +1639,34 @@ func TestRemoteFractionTestSuite(t *testing.T) { suite.Run(t, new(RemoteFractionTestSuite)) } -func TestActive2FractionTestSuite(t *testing.T) { - suite.Run(t, new(Active2FractionTestSuite)) +func TestActiveOldFractionTestSuite(t *testing.T) { + suite.Run(t, new(ActiveOldFractionTestSuite)) } -func TestSealed2FractionTestSuite(t 
*testing.T) { - suite.Run(t, new(Sealed2FractionTestSuite)) +func TestSealedOldFractionTestSuite(t *testing.T) { + suite.Run(t, new(SealedOldFractionTestSuite)) } -type Active2FractionTestSuite struct { +type ActiveOldFractionTestSuite struct { FractionTestSuite } -func (s *Active2FractionTestSuite) SetupSuite() { +func (s *ActiveOldFractionTestSuite) SetupSuite() { s.SetupSuiteCommon() } -func (s *Active2FractionTestSuite) SetupTest() { +func (s *ActiveOldFractionTestSuite) SetupTest() { s.SetupTestCommon() s.insertDocuments = func(bulks ...[]string) { if s.fraction != nil { s.Require().Fail("can insert docs only once") } - s.fraction = s.newActive2(bulks...) + s.fraction = s.newActiveOld(bulks...) } } -func (s *Active2FractionTestSuite) newActive2(bulks ...[]string) *active_old.Active { +func (s *ActiveOldFractionTestSuite) newActiveOld(bulks ...[]string) *active_old.Active { baseName := filepath.Join(s.tmpDir, "test_fraction") a := active_old.New( @@ -1682,8 +1683,8 @@ func (s *Active2FractionTestSuite) newActive2(bulks ...[]string) *active_old.Act return a } -func (s *Active2FractionTestSuite) TearDownTest() { - if f, ok := s.fraction.(*active1.Active); ok { +func (s *ActiveOldFractionTestSuite) TearDownTest() { + if f, ok := s.fraction.(*active_old.Active); ok { f.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Active type") @@ -1692,30 +1693,30 @@ func (s *Active2FractionTestSuite) TearDownTest() { s.TearDownTestCommon() } -func (s *Active2FractionTestSuite) TearDownSuite() { +func (s *ActiveOldFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -type Sealed2FractionTestSuite struct { - Active2FractionTestSuite +type SealedOldFractionTestSuite struct { + ActiveOldFractionTestSuite } -func (s *Sealed2FractionTestSuite) SetupSuite() { +func (s *SealedOldFractionTestSuite) SetupSuite() { s.SetupSuiteCommon() } -func (s *Sealed2FractionTestSuite) SetupTest() { +func (s *SealedOldFractionTestSuite) SetupTest() { s.SetupTestCommon() 
s.insertDocuments = func(docs ...[]string) { if s.fraction != nil { s.Require().Fail("can insert docs only once") } - s.fraction = s.newSealed2(docs...) + s.fraction = s.newSealedOld(docs...) } } -func (s *Sealed2FractionTestSuite) TearDownTest() { +func (s *SealedOldFractionTestSuite) TearDownTest() { if f, ok := s.fraction.(*sealed.Sealed); ok { f.Release() } else { @@ -1724,12 +1725,12 @@ func (s *Sealed2FractionTestSuite) TearDownTest() { s.TearDownTestCommon() } -func (s *Sealed2FractionTestSuite) TearDownSuite() { +func (s *SealedOldFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -func (s *Sealed2FractionTestSuite) newSealed2(bulks ...[]string) *sealed.Sealed { - a := s.newActive2(bulks...) +func (s *SealedOldFractionTestSuite) newSealedOld(bulks ...[]string) *sealed.Sealed { + a := s.newActiveOld(bulks...) activeSealingSource, err := active_old.NewSealingSource(a, s.sealParams) s.Require().NoError(err, "Sealing source creation failed") diff --git a/fracmanager/loader_test.go b/fracmanager/loader_test.go index 472e2282..43a1e967 100644 --- a/fracmanager/loader_test.go +++ b/fracmanager/loader_test.go @@ -112,28 +112,30 @@ func TestReplayMultiple(t *testing.T) { defer tearDown() // fill data - actives := make([]*active.Active, 0, fracCount) + actives := make([]*active.Active, 0, fracCount) // empty active fractions for replay for i := 0; i < fracCount; i++ { - active := fp.CreateActive() - appendDocsToActive(t, active, 500+rand.Intn(100)) - actives = append(actives, active) + a := fp.CreateActive() + appendDocsToActive(t, a, 500+rand.Intn(100)) + actives = append(actives, fp.NewActive(a.BaseFileName)) } - active := fp.CreateActive() - appendDocsToActive(t, active, 5) - actives = append(actives, active) + a := fp.CreateActive() + appendDocsToActive(t, a, 5) + actives = append(actives, fp.NewActive(a.BaseFileName)) // replay and seal - active, sealed, err := loader.replayAndSeal(t.Context(), actives) + a, s, err := 
loader.replayAndSeal(t.Context(), actives) assert.NoError(t, err) // checks - assert.Equal(t, len(actives), len(sealed)+1, "should replay same number of fractions") + assert.Equal(t, len(actives)-1, len(s), "should replay same number of fractions") for i := 0; i < fracCount; i++ { - assert.Equal(t, actives[i].Info().Name(), sealed[i].Info().Name(), "fraction %d should have the same name", i) - assert.Equal(t, actives[i].Info().DocsTotal, sealed[i].Info().DocsTotal, "fraction %d should have the same doc count", i) + assert.Equal(t, actives[i].Info().Name(), s[i].Info().Name(), "fraction %d should have the same name", i) + assert.Equal(t, actives[i].Info().DocsTotal, s[i].Info().DocsTotal, "fraction %d should have the same doc count", i) } - assert.Equal(t, actives[fracCount].Info().Name(), active.Info().Name(), "new active fraction should have the same name") - assert.Equal(t, uint32(5), active.Info().DocsTotal, "new active fraction should not be empty") + assert.Equal(t, actives[fracCount].Info().Name(), a.Info().Name(), "new active fraction should have the same name") + assert.Equal(t, uint32(5), a.Info().DocsTotal, + "new active fraction should have exact 5 docs but %d given", a.Info().DocsTotal, + ) } func TestReplaySingleEmpty(t *testing.T) { diff --git a/seq/seq.go b/seq/seq.go index 9a378196..797cf2c3 100644 --- a/seq/seq.go +++ b/seq/seq.go @@ -5,6 +5,8 @@ import ( "encoding/binary" "encoding/hex" "fmt" + "math" + "slices" "time" "github.com/ozontech/seq-db/util" @@ -20,7 +22,12 @@ type RID uint64 // random part of ID type LID uint32 // local id for a fraction func (m MID) Time() time.Time { - return time.UnixMilli(int64(m)) + if uint64(m) <= math.MaxInt64 { + return time.UnixMilli(int64(m)) + } else { + // since MaxInt64 is 292278994 year in milliseconds, so we assume this MID is "infinite future" + return time.UnixMilli(math.MaxInt64) + } } func (d ID) String() string { @@ -53,6 +60,13 @@ func (d ID) Bytes() []byte { return final } +func (d ID) 
AppendBinary(buf []byte) []byte { + buf = slices.Grow(buf, 16) + buf = binary.LittleEndian.AppendUint64(buf, uint64(d.MID)) + buf = binary.LittleEndian.AppendUint64(buf, uint64(d.RID)) + return buf +} + func LessOrEqual(a, b ID) bool { if a.MID == b.MID { return a.RID <= b.RID From 06da5343eaec1c4f007ee045b373ae58dd1a4857 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Fri, 26 Dec 2025 21:10:08 +0300 Subject: [PATCH 15/28] remake clone token table --- benchmarks/docker-compose-seqdb.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/docker-compose-seqdb.yml b/benchmarks/docker-compose-seqdb.yml index f0884ca3..93bd7e7d 100644 --- a/benchmarks/docker-compose-seqdb.yml +++ b/benchmarks/docker-compose-seqdb.yml @@ -7,7 +7,7 @@ services: limits: cpus: "4" memory: "8GB" - image: ghcr.io/ozontech/seq-db:v0.61.0 + image: 'gitlab-registry.ozon.ru/sre/images/seq-db:che@sha256:82d0dd34cb5d6db9e0450bc8d2cd1d9e29414ec2ba81dc8c4ae643dea6eb1bd0' ports: - '9002:9002' volumes: From c17dfd3104eda3be1533d79ea2b861cfe1e4b749 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Fri, 26 Dec 2025 21:16:59 +0300 Subject: [PATCH 16/28] remake clone token table 2 --- frac/sealed/sealing/blocks_builder.go | 6 +++++- frac/sealed/sealing/sealer.go | 2 +- frac/sealed/token/table.go | 20 -------------------- frac/sealed/token/table_entry.go | 9 --------- fracmanager/fraction_provider.go | 17 ++++++++++++++++- 5 files changed, 22 insertions(+), 32 deletions(-) diff --git a/frac/sealed/sealing/blocks_builder.go b/frac/sealed/sealing/blocks_builder.go index 14a5cac7..51fa9e2e 100644 --- a/frac/sealed/sealing/blocks_builder.go +++ b/frac/sealed/sealing/blocks_builder.go @@ -4,6 +4,7 @@ import ( "encoding/binary" "errors" "iter" + "strings" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" @@ -104,7 +105,10 @@ func (bb *blocksBuilder) BuildTokenBlocks( } // Entry covers TIDs from currentTID to min(fieldMaxTID, 
block.ext.maxTID) entry := createTokenTableEntry(currentTID, fieldMaxTID, idx, block) - table = append(table, token.FieldTable{Field: fieldName, Entries: []*token.TableEntry{entry}}) + table = append(table, token.FieldTable{ + Field: strings.Clone(fieldName), + Entries: []*token.TableEntry{entry}, + }) currentTID += entry.ValCount } diff --git a/frac/sealed/sealing/sealer.go b/frac/sealed/sealing/sealer.go index 259955e4..8e63c439 100644 --- a/frac/sealed/sealing/sealer.go +++ b/frac/sealed/sealing/sealer.go @@ -86,7 +86,7 @@ func Seal(src Source, params frac.SealParams) (*sealed.PreloadedData, error) { util.MustSyncPath(filepath.Dir(info.Path)) // copy this because it uses active fraction structures under the hood that must be released - tokenTable := indexSealer.TokenTable().Clone() + tokenTable := indexSealer.TokenTable() blocksOffsets := slices.Clone(src.BlocksOffsets()) // Build preloaded data structure for fast query access diff --git a/frac/sealed/token/table.go b/frac/sealed/token/table.go index adf46b7d..92c6102d 100644 --- a/frac/sealed/token/table.go +++ b/frac/sealed/token/table.go @@ -2,7 +2,6 @@ package token import ( "sort" - "strings" "unsafe" "go.uber.org/zap" @@ -100,22 +99,3 @@ func (t Table) Size() int { } return size } - -func (t Table) Clone() Table { - res := make(Table, len(t)) - for k, v := range t { - res[strings.Clone(k)] = v.Clone() - } - return res -} - -func (fd *FieldData) Clone() *FieldData { - res := FieldData{ - MinVal: strings.Clone(fd.MinVal), - Entries: make([]*TableEntry, 0, len(fd.Entries)), - } - for _, e := range fd.Entries { - res.Entries = append(res.Entries, e.Clone()) - } - return &res -} diff --git a/frac/sealed/token/table_entry.go b/frac/sealed/token/table_entry.go index 5d77d9e4..a16b9a55 100644 --- a/frac/sealed/token/table_entry.go +++ b/frac/sealed/token/table_entry.go @@ -1,7 +1,5 @@ package token -import "strings" - // TableEntry is part of token.Table and points to a fragment of token.Block type TableEntry 
struct { StartIndex uint32 // offset from the beginning of the block to the first token pointed to by the TableEntry @@ -32,10 +30,3 @@ func (t *TableEntry) checkTIDInBlock(tid uint32) bool { return true } - -func (t *TableEntry) Clone() *TableEntry { - res := *t - res.MaxVal = strings.Clone(t.MaxVal) - res.MinVal = strings.Clone(t.MinVal) - return &res -} diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index 178711e0..c68a1c8e 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -129,7 +129,7 @@ func (fp *fractionProvider) CreateActive2() *active1.Active { // Seal converts an active fraction to a sealed one // Process includes sorting, indexing, and data optimization for reading -func (fp *fractionProvider) Seal(a *active.Active) (*sealed.Sealed, error) { +func (fp *fractionProvider) Seal1(a *active.Active) (*sealed.Sealed, error) { src, err := active.NewSealingSource(a, fp.config.SealParams) if err != nil { return nil, err @@ -142,6 +142,21 @@ func (fp *fractionProvider) Seal(a *active.Active) (*sealed.Sealed, error) { return fp.NewSealedPreloaded(a.BaseFileName, preloaded), nil } +// Seal converts an active fraction to a sealed one +// Process includes sorting, indexing, and data optimization for reading +func (fp *fractionProvider) Seal(a *active.Active) (*sealed.Sealed, error) { + src, err := active.NewSealingSource(a, fp.config.SealParams) + if err != nil { + return nil, err + } + _, err = sealing.Seal(src, fp.config.SealParams) + if err != nil { + return nil, err + } + + return fp.NewSealed(a.BaseFileName, nil), nil +} + // Offload uploads fraction to S3 storage and returns a remote fraction // IMPORTANT: context controls timeouts and operation cancellation func (fp *fractionProvider) Offload(ctx context.Context, f *sealed.Sealed) (*sealed.Remote, error) { From 7860811fa4dd8b8fde2e2cb9346a4af5bffdedcc Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Fri, 26 Dec 2025 21:37:56 +0300 
Subject: [PATCH 17/28] disable alloc pools --- resources/slice_allocator.go | 2 ++ resources/slice_on_bytes.go | 2 ++ 2 files changed, 4 insertions(+) diff --git a/resources/slice_allocator.go b/resources/slice_allocator.go index 67485688..538261d2 100644 --- a/resources/slice_allocator.go +++ b/resources/slice_allocator.go @@ -29,6 +29,8 @@ func NewSlicesPool[T any](pool *SizedPool[T], releases *CallStack) SlicesPool[T] } func (a SlicesPool[T]) GetSlice(size int) []T { + return make([]T, size) + data := a.pool.Get(size) a.releases.Defer(func() { a.pool.Put(data) }) return data[:size] diff --git a/resources/slice_on_bytes.go b/resources/slice_on_bytes.go index 8965d46a..3c80fffe 100644 --- a/resources/slice_on_bytes.go +++ b/resources/slice_on_bytes.go @@ -23,6 +23,8 @@ func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { } func (a SliceOnBytes[T]) GetSlice(size int) []T { + return make([]T, size) + data, buf := a.getBuf(size) a.releases.Defer(func() { a.pool.Put(buf) }) return data From 210b19b1824de4d74ce58851e7769f434a4dfab7 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Fri, 26 Dec 2025 21:53:44 +0300 Subject: [PATCH 18/28] enable alloc pools --- resources/call_stack.go | 1 + resources/slice_allocator.go | 2 -- resources/slice_on_bytes.go | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/resources/call_stack.go b/resources/call_stack.go index 50df25d9..c60cf301 100644 --- a/resources/call_stack.go +++ b/resources/call_stack.go @@ -12,5 +12,6 @@ func (s *CallStack) CallAll() { for i := len(s.stack) - 1; i >= 0; i-- { s.stack[i]() } + clear(s.stack) s.stack = s.stack[:0] } diff --git a/resources/slice_allocator.go b/resources/slice_allocator.go index 538261d2..67485688 100644 --- a/resources/slice_allocator.go +++ b/resources/slice_allocator.go @@ -29,8 +29,6 @@ func NewSlicesPool[T any](pool *SizedPool[T], releases *CallStack) SlicesPool[T] } func (a SlicesPool[T]) GetSlice(size int) []T { - return make([]T, size) - data 
:= a.pool.Get(size) a.releases.Defer(func() { a.pool.Put(data) }) return data[:size] diff --git a/resources/slice_on_bytes.go b/resources/slice_on_bytes.go index 3c80fffe..8965d46a 100644 --- a/resources/slice_on_bytes.go +++ b/resources/slice_on_bytes.go @@ -23,8 +23,6 @@ func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { } func (a SliceOnBytes[T]) GetSlice(size int) []T { - return make([]T, size) - data, buf := a.getBuf(size) a.releases.Defer(func() { a.pool.Put(buf) }) return data From e62e18fce6e07901e865fe659fea549a8fc36a54 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Mon, 29 Dec 2025 13:45:17 +0300 Subject: [PATCH 19/28] res pool experiments --- frac/active/resources.go | 7 ++++--- resources/slice_on_bytes.go | 9 ++------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/frac/active/resources.go b/frac/active/resources.go index 85966083..ff894114 100644 --- a/frac/active/resources.go +++ b/frac/active/resources.go @@ -8,9 +8,10 @@ import ( ) var ( - tokenKeyPool = resources.NewSizedPool[tokenStr](24) - indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](24) - docPosSlicesPool = resources.NewSizedPool[[]seq.DocPos](24) + s = 8 + tokenKeyPool = resources.NewSizedPool[tokenStr](s) + indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](s) + docPosSlicesPool = resources.NewSizedPool[[]seq.DocPos](s) bufPool = resources.TypedPool[*indexerBuffer]{} resPool = resources.TypedPool[*Resources]{} ) diff --git a/resources/slice_on_bytes.go b/resources/slice_on_bytes.go index 8965d46a..b3b51bcd 100644 --- a/resources/slice_on_bytes.go +++ b/resources/slice_on_bytes.go @@ -23,16 +23,11 @@ func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { } func (a SliceOnBytes[T]) GetSlice(size int) []T { - data, buf := a.getBuf(size) - a.releases.Defer(func() { a.pool.Put(buf) }) - return data -} - -func (a SliceOnBytes[T]) getBuf(size int) ([]T, []byte) { var tmp T itemSize := int(unsafe.Sizeof(tmp)) buf := 
a.pool.Get(size * itemSize) capacity := cap(buf) / itemSize data := unsafe.Slice((*T)(unsafe.Pointer(unsafe.SliceData(buf))), capacity)[:size] - return data, buf + a.releases.Defer(func() { a.pool.Put(buf) }) + return data } From 63674a6f5daabb4dd52fb60ab3dd2eaed565a69d Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Mon, 29 Dec 2025 13:53:08 +0300 Subject: [PATCH 20/28] fix panic --- frac/active/mem_index_pool.go | 1 - 1 file changed, 1 deletion(-) diff --git a/frac/active/mem_index_pool.go b/frac/active/mem_index_pool.go index c5ee276e..863eb47e 100644 --- a/frac/active/mem_index_pool.go +++ b/frac/active/mem_index_pool.go @@ -210,7 +210,6 @@ func (p *memIndexPool) Release() { idx.Release() } - p.info = nil p.readable = nil p.ready = nil p.merging = nil From 26b2845890bd49d0d68c1976975892d2c27641c4 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Mon, 29 Dec 2025 16:24:04 +0300 Subject: [PATCH 21/28] resources test --- frac/active/resources.go | 28 ++++++++++++++-------------- resources/sized_pool.go | 8 ++++---- resources/slice_allocator.go | 20 ++++++++++---------- resources/slice_on_bytes.go | 10 +++++----- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/frac/active/resources.go b/frac/active/resources.go index ff894114..eda001f7 100644 --- a/frac/active/resources.go +++ b/frac/active/resources.go @@ -8,7 +8,7 @@ import ( ) var ( - s = 8 + s = 24 tokenKeyPool = resources.NewSizedPool[tokenStr](s) indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](s) docPosSlicesPool = resources.NewSizedPool[[]seq.DocPos](s) @@ -21,17 +21,17 @@ var ( type Resources struct { releases *resources.CallStack - uint32s resources.SliceOnBytes[uint32] - uint64s resources.SliceOnBytes[uint64] - bytes resources.SlicesPool[byte] - bytesSlices resources.SlicesPool[[]byte] - uint32Slices resources.SlicesPool[[]uint32] - tokenKeys resources.SlicesPool[tokenStr] - indexerMetaData resources.SlicesPool[indexer.MetaData] + uint32s 
*resources.SliceOnBytes[uint32] + uint64s *resources.SliceOnBytes[uint64] + bytes *resources.SlicesPool[byte] + bytesSlices *resources.SlicesPool[[]byte] + uint32Slices *resources.SlicesPool[[]uint32] + tokenKeys *resources.SlicesPool[tokenStr] + indexerMetaData *resources.SlicesPool[indexer.MetaData] buf resources.ObjectsPool[indexerBuffer] - ids resources.SliceOnBytes[seq.ID] - docPos resources.SliceOnBytes[seq.DocPos] - docPosSlices resources.SlicesPool[[]seq.DocPos] + ids *resources.SliceOnBytes[seq.ID] + docPos *resources.SliceOnBytes[seq.DocPos] + docPosSlices *resources.SlicesPool[[]seq.DocPos] } func NewResources() (*Resources, func()) { @@ -48,9 +48,9 @@ func NewResources() (*Resources, func()) { bytesSlices: resources.NewBytesSlices(&s), ids: resources.NewSliceOnBytes[seq.ID](&s), docPos: resources.NewSliceOnBytes[seq.DocPos](&s), - docPosSlices: resources.NewSlicesPool(&docPosSlicesPool, &s), - indexerMetaData: resources.NewSlicesPool(&indexerMetaDataPool, &s), - tokenKeys: resources.NewSlicesPool(&tokenKeyPool, &s), + docPosSlices: resources.NewSlicesPool(docPosSlicesPool, &s), + indexerMetaData: resources.NewSlicesPool(indexerMetaDataPool, &s), + tokenKeys: resources.NewSlicesPool(tokenKeyPool, &s), buf: resources.NewObjectsPool(&bufPool, &s), } } diff --git a/resources/sized_pool.go b/resources/sized_pool.go index 834771d4..143e448c 100644 --- a/resources/sized_pool.go +++ b/resources/sized_pool.go @@ -27,8 +27,8 @@ type SizedPool[T any] struct { pools []TypedPool[[]T] } -func NewSizedPool[T any](buckets int) SizedPool[T] { - return SizedPool[T]{ +func NewSizedPool[T any](buckets int) *SizedPool[T] { + return &SizedPool[T]{ pools: make([]TypedPool[[]T], buckets), } } @@ -38,7 +38,7 @@ func index(size uint) (idx, leftBorder int) { return idx, 1 << (idx + 8) } -func (p SizedPool[T]) Get(size int) []T { +func (p *SizedPool[T]) Get(size int) []T { idx, poolCapacity := index(uint(size)) if idx < len(p.pools) { @@ -57,7 +57,7 @@ func (p SizedPool[T]) 
Get(size int) []T { return make([]T, size, poolCapacity) } -func (p SizedPool[T]) Put(item []T) { +func (p *SizedPool[T]) Put(item []T) { capacity := cap(item) idx, leftBorder := index(uint(capacity)) diff --git a/resources/slice_allocator.go b/resources/slice_allocator.go index 67485688..3ae3a1a0 100644 --- a/resources/slice_allocator.go +++ b/resources/slice_allocator.go @@ -1,19 +1,19 @@ package resources -func NewBytes(releases *CallStack) SlicesPool[byte] { - return NewSlicesPool(&BytesPool, releases) +func NewBytes(releases *CallStack) *SlicesPool[byte] { + return NewSlicesPool(BytesPool, releases) } -func NewStrings(releases *CallStack) SlicesPool[string] { - return NewSlicesPool(&StringsPool, releases) +func NewStrings(releases *CallStack) *SlicesPool[string] { + return NewSlicesPool(StringsPool, releases) } -func NewUint32Slices(releases *CallStack) SlicesPool[[]uint32] { - return NewSlicesPool(&Uint32SlicesPool, releases) +func NewUint32Slices(releases *CallStack) *SlicesPool[[]uint32] { + return NewSlicesPool(Uint32SlicesPool, releases) } -func NewBytesSlices(releases *CallStack) SlicesPool[[]byte] { - return NewSlicesPool(&BytesSlicesPool, releases) +func NewBytesSlices(releases *CallStack) *SlicesPool[[]byte] { + return NewSlicesPool(BytesSlicesPool, releases) } type SlicesPool[T any] struct { @@ -21,8 +21,8 @@ type SlicesPool[T any] struct { releases *CallStack } -func NewSlicesPool[T any](pool *SizedPool[T], releases *CallStack) SlicesPool[T] { - return SlicesPool[T]{ +func NewSlicesPool[T any](pool *SizedPool[T], releases *CallStack) *SlicesPool[T] { + return &SlicesPool[T]{ pool: pool, releases: releases, } diff --git a/resources/slice_on_bytes.go b/resources/slice_on_bytes.go index b3b51bcd..2766eddf 100644 --- a/resources/slice_on_bytes.go +++ b/resources/slice_on_bytes.go @@ -2,11 +2,11 @@ package resources import "unsafe" -func NewUint32s(releases *CallStack) SliceOnBytes[uint32] { +func NewUint32s(releases *CallStack) *SliceOnBytes[uint32] { 
return NewSliceOnBytes[uint32](releases) } -func NewUint64s(releases *CallStack) SliceOnBytes[uint64] { +func NewUint64s(releases *CallStack) *SliceOnBytes[uint64] { return NewSliceOnBytes[uint64](releases) } @@ -15,9 +15,9 @@ type SliceOnBytes[T any] struct { releases *CallStack } -func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { - return SliceOnBytes[T]{ - pool: &BytesPool, +func NewSliceOnBytes[T any](releases *CallStack) *SliceOnBytes[T] { + return &SliceOnBytes[T]{ + pool: BytesPool, releases: releases, } } From ecf9d59be9538486a28b2993a9e734ee0e1ecc7c Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Mon, 29 Dec 2025 20:10:37 +0300 Subject: [PATCH 22/28] res pool experiments 2 --- frac/active/resources.go | 28 +++++++++++++++------------- resources/global_pools.go | 1 - resources/object_allocator.go | 9 +++++---- resources/sized_pool.go | 9 +++++---- resources/slice_allocator.go | 20 ++++++++------------ resources/slice_on_bytes.go | 10 +++++----- 6 files changed, 38 insertions(+), 39 deletions(-) diff --git a/frac/active/resources.go b/frac/active/resources.go index eda001f7..a8e172a8 100644 --- a/frac/active/resources.go +++ b/frac/active/resources.go @@ -21,17 +21,17 @@ var ( type Resources struct { releases *resources.CallStack - uint32s *resources.SliceOnBytes[uint32] - uint64s *resources.SliceOnBytes[uint64] - bytes *resources.SlicesPool[byte] - bytesSlices *resources.SlicesPool[[]byte] - uint32Slices *resources.SlicesPool[[]uint32] - tokenKeys *resources.SlicesPool[tokenStr] - indexerMetaData *resources.SlicesPool[indexer.MetaData] + uint32s resources.SliceOnBytes[uint32] + uint64s resources.SliceOnBytes[uint64] + bytes resources.SlicesPool[byte] + bytesSlices resources.SlicesPool[[]byte] + uint32Slices resources.SlicesPool[[]uint32] + tokenKeys resources.SlicesPool[tokenStr] + indexerMetaData resources.SlicesPool[indexer.MetaData] buf resources.ObjectsPool[indexerBuffer] - ids *resources.SliceOnBytes[seq.ID] - docPos 
*resources.SliceOnBytes[seq.DocPos] - docPosSlices *resources.SlicesPool[[]seq.DocPos] + ids resources.SliceOnBytes[seq.ID] + docPos resources.SliceOnBytes[seq.DocPos] + docPosSlices resources.SlicesPool[[]seq.DocPos] } func NewResources() (*Resources, func()) { @@ -48,9 +48,9 @@ func NewResources() (*Resources, func()) { bytesSlices: resources.NewBytesSlices(&s), ids: resources.NewSliceOnBytes[seq.ID](&s), docPos: resources.NewSliceOnBytes[seq.DocPos](&s), - docPosSlices: resources.NewSlicesPool(docPosSlicesPool, &s), - indexerMetaData: resources.NewSlicesPool(indexerMetaDataPool, &s), - tokenKeys: resources.NewSlicesPool(tokenKeyPool, &s), + docPosSlices: resources.NewSlicesPool(&docPosSlicesPool, &s), + indexerMetaData: resources.NewSlicesPool(&indexerMetaDataPool, &s), + tokenKeys: resources.NewSlicesPool(&tokenKeyPool, &s), buf: resources.NewObjectsPool(&bufPool, &s), } } @@ -61,6 +61,7 @@ func NewResources() (*Resources, func()) { } func (r *Resources) GetBytesSlices(s int) [][]byte { + return make([][]byte, s) return r.bytesSlices.GetSlice(s) } @@ -89,6 +90,7 @@ func (r *Resources) GetUint64s(s int) []uint64 { } func (r *Resources) GetUint32Slices(s int) [][]uint32 { + return make([][]uint32, s) return r.uint32Slices.GetSlice(s) } diff --git a/resources/global_pools.go b/resources/global_pools.go index 97d69e0b..4529d224 100644 --- a/resources/global_pools.go +++ b/resources/global_pools.go @@ -2,7 +2,6 @@ package resources var ( BytesPool = NewSizedPool[byte](24) - StringsPool = NewSizedPool[string](24) Uint32SlicesPool = NewSizedPool[[]uint32](24) BytesSlicesPool = NewSizedPool[[]byte](24) ) diff --git a/resources/object_allocator.go b/resources/object_allocator.go index 66262c0e..c5da1952 100644 --- a/resources/object_allocator.go +++ b/resources/object_allocator.go @@ -14,12 +14,13 @@ func NewMapsPool[K comparable, V any](pool *TypedPool[map[K]V], releases *CallSt func (a MapsPool[K, V]) Alloc(size int) map[K]V { obj, ok := a.pool.Get() - if ok { - 
clear(obj) - } else { + if !ok { obj = make(map[K]V, size) } - a.releases.Defer(func() { a.pool.Put(obj) }) + a.releases.Defer(func() { + clear(obj) + a.pool.Put(obj) + }) return obj } diff --git a/resources/sized_pool.go b/resources/sized_pool.go index 143e448c..5ab49b17 100644 --- a/resources/sized_pool.go +++ b/resources/sized_pool.go @@ -27,8 +27,8 @@ type SizedPool[T any] struct { pools []TypedPool[[]T] } -func NewSizedPool[T any](buckets int) *SizedPool[T] { - return &SizedPool[T]{ +func NewSizedPool[T any](buckets int) SizedPool[T] { + return SizedPool[T]{ pools: make([]TypedPool[[]T], buckets), } } @@ -38,7 +38,7 @@ func index(size uint) (idx, leftBorder int) { return idx, 1 << (idx + 8) } -func (p *SizedPool[T]) Get(size int) []T { +func (p SizedPool[T]) Get(size int) []T { idx, poolCapacity := index(uint(size)) if idx < len(p.pools) { @@ -57,7 +57,7 @@ func (p *SizedPool[T]) Get(size int) []T { return make([]T, size, poolCapacity) } -func (p *SizedPool[T]) Put(item []T) { +func (p SizedPool[T]) Put(item []T) { capacity := cap(item) idx, leftBorder := index(uint(capacity)) @@ -66,6 +66,7 @@ func (p *SizedPool[T]) Put(item []T) { } if idx < len(p.pools) { + item = item[:0] p.pools[idx].Put(item) } } diff --git a/resources/slice_allocator.go b/resources/slice_allocator.go index 3ae3a1a0..fcf267d9 100644 --- a/resources/slice_allocator.go +++ b/resources/slice_allocator.go @@ -1,19 +1,15 @@ package resources -func NewBytes(releases *CallStack) *SlicesPool[byte] { - return NewSlicesPool(BytesPool, releases) +func NewBytes(releases *CallStack) SlicesPool[byte] { + return NewSlicesPool(&BytesPool, releases) } -func NewStrings(releases *CallStack) *SlicesPool[string] { - return NewSlicesPool(StringsPool, releases) +func NewUint32Slices(releases *CallStack) SlicesPool[[]uint32] { + return NewSlicesPool(&Uint32SlicesPool, releases) } -func NewUint32Slices(releases *CallStack) *SlicesPool[[]uint32] { - return NewSlicesPool(Uint32SlicesPool, releases) -} - -func 
NewBytesSlices(releases *CallStack) *SlicesPool[[]byte] { - return NewSlicesPool(BytesSlicesPool, releases) +func NewBytesSlices(releases *CallStack) SlicesPool[[]byte] { + return NewSlicesPool(&BytesSlicesPool, releases) } type SlicesPool[T any] struct { @@ -21,8 +17,8 @@ type SlicesPool[T any] struct { releases *CallStack } -func NewSlicesPool[T any](pool *SizedPool[T], releases *CallStack) *SlicesPool[T] { - return &SlicesPool[T]{ +func NewSlicesPool[T any](pool *SizedPool[T], releases *CallStack) SlicesPool[T] { + return SlicesPool[T]{ pool: pool, releases: releases, } diff --git a/resources/slice_on_bytes.go b/resources/slice_on_bytes.go index 2766eddf..b3b51bcd 100644 --- a/resources/slice_on_bytes.go +++ b/resources/slice_on_bytes.go @@ -2,11 +2,11 @@ package resources import "unsafe" -func NewUint32s(releases *CallStack) *SliceOnBytes[uint32] { +func NewUint32s(releases *CallStack) SliceOnBytes[uint32] { return NewSliceOnBytes[uint32](releases) } -func NewUint64s(releases *CallStack) *SliceOnBytes[uint64] { +func NewUint64s(releases *CallStack) SliceOnBytes[uint64] { return NewSliceOnBytes[uint64](releases) } @@ -15,9 +15,9 @@ type SliceOnBytes[T any] struct { releases *CallStack } -func NewSliceOnBytes[T any](releases *CallStack) *SliceOnBytes[T] { - return &SliceOnBytes[T]{ - pool: BytesPool, +func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { + return SliceOnBytes[T]{ + pool: &BytesPool, releases: releases, } } From fcb45ce98e5831a82eb17e180a1dc6060a888844 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Mon, 29 Dec 2025 21:15:23 +0300 Subject: [PATCH 23/28] res pool experiments 3 --- frac/active/resources.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/frac/active/resources.go b/frac/active/resources.go index a8e172a8..1886b23d 100644 --- a/frac/active/resources.go +++ b/frac/active/resources.go @@ -61,7 +61,6 @@ func NewResources() (*Resources, func()) { } func (r *Resources) GetBytesSlices(s int) [][]byte { - return 
make([][]byte, s) return r.bytesSlices.GetSlice(s) } @@ -90,7 +89,6 @@ func (r *Resources) GetUint64s(s int) []uint64 { } func (r *Resources) GetUint32Slices(s int) [][]uint32 { - return make([][]uint32, s) return r.uint32Slices.GetSlice(s) } From bff6bcfc74b973877d82b176daf4b4b19d15095a Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Mon, 29 Dec 2025 23:09:04 +0300 Subject: [PATCH 24/28] res pool experiments 4 --- frac/active/indexer_test.go | 3 +++ resources/sized_pool.go | 26 ++++++++++++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/frac/active/indexer_test.go b/frac/active/indexer_test.go index 3938a8b8..dbaad7ef 100644 --- a/frac/active/indexer_test.go +++ b/frac/active/indexer_test.go @@ -15,6 +15,7 @@ import ( "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/resources" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/tests/common" @@ -178,6 +179,8 @@ func BenchmarkFullWrite(b *testing.B) { assert.Greater(b, int(sealed.Info.DocsTotal), 0) active.Release() } + + resources.BytesPool.Print() } func readFileAllAtOnce(filename string) ([][]byte, error) { diff --git a/resources/sized_pool.go b/resources/sized_pool.go index 5ab49b17..0f8715cb 100644 --- a/resources/sized_pool.go +++ b/resources/sized_pool.go @@ -1,25 +1,36 @@ package resources import ( + "fmt" "math/bits" "sync" + "sync/atomic" ) +const poolLimit = 16 + type TypedPool[T any] struct { - pool sync.Pool + pool sync.Pool + counter atomic.Int64 } func (p *TypedPool[T]) Get() (T, bool) { item := p.pool.Get() var val T if item == nil { + p.counter.Store(0) return val, false } + p.counter.Add(-1) val, ok := item.(T) return val, ok } func (p *TypedPool[T]) Put(item T) { + if p.counter.Load() > poolLimit { + return + } + p.counter.Add(1) p.pool.Put(item) } @@ -47,13 +58,6 @@ func (p SizedPool[T]) Get(size int) []T { } } - 
idx++ - if idx < len(p.pools) { - if data, ok := p.pools[idx].Get(); ok { - return data[:size] - } - } - return make([]T, size, poolCapacity) } @@ -70,3 +74,9 @@ func (p SizedPool[T]) Put(item []T) { p.pools[idx].Put(item) } } + +func (p SizedPool[T]) Print() { + for i := range p.pools { + fmt.Println("size:", i, p.pools[i].counter.Load()) + } +} From fcc0501fba9d506e6d7b44cd0a5a7de7068b815d Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Mon, 29 Dec 2025 23:35:42 +0300 Subject: [PATCH 25/28] res pool experiments 5 --- frac/active/resources.go | 9 +++++---- resources/global_pools.go | 6 +++--- resources/sized_pool.go | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/frac/active/resources.go b/frac/active/resources.go index 1886b23d..47bfb34d 100644 --- a/frac/active/resources.go +++ b/frac/active/resources.go @@ -7,11 +7,12 @@ import ( "github.com/ozontech/seq-db/tokenizer" ) +const poolBuckets = 16 + var ( - s = 24 - tokenKeyPool = resources.NewSizedPool[tokenStr](s) - indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](s) - docPosSlicesPool = resources.NewSizedPool[[]seq.DocPos](s) + tokenKeyPool = resources.NewSizedPool[tokenStr](poolBuckets) + indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](poolBuckets) + docPosSlicesPool = resources.NewSizedPool[[]seq.DocPos](poolBuckets) bufPool = resources.TypedPool[*indexerBuffer]{} resPool = resources.TypedPool[*Resources]{} ) diff --git a/resources/global_pools.go b/resources/global_pools.go index 4529d224..38c94a31 100644 --- a/resources/global_pools.go +++ b/resources/global_pools.go @@ -1,7 +1,7 @@ package resources var ( - BytesPool = NewSizedPool[byte](24) - Uint32SlicesPool = NewSizedPool[[]uint32](24) - BytesSlicesPool = NewSizedPool[[]byte](24) + BytesPool = NewSizedPool[byte](16) + Uint32SlicesPool = NewSizedPool[[]uint32](16) + BytesSlicesPool = NewSizedPool[[]byte](16) ) diff --git a/resources/sized_pool.go b/resources/sized_pool.go index 0f8715cb..5ea17d06 
100644 --- a/resources/sized_pool.go +++ b/resources/sized_pool.go @@ -7,7 +7,7 @@ import ( "sync/atomic" ) -const poolLimit = 16 +const poolLimit = 8 type TypedPool[T any] struct { pool sync.Pool From e73ee179bd3215dcef9f5aeb8ac7d14f15fe82f3 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Mon, 29 Dec 2025 23:54:46 +0300 Subject: [PATCH 26/28] res pool experiments 6 --- frac/active/indexer_test.go | 3 --- frac/active/resources.go | 2 +- resources/global_pools.go | 8 +++++--- resources/sized_pool.go | 23 +++++------------------ 4 files changed, 11 insertions(+), 25 deletions(-) diff --git a/frac/active/indexer_test.go b/frac/active/indexer_test.go index dbaad7ef..3938a8b8 100644 --- a/frac/active/indexer_test.go +++ b/frac/active/indexer_test.go @@ -15,7 +15,6 @@ import ( "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/resources" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/tests/common" @@ -179,8 +178,6 @@ func BenchmarkFullWrite(b *testing.B) { assert.Greater(b, int(sealed.Info.DocsTotal), 0) active.Release() } - - resources.BytesPool.Print() } func readFileAllAtOnce(filename string) ([][]byte, error) { diff --git a/frac/active/resources.go b/frac/active/resources.go index 47bfb34d..b7d7fe67 100644 --- a/frac/active/resources.go +++ b/frac/active/resources.go @@ -7,7 +7,7 @@ import ( "github.com/ozontech/seq-db/tokenizer" ) -const poolBuckets = 16 +const poolBuckets = 24 var ( tokenKeyPool = resources.NewSizedPool[tokenStr](poolBuckets) diff --git a/resources/global_pools.go b/resources/global_pools.go index 38c94a31..ff403952 100644 --- a/resources/global_pools.go +++ b/resources/global_pools.go @@ -1,7 +1,9 @@ package resources +const poolBuckets = 24 + var ( - BytesPool = NewSizedPool[byte](16) - Uint32SlicesPool = NewSizedPool[[]uint32](16) - BytesSlicesPool = NewSizedPool[[]byte](16) + BytesPool = 
NewSizedPool[byte](poolBuckets) + Uint32SlicesPool = NewSizedPool[[]uint32](poolBuckets) + BytesSlicesPool = NewSizedPool[[]byte](poolBuckets) ) diff --git a/resources/sized_pool.go b/resources/sized_pool.go index 5ea17d06..3b1be375 100644 --- a/resources/sized_pool.go +++ b/resources/sized_pool.go @@ -1,36 +1,25 @@ package resources import ( - "fmt" "math/bits" "sync" - "sync/atomic" ) -const poolLimit = 8 - type TypedPool[T any] struct { - pool sync.Pool - counter atomic.Int64 + pool sync.Pool } func (p *TypedPool[T]) Get() (T, bool) { item := p.pool.Get() var val T if item == nil { - p.counter.Store(0) return val, false } - p.counter.Add(-1) val, ok := item.(T) return val, ok } func (p *TypedPool[T]) Put(item T) { - if p.counter.Load() > poolLimit { - return - } - p.counter.Add(1) p.pool.Put(item) } @@ -50,6 +39,8 @@ func index(size uint) (idx, leftBorder int) { } func (p SizedPool[T]) Get(size int) []T { + return make([]T, size) + idx, poolCapacity := index(uint(size)) if idx < len(p.pools) { @@ -62,6 +53,8 @@ func (p SizedPool[T]) Get(size int) []T { } func (p SizedPool[T]) Put(item []T) { + return + capacity := cap(item) idx, leftBorder := index(uint(capacity)) @@ -74,9 +67,3 @@ func (p SizedPool[T]) Put(item []T) { p.pools[idx].Put(item) } } - -func (p SizedPool[T]) Print() { - for i := range p.pools { - fmt.Println("size:", i, p.pools[i].counter.Load()) - } -} From e81e9bf518d9d9e0fa9e6e5b70380f2abdde8b5c Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Tue, 30 Dec 2025 00:19:46 +0300 Subject: [PATCH 27/28] res pool experiments 7 --- frac/active/resources.go | 6 +++++- resources/sized_pool.go | 4 ---- resources/slice_allocator.go | 1 + resources/slice_on_bytes.go | 19 ++++++++++++++----- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/frac/active/resources.go b/frac/active/resources.go index b7d7fe67..3ad33eb8 100644 --- a/frac/active/resources.go +++ b/frac/active/resources.go @@ -1,6 +1,7 @@ package active import ( + 
"github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/resources" "github.com/ozontech/seq-db/seq" @@ -66,7 +67,10 @@ func (r *Resources) GetBytesSlices(s int) [][]byte { } func (r *Resources) GetBytes(s int) []byte { - return r.bytes.GetSlice(s) + b := bytespool.AcquireLen(s) + r.releases.Defer(func() { bytespool.Release(b) }) + return b.B + // return r.bytes.GetSlice(s) } func (r *Resources) GetUint32s(s int) []uint32 { diff --git a/resources/sized_pool.go b/resources/sized_pool.go index 3b1be375..8142334e 100644 --- a/resources/sized_pool.go +++ b/resources/sized_pool.go @@ -39,8 +39,6 @@ func index(size uint) (idx, leftBorder int) { } func (p SizedPool[T]) Get(size int) []T { - return make([]T, size) - idx, poolCapacity := index(uint(size)) if idx < len(p.pools) { @@ -53,8 +51,6 @@ func (p SizedPool[T]) Get(size int) []T { } func (p SizedPool[T]) Put(item []T) { - return - capacity := cap(item) idx, leftBorder := index(uint(capacity)) diff --git a/resources/slice_allocator.go b/resources/slice_allocator.go index fcf267d9..f5b317f3 100644 --- a/resources/slice_allocator.go +++ b/resources/slice_allocator.go @@ -25,6 +25,7 @@ func NewSlicesPool[T any](pool *SizedPool[T], releases *CallStack) SlicesPool[T] } func (a SlicesPool[T]) GetSlice(size int) []T { + return make([]T, size) data := a.pool.Get(size) a.releases.Defer(func() { a.pool.Put(data) }) return data[:size] diff --git a/resources/slice_on_bytes.go b/resources/slice_on_bytes.go index b3b51bcd..61245af7 100644 --- a/resources/slice_on_bytes.go +++ b/resources/slice_on_bytes.go @@ -1,6 +1,10 @@ package resources -import "unsafe" +import ( + "unsafe" + + "github.com/ozontech/seq-db/bytespool" +) func NewUint32s(releases *CallStack) SliceOnBytes[uint32] { return NewSliceOnBytes[uint32](releases) @@ -11,13 +15,13 @@ func NewUint64s(releases *CallStack) SliceOnBytes[uint64] { } type SliceOnBytes[T any] struct { - pool *SizedPool[byte] + // pool *SizedPool[byte] 
releases *CallStack } func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { return SliceOnBytes[T]{ - pool: &BytesPool, + // pool: &BytesPool, releases: releases, } } @@ -25,9 +29,14 @@ func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { func (a SliceOnBytes[T]) GetSlice(size int) []T { var tmp T itemSize := int(unsafe.Sizeof(tmp)) - buf := a.pool.Get(size * itemSize) + + b := bytespool.AcquireLen(size * itemSize) + buf := b.B + + // buf := a.pool.Get(size * itemSize) capacity := cap(buf) / itemSize data := unsafe.Slice((*T)(unsafe.Pointer(unsafe.SliceData(buf))), capacity)[:size] - a.releases.Defer(func() { a.pool.Put(buf) }) + // a.releases.Defer(func() { a.pool.Put(buf) }) + a.releases.Defer(func() { bytespool.Release(b) }) return data } From 067dc8a473ad41bffc72fcb156ecf7745597802f Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Wed, 14 Jan 2026 14:25:57 +0300 Subject: [PATCH 28/28] deduplication tests --- frac/active/active_test.go | 151 +++++++++++++++++++++++++++++ frac/active/indexer.go | 139 +++++++++++++-------------- frac/active/merge.go | 180 +++++++++++++++++------------------ frac/active/resources.go | 6 +- resources/call_stack.go | 1 - resources/slice_allocator.go | 1 - resources/slice_on_bytes.go | 18 ++-- 7 files changed, 314 insertions(+), 182 deletions(-) create mode 100644 frac/active/active_test.go diff --git a/frac/active/active_test.go b/frac/active/active_test.go new file mode 100644 index 00000000..6fedcd80 --- /dev/null +++ b/frac/active/active_test.go @@ -0,0 +1,151 @@ +package active + +import ( + "encoding/binary" + "fmt" + "testing" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/indexer" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/tokenizer" + "github.com/stretchr/testify/assert" +) + +func TestMerge_DuplicateIDs(t *testing.T) { + // create several indexes with overlapping IDs + idx1 := createTestIndex(t, []seq.ID{ + 
{MID: 100, RID: 1}, // ID 100 will be duplicated + {MID: 101, RID: 2}, + }) + + idx2 := createTestIndex(t, []seq.ID{ + {MID: 100, RID: 1}, // duplicate of ID 100 from the first index + {MID: 102, RID: 3}, + }) + + idx3 := createTestIndex(t, []seq.ID{ + {MID: 103, RID: 4}, + {MID: 104, RID: 5}, + }) + + // call mergeIndexes with indexes containing duplicated IDs + indexes := []*memIndex{idx1, idx2, idx3} + result := mergeIndexes(indexes) + + expectedIDs := []seq.ID{ + {MID: 104, RID: 5}, + {MID: 103, RID: 4}, + {MID: 102, RID: 3}, + {MID: 101, RID: 2}, + {MID: 100, RID: 1}, + } + assert.Equal(t, expectedIDs, result.ids) + assert.Equal(t, len(expectedIDs), int(result.docsCount)) + assert.Equal(t, len(expectedIDs)+1, int(result.docsSize), "we can't adjust the total size during deduplication") + + // verify that the _all_ token is correct + allRange := result.fieldsTokens[seq.TokenAll] + allTID := allRange.start + assert.Equal(t, uint32(1), allRange.count) + assert.Empty(t, result.tokenLIDs[allTID], "empty list means ALL documents") + + // verify that the foo:bar token is correct + fooRange := result.fieldsTokens["foo"] + fooTID := fooRange.start + assert.Equal(t, uint32(1), fooRange.count) + assert.Equal(t, []uint32{1, 2, 3, 4, 5}, result.tokenLIDs[fooTID], "") +} + +func createTestIndex(t *testing.T, ids []seq.ID) *memIndex { + meta := []byte{} + for i, id := range ids { + md := indexer.MetaData{ + ID: id, + Size: 1, + Tokens: []tokenizer.MetaToken{ + {Key: []byte("foo"), Value: []byte("bar")}, + {Key: []byte("num"), Value: []byte(fmt.Sprintf("token_%d", i))}, + {Key: []byte("foo"), Value: []byte("bar")}, // duplicate + }, + } + tmp := md.MarshalBinaryTo(nil) + meta = binary.LittleEndian.AppendUint32(meta, uint32(len(tmp))) + meta = append(meta, tmp...) 
+ } + idx, err := NewMemIndex(storage.CompressDocBlock(meta, nil, 1)) + assert.NoError(t, err) + return idx +} + +func TestMemIndexPool_Add_DuplicateBulk(t *testing.T) { + idx1 := createTestIndex(t, []seq.ID{ + {MID: 100, RID: 1}, + {MID: 101, RID: 2}, + }) + + idx2 := createTestIndex(t, []seq.ID{ + {MID: 102, RID: 3}, + {MID: 103, RID: 4}, + }) + + idx3 := createTestIndex(t, []seq.ID{ + {MID: 102, RID: 3}, + {MID: 103, RID: 4}, + }) + + assert.NotEqual(t, idx1.hash, idx2.hash) + assert.Equal(t, idx2.hash, idx3.hash) + + info := frac.NewInfo("test", 0, 0) + pool := NewIndexPool(info) + + // add the first index + pool.Add(idx1, 10, 10) + + // verify the index was added + snapshot1, release1 := pool.Snapshot() + assert.Len(t, snapshot1.indexes, 1) + release1() + + // add the second index + pool.Add(idx2, 10, 10) + + // verify the index was added + snapshot2, release2 := pool.Snapshot() + assert.Len(t, snapshot2.indexes, 2) + release2() + + // add the third index with the same hash + pool.Add(idx3, 10, 10) + + // verify the third index was NOT added (should be ignored) + snapshot3, release3 := pool.Snapshot() + assert.Len(t, snapshot3.indexes, 2, "third index with the same hash should not be added") + + // verify that the first and second indexes remain + assert.Equal(t, seq.MID(101), snapshot3.indexes[0].ids[0].MID) + assert.Equal(t, seq.MID(103), snapshot3.indexes[1].ids[0].MID) + release3() + + // verify statistics - DocsTotal should only account for the first index + assert.Equal(t, uint32(4), info.DocsTotal) + assert.Equal(t, uint64(4), info.DocsRaw) + assert.Equal(t, uint64(20), info.DocsOnDisk) + assert.Equal(t, uint64(20), info.MetaOnDisk) + assert.Equal(t, seq.MID(100), info.From) + assert.Equal(t, seq.MID(103), info.To) +} + +func TestIndexer_TokenDeduplication(t *testing.T) { + idx := createTestIndex(t, []seq.ID{ + {MID: 100, RID: 1}, + {MID: 101, RID: 2}, + }) + assert.Len(t, idx.tokenLIDs[idx.fieldsTokens[seq.TokenAll].start], 0) + assert.Len(t, 
idx.tokenLIDs[idx.fieldsTokens["foo"].start], 2) + assert.Len(t, idx.tokenLIDs[idx.fieldsTokens["num"].start+0], 1) + assert.Len(t, idx.tokenLIDs[idx.fieldsTokens["num"].start+1], 1) + assert.Equal(t, 4, idx.allTokenLIDsCount) +} diff --git a/frac/active/indexer.go b/frac/active/indexer.go index a058402b..5c4f49dd 100644 --- a/frac/active/indexer.go +++ b/frac/active/indexer.go @@ -31,7 +31,7 @@ func NewIndexer(workerPool WorkerLimiter) *Indexer { } // indexerBuffer is a temporary reusable buffer used during index construction to avoid allocations. -// It holds intermediate data structures that are needed during processing but not in the final index. +// it holds intermediate data structures that are needed during processing but not in the final index. type indexerBuffer struct { sizes []uint32 fields []string @@ -50,50 +50,50 @@ func (idx *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, er } // NewMemIndex creates an in-memory index from a document block -func NewMemIndex(block storage.DocBlock) (*memIndex, error) { +func NewMemIndex(data storage.DocBlock) (*memIndex, error) { sw := stopwatch.New() - res, release := NewResources() + tmp, release := NewResources() defer release() - // Decompress metadata - payload, err := decompressMeta(res, block, sw) + // decompress metadata + payload, err := decompressMeta(tmp, data, sw) if err != nil { return nil, err } - buf := res.GetBuffer() + buf := tmp.GetBuffer() - // Decode metadata - meta, err := decodeMetadata(res, buf, payload, sw) + // decode metadata + meta, err := decodeMetadata(tmp, buf, payload, sw) if err != nil { return nil, err } - // Initialize index + // initialize index idx := newMemIndex() idx.docsCount = uint32(len(meta)) idx.ids = idx.res.GetIDs(len(meta)) idx.positions = idx.res.GetDocPosSlice(len(meta)) - idx.blocksOffsets = idx.res.GetUint64s(1) // Only one block per bulk - idx.blocksOffsets[0] = block.GetExt2() + idx.blocksOffsets = idx.res.GetUint64s(1) // only one block per bulk 
+ idx.blocksOffsets[0] = data.GetExt2() - // Extract tokens from metadata - tids, lids, tokens, err := extractTokens(idx, res, buf, meta) + // extract tokens from metadata + tids, lids, tokens, err := extractTokens(idx, tmp, buf, meta) if err != nil { return nil, err } - // Group documents by token - tokenLIDs := groupLIDsByTID(idx, res, tids, lids, len(tokens)) + // group documents by token + tokenLIDs := groupLIDsByTID(idx, tmp, tids, lids, len(tokens)) - // Organize tokens and fields - organizeTokens(idx, res, buf, tokens, tokenLIDs) + // organize tokens and fields + organizeTokens(idx, tmp, buf, tokens, tokenLIDs) return idx, nil } // tokenStr represents a unique token as a (field, value) pair. -// Used as a map key during token deduplication. +// used as a map key during token deduplication. type tokenStr struct { value string field string @@ -109,16 +109,16 @@ func toToken(t tokenizer.MetaToken) tokenStr { // extractTokens extracts tokens from document metadata func extractTokens( idx *memIndex, - res *Resources, + tmp *Resources, buf *indexerBuffer, meta []indexer.MetaData, ) ([]uint32, []uint32, []tokenStr, error) { var docOffset uint64 var totalTokens uint32 - // Calculate document positions in the original block - // Each document is stored as: [size: uint32][data: size bytes] - positions := res.GetDocPosSlice(len(meta)) + // calculate document positions in the original block + // each document is stored as: [size: uint32][data: size bytes] + positions := tmp.GetDocPosSlice(len(meta)) prev := seq.PackDocPos(0, docOffset) for i := range meta { @@ -131,9 +131,9 @@ func extractTokens( totalTokens += docMeta.TokensCount() } - // Create ordering by document ID (descending) - // We need to map global document IDs to local IDs (LIDs) - order := res.GetUint32s(len(meta)) + // create ordering by document ID (descending) + // we need to map global document IDs to local IDs (LIDs) + order := tmp.GetUint32s(len(meta)) for i := range order { order[i] = uint32(i) } @@ 
-144,7 +144,7 @@ func extractTokens( hash := fnv.New64a() var idBinary [16]byte - // Fill index structures with sorted documents + // fill index structures with sorted documents for i, origIdx := range order { docMeta := meta[origIdx] idx.ids[i] = docMeta.ID @@ -154,26 +154,26 @@ func extractTokens( } idx.hash = hash.Sum64() - // Extract and process tokens from all documents + // extract and process tokens from all documents var err error var token tokenStr - // Allocate slices for token-document relationships - lids := res.GetUint32s(int(totalTokens))[:0] // Local document ID for each token occurrence - tids := res.GetUint32s(int(totalTokens))[:0] // Token ID for each occurrence + // allocate slices for token-document relationships + lids := tmp.GetUint32s(int(totalTokens))[:0] // local document ID for each token occurrence + tids := tmp.GetUint32s(int(totalTokens))[:0] // token ID for each occurrence buf.tokenMap[tokenStr{field: seq.TokenAll}] = 0 // reserve ALL token (just for proper sealing) - // Process documents in ID-sorted order + // process documents in ID-sorted order for i, origIdx := range order { docMeta := meta[origIdx] - // Decode tokens for this document + // decode tokens for this document if buf.tokens, err = docMeta.DecodeTokens(buf.tokens[:0]); err != nil { return nil, nil, nil, err } - // Process each token in the document + // process each token in the document lid := uint32(i + 1) for _, t := range buf.tokens { if bytes.Equal(t.Key, seq.AllTokenName) { @@ -190,8 +190,8 @@ func extractTokens( } } - // Create reverse mapping: tokenID -> tokenKey - tokens := res.GetTokens(len(buf.tokenMap)) + // create reverse mapping: tokenID -> tokenKey + tokens := tmp.GetTokens(len(buf.tokenMap)) for key, tokenID := range buf.tokenMap { tokens[tokenID] = key } @@ -200,19 +200,19 @@ func extractTokens( } // groupLIDsByTID groups document IDs by token -// Input: flat arrays of (tid, lid) pairs -// Output: 2D array where tokenLIDs[tid] = []lid -func 
groupLIDsByTID(idx *memIndex, res *Resources, tids, lids []uint32, tokenCount int) [][]uint32 { - // Phase 1: Count documents per token - counts := res.GetUint32s(tokenCount) +// input: flat arrays of (tid, lid) pairs +// output: 2D array where tokenLIDs[tid] = []lid +func groupLIDsByTID(idx *memIndex, tmp *Resources, tids, lids []uint32, tokenCount int) [][]uint32 { + // phase 1: count documents per token + counts := tmp.GetUint32s(tokenCount) clear(counts) for _, tid := range tids { counts[tid]++ } - // Phase 2: Allocate slices for each token group - // We use a single large buffer and slice it for efficiency - tokenLIDs := res.GetUint32Slices(tokenCount) + // phase 2: allocate slices for each token group + // we use a single large buffer and slice it for efficiency + tokenLIDs := tmp.GetUint32Slices(tokenCount) allTokenLIDs := idx.res.GetUint32s(len(lids)) idx.allTokenLIDsCount = len(lids) @@ -222,11 +222,12 @@ func groupLIDsByTID(idx *memIndex, res *Resources, tids, lids []uint32, tokenCou allTokenLIDs = allTokenLIDs[count:] } - // Phase 3: Populate groups with LIDs + // phase 3: populate groups with LIDs lids = lids[:len(tids)] for i, tid := range tids { if len(tokenLIDs[tid]) > 0 { - if lids[i] == lastLID(tokenLIDs[tid]) { // deduplication + if lids[i] == lastLID(tokenLIDs[tid]) { + // tokens deduplication (the same token can occurs a few times for one doc) idx.allTokenLIDsCount-- continue } @@ -242,17 +243,17 @@ func lastLID(s []uint32) uint32 { } // organizeTokens organizes tokens and fields in the index with proper sorting -func organizeTokens(idx *memIndex, res *Resources, buf *indexerBuffer, tokens []tokenStr, tokenLIDs [][]uint32) { +func organizeTokens(idx *memIndex, tmp *Resources, buf *indexerBuffer, tokens []tokenStr, tokenLIDs [][]uint32) { tokenSize := 0 - order := res.GetUint32s(len(tokens)) + order := tmp.GetUint32s(len(tokens)) order = order[:len(tokens)] for i, t := range tokens { order[i] = uint32(i) tokenSize += len(t.value) } - // Create 
ordering for sorting tokens - // We'll sort by (field, value) to group tokens by field + // create ordering for sorting tokens + // we'll sort by (field, value) to group tokens by field slices.SortFunc(order, func(a, b uint32) int { tokenA, tokenB := tokens[a], tokens[b] return cmp.Or( @@ -264,17 +265,17 @@ func organizeTokens(idx *memIndex, res *Resources, buf *indexerBuffer, tokens [] fieldSize := 0 prevField := "" - // Prepare buffers for sorted data + // prepare buffers for sorted data tokenBuffer := idx.res.GetBytes(tokenSize)[:0] idx.tokenLIDs = idx.res.GetUint32Slices(len(order)) idx.tokens = idx.res.GetBytesSlices(len(order)) - // Process tokens in sorted order + // process tokens in sorted order for tid, origIdx := range order { token := tokens[origIdx] - // Detect field boundaries - // When field name changes, record the field and its first token position + // detect field boundaries + // when field name changes, record the field and its first token position if token.field != prevField || prevField == "" { fieldSize += len(token.field) buf.fields = append(buf.fields, token.field) @@ -282,32 +283,32 @@ func organizeTokens(idx *memIndex, res *Resources, buf *indexerBuffer, tokens [] } prevField = token.field - // Copy token value to buffer and keep reference + // copy token value to buffer and keep reference start := len(tokenBuffer) tokenBuffer = append(tokenBuffer, token.value...) 
- // Store in sorted arrays - // Note: We use original tokenID as index to preserve tokenID->data mapping + // store in sorted arrays + // note: we use original tokenID as index to preserve tokenID->data mapping idx.tokens[tid] = tokenBuffer[start:] idx.tokenLIDs[tid] = tokenLIDs[origIdx] } - // Add sentinel value for easier range calculation + // add sentinel value for easier range calculation buf.fieldTIDs = append(buf.fieldTIDs, uint32(len(tokens))) - // Organize fields + // organize fields fieldBuffer := idx.res.GetBytes(fieldSize)[:0] idx.fields = idx.res.GetBytesSlices(len(buf.fields)) idx.fieldsTokens = make(map[string]tokenRange, len(buf.fields)) for i, field := range buf.fields { - // Copy field name to buffer + // copy field name to buffer start := len(fieldBuffer) fieldBuffer = append(fieldBuffer, field...) idx.fields[i] = fieldBuffer[start:] - // Calculate token range for this field - // Each field has continuous range of token IDs in sorted order + // calculate token range for this field + // each field has continuous range of token IDs in sorted order startTID := buf.fieldTIDs[i] endTID := buf.fieldTIDs[i+1] idx.fieldsTokens[util.ByteToStringUnsafe(fieldBuffer[start:])] = tokenRange{ @@ -322,7 +323,7 @@ func decompressMeta(res *Resources, block storage.DocBlock, sw *stopwatch.Stopwa m := sw.Start("decompress_meta") defer m.Stop() - // Allocate exact size needed for compressed data + // allocate exact size needed for compressed data buffer := res.GetBytes(int(block.RawLen())) payload, err := block.DecompressTo(buffer) if err != nil { @@ -332,12 +333,12 @@ func decompressMeta(res *Resources, block storage.DocBlock, sw *stopwatch.Stopwa } // decodeMetadata decodes document metadata from binary format -// Format: [size: uint32][data: size bytes][size: uint32][data: size bytes]... 
-func decodeMetadata(res *Resources, buf *indexerBuffer, payload []byte, sw *stopwatch.Stopwatch) ([]indexer.MetaData, error) { +// format: [size: uint32][data: size bytes][size: uint32][data: size bytes]... +func decodeMetadata(tmp *Resources, buf *indexerBuffer, payload []byte, sw *stopwatch.Stopwatch) ([]indexer.MetaData, error) { m := sw.Start("decode_meta") defer m.Stop() - // First pass: scan to determine sizes of each metadata entry + // first pass: scan to determine sizes of each metadata entry var offset uint32 for offset < uint32(len(payload)) { size := binary.LittleEndian.Uint32(payload[offset:]) @@ -345,15 +346,15 @@ func decodeMetadata(res *Resources, buf *indexerBuffer, payload []byte, sw *stop buf.sizes = append(buf.sizes, size) } - // Second pass: decode each metadata entry - meta := res.GetMetadata(len(buf.sizes)) + // second pass: decode each metadata entry + meta := tmp.GetMetadata(len(buf.sizes)) for i, size := range buf.sizes { - // Skip size field to get to actual data + // skip size field to get to actual data data := payload[uint32Size : size+uint32(uint32Size)] if err := meta[i].UnmarshalBinaryLazy(data); err != nil { return nil, err } - // Move to next entry + // move to next entry payload = payload[size+uint32(uint32Size):] } diff --git a/frac/active/merge.go b/frac/active/merge.go index 8f9e0664..4d64a67b 100644 --- a/frac/active/merge.go +++ b/frac/active/merge.go @@ -11,10 +11,9 @@ import ( "go.uber.org/zap" ) -// mergeIndexes merges several in-memory indexes (memIndex) -// into a single resulting index. +// mergeIndexes merges several in-memory indexes into one. func mergeIndexes(indexes []*memIndex) *memIndex { - // Count the total number of blocks, documents, and tokens to preallocate memory. 
+ // preallocate memory based on total size blocksCount := 0 dst := newMemIndex() @@ -25,40 +24,37 @@ func mergeIndexes(indexes []*memIndex) *memIndex { blocksCount += len(idx.blocksOffsets) } - // Shared temporary resources for merging - res, release := NewResources() + tmp, release := NewResources() defer release() - // Preallocate memory for final structures + // preallocate final structures dst.ids = dst.res.GetIDs(int(dst.docsCount))[:0] dst.positions = dst.res.GetDocPosSlice(int(dst.docsCount))[:0] dst.blocksOffsets = dst.res.GetUint64s(blocksCount)[:0] - // 1. Merge block offsets and recalculate document positions - posMap := mergeBlocksOffsets(dst, res, indexes) + // 1. merge block offsets and recalc document positions + posMap := mergeBlocksOffsets(dst, tmp, indexes) - // 2. Merge documents (IDs), get old LID → new LID mapping - lidsMap := mergeIDs(dst, res, indexes, posMap) + // 2. merge documents, get old→new LID mapping + lidsMap := mergeIDs(dst, tmp, indexes, posMap) - // 3. Merge tokens using the new document LIDs - mergeTokens(dst, res, indexes, lidsMap) + // 3. merge tokens using new LIDs + mergeTokens(dst, tmp, indexes, lidsMap) return dst } -// mergeIDs merges documents from all indexes into a single ordered stream. -// Returns a mapping of oldLID → newLID for each input index. +// mergeIDs merges documents from all indexes into ordered stream. +// returns mapping oldLID → newLID for each index. 
func mergeIDs( dst *memIndex, - res *Resources, + tmp *Resources, indexes []*memIndex, posMap [][]seq.DocPos, ) [][]uint32 { - // Store old LID → new LID mapping for each index - lidsMap := res.GetUint32Slices(len(indexes)) - - // Iterators over documents of each index + // store old→new LID mapping per index + lidsMap := tmp.GetUint32Slices(len(indexes)) docStreams := make([]OrderedStream[DocRef], len(indexes)) for i, idx := range indexes { @@ -68,12 +64,11 @@ func mergeIDs( posMap: posMap[i], // recalculated document positions } - // LIDs start from 1, so add a "dummy" element immediately - lidsMap[i] = res.GetUint32s(int(idx.docsCount) + 1)[:1] + // LIDs start from 1, so add dummy element + lidsMap[i] = tmp.GetUint32s(int(idx.docsCount) + 1)[:1] } - // Merge all document streams into one, - // sorting by ID (in reverse order) + // merge all streams by ID (reverse order) mergedDocStream := MergeSortedStreams( docStreams, func(a, b DocRef) int { @@ -90,24 +85,23 @@ func mergeIDs( prevRef DocRef ) - // Iterate over the merged stream + // process merged stream for docRef, has := mergedDocStream.Next(); has; docRef, has = mergedDocStream.Next() { if docRef.id == prevRef.id && docRef.i != prevRef.i { doubles++ - lidsMap[docRef.i] = append(lidsMap[docRef.i], 0) // add zero LID for consistent mapping + // map old LID → 0 (will be filtered later) + lidsMap[docRef.i] = append(lidsMap[docRef.i], 0) continue } prevRef = docRef - // Add document to the resulting index + // add to result dst.ids = append(dst.ids, docRef.id) dst.positions = append(dst.positions, docRef.pos) - // New LID is the position in dst.ids (1-based) - lid := uint32(len(dst.ids)) - - // Record oldLID → newLID mapping - lidsMap[docRef.i] = append(lidsMap[docRef.i], lid) + // new LID is position in dst.ids (1-based) + newLID := uint32(len(dst.ids)) + lidsMap[docRef.i] = append(lidsMap[docRef.i], newLID) } if doubles > 0 { @@ -118,24 +112,25 @@ func mergeIDs( return lidsMap } -// mergeTokens merges tokens 
from all indexes, -// reusing the new document LIDs. +// mergeTokens merges tokens from all indexes using new LIDs. func mergeTokens( dst *memIndex, - res *Resources, + tmp *Resources, indexes []*memIndex, lidsMap [][]uint32, ) { + totalDocs := 0 // sum of documents from all indexes (before deduplication) totalTokens := 0 tokenStreams := make([]OrderedStream[TokenRef], len(indexes)) - // create iterators over tokens + // create token iterators for i, idx := range indexes { + totalDocs += int(idx.docsCount) totalTokens += len(idx.tokens) tokenStreams[i] = NewTokenStream(idx, lidsMap[i]) } - cmpToken := func(a, b TokenRef) int { // token comparison: first by field, then by value + cmpToken := func(a, b TokenRef) int { r := bytes.Compare(a.Field(), b.Field()) if r == 0 { return bytes.Compare(a.Value(), b.Value()) @@ -146,7 +141,7 @@ func mergeTokens( // merged and sorted token stream mergedTokenStream := MergeSortedStreams(tokenStreams, cmpToken) - // statistics for unique values + // unique values statistics uniqTokensSize := 0 uniqTokensCount := 0 uniqFieldsSize := 0 @@ -159,25 +154,25 @@ func mergeTokens( // borders[i] indicates: const ( - borderNone = 0b00 // tokensRef[i] is the same token as in tokensRef[i-1] (but other index) - borderToken = 0b01 // tokensRef[i] is new token - borderField = 0b10 // tokensRef[i] is new token and new field + borderNone = 0b00 // tokensRef[i] same token as previous (but different index) + borderToken = 0b01 // tokensRef[i] is a new token value + borderField = 0b10 // tokensRef[i] is a new field ) - borders := res.GetBytes(totalTokens)[:0] + borders := tmp.GetBytes(totalTokens)[:0] tokensRef := make([]TokenRef, 0, totalTokens) - // First pass: count unique tokens and fields + // first pass: count unique tokens and fields for tokenRef, has := mergedTokenStream.Next(); has; tokenRef, has = mergedTokenStream.Next() { var border uint8 = borderNone - // New token + // new token if prevToken.payload == nil || cmpToken(prevToken, tokenRef) 
!= 0 { uniqTokensCount++ uniqTokensSize += len(tokenRef.Value()) border |= borderToken - // New field + // new field field := tokenRef.Field() if !bytes.Equal(prevField, field) { uniqFieldsCount++ @@ -192,7 +187,7 @@ func mergeTokens( prevToken = tokenRef } - // Initialize resulting index structures + // initialize result structures dst.fieldsTokens = make(map[string]tokenRange, uniqFieldsCount) dst.fields = dst.res.GetBytesSlices(uniqFieldsCount)[:0] dst.tokens = dst.res.GetBytesSlices(uniqTokensCount)[:0] @@ -201,15 +196,15 @@ func mergeTokens( allTokens := dst.res.GetBytes(uniqTokensSize)[:0] allFields := dst.res.GetBytes(uniqFieldsSize)[:0] - // Collector for document LIDs for each token + // collector for token's document LIDs lidsCollector := NewLIDsCollector( - res.GetUint32s(int(dst.docsCount)), // temporary buffer + totalDocs, dst.res.GetUint32s(dst.allTokenLIDsCount)[:0], // all token LIDs dst.res.GetUint32s(int(dst.docsCount)), // LIDs for _all_ - res.GetBytes((int(dst.docsCount) + 1)), // buffer for sorting + tmp.GetBytes((int(dst.docsCount) + 1)), // sorting buffer ) - // Second pass: fill structures + // second pass: fill structures for i, tokenRef := range tokensRef { if borders[i]&borderToken == borderToken { // new token value @@ -240,17 +235,17 @@ func mergeTokens( dst.tokens = append(dst.tokens, allTokens[start:]) } - // Add document LIDs for the token + // add document LIDs for this token newLIDsMap := tokenRef.lidsMap() for _, oldLID := range tokenRef.LIDs() { lidsCollector.Add(newLIDsMap[oldLID]) } } - // Final token + // final token dst.tokenLIDs = append(dst.tokenLIDs, lidsCollector.GetSorted()) - // Close the last field + // close last field tid := uint32(len(dst.tokens)) - 1 fieldStr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) tr := dst.fieldsTokens[fieldStr] @@ -258,93 +253,90 @@ func mergeTokens( dst.fieldsTokens[fieldStr] = tr } -// LIDsCollector collects and efficiently sorts document LIDs for a token. 
+// LIDsCollector collects and sorts document LIDs for a token. type LIDsCollector struct { - tmp []uint32 // temporary accumulation - lids []uint32 // overall array - all []uint32 // full set of LIDs (1..N) - buf []uint8 // bitmap + totalDocs int // total docs count before deduplication + lids []uint32 // overall array + all []uint32 // full LID set (1..N) + buf []uint8 // bitmap + offset int } -// Initialize collector -func NewLIDsCollector(tmp, lids, all []uint32, buf []uint8) *LIDsCollector { +// NewLIDsCollector initializes collector. +func NewLIDsCollector(totalDocs int, lids, all []uint32, buf []uint8) *LIDsCollector { clear(buf) for i := range all { all[i] = uint32(i) + 1 } return &LIDsCollector{ - tmp: tmp[:0], - lids: lids[:0], - all: all, - buf: buf, + totalDocs: totalDocs, + lids: lids[:0], + all: all, + buf: buf, } } // Add a single LID func (s *LIDsCollector) Add(lid uint32) { - s.tmp = append(s.tmp, lid) + s.lids = append(s.lids, lid) } -// Returns sorted LID list, -// choosing the optimal algorithm depending on density. +// GetSorted returns sorted LID list using optimal algorithm. 
func (s *LIDsCollector) GetSorted() (dst []uint32) { - n := len(s.tmp) + n := len(s.lids) - s.offset - // If all documents are covered — return all - if n == len(s.all) { - s.tmp = s.tmp[:0] + // all documents covered → return all + if n == s.totalDocs { + s.lids = s.lids[:s.offset] return s.all } - // If density is high — use bitmap + dst = s.lids[s.offset:] + s.offset = len(s.lids) + + // dense case: use bitmap if 100*n/len(s.all) > 50 { - for _, v := range s.tmp { + for _, v := range dst { s.buf[v] = 1 } - s.buf[0] = 0 // avoiding a zero LID caused by duplicates - start := len(s.lids) + s.buf[0] = 0 // skip zero LID from duplicates + dst = dst[:0] for lid, ok := range s.buf { if ok == 1 { s.buf[lid] = 0 - s.lids = append(s.lids, uint32(lid)) + dst = append(dst, uint32(lid)) } } - s.tmp = s.tmp[:0] - return s.lids[start:] + return dst } - // Otherwise, normal sorting + // sparse case: sort normally if n > 1 { - slices.Sort(s.tmp) + slices.Sort(dst) } - i := 0 - for i < len(s.tmp) && s.tmp[i] == 0 { // skipping zero LIDs caused by duplicates - i++ + // skip zero LIDs from duplicates + for len(dst) > 0 && dst[0] == 0 { + dst = dst[1:] } - s.tmp = s.tmp[i:] - start := len(s.lids) - s.lids = append(s.lids, s.tmp...) - s.tmp = s.tmp[:0] - return s.lids[start:] + return dst } -// mergeBlocksOffsets merges block offsets -// and recalculates document positions considering the offset. +// mergeBlocksOffsets merges block offsets and recalculates document positions. func mergeBlocksOffsets( dst *memIndex, - res *Resources, + tmp *Resources, indexes []*memIndex, ) [][]seq.DocPos { var offset uint32 - positions := res.GetDocPosSlices(len(indexes)) + positions := tmp.GetDocPosSlices(len(indexes)) for i, index := range indexes { - // Copy block offsets + // copy block offsets dst.blocksOffsets = append(dst.blocksOffsets, index.blocksOffsets...) 
- // Recalculate document positions - positions[i] = res.GetDocPosSlice(len(index.positions))[:0] + // recalculate positions + positions[i] = tmp.GetDocPosSlice(len(index.positions))[:0] for _, p := range index.positions { oldIdx, docOffset := p.Unpack() positions[i] = append( diff --git a/frac/active/resources.go b/frac/active/resources.go index 3ad33eb8..b7d7fe67 100644 --- a/frac/active/resources.go +++ b/frac/active/resources.go @@ -1,7 +1,6 @@ package active import ( - "github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/resources" "github.com/ozontech/seq-db/seq" @@ -67,10 +66,7 @@ func (r *Resources) GetBytesSlices(s int) [][]byte { } func (r *Resources) GetBytes(s int) []byte { - b := bytespool.AcquireLen(s) - r.releases.Defer(func() { bytespool.Release(b) }) - return b.B - // return r.bytes.GetSlice(s) + return r.bytes.GetSlice(s) } func (r *Resources) GetUint32s(s int) []uint32 { diff --git a/resources/call_stack.go b/resources/call_stack.go index c60cf301..50df25d9 100644 --- a/resources/call_stack.go +++ b/resources/call_stack.go @@ -12,6 +12,5 @@ func (s *CallStack) CallAll() { for i := len(s.stack) - 1; i >= 0; i-- { s.stack[i]() } - clear(s.stack) s.stack = s.stack[:0] } diff --git a/resources/slice_allocator.go b/resources/slice_allocator.go index f5b317f3..fcf267d9 100644 --- a/resources/slice_allocator.go +++ b/resources/slice_allocator.go @@ -25,7 +25,6 @@ func NewSlicesPool[T any](pool *SizedPool[T], releases *CallStack) SlicesPool[T] } func (a SlicesPool[T]) GetSlice(size int) []T { - return make([]T, size) data := a.pool.Get(size) a.releases.Defer(func() { a.pool.Put(data) }) return data[:size] diff --git a/resources/slice_on_bytes.go b/resources/slice_on_bytes.go index 61245af7..4b827cba 100644 --- a/resources/slice_on_bytes.go +++ b/resources/slice_on_bytes.go @@ -2,8 +2,6 @@ package resources import ( "unsafe" - - "github.com/ozontech/seq-db/bytespool" ) func NewUint32s(releases *CallStack) 
SliceOnBytes[uint32] { @@ -15,28 +13,24 @@ func NewUint64s(releases *CallStack) SliceOnBytes[uint64] { } type SliceOnBytes[T any] struct { - // pool *SizedPool[byte] + pool *SizedPool[byte] releases *CallStack } func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { return SliceOnBytes[T]{ - // pool: &BytesPool, + pool: &BytesPool, releases: releases, } } func (a SliceOnBytes[T]) GetSlice(size int) []T { - var tmp T - itemSize := int(unsafe.Sizeof(tmp)) - - b := bytespool.AcquireLen(size * itemSize) - buf := b.B + var empty T + itemSize := int(unsafe.Sizeof(empty)) - // buf := a.pool.Get(size * itemSize) + buf := a.pool.Get(size * itemSize) capacity := cap(buf) / itemSize data := unsafe.Slice((*T)(unsafe.Pointer(unsafe.SliceData(buf))), capacity)[:size] - // a.releases.Defer(func() { a.pool.Put(buf) }) - a.releases.Defer(func() { bytespool.Release(b) }) + a.releases.Defer(func() { a.pool.Put(buf) }) return data }