diff --git a/asyncsearcher/async_searcher_test.go b/asyncsearcher/async_searcher_test.go index f403aa44..fce9afcd 100644 --- a/asyncsearcher/async_searcher_test.go +++ b/asyncsearcher/async_searcher_test.go @@ -9,7 +9,6 @@ import ( "github.com/stretchr/testify/require" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/mappingprovider" "github.com/ozontech/seq-db/seq" @@ -17,11 +16,11 @@ import ( type fakeFrac struct { frac.Fraction - info common.Info + info frac.Info dp fakeDP } -func (f *fakeFrac) Info() *common.Info { +func (f *fakeFrac) Info() *frac.Info { return &f.info } @@ -51,7 +50,7 @@ func TestAsyncSearcherMaintain(t *testing.T) { Retention: time.Hour, } fracs := []frac.Fraction{ - &fakeFrac{info: common.Info{Path: "1"}}, + &fakeFrac{info: frac.Info{Path: "1"}}, } r.NoError(as.StartSearch(req, fracs)) diff --git a/benchmarks/docker-compose-seqdb.yml b/benchmarks/docker-compose-seqdb.yml index f0884ca3..93bd7e7d 100644 --- a/benchmarks/docker-compose-seqdb.yml +++ b/benchmarks/docker-compose-seqdb.yml @@ -7,7 +7,7 @@ services: limits: cpus: "4" memory: "8GB" - image: ghcr.io/ozontech/seq-db:v0.61.0 + image: 'gitlab-registry.ozon.ru/sre/images/seq-db:che@sha256:82d0dd34cb5d6db9e0450bc8d2cd1d9e29414ec2ba81dc8c4ae643dea6eb1bd0' ports: - '9002:9002' volumes: diff --git a/cmd/distribution/main.go b/cmd/distribution/main.go index c8caad0b..93821ffe 100644 --- a/cmd/distribution/main.go +++ b/cmd/distribution/main.go @@ -11,7 +11,7 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/fracmanager" "github.com/ozontech/seq-db/logger" @@ -59,7 +59,7 @@ func readBlock(reader storage.IndexReader, blockIndex uint32) ([]byte, error) { return data, nil } -func loadInfo(path string) *common.Info { +func loadInfo(path string) *frac.Info { indexReader, f := getReader(path) defer f.Close() @@ -87,7 +87,7 @@ func loadInfo(path string) *common.Info { return b.Info } -func buildDist(dist *seq.MIDsDistribution, path string, _ *common.Info) { +func buildDist(dist *seq.MIDsDistribution, path string, _ *frac.Info) { blocksReader, f := getReader(path) defer f.Close() diff --git a/cmd/seq-db/seq-db.go b/cmd/seq-db/seq-db.go index 2be1c1f9..3831356b 100644 --- a/cmd/seq-db/seq-db.go +++ b/cmd/seq-db/seq-db.go @@ -23,7 +23,6 @@ import ( "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/fracmanager" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/mappingprovider" @@ -259,7 +258,8 @@ func startStore( MaintenanceDelay: 0, CacheGCDelay: 0, CacheCleanupDelay: 0, - SealParams: common.SealParams{ + MinSealFracSize: uint64(cfg.Storage.TotalSize) * consts.DefaultMinSealPercent / 100, + SealParams: frac.SealParams{ IDsZstdLevel: cfg.Compression.SealedZstdCompressionLevel, LIDsZstdLevel: cfg.Compression.SealedZstdCompressionLevel, TokenListZstdLevel: cfg.Compression.SealedZstdCompressionLevel, diff --git a/config/shared.go b/config/shared.go index 83abec44..696ead8f 100644 --- a/config/shared.go +++ b/config/shared.go @@ -8,7 +8,7 @@ var ( ReaderWorkers int CaseSensitive = false - SkipFsync = false + SkipFsync = true MaxFetchSizeBytes = 4 * units.MiB diff --git a/frac/active/active.go b/frac/active/active.go 
new file mode 100644 index 00000000..401cc3d7 --- /dev/null +++ b/frac/active/active.go @@ -0,0 +1,294 @@ +package active + +import ( + "context" + "io" + "os" + "sync" + "time" + + "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/config" + "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active_old" + "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/metric" + "github.com/ozontech/seq-db/metric/stopwatch" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/util" + "go.uber.org/zap" +) + +type Active struct { + Config *frac.Config + + BaseFileName string + + indexer *Indexer + + indexes *memIndexPool + merger *mergeManager + + docsFile *os.File + docsReader storage.DocsReader + sortReader storage.DocsReader + docsCache *cache.Cache[[]byte] + sortCache *cache.Cache[[]byte] + + metaFile *os.File + metaReader storage.DocBlocksReader + + writer *active_old.Writer +} + +func New( + baseFileName string, + cfg *frac.Config, + workers int, + readLimiter *storage.ReadLimiter, + docsCache *cache.Cache[[]byte], + sortCache *cache.Cache[[]byte], +) *Active { + docsFile, docsStats := util.MustOpenFile(baseFileName+consts.DocsFileSuffix, config.SkipFsync) + metaFile, metaStats := util.MustOpenFile(baseFileName+consts.MetaFileSuffix, config.SkipFsync) + + info := frac.NewInfo(baseFileName, uint64(docsStats.Size()), uint64(metaStats.Size())) + indexes := NewIndexPool(info) + + f := &Active{ + BaseFileName: baseFileName, + Config: cfg, + indexer: NewIndexer(util.NewSemaphore(workers)), + merger: newMergeManager(indexes, util.NewSemaphore(workers)), + indexes: indexes, + + docsFile: docsFile, + docsCache: docsCache, + sortCache: sortCache, + docsReader: storage.NewDocsReader(readLimiter, docsFile, docsCache), + sortReader: storage.NewDocsReader(readLimiter, docsFile, sortCache), + + metaFile: metaFile, + metaReader: storage.NewDocBlocksReader(readLimiter, metaFile), + + writer: active_old.NewWriter(docsFile, metaFile, docsStats.Size(), metaStats.Size(), config.SkipFsync), + } + + logger.Info("active fraction created", zap.String("fraction", baseFileName)) + + return f +} + +func (f *Active) Replay(ctx context.Context) error { + + info := f.indexes.info + + logger.Info("start replaying...", zap.String("name", info.Name())) + + t := time.Now() + + offset := uint64(0) + step := info.MetaOnDisk / 10 + wg := sync.WaitGroup{} + next := step + +out: + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + meta, metaSize, err := f.metaReader.ReadDocBlock(int64(offset)) + if err == io.EOF { + if metaSize != 0 { + logger.Warn("last meta block is partially written, skipping it") + } + break out + } + if err != nil && err != io.EOF { + return err + } + + if offset > next { + next += step + progress := float64(offset) / float64(info.MetaOnDisk) * 100 + logger.Info("replaying batch, meta", + zap.String("name", info.Name()), + zap.Uint64("from", offset), + zap.Uint64("to", offset+metaSize), + zap.Uint64("target", info.MetaOnDisk), + util.ZapFloat64WithPrec("progress_percentage", progress, 2), + ) + } + offset += metaSize + + wg.Add(1) + f.indexer.Index(meta, func(idx *memIndex, err error) { + f.AddIndex(idx, 0, 0, err) + wg.Done() + }) + } + } + + wg.Wait() + + tookSeconds := util.DurationToUnit(time.Since(t), "s") + throughputRaw := util.SizeToUnit(info.DocsRaw, "mb") / tookSeconds + throughputMeta := 
util.SizeToUnit(info.MetaOnDisk, "mb") / tookSeconds + logger.Info("active fraction replayed", + zap.String("name", info.Name()), + zap.Uint32("docs_total", info.DocsTotal), + util.ZapUint64AsSizeStr("docs_size", info.DocsOnDisk), + util.ZapFloat64WithPrec("took_s", tookSeconds, 1), + util.ZapFloat64WithPrec("throughput_raw_mb_sec", throughputRaw, 1), + util.ZapFloat64WithPrec("throughput_meta_mb_sec", throughputMeta, 1), + ) + return nil +} + +func (f *Active) Append(docs, meta []byte, wg *sync.WaitGroup) (err error) { + sw := stopwatch.New() + ma := sw.Start("append") + if err = f.writer.Write(docs, meta, sw); err != nil { + ma.Stop() + return err + } + + mi := sw.Start("send_to_indexer") + + f.indexer.Index(meta, func(idx *memIndex, err error) { + f.AddIndex(idx, uint64(len(docs)), uint64(len(meta)), err) + wg.Done() + }) + + mi.Stop() + + ma.Stop() + sw.Export(bulkStagesSeconds) + return nil +} + +func (f *Active) AddIndex(idx *memIndex, docsLen, metaLen uint64, err error) { + if err != nil { + logger.Fatal("bulk indexing error", zap.Error(err)) + } + f.indexes.Add(idx, docsLen, metaLen) + f.merger.requestMerge() +} + +func (f *Active) String() string { + return frac.FracToString(f, "active") +} + +func (f *Active) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { + sw := stopwatch.New() + defer sw.Export(fetcherStagesSec) + + t := sw.Start("total") + + ss, release := f.indexes.Snapshot() + defer release() + + if ss.info.DocsTotal == 0 { // it is empty active fraction state + return nil, nil + } + + res := make([][]byte, len(ids)) + for _, index := range ss.indexes { + fetchIndex := fetchIndex{index: index, docsReader: &f.docsReader} + if err := processor.IndexFetch(ids, sw, &fetchIndex, res); err != nil { + return nil, err + } + } + t.Stop() + + return res, nil +} + +func (f *Active) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { + ss, release := f.indexes.Snapshot() + defer release() + + if ss.info.DocsTotal == 0 { // it is empty active fraction state + metric.CountersTotal.WithLabelValues("empty_data_provider").Inc() + return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil + } + + aggLimits := processor.AggLimits(f.Config.Search.AggLimits) + + // Limit the parameter range to data boundaries to prevent histogram overflow + params.From = max(params.From, ss.info.From) + params.To = min(params.To, ss.info.To) + + sw := stopwatch.New() + defer sw.Export(getActiveSearchMetric(params)) + + t := sw.Start("total") + qprs := make([]*seq.QPR, 0, len(ss.indexes)) + for _, index := range ss.indexes { + si := searchIndex{ctx: ctx, index: index} + qpr, err := processor.IndexSearch(ctx, params, &si, aggLimits, sw) + if err != nil { + return nil, err + } + qprs = append(qprs, qpr) + } + res := processor.MergeQPRs(qprs, params) + res.IDs.ApplyHint(ss.info.Name()) + t.Stop() + + return res, nil +} + +func (f *Active) Info() *frac.Info { + return f.indexes.Info() +} + +func (f *Active) Contains(id seq.MID) bool { + return f.Info().IsIntersecting(id, id) +} + +func (f *Active) IsIntersecting(from, to seq.MID) bool { + return f.Info().IsIntersecting(from, to) +} + +func (f *Active) Release() { + f.releaseMem() + + if !f.Config.KeepMetaFile { + util.RemoveFile(f.metaFile.Name()) + } + + if !f.Config.SkipSortDocs { + // we use sorted docs in sealed fraction so we can remove original docs of active fraction + util.RemoveFile(f.docsFile.Name()) + } + +} + +func (f *Active) Suicide() { + f.releaseMem() + + util.RemoveFile(f.metaFile.Name()) + 
util.RemoveFile(f.docsFile.Name()) + util.RemoveFile(f.BaseFileName + consts.SdocsFileSuffix) +} + +func (f *Active) releaseMem() { + f.writer.Stop() + f.merger.Stop() + f.indexes.Release() + + f.docsCache.Release() + f.sortCache.Release() + + if err := f.metaFile.Close(); err != nil { + logger.Error("can't close meta file", zap.String("frac", f.BaseFileName), zap.Error(err)) + } + if err := f.docsFile.Close(); err != nil { + logger.Error("can't close docs file", zap.String("frac", f.BaseFileName), zap.Error(err)) + } +} diff --git a/frac/active/active_test.go b/frac/active/active_test.go new file mode 100644 index 00000000..6fedcd80 --- /dev/null +++ b/frac/active/active_test.go @@ -0,0 +1,151 @@ +package active + +import ( + "encoding/binary" + "fmt" + "testing" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/indexer" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/tokenizer" + "github.com/stretchr/testify/assert" +) + +func TestMerge_DuplicateIDs(t *testing.T) { + // create several indexes with overlapping IDs + idx1 := createTestIndex(t, []seq.ID{ + {MID: 100, RID: 1}, // ID 100 will be duplicated + {MID: 101, RID: 2}, + }) + + idx2 := createTestIndex(t, []seq.ID{ + {MID: 100, RID: 1}, // duplicate of ID 100 from the first index + {MID: 102, RID: 3}, + }) + + idx3 := createTestIndex(t, []seq.ID{ + {MID: 103, RID: 4}, + {MID: 104, RID: 5}, + }) + + // call mergeIndexes with indexes containing duplicated IDs + indexes := []*memIndex{idx1, idx2, idx3} + result := mergeIndexes(indexes) + + expectedIDs := []seq.ID{ + {MID: 104, RID: 5}, + {MID: 103, RID: 4}, + {MID: 102, RID: 3}, + {MID: 101, RID: 2}, + {MID: 100, RID: 1}, + } + assert.Equal(t, expectedIDs, result.ids) + assert.Equal(t, len(expectedIDs), int(result.docsCount)) + assert.Equal(t, len(expectedIDs)+1, int(result.docsSize), "we can't adjust the total size during deduplication") + + // verify that the _all_ token is correct + allRange := result.fieldsTokens[seq.TokenAll] + allTID := allRange.start + assert.Equal(t, uint32(1), allRange.count) + assert.Empty(t, result.tokenLIDs[allTID], "empty list means ALL documents") + + // verify that the foo:bar token is correct + fooRange := result.fieldsTokens["foo"] + fooTID := fooRange.start + assert.Equal(t, uint32(1), fooRange.count) + assert.Equal(t, []uint32{1, 2, 3, 4, 5}, result.tokenLIDs[fooTID], "") +} + +func createTestIndex(t *testing.T, ids []seq.ID) *memIndex { + meta := []byte{} + for i, id := range ids { + md := indexer.MetaData{ + ID: id, + Size: 1, + Tokens: []tokenizer.MetaToken{ + {Key: []byte("foo"), Value: []byte("bar")}, + {Key: []byte("num"), Value: []byte(fmt.Sprintf("token_%d", i))}, + {Key: []byte("foo"), Value: []byte("bar")}, // duplicate + }, + } + tmp := md.MarshalBinaryTo(nil) + meta = binary.LittleEndian.AppendUint32(meta, uint32(len(tmp))) + meta = append(meta, tmp...) 
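		// editor note: the two appends above frame each entry as [uint32 LE size][payload],
		// the same layout that decodeMetadata walks when building a memIndex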
+ } + idx, err := NewMemIndex(storage.CompressDocBlock(meta, nil, 1)) + assert.NoError(t, err) + return idx +} + +func TestMemIndexPool_Add_DuplicateBulk(t *testing.T) { + idx1 := createTestIndex(t, []seq.ID{ + {MID: 100, RID: 1}, + {MID: 101, RID: 2}, + }) + + idx2 := createTestIndex(t, []seq.ID{ + {MID: 102, RID: 3}, + {MID: 103, RID: 4}, + }) + + idx3 := createTestIndex(t, []seq.ID{ + {MID: 102, RID: 3}, + {MID: 103, RID: 4}, + }) + + assert.NotEqual(t, idx1.hash, idx2.hash) + assert.Equal(t, idx2.hash, idx3.hash) + + info := frac.NewInfo("test", 0, 0) + pool := NewIndexPool(info) + + // add the first index + pool.Add(idx1, 10, 10) + + // verify the index was added + snapshot1, release1 := pool.Snapshot() + assert.Len(t, snapshot1.indexes, 1) + release1() + + // add the second index + pool.Add(idx2, 10, 10) + + // verify the index was added + snapshot2, release2 := pool.Snapshot() + assert.Len(t, snapshot2.indexes, 2) + release2() + + // add the third index with the same hash + pool.Add(idx3, 10, 10) + + // verify the third index was NOT added (should be ignored) + snapshot3, release3 := pool.Snapshot() + assert.Len(t, snapshot3.indexes, 2, "third index with the same hash should not be added") + + // verify that the first and second indexes remain + assert.Equal(t, seq.MID(101), snapshot3.indexes[0].ids[0].MID) + assert.Equal(t, seq.MID(103), snapshot3.indexes[1].ids[0].MID) + release3() + + // verify statistics - DocsTotal should only account for the first index + assert.Equal(t, uint32(4), info.DocsTotal) + assert.Equal(t, uint64(4), info.DocsRaw) + assert.Equal(t, uint64(20), info.DocsOnDisk) + assert.Equal(t, uint64(20), info.MetaOnDisk) + assert.Equal(t, seq.MID(100), info.From) + assert.Equal(t, seq.MID(103), info.To) +} + +func TestIndexer_TokenDeduplication(t *testing.T) { + idx := createTestIndex(t, []seq.ID{ + {MID: 100, RID: 1}, + {MID: 101, RID: 2}, + }) + assert.Len(t, idx.tokenLIDs[idx.fieldsTokens[seq.TokenAll].start], 0) + assert.Len(t, idx.tokenLIDs[idx.fieldsTokens["foo"].start], 2) + assert.Len(t, idx.tokenLIDs[idx.fieldsTokens["num"].start+0], 1) + assert.Len(t, idx.tokenLIDs[idx.fieldsTokens["num"].start+1], 1) + assert.Equal(t, 4, idx.allTokenLIDsCount) +} diff --git a/frac/active/data_provider.go b/frac/active/data_provider.go new file mode 100644 index 00000000..0f1730d2 --- /dev/null +++ b/frac/active/data_provider.go @@ -0,0 +1,160 @@ +package active + +import ( + "context" + "fmt" + "sort" + + "github.com/ozontech/seq-db/frac/sealed/lids" + "github.com/ozontech/seq-db/node" + "github.com/ozontech/seq-db/parser" + "github.com/ozontech/seq-db/pattern" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" +) + +// fetchIndex is used during the fetch phase: +// reading data and document positions. +type fetchIndex struct { + index *memIndex + docsReader *storage.DocsReader +} + +// GetBlocksOffsets returns the offset of a block by its index. +func (si *fetchIndex) GetBlocksOffsets(blockIndex uint32) uint64 { + return si.index.blocksOffsets[blockIndex] +} + +// GetDocPos returns document positions for the given IDs. +// If a document is not found, DocPosNotFound is returned. 
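// Illustrative example (editor's, with made-up IDs): if the index holds only
// {MID: 100, RID: 1}, then GetDocPos([]seq.ID{{MID: 100, RID: 1}, {MID: 1, RID: 1}})
// returns that document's stored position followed by seq.DocPosNotFound.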
+func (si *fetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos { + docsPos := make([]seq.DocPos, len(ids)) + for i, id := range ids { + if lid, ok := si.index.GetLIDByID(id); ok { + docsPos[i] = si.index.positions[lid-1] + continue + } + docsPos[i] = seq.DocPosNotFound + } + return docsPos +} + +// ReadDocs reads documents from storage +// using the block offset and document offsets inside the block. +func (si *fetchIndex) ReadDocs(blockOffset uint64, docOffsets []uint64) ([][]byte, error) { + return si.docsReader.ReadDocs(blockOffset, docOffsets) +} + +// searchIndex is used during the search phase: +// matching tokens, documents, and query conditions. +type searchIndex struct { + ctx context.Context + index *memIndex +} + +// GetValByTID returns the token value by its TID. +func (si *searchIndex) GetValByTID(tid uint32) []byte { + return si.index.tokens[tid] +} + +// GetTIDsByTokenExpr finds TIDs by a token expression from the query. +func (si *searchIndex) GetTIDsByTokenExpr(t parser.Token) ([]uint32, error) { + field := parser.GetField(t) + tp := si.index.getTokenProvider(field) + + tids, err := pattern.Search(si.ctx, t, tp) + if err != nil { + return nil, fmt.Errorf("search error: %w field: %s, query: %s", err, field, parser.GetHint(t)) + } + return tids, nil +} + +// GetLIDsFromTIDs converts a list of TIDs into a list of nodes (Node), +// each representing a set of local document identifiers (LIDs) +// that satisfy the token. +func (si *searchIndex) GetLIDsFromTIDs(tids []uint32, _ lids.Counter, minLID, maxLID uint32, order seq.DocsOrder) []node.Node { + nodes := make([]node.Node, 0, len(tids)) + for _, tid := range tids { + nodes = append(nodes, si.getTIDLIDsNode(tid, minLID, maxLID, order)) + } + return nodes +} + +// getTIDLIDsNode creates a node.Node for a single TID. +func (si *searchIndex) getTIDLIDsNode(tid, minLID, maxLID uint32, order seq.DocsOrder) node.Node { + tidLIDs := si.index.tokenLIDs[tid] + if len(tidLIDs) == 0 { // empty list means ALL documents + return node.NewRange(minLID, maxLID, order.IsReverse()) + } + // Regular token — static list of LIDs + return node.NewStatic(narrowDownLIDs(tidLIDs, minLID, maxLID), order.IsReverse()) +} + +// narrowDownLIDs restricts a sorted list of LIDs to the range [minLID, maxLID]. +func narrowDownLIDs(tidLIDs []uint32, minLID, maxLID uint32) []uint32 { + n := len(tidLIDs) + + left := sort.Search(n, func(i int) bool { + return tidLIDs[i] >= minLID + }) + right := sort.Search(n, func(i int) bool { + return tidLIDs[i] > maxLID + }) + + if left > right { + return nil + } + return tidLIDs[left:right] +} + +// LessOrEqual compares a document by LID with the given ID. +func (si *searchIndex) LessOrEqual(lid seq.LID, id seq.ID) bool { + checkedMID := si.GetMID(lid) + if checkedMID == id.MID { + return si.GetRID(lid) <= id.RID + } + return checkedMID < id.MID +} + +// GetMID returns the document MID by LID. +func (si *searchIndex) GetMID(lid seq.LID) seq.MID { + return si.index.ids[lid-1].MID +} + +// GetRID returns the document RID by LID. +func (si *searchIndex) GetRID(lid seq.LID) seq.RID { + return si.index.ids[lid-1].RID +} + +// Len returns the number of documents + 1 (LID starts from 1). +func (si *searchIndex) Len() int { + return len(si.index.ids) + 1 +} + +// tokenProvider is an adapter for pattern.Search. +// It provides access to tokens in the specified TID range. +type tokenProvider struct { + firstTID uint32 + lastTID uint32 + tokens [][]byte +} + +// GetToken returns a token by TID. 
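// Editor note: p.tokens is the index-wide token slice, so tid is a global TID; callers
// (pattern.Search via getTokenProvider) are expected to stay within [FirstTID(), LastTID()],
// which bound the owning field's token range.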
+func (p *tokenProvider) GetToken(tid uint32) []byte { + return p.tokens[tid] +} + +// FirstTID returns the minimum TID. +func (p *tokenProvider) FirstTID() uint32 { + return p.firstTID +} + +// LastTID returns the maximum TID. +func (p *tokenProvider) LastTID() uint32 { + return p.lastTID +} + +// Ordered reports that tokens are sorted. +func (p *tokenProvider) Ordered() bool { + return true +} diff --git a/frac/active/indexer.go b/frac/active/indexer.go new file mode 100644 index 00000000..5c4f49dd --- /dev/null +++ b/frac/active/indexer.go @@ -0,0 +1,362 @@ +package active + +import ( + "bytes" + "cmp" + "encoding/binary" + "hash/fnv" + "slices" + "unsafe" + + "github.com/ozontech/seq-db/indexer" + "github.com/ozontech/seq-db/metric/stopwatch" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/tokenizer" + "github.com/ozontech/seq-db/util" +) + +const uint32Size = uint32(unsafe.Sizeof(uint32(0))) + +// Indexer indexes documents with concurrency limitation +type Indexer struct { + workerPool WorkerLimiter +} + +// NewIndexer creates a new indexer with specified number of workers +func NewIndexer(workerPool WorkerLimiter) *Indexer { + return &Indexer{ + workerPool: workerPool, + } +} + +// indexerBuffer is a temporary reusable buffer used during index construction to avoid allocations. +// it holds intermediate data structures that are needed during processing but not in the final index. +type indexerBuffer struct { + sizes []uint32 + fields []string + fieldTIDs []uint32 + tokens []tokenizer.MetaToken + tokenMap map[tokenStr]uint32 +} + +// Index starts asynchronous document indexing +func (idx *Indexer) Index(block storage.DocBlock, apply func(index *memIndex, err error)) { + idx.workerPool.Acquire() + go func() { + apply(NewMemIndex(block)) + idx.workerPool.Release() + }() +} + +// NewMemIndex creates an in-memory index from a document block +func NewMemIndex(data storage.DocBlock) (*memIndex, error) { + sw := stopwatch.New() + + tmp, release := NewResources() + defer release() + + // decompress metadata + payload, err := decompressMeta(tmp, data, sw) + if err != nil { + return nil, err + } + + buf := tmp.GetBuffer() + + // decode metadata + meta, err := decodeMetadata(tmp, buf, payload, sw) + if err != nil { + return nil, err + } + // initialize index + idx := newMemIndex() + idx.docsCount = uint32(len(meta)) + idx.ids = idx.res.GetIDs(len(meta)) + idx.positions = idx.res.GetDocPosSlice(len(meta)) + idx.blocksOffsets = idx.res.GetUint64s(1) // only one block per bulk + idx.blocksOffsets[0] = data.GetExt2() + + // extract tokens from metadata + tids, lids, tokens, err := extractTokens(idx, tmp, buf, meta) + if err != nil { + return nil, err + } + + // group documents by token + tokenLIDs := groupLIDsByTID(idx, tmp, tids, lids, len(tokens)) + + // organize tokens and fields + organizeTokens(idx, tmp, buf, tokens, tokenLIDs) + + return idx, nil +} + +// tokenStr represents a unique token as a (field, value) pair. +// used as a map key during token deduplication. 
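// For example (illustrative field names), ("level", "error") and ("message", "error")
// are distinct keys even though the values match: deduplication is per (field, value)
// pair, not per value.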
+type tokenStr struct { + value string + field string +} + +func toToken(t tokenizer.MetaToken) tokenStr { + return tokenStr{ + value: util.ByteToStringUnsafe(t.Value), + field: util.ByteToStringUnsafe(t.Key), + } +} + +// extractTokens extracts tokens from document metadata +func extractTokens( + idx *memIndex, + tmp *Resources, + buf *indexerBuffer, + meta []indexer.MetaData, +) ([]uint32, []uint32, []tokenStr, error) { + var docOffset uint64 + var totalTokens uint32 + + // calculate document positions in the original block + // each document is stored as: [size: uint32][data: size bytes] + positions := tmp.GetDocPosSlice(len(meta)) + prev := seq.PackDocPos(0, docOffset) + + for i := range meta { + docMeta := meta[i] + if docMeta.Size > 0 { + prev = seq.PackDocPos(0, docOffset) + docOffset += uint64(docMeta.Size) + uint64(uint32Size) + } + positions[i] = prev + totalTokens += docMeta.TokensCount() + } + + // create ordering by document ID (descending) + // we need to map global document IDs to local IDs (LIDs) + order := tmp.GetUint32s(len(meta)) + for i := range order { + order[i] = uint32(i) + } + slices.SortFunc(order, func(a, b uint32) int { + return seq.Compare(meta[b].ID, meta[a].ID) + }) + + hash := fnv.New64a() + var idBinary [16]byte + + // fill index structures with sorted documents + for i, origIdx := range order { + docMeta := meta[origIdx] + idx.ids[i] = docMeta.ID + idx.positions[i] = positions[origIdx] + idx.docsSize += uint64(docMeta.Size) + hash.Write(docMeta.ID.AppendBinary(idBinary[:0])) + } + idx.hash = hash.Sum64() + + // extract and process tokens from all documents + var err error + var token tokenStr + + // allocate slices for token-document relationships + lids := tmp.GetUint32s(int(totalTokens))[:0] // local document ID for each token occurrence + tids := tmp.GetUint32s(int(totalTokens))[:0] // token ID for each occurrence + + buf.tokenMap[tokenStr{field: seq.TokenAll}] = 0 // reserve ALL token (just for proper sealing) + + // process documents in ID-sorted order + for i, origIdx := range order { + docMeta := meta[origIdx] + + // decode tokens for this document + if buf.tokens, err = docMeta.DecodeTokens(buf.tokens[:0]); err != nil { + return nil, nil, nil, err + } + + // process each token in the document + lid := uint32(i + 1) + for _, t := range buf.tokens { + if bytes.Equal(t.Key, seq.AllTokenName) { + continue + } + token = toToken(t) + tid, exists := buf.tokenMap[token] + if !exists { + tid = uint32(len(buf.tokenMap)) // assign new token ID + buf.tokenMap[token] = tid + } + tids = append(tids, tid) + lids = append(lids, lid) // store lid+1 (1-based indexing for internal use) + } + } + + // create reverse mapping: tokenID -> tokenKey + tokens := tmp.GetTokens(len(buf.tokenMap)) + for key, tokenID := range buf.tokenMap { + tokens[tokenID] = key + } + + return tids, lids, tokens, nil +} + +// groupLIDsByTID groups document IDs by token +// input: flat arrays of (tid, lid) pairs +// output: 2D array where tokenLIDs[tid] = []lid +func groupLIDsByTID(idx *memIndex, tmp *Resources, tids, lids []uint32, tokenCount int) [][]uint32 { + // phase 1: count documents per token + counts := tmp.GetUint32s(tokenCount) + clear(counts) + for _, tid := range tids { + counts[tid]++ + } + + // phase 2: allocate slices for each token group + // we use a single large buffer and slice it for efficiency + tokenLIDs := tmp.GetUint32Slices(tokenCount) + allTokenLIDs := idx.res.GetUint32s(len(lids)) + idx.allTokenLIDsCount = len(lids) + + tokenLIDs = tokenLIDs[:len(counts)] + for tid, 
count := range counts { + tokenLIDs[tid] = allTokenLIDs[:count][:0] + allTokenLIDs = allTokenLIDs[count:] + } + + // phase 3: populate groups with LIDs + lids = lids[:len(tids)] + for i, tid := range tids { + if len(tokenLIDs[tid]) > 0 { + if lids[i] == lastLID(tokenLIDs[tid]) { + // tokens deduplication (the same token can occurs a few times for one doc) + idx.allTokenLIDsCount-- + continue + } + } + tokenLIDs[tid] = append(tokenLIDs[tid], lids[i]) + } + + return tokenLIDs +} + +func lastLID(s []uint32) uint32 { + return s[len(s)-1] +} + +// organizeTokens organizes tokens and fields in the index with proper sorting +func organizeTokens(idx *memIndex, tmp *Resources, buf *indexerBuffer, tokens []tokenStr, tokenLIDs [][]uint32) { + tokenSize := 0 + order := tmp.GetUint32s(len(tokens)) + order = order[:len(tokens)] + for i, t := range tokens { + order[i] = uint32(i) + tokenSize += len(t.value) + } + + // create ordering for sorting tokens + // we'll sort by (field, value) to group tokens by field + slices.SortFunc(order, func(a, b uint32) int { + tokenA, tokenB := tokens[a], tokens[b] + return cmp.Or( + cmp.Compare(tokenA.field, tokenB.field), + cmp.Compare(tokenA.value, tokenB.value), + ) + }) + + fieldSize := 0 + prevField := "" + + // prepare buffers for sorted data + tokenBuffer := idx.res.GetBytes(tokenSize)[:0] + idx.tokenLIDs = idx.res.GetUint32Slices(len(order)) + idx.tokens = idx.res.GetBytesSlices(len(order)) + + // process tokens in sorted order + for tid, origIdx := range order { + token := tokens[origIdx] + + // detect field boundaries + // when field name changes, record the field and its first token position + if token.field != prevField || prevField == "" { + fieldSize += len(token.field) + buf.fields = append(buf.fields, token.field) + buf.fieldTIDs = append(buf.fieldTIDs, uint32(tid)) + } + prevField = token.field + + // copy token value to buffer and keep reference + start := len(tokenBuffer) + tokenBuffer = append(tokenBuffer, token.value...) + + // store in sorted arrays + // note: we use original tokenID as index to preserve tokenID->data mapping + idx.tokens[tid] = tokenBuffer[start:] + idx.tokenLIDs[tid] = tokenLIDs[origIdx] + } + // add sentinel value for easier range calculation + buf.fieldTIDs = append(buf.fieldTIDs, uint32(len(tokens))) + + // organize fields + fieldBuffer := idx.res.GetBytes(fieldSize)[:0] + idx.fields = idx.res.GetBytesSlices(len(buf.fields)) + + idx.fieldsTokens = make(map[string]tokenRange, len(buf.fields)) + + for i, field := range buf.fields { + // copy field name to buffer + start := len(fieldBuffer) + fieldBuffer = append(fieldBuffer, field...) 
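		// editor note: fieldBuffer was sized with GetBytes(fieldSize) above, so (assuming the
		// returned capacity is at least fieldSize) this append never reallocates and earlier
		// sub-slices of fieldBuffer stay valid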
+ idx.fields[i] = fieldBuffer[start:] + + // calculate token range for this field + // each field has continuous range of token IDs in sorted order + startTID := buf.fieldTIDs[i] + endTID := buf.fieldTIDs[i+1] + idx.fieldsTokens[util.ByteToStringUnsafe(fieldBuffer[start:])] = tokenRange{ + start: startTID, + count: endTID - startTID, + } + } +} + +// decompressMeta decompresses metadata from block +func decompressMeta(res *Resources, block storage.DocBlock, sw *stopwatch.Stopwatch) ([]byte, error) { + m := sw.Start("decompress_meta") + defer m.Stop() + + // allocate exact size needed for compressed data + buffer := res.GetBytes(int(block.RawLen())) + payload, err := block.DecompressTo(buffer) + if err != nil { + return nil, err + } + return payload, nil +} + +// decodeMetadata decodes document metadata from binary format +// format: [size: uint32][data: size bytes][size: uint32][data: size bytes]... +func decodeMetadata(tmp *Resources, buf *indexerBuffer, payload []byte, sw *stopwatch.Stopwatch) ([]indexer.MetaData, error) { + m := sw.Start("decode_meta") + defer m.Stop() + + // first pass: scan to determine sizes of each metadata entry + var offset uint32 + for offset < uint32(len(payload)) { + size := binary.LittleEndian.Uint32(payload[offset:]) + offset += uint32Size + size + buf.sizes = append(buf.sizes, size) + } + + // second pass: decode each metadata entry + meta := tmp.GetMetadata(len(buf.sizes)) + for i, size := range buf.sizes { + // skip size field to get to actual data + data := payload[uint32Size : size+uint32(uint32Size)] + if err := meta[i].UnmarshalBinaryLazy(data); err != nil { + return nil, err + } + // move to next entry + payload = payload[size+uint32(uint32Size):] + } + + return meta, nil +} diff --git a/frac/active/indexer_test.go b/frac/active/indexer_test.go new file mode 100644 index 00000000..3938a8b8 --- /dev/null +++ b/frac/active/indexer_test.go @@ -0,0 +1,234 @@ +package active + +import ( + "bytes" + "os" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/alecthomas/units" + "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/config" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/indexer" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/tests/common" + "github.com/ozontech/seq-db/tokenizer" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zapcore" +) + +func BenchmarkIndexer(b *testing.B) { + logger.SetLevel(zapcore.FatalLevel) + + allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) + readers := splitLogsToBulks(allLogs, 1000) + assert.NoError(b, err) + + processor := getTestProcessor() + + n := 2 + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { + for _, readNext := range readers { + _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() + active := New( + filepath.Join(b.TempDir(), "test"), + &frac.Config{}, + config.NumCPU, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + ) + b.StartTimer() + + wg := sync.WaitGroup{} + for _, meta := range allMeta { + wg.Add(1) + active.indexer.Index(meta, func(idx *memIndex, err error) { + active.indexes.Add(idx, 0, 0) + wg.Done() + 
}) + } + wg.Wait() + } +} + +func BenchmarkMerge(b *testing.B) { + logger.SetLevel(zapcore.FatalLevel) + + allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) + readers := splitLogsToBulks(allLogs, 1000) + assert.NoError(b, err) + + processor := getTestProcessor() + + n := 2 + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { + for _, readNext := range readers { + _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() + + active := New( + filepath.Join(b.TempDir(), "test"), + &frac.Config{}, + config.NumCPU, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + ) + + wg := sync.WaitGroup{} + for _, meta := range allMeta { + wg.Add(1) + active.indexer.Index(meta, func(idx *memIndex, err error) { + active.indexes.Add(idx, 0, 0) + wg.Done() + }) + } + wg.Wait() + b.StartTimer() + + active.merger.ForceMergeAll() + } +} + +func defaultSealingParams() frac.SealParams { + const minZstdLevel = 1 + return frac.SealParams{ + IDsZstdLevel: minZstdLevel, + LIDsZstdLevel: minZstdLevel, + TokenListZstdLevel: minZstdLevel, + DocsPositionsZstdLevel: minZstdLevel, + TokenTableZstdLevel: minZstdLevel, + DocBlocksZstdLevel: minZstdLevel, + DocBlockSize: 128 * int(units.KiB), + } +} + +func BenchmarkFullWrite(b *testing.B) { + logger.SetLevel(zapcore.FatalLevel) + + allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) + readers := splitLogsToBulks(allLogs, 1000) + assert.NoError(b, err) + + processor := getTestProcessor() + + n := 2 + allDocs := make([][]byte, 0, len(readers)*n) + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { + for _, readNext := range readers { + _, docs, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + allDocs = append(allDocs, storage.CompressDocBlock(docs, nil, 1)) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) + } + } + + params := defaultSealingParams() + + for b.Loop() { + active := New( + filepath.Join(b.TempDir(), "test"), + &frac.Config{SkipSortDocs: true}, + config.NumCPU, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + ) + + wg := sync.WaitGroup{} + for i, meta := range allMeta { + wg.Add(1) + go func() { + err := active.Append(allDocs[i], meta, &wg) + assert.NoError(b, err) + }() + } + wg.Wait() + + src, err := NewSealingSource(active, params) + require.NoError(b, err) + sealed, err := sealing.Seal(src, params) + require.NoError(b, err) + assert.Greater(b, int(sealed.Info.DocsTotal), 0) + active.Release() + } +} + +func readFileAllAtOnce(filename string) ([][]byte, error) { + content, err := os.ReadFile(filename) + if err != nil { + return nil, err + } + lines := bytes.Split(content, []byte{'\n'}) + if len(lines) > 0 && len(lines[len(lines)-1]) == 0 { + lines = lines[:len(lines)-1] + } + return lines, nil +} + +func splitLogsToBulks(data [][]byte, bulkSize int) []func() ([]byte, error) { + funcs := []func() ([]byte, error){} + for len(data) > 0 { + size := min(len(data), bulkSize) + funcs = append(funcs, testBufReader(data[0:size])) + data = data[size:] + } + return funcs +} + +func testBufReader(data [][]byte) func() ([]byte, error) { + orig := data + return func() ([]byte, error) { + if len(data) == 0 { + data = orig + return nil, nil + } + line := data[0] + data = data[1:] + return line, 
nil + } +} + +func getTestProcessor() *indexer.Processor { + mapping := seq.Mapping{ + "clientip": seq.NewSingleType(seq.TokenizerTypeKeyword, "clientip", 1024), + "request": seq.NewSingleType(seq.TokenizerTypeText, "request", 1024), + "status": seq.NewSingleType(seq.TokenizerTypeKeyword, "status", 1024), + "size": seq.NewSingleType(seq.TokenizerTypeKeyword, "size", 1024), + } + + tokenizers := map[seq.TokenizerType]tokenizer.Tokenizer{ + seq.TokenizerTypeText: tokenizer.NewTextTokenizer(1024, false, true, 8192), + seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(1024, false, true), + seq.TokenizerTypePath: tokenizer.NewPathTokenizer(1024, false, true), + seq.TokenizerTypeExists: tokenizer.NewExistsTokenizer(), + } + + return indexer.NewProcessor(mapping, tokenizers, 0, 0, 0) +} diff --git a/frac/active/iterators.go b/frac/active/iterators.go new file mode 100644 index 00000000..db98a755 --- /dev/null +++ b/frac/active/iterators.go @@ -0,0 +1,182 @@ +package active + +import "github.com/ozontech/seq-db/seq" + +// OrderedStream - interface for iterators with ordered elements +type OrderedStream[T any] interface { + Next() (T, bool) // Returns the next element and a flag indicating if an element exists +} + +// MergeSortedStreams - performs K-way merging of sorted iterators (merge sort at iterator level) +// Uses a "divide and conquer" strategy for efficient merging +func MergeSortedStreams[T any](src []OrderedStream[T], cmp func(T, T) int) OrderedStream[T] { + n := len(src) + // Base case of recursion: if there's only one iterator + if n == 1 { + return src[0] + } + // Recursively split the iterator array in half + h := n / 2 + src1 := MergeSortedStreams(src[:h], cmp) // Left half + src2 := MergeSortedStreams(src[h:], cmp) // Right half + // Merge the two sorted halves + return NewTwoWayMergeStream(src1, src2, cmp) +} + +// TwoWayMergeStream - implementation of an iterator for merging two sorted streams +type TwoWayMergeStream[T any] struct { + v1, v2 T // Current values from each source + has1, has2 bool // Flags indicating the presence of current values + src1, src2 OrderedStream[T] // Source iterators + cmp func(T, T) int // Comparison function for sorting +} + +// NewTwoWayMergeStream - constructor for MergeIterator +// Initializes the iterator and prefetches the first values from both sources +func NewTwoWayMergeStream[T any](src1, src2 OrderedStream[T], cmp func(T, T) int) *TwoWayMergeStream[T] { + r := TwoWayMergeStream[T]{ + src1: src1, + src2: src2, + cmp: cmp, + } + // Prefetch the first values to enable comparison + r.v1, r.has1 = r.src1.Next() + r.v2, r.has2 = r.src2.Next() + return &r +} + +// Next - returns the next element when merging two sorted streams +// Algorithm is similar to merging in mergesort, but works with streams +func (s *TwoWayMergeStream[T]) Next() (v T, has bool) { + if s.has1 && s.has2 { + if s.cmp(s.v1, s.v2) < 0 { + v = s.v1 + s.v1, s.has1 = s.src1.Next() + } else { + v = s.v2 + s.v2, s.has2 = s.src2.Next() + } + return v, true + } + if s.has1 { + v = s.v1 + s.v1, s.has1 = s.src1.Next() + return v, true + } + if s.has2 { + v = s.v2 + s.v2, s.has2 = s.src2.Next() + return v, true + } + return v, false +} + +// DocRef - item of the document identifier iterator +// Contains information about the document's position in the index +type DocRef struct { + i int // Stream index (for identifying the source) + id seq.ID // Document identifier + pos seq.DocPos // Document position +} + +// DocStream - iterator over the array of document identifiers in memIndex 
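// A minimal usage sketch (editor's, mirroring mergeIDs in merge.go; it assumes at least
// one index and omits the tie-break on the stream index):
//
//	streams := make([]OrderedStream[DocRef], len(indexes))
//	for i, idx := range indexes {
//		streams[i] = &DocStream{i: i, idx: idx, posMap: posMap[i]}
//	}
//	merged := MergeSortedStreams(streams, func(a, b DocRef) int { return seq.Compare(b.id, a.id) })
//	for ref, ok := merged.Next(); ok; ref, ok = merged.Next() {
//		// ref.id arrives in descending ID order across all source indexes
//	}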
+type DocStream struct { + i int // Stream index (source identifier) + offset int // Current position in the ids array + idx *memIndex // Reference to the in-memory index + posMap []seq.DocPos // Map of document positions +} + +// Next - returns the next document ID from memIndex +func (it *DocStream) Next() (v DocRef, has bool) { + // Check if we haven't exceeded the bounds of the identifiers array + if it.offset < len(it.idx.ids) { + v.i = it.i + v.id = it.idx.ids[it.offset] + v.pos = it.posMap[it.offset] + has = true + it.offset++ // Move pointer for the next call + } + return v, has +} + +// TokenContext - shared data for the token iterator +// Contains a reference to the index and local identifier mapping +type TokenContext struct { + idx *memIndex // In-memory index + lidsMap []uint32 // Local identifiers map +} + +// TokenRef - item of the token iterator +// Represents a single token with metadata +type TokenRef struct { + tid uint32 // Token identifier + fid uint32 // Field identifier + payload *TokenContext // Shared data +} + +// Field - returns the field name by its identifier +func (i *TokenRef) Field() []byte { + return i.payload.idx.fields[i.fid] +} + +// Value - returns the token value by its identifier +func (i *TokenRef) Value() []byte { + return i.payload.idx.tokens[i.tid] +} + +// LIDs - returns the list of local identifiers for the token +func (i *TokenRef) LIDs() []uint32 { + return i.payload.idx.tokenLIDs[i.tid] +} + +// lidsMap - returns the local identifiers map +func (i *TokenRef) lidsMap() []uint32 { + return i.payload.lidsMap +} + +// TokenStream - iterator over tokens in the index +// Iterates through tokens grouped by fields +type TokenStream struct { + tid uint32 // Current token identifier + fid uint32 // Current field identifier + fieldLastTID uint32 // Last TID of the current field + payload TokenContext // Iterator shared data +} + +// NewTokenStream - constructor for TokenIterator +// Initializes the iterator with starting values +func NewTokenStream(idx *memIndex, lidsMap []uint32) *TokenStream { + return &TokenStream{ + // Calculate the last TID for the first field + fieldLastTID: idx.fieldsTokens[string(idx.fields[0])].count - 1, + payload: TokenContext{ + idx: idx, + lidsMap: lidsMap, + }, + } +} + +// Next - returns the next token from the index +// Sequentially iterates through tokens, switching between fields +func (it *TokenStream) Next() (v TokenRef, has bool) { + // Check if we haven't exceeded the bounds of the tokens array + if int(it.tid) < len(it.payload.idx.tokens) { + v.tid = uint32(it.tid) + v.fid = uint32(it.fid) + v.payload = &it.payload + has = true + it.tid++ // Move to the next token + + // Check if we've reached the end of the current field + if it.tid > it.fieldLastTID { + it.fid++ // Move to the next field + // If there's a next field, update the boundary for the new field + if int(it.fid) < len(it.payload.idx.fields) { + // Sum the token counts of fields to get the new boundary + it.fieldLastTID += it.payload.idx.fieldsTokens[string(it.payload.idx.fields[it.fid])].count + } + } + } + return v, has +} diff --git a/frac/active/mem_index.go b/frac/active/mem_index.go new file mode 100644 index 00000000..1a5a46b1 --- /dev/null +++ b/frac/active/mem_index.go @@ -0,0 +1,100 @@ +package active + +import ( + "sort" + "sync" + + "github.com/ozontech/seq-db/seq" +) + +// tokenRange describes a range of tokens belonging to a specific field. 
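// For example (illustrative layout): if tokens are ["bar", "baz", "200", "500"] with field
// "foo" owning TIDs 0-1 and field "status" owning TIDs 2-3, then
// fieldsTokens["status"] == tokenRange{start: 2, count: 2}.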
+type tokenRange struct { + start uint32 // first TID of the field + count uint32 // number of tokens in the field +} + +// memIndex is an in-memory index of an active segment. +// It is used for searching, mapping tokens to documents, and retrieving document positions. +type memIndex struct { + // Important: + // - index in ids array + 1 = LID (local document id) + // - index in positions array + 1 = LID also + // - index in tokens array = TID (token id) + // - index in fieldsTokens array = TID + + ids []seq.ID // list of document IDs sorted in descending order (DESC) + tokens [][]byte // list of all tokens sorted in ascending order (ASC) by key field:token + tokenLIDs [][]uint32 // for each TID stores a sorted list of LIDs of documents containing this token + fieldsTokens map[string]tokenRange // mapping field → token range (TID) belonging to this field + fields [][]byte // list of all fields sorted in ascending order (ASC) + blocksOffsets []uint64 // offsets of document blocks in storage, sorted in ascending order + positions []seq.DocPos // position of each document inside a block; index corresponds to LID-1 + + hash uint64 + docsSize uint64 // total size of documents in bytes + docsCount uint32 // number of documents in the index + allTokenLIDsCount int // total number of tokenLIDs (for fast calc allocation size in merging) + + wg sync.WaitGroup // used to wait for background operations to finish before releasing resources + res *Resources // shared resource pool (memory, buffers, etc.) + release func() // function to release resources +} + +// newMemIndex creates a new in-memory index and initializes resources. +func newMemIndex() *memIndex { + res, release := NewResources() + return &memIndex{ + res: res, + release: release, + } +} + +// getTokenProvider returns a tokenProvider for the specified field. +// It restricts the TID range to tokens belonging only to this field. +func (idx *memIndex) getTokenProvider(field string) *tokenProvider { + if r, ok := idx.fieldsTokens[field]; ok { + return &tokenProvider{ + firstTID: r.start, + lastTID: r.start + r.count - 1, + tokens: idx.tokens, + } + } + + // Field is not indexed — return an empty provider with firstTID > lastTID. + return &tokenProvider{ + firstTID: 1, + lastTID: 0, + tokens: idx.tokens, + } +} + +// IsIntersecting checks whether the MID range [from, to] intersects +// with the range of documents stored in the index. +func (idx *memIndex) IsIntersecting(from, to seq.MID) bool { + maxMID := idx.ids[0].MID + minMID := idx.ids[len(idx.ids)-1].MID + + if to < minMID || maxMID < from { + return false + } + return true +} + +// GetLIDByID searches for the local document ID (LID) by global ID (MID + RID). +// Returns the LID (starting from 1) and a flag indicating whether it was found. +func (idx *memIndex) GetLIDByID(id seq.ID) (uint32, bool) { + i, ok := sort.Find(len(idx.ids), func(i int) int { + return seq.Compare(idx.ids[i], id) + }) + return uint32(i + 1), ok +} + +// Release frees index resources. +// The call is non-blocking: actual release happens in a separate goroutine +// after all ongoing operations are completed. 
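// "Ongoing operations" are readers holding a pool snapshot: memIndexPool.Snapshot
// increments wg for every exposed index and its release callback decrements it, so the
// goroutine below unblocks once the last snapshot is released.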
+func (idx *memIndex) Release() { + go func() { + idx.wg.Wait() + idx.release() + }() +} diff --git a/frac/active/mem_index_pool.go b/frac/active/mem_index_pool.go new file mode 100644 index 00000000..863eb47e --- /dev/null +++ b/frac/active/mem_index_pool.go @@ -0,0 +1,225 @@ +package active + +import ( + "slices" + "sync" + "sync/atomic" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/logger" +) + +// indexEntry is an internal structure that describes a memIndex +// inside the pool and its state during merge operations. +type indexEntry struct { + id uint64 // unique runtime ID of the index + index *memIndex // pointer to the actual index + gen int // generation, used for merge management +} + +// memIndexPool manages the lifecycle of in-memory indexes: +// - keeps indexes ready for use +// - tracks indexes currently participating in merge +// - provides consistent snapshots for readers +type memIndexPool struct { + mu sync.RWMutex // protects all fields below + info *frac.Info // aggregated information for all indexes + hashes map[uint64]struct{} + + ready map[uint64]indexEntry // indexes ready to be merged + merging map[uint64]indexEntry // indexes currently being merged + + // readable is a flat list of indexes available for reading. + // It contains both ready and merging indexes. + readable []*memIndex + + nextID atomic.Uint64 // atomic counter for generating index IDs +} + +// NewIndexPool creates a new index pool +func NewIndexPool(info *frac.Info) *memIndexPool { + return &memIndexPool{ + info: info, + hashes: make(map[uint64]struct{}, 1000), + ready: make(map[uint64]indexEntry), + merging: make(map[uint64]indexEntry), + } +} + +// indexSnapshot represents a consistent snapshot of the pool state. +// It is used to safely read indexes without holding the pool lock. +type indexSnapshot struct { + info *frac.Info // copy of aggregated info + indexes []*memIndex // indexes available for reading +} + +// Snapshot returns a snapshot and a release function. +// While the snapshot is alive, indexes are protected from being released via wg. 
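// Typical usage (as in Active.Search and Active.Fetch):
//
//	ss, release := pool.Snapshot()
//	defer release()
//	for _, idx := range ss.indexes {
//		_ = idx // read-only access; idx cannot be released while the snapshot is held
//	}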
+func (p *memIndexPool) Snapshot() (*indexSnapshot, func()) { + p.mu.RLock() + defer p.mu.RUnlock() + + // Copy info so the snapshot is immutable + info := *p.info + + iss := indexSnapshot{ + info: &info, + indexes: make([]*memIndex, len(p.readable)), + } + + // Increment usage counter for each index + for i, idx := range p.readable { + iss.indexes[i] = idx + idx.wg.Add(1) + } + + return &iss, func() { + // release function decrements wg counters + for _, idx := range iss.indexes { + idx.wg.Done() + } + } +} + +// Info returns a copy of the aggregated pool information +func (p *memIndexPool) Info() *frac.Info { + p.mu.RLock() + defer p.mu.RUnlock() + + info := *p.info // copy + return &info +} + +// Add adds a new memIndex to the pool and updates aggregated statistics +func (p *memIndexPool) Add(idx *memIndex, docsLen, metaLen uint64) { + maxMID := idx.ids[0].MID + minMID := idx.ids[len(idx.ids)-1].MID + + entry := p.newEntry(idx, 0) + + p.mu.Lock() + defer p.mu.Unlock() + + if idx.hash > 0 { + if _, ok := p.hashes[idx.hash]; ok { + logger.Warn("a duplicate index (bulk) has been detected") + return + } + p.hashes[idx.hash] = struct{}{} + } + + if p.info.From > minMID { + p.info.From = minMID + } + if p.info.To < maxMID { + p.info.To = maxMID + } + + p.info.DocsRaw += idx.docsSize + p.info.DocsTotal += idx.docsCount + + p.info.DocsOnDisk += docsLen + p.info.MetaOnDisk += metaLen + + p.ready[entry.id] = entry + p.readable = append(p.readable, idx) +} + +// ReadyToMerge returns indexes that can be taken for merge (returns a copy without modifying the pool state) +func (p *memIndexPool) ReadyToMerge() []indexEntry { + p.mu.RLock() + defer p.mu.RUnlock() + + entries := make([]indexEntry, 0, len(p.ready)) + for _, entry := range p.ready { + entries = append(entries, entry) + } + return entries +} + +// takeForMerge moves indexes from the "ready" state to the "merging" state +func (p *memIndexPool) takeForMerge(entries []indexEntry) { + p.mu.Lock() + defer p.mu.Unlock() + + for _, entry := range entries { + delete(p.ready, entry.id) + p.merging[entry.id] = entry + } +} + +// replace replaces several old indexes with a single merged index +func (p *memIndexPool) replace(old []indexEntry, merged *memIndex) { + newEntry := p.newEntry(merged, avgGeneration(old)+1) + + defer func() { + for _, entry := range old { + entry.index.Release() + } + }() + + p.mu.Lock() + defer p.mu.Unlock() + + var docsCountToRemove uint32 + for _, entry := range old { + docsCountToRemove += entry.index.docsCount + delete(p.merging, entry.id) + } + p.ready[newEntry.id] = newEntry + + // update info: the number of documents to be deleted may be greater + // than the number to be added due to deduplication + if docsCountToRemove > p.info.DocsTotal { + panic("inconsistent state of index pool") + } + p.info.DocsTotal -= uint32(docsCountToRemove) + p.info.DocsTotal += newEntry.index.docsCount + + p.rebuildReadable() +} + +// avgGeneration calculates the average generation of indexes +func avgGeneration(entries []indexEntry) int { + gen := 0 + for _, entry := range entries { + gen += entry.gen + } + return gen / len(entries) +} + +// rebuildReadable rebuilds the list of indexes available for reading (ready + merging) +func (p *memIndexPool) rebuildReadable() { + p.readable = p.readable[:0] + p.readable = slices.Grow(p.readable, len(p.ready)+len(p.merging)) + + for _, entry := range p.ready { + p.readable = append(p.readable, entry.index) + } + for _, entry := range p.merging { + p.readable = append(p.readable, entry.index) + } +} 
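// Editor's sketch (not part of the change): how mergeManager drives the pool through one
// merge cycle, assuming the selected batch is non-empty (compare ForceMergeAll):
//
//	batch := pool.ReadyToMerge()                  // copy of the "ready" entries
//	pool.takeForMerge(batch)                      // ready -> merging; still readable
//	merged := mergeIndexes(unwrapIndexes(batch))  // heavy work, done outside the pool lock
//	pool.replace(batch, merged)                   // merged becomes ready, old entries are released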
+ +// Release fully releases the pool and all contained indexes +func (p *memIndexPool) Release() { + p.mu.Lock() + defer p.mu.Unlock() + + for _, idx := range p.readable { + idx.Release() + } + + p.readable = nil + p.ready = nil + p.merging = nil +} + +// newEntry creates a new indexEntry with a unique ID +func (p *memIndexPool) newEntry(index *memIndex, gen int) indexEntry { + return indexEntry{ + id: p.nextID.Add(1), + gen: gen, + index: index, + } +} diff --git a/frac/active/merge.go b/frac/active/merge.go new file mode 100644 index 00000000..4d64a67b --- /dev/null +++ b/frac/active/merge.go @@ -0,0 +1,352 @@ +package active + +import ( + "bytes" + "cmp" + "slices" + + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/util" + "go.uber.org/zap" +) + +// mergeIndexes merges several in-memory indexes into one. +func mergeIndexes(indexes []*memIndex) *memIndex { + // preallocate memory based on total size + blocksCount := 0 + dst := newMemIndex() + + for _, idx := range indexes { + dst.docsSize += idx.docsSize + dst.docsCount += idx.docsCount + dst.allTokenLIDsCount += idx.allTokenLIDsCount + blocksCount += len(idx.blocksOffsets) + } + + tmp, release := NewResources() + defer release() + + // preallocate final structures + dst.ids = dst.res.GetIDs(int(dst.docsCount))[:0] + dst.positions = dst.res.GetDocPosSlice(int(dst.docsCount))[:0] + dst.blocksOffsets = dst.res.GetUint64s(blocksCount)[:0] + + // 1. merge block offsets and recalc document positions + posMap := mergeBlocksOffsets(dst, tmp, indexes) + + // 2. merge documents, get old→new LID mapping + lidsMap := mergeIDs(dst, tmp, indexes, posMap) + + // 3. merge tokens using new LIDs + mergeTokens(dst, tmp, indexes, lidsMap) + + return dst +} + +// mergeIDs merges documents from all indexes into ordered stream. +// returns mapping oldLID → newLID for each index. +func mergeIDs( + dst *memIndex, + tmp *Resources, + indexes []*memIndex, + posMap [][]seq.DocPos, +) [][]uint32 { + + // store old→new LID mapping per index + lidsMap := tmp.GetUint32Slices(len(indexes)) + docStreams := make([]OrderedStream[DocRef], len(indexes)) + + for i, idx := range indexes { + docStreams[i] = &DocStream{ + i: i, // index number + idx: idx, // the index itself + posMap: posMap[i], // recalculated document positions + } + + // LIDs start from 1, so add dummy element + lidsMap[i] = tmp.GetUint32s(int(idx.docsCount) + 1)[:1] + } + + // merge all streams by ID (reverse order) + mergedDocStream := MergeSortedStreams( + docStreams, + func(a, b DocRef) int { + r := seq.Compare(b.id, a.id) + if r == 0 { + r = cmp.Compare(a.i, b.i) + } + return r + }, + ) + + var ( + doubles int + prevRef DocRef + ) + + // process merged stream + for docRef, has := mergedDocStream.Next(); has; docRef, has = mergedDocStream.Next() { + if docRef.id == prevRef.id && docRef.i != prevRef.i { + doubles++ + // map old LID → 0 (will be filtered later) + lidsMap[docRef.i] = append(lidsMap[docRef.i], 0) + continue + } + prevRef = docRef + + // add to result + dst.ids = append(dst.ids, docRef.id) + dst.positions = append(dst.positions, docRef.pos) + + // new LID is position in dst.ids (1-based) + newLID := uint32(len(dst.ids)) + lidsMap[docRef.i] = append(lidsMap[docRef.i], newLID) + } + + if doubles > 0 { + dst.docsCount -= uint32(doubles) + logger.Warn("doubles in index", zap.Int("count", doubles)) + } + + return lidsMap +} + +// mergeTokens merges tokens from all indexes using new LIDs. 
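// It makes two passes over the merged token stream: the first counts unique tokens and
// fields to size the destination buffers, the second fills them and remaps every token's
// LIDs through lidsMap (a LID remapped to 0 marks a deduplicated document and is dropped
// by LIDsCollector.GetSorted).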
+func mergeTokens( + dst *memIndex, + tmp *Resources, + indexes []*memIndex, + lidsMap [][]uint32, +) { + totalDocs := 0 // sum of documents from all indexes (before deduplication) + totalTokens := 0 + tokenStreams := make([]OrderedStream[TokenRef], len(indexes)) + + // create token iterators + for i, idx := range indexes { + totalDocs += int(idx.docsCount) + totalTokens += len(idx.tokens) + tokenStreams[i] = NewTokenStream(idx, lidsMap[i]) + } + + cmpToken := func(a, b TokenRef) int { + r := bytes.Compare(a.Field(), b.Field()) + if r == 0 { + return bytes.Compare(a.Value(), b.Value()) + } + return r + } + + // merged and sorted token stream + mergedTokenStream := MergeSortedStreams(tokenStreams, cmpToken) + + // unique values statistics + uniqTokensSize := 0 + uniqTokensCount := 0 + uniqFieldsSize := 0 + uniqFieldsCount := 0 + + var ( + prevField []byte + prevToken TokenRef + ) + + // borders[i] indicates: + const ( + borderNone = 0b00 // tokensRef[i] same token as previous (but different index) + borderToken = 0b01 // tokensRef[i] is a new token value + borderField = 0b10 // tokensRef[i] is a new field + ) + + borders := tmp.GetBytes(totalTokens)[:0] + tokensRef := make([]TokenRef, 0, totalTokens) + + // first pass: count unique tokens and fields + for tokenRef, has := mergedTokenStream.Next(); has; tokenRef, has = mergedTokenStream.Next() { + var border uint8 = borderNone + + // new token + if prevToken.payload == nil || cmpToken(prevToken, tokenRef) != 0 { + uniqTokensCount++ + uniqTokensSize += len(tokenRef.Value()) + border |= borderToken + + // new field + field := tokenRef.Field() + if !bytes.Equal(prevField, field) { + uniqFieldsCount++ + uniqFieldsSize += len(field) + border |= borderField + prevField = field + } + } + + borders = append(borders, border) + tokensRef = append(tokensRef, tokenRef) + prevToken = tokenRef + } + + // initialize result structures + dst.fieldsTokens = make(map[string]tokenRange, uniqFieldsCount) + dst.fields = dst.res.GetBytesSlices(uniqFieldsCount)[:0] + dst.tokens = dst.res.GetBytesSlices(uniqTokensCount)[:0] + dst.tokenLIDs = dst.res.GetUint32Slices(uniqTokensCount)[:0] + + allTokens := dst.res.GetBytes(uniqTokensSize)[:0] + allFields := dst.res.GetBytes(uniqFieldsSize)[:0] + + // collector for token's document LIDs + lidsCollector := NewLIDsCollector( + totalDocs, + dst.res.GetUint32s(dst.allTokenLIDsCount)[:0], // all token LIDs + dst.res.GetUint32s(int(dst.docsCount)), // LIDs for _all_ + tmp.GetBytes((int(dst.docsCount) + 1)), // sorting buffer + ) + + // second pass: fill structures + for i, tokenRef := range tokensRef { + if borders[i]&borderToken == borderToken { // new token value + + if i > 0 { // finish previous token + dst.tokenLIDs = append(dst.tokenLIDs, lidsCollector.GetSorted()) + } + + if borders[i]&borderField == borderField { // new field + tid := uint32(len(dst.tokens)) + + if i > 0 { // finish previous field + fieldStr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) + tr := dst.fieldsTokens[fieldStr] + tr.count = tid - tr.start + dst.fieldsTokens[fieldStr] = tr + } + + start := len(allFields) + allFields = append(allFields, tokenRef.Field()...) + field := allFields[start:] + dst.fields = append(dst.fields, field) + + fieldStr := util.ByteToStringUnsafe(field) + dst.fieldsTokens[fieldStr] = tokenRange{start: tid} + } + start := len(allTokens) + allTokens = append(allTokens, tokenRef.Value()...) 
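			// editor note: allTokens was allocated with uniqTokensSize, so (assuming that
			// capacity) the append above does not reallocate and allTokens[start:] below
			// keeps pointing at the just-copied token value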
+ dst.tokens = append(dst.tokens, allTokens[start:]) + } + + // add document LIDs for this token + newLIDsMap := tokenRef.lidsMap() + for _, oldLID := range tokenRef.LIDs() { + lidsCollector.Add(newLIDsMap[oldLID]) + } + } + + // final token + dst.tokenLIDs = append(dst.tokenLIDs, lidsCollector.GetSorted()) + + // close last field + tid := uint32(len(dst.tokens)) - 1 + fieldStr := util.ByteToStringUnsafe(dst.fields[len(dst.fields)-1]) + tr := dst.fieldsTokens[fieldStr] + tr.count = tid - tr.start + 1 + dst.fieldsTokens[fieldStr] = tr +} + +// LIDsCollector collects and sorts document LIDs for a token. +type LIDsCollector struct { + totalDocs int // total docs count before deduplication + lids []uint32 // overall array + all []uint32 // full LID set (1..N) + buf []uint8 // bitmap + offset int +} + +// NewLIDsCollector initializes collector. +func NewLIDsCollector(totalDocs int, lids, all []uint32, buf []uint8) *LIDsCollector { + clear(buf) + for i := range all { + all[i] = uint32(i) + 1 + } + return &LIDsCollector{ + totalDocs: totalDocs, + lids: lids[:0], + all: all, + buf: buf, + } +} + +// Add a single LID +func (s *LIDsCollector) Add(lid uint32) { + s.lids = append(s.lids, lid) +} + +// GetSorted returns sorted LID list using optimal algorithm. +func (s *LIDsCollector) GetSorted() (dst []uint32) { + n := len(s.lids) - s.offset + + // all documents covered → return all + if n == s.totalDocs { + s.lids = s.lids[:s.offset] + return s.all + } + + dst = s.lids[s.offset:] + s.offset = len(s.lids) + + // dense case: use bitmap + if 100*n/len(s.all) > 50 { + for _, v := range dst { + s.buf[v] = 1 + } + s.buf[0] = 0 // skip zero LID from duplicates + dst = dst[:0] + for lid, ok := range s.buf { + if ok == 1 { + s.buf[lid] = 0 + dst = append(dst, uint32(lid)) + } + } + return dst + } + + // sparse case: sort normally + if n > 1 { + slices.Sort(dst) + } + // skip zero LIDs from duplicates + for len(dst) > 0 && dst[0] == 0 { + dst = dst[1:] + } + return dst +} + +// mergeBlocksOffsets merges block offsets and recalculates document positions. +func mergeBlocksOffsets( + dst *memIndex, + tmp *Resources, + indexes []*memIndex, +) [][]seq.DocPos { + + var offset uint32 + positions := tmp.GetDocPosSlices(len(indexes)) + + for i, index := range indexes { + // copy block offsets + dst.blocksOffsets = append(dst.blocksOffsets, index.blocksOffsets...) 
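+ // blocksOffsets can be copied as is; document positions, however, reference
+ // block indexes local to each source index and are re-based below using the
+ // running offset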
+ + // recalculate positions + positions[i] = tmp.GetDocPosSlice(len(index.positions))[:0] + for _, p := range index.positions { + oldIdx, docOffset := p.Unpack() + positions[i] = append( + positions[i], + seq.PackDocPos(oldIdx+offset, docOffset), + ) + } + + offset += uint32(len(index.blocksOffsets)) + } + + return positions +} diff --git a/frac/active/merge_manager.go b/frac/active/merge_manager.go new file mode 100644 index 00000000..6b3f0ade --- /dev/null +++ b/frac/active/merge_manager.go @@ -0,0 +1,161 @@ +package active + +import ( + "sync" + + "github.com/ozontech/seq-db/logger" + "go.uber.org/zap" +) + +// Tuning parameters for index merge strategy +const ( + maxGenerationBuckets = 32 // Maximum number of generation buckets used for grouping + minMergeBatchSize = 16 // Minimum batch size required to start a merge + forceMergeThreshold = 4096 // Merge all indexes if total count exceeds this limit +) + +type WorkerLimiter interface { + Acquire() // Blocks until a worker slot is available + Release() // Frees a previously acquired slot +} + +// mergeManager coordinates background merging of in-memory indexes +type mergeManager struct { + mu sync.Mutex // Protects internal state + wg sync.WaitGroup // Tracks active merge jobs + + stopped bool // Indicates shutdown state + indexPool *memIndexPool // Source of indexes to be merged + + mergeWorkers WorkerLimiter // Limits parallel merge execution + mergeSignal chan struct{} // Coalesced signal to trigger merge evaluation +} + +// newMergeManager initializes merge manager and starts merge loop +func newMergeManager(indexes *memIndexPool, workerPool WorkerLimiter) *mergeManager { + m := mergeManager{ + indexPool: indexes, + mergeWorkers: workerPool, + mergeSignal: make(chan struct{}, 1), + } + + // Background goroutine responsible for scheduling merges + go m.mergeLoop() + + return &m +} + +// Stop gracefully stops the manager and waits for ongoing merges +func (m *mergeManager) Stop() { + m.mu.Lock() + defer m.mu.Unlock() + + m.stopped = true + + // Ensure all in-flight merges are completed + m.wg.Wait() + close(m.mergeSignal) +} + +// ForceMergeAll performs full merge of all available indexes +func (m *mergeManager) ForceMergeAll() { + m.mu.Lock() + defer m.mu.Unlock() + + // Ensure no background merges are running + m.wg.Wait() + + if batch := m.indexPool.ReadyToMerge(); len(batch) > 1 { + logger.Debug("force merge all indexes", zap.Int("batch", len(batch))) + m.indexPool.takeForMerge(batch) + merged := mergeIndexes(unwrapIndexes(batch)) + m.indexPool.replace(batch, merged) + } +} + +// unwrapIndexes extracts raw memIndex pointers from wrappers +func unwrapIndexes(items []indexEntry) []*memIndex { + result := make([]*memIndex, 0, len(items)) + for _, item := range items { + result = append(result, item.index) + } + return result +} + +// mergeLoop continuously reacts to merge signals and schedules work +func (m *mergeManager) mergeLoop() { + for range m.mergeSignal { + m.mergeWorkers.Acquire() // wait for a free worker + + m.mu.Lock() + + if m.stopped { + m.mu.Unlock() + m.mergeWorkers.Release() + continue + } + + // Decide which indexes are worth merging right now + batch := selectMergeBatch(m.indexPool.ReadyToMerge(), minMergeBatchSize) + if len(batch) == 0 { + m.mu.Unlock() + m.mergeWorkers.Release() + continue + } + + m.indexPool.takeForMerge(batch) + m.wg.Add(1) // important to inc wg inside the lock + m.mu.Unlock() + + logger.Debug("merge indexes", zap.Int("generation", batch[0].gen), zap.Int("size", len(batch))) + + go func(batch 
[]indexEntry) { + defer m.wg.Done() + defer m.mergeWorkers.Release() + + merged := mergeIndexes(unwrapIndexes(batch)) + m.indexPool.replace(batch, merged) + m.requestMerge() // re-check if further merges are possible + }(batch) + } +} + +// requestMerge schedules a merge check if one is not already pending +func (m *mergeManager) requestMerge() { + select { + case m.mergeSignal <- struct{}{}: + default: + // Merge signal already pending; avoid redundant wakeups + } +} + +// selectMergeBatch chooses an optimal merge candidate batch +func selectMergeBatch(items []indexEntry, minBatchSize int) []indexEntry { + if len(items) < minBatchSize { + return nil + } + + if len(items) > forceMergeThreshold { + return items + } + + batch := largestGenerationGroup(items) + if len(batch) < minBatchSize { + return nil + } + return batch +} + +// largestGenerationGroup returns the biggest generation-aligned batch +func largestGenerationGroup(items []indexEntry) []indexEntry { + maxGen := 0 + batches := make([][]indexEntry, maxGenerationBuckets) + for _, item := range items { + gen := min(maxGenerationBuckets, item.gen) + batches[gen] = append(batches[gen], item) + if len(batches[gen]) > len(batches[maxGen]) || len(batches[gen]) == len(batches[maxGen]) && gen > maxGen { + maxGen = gen + } + } + return batches[maxGen] +} diff --git a/frac/active/metrics.go b/frac/active/metrics.go new file mode 100644 index 00000000..dc526640 --- /dev/null +++ b/frac/active/metrics.go @@ -0,0 +1,54 @@ +package active + +import ( + "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/metric" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + fetcherStagesSec = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "fetcher", + Name: "active_stages_seconds", + Buckets: metric.SecondsBuckets, + }, []string{"stage"}) + + searchAggSec = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "search", + Name: "tracer_active_agg_search_sec", + Buckets: metric.SecondsBuckets, + }, []string{"stage"}) + searchHstSec = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "search", + Name: "tracer_active_hist_search_sec", + Buckets: metric.SecondsBuckets, + }, []string{"stage"}) + searchSimpleSec = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "search", + Name: "tracer_active_reg_search_sec", + Buckets: metric.SecondsBuckets, + }, []string{"stage"}) + + bulkStagesSeconds = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "bulk", + Name: "stages_seconds2", + Buckets: metric.SecondsBuckets, + }, []string{"stage"}) +) + +// getActiveSearchMetric selects a Prometheus metric depending on the type of search query. 
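+// Aggregation requests are reported to searchAggSec, histogram requests to
+// searchHstSec and plain searches to searchSimpleSec; all three histograms are
+// labeled by stage.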
+func getActiveSearchMetric(params processor.SearchParams) *prometheus.HistogramVec { + if params.HasAgg() { + return searchAggSec + } + if params.HasHist() { + return searchHstSec + } + return searchSimpleSec +} diff --git a/frac/active/resources.go b/frac/active/resources.go new file mode 100644 index 00000000..b7d7fe67 --- /dev/null +++ b/frac/active/resources.go @@ -0,0 +1,120 @@ +package active + +import ( + "github.com/ozontech/seq-db/indexer" + "github.com/ozontech/seq-db/resources" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/tokenizer" +) + +const poolBuckets = 24 + +var ( + tokenKeyPool = resources.NewSizedPool[tokenStr](poolBuckets) + indexerMetaDataPool = resources.NewSizedPool[indexer.MetaData](poolBuckets) + docPosSlicesPool = resources.NewSizedPool[[]seq.DocPos](poolBuckets) + bufPool = resources.TypedPool[*indexerBuffer]{} + resPool = resources.TypedPool[*Resources]{} +) + +// Resources provides pooled memory allocation for index construction. +// It manages reusable buffers to avoid GC pressure during indexing. +type Resources struct { + releases *resources.CallStack + + uint32s resources.SliceOnBytes[uint32] + uint64s resources.SliceOnBytes[uint64] + bytes resources.SlicesPool[byte] + bytesSlices resources.SlicesPool[[]byte] + uint32Slices resources.SlicesPool[[]uint32] + tokenKeys resources.SlicesPool[tokenStr] + indexerMetaData resources.SlicesPool[indexer.MetaData] + buf resources.ObjectsPool[indexerBuffer] + ids resources.SliceOnBytes[seq.ID] + docPos resources.SliceOnBytes[seq.DocPos] + docPosSlices resources.SlicesPool[[]seq.DocPos] +} + +func NewResources() (*Resources, func()) { + r, ok := resPool.Get() + if !ok { + s := resources.CallStack{} + r = &Resources{ + releases: &s, + + bytes: resources.NewBytes(&s), + uint32s: resources.NewUint32s(&s), + uint64s: resources.NewUint64s(&s), + uint32Slices: resources.NewUint32Slices(&s), + bytesSlices: resources.NewBytesSlices(&s), + ids: resources.NewSliceOnBytes[seq.ID](&s), + docPos: resources.NewSliceOnBytes[seq.DocPos](&s), + docPosSlices: resources.NewSlicesPool(&docPosSlicesPool, &s), + indexerMetaData: resources.NewSlicesPool(&indexerMetaDataPool, &s), + tokenKeys: resources.NewSlicesPool(&tokenKeyPool, &s), + buf: resources.NewObjectsPool(&bufPool, &s), + } + } + return r, func() { + r.releases.CallAll() + resPool.Put(r) + } +} + +func (r *Resources) GetBytesSlices(s int) [][]byte { + return r.bytesSlices.GetSlice(s) +} + +func (r *Resources) GetBytes(s int) []byte { + return r.bytes.GetSlice(s) +} + +func (r *Resources) GetUint32s(s int) []uint32 { + return r.uint32s.GetSlice(s) +} + +func (r *Resources) GetIDs(s int) []seq.ID { + return r.ids.GetSlice(s) +} + +func (r *Resources) GetDocPosSlice(s int) []seq.DocPos { + return r.docPos.GetSlice(s) +} + +func (r *Resources) GetDocPosSlices(s int) [][]seq.DocPos { + return r.docPosSlices.GetSlice(s) +} + +func (r *Resources) GetUint64s(s int) []uint64 { + return r.uint64s.GetSlice(s) +} + +func (r *Resources) GetUint32Slices(s int) [][]uint32 { + return r.uint32Slices.GetSlice(s) +} + +func (r *Resources) GetMetadata(s int) []indexer.MetaData { + return r.indexerMetaData.GetSlice(s) +} + +func (r *Resources) GetTokens(s int) []tokenStr { + return r.tokenKeys.GetSlice(s) +} + +func (r *Resources) GetBuffer() *indexerBuffer { + return r.buf.Get(func() *indexerBuffer { + return &indexerBuffer{ + sizes: make([]uint32, 0, 1000), + fields: make([]string, 0, 100), + fieldTIDs: make([]uint32, 0, 100), + tokens: make([]tokenizer.MetaToken, 0, 100), + 
tokenMap: make(map[tokenStr]uint32, 1000), + } + }, func(b *indexerBuffer) { + b.fields = b.fields[:0] + b.tokens = b.tokens[:0] + b.fieldTIDs = b.fieldTIDs[:0] + b.sizes = b.sizes[:0] + clear(b.tokenMap) + }) +} diff --git a/frac/active/sealing_source.go b/frac/active/sealing_source.go new file mode 100644 index 00000000..d1f553d7 --- /dev/null +++ b/frac/active/sealing_source.go @@ -0,0 +1,176 @@ +package active + +import ( + "iter" + "math" + "time" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active_old" + "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/util" +) + +var ( + _ sealing.Source = (*SealingSource)(nil) + + // Special system ID added as the first entry + systemSeqID = seq.ID{ + MID: math.MaxUint64, + RID: math.MaxUint64, + } +) + +// SealingSource provides data from a single memIndex in the format required by the sealing stage. +type SealingSource struct { + info *frac.Info + index *memIndex + blocksOffsets []uint64 + positions []seq.DocPos + lastErr error +} + +// NewSealingSource prepares a sealing source from Active2 state. +func NewSealingSource(a *Active, params frac.SealParams) (sealing.Source, error) { + a.merger.ForceMergeAll() + + iss, release := a.indexes.Snapshot() + defer release() + + if len(iss.indexes) != 1 { + logger.Panic("invalid state: sealing requires a single memIndex") + } + + index := iss.indexes[0] + ss := &SealingSource{ + info: iss.info, + index: index, + positions: index.positions, + blocksOffsets: index.blocksOffsets, + } + + // Sort documents unless explicitly disabled + if !a.Config.SkipSortDocs { + ds := active_old.NewDocsSource(ss, ss.index.blocksOffsets, &a.sortReader) + + blocksOffsets, positions, onDiskSize, err := sealing.SortDocs( + ss.info.Path, + params, + ds, + ) + if err != nil { + return nil, err + } + + ss.positions = positions[1:] // skip system document position + ss.blocksOffsets = blocksOffsets + ss.info.DocsOnDisk = uint64(onDiskSize) + } + + ss.info.MetaOnDisk = 0 + ss.info.SealingTime = uint64(time.Now().UnixMilli()) + + ss.info.BuildDistributionWithIDs(ss.index.ids) + + return ss, nil +} + +// Info returns fraction metadata. +func (src *SealingSource) Info() *frac.Info { + return src.info +} + +// IDsBlocks yields document IDs and their positions in fixed-size blocks. +func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { + return func(yield func([]seq.ID, []seq.DocPos) bool) { + ids := make([]seq.ID, 0, blockSize) + pos := make([]seq.DocPos, 0, blockSize) + + // Add system entry + ids = append(ids, systemSeqID) + pos = append(pos, 0) + + for i, id := range src.index.ids { + if len(ids) == blockSize { + if !yield(ids, pos) { + return + } + ids = ids[:0] + pos = pos[:0] + } + + ids = append(ids, id) + pos = append(pos, src.positions[i]) + } + + yield(ids, pos) + } +} + +// TokenBlocks yields tokens grouped by total byte size. +func (src *SealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { + return func(yield func([][]byte) bool) { + actualSize := 0 + block := make([][]byte, 0, blockSize) + + for _, token := range src.index.tokens { + if actualSize >= blockSize { + if !yield(block) { + return + } + actualSize = 0 + block = block[:0] + } + + actualSize += len(token) + int(uint32Size) + block = append(block, token) + } + + yield(block) + } +} + +// Fields yields field names with their token upper bounds. 
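+// For every field the yielded value is fieldsTokens[field].start + count, i.e.
+// the position just past the field's last token in the merged token list.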
+func (src *SealingSource) Fields() iter.Seq2[string, uint32] { + return func(yield func(string, uint32) bool) { + for _, field := range src.index.fields { + f := util.ByteToStringUnsafe(field) + r := src.index.fieldsTokens[f] + + if !yield(f, r.start+r.count) { + return + } + } + } +} + +// TokenLIDs yields document LIDs for each token. +func (src *SealingSource) TokenLIDs() iter.Seq[[]uint32] { + all := make([]uint32, src.index.docsCount) + for i := range all { + all[i] = uint32(i) + 1 + } + + return func(yield func([]uint32) bool) { + for _, tokenLIDs := range src.index.tokenLIDs { + if len(tokenLIDs) == 0 { + tokenLIDs = all + } + if !yield(tokenLIDs) { + return + } + } + } +} + +// BlocksOffsets returns document block offsets. +func (src *SealingSource) BlocksOffsets() []uint64 { + return src.blocksOffsets +} + +// LastError returns the last recorded source error. +func (src *SealingSource) LastError() error { + return src.lastErr +} diff --git a/frac/active.go b/frac/active_old/active.go similarity index 67% rename from frac/active.go rename to frac/active_old/active.go index 82810773..506c615e 100644 --- a/frac/active.go +++ b/frac/active_old/active.go @@ -1,11 +1,10 @@ -package frac +package active_old import ( "context" "io" "math" "os" - "path/filepath" "sync" "time" @@ -16,7 +15,7 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric" @@ -27,27 +26,23 @@ import ( ) var ( - _ Fraction = (*Active)(nil) + _ frac.Fraction = (*Active)(nil) ) type Active struct { - Config *Config + Config *frac.Config BaseFileName string - useMu sync.RWMutex - suicided bool - released bool - infoMu sync.RWMutex - info *common.Info + info *frac.Info MIDs *UInt64s RIDs *UInt64s DocBlocks *UInt64s - TokenList *TokenList + TokenList *tokenList DocsPositions *DocsPositions @@ -60,8 +55,8 @@ type Active struct { metaFile *os.File metaReader storage.DocBlocksReader - writer *ActiveWriter - indexer *ActiveIndexer + writer *Writer + indexer *Indexer } const ( @@ -74,19 +69,19 @@ var systemSeqID = seq.ID{ RID: systemRID, } -func NewActive( +func New( baseFileName string, - activeIndexer *ActiveIndexer, + activeIndexer *Indexer, readLimiter *storage.ReadLimiter, docsCache *cache.Cache[[]byte], sortCache *cache.Cache[[]byte], - cfg *Config, + cfg *frac.Config, ) *Active { - docsFile, docsStats := mustOpenFile(baseFileName+consts.DocsFileSuffix, config.SkipFsync) - metaFile, metaStats := mustOpenFile(baseFileName+consts.MetaFileSuffix, config.SkipFsync) + docsFile, docsStats := util.MustOpenFile(baseFileName+consts.DocsFileSuffix, config.SkipFsync) + metaFile, metaStats := util.MustOpenFile(baseFileName+consts.MetaFileSuffix, config.SkipFsync) f := &Active{ - TokenList: NewActiveTokenList(config.IndexWorkers), + TokenList: NewTokenList(config.IndexWorkers), DocsPositions: NewSyncDocsPositions(), MIDs: NewIDs(), RIDs: NewIDs(), @@ -102,10 +97,10 @@ func NewActive( metaReader: storage.NewDocBlocksReader(readLimiter, metaFile), indexer: activeIndexer, - writer: NewActiveWriter(docsFile, metaFile, docsStats.Size(), metaStats.Size(), config.SkipFsync), + writer: NewWriter(docsFile, metaFile, docsStats.Size(), metaStats.Size(), config.SkipFsync), BaseFileName: baseFileName, - info: common.NewInfo(baseFileName, uint64(docsStats.Size()), uint64(metaStats.Size())), + 
info: frac.NewInfo(baseFileName, uint64(docsStats.Size()), uint64(metaStats.Size())), Config: cfg, } @@ -118,24 +113,6 @@ func NewActive( return f } -func mustOpenFile(name string, skipFsync bool) (*os.File, os.FileInfo) { - file, err := os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0o776) - if err != nil { - logger.Fatal("can't create docs file", zap.String("file", name), zap.Error(err)) - } - - if !skipFsync { - parentDirPath := filepath.Dir(name) - util.MustSyncPath(parentDirPath) - } - - stat, err := file.Stat() - if err != nil { - logger.Fatal("can't stat docs file", zap.String("file", name), zap.Error(err)) - } - return file, stat -} - func (f *Active) Replay(ctx context.Context) error { logger.Info("start replaying...", zap.String("name", f.info.Name())) @@ -265,48 +242,34 @@ func (f *Active) UpdateStats(minMID, maxMID seq.MID, docCount uint32, sizeCount } func (f *Active) String() string { - return fracToString(f, "active") + return frac.FracToString(f, "active") } func (f *Active) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Fetch(ctx, ids) + if f.Info().DocsTotal == 0 { // it is empty active fraction state + return nil, nil } + + dp := f.createDataProvider(ctx) + defer dp.release() + return dp.Fetch(ids) } func (f *Active) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Search(ctx, params) - } - return dp.Search(params) -} - -func (f *Active) DataProvider(ctx context.Context) (*activeDataProvider, func()) { - f.useMu.RLock() - - if f.suicided || f.released || f.Info().DocsTotal == 0 { // it is empty active fraction state - if f.suicided { - metric.CountersTotal.WithLabelValues("fraction_suicided").Inc() - } - f.useMu.RUnlock() - return nil, func() {} + if f.Info().DocsTotal == 0 { // it is empty active fraction state + metric.CountersTotal.WithLabelValues("empty_data_provider").Inc() + return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil } - // it is ordinary active fraction state dp := f.createDataProvider(ctx) - return dp, func() { - dp.release() - f.useMu.RUnlock() - } + defer dp.release() + + return dp.Search(params) } -func (f *Active) createDataProvider(ctx context.Context) *activeDataProvider { - return &activeDataProvider{ +func (f *Active) createDataProvider(ctx context.Context) *dataProvider { + return &dataProvider{ ctx: ctx, config: f.Config, info: f.Info(), @@ -321,7 +284,7 @@ func (f *Active) createDataProvider(ctx context.Context) *activeDataProvider { } } -func (f *Active) Info() *common.Info { +func (f *Active) Info() *frac.Info { f.infoMu.RLock() defer f.infoMu.RUnlock() @@ -338,49 +301,24 @@ func (f *Active) IsIntersecting(from, to seq.MID) bool { } func (f *Active) Release() { - f.useMu.Lock() - f.released = true - f.useMu.Unlock() - f.releaseMem() if !f.Config.KeepMetaFile { - f.removeMetaFile() + util.RemoveFile(f.metaFile.Name()) } if !f.Config.SkipSortDocs { // we use sorted docs in sealed fraction so we can remove original docs of active fraction - f.removeDocsFiles() + util.RemoveFile(f.docsFile.Name()) } } -// Offload for [Active] fraction is no-op. -// -// Since search within [Active] fraction is too costly (we have to replay the whole index in memory), -// we decided to support offloading only for [Sealed] fractions. 
-func (f *Active) Offload(context.Context, storage.Uploader) (bool, error) { - return false, nil -} - func (f *Active) Suicide() { - f.useMu.Lock() - released := f.released - f.suicided = true - f.released = true - f.useMu.Unlock() - - if released { // fraction can be suicided after release - if f.Config.KeepMetaFile { - f.removeMetaFile() // meta was not removed while release - } - if f.Config.SkipSortDocs { - f.removeDocsFiles() // docs was not removed while release - } - } else { // was not release - f.releaseMem() - f.removeMetaFile() - f.removeDocsFiles() - } + f.releaseMem() + + util.RemoveFile(f.metaFile.Name()) + util.RemoveFile(f.docsFile.Name()) + util.RemoveFile(f.BaseFileName + consts.SdocsFileSuffix) } func (f *Active) releaseMem() { @@ -393,24 +331,12 @@ func (f *Active) releaseMem() { if err := f.metaFile.Close(); err != nil { logger.Error("can't close meta file", zap.String("frac", f.BaseFileName), zap.Error(err)) } + if err := f.docsFile.Close(); err != nil { + logger.Error("can't close docs file", zap.String("frac", f.BaseFileName), zap.Error(err)) + } f.RIDs = nil f.MIDs = nil f.TokenList = nil f.DocsPositions = nil } - -func (f *Active) removeDocsFiles() { - if err := f.docsFile.Close(); err != nil { - logger.Error("can't close docs file", zap.String("frac", f.BaseFileName), zap.Error(err)) - } - if err := os.Remove(f.docsFile.Name()); err != nil { - logger.Error("can't delete docs file", zap.String("frac", f.BaseFileName), zap.Error(err)) - } -} - -func (f *Active) removeMetaFile() { - if err := os.Remove(f.metaFile.Name()); err != nil { - logger.Error("can't delete metas file", zap.String("frac", f.BaseFileName), zap.Error(err)) - } -} diff --git a/frac/active_docs_positions.go b/frac/active_old/docs_positions.go similarity index 98% rename from frac/active_docs_positions.go rename to frac/active_old/docs_positions.go index 0b4c596b..3acb7c2b 100644 --- a/frac/active_docs_positions.go +++ b/frac/active_old/docs_positions.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "sync" diff --git a/frac/active_old/docs_source.go b/frac/active_old/docs_source.go new file mode 100644 index 00000000..0dae2120 --- /dev/null +++ b/frac/active_old/docs_source.go @@ -0,0 +1,80 @@ +package active_old + +import ( + "iter" + + "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" +) + +var _ sealing.DocsSource = (*DocsSource)(nil) + +type DocsSource struct { + src sealing.Source + blocksOffsets []uint64 + docsReader *storage.DocsReader + lastErr error +} + +func NewDocsSource(src sealing.Source, blocksOffsets []uint64, docsReader *storage.DocsReader) *DocsSource { + return &DocsSource{ + src: src, + blocksOffsets: blocksOffsets, + docsReader: docsReader, + } +} + +// Docs returns an iterator for documents with their IDs. +// Handles duplicate IDs (for nested indexes). 
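+// Consecutive equal IDs reuse the previously read document, so nested-index
+// entries are yielded without re-reading the payload, and the reserved
+// systemSeqID is yielded with a nil document. A read error stops the iteration
+// and is reported via LastError.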
+func (ds *DocsSource) Docs() iter.Seq2[seq.ID, []byte] { + ds.lastErr = nil + return func(yield func(seq.ID, []byte) bool) { + var ( + prev seq.ID + curDoc []byte + ) + + // iterate through ID and position blocks + for ids, pos := range ds.src.IDsBlocks(consts.IDsPerBlock) { + for i, id := range ids { + if id == systemSeqID { + curDoc = nil // reserved system document (no payload) + } else if id != prev { + // if ID changed, read new document + if curDoc, ds.lastErr = ds.doc(pos[i]); ds.lastErr != nil { + return + } + } + prev = id + if !yield(id, curDoc) { + return + } + } + } + } +} + +// doc reads a document from storage by its position. +func (ds *DocsSource) doc(pos seq.DocPos) ([]byte, error) { + blockIndex, docOffset := pos.Unpack() + blockOffset := ds.blocksOffsets[blockIndex] + + var doc []byte + err := ds.docsReader.ReadDocsFunc(blockOffset, []uint64{docOffset}, func(b []byte) error { + doc = b + return nil + }) + if err != nil { + return nil, err + } + return doc, nil +} + +func (ds *DocsSource) LastError() error { + if ds.lastErr != nil { + return ds.lastErr + } + return ds.src.LastError() +} diff --git a/frac/file_writer.go b/frac/active_old/file_writer.go similarity index 99% rename from frac/file_writer.go rename to frac/active_old/file_writer.go index 9ac5ab9c..8525c462 100644 --- a/frac/file_writer.go +++ b/frac/active_old/file_writer.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "io" diff --git a/frac/file_writer_test.go b/frac/active_old/file_writer_test.go similarity index 99% rename from frac/file_writer_test.go rename to frac/active_old/file_writer_test.go index b72c011b..66a5e0b0 100644 --- a/frac/file_writer_test.go +++ b/frac/active_old/file_writer_test.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "errors" diff --git a/frac/active_ids.go b/frac/active_old/ids.go similarity index 97% rename from frac/active_ids.go rename to frac/active_old/ids.go index 1195c8fa..8d0508f8 100644 --- a/frac/active_ids.go +++ b/frac/active_old/ids.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "sync" diff --git a/frac/active_ids_test.go b/frac/active_old/ids_test.go similarity index 98% rename from frac/active_ids_test.go rename to frac/active_old/ids_test.go index ae2f6111..9995ae82 100644 --- a/frac/active_ids_test.go +++ b/frac/active_old/ids_test.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "sync" diff --git a/frac/active_index.go b/frac/active_old/index.go similarity index 73% rename from frac/active_index.go rename to frac/active_old/index.go index 350a8e0d..fe49a22a 100644 --- a/frac/active_index.go +++ b/frac/active_old/index.go @@ -1,9 +1,9 @@ -package frac +package active_old import ( "context" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/metric/stopwatch" @@ -13,24 +13,24 @@ import ( "github.com/ozontech/seq-db/storage" ) -type activeDataProvider struct { +type dataProvider struct { ctx context.Context - config *Config - info *common.Info + config *frac.Config + info *frac.Info mids *UInt64s rids *UInt64s - tokenList *TokenList + tokenList *tokenList blocksOffsets []uint64 docsPositions *DocsPositions docsReader *storage.DocsReader - idsIndex *activeIDsIndex + idsIndex *idsIndex } -func (dp *activeDataProvider) release() { +func (dp *dataProvider) release() { if dp.idsIndex != nil { dp.idsIndex.inverser.Release() } @@ -38,13 +38,13 @@ func (dp 
*activeDataProvider) release() { // getIDsIndex creates on demand and returns ActiveIDsIndex. // Creation of inverser for ActiveIDsIndex is expensive operation -func (dp *activeDataProvider) getIDsIndex() *activeIDsIndex { +func (dp *dataProvider) getIDsIndex() *idsIndex { if dp.idsIndex == nil { // creation order is matter mapping := dp.tokenList.GetAllTokenLIDs().GetLIDs(dp.mids, dp.rids) mids := dp.mids.GetVals() // mids and rids should be created after mapping to ensure that rids := dp.rids.GetVals() // they contain all the ids that mapping contains. - dp.idsIndex = &activeIDsIndex{ + dp.idsIndex = &idsIndex{ inverser: newInverser(mapping, len(mids)), mids: mids, rids: rids, @@ -53,8 +53,8 @@ func (dp *activeDataProvider) getIDsIndex() *activeIDsIndex { return dp.idsIndex } -func (dp *activeDataProvider) getTokenIndex() *activeTokenIndex { - return &activeTokenIndex{ +func (dp *dataProvider) getTokenIndex() *tokenIndex { + return &tokenIndex{ ctx: dp.ctx, mids: dp.mids, rids: dp.rids, @@ -63,17 +63,17 @@ func (dp *activeDataProvider) getTokenIndex() *activeTokenIndex { } } -func (dp *activeDataProvider) Fetch(ids []seq.ID) ([][]byte, error) { +func (dp *dataProvider) Fetch(ids []seq.ID) ([][]byte, error) { sw := stopwatch.New() defer sw.Export( - fetcherStagesSeconds, + frac.FetcherStagesSeconds, stopwatch.SetLabel("fraction_type", "active"), ) res := make([][]byte, len(ids)) - indexes := []activeFetchIndex{{ + indexes := []fetchIndex{{ blocksOffsets: dp.blocksOffsets, docsPositions: dp.docsPositions, docsReader: dp.docsReader, @@ -88,7 +88,7 @@ func (dp *activeDataProvider) Fetch(ids []seq.ID) ([][]byte, error) { return res, nil } -func (dp *activeDataProvider) Search(params processor.SearchParams) (*seq.QPR, error) { +func (dp *dataProvider) Search(params processor.SearchParams) (*seq.QPR, error) { // The index of the active fraction changes in parts and at a single moment in time may not be consistent. // So we can add new IDs to the index but update the range [from; to] with a delay. // Because of this, at the Search stage, we can get IDs that are outside the fraction range [from; to]. 
@@ -106,16 +106,16 @@ func (dp *activeDataProvider) Search(params processor.SearchParams) (*seq.QPR, e sw := stopwatch.New() defer sw.Export( - fractionSearchMetric(params), + frac.FractionSearchMetric(params), stopwatch.SetLabel("fraction_type", "active"), ) t := sw.Start("total") m := sw.Start("new_search_index") - indexes := []activeSearchIndex{{ - activeIDsIndex: dp.getIDsIndex(), - activeTokenIndex: dp.getTokenIndex(), + indexes := []searchIndex{{ + idsIndex: dp.getIDsIndex(), + tokenIndex: dp.getTokenIndex(), }} m.Stop() @@ -136,27 +136,27 @@ func (dp *activeDataProvider) Search(params processor.SearchParams) (*seq.QPR, e return res, nil } -type activeIDsIndex struct { +type idsIndex struct { mids []uint64 rids []uint64 inverser *inverser } -func (p *activeIDsIndex) GetMID(lid seq.LID) seq.MID { +func (p *idsIndex) GetMID(lid seq.LID) seq.MID { restoredLID := p.inverser.Revert(uint32(lid)) return seq.MID(p.mids[restoredLID]) } -func (p *activeIDsIndex) GetRID(lid seq.LID) seq.RID { +func (p *idsIndex) GetRID(lid seq.LID) seq.RID { restoredLID := p.inverser.Revert(uint32(lid)) return seq.RID(p.rids[restoredLID]) } -func (p *activeIDsIndex) Len() int { +func (p *idsIndex) Len() int { return p.inverser.Len() } -func (p *activeIDsIndex) LessOrEqual(lid seq.LID, id seq.ID) bool { +func (p *idsIndex) LessOrEqual(lid seq.LID, id seq.ID) bool { checkedMID := p.GetMID(lid) if checkedMID == id.MID { return p.GetRID(lid) <= id.RID @@ -164,28 +164,28 @@ func (p *activeIDsIndex) LessOrEqual(lid seq.LID, id seq.ID) bool { return checkedMID < id.MID } -type activeSearchIndex struct { - *activeIDsIndex - *activeTokenIndex +type searchIndex struct { + *idsIndex + *tokenIndex } -type activeTokenIndex struct { +type tokenIndex struct { ctx context.Context mids *UInt64s rids *UInt64s - tokenList *TokenList + tokenList *tokenList inverser *inverser } -func (si *activeTokenIndex) GetValByTID(tid uint32) []byte { +func (si *tokenIndex) GetValByTID(tid uint32) []byte { return si.tokenList.GetValByTID(tid) } -func (si *activeTokenIndex) GetTIDsByTokenExpr(t parser.Token) ([]uint32, error) { +func (si *tokenIndex) GetTIDsByTokenExpr(t parser.Token) ([]uint32, error) { return si.tokenList.FindPattern(si.ctx, t) } -func (si *activeTokenIndex) GetLIDsFromTIDs(tids []uint32, _ lids.Counter, minLID, maxLID uint32, order seq.DocsOrder) []node.Node { +func (si *tokenIndex) GetLIDsFromTIDs(tids []uint32, _ lids.Counter, minLID, maxLID uint32, order seq.DocsOrder) []node.Node { nodes := make([]node.Node, 0, len(tids)) for _, tid := range tids { tlids := si.tokenList.Provide(tid) @@ -209,17 +209,17 @@ func inverseLIDs(unmapped []uint32, inv *inverser, minLID, maxLID uint32) []uint return result } -type activeFetchIndex struct { +type fetchIndex struct { blocksOffsets []uint64 docsPositions *DocsPositions docsReader *storage.DocsReader } -func (di *activeFetchIndex) GetBlocksOffsets(num uint32) uint64 { +func (di *fetchIndex) GetBlocksOffsets(num uint32) uint64 { return di.blocksOffsets[num] } -func (di *activeFetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos { +func (di *fetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos { docsPos := make([]seq.DocPos, len(ids)) for i, id := range ids { docsPos[i] = di.docsPositions.GetSync(id) @@ -227,6 +227,6 @@ func (di *activeFetchIndex) GetDocPos(ids []seq.ID) []seq.DocPos { return docsPos } -func (di *activeFetchIndex) ReadDocs(blockOffset uint64, docOffsets []uint64) ([][]byte, error) { +func (di *fetchIndex) ReadDocs(blockOffset uint64, docOffsets []uint64) ([][]byte, error) { 
return di.docsReader.ReadDocs(blockOffset, docOffsets) } diff --git a/frac/active_indexer.go b/frac/active_old/indexer.go similarity index 87% rename from frac/active_indexer.go rename to frac/active_old/indexer.go index 0422c105..91672462 100644 --- a/frac/active_indexer.go +++ b/frac/active_old/indexer.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "encoding/binary" @@ -14,12 +14,10 @@ import ( "github.com/ozontech/seq-db/storage" ) -type ActiveIndexer struct { +type Indexer struct { ch chan *indexTask chMerge chan *mergeTask workerCount int - - stopFn func() } type indexTask struct { @@ -34,15 +32,17 @@ type mergeTask struct { tokenLIDs *TokenLIDs } -func NewActiveIndexer(workerCount, chLen int) *ActiveIndexer { - return &ActiveIndexer{ +func NewIndexer(workerCount, chLen int) (*Indexer, func()) { + idx := Indexer{ ch: make(chan *indexTask, chLen), chMerge: make(chan *mergeTask, chLen), workerCount: workerCount, } + stopIdx := idx.start() + return &idx, stopIdx } -func (ai *ActiveIndexer) Index(frac *Active, metas []byte, wg *sync.WaitGroup, sw *stopwatch.Stopwatch) { +func (ai *Indexer) Index(frac *Active, metas []byte, wg *sync.WaitGroup, sw *stopwatch.Stopwatch) { m := sw.Start("send_index_chan") ai.ch <- &indexTask{ Pos: storage.DocBlock(metas).GetExt2(), @@ -53,7 +53,7 @@ func (ai *ActiveIndexer) Index(frac *Active, metas []byte, wg *sync.WaitGroup, s m.Stop() } -func (ai *ActiveIndexer) Start() { +func (ai *Indexer) start() func() { wg := sync.WaitGroup{} wg.Add(ai.workerCount) @@ -72,35 +72,26 @@ func (ai *ActiveIndexer) Start() { }() } - ai.stopFn = func() { + return func() { close(ai.ch) close(ai.chMerge) - wg.Wait() - - ai.stopFn = nil } } -func (ai *ActiveIndexer) mergeWorker() { +func (ai *Indexer) mergeWorker() { for task := range ai.chMerge { task.tokenLIDs.GetLIDs(task.frac.MIDs, task.frac.RIDs) // GetLIDs cause sort and merge LIDs from queue } } -func (ai *ActiveIndexer) Stop() { - if ai.stopFn != nil { - ai.stopFn() - } -} - var metaDataPool = sync.Pool{ New: func() any { return new(indexer.MetaData) }, } -func (ai *ActiveIndexer) appendWorker(index int) { +func (ai *Indexer) appendWorker(index int) { // collector of bulk meta data collector := newMetaDataCollector() @@ -178,7 +169,7 @@ func (ai *ActiveIndexer) appendWorker(index int) { } } -func (ai *ActiveIndexer) sendTokensToMergeWorkers(frac *Active, tokens []*TokenLIDs) { +func (ai *Indexer) sendTokensToMergeWorkers(frac *Active, tokens []*TokenLIDs) { for _, tl := range tokens { task := mergeTask{ frac: frac, diff --git a/frac/active_indexer_test.go b/frac/active_old/indexer_test.go similarity index 51% rename from frac/active_indexer_test.go rename to frac/active_old/indexer_test.go index faa07a8f..02b794fd 100644 --- a/frac/active_indexer_test.go +++ b/frac/active_old/indexer_test.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "bytes" @@ -8,10 +8,15 @@ import ( "testing" "time" + "github.com/alecthomas/units" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.uber.org/zap/zapcore" "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/config" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric/stopwatch" @@ -76,39 +81,110 @@ func getTestProcessor() *indexer.Processor { func BenchmarkIndexer(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx := NewActiveIndexer(8, 8) - idx.Start() - defer 
idx.Stop() + idx, stop := NewIndexer(config.NumCPU, config.NumCPU) + defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) readers := splitLogsToBulks(allLogs, 1000) assert.NoError(b, err) - active := NewActive( - filepath.Join(b.TempDir(), "test"), - idx, - storage.NewReadLimiter(1, nil), - cache.NewCache[[]byte](nil, nil), - cache.NewCache[[]byte](nil, nil), - &Config{}, - ) - processor := getTestProcessor() - for i := 0; i < b.N; i++ { - b.StopTimer() - bulks := make([][]byte, 0, len(readers)) + n := 2 + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { for _, readNext := range readers { _, _, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) - bulks = append(bulks, storage.CompressDocBlock(meta, nil, 3)) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() + active := New( + filepath.Join(b.TempDir(), "test"), + idx, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + &frac.Config{}, + ) b.StartTimer() wg := sync.WaitGroup{} - for _, meta := range bulks { + for _, meta := range allMeta { wg.Add(1) idx.Index(active, meta, &wg, stopwatch.New()) } wg.Wait() } } + +func defaultSealingParams() frac.SealParams { + const minZstdLevel = 1 + return frac.SealParams{ + IDsZstdLevel: minZstdLevel, + LIDsZstdLevel: minZstdLevel, + TokenListZstdLevel: minZstdLevel, + DocsPositionsZstdLevel: minZstdLevel, + TokenTableZstdLevel: minZstdLevel, + DocBlocksZstdLevel: minZstdLevel, + DocBlockSize: 128 * int(units.KiB), + } +} + +func BenchmarkFullWrite(b *testing.B) { + logger.SetLevel(zapcore.FatalLevel) + idx, stop := NewIndexer(config.NumCPU, config.NumCPU) + defer stop() + + allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) + readers := splitLogsToBulks(allLogs, 1000) + assert.NoError(b, err) + + processor := getTestProcessor() + + n := 2 + allDocs := make([][]byte, 0, len(readers)*n) + allMeta := make([][]byte, 0, len(readers)*n) + + for range n { + for _, readNext := range readers { + _, docs, meta, _ := processor.ProcessBulk(time.Now(), nil, nil, readNext) + allDocs = append(allDocs, storage.CompressDocBlock(docs, nil, 1)) + allMeta = append(allMeta, storage.CompressDocBlock(meta, nil, 1)) + } + } + + params := defaultSealingParams() + + for b.Loop() { + active := New( + filepath.Join(b.TempDir(), "test"), + idx, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + &frac.Config{SkipSortDocs: true}, + ) + + wg := sync.WaitGroup{} + for i, meta := range allMeta { + wg.Add(1) + go func() { + err := active.Append(allDocs[i], meta, &wg) + assert.NoError(b, err) + }() + } + wg.Wait() + + src, err := NewSealingSource(active, params) + require.NoError(b, err) + sealed, err := sealing.Seal(src, params) + require.NoError(b, err) + assert.Greater(b, int(sealed.Info.DocsTotal), 0) + active.Release() + } +} diff --git a/frac/inverser.go b/frac/active_old/inverser.go similarity index 98% rename from frac/inverser.go rename to frac/active_old/inverser.go index 11d854a7..e4f6eefd 100644 --- a/frac/inverser.go +++ b/frac/active_old/inverser.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "unsafe" diff --git a/frac/meta_data_collector.go b/frac/active_old/meta_data_collector.go similarity index 99% rename from frac/meta_data_collector.go rename to frac/active_old/meta_data_collector.go index 6047bd12..a8dfaa15 
100644 --- a/frac/meta_data_collector.go +++ b/frac/active_old/meta_data_collector.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "math" diff --git a/frac/active_sealing_source.go b/frac/active_old/sealing_source.go similarity index 53% rename from frac/active_sealing_source.go rename to frac/active_old/sealing_source.go index 42bde383..9f93aeb7 100644 --- a/frac/active_sealing_source.go +++ b/frac/active_old/sealing_source.go @@ -1,30 +1,21 @@ -package frac +package active_old import ( "bytes" - "encoding/binary" "errors" - "io" "iter" - "os" - "path/filepath" "slices" "time" "unsafe" - "github.com/alecthomas/units" - "go.uber.org/zap" - - "github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" - "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" - "github.com/ozontech/seq-db/util" ) -// ActiveSealingSource transforms data from in-memory (frac.Active) storage +// SealingSource transforms data from in-memory (frac.Active) storage // into a format suitable for disk writing during index creation. // // The main purpose of this type is to provide access to sorted data @@ -39,29 +30,28 @@ import ( // // All iterators work with pre-sorted data and return information // in an order optimal for creating disk index structures. -type ActiveSealingSource struct { - params common.SealParams // Sealing parameters - info *common.Info // fraction Info - created time.Time // Creation time of the source - sortedLIDs []uint32 // Sorted LIDs (Local ID) - oldToNewLIDs []uint32 // Mapping from old LIDs to new ones (after sorting) - mids *UInt64s // MIDs - rids *UInt64s // RIDs - fields []string // Sorted field names - fieldsMaxTIDs []uint32 // Maximum TIDs for each field - tids []uint32 // Sorted TIDs (Token ID) - tokens [][]byte // Tokens (values) by TID - lids []*TokenLIDs // LID lists for each token - docPosOrig []seq.DocPos // Original document positions - docPosSorted []seq.DocPos // Document positions after sorting - blocksOffsets []uint64 // Document block offsets - docsReader *storage.DocsReader // Document storage reader - lastErr error // Last error +type SealingSource struct { + info *frac.Info // fraction Info + created time.Time // Creation time of the source + sortedLIDs []uint32 // Sorted LIDs (Local ID) + oldToNewLIDs []uint32 // Mapping from old LIDs to new ones (after sorting) + mids *UInt64s // MIDs + rids *UInt64s // RIDs + fields []string // Sorted field names + fieldsMaxTIDs []uint32 // Maximum TIDs for each field + tids []uint32 // Sorted TIDs (Token ID) + tokens [][]byte // Tokens (values) by TID + lids []*TokenLIDs // LID lists for each token + docPosOrig map[seq.ID]seq.DocPos // Original document positions + docPosSorted []seq.DocPos // Document positions after sorting + blocksOffsets []uint64 // Document block offsets + docsReader *storage.DocsReader // Document storage reader + lastErr error // Last error } -// NewActiveSealingSource creates a new data source for sealing +// NewSealingSource creates a new data source for sealing // based on an active in-memory index. 
-func NewActiveSealingSource(active *Active, params common.SealParams) (*ActiveSealingSource, error) { +func NewSealingSource(active *Active, params frac.SealParams) (*SealingSource, error) { info := *active.info // copy sortedLIDs := active.GetAllDocuments() @@ -71,8 +61,7 @@ func NewActiveSealingSource(active *Active, params common.SealParams) (*ActiveSe // Sort tokens within each field sortedTIDs := sortTokens(sortedFields, active.TokenList) - src := ActiveSealingSource{ - params: params, + src := SealingSource{ info: &info, created: time.Now(), sortedLIDs: sortedLIDs, @@ -84,7 +73,7 @@ func NewActiveSealingSource(active *Active, params common.SealParams) (*ActiveSe fieldsMaxTIDs: fieldsMaxTIDs, tokens: active.TokenList.tidToVal, lids: active.TokenList.tidToLIDs, - docPosOrig: active.DocsPositions.lidToPos, + docPosOrig: active.DocsPositions.idToPos, blocksOffsets: active.DocBlocks.vals, docsReader: &active.sortReader, } @@ -93,9 +82,14 @@ func NewActiveSealingSource(active *Active, params common.SealParams) (*ActiveSe // Sort documents if not skipped in configuration if !active.Config.SkipSortDocs { - if err := src.SortDocs(); err != nil { + ds := NewDocsSource(&src, src.blocksOffsets, &active.sortReader) + blocksOffsets, positions, onDiskSize, err := sealing.SortDocs(info.Path, params, ds) + if err != nil { return nil, err } + src.docPosSorted = positions[1:] + src.blocksOffsets = blocksOffsets + src.info.DocsOnDisk = uint64(onDiskSize) } return &src, nil @@ -103,7 +97,7 @@ func NewActiveSealingSource(active *Active, params common.SealParams) (*ActiveSe // sortFields sorts field names and calculates maximum TIDs for each field. // Returns sorted field list and array of maximum TIDs. -func sortFields(tl *TokenList) ([]string, []uint32) { +func sortFields(tl *tokenList) ([]string, []uint32) { fields := make([]string, 0, len(tl.FieldTIDs)) for field := range tl.FieldTIDs { fields = append(fields, field) @@ -122,7 +116,7 @@ func sortFields(tl *TokenList) ([]string, []uint32) { // sortTokens sorts tokens lexicographically within each field. // Returns sorted list of TIDs. -func sortTokens(sortedFields []string, tl *TokenList) []uint32 { +func sortTokens(sortedFields []string, tl *tokenList) []uint32 { pos := 0 tids := make([]uint32, 0, len(tl.tidToVal)) for _, field := range sortedFields { @@ -139,26 +133,26 @@ func sortTokens(sortedFields []string, tl *TokenList) []uint32 { } // LastError returns the last error that occurred during processing. -func (src *ActiveSealingSource) LastError() error { +func (src *SealingSource) LastError() error { return src.lastErr } // prepareInfo prepares metadata for disk writing. -func (src *ActiveSealingSource) prepareInfo() { +func (src *SealingSource) prepareInfo() { src.info.MetaOnDisk = 0 src.info.SealingTime = uint64(src.created.UnixMilli()) src.info.BuildDistribution(src.mids.vals) } // Info returns index metadata information. -func (src *ActiveSealingSource) Info() *common.Info { +func (src *SealingSource) Info() *frac.Info { return src.info } // TokenBlocks returns an iterator for token blocks for disk writing. // Tokens are pre-sorted: first by fields, then lexicographically within each field. // Each block contains up to blockSize bytes of data for efficient writing. 
-func (src *ActiveSealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { +func (src *SealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { const tokenLengthSize = int(unsafe.Sizeof(uint32(0))) return func(yield func([][]byte) bool) { if len(src.tids) == 0 { @@ -193,7 +187,7 @@ func (src *ActiveSealingSource) TokenBlocks(blockSize int) iter.Seq[[][]byte] { // Fields returns an iterator for sorted fields and their maximum TIDs. // Fields are sorted lexicographically, ensuring predictable order // when building disk index structures. -func (src *ActiveSealingSource) Fields() iter.Seq2[string, uint32] { +func (src *SealingSource) Fields() iter.Seq2[string, uint32] { return func(yield func(string, uint32) bool) { for i, field := range src.fields { if !yield(field, src.fieldsMaxTIDs[i]) { @@ -206,7 +200,7 @@ func (src *ActiveSealingSource) Fields() iter.Seq2[string, uint32] { // IDsBlocks returns an iterator for document ID blocks and corresponding positions. // IDs are sorted. Block size is controlled by blockSize parameter for balance between // performance and memory usage. -func (src *ActiveSealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { +func (src *SealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []seq.DocPos] { return func(yield func([]seq.ID, []seq.DocPos) bool) { mids := src.mids.vals rids := src.rids.vals @@ -232,9 +226,9 @@ func (src *ActiveSealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []s // Use sorted or original positions if len(src.docPosSorted) == 0 { - pos = append(pos, src.docPosOrig[lid]) + pos = append(pos, src.docPosOrig[id]) } else { - pos = append(pos, src.docPosSorted[i+1]) // +1 for system document + pos = append(pos, src.docPosSorted[i]) // +1 for system document } } yield(ids, pos) @@ -242,7 +236,7 @@ func (src *ActiveSealingSource) IDsBlocks(blockSize int) iter.Seq2[[]seq.ID, []s } // BlocksOffsets returns document block offsets. -func (src *ActiveSealingSource) BlocksOffsets() []uint64 { +func (src *SealingSource) BlocksOffsets() []uint64 { return src.blocksOffsets } @@ -250,7 +244,7 @@ func (src *ActiveSealingSource) BlocksOffsets() []uint64 { // LIDs are converted to new numbering after document sorting. // Each iterator call returns a list of documents containing a specific token, // in sorted order. -func (src *ActiveSealingSource) TokenLIDs() iter.Seq[[]uint32] { +func (src *SealingSource) TokenLIDs() iter.Seq[[]uint32] { return func(yield func([]uint32) bool) { newLIDs := []uint32{} @@ -284,7 +278,7 @@ func makeInverser(sortedLIDs []uint32) []uint32 { // Docs returns an iterator for documents with their IDs. // Handles duplicate IDs (for nested indexes). -func (src *ActiveSealingSource) Docs() iter.Seq2[seq.ID, []byte] { +func (src *SealingSource) Docs() iter.Seq2[seq.ID, []byte] { src.lastErr = nil return func(yield func(seq.ID, []byte) bool) { var ( @@ -313,7 +307,7 @@ func (src *ActiveSealingSource) Docs() iter.Seq2[seq.ID, []byte] { } // doc reads a document from storage by its position. -func (src *ActiveSealingSource) doc(pos seq.DocPos) ([]byte, error) { +func (src *SealingSource) doc(pos seq.DocPos) ([]byte, error) { blockIndex, docOffset := pos.Unpack() blockOffset := src.blocksOffsets[blockIndex] @@ -327,140 +321,3 @@ func (src *ActiveSealingSource) doc(pos seq.DocPos) ([]byte, error) { } return doc, nil } - -// SortDocs sorts documents and writes them in compressed form to disk. -// Creates a temporary file that is then renamed to the final one. 
-func (src *ActiveSealingSource) SortDocs() error { - start := time.Now() - logger.Info("sorting docs...") - - // Create temporary file for sorted documents - sdocsFile, err := os.Create(src.info.Path + consts.SdocsTmpFileSuffix) - if err != nil { - return err - } - - bw := bytespool.AcquireWriterSize(sdocsFile, int(units.MiB)) - defer bytespool.ReleaseWriter(bw) - - // Group documents into blocks - blocks := docBlocks(src.Docs(), src.params.DocBlockSize) - - // Write blocks and get new offsets and positions - blocksOffsets, positions, err := src.writeDocs(blocks, bw) - - if err := util.CollapseErrors([]error{src.lastErr, err}); err != nil { - return err - } - if err := bw.Flush(); err != nil { - return err - } - - src.docPosSorted = positions - src.blocksOffsets = blocksOffsets - - // Get file statistics - stat, err := sdocsFile.Stat() - if err != nil { - return err - } - src.info.DocsOnDisk = uint64(stat.Size()) - - // Synchronize and rename file - if err := sdocsFile.Sync(); err != nil { - return err - } - if err := sdocsFile.Close(); err != nil { - return err - } - if err := os.Rename(sdocsFile.Name(), src.info.Path+consts.SdocsFileSuffix); err != nil { - return err - } - if err := util.SyncPath(filepath.Dir(src.info.Path)); err != nil { - return err - } - - // Log compression statistics - ratio := float64(src.info.DocsRaw) / float64(src.info.DocsOnDisk) - logger.Info("docs sorting stat", - util.ZapUint64AsSizeStr("raw", src.info.DocsRaw), - util.ZapUint64AsSizeStr("compressed", src.info.DocsOnDisk), - util.ZapFloat64WithPrec("ratio", ratio, 2), - zap.Int("blocks_count", len(blocksOffsets)), - zap.Int("docs_total", len(positions)), - util.ZapDurationWithPrec("write_duration_ms", time.Since(start), "ms", 0), - ) - - return nil -} - -// writeDocs compresses and writes document blocks, calculating new offsets -// and collecting document positions. -func (src *ActiveSealingSource) writeDocs(blocks iter.Seq2[[]byte, []seq.DocPos], w io.Writer) ([]uint64, []seq.DocPos, error) { - offset := 0 - buf := make([]byte, 0) - blocksOffsets := make([]uint64, 0) - allPositions := make([]seq.DocPos, 0, len(src.mids.vals)) - - // Process each document block - for block, positions := range blocks { - allPositions = append(allPositions, positions...) - blocksOffsets = append(blocksOffsets, uint64(offset)) - - // Compress document block - buf = storage.CompressDocBlock(block, buf[:0], src.params.DocBlocksZstdLevel) - if _, err := w.Write(buf); err != nil { - return nil, nil, err - } - offset += len(buf) - } - return blocksOffsets, allPositions, nil -} - -// docBlocks groups documents into fixed-size blocks. -// Returns an iterator for blocks and corresponding document positions. 
-func docBlocks(docs iter.Seq2[seq.ID, []byte], blockSize int) iter.Seq2[[]byte, []seq.DocPos] { - return func(yield func([]byte, []seq.DocPos) bool) { - const defaultBlockSize = 128 * units.KiB - if blockSize <= 0 { - blockSize = int(defaultBlockSize) - logger.Warn("document block size not specified", zap.Int("default_size", blockSize)) - } - - var ( - prev seq.ID - index uint32 // Current block index - ) - pos := make([]seq.DocPos, 0) - buf := make([]byte, 0, blockSize) - - // Iterate through documents - for id, doc := range docs { - if id == prev { - // Duplicate IDs (for nested indexes) - store document once, - // but create positions for each LID - pos = append(pos, seq.PackDocPos(index, uint64(len(buf)))) - continue - } - prev = id - - // If block is full, yield it - if len(buf) >= blockSize { - if !yield(buf, pos) { - return - } - index++ - buf = buf[:0] - pos = pos[:0] - } - - // Add document position - pos = append(pos, seq.PackDocPos(index, uint64(len(buf)))) - - // Write document size and the document itself - buf = binary.LittleEndian.AppendUint32(buf, uint32(len(doc))) - buf = append(buf, doc...) - } - yield(buf, pos) - } -} diff --git a/frac/active_lids.go b/frac/active_old/token_lids.go similarity index 99% rename from frac/active_lids.go rename to frac/active_old/token_lids.go index 47abe92a..1d1dafb6 100644 --- a/frac/active_lids.go +++ b/frac/active_old/token_lids.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "math" diff --git a/frac/active_token_list.go b/frac/active_old/token_list.go similarity index 79% rename from frac/active_token_list.go rename to frac/active_old/token_list.go index adf94ffd..94a66031 100644 --- a/frac/active_token_list.go +++ b/frac/active_old/token_list.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "context" @@ -28,36 +28,36 @@ type tokenTask struct { tlids []*TokenLIDs } -type activeTokenProvider struct { +type tokenProvider struct { inverseIndex []uint32 tidToVal [][]byte } -func (tp *activeTokenProvider) GetToken(tid uint32) []byte { +func (tp *tokenProvider) GetToken(tid uint32) []byte { id := tp.inverseIndex[tid-1] return tp.tidToVal[id] } -func (tp *activeTokenProvider) FirstTID() uint32 { +func (tp *tokenProvider) FirstTID() uint32 { return 1 } -func (tp *activeTokenProvider) LastTID() uint32 { +func (tp *tokenProvider) LastTID() uint32 { return uint32(len(tp.inverseIndex)) } -func (tp *activeTokenProvider) Ordered() bool { +func (tp *tokenProvider) Ordered() bool { return false } -func (tp *activeTokenProvider) inverseTIDs(tids []uint32) []uint32 { +func (tp *tokenProvider) inverseTIDs(tids []uint32) []uint32 { for i, tid := range tids { tids[i] = tp.inverseIndex[tid-1] } return tids } -type TokenList struct { +type tokenList struct { chList []chan tokenTask fieldsMu sync.RWMutex @@ -73,8 +73,8 @@ type TokenList struct { tidToLIDs []*TokenLIDs } -func NewActiveTokenList(workers int) *TokenList { - tl := &TokenList{ +func NewTokenList(workers int) *tokenList { + tl := &tokenList{ chList: make([]chan tokenTask, workers), FieldTIDs: make(map[string][]uint32), fieldSizes: make(map[string]uint32), @@ -91,13 +91,13 @@ func NewActiveTokenList(workers int) *TokenList { return tl } -func (tl *TokenList) Stop() { +func (tl *tokenList) Stop() { for _, c := range tl.chList { close(c) } } -func (tl *TokenList) tokenLIDsWorker(ch chan tokenTask, tokenToLIDs map[string]*TokenLIDs) { +func (tl *tokenList) tokenLIDsWorker(ch chan tokenTask, tokenToLIDs map[string]*TokenLIDs) { nonExistent := make([]int, 0) for task := range 
ch { bufSize := 0 @@ -138,13 +138,13 @@ func copyAndSplit(token []byte, fLen int, dest []byte) (string, string, []byte, dest } -func (tl *TokenList) initSystemTokens() { +func (tl *tokenList) initSystemTokens() { token := []byte(seq.TokenAll + ":") tlids := tl.Append([][]byte{token}, []int{len(seq.TokenAll)}, []*TokenLIDs{nil}) tl.allTokenLIDs = tlids[0] } -func (tl *TokenList) GetValByTID(tid uint32) []byte { +func (tl *tokenList) GetValByTID(tid uint32) []byte { tl.tidMu.RLock() defer tl.tidMu.RUnlock() @@ -152,7 +152,7 @@ func (tl *TokenList) GetValByTID(tid uint32) []byte { return tl.tidToVal[tid] } -func (tl *TokenList) Provide(tid uint32) *TokenLIDs { +func (tl *tokenList) Provide(tid uint32) *TokenLIDs { tl.tidMu.RLock() defer tl.tidMu.RUnlock() @@ -160,31 +160,31 @@ func (tl *TokenList) Provide(tid uint32) *TokenLIDs { return tl.tidToLIDs[tid] } -func (tl *TokenList) GetAllTokenLIDs() *TokenLIDs { +func (tl *tokenList) GetAllTokenLIDs() *TokenLIDs { return tl.allTokenLIDs } -func (tl *TokenList) GetTIDsByField(f string) []uint32 { +func (tl *tokenList) GetTIDsByField(f string) []uint32 { tl.fieldsMu.RLock() defer tl.fieldsMu.RUnlock() return tl.FieldTIDs[f] } -func (tl *TokenList) getTokenProvider(field string) *activeTokenProvider { +func (tl *tokenList) getTokenProvider(field string) *tokenProvider { inverseIndex := tl.GetTIDsByField(field) tl.tidMu.RLock() tidToVal := tl.tidToVal tl.tidMu.RUnlock() - return &activeTokenProvider{ + return &tokenProvider{ tidToVal: tidToVal, inverseIndex: inverseIndex, } } -func (tl *TokenList) FindPattern(ctx context.Context, t parser.Token) ([]uint32, error) { +func (tl *tokenList) FindPattern(ctx context.Context, t parser.Token) ([]uint32, error) { field := parser.GetField(t) tp := tl.getTokenProvider(field) tids, err := pattern.Search(ctx, t, tp) @@ -198,7 +198,7 @@ func getTokenHash(token []byte) uint32 { return crc32.ChecksumIEEE(token) } -func (tl *TokenList) getTokenLIDs(tokens [][]byte, fieldsLengths []int, tlids []*TokenLIDs) []tokenData { +func (tl *tokenList) getTokenLIDs(tokens [][]byte, fieldsLengths []int, tlids []*TokenLIDs) []tokenData { n := len(tl.chList) remap := make([][]int, n) for i, token := range tokens { @@ -234,7 +234,7 @@ func (tl *TokenList) getTokenLIDs(tokens [][]byte, fieldsLengths []int, tlids [] return newTokensData } -func (tl *TokenList) Append(tokens [][]byte, fieldsLengths []int, tokenLIDsPlaces []*TokenLIDs) []*TokenLIDs { +func (tl *tokenList) Append(tokens [][]byte, fieldsLengths []int, tokenLIDsPlaces []*TokenLIDs) []*TokenLIDs { newTokensData := tl.getTokenLIDs(tokens, fieldsLengths, tokenLIDsPlaces) tl.createTIDs(newTokensData) @@ -244,7 +244,7 @@ func (tl *TokenList) Append(tokens [][]byte, fieldsLengths []int, tokenLIDsPlace return tokenLIDsPlaces } -func (tl *TokenList) createTIDs(newTokensData []tokenData) { +func (tl *tokenList) createTIDs(newTokensData []tokenData) { tl.tidMu.Lock() for i, token := range newTokensData { newTokensData[i].tid = uint32(len(tl.tidToVal)) @@ -254,7 +254,7 @@ func (tl *TokenList) createTIDs(newTokensData []tokenData) { tl.tidMu.Unlock() } -func (tl *TokenList) fillFieldTIDs(newTokensData []tokenData) { +func (tl *tokenList) fillFieldTIDs(newTokensData []tokenData) { tl.fieldsMu.Lock() for _, token := range newTokensData { field := token.field @@ -263,7 +263,7 @@ func (tl *TokenList) fillFieldTIDs(newTokensData []tokenData) { tl.fieldsMu.Unlock() } -func (tl *TokenList) fillSizes(newTokensData []tokenData) { +func (tl *tokenList) fillSizes(newTokensData []tokenData) { 
tl.sizesMu.Lock() for _, token := range newTokensData { tl.fieldSizes[token.field] += uint32(len(token.value)) @@ -271,7 +271,7 @@ func (tl *TokenList) fillSizes(newTokensData []tokenData) { tl.sizesMu.Unlock() } -func (tl *TokenList) GetFieldSizes() map[string]uint32 { +func (tl *tokenList) GetFieldSizes() map[string]uint32 { tl.sizesMu.Lock() defer tl.sizesMu.Unlock() diff --git a/frac/active_writer.go b/frac/active_old/writer.go similarity index 61% rename from frac/active_writer.go rename to frac/active_old/writer.go index 95cf2fdd..96dfabe2 100644 --- a/frac/active_writer.go +++ b/frac/active_old/writer.go @@ -1,4 +1,4 @@ -package frac +package active_old import ( "os" @@ -7,19 +7,19 @@ import ( "github.com/ozontech/seq-db/storage" ) -type ActiveWriter struct { +type Writer struct { docs *FileWriter meta *FileWriter } -func NewActiveWriter(docsFile, metaFile *os.File, docsOffset, metaOffset int64, skipFsync bool) *ActiveWriter { - return &ActiveWriter{ +func NewWriter(docsFile, metaFile *os.File, docsOffset, metaOffset int64, skipFsync bool) *Writer { + return &Writer{ docs: NewFileWriter(docsFile, docsOffset, skipFsync), meta: NewFileWriter(metaFile, metaOffset, skipFsync), } } -func (a *ActiveWriter) Write(docs, meta []byte, sw *stopwatch.Stopwatch) error { +func (a *Writer) Write(docs, meta []byte, sw *stopwatch.Stopwatch) error { m := sw.Start("write_docs") offset, err := a.docs.Write(docs, sw) m.Stop() @@ -28,7 +28,6 @@ func (a *ActiveWriter) Write(docs, meta []byte, sw *stopwatch.Stopwatch) error { return err } - storage.DocBlock(meta).SetExt1(uint64(len(docs))) storage.DocBlock(meta).SetExt2(uint64(offset)) m = sw.Start("write_meta") @@ -38,7 +37,7 @@ func (a *ActiveWriter) Write(docs, meta []byte, sw *stopwatch.Stopwatch) error { return err } -func (a *ActiveWriter) Stop() { +func (a *Writer) Stop() { a.docs.Stop() a.meta.Stop() } diff --git a/frac/empty.go b/frac/empty.go deleted file mode 100644 index f4748c47..00000000 --- a/frac/empty.go +++ /dev/null @@ -1,50 +0,0 @@ -package frac - -import ( - "context" - "math" - - "github.com/ozontech/seq-db/frac/common" - "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/metric" - "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/storage" -) - -var EmptyFraction Fraction = Empty{ - info: &common.Info{ - Path: "empty", - From: math.MaxUint64, - To: 0, - }, -} - -type Empty struct { - info *common.Info -} - -func (Empty) Fetch(context.Context, []seq.ID) ([][]byte, error) { - metric.CountersTotal.WithLabelValues("empty_fraction_fetch").Inc() - return nil, nil -} - -func (Empty) Search(_ context.Context, params processor.SearchParams) (*seq.QPR, error) { - metric.CountersTotal.WithLabelValues("empty_fraction_search").Inc() - return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil -} - -func (e Empty) Info() *common.Info { - return e.info -} -func (Empty) IsIntersecting(seq.MID, seq.MID) bool { - return false -} -func (Empty) Contains(mid seq.MID) bool { - return false -} - -func (Empty) Offload(ctx context.Context, u storage.Uploader) (bool, error) { - return false, nil -} - -func (Empty) Suicide() {} diff --git a/frac/fraction.go b/frac/fraction.go index 89929c8b..1cf74f16 100644 --- a/frac/fraction.go +++ b/frac/fraction.go @@ -9,25 +9,21 @@ import ( "github.com/prometheus/client_golang/prometheus/promauto" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/metric" 
"github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/storage" ) type Fraction interface { - Info() *common.Info + Info() *Info IsIntersecting(from seq.MID, to seq.MID) bool Contains(mid seq.MID) bool Fetch(context.Context, []seq.ID) ([][]byte, error) Search(context.Context, processor.SearchParams) (*seq.QPR, error) - Offload(ctx context.Context, u storage.Uploader) (bool, error) - Suicide() } var ( - fetcherStagesSeconds = promauto.NewHistogramVec(prometheus.HistogramOpts{ + FetcherStagesSeconds = promauto.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "seq_db_store", Subsystem: "fetcher", Name: "fraction_stages_seconds", @@ -53,7 +49,7 @@ var ( }, []string{"stage", "fraction_type"}) ) -func fractionSearchMetric( +func FractionSearchMetric( params processor.SearchParams, ) *prometheus.HistogramVec { if params.HasAgg() { @@ -65,7 +61,7 @@ func fractionSearchMetric( return fractionRegSearchSec } -func fracToString(f Fraction, fracType string) string { +func FracToString(f Fraction, fracType string) string { info := f.Info() s := fmt.Sprintf( "%s fraction name=%s, creation time=%s, from=%s, to=%s, %s", diff --git a/frac/common/info.go b/frac/info.go similarity index 95% rename from frac/common/info.go rename to frac/info.go index ad3c3a57..cb9ba3c2 100644 --- a/frac/common/info.go +++ b/frac/info.go @@ -1,4 +1,4 @@ -package common +package frac import ( "fmt" @@ -79,6 +79,15 @@ func (s *Info) BuildDistribution(mids []uint64) { } } +func (s *Info) BuildDistributionWithIDs(ids []seq.ID) { + if !s.InitEmptyDistribution() { + return + } + for _, id := range ids { + s.Distribution.Add(id.MID) + } +} + func (s *Info) InitEmptyDistribution() bool { from := time.UnixMilli(int64(s.From)) creationTime := time.UnixMilli(int64(s.CreationTime)) diff --git a/frac/common/seal_params.go b/frac/seal_params.go similarity index 95% rename from frac/common/seal_params.go rename to frac/seal_params.go index c19365f9..b0f9d462 100644 --- a/frac/common/seal_params.go +++ b/frac/seal_params.go @@ -1,4 +1,4 @@ -package common +package frac type SealParams struct { IDsZstdLevel int diff --git a/frac/sealed/block_info.go b/frac/sealed/block_info.go index 8436f91e..a79fdffa 100644 --- a/frac/sealed/block_info.go +++ b/frac/sealed/block_info.go @@ -6,14 +6,14 @@ import ( "go.uber.org/zap" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/logger" ) const seqDBMagic = "SEQM" type BlockInfo struct { - Info *common.Info + Info *frac.Info } func (b *BlockInfo) Pack(buf []byte) []byte { @@ -33,7 +33,7 @@ func (b *BlockInfo) Unpack(data []byte) error { return errors.New("seq-db index file header corrupted") } - b.Info = &common.Info{} + b.Info = &frac.Info{} if err := json.Unmarshal(data[4:], b.Info); err != nil { return errors.New("stats unmarshaling error") } diff --git a/frac/sealed_index.go b/frac/sealed/index.go similarity index 98% rename from frac/sealed_index.go rename to frac/sealed/index.go index f97c6e84..8c6c1777 100644 --- a/frac/sealed_index.go +++ b/frac/sealed/index.go @@ -1,4 +1,4 @@ -package frac +package sealed import ( "context" @@ -7,7 +7,7 @@ import ( "go.uber.org/zap" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" @@ -24,8 +24,8 @@ import ( type sealedDataProvider struct { ctx context.Context - info *common.Info - config *Config + info *frac.Info + config 
*frac.Config idsTable *seqids.Table idsProvider *seqids.Provider @@ -85,7 +85,7 @@ func (dp *sealedDataProvider) Fetch(ids []seq.ID) ([][]byte, error) { sw := stopwatch.New() defer sw.Export( - fetcherStagesSeconds, + frac.FetcherStagesSeconds, stopwatch.SetLabel("fraction_type", dp.fractionTypeLabel), ) @@ -107,7 +107,7 @@ func (dp *sealedDataProvider) Search(params processor.SearchParams) (*seq.QPR, e sw := stopwatch.New() defer sw.Export( - fractionSearchMetric(params), + frac.FractionSearchMetric(params), stopwatch.SetLabel("fraction_type", dp.fractionTypeLabel), ) diff --git a/frac/index_cache.go b/frac/sealed/index_cache.go similarity index 97% rename from frac/index_cache.go rename to frac/sealed/index_cache.go index 3dc5b3a4..19adeddb 100644 --- a/frac/index_cache.go +++ b/frac/sealed/index_cache.go @@ -1,4 +1,4 @@ -package frac +package sealed import ( "github.com/ozontech/seq-db/cache" diff --git a/frac/sealed_loader.go b/frac/sealed/loader.go similarity index 93% rename from frac/sealed_loader.go rename to frac/sealed/loader.go index 83a7f060..1b332a44 100644 --- a/frac/sealed_loader.go +++ b/frac/sealed/loader.go @@ -1,12 +1,11 @@ -package frac +package sealed import ( "time" "go.uber.org/zap" - "github.com/ozontech/seq-db/frac/common" - "github.com/ozontech/seq-db/frac/sealed" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/logger" @@ -21,7 +20,7 @@ type Loader struct { blockBuf []byte } -func (l *Loader) Load(blocksData *sealed.BlocksData, info *common.Info, indexReader *storage.IndexReader) { +func (l *Loader) Load(blocksData *BlocksData, info *frac.Info, indexReader *storage.IndexReader) { t := time.Now() l.reader = indexReader @@ -78,7 +77,7 @@ func (l *Loader) loadIDs() (idsTable seqids.Table, blocksOffsets []uint64, err e return idsTable, nil, err } - b := sealed.BlockOffsets{} + b := BlockOffsets{} if err := b.Unpack(result); err != nil { return idsTable, nil, err } diff --git a/frac/sealed/preloaded_data.go b/frac/sealed/preloaded_data.go index 1b43b865..76442c94 100644 --- a/frac/sealed/preloaded_data.go +++ b/frac/sealed/preloaded_data.go @@ -1,14 +1,14 @@ package sealed import ( - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" ) type PreloadedData struct { - Info *common.Info + Info *frac.Info BlocksData BlocksData TokenTable token.Table } diff --git a/frac/remote.go b/frac/sealed/remote.go similarity index 81% rename from frac/remote.go rename to frac/sealed/remote.go index d5e85340..e4ad8584 100644 --- a/frac/remote.go +++ b/frac/sealed/remote.go @@ -1,4 +1,4 @@ -package frac +package sealed import ( "context" @@ -10,14 +10,12 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/metric" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" @@ -25,7 +23,7 @@ import ( ) var ( - _ Fraction = (*Remote)(nil) + _ frac.Fraction = (*Remote)(nil) 
) // Remote fraction is a fraction that is backed by remote storage. @@ -36,14 +34,11 @@ var ( type Remote struct { ctx context.Context - Config *Config + Config *frac.Config BaseFileName string - info *common.Info - - useMu sync.RWMutex - suicided bool + info *frac.Info docsFile storage.ImmutableFile docsCache *cache.Cache[[]byte] @@ -55,7 +50,7 @@ type Remote struct { loadMu *sync.RWMutex isLoaded bool - blocksData sealed.BlocksData + blocksData BlocksData s3cli *s3.Client readLimiter *storage.ReadLimiter @@ -67,8 +62,8 @@ func NewRemote( readLimiter *storage.ReadLimiter, indexCache *IndexCache, docsCache *cache.Cache[[]byte], - info *common.Info, - config *Config, + info *frac.Info, + config *frac.Config, s3cli *s3.Client, ) *Remote { f := &Remote{ @@ -116,57 +111,58 @@ func (f *Remote) Contains(mid seq.MID) bool { } func (f *Remote) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Fetch(ctx, ids) + dp, err := f.createDataProvider(ctx) + if err != nil { + return nil, err } + defer dp.release() + return dp.Fetch(ids) } func (f *Remote) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Search(ctx, params) + dp, err := f.createDataProvider(ctx) + if err != nil { + return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, err } + defer dp.release() + return dp.Search(params) } -func (f *Remote) DataProvider(ctx context.Context) (*sealedDataProvider, func()) { - f.useMu.RLock() - - if f.suicided { - metric.CountersTotal.WithLabelValues("fraction_suicided").Inc() - f.useMu.RUnlock() - return nil, func() {} - } - - defer func() { - if panicData := recover(); panicData != nil { - f.useMu.RUnlock() - panic(panicData) - } - }() - +func (f *Remote) createDataProvider(ctx context.Context) (*sealedDataProvider, error) { if err := f.load(); err != nil { logger.Error( "will create empty data provider: cannot load remote fraction", zap.String("fraction", f.Info().Name()), zap.Error(err), ) - f.useMu.RUnlock() - return nil, func() {} + return nil, err } + return &sealedDataProvider{ + ctx: ctx, + info: f.info, + config: f.Config, + docsReader: &f.docsReader, + blocksOffsets: f.blocksData.BlocksOffsets, + lidsTable: f.blocksData.LIDsTable, + lidsLoader: lids.NewLoader(&f.indexReader, f.indexCache.LIDs), + tokenBlockLoader: token.NewBlockLoader(f.BaseFileName, &f.indexReader, f.indexCache.Tokens), + tokenTableLoader: token.NewTableLoader(f.BaseFileName, &f.indexReader, f.indexCache.TokenTable), - dp := f.createDataProvider(ctx) - return dp, func() { - dp.release() - f.useMu.RUnlock() - } + idsTable: &f.blocksData.IDsTable, + idsProvider: seqids.NewProvider( + &f.indexReader, + f.indexCache.MIDs, + f.indexCache.RIDs, + f.indexCache.Params, + &f.blocksData.IDsTable, + f.info.BinaryDataVer, + ), + }, nil } -func (f *Remote) Info() *common.Info { +func (f *Remote) Info() *frac.Info { return f.info } @@ -174,15 +170,7 @@ func (f *Remote) IsIntersecting(from, to seq.MID) bool { return f.info.IsIntersecting(from, to) } -func (f *Remote) Offload(context.Context, storage.Uploader) (bool, error) { - panic("BUG: remote fraction cannot be offloaded") -} - func (f *Remote) Suicide() { - f.useMu.Lock() - f.suicided = true - f.useMu.Unlock() - util.MustRemoveFileByPath(f.BaseFileName + consts.RemoteFractionSuffix) f.docsCache.Release() @@ -205,33 +193,7 @@ func (f *Remote) 
Suicide() { } func (f *Remote) String() string { - return fracToString(f, "remote") -} - -func (f *Remote) createDataProvider(ctx context.Context) *sealedDataProvider { - return &sealedDataProvider{ - ctx: ctx, - fractionTypeLabel: "remote", - - info: f.info, - config: f.Config, - docsReader: &f.docsReader, - blocksOffsets: f.blocksData.BlocksOffsets, - lidsTable: f.blocksData.LIDsTable, - lidsLoader: lids.NewLoader(&f.indexReader, f.indexCache.LIDs), - tokenBlockLoader: token.NewBlockLoader(f.BaseFileName, &f.indexReader, f.indexCache.Tokens), - tokenTableLoader: token.NewTableLoader(f.BaseFileName, &f.indexReader, f.indexCache.TokenTable), - - idsTable: &f.blocksData.IDsTable, - idsProvider: seqids.NewProvider( - &f.indexReader, - f.indexCache.MIDs, - f.indexCache.RIDs, - f.indexCache.Params, - &f.blocksData.IDsTable, - f.info.BinaryDataVer, - ), - } + return frac.FracToString(f, "remote") } func (f *Remote) load() error { diff --git a/frac/sealed.go b/frac/sealed/sealed.go similarity index 80% rename from frac/sealed.go rename to frac/sealed/sealed.go index 0755164f..af4f328f 100644 --- a/frac/sealed.go +++ b/frac/sealed/sealed.go @@ -1,4 +1,4 @@ -package frac +package sealed import ( "context" @@ -12,32 +12,27 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/metric" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/util" ) var ( - _ Fraction = (*Sealed)(nil) + _ frac.Fraction = (*Sealed)(nil) ) type Sealed struct { - Config *Config + Config *frac.Config BaseFileName string - info *common.Info - - useMu sync.RWMutex - suicided bool + info *frac.Info docsFile *os.File docsCache *cache.Cache[[]byte] @@ -49,7 +44,7 @@ type Sealed struct { loadMu *sync.RWMutex isLoaded bool - blocksData sealed.BlocksData + blocksData BlocksData readLimiter *storage.ReadLimiter @@ -65,13 +60,13 @@ const ( HalfRemove ) -func NewSealed( +func New( baseFile string, readLimiter *storage.ReadLimiter, indexCache *IndexCache, docsCache *cache.Cache[[]byte], - info *common.Info, - config *Config, + info *frac.Info, + config *frac.Config, ) *Sealed { f := &Sealed{ loadMu: &sync.RWMutex{}, @@ -127,13 +122,13 @@ func (f *Sealed) openDocs() { } } -func NewSealedPreloaded( +func NewPreloaded( baseFile string, - preloaded *sealed.PreloadedData, + preloaded *PreloadedData, rl *storage.ReadLimiter, indexCache *IndexCache, docsCache *cache.Cache[[]byte], - config *Config, + config *frac.Config, ) *Sealed { f := &Sealed{ blocksData: preloaded.BlocksData, @@ -189,20 +184,15 @@ func (f *Sealed) load() { // Offload saves `.docs` (or `.sdocs`) and `.index` files into remote storage. // It does not free any of the occupied memory (nor on disk nor in memory). 
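Editor's note (illustrative sketch, not part of the patch): after this change Offload opens the docs and index files under loadMu and uploads both through an errgroup, and the useMu/suicided guard is gone. A caller offloads a sealed fraction and then reopens it as a remote one, as RemoteFractionTestSuite does later in this diff; the helper and package below are hypothetical and only mirror those calls.

package example // hypothetical helper, sketched from the test code in this patch

import (
	"context"

	"github.com/ozontech/seq-db/cache"
	"github.com/ozontech/seq-db/frac"
	"github.com/ozontech/seq-db/frac/sealed"
	"github.com/ozontech/seq-db/storage"
	"github.com/ozontech/seq-db/storage/s3"
)

// offloadAndReopen uploads a sealed fraction to object storage and reopens it
// as a remote fraction, mirroring RemoteFractionTestSuite.SetupTest later in this diff.
func offloadAndReopen(ctx context.Context, f *sealed.Sealed, s3cli *s3.Client, ic *sealed.IndexCache, cfg *frac.Config) (*sealed.Remote, error) {
	offloaded, err := f.Offload(ctx, s3.NewUploader(s3cli))
	if err != nil {
		return nil, err
	}
	if !offloaded {
		return nil, nil // nothing was uploaded, keep serving the local sealed fraction
	}
	return sealed.NewRemote(
		ctx,
		f.BaseFileName,
		storage.NewReadLimiter(1, nil),
		ic,
		cache.NewCache[[]byte](nil, nil),
		f.Info(),
		cfg,
		s3cli,
	), nil
}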
func (f *Sealed) Offload(ctx context.Context, u storage.Uploader) (bool, error) { - f.useMu.Lock() - defer f.useMu.Unlock() - g, gctx := errgroup.WithContext(ctx) - g.Go(func() error { - f.openDocs() - return u.Upload(gctx, f.docsFile) - }) + f.loadMu.Lock() + f.openDocs() + f.openIndex() + f.loadMu.Unlock() - g.Go(func() error { - f.openIndex() - return u.Upload(gctx, f.indexFile) - }) + g.Go(func() error { return u.Upload(gctx, f.docsFile) }) + g.Go(func() error { return u.Upload(gctx, f.indexFile) }) if err := g.Wait(); err != nil { return true, err @@ -220,15 +210,25 @@ func (f *Sealed) Offload(ctx context.Context, u storage.Uploader) (bool, error) return true, nil } -func (f *Sealed) Suicide() { - f.useMu.Lock() - f.suicided = true - f.useMu.Unlock() +func (f *Sealed) Release() { + if f.docsFile != nil { + if err := f.docsFile.Close(); err != nil { + logger.Error("can't close docs file", zap.String("frac", f.BaseFileName), zap.Error(err)) + } + } - f.close("suicide") + if f.indexFile != nil { + if err := f.indexFile.Close(); err != nil { + logger.Error("can't close index file", zap.String("frac", f.BaseFileName), zap.Error(err)) + } + } f.docsCache.Release() f.indexCache.Release() +} + +func (f *Sealed) Suicide() { + f.Release() // make some atomic magic, to be more stable on removing fractions oldPath := f.BaseFileName + consts.DocsFileSuffix @@ -294,82 +294,26 @@ func (f *Sealed) Suicide() { } } -func (f *Sealed) close(hint string) { - f.loadMu.Lock() - defer f.loadMu.Unlock() - - if !f.isLoaded { - return - } - - if f.docsFile != nil { // docs file may not be opened since it's loaded lazily - if err := f.docsFile.Close(); err != nil { - logger.Error("can't close docs file", - zap.String("frac", f.BaseFileName), - zap.String("type", "sealed"), - zap.String("hint", hint), - zap.Error(err)) - } - } - - if err := f.indexFile.Close(); err != nil { - logger.Error("can't close index file", - zap.String("frac", f.BaseFileName), - zap.String("type", "sealed"), - zap.String("hint", hint), - zap.Error(err)) - } -} - func (f *Sealed) String() string { - return fracToString(f, "sealed") + return frac.FracToString(f, "sealed") } func (f *Sealed) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Fetch(ctx, ids) - } + dp := f.createDataProvider(ctx) + defer dp.release() + return dp.Fetch(ids) } func (f *Sealed) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - dp, release := f.DataProvider(ctx) - defer release() - if dp == nil { - return EmptyFraction.Search(ctx, params) - } - return dp.Search(params) -} - -func (f *Sealed) DataProvider(ctx context.Context) (*sealedDataProvider, func()) { - f.useMu.RLock() - - if f.suicided { - metric.CountersTotal.WithLabelValues("fraction_suicided").Inc() - f.useMu.RUnlock() - return nil, func() {} - } - - defer func() { - if panicData := recover(); panicData != nil { - f.useMu.RUnlock() - panic(panicData) - } - }() - - f.load() - dp := f.createDataProvider(ctx) + defer dp.release() - return dp, func() { - dp.release() - f.useMu.RUnlock() - } + return dp.Search(params) } func (f *Sealed) createDataProvider(ctx context.Context) *sealedDataProvider { + f.load() return &sealedDataProvider{ ctx: ctx, fractionTypeLabel: "sealed", @@ -395,7 +339,7 @@ func (f *Sealed) createDataProvider(ctx context.Context) *sealedDataProvider { } } -func (f *Sealed) Info() *common.Info { +func (f *Sealed) Info() *frac.Info { return f.info } @@ 
-407,10 +351,7 @@ func (f *Sealed) IsIntersecting(from, to seq.MID) bool { return f.info.IsIntersecting(from, to) } -func loadHeader( - indexFile storage.ImmutableFile, - indexReader storage.IndexReader, -) *common.Info { +func loadHeader(indexFile storage.ImmutableFile, indexReader storage.IndexReader) *frac.Info { block, _, err := indexReader.ReadIndexBlock(0, nil) if err != nil { logger.Fatal( @@ -420,7 +361,7 @@ func loadHeader( ) } - var bi sealed.BlockInfo + var bi BlockInfo if err := bi.Unpack(block); err != nil { logger.Fatal( "error unpacking info block", diff --git a/frac/sealed/sealing/blocks_builder.go b/frac/sealed/sealing/blocks_builder.go index 14a5cac7..51fa9e2e 100644 --- a/frac/sealed/sealing/blocks_builder.go +++ b/frac/sealed/sealing/blocks_builder.go @@ -4,6 +4,7 @@ import ( "encoding/binary" "errors" "iter" + "strings" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" @@ -104,7 +105,10 @@ func (bb *blocksBuilder) BuildTokenBlocks( } // Entry covers TIDs from currentTID to min(fieldMaxTID, block.ext.maxTID) entry := createTokenTableEntry(currentTID, fieldMaxTID, idx, block) - table = append(table, token.FieldTable{Field: fieldName, Entries: []*token.TableEntry{entry}}) + table = append(table, token.FieldTable{ + Field: strings.Clone(fieldName), + Entries: []*token.TableEntry{entry}, + }) currentTID += entry.ValCount } diff --git a/frac/sealed/sealing/blocks_builder_test.go b/frac/sealed/sealing/blocks_builder_test.go index 80892ca2..8205f707 100644 --- a/frac/sealed/sealing/blocks_builder_test.go +++ b/frac/sealed/sealing/blocks_builder_test.go @@ -7,14 +7,14 @@ import ( "github.com/stretchr/testify/assert" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/seq" ) type mockSource struct { - info common.Info + info frac.Info tokens [][]byte fields []string fieldMaxTIDs []uint32 @@ -25,7 +25,7 @@ type mockSource struct { lastError error } -func (m *mockSource) Info() common.Info { return m.info } +func (m *mockSource) Info() frac.Info { return m.info } func (m *mockSource) Fields() iter.Seq2[string, uint32] { return func(yield func(string, uint32) bool) { diff --git a/frac/sealed/sealing/index.go b/frac/sealed/sealing/index.go index 491c7233..871de9ef 100644 --- a/frac/sealed/sealing/index.go +++ b/frac/sealed/sealing/index.go @@ -11,7 +11,7 @@ import ( "github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" @@ -29,10 +29,10 @@ import ( // - Registry for quick access to block locations // - PreloadedData structures for fast initialization instance of sealed fraction type IndexSealer struct { - lastErr error // Last error encountered during processing - buf1 []byte // Reusable buffer for packing raw data before compression - buf2 []byte // Reusable buffer for compressed data - params common.SealParams // Configuration parameters for sealing process + lastErr error // Last error encountered during processing + buf1 []byte // Reusable buffer for packing raw data before compression + buf2 []byte // Reusable buffer for compressed data + params frac.SealParams // Configuration parameters for sealing process // PreloadedData structures built during sealing for 
fast initialization of sealed fraction idsTable seqids.Table // Table mapping document IDs to blocks @@ -41,7 +41,7 @@ type IndexSealer struct { } // NewIndexSealer creates a new IndexSealer instance with the given parameters. -func NewIndexSealer(params common.SealParams) *IndexSealer { +func NewIndexSealer(params frac.SealParams) *IndexSealer { return &IndexSealer{ params: params, buf1: make([]byte, 0, consts.RegularBlockSize), diff --git a/frac/sealed/sealing/sealer.go b/frac/sealed/sealing/sealer.go index 3eb00761..8e63c439 100644 --- a/frac/sealed/sealing/sealer.go +++ b/frac/sealed/sealing/sealer.go @@ -5,9 +5,10 @@ import ( "iter" "os" "path/filepath" + "slices" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/util" @@ -16,7 +17,7 @@ import ( // Source interface defines the contract for data sources that can be sealed. // Provides access to all necessary data components for index creation. type Source interface { - Info() *common.Info // Fraction metadata information + Info() *frac.Info // Fraction metadata information IDsBlocks(size int) iter.Seq2[[]seq.ID, []seq.DocPos] // Ordered sequence of document IDs and their positions, divided into blocks TokenBlocks(size int) iter.Seq[[][]byte] // Ordered sequence of tokens divided into blocks Fields() iter.Seq2[string, uint32] // Ordered sequence of fields with their max field's TID value @@ -39,7 +40,7 @@ type Source interface { // Returns: // - *sealed.PreloadedData: Preloaded data structures for initialization of sealed fraction // - error: Any error encountered during the sealing process -func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { +func Seal(src Source, params frac.SealParams) (*sealed.PreloadedData, error) { info := src.Info() // Validate that we're not sealing an empty fraction @@ -84,15 +85,19 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { // Ensure directory metadata is synced to disk util.MustSyncPath(filepath.Dir(info.Path)) + // copy this because it uses active fraction structures under the hood that must be released + tokenTable := indexSealer.TokenTable() + blocksOffsets := slices.Clone(src.BlocksOffsets()) + // Build preloaded data structure for fast query access lidsTable := indexSealer.LIDsTable() preloaded := sealed.PreloadedData{ Info: info, - TokenTable: indexSealer.TokenTable(), + TokenTable: tokenTable, BlocksData: sealed.BlocksData{ IDsTable: indexSealer.IDsTable(), LIDsTable: &lidsTable, - BlocksOffsets: src.BlocksOffsets(), + BlocksOffsets: blocksOffsets, }, } diff --git a/frac/sealed/sealing/sort_docs.go b/frac/sealed/sealing/sort_docs.go new file mode 100644 index 00000000..0f0f8ac2 --- /dev/null +++ b/frac/sealed/sealing/sort_docs.go @@ -0,0 +1,164 @@ +package sealing + +import ( + "encoding/binary" + "io" + "iter" + "os" + "path/filepath" + "time" + + "github.com/alecthomas/units" + "github.com/ozontech/seq-db/bytespool" + "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/util" + "go.uber.org/zap" +) + +type DocsSource interface { + Docs() iter.Seq2[seq.ID, []byte] + LastError() error +} + +// SortDocs sorts documents and writes them in compressed form to disk. 
+// Creates a temporary file that is then renamed to the final one. +func SortDocs(name string, params frac.SealParams, ds DocsSource) ([]uint64, []seq.DocPos, int, error) { + start := time.Now() + logger.Info("sorting docs...") + + // Create temporary file for sorted documents + sdocsFile, err := os.Create(name + consts.SdocsTmpFileSuffix) + if err != nil { + return nil, nil, 0, err + } + + bw := bytespool.AcquireWriterSize(sdocsFile, int(units.MB)) + defer bytespool.ReleaseWriter(bw) + + // Group documents into blocks + blocks := docBlocks(ds.Docs(), params.DocBlockSize) + + // Write blocks and get new offsets and positions + blocksOffsets, positions, rawSize, onDiskSize, err := writeDocs(blocks, bw, params) + + if err := util.CollapseErrors([]error{ds.LastError(), err}); err != nil { + return nil, nil, 0, err + } + if err := bw.Flush(); err != nil { + return nil, nil, 0, err + } + + // Synchronize and rename file + if err := sdocsFile.Sync(); err != nil { + return nil, nil, 0, err + } + if err := sdocsFile.Close(); err != nil { + return nil, nil, 0, err + } + if err := os.Rename(sdocsFile.Name(), name+consts.SdocsFileSuffix); err != nil { + return nil, nil, 0, err + } + if err := util.SyncPath(filepath.Dir(name)); err != nil { + return nil, nil, 0, err + } + + // Log compression statistics + ratio := float64(rawSize) / float64(onDiskSize) + logger.Info("docs sorting stat", + util.ZapUint64AsSizeStr("raw", uint64(rawSize)), + util.ZapUint64AsSizeStr("compressed", uint64(onDiskSize)), + util.ZapFloat64WithPrec("ratio", ratio, 2), + zap.Int("blocks_count", len(blocksOffsets)), + zap.Int("docs_total", len(positions)), + util.ZapDurationWithPrec("write_duration_ms", time.Since(start), "ms", 0), + ) + + return blocksOffsets, positions, onDiskSize, nil +} + +// writeDocs compresses and writes document blocks, calculating new offsets +// and collecting document positions. +func writeDocs( + blocks iter.Seq2[[]byte, []seq.DocPos], + w io.Writer, + params frac.SealParams, +) ([]uint64, []seq.DocPos, int, int, error) { + offset := 0 + buf := make([]byte, 0) + blocksOffsets := make([]uint64, 0) + allPositions := make([]seq.DocPos, 0) + + rawSize := 0 + diskSize := 0 + + // Process each document block + for block, positions := range blocks { + allPositions = append(allPositions, positions...) + blocksOffsets = append(blocksOffsets, uint64(offset)) + + // Compress document block + buf = storage.CompressDocBlock(block, buf[:0], params.DocBlocksZstdLevel) + + rawSize += len(block) + diskSize += len(buf) + + if _, err := w.Write(buf); err != nil { + return nil, nil, 0, 0, err + } + offset += len(buf) + } + + return blocksOffsets, allPositions, rawSize, diskSize, nil +} + +// docBlocks groups documents into fixed-size blocks. +// Returns an iterator for blocks and corresponding document positions. 
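Editor's note (illustrative sketch, not part of the patch): docBlocks below lays each document out as a little-endian uint32 length followed by the document bytes, and addresses it with seq.PackDocPos(blockIndex, offsetInBlock). The snippet decodes one already-decompressed block under that assumption; it does not reproduce the repository's actual reader.

package example // hypothetical; decodes the layout produced by docBlocks/writeDocs

import (
	"encoding/binary"
	"fmt"
)

// docAt returns the document stored at the given byte offset inside a raw
// (already decompressed) doc block: a 4-byte little-endian size, then the payload.
func docAt(block []byte, offset uint64) ([]byte, error) {
	if offset+4 > uint64(len(block)) {
		return nil, fmt.Errorf("offset %d out of range for block of %d bytes", offset, len(block))
	}
	size := uint64(binary.LittleEndian.Uint32(block[offset:]))
	start := offset + 4
	if start+size > uint64(len(block)) {
		return nil, fmt.Errorf("document at offset %d is truncated", offset)
	}
	return block[start : start+size], nil
}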
+func docBlocks(docs iter.Seq2[seq.ID, []byte], blockSize int) iter.Seq2[[]byte, []seq.DocPos] { + return func(yield func([]byte, []seq.DocPos) bool) { + const defaultBlockSize = 128 * units.KiB + if blockSize <= 0 { + blockSize = int(defaultBlockSize) + logger.Warn("document block size not specified", zap.Int("default_size", blockSize)) + } + + var ( + prev seq.ID + index uint32 // Current block index + ) + pos := make([]seq.DocPos, 0) + buf := make([]byte, 0, blockSize) + + // Iterate through documents + for id, doc := range docs { + if id == prev { + // Duplicate IDs (for nested indexes) - store document once, + // but create positions for each LID + pos = append(pos, seq.PackDocPos(index, uint64(len(buf)))) + continue + } + prev = id + + // If block is full, yield it + if len(buf) >= blockSize { + if !yield(buf, pos) { + return + } + index++ + buf = buf[:0] + pos = pos[:0] + } + + // Add document position + pos = append(pos, seq.PackDocPos(index, uint64(len(buf)))) + + // Write document size and the document itself + buf = binary.LittleEndian.AppendUint32(buf, uint32(len(doc))) + buf = append(buf, doc...) + } + yield(buf, pos) + } +} diff --git a/frac/fraction_test.go b/frac/tests/fraction_test.go similarity index 87% rename from frac/fraction_test.go rename to frac/tests/fraction_test.go index 920445ce..4b6d06d2 100644 --- a/frac/fraction_test.go +++ b/frac/tests/fraction_test.go @@ -1,4 +1,4 @@ -package frac +package tests import ( "context" @@ -14,14 +14,16 @@ import ( "testing" "time" - "github.com/alecthomas/units" "github.com/johannesboyne/gofakes3" "github.com/johannesboyne/gofakes3/backend/s3mem" "github.com/stretchr/testify/suite" "github.com/ozontech/seq-db/cache" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/frac/sealed/seqids" @@ -34,31 +36,34 @@ import ( "github.com/ozontech/seq-db/tokenizer" ) +// TODO: add separate tests for sorted and unsorted docs type FractionTestSuite struct { suite.Suite tmpDir string - config *Config + config *frac.Config mapping seq.Mapping tokenizers map[seq.TokenizerType]tokenizer.Tokenizer - activeIndexer *ActiveIndexer - sealParams common.SealParams + activeIndexer *active_old.Indexer + stopIndexer func() + sealParams frac.SealParams - fraction Fraction + fraction frac.Fraction insertDocuments func(docs ...[]string) } func (s *FractionTestSuite) SetupSuiteCommon() { - s.activeIndexer = NewActiveIndexer(4, 10) - s.activeIndexer.Start() + s.activeIndexer, s.stopIndexer = active_old.NewIndexer(4, 10) } func (s *FractionTestSuite) TearDownSuiteCommon() { - s.activeIndexer.Stop() + s.stopIndexer() } func (s *FractionTestSuite) SetupTestCommon() { - s.config = &frac.Config{ + // SkipSortDocs: true, + } s.tokenizers = map[seq.TokenizerType]tokenizer.Tokenizer{ seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(20, false, true), seq.TokenizerTypeText: tokenizer.NewTextTokenizer(20, false, true, 100), @@ -80,14 +85,14 @@ func (s *FractionTestSuite) SetupTestCommon() { "spans.span_id": seq.NewSingleType(seq.TokenizerTypeKeyword, "", 0), "v": seq.NewSingleType(seq.TokenizerTypeKeyword, "", 0), } - s.sealParams = common.SealParams{ + s.sealParams = frac.SealParams{ IDsZstdLevel: 1,
LIDsZstdLevel: 1, TokenListZstdLevel: 1, DocsPositionsZstdLevel: 1, TokenTableZstdLevel: 1, DocBlocksZstdLevel: 1, - DocBlockSize: 128 * int(units.KiB), + DocBlockSize: 128, // Using a small block size to test multi-block sorting output. } var err error @@ -96,6 +101,9 @@ func (s *FractionTestSuite) SetupTestCommon() { } func (s *FractionTestSuite) TearDownTestCommon() { + if s.fraction != nil { + s.fraction = nil + } err := os.RemoveAll(s.tmpDir) s.NoError(err, "Failed to remove tmp dir") } @@ -1042,23 +1050,36 @@ func (s *FractionTestSuite) TestFractionInfo() { // these checks should not break without a reason // but if compression/marshalling has changed, expected values can be updated accordingly s.Require().Equal(uint32(5), info.DocsTotal, "doc total doesn't match") - // it varies depending on params and docs shuffled - s.Require().True(info.DocsOnDisk > uint64(200) && info.DocsOnDisk < uint64(300), - "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) s.Require().Equal(uint64(583), info.DocsRaw, "doc raw doesn't match") s.Require().Equal(seq.MID(946731625000), info.From, "from doesn't match") s.Require().Equal(seq.MID(946731654000), info.To, "to doesn't match") switch s.fraction.(type) { - case *Active: - s.Require().True(info.MetaOnDisk >= uint64(250) && info.MetaOnDisk <= uint64(350), + case *active_old.Active: + // it varies depending on params and docs shuffled + s.Require().True(info.DocsOnDisk > uint64(450) && info.DocsOnDisk < uint64(500), + "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) + s.Require().True(info.MetaOnDisk >= uint64(450) && info.MetaOnDisk <= uint64(550), "meta on disk doesn't match. actual value: %d", info.MetaOnDisk) s.Require().Equal(uint64(0), info.IndexOnDisk, "index on disk doesn't match") - case *Sealed: + case *active.Active: + // it varies depending on params and docs shuffled + s.Require().True(info.DocsOnDisk > uint64(450) && info.DocsOnDisk < uint64(500), + "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) + s.Require().True(info.MetaOnDisk >= uint64(450) && info.MetaOnDisk <= uint64(550), + "meta on disk doesn't match. actual value: %d", info.MetaOnDisk) + s.Require().Equal(uint64(0), info.IndexOnDisk, "index on disk doesn't match") + case *sealed.Sealed: + // it varies depending on params and docs shuffled and docs sorting + s.Require().True(info.DocsOnDisk > uint64(460) && info.DocsOnDisk < uint64(540), + "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") s.Require().True(info.IndexOnDisk > uint64(1400) && info.IndexOnDisk < uint64(1600), "index on disk doesn't match. actual value: %d", info.MetaOnDisk) - case *Remote: + case *sealed.Remote: + // it varies depending on params and docs shuffled and docs sorting + s.Require().True(info.DocsOnDisk > uint64(460) && info.DocsOnDisk < uint64(540), + "doc on disk doesn't match. actual value: %d", info.DocsOnDisk) s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") s.Require().True(info.IndexOnDisk > uint64(1400) && info.IndexOnDisk < uint64(1500), "index on disk doesn't match. 
actual value: %d", info.MetaOnDisk) @@ -1190,7 +1211,7 @@ func (s *FractionTestSuite) AssertSearchWithSearchParams( for i, fetchedDoc := range fetchedDocs { expectedDoc := originalDocs[expectedIndexes[i]] - s.Require().Equal(expectedDoc, fetchedDoc, "doc at index %d doesn't match", i) + s.Require().Equal(expectedDoc, fetchedDoc, "doc at index %d doesn't match (order %d)", i, order) } } } @@ -1249,61 +1270,77 @@ func (s *FractionTestSuite) AssertHist( } } -func (s *FractionTestSuite) newActive(bulks ...[]string) *Active { +func (s *FractionTestSuite) newActive(bulks ...[]string) *active.Active { baseName := filepath.Join(s.tmpDir, "test_fraction") - active := NewActive( + a := active.New( baseName, - s.activeIndexer, + s.config, + 4, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - s.config, ) - var wg sync.WaitGroup + s.AppendBulks(a, bulks...) - for _, docs := range bulks { - docsCopy := slices.Clone(docs) - rand.Shuffle(len(docsCopy), func(i, j int) { - docsCopy[i], docsCopy[j] = docsCopy[j], docsCopy[i] - }) + return a +} + +type appender interface { + Append([]byte, []byte, *sync.WaitGroup) error +} + +func (s *FractionTestSuite) AppendBulks(a appender, bulks ...[]string) { + var wg sync.WaitGroup - idx := 0 - readNext := func() ([]byte, error) { - if idx >= len(docsCopy) { - return nil, nil + for _, bulk := range bulks { + bulkSize := (len(bulk)-1)/3 + 1 + for len(bulk) > 0 { + l := min(bulkSize, len(bulk)) + docs := bulk[:l] + bulk = bulk[l:] + + docsCopy := slices.Clone(docs) + rand.Shuffle(len(docsCopy), func(i, j int) { + docsCopy[i], docsCopy[j] = docsCopy[j], docsCopy[i] + }) + + idx := 0 + readNext := func() ([]byte, error) { + if idx >= len(docsCopy) { + return nil, nil + } + d := []byte(docsCopy[idx]) + idx++ + return d, nil } - d := []byte(docsCopy[idx]) - idx++ - return d, nil - } - proc := indexer.NewProcessor(s.mapping, s.tokenizers, 0, 0, 0) - compressor := indexer.GetDocsMetasCompressor(3, 3) - _, binaryDocs, binaryMeta, err := proc.ProcessBulk(time.Now(), nil, nil, readNext) - s.Require().NoError(err, "processing bulk failed") + proc := indexer.NewProcessor(s.mapping, s.tokenizers, 0, 0, 0) + compressor := indexer.GetDocsMetasCompressor(3, 3) + _, binaryDocs, binaryMeta, err := proc.ProcessBulk(time.Now(), nil, nil, readNext) + s.Require().NoError(err, "processing bulk failed") - compressor.CompressDocsAndMetas(binaryDocs, binaryMeta) - docsBlock, metasBlock := compressor.DocsMetas() + compressor.CompressDocsAndMetas(binaryDocs, binaryMeta) + docsBlock, metasBlock := compressor.DocsMetas() - wg.Add(1) - err = active.Append(docsBlock, metasBlock, &wg) - s.Require().NoError(err, "append to active failed") + wg.Add(1) + err = a.Append(docsBlock, metasBlock, &wg) + s.Require().NoError(err, "append to active failed") + } } wg.Wait() - return active } -func (s *FractionTestSuite) newSealed(bulks ...[]string) *Sealed { - active := s.newActive(bulks...) +func (s *FractionTestSuite) newSealed(bulks ...[]string) *sealed.Sealed { + a := s.newActive(bulks...) 
- activeSealingSource, err := NewActiveSealingSource(active, s.sealParams) + activeSealingSource, err := active.NewSealingSource(a, s.sealParams) s.Require().NoError(err, "Sealing source creation failed") preloaded, err := sealing.Seal(activeSealingSource, s.sealParams) s.Require().NoError(err, "Sealing failed") - indexCache := &IndexCache{ + indexCache := &sealed.IndexCache{ MIDs: cache.NewCache[[]byte](nil, nil), RIDs: cache.NewCache[[]byte](nil, nil), Params: cache.NewCache[seqids.BlockParams](nil, nil), @@ -1313,21 +1350,19 @@ func (s *FractionTestSuite) newSealed(bulks ...[]string) *Sealed { Registry: cache.NewCache[[]byte](nil, nil), } - sealed := NewSealedPreloaded( - active.BaseFileName, + f := sealed.NewPreloaded( + a.BaseFileName, preloaded, storage.NewReadLimiter(1, nil), indexCache, cache.NewCache[[]byte](nil, nil), s.config, ) - active.Release() - return sealed + a.Release() + return f } -/* -ActiveFractionTestSuite run tests for active fraction -*/ +// ActiveFractionTestSuite run tests for active fraction type ActiveFractionTestSuite struct { FractionTestSuite } @@ -1348,15 +1383,10 @@ func (s *ActiveFractionTestSuite) SetupTest() { } func (s *ActiveFractionTestSuite) TearDownTest() { - if s.fraction != nil { - active, ok := s.fraction.(*Active) - if ok { - active.Release() - } else { - s.Require().Fail("fraction is not of Active type") - } - s.fraction.Suicide() - s.fraction = nil + if a, ok := s.fraction.(*active.Active); ok { + a.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Active type") } s.TearDownTestCommon() @@ -1366,9 +1396,7 @@ func (s *ActiveFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -/* -ActiveReplayedFractionTestSuite run tests for active fraction which was replayed from meta and docs file on disk -*/ +// ActiveReplayedFractionTestSuite run tests for active fraction which was replayed from meta and docs file on disk type ActiveReplayedFractionTestSuite struct { FractionTestSuite } @@ -1380,8 +1408,6 @@ func (s *ActiveReplayedFractionTestSuite) SetupSuite() { func (s *ActiveReplayedFractionTestSuite) SetupTest() { s.SetupTestCommon() // Setting this flags allows to keep meta and docs files on disk after Active.Release() is called - s.config.SkipSortDocs = true - s.config.KeepMetaFile = true s.insertDocuments = func(bulks ...[]string) { if s.fraction != nil { @@ -1391,33 +1417,27 @@ func (s *ActiveReplayedFractionTestSuite) SetupTest() { } } -func (s *ActiveReplayedFractionTestSuite) Replay(frac *Active) Fraction { - fracFileName := frac.BaseFileName - frac.Release() - replayedFrac := NewActive( +func (s *ActiveReplayedFractionTestSuite) Replay(f *active.Active) frac.Fraction { + fracFileName := f.BaseFileName + replayedFrac := active.New( fracFileName, - s.activeIndexer, + s.config, + 4, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), - cache.NewCache[[]byte](nil, nil), - &Config{}) + cache.NewCache[[]byte](nil, nil)) + err := replayedFrac.Replay(context.Background()) s.Require().NoError(err, "replay failed") return replayedFrac } func (s *ActiveReplayedFractionTestSuite) TearDownTest() { - if s.fraction != nil { - active, ok := s.fraction.(*Active) - if ok { - active.Release() - } else { - s.Require().Fail("fraction is not of Active type") - } - s.fraction.Suicide() - s.fraction = nil + if f, ok := s.fraction.(*active.Active); ok { + f.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Active type") } - s.TearDownTestCommon() } @@ -1425,9 +1445,7 @@ func (s 
*ActiveReplayedFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -/* -SealedFractionTestSuite run tests for sealed fraction. Active fraction is created first and then sealed. -*/ +// SealedFractionTestSuite run tests for sealed fraction. Active fraction is created first and then sealed. type SealedFractionTestSuite struct { FractionTestSuite } @@ -1448,9 +1466,10 @@ func (s *SealedFractionTestSuite) SetupTest() { } func (s *SealedFractionTestSuite) TearDownTest() { - if s.fraction != nil { - s.fraction.Suicide() - s.fraction = nil + if f, ok := s.fraction.(*sealed.Sealed); ok { + f.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Sealed type") } s.TearDownTestCommon() } @@ -1459,10 +1478,8 @@ func (s *SealedFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -/* -SealedLoadedFractionTestSuite run tests for sealed fraction. Active fraction is created first and then sealed. -Sealed fraction is then loaded with sealed.NewSealed call -*/ +// SealedLoadedFractionTestSuite run tests for sealed fraction. Active fraction is created first and then sealed. +// Sealed fraction is then loaded with sealed.NewSealed call type SealedLoadedFractionTestSuite struct { FractionTestSuite } @@ -1483,9 +1500,10 @@ func (s *SealedLoadedFractionTestSuite) SetupTest() { } func (s *SealedLoadedFractionTestSuite) TearDownTest() { - if s.fraction != nil { - s.fraction.Suicide() - s.fraction = nil + if f, ok := s.fraction.(*sealed.Sealed); ok { + f.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Sealed type") } s.TearDownTestCommon() } @@ -1494,11 +1512,11 @@ func (s *SealedLoadedFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *Sealed { - sealed := s.newSealed(bulks...) - sealed.close("closed") +func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *sealed.Sealed { + f := s.newSealed(bulks...) + f.Release() - indexCache := &IndexCache{ + indexCache := &sealed.IndexCache{ MIDs: cache.NewCache[[]byte](nil, nil), RIDs: cache.NewCache[[]byte](nil, nil), Params: cache.NewCache[seqids.BlockParams](nil, nil), @@ -1508,21 +1526,19 @@ func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *Seal Registry: cache.NewCache[[]byte](nil, nil), } - sealed = NewSealed( - sealed.BaseFileName, + f = sealed.New( + f.BaseFileName, storage.NewReadLimiter(1, nil), indexCache, cache.NewCache[[]byte](nil, nil), nil, s.config) - s.fraction = sealed - return sealed + s.fraction = f + return f } -/* -RemoteFractionTestSuite runs tests for remote fraction. Fraction is first sealed, then uploaded -to fakes3 backend. -*/ +// RemoteFractionTestSuite runs tests for remote fraction. Fraction is first sealed, then uploaded +// to fakes3 backend. type RemoteFractionTestSuite struct { FractionTestSuite @@ -1548,8 +1564,8 @@ func (s *RemoteFractionTestSuite) SetupTest() { if s.fraction != nil { s.Require().Fail("can insert docs only once") } - sealed := s.newSealed(bulks...) - defer sealed.Suicide() + f := s.newSealed(bulks...) 
+ defer f.Suicide() s3cli, err := s3.NewClient( s.s3server.URL, @@ -1561,11 +1577,11 @@ func (s *RemoteFractionTestSuite) SetupTest() { ) s.Require().NoError(err, "s3 client setup failed") - offloaded, err := sealed.Offload(context.Background(), s3.NewUploader(s3cli)) + offloaded, err := f.Offload(context.Background(), s3.NewUploader(s3cli)) s.Require().NoError(err, "offload failed") s.Require().True(offloaded, "didn't offload frac") - indexCache := &IndexCache{ + indexCache := &sealed.IndexCache{ MIDs: cache.NewCache[[]byte](nil, nil), RIDs: cache.NewCache[[]byte](nil, nil), Params: cache.NewCache[seqids.BlockParams](nil, nil), @@ -1575,13 +1591,13 @@ func (s *RemoteFractionTestSuite) SetupTest() { Registry: cache.NewCache[[]byte](nil, nil), } - remoteFrac := NewRemote( + remoteFrac := sealed.NewRemote( context.Background(), - sealed.BaseFileName, + f.BaseFileName, storage.NewReadLimiter(1, nil), indexCache, cache.NewCache[[]byte](nil, nil), - sealed.info, + f.Info(), s.config, s3cli) s.fraction = remoteFrac @@ -1589,9 +1605,10 @@ func (s *RemoteFractionTestSuite) SetupTest() { } func (s *RemoteFractionTestSuite) TearDownTest() { - if s.fraction != nil { - s.fraction.Suicide() - s.fraction = nil + if remote, ok := s.fraction.(*sealed.Remote); ok { + remote.Suicide() + } else { + s.Require().Nil(s.fraction, "fraction is not of Remote type") } s.TearDownTestCommon() } @@ -1621,3 +1638,124 @@ func TestSealedLoadedFractionTestSuite(t *testing.T) { func TestRemoteFractionTestSuite(t *testing.T) { suite.Run(t, new(RemoteFractionTestSuite)) } + +func TestActiveOldFractionTestSuite(t *testing.T) { + suite.Run(t, new(ActiveOldFractionTestSuite)) +} + +func TestSealedOldFractionTestSuite(t *testing.T) { + suite.Run(t, new(SealedOldFractionTestSuite)) +} + +type ActiveOldFractionTestSuite struct { + FractionTestSuite +} + +func (s *ActiveOldFractionTestSuite) SetupSuite() { + s.SetupSuiteCommon() +} + +func (s *ActiveOldFractionTestSuite) SetupTest() { + s.SetupTestCommon() + + s.insertDocuments = func(bulks ...[]string) { + if s.fraction != nil { + s.Require().Fail("can insert docs only once") + } + s.fraction = s.newActiveOld(bulks...) + } +} + +func (s *ActiveOldFractionTestSuite) newActiveOld(bulks ...[]string) *active_old.Active { + + baseName := filepath.Join(s.tmpDir, "test_fraction") + a := active_old.New( + baseName, + s.activeIndexer, + storage.NewReadLimiter(1, nil), + cache.NewCache[[]byte](nil, nil), + cache.NewCache[[]byte](nil, nil), + s.config, + ) + + s.AppendBulks(a, bulks...) + + return a +} + +func (s *ActiveOldFractionTestSuite) TearDownTest() { + if f, ok := s.fraction.(*active_old.Active); ok { + f.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Active type") + } + + s.TearDownTestCommon() +} + +func (s *ActiveOldFractionTestSuite) TearDownSuite() { + s.TearDownSuiteCommon() +} + +type SealedOldFractionTestSuite struct { + ActiveOldFractionTestSuite +} + +func (s *SealedOldFractionTestSuite) SetupSuite() { + s.SetupSuiteCommon() +} + +func (s *SealedOldFractionTestSuite) SetupTest() { + s.SetupTestCommon() + + s.insertDocuments = func(docs ...[]string) { + if s.fraction != nil { + s.Require().Fail("can insert docs only once") + } + s.fraction = s.newSealedOld(docs...) 
+ } +} + +func (s *SealedOldFractionTestSuite) TearDownTest() { + if f, ok := s.fraction.(*sealed.Sealed); ok { + f.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Sealed type") + } + s.TearDownTestCommon() +} + +func (s *SealedOldFractionTestSuite) TearDownSuite() { + s.TearDownSuiteCommon() +} + +func (s *SealedOldFractionTestSuite) newSealedOld(bulks ...[]string) *sealed.Sealed { + a := s.newActiveOld(bulks...) + + activeSealingSource, err := active_old.NewSealingSource(a, s.sealParams) + s.Require().NoError(err, "Sealing source creation failed") + + preloaded, err := sealing.Seal(activeSealingSource, s.sealParams) + s.Require().NoError(err, "Sealing failed") + + indexCache := &sealed.IndexCache{ + MIDs: cache.NewCache[[]byte](nil, nil), + RIDs: cache.NewCache[[]byte](nil, nil), + Params: cache.NewCache[seqids.BlockParams](nil, nil), + LIDs: cache.NewCache[*lids.Block](nil, nil), + Tokens: cache.NewCache[*token.Block](nil, nil), + TokenTable: cache.NewCache[token.Table](nil, nil), + Registry: cache.NewCache[[]byte](nil, nil), + } + + f := sealed.NewPreloaded( + a.BaseFileName, + preloaded, + storage.NewReadLimiter(1, nil), + indexCache, + cache.NewCache[[]byte](nil, nil), + s.config, + ) + a.Release() + return f +} diff --git a/fracmanager/cache_maintainer.go b/fracmanager/cache_maintainer.go index 5139b4ca..c5aeb3ba 100644 --- a/fracmanager/cache_maintainer.go +++ b/fracmanager/cache_maintainer.go @@ -7,7 +7,7 @@ import ( "go.uber.org/zap" "github.com/ozontech/seq-db/cache" - "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" @@ -141,8 +141,8 @@ func (cm *CacheMaintainer) CreateSortDocsCache() *cache.Cache[[]byte] { return newCache[[]byte](cm, sortName) } -func (cm *CacheMaintainer) CreateIndexCache() *frac.IndexCache { - return &frac.IndexCache{ +func (cm *CacheMaintainer) CreateIndexCache() *sealed.IndexCache { + return &sealed.IndexCache{ MIDs: newCache[[]byte](cm, midsName), RIDs: newCache[[]byte](cm, ridsName), Params: newCache[seqids.BlockParams](cm, paramsName), diff --git a/fracmanager/config.go b/fracmanager/config.go index 30f442b9..d6e22fc1 100644 --- a/fracmanager/config.go +++ b/fracmanager/config.go @@ -7,7 +7,6 @@ import ( "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/util" ) @@ -23,9 +22,10 @@ type Config struct { MaintenanceDelay time.Duration CacheCleanupDelay time.Duration CacheGCDelay time.Duration - SealParams common.SealParams + SealParams frac.SealParams SortCacheSize uint64 // size for docs cache for active fraction Fraction frac.Config + MinSealFracSize uint64 OffloadingEnabled bool OffloadingRetention time.Duration diff --git a/fracmanager/frac_info_cache.go b/fracmanager/frac_info_cache.go index 7356b1bb..46b29296 100644 --- a/fracmanager/frac_info_cache.go +++ b/fracmanager/frac_info_cache.go @@ -10,7 +10,7 @@ import ( "go.uber.org/zap" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/logger" ) @@ -22,7 +22,7 @@ type fracInfoCache struct { fileName string mu sync.RWMutex - cache map[string]*common.Info + cache map[string]*frac.Info version uint64 // if we increment the counter every second it will take 31 billion years (quite enough) saveMu sync.Mutex @@ -31,7 +31,7 @@ type 
fracInfoCache struct { func NewFracInfoCache(filePath string) *fracInfoCache { fc := &fracInfoCache{ - cache: make(map[string]*common.Info), + cache: make(map[string]*frac.Info), mu: sync.RWMutex{}, fullPath: filePath, fileName: filepath.Base(filePath), @@ -74,7 +74,7 @@ func (fc *fracInfoCache) LoadFromDisk(fileName string) { } // Add adds a new entry to the in-memory [sealedFracCache]. -func (fc *fracInfoCache) Add(info *common.Info) { +func (fc *fracInfoCache) Add(info *frac.Info) { name := info.Name() fc.mu.Lock() @@ -96,7 +96,7 @@ func (fc *fracInfoCache) Remove(name string) { // Get returns fraction info and a flag that indicates // whether the data is present in the map. -func (fc *fracInfoCache) Get(name string) (*common.Info, bool) { +func (fc *fracInfoCache) Get(name string) (*frac.Info, bool) { fc.mu.RLock() defer fc.mu.RUnlock() diff --git a/fracmanager/frac_info_cache_test.go b/fracmanager/frac_info_cache_test.go index 126171ad..ee4b2052 100644 --- a/fracmanager/frac_info_cache_test.go +++ b/fracmanager/frac_info_cache_test.go @@ -4,19 +4,13 @@ import ( "encoding/json" "os" "path/filepath" - "sort" - "sync" "testing" - insaneJSON "github.com/ozontech/insane-json" "github.com/stretchr/testify/assert" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" - "github.com/ozontech/seq-db/indexer" - "github.com/ozontech/seq-db/seq" - testscommon "github.com/ozontech/seq-db/tests/common" + "github.com/ozontech/seq-db/tests/common" ) const dummyFracFixture = `{"a":{"name":"a","ver":"1.1","docs_total":1,"docs_on_disk":363,"docs_raw":450,"meta_on_disk":0,"index_on_disk":1284,"const_regular_block_size":16384,"const_ids_per_block":4096,"const_lid_block_cap":65536,"from":1666193255114,"to":1666193255114,"creation_time":1666193044479},"b":{"name":"b","ver":"1.2","docs_total":1,"docs_on_disk":363,"docs_raw":450,"meta_on_disk":0,"index_on_disk":1276,"const_regular_block_size":16384,"const_ids_per_block":4096,"const_lid_block_cap":65536,"from":1666193602304,"to":1666193602304,"creation_time":1666193598979}}` @@ -27,13 +21,13 @@ func loadFracCacheContent(dataDir string) ([]byte, error) { return content, err } -func loadFracCache(dataDir string) (map[string]*common.Info, error) { +func loadFracCache(dataDir string) (map[string]*frac.Info, error) { content, err := loadFracCacheContent(dataDir) if err != nil { return nil, err } - fracCache := make(map[string]*common.Info) + fracCache := make(map[string]*frac.Info) err = json.Unmarshal(content, &fracCache) if err != nil { return nil, err @@ -49,10 +43,10 @@ func writeToFracCache(dataDir, fname, data string) error { } func TestEmpty(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) + dataDir := common.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) + common.RecreateDir(dataDir) + defer common.RemoveDir(dataDir) f := NewFracInfoCache(filepath.Join(dataDir, consts.FracCacheFileSuffix)) err := f.SyncWithDisk() @@ -73,10 +67,10 @@ func TestEmpty(t *testing.T) { } func TestLoadFromDisk(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) + dataDir := common.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) + common.RecreateDir(dataDir) + defer common.RemoveDir(dataDir) err := writeToFracCache(dataDir, consts.FracCacheFileSuffix, dummyFracFixture) assert.NoError(t, err) @@ -103,9 +97,9 @@ func TestLoadFromDisk(t *testing.T) { } func TestRemoveFraction(t *testing.T) { - dataDir := 
testscommon.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) + dataDir := common.GetTestTmpDir(t) + common.RecreateDir(dataDir) + defer common.RemoveDir(dataDir) err := writeToFracCache(dataDir, consts.FracCacheFileSuffix, dummyFracFixture) assert.NoError(t, err) @@ -123,7 +117,7 @@ func TestRemoveFraction(t *testing.T) { assert.NoError(t, err) assert.Equal(t, contents, []byte("{}")) - newInfo := &common.Info{ + newInfo := &frac.Info{ Path: "/data/c", Ver: "1.3", DocsTotal: 0, @@ -144,7 +138,7 @@ func TestRemoveFraction(t *testing.T) { m, err := loadFracCache(dataDir) assert.NoError(t, err) - expected := map[string]*common.Info{"c": newInfo} + expected := map[string]*frac.Info{"c": newInfo} assert.Equal(t, expected, m) f.Remove("c") @@ -157,10 +151,10 @@ func TestRemoveFraction(t *testing.T) { } func TestWriteToDisk(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) + dataDir := common.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) + common.RecreateDir(dataDir) + defer common.RemoveDir(dataDir) err := writeToFracCache(dataDir, consts.FracCacheFileSuffix, dummyFracFixture) assert.NoError(t, err) @@ -168,7 +162,7 @@ func TestWriteToDisk(t *testing.T) { f := NewFracInfoCache(filepath.Join(dataDir, consts.FracCacheFileSuffix)) f.LoadFromDisk(filepath.Join(dataDir, consts.FracCacheFileSuffix)) - newInfo := &common.Info{ + newInfo := &frac.Info{ Path: "/data/c", Ver: "1.3", DocsTotal: 0, @@ -227,15 +221,15 @@ func TestWriteToDisk(t *testing.T) { } func TestUnusedFractionsCleanup(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) + dataDir := common.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) + common.RecreateDir(dataDir) + defer common.RemoveDir(dataDir) err := writeToFracCache(dataDir, consts.FracCacheFileSuffix, dummyFracFixture) assert.NoError(t, err) - expected := map[string]*common.Info{} + expected := map[string]*frac.Info{} cacheFile := filepath.Join(dataDir, consts.FracCacheFileSuffix) diskFracCache := NewFracInfoCacheFromDisk(cacheFile) @@ -258,197 +252,3 @@ func TestUnusedFractionsCleanup(t *testing.T) { assert.NoError(t, err) assert.Equal(t, []byte("{}"), cacheStr) } - -func rotateAndSeal(fm *FracManager) frac.Fraction { - active := fm.rotate() - fm.seal(active) - return active.ref.instance -} - -func TestFracInfoSavedToCache(t *testing.T) { - const maxSize = 10000 - - cfg, fm, stop := setupFracManager(t, &Config{ - FracSize: 100, - TotalSize: maxSize * 2, - }) - defer stop() - - dp := indexer.NewTestDocProvider() - metaRoot := insaneJSON.Spawn() - defer insaneJSON.Release(metaRoot) - - infos := map[string]*common.Info{} - totalSize := uint64(0) - cnt := 1 - for totalSize < maxSize { - addDummyDoc(t, fm, dp, seq.SimpleID(cnt)) - cnt++ - fracInstance := rotateAndSeal(fm) - totalSize += fracInstance.Info().FullSize() - info := fracInstance.Info() - infos[info.Name()] = info - dp.TryReset() - } - - err := fm.fracCache.SyncWithDisk() - assert.NoError(t, err) - - fracCacheFromDisk, err := loadFracCache(cfg.DataDir) - assert.NoError(t, err) - assert.Equal(t, fracCacheFromDisk, fm.fracCache.cache) - assert.Equal(t, fracCacheFromDisk, infos) -} - -type item struct { - value string - size int -} - -type evictingQueue struct { - values []item - size int - maxSize int -} - -func newEvictingQueue(maxSize int) evictingQueue { - return evictingQueue{ - values: []item{}, - maxSize: maxSize, - size: 0, - } -} - -func (q *evictingQueue) Add(v item) { - 
q.values = append(q.values, v) - q.size += v.size - - for q.size > q.maxSize { - q.size -= q.values[0].size - q.values = q.values[1:] - } -} - -func (q *evictingQueue) GetItems() []item { - return q.values -} - -func appendGlob(files []string, dataDir, glob string) []string { - docs, _ := filepath.Glob(filepath.Join(dataDir, glob)) - files = append(files, docs...) - return files -} - -func TestExtraFractionsRemoved(t *testing.T) { - const maxSize = 5500 - const times = 10 - - q := newEvictingQueue(maxSize) - - cfg, fm, stop := setupFracManager(t, &Config{ - FracSize: 100, - TotalSize: maxSize, - }) - - dp := indexer.NewTestDocProvider() - infos := map[string]*common.Info{} - - for i := 1; i < times+1; i++ { - addDummyDoc(t, fm, dp, seq.SimpleID(i)) - fracInstance := rotateAndSeal(fm) - info := fracInstance.Info() - q.Add(item{ - value: info.Name(), - size: int(fracInstance.Info().FullSize()), - }) - infos[info.Name()] = info - dp.TryReset() - } - - expectedFracs := []string{} - for _, itemValue := range q.GetItems() { - expectedFracs = append(expectedFracs, itemValue.value) - } - - sealWG := sync.WaitGroup{} - suicideWG := sync.WaitGroup{} - - fm.maintenance(&sealWG, &suicideWG) // shrinkSizes should be called - sealWG.Wait() - suicideWG.Wait() - - stop() - - fracsOnDisk := []string{} - fracCacheFromDisk, err := loadFracCache(cfg.DataDir) - - assert.NoError(t, err) - for k := range fracCacheFromDisk { - fracsOnDisk = append(fracsOnDisk, k) - } - - sort.Strings(expectedFracs) - sort.Strings(fracsOnDisk) - - assert.Equal(t, expectedFracs, fracsOnDisk) -} - -func TestMissingCacheFilesDeleted(t *testing.T) { - const maxSize = 5500 - const times = 10 - - cfg, fm, stop := setupFracManager(t, &Config{ - FracSize: 100, - TotalSize: maxSize, - }) - - dp := indexer.NewTestDocProvider() - metaRoot := insaneJSON.Spawn() - defer insaneJSON.Release(metaRoot) - - for i := 1; i < times+1; i++ { - addDummyDoc(t, fm, dp, seq.SimpleID(i)) - rotateAndSeal(fm) - dp.TryReset() - } - - // make sure the disk is in sync with the in-memory fraction cache - sealWG := sync.WaitGroup{} - suicideWG := sync.WaitGroup{} - - fm.maintenance(&sealWG, &suicideWG) // shrinkSizes should be called - sealWG.Wait() - suicideWG.Wait() - - stop() - - // remove the fraction files - dataDir := cfg.DataDir - files := []string{} - files = appendGlob(files, dataDir, "*.docs") - files = appendGlob(files, dataDir, "*.sdocs") - files = appendGlob(files, dataDir, "*.index") - files = appendGlob(files, dataDir, "*.meta") - for _, file := range files { - err := os.RemoveAll(file) - assert.NoError(t, err) - } - - // create a new fracmanager that will read the fraction cache file - - _, fm2, stop2 := setupFracManager(t, cfg) - - sealWG2 := sync.WaitGroup{} - suicideWG2 := sync.WaitGroup{} - - fm2.maintenance(&sealWG2, &suicideWG2) // shrinkSizes should be called - sealWG2.Wait() - suicideWG2.Wait() - - stop2() - - // make sure the missing files are removed from the fraction cache - fracCacheFromDisk, err := loadFracCacheContent(dataDir) - assert.NoError(t, err) - assert.Equal(t, fracCacheFromDisk, []byte("{}")) -} diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index fd89750f..c7c44764 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -2,448 +2,140 @@ package fracmanager import ( "context" - "errors" "path/filepath" "sync" "time" - "go.uber.org/atomic" "go.uber.org/zap" "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" + 
"github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" "github.com/ozontech/seq-db/util" ) +// FracManager manages database fractions with lifecycle operations type FracManager struct { - ctx context.Context - config *Config - - cacheMaintainer *CacheMaintainer - - fracCache *fracInfoCache - - fracMu sync.RWMutex - localFracs []*fracRef - remoteFracs []*frac.Remote - active activeRef - - indexer *frac.ActiveIndexer - fracProvider *fractionProvider - - oldestCTLocal atomic.Uint64 - oldestCTRemote atomic.Uint64 - - flags *StateManager - - stopFn func() - statWG sync.WaitGroup - mntcWG sync.WaitGroup - cacheWG sync.WaitGroup - - s3cli *s3.Client -} - -type fracRef struct { - instance frac.Fraction + mu sync.Mutex // todo: get rid of mutex after removing SealForcedForTests method + lc *lifecycleManager } -type activeRef struct { - ref *fracRef // ref contains a back reference to the fraction in the slice - frac *proxyFrac +var defaultStorageState = StorageState{ + CapacityExceeded: false, } -func (fm *FracManager) newActiveRef(active *frac.Active) activeRef { - f := newProxyFrac(active, fm.fracProvider) - return activeRef{ - frac: f, - ref: &fracRef{instance: f}, - } -} - -func New(ctx context.Context, cfg *Config, s3cli *s3.Client) (*FracManager, error) { +// New creates and initializes a new fraction manager +// Starts all background workers: +// - indexer, +// - cache cleaner, +// - fraction rotation +// - stats updating +// +// Returns the manager instance and a stop function to gracefully shutdown +func New(ctx context.Context, cfg *Config, s3cli *s3.Client) (*FracManager, func(), error) { FillConfigWithDefault(cfg) - cacheMaintainer := NewCacheMaintainer(cfg.CacheSize, cfg.SortCacheSize, newDefaultCacheMetrics()) - readLimiter := storage.NewReadLimiter(config.ReaderWorkers, storeBytesRead) - indexer := frac.NewActiveIndexer(config.IndexWorkers, config.IndexWorkers) - indexer.Start() - - flags, err := NewStateManager(cfg.DataDir, StorageState{}) + idx, stopIdx := active_old.NewIndexer(config.IndexWorkers, config.IndexWorkers) + cache := NewCacheMaintainer(cfg.CacheSize, cfg.SortCacheSize, newDefaultCacheMetrics()) + provider := newFractionProvider(cfg, s3cli, cache, readLimiter, idx) + infoCache := NewFracInfoCache(filepath.Join(cfg.DataDir, consts.FracCacheFileSuffix)) + + // Load existing fractions into registry + loader := NewLoader(cfg, provider, infoCache) + registry, err := loader.Load(ctx) if err != nil { - logger.Fatal("state manager initiation error", zap.Error(err)) + return nil, nil, err } - fm := &FracManager{ - config: cfg, - ctx: ctx, - s3cli: s3cli, - flags: flags, - cacheMaintainer: cacheMaintainer, - indexer: indexer, - fracProvider: newFractionProvider(cfg, s3cli, cacheMaintainer, readLimiter, indexer), - fracCache: NewFracInfoCache(filepath.Join(cfg.DataDir, consts.FracCacheFileSuffix)), + // Initialize storage state manager to track capacity status + storageState, err := NewStateManager(cfg.DataDir, defaultStorageState) + if err != nil { + return nil, nil, err } - err = fm.load(ctx) - return fm, err -} + // Create lc manager to handle fraction maintenance + lc := newLifecycleManager(infoCache, provider, storageState, registry) + fm := FracManager{lc: lc} -func (fm *FracManager) maintenance(sealWg, cleanupWg *sync.WaitGroup) { - logger.Debug("maintenance started") + // Start background workers and get stop function + wg := 
sync.WaitGroup{} + ctx, cancel := context.WithCancel(ctx) - n := time.Now() - if fm.Active().Info().DocsOnDisk > fm.config.FracSize { - active := fm.rotate() + startStatsWorker(ctx, registry, &wg) + startMaintWorker(ctx, cfg, &fm, &wg) + startCacheWorker(ctx, cfg, cache, &wg) - sealWg.Add(1) - go func() { - fm.seal(active) - sealWg.Done() - }() - } - - fm.cleanupFractions(cleanupWg) - fm.removeStaleFractions(cleanupWg, fm.config.OffloadingRetention) - fm.updateOldestCT() + stop := func() { + n := time.Now() + logger.Info("start stopping fracmanager's workers") - if err := fm.fracCache.SyncWithDisk(); err != nil { - logger.Error("can't sync frac-cache", zap.Error(err)) - } + cancel() + wg.Wait() - logger.Debug("maintenance finished", zap.Int64("took_ms", time.Since(n).Milliseconds())) -} + // Freeze active fraction to prevent new writes + active := lc.registry.Active() + if err := active.Finalize(); err != nil { + logger.Fatal("shutdown fraction freezing error", zap.Error(err)) + } + active.WaitWriteIdle() -func (fm *FracManager) Oldest() uint64 { - local, remote := fm.oldestCTLocal.Load(), fm.oldestCTRemote.Load() - if local != 0 && remote != 0 { - return min(local, remote) - } - return local -} + // Stop indexer + stopIdx() -func (fm *FracManager) updateOldestCT() { - fm.updateOldestCTFor(fm.getLocalFracs(), &fm.oldestCTLocal, "local") - fm.updateOldestCTFor(fm.getRemoteFracs(), &fm.oldestCTRemote, "remote") -} + // Save info cache + lc.SyncInfoCache() -func (fm *FracManager) updateOldestCTFor( - fracs List, v *atomic.Uint64, storageType string, -) { - oldestByCT := fracs.GetOldestFrac() + // Seal active fraction + sealOnShutdown(active.instance, provider, cfg.MinSealFracSize) - if oldestByCT == nil { - v.Store(0) - return + logger.Info("fracmanager's workers are stopped", zap.Int64("took_ms", time.Since(n).Milliseconds())) } - newOldestCT := oldestByCT.Info().CreationTime - prevOldestCT := v.Swap(newOldestCT) - - if newOldestCT != prevOldestCT { - logger.Info( - "new oldest by creation time", - zap.String("fraction", oldestByCT.Info().Name()), - zap.String("storage_type", storageType), - zap.Time("creation_time", time.UnixMilli(int64(newOldestCT))), - ) - } + return &fm, stop, nil } -func (fm *FracManager) shiftFirstFrac() frac.Fraction { - fm.fracMu.Lock() - defer fm.fracMu.Unlock() - - if len(fm.localFracs) == 0 { - return nil - } - - outsider := fm.localFracs[0].instance - fm.localFracs[0] = nil - fm.localFracs = fm.localFracs[1:] - return outsider +func (fm *FracManager) Fractions() List { + return fm.lc.registry.AllFractions() } -// removeStaleFractions removes [frac.Remote] fractions from external storage. -// Decision is based on the retention period provided by user. -func (fm *FracManager) removeStaleFractions(cleanupWg *sync.WaitGroup, retention time.Duration) { - // User did not provide retention period so keep all remote fractions alive. - // It's safe to do because we do not keep anything locally (but maybe we will eventually run out of inodes). 
- if retention <= 0 { - return - } - - var ( - staleFractions []*frac.Remote - freshFractions []*frac.Remote - ) - - fm.fracMu.Lock() - - for _, f := range fm.remoteFracs { - ct := time.UnixMilli(int64(f.Info().CreationTime)) - if time.Since(ct) < retention { - freshFractions = append(freshFractions, f) - continue - } - staleFractions = append(staleFractions, f) - } - - fm.remoteFracs = freshFractions - - fm.fracMu.Unlock() - - cleanupWg.Add(1) - go func() { - defer cleanupWg.Done() - - for _, f := range staleFractions { - ct := time.UnixMilli(int64(f.Info().CreationTime)) - - logger.Info( - "removing stale remote fraction", - zap.String("fraction", f.Info().Name()), - zap.Time("creation_time", ct), - zap.String("retention", retention.String()), - ) - - fm.fracCache.Remove(f.Info().Name()) - f.Suicide() - } - }() +func (fm *FracManager) Oldest() uint64 { + return fm.lc.registry.OldestTotal() } func (fm *FracManager) Flags() *StateManager { - return fm.flags + return fm.lc.flags } -func (fm *FracManager) determineOutsiders() []frac.Fraction { - var outsiders []frac.Fraction - - localFracs := fm.getLocalFracs() - occupiedSize := localFracs.GetTotalSize() - - var truncated int - for occupiedSize > fm.config.TotalSize { - outsider := fm.shiftFirstFrac() - if outsider == nil { - break - } - - localFracs = localFracs[1:] - outsiders = append(outsiders, outsider) - occupiedSize -= outsider.Info().FullSize() - truncated++ - } - - if len(outsiders) > 0 && !fm.flags.IsCapacityExceeded() { - if err := fm.flags.setCapacityExceeded(true); err != nil { - logger.Fatal("set capacity exceeded error", zap.Error(err)) - } - } - - maintenanceTruncateTotal.Add(float64(truncated)) - return outsiders +// Active returns the currently active fraction +func (fm *FracManager) Active() frac.Fraction { + return fm.lc.registry.Active().proxy } -func (fm *FracManager) cleanupFractions(cleanupWg *sync.WaitGroup) { - outsiders := fm.determineOutsiders() - if len(outsiders) == 0 { - return - } - - for _, outsider := range outsiders { - cleanupWg.Add(1) - go func() { - defer cleanupWg.Done() - - info := outsider.Info() - if !fm.config.OffloadingEnabled { - fm.fracCache.Remove(info.Name()) - outsider.Suicide() - return - } - - offloadStart := time.Now() - remote, err := fm.fracProvider.Offload(fm.ctx, outsider) +// Append writes documents and metadata to the active fraction +// Implements retry logic in case of fraction sealing during write +func (fm *FracManager) Append(ctx context.Context, docs, metas storage.DocBlock) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + // Try to append data to the currently active fraction + err := fm.lc.registry.Active().Append(docs, metas) if err != nil { - offloadingTotal.WithLabelValues("failure").Inc() - offloadingDurationSeconds.Observe(float64(time.Since(offloadStart).Seconds())) - - logger.Error( - "will call Suicide() on fraction: failed to offload fraction", - zap.String("fraction", info.Name()), - zap.Int("retry_count", fm.s3cli.MaxRetryAttempts()), - zap.Error(err), - ) - - fm.fracCache.Remove(info.Name()) - outsider.Suicide() - - return - } - - if remote == nil { - fm.fracCache.Remove(info.Name()) - outsider.Suicide() - return + logger.Info("append fail", zap.Error(err)) + if err == ErrFractionNotWritable { + continue // fraction is currently being sealed, retry the operation + } } - - offloadingTotal.WithLabelValues("success").Inc() - offloadingDurationSeconds.Observe(float64(time.Since(offloadStart).Seconds())) - - logger.Info( - "successully 
offloaded fraction", - zap.String("fraction", info.Name()), - zap.String("took", time.Since(offloadStart).String()), - ) - - fm.fracMu.Lock() - // FIXME(dkharms): We had previously shifted fraction from local fracs list (in [fm.determineOutsiders] via [fm.shiftFirstFrac]) - // and therefore excluded it from search queries. - // But now we return that fraction back (well now it's a [frac.Remote] fraction but it still points to the same data) - // so user can face incosistent search results. - fm.remoteFracs = append(fm.remoteFracs, remote) - fm.fracMu.Unlock() - - outsider.Suicide() - }() - } -} - -type FracType int - -const ( - FracTypeLocal FracType = 1 << iota - FracTypeRemote -) - -// Fractions returns a list of known fracs (local and remote). -// -// While working with this list, it may become irrelevant (factions may, for example, be deleted). -// This is a valid situation, because access to the data of these factions (search and fetch) occurs under blocking (see DataProvider). -// This way we avoid the race. -// -// Accessing the deleted faction data just will return an empty result. -func (fm *FracManager) Fractions() (fracs List) { - return append(fm.getLocalFracs(), fm.getRemoteFracs()...) -} - -func (fm *FracManager) getLocalFracs() List { - fm.fracMu.RLock() - defer fm.fracMu.RUnlock() - - fracs := make(List, 0, len(fm.localFracs)) - for _, f := range fm.localFracs { - fracs = append(fracs, f.instance) - } - - return fracs -} - -func (fm *FracManager) getRemoteFracs() List { - fm.fracMu.RLock() - defer fm.fracMu.RUnlock() - - fracs := make(List, 0, len(fm.remoteFracs)) - for _, f := range fm.remoteFracs { - fracs = append(fracs, f) - } - - return fracs -} - -func (fm *FracManager) processFracsStats() { - type fracStats struct { - docsTotal uint64 - docsRaw uint64 - docsDisk uint64 - index uint64 - totalSize uint64 - count int - } - - calculate := func(fracs List) (st fracStats) { - for _, f := range fracs { - info := f.Info() - st.count += 1 - st.totalSize += info.FullSize() - st.docsTotal += uint64(info.DocsTotal) - st.docsRaw += info.DocsRaw - st.docsDisk += info.DocsOnDisk - st.index += info.IndexOnDisk + info.MetaOnDisk + return err } - return } - - setMetrics := func(st string, oldest uint64, ft fracStats) { - logger.Info("fraction stats", - zap.Int("count", ft.count), - zap.String("storage_type", st), - zap.Uint64("docs_k", ft.docsTotal/1000), - util.ZapUint64AsSizeStr("total_size", ft.totalSize), - util.ZapUint64AsSizeStr("docs_raw", ft.docsRaw), - util.ZapUint64AsSizeStr("docs_comp", ft.docsDisk), - util.ZapUint64AsSizeStr("index", ft.index), - ) - - dataSizeTotal.WithLabelValues("total", st).Set(float64(ft.totalSize)) - dataSizeTotal.WithLabelValues("docs_raw", st).Set(float64(ft.docsRaw)) - dataSizeTotal.WithLabelValues("docs_on_disk", st).Set(float64(ft.docsDisk)) - dataSizeTotal.WithLabelValues("index", st).Set(float64(ft.index)) - - if oldest != 0 { - oldestFracTime.WithLabelValues(st). 
- Set((time.Duration(oldest) * time.Millisecond).Seconds()) - } - } - - setMetrics("local", fm.oldestCTLocal.Load(), calculate(fm.getLocalFracs())) - setMetrics("remote", fm.oldestCTRemote.Load(), calculate(fm.getRemoteFracs())) -} - -func (fm *FracManager) runMaintenanceLoop(ctx context.Context) { - fm.mntcWG.Add(1) - go func() { - defer fm.mntcWG.Done() - - var ( - sealWg sync.WaitGroup - cleanupWg sync.WaitGroup - ) - - util.RunEvery(ctx.Done(), fm.config.MaintenanceDelay, func() { - fm.maintenance(&sealWg, &cleanupWg) - }) - - sealWg.Wait() - cleanupWg.Wait() - }() -} - -func (fm *FracManager) runStatsLoop(ctx context.Context) { - fm.statWG.Add(1) - go func() { - defer fm.statWG.Done() - - util.RunEvery(ctx.Done(), time.Second*10, func() { - fm.processFracsStats() - }) - }() -} - -func (fm *FracManager) Start() { - var ctx context.Context - ctx, fm.stopFn = context.WithCancel(fm.ctx) - - fm.runStatsLoop(ctx) - fm.runMaintenanceLoop(ctx) - startCacheWorker(ctx, fm.config, fm.cacheMaintainer, &fm.cacheWG) } // startCacheWorker starts background cache garbage collection @@ -459,138 +151,71 @@ func startCacheWorker(ctx context.Context, cfg *Config, cache *CacheMaintainer, }() } -func (fm *FracManager) load(ctx context.Context) error { - l := NewLoader(fm.config, fm.fracProvider, fm.fracCache) - - active, locals, remotes, err := l.Load(ctx) - if err != nil { - return err - } - - for _, s := range locals { - fm.localFracs = append(fm.localFracs, &fracRef{instance: s}) - } - - for _, s := range remotes { - fm.remoteFracs = append(fm.remoteFracs, s) - } - - fm.active = fm.newActiveRef(active) - fm.localFracs = append(fm.localFracs, fm.active.ref) - - fm.updateOldestCT() - return nil -} - -func (fm *FracManager) Append(ctx context.Context, docs, metas storage.DocBlock) error { - var err error - for { - select { - case <-ctx.Done(): - return ctx.Err() - default: - if err = fm.Writer().Append(docs, metas); err == nil { - return nil - } - logger.Info("append fail", zap.Error(err)) // can get fail if fraction already sealed - } - } -} - -func (fm *FracManager) seal(activeRef activeRef) { - sealsTotal.Inc() - now := time.Now() - sealed, err := activeRef.frac.Seal() - if err != nil { - if errors.Is(err, ErrSealingFractionSuicided) { - // the faction is suicided, this means that it has already pushed out of the list of factions, - // so we simply skip further actions - return - } - logger.Fatal("sealing error", zap.Error(err)) - } - sealingTime := time.Since(now) - sealsDoneSeconds.Observe(sealingTime.Seconds()) - - logger.Info( - "fraction sealed", - zap.String("fraction", filepath.Dir(sealed.Info().Path)), - zap.Float64("time_spent_s", util.DurationToUnit(sealingTime, "s")), - ) - - info := sealed.Info() - fm.fracCache.Add(info) +// startStatsWorker starts periodic statistics collection and reporting +func startStatsWorker(ctx context.Context, reg *fractionRegistry, wg *sync.WaitGroup) { + wg.Add(1) + go func() { + defer wg.Done() - fm.fracMu.Lock() - activeRef.ref.instance = sealed - fm.fracMu.Unlock() + logger.Info("stats loop is started") + // Run stats collection every 10 seconds + util.RunEvery(ctx.Done(), time.Second*10, func() { + stats := reg.Stats() + stats.Log() // Log statistics + stats.SetMetrics() // Update Prometheus metrics + }) + logger.Info("stats loop is stopped") + }() } -func (fm *FracManager) rotate() activeRef { - next := fm.newActiveRef(fm.fracProvider.CreateActive()) - - fm.fracMu.Lock() - prev := fm.active - fm.active = next - fm.localFracs = append(fm.localFracs, 
fm.active.ref) - fm.fracMu.Unlock() - - logger.Info("new fraction created", zap.String("filepath", next.frac.active.BaseFileName)) - - return prev -} +// startMaintWorker starts periodic fraction maintenance operations +func startMaintWorker(ctx context.Context, cfg *Config, fm *FracManager, wg *sync.WaitGroup) { + wg.Add(1) + go func() { + defer wg.Done() -func (fm *FracManager) minFracSizeToSeal() uint64 { - return fm.config.FracSize * consts.DefaultMinSealPercent / 100 + logger.Info("maintenance loop is started") + // Run maintenance at configured interval + util.RunEvery(ctx.Done(), cfg.MaintenanceDelay, func() { + n := time.Now() + logger.Debug("maintenance iteration started") + fm.mu.Lock() + // Perform fraction maintenance (rotation, truncating, offloading, etc.) + fm.lc.Maintain(ctx, cfg, wg) + fm.mu.Unlock() + logger.Debug("maintenance iteration finished", zap.Int64("took_ms", time.Since(n).Milliseconds())) + }) + logger.Info("waiting for maintenance background tasks to complete") + logger.Info("maintenance loop is stopped") + }() } -func (fm *FracManager) Stop() { - fm.Writer().WaitWriteIdle() - fm.indexer.Stop() - fm.stopFn() - - fm.statWG.Wait() - fm.mntcWG.Wait() - fm.cacheWG.Wait() +// sealOnShutdown seals the active fraction on storage shutdown +func sealOnShutdown(active *active.Active, provider *fractionProvider, minSealSize uint64) { + fracSize := active.Info().FullSize() - if err := fm.fracCache.SyncWithDisk(); err != nil { - logger.Error( - "failed to sync frac-cache on disk", - zap.Error(err), + if minSealSize == 0 { + logger.Info("sealing skipped: sealing on shutdown is disabled", + zap.String("frac", active.BaseFileName), + zap.Uint64("size_mb", uint64(util.SizeToUnit(fracSize, "mb"))), ) + return } - needSealing := false - status := "frac too small to be sealed" - - info := fm.active.frac.Info() - if info.FullSize() > fm.minFracSizeToSeal() { - needSealing = true - status = "need seal active fraction before exit" + if fracSize < minSealSize { + logger.Info("sealing skipped: fraction too small", + zap.String("frac", active.BaseFileName), + zap.Uint64("size_mb", uint64(util.SizeToUnit(fracSize, "mb"))), + ) + return } - logger.Info( - "sealing on exit", - zap.String("status", status), - zap.String("frac", info.Name()), - zap.Uint64("fill_size_mb", uint64(util.SizeToUnit(info.FullSize(), "mb"))), + logger.Info("sealing fraction before shutdown", + zap.String("frac", active.BaseFileName), + zap.Uint64("fill_size_mb", uint64(util.SizeToUnit(fracSize, "mb"))), ) - if needSealing { - fm.seal(fm.active) + if _, err := provider.Seal(active); err != nil { + logger.Error("error sealing on shutdown", zap.Error(err)) } } - -func (fm *FracManager) Writer() *proxyFrac { - fm.fracMu.RLock() - defer fm.fracMu.RUnlock() - - return fm.active.frac -} - -func (fm *FracManager) Active() frac.Fraction { - fm.fracMu.RLock() - defer fm.fracMu.RUnlock() - - return fm.active.frac -} diff --git a/fracmanager/fracmanager_for_tests.go b/fracmanager/fracmanager_for_tests.go index 75d664ec..6b3e1bc5 100644 --- a/fracmanager/fracmanager_for_tests.go +++ b/fracmanager/fracmanager_for_tests.go @@ -1,12 +1,22 @@ package fracmanager +import "sync" + func (fm *FracManager) WaitIdleForTests() { - fm.Writer().WaitWriteIdle() + fm.lc.registry.Active().WaitWriteIdle() } func (fm *FracManager) SealForcedForTests() { - active := fm.rotate() - if active.frac.Info().DocsTotal > 0 { - fm.seal(active) - } + wg := sync.WaitGroup{} + fm.mu.Lock() // todo: get rid of mutex after removing SealForcedForTests method +
fm.lc.Rotate(0, &wg) + fm.mu.Unlock() + + wg.Wait() + fm.lc.waitSealingForTests() // todo: get rid of waitSealingForTests method after removing SealForcedForTests method +} + +// todo: get rid of this after removing fracmanager.SealForcedForTests() +func (lc *lifecycleManager) waitSealingForTests() { + lc.sealingWg.Wait() } diff --git a/fracmanager/fracmanager_test.go b/fracmanager/fracmanager_test.go index c8e7088f..13925cca 100644 --- a/fracmanager/fracmanager_test.go +++ b/fracmanager/fracmanager_test.go @@ -1,25 +1,23 @@ package fracmanager import ( - "context" - "fmt" - "sync" "testing" - "time" + "github.com/alecthomas/units" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/seq" - testscommon "github.com/ozontech/seq-db/tests/common" ) func setupDataDir(t testing.TB, cfg *Config) *Config { if cfg == nil { - cfg = &Config{} + cfg = &Config{ + Fraction: frac.Config{SkipSortDocs: true}, + } } if cfg.DataDir == "" { cfg.DataDir = t.TempDir() @@ -29,186 +27,57 @@ func setupDataDir(t testing.TB, cfg *Config) *Config { func setupFracManager(t testing.TB, cfg *Config) (*Config, *FracManager, func()) { cfg = setupDataDir(t, cfg) - fm, err := New(t.Context(), cfg, nil) + fm, stop, err := New(t.Context(), cfg, nil) assert.NoError(t, err) - fm.Start() - return cfg, fm, fm.Stop + return cfg, fm, stop } -func addDummyDoc(t *testing.T, fm *FracManager, dp *indexer.TestDocProvider, seqID seq.ID) { - doc := []byte("document") - dp.Append(doc, nil, seqID, "service:100500", "k8s_pod", "_all_:") +func appendDocsToFracManager(t testing.TB, fm *FracManager, docCount int) { + dp := indexer.NewTestDocProvider() + for i := 0; i < docCount; i++ { + doc := []byte("{\"timestamp\": 0, \"message\": \"msg\"}") + dp.Append(doc, seq.SimpleID(i), "service:100500", "k8s_pod", "_all_:") + } docs, metas := dp.Provide() - err := fm.Append(context.Background(), docs, metas) + err := fm.Append(t.Context(), docs, metas) assert.NoError(t, err) } -func MakeSomeFractions(t *testing.T, fm *FracManager) { - dp := indexer.NewTestDocProvider() - addDummyDoc(t, fm, dp, seq.SimpleID(1)) - fm.seal(fm.rotate()) - - dp.TryReset() - - addDummyDoc(t, fm, dp, seq.SimpleID(2)) - fm.seal(fm.rotate()) - - dp.TryReset() - addDummyDoc(t, fm, dp, seq.SimpleID(3)) -} - -func TestCleanUp(t *testing.T) { - cfg, fm, stop := setupFracManager(t, &Config{ - FracSize: 1000, - TotalSize: 100000, - }) +func TestSealingOnShutdown(t *testing.T) { + cfg := &Config{ + FracSize: 1 * uint64(units.MiB), // to ensure that the frac will not be sealed on maintenance + TotalSize: 1 * uint64(units.MiB), + Fraction: frac.Config{SkipSortDocs: true}, + } - MakeSomeFractions(t, fm) + // first start + cfg.MinSealFracSize = 0 // to ensure that the frac will not be sealed on shutdown + cfg, fm, stop := setupFracManager(t, cfg) + appendDocsToFracManager(t, fm, 10) + activeName := fm.Fractions()[0].Info().Name() + stop() - first := fm.localFracs[0].instance.(*frac.Sealed) - first.PartialSuicideMode = frac.HalfRename - first.Suicide() + // second start + cfg.MinSealFracSize = 1 // to ensure that the frac will be sealed on shutdown + cfg, fm, stop = setupFracManager(t, cfg) - second := fm.localFracs[1].instance.(*frac.Sealed) - second.PartialSuicideMode = frac.HalfRemove - second.Suicide() - info := fm.active.frac.Info() - 
shouldSealOnExit := info.FullSize() > fm.minFracSizeToSeal() + assert.Equal(t, 1, len(fm.Fractions()), "should have one fraction") + assert.Equal(t, activeName, fm.Fractions()[0].Info().Name(), "fraction should have the same name") + _, ok := fm.Fractions()[0].(*fractionProxy).impl.(*active.Active) + assert.True(t, ok, "fraction should be active") stop() - if shouldSealOnExit && info.DocsTotal > 0 { - t.Error("active fraction should be empty after rotation and sealing") - } - + // third start _, fm, stop = setupFracManager(t, cfg) - defer stop() - - assert.Equal(t, 1, len(fm.localFracs), "wrong frac count") -} - -func TestCapacityExceeded(t *testing.T) { - dataDir := testscommon.GetTestTmpDir(t) - testscommon.RecreateDir(dataDir) - defer testscommon.RemoveDir(dataDir) - - launchAndCheck := func(checkFn func(fm *FracManager)) { - fm, err := New(context.Background(), &Config{ - FracSize: 500, - TotalSize: 5000, - DataDir: dataDir, - }, nil) - assert.NoError(t, err) - - checkFn(fm) - fm.indexer.Stop() - } - - id := 1 - dp := indexer.NewTestDocProvider() - makeSealedFrac := func(fm *FracManager, docsPerFrac int) { - for i := 0; i < docsPerFrac; i++ { - addDummyDoc(t, fm, dp, seq.SimpleID(id)) - id++ - } - fm.seal(fm.rotate()) - dp.TryReset() - } + assert.Equal(t, 2, len(fm.Fractions()), "should have 2 fraction: new active and old sealed") + _, ok = fm.Fractions()[0].(*fractionProxy).impl.(*sealed.Sealed) + assert.True(t, ok, "first fraction should be sealed") + assert.Equal(t, activeName, fm.Fractions()[0].Info().Name(), "sealed fraction should have the same name") + assert.Equal(t, uint32(0), fm.Fractions()[1].Info().DocsTotal, "active fraction should be empty") + _, ok = fm.Fractions()[1].(*fractionProxy).impl.(*active.Active) + assert.True(t, ok, "new fraction should be active") - // first run - launchAndCheck(func(fm *FracManager) { - assert.Equal(t, false, fm.Flags().IsCapacityExceeded(), "expect data dir is empty") - makeSealedFrac(fm, 10) - assert.Equal(t, false, fm.Flags().IsCapacityExceeded(), "there should still be no fraction removal and the flag should be false") - }) - - // second run - launchAndCheck(func(fm *FracManager) { - assert.Equal(t, false, fm.Flags().IsCapacityExceeded(), "there should still be no fraction removal and the flag should be false") - for fm.Fractions().GetTotalSize() < fm.config.TotalSize { - makeSealedFrac(fm, 10) - } - assert.Equal(t, false, fm.Flags().IsCapacityExceeded(), "there should still be no fraction removal and the flag should be false") - sealWG := sync.WaitGroup{} - suicideWG := sync.WaitGroup{} - fm.maintenance(&sealWG, &suicideWG) - assert.Equal(t, true, fm.Flags().IsCapacityExceeded(), "the deletion should occur and the flag should now be true") - }) - - // third run - launchAndCheck(func(fm *FracManager) { - assert.Equal(t, true, fm.Flags().IsCapacityExceeded(), "IsCapacityExceeded must be set to true in the state file") - }) - -} - -func TestOldestCT(t *testing.T) { - const fracCount = 10 - - t.Run("local", func(t *testing.T) { - fm, err := New(context.Background(), &Config{DataDir: t.TempDir()}, nil) - assert.NoError(t, err) - - oldestLocal := time.Now() - nowOldestLocal := oldestLocal - - fm.localFracs = nil - for i := range fracCount { - fm.localFracs = append(fm.localFracs, &fracRef{instance: frac.NewSealed( - "", nil, nil, nil, &common.Info{ - Path: fmt.Sprintf("local-frac-%d", i), - IndexOnDisk: 1, - CreationTime: uint64(nowOldestLocal.UnixMilli()), - }, nil, - )}) - nowOldestLocal = nowOldestLocal.Add(time.Second) - } - - 
fm.updateOldestCT() - - require.Equal(t, uint64(0), fm.oldestCTRemote.Load()) - require.Equal(t, uint64(oldestLocal.UnixMilli()), fm.oldestCTLocal.Load()) - require.Equal(t, uint64(oldestLocal.UnixMilli()), fm.Oldest()) - }) - - t.Run("local-and-remote", func(t *testing.T) { - fm, err := New(context.Background(), &Config{DataDir: t.TempDir()}, nil) - assert.NoError(t, err) - - oldestRemote := time.Now() - nowOldestRemote := oldestRemote - - fm.localFracs = nil - for i := range fracCount { - fm.remoteFracs = append(fm.remoteFracs, frac.NewRemote( - t.Context(), "", nil, nil, nil, &common.Info{ - Path: fmt.Sprintf("remote-frac-%d", i), - IndexOnDisk: 1, - CreationTime: uint64(nowOldestRemote.UnixMilli()), - }, nil, nil, - )) - nowOldestRemote = nowOldestRemote.Add(time.Second) - } - - oldestLocal := nowOldestRemote - nowOldestLocal := oldestLocal - - for i := range fracCount { - fm.localFracs = append(fm.localFracs, &fracRef{instance: frac.NewSealed( - "", nil, nil, nil, &common.Info{ - Path: fmt.Sprintf("local-frac-%d", i), - IndexOnDisk: 1, - CreationTime: uint64(nowOldestLocal.UnixMilli()), - }, nil, - )}) - nowOldestLocal = nowOldestLocal.Add(time.Second) - } - - fm.updateOldestCT() - - require.Equal(t, uint64(oldestRemote.UnixMilli()), fm.oldestCTRemote.Load()) - require.Equal(t, uint64(oldestLocal.UnixMilli()), fm.oldestCTLocal.Load()) - require.Equal(t, uint64(oldestRemote.UnixMilli()), fm.Oldest()) - }) + stop() } diff --git a/fracmanager/fracs_stats.go b/fracmanager/fracs_stats.go new file mode 100644 index 00000000..1e9d18d8 --- /dev/null +++ b/fracmanager/fracs_stats.go @@ -0,0 +1,85 @@ +package fracmanager + +import ( + "github.com/prometheus/client_golang/prometheus" + "go.uber.org/zap" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/util" +) + +// fracsStats contains statistical information about a group of fractions +// Used to track aggregate metrics for fractions in different states +type fracsStats struct { + count int // Number of fractions in the group + docsCount uint64 // Total number of documents across all fractions + docsSizeRaw uint64 // Total raw size of documents before compression + docsSizeOnDisk uint64 // Total size of documents on disk after compression + indexSizeOnDisk uint64 // Total size of index and metadata on disk + totalSizeOnDisk uint64 // Total storage size, including documents, index and metadata +} + +// Add incorporates fraction information into the statistics +// Updates all aggregate metrics with the values from the provided fraction info +func (s *fracsStats) Add(info *frac.Info) { + s.count++ + s.docsCount += uint64(info.DocsTotal) + s.docsSizeRaw += info.DocsRaw + s.docsSizeOnDisk += info.DocsOnDisk + s.indexSizeOnDisk += info.IndexOnDisk + info.MetaOnDisk + s.totalSizeOnDisk += info.FullSize() +} + +// Sub removes fraction information from the statistics +// Decrements all aggregate metrics with the values from the provided fraction info +func (s *fracsStats) Sub(info *frac.Info) { + s.count-- + s.docsCount -= uint64(info.DocsTotal) + s.docsSizeRaw -= info.DocsRaw + s.docsSizeOnDisk -= info.DocsOnDisk + s.indexSizeOnDisk -= info.IndexOnDisk + info.MetaOnDisk + s.totalSizeOnDisk -= info.FullSize() +} + +func (s *fracsStats) Log(stage string) { + logger.Info("fraction stats", + zap.Int("count", s.count), + zap.String("stage", stage), + zap.Uint64("docs_k", s.docsCount/1000), + util.ZapUint64AsSizeStr("total_size", s.totalSizeOnDisk), + util.ZapUint64AsSizeStr("docs_raw", s.docsSizeRaw), 
+ util.ZapUint64AsSizeStr("docs_comp", s.docsSizeOnDisk), + util.ZapUint64AsSizeStr("index", s.indexSizeOnDisk), + ) +} + +func (s *fracsStats) SetMetrics(metric *prometheus.GaugeVec, stage string) { + metric.WithLabelValues("total", stage).Set(float64(s.totalSizeOnDisk)) + metric.WithLabelValues("docs_raw", stage).Set(float64(s.docsSizeRaw)) + metric.WithLabelValues("docs_on_disk", stage).Set(float64(s.docsSizeOnDisk)) + metric.WithLabelValues("index", stage).Set(float64(s.indexSizeOnDisk)) +} + +// registryStats contains statistical data for all fraction queues +// Used for monitoring and memory management decisions +type registryStats struct { + sealing fracsStats // Statistics for fractions in the sealing process + locals fracsStats // Statistics for fractions on local disk + offloading fracsStats // Statistics for fractions in the offloading process + remotes fracsStats // Statistics for fractions in remote storage +} + +func (s *registryStats) Log() { + s.sealing.Log("sealing") + s.locals.Log("locals") + s.offloading.Log("offloading") + s.remotes.Log("remotes") +} + +func (s *registryStats) SetMetrics() { + s.sealing.SetMetrics(dataSizeTotal, "sealing") + s.locals.SetMetrics(dataSizeTotal, "locals") + s.offloading.SetMetrics(dataSizeTotal, "offloading") + s.remotes.SetMetrics(dataSizeTotal, "remotes") +} diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index 6343b3a4..c68a1c8e 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -9,8 +9,11 @@ import ( "github.com/oklog/ulid/v2" + "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/active" + active1 "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/storage" @@ -25,14 +28,14 @@ type fractionProvider struct { s3cli *s3.Client // Client for S3 storage operations config *Config // Fraction manager configuration cacheProvider *CacheMaintainer // Cache provider for data access optimization - activeIndexer *frac.ActiveIndexer // Indexer for active fractions + activeIndexer *active_old.Indexer // Indexer for active fractions readLimiter *storage.ReadLimiter // Read rate limiter ulidEntropy io.Reader // Entropy source for ULID generation } func newFractionProvider( cfg *Config, s3cli *s3.Client, cp *CacheMaintainer, - readLimiter *storage.ReadLimiter, indexer *frac.ActiveIndexer, + readLimiter *storage.ReadLimiter, indexer *active_old.Indexer, ) *fractionProvider { return &fractionProvider{ s3cli: s3cli, @@ -44,8 +47,19 @@ func newFractionProvider( } } -func (fp *fractionProvider) NewActive(name string) *frac.Active { - return frac.NewActive( +func (fp *fractionProvider) NewActive(name string) *active.Active { + return active.New( + name, + &fp.config.Fraction, + config.NumCPU, + fp.readLimiter, + fp.cacheProvider.CreateDocBlockCache(), + fp.cacheProvider.CreateSortDocsCache(), + ) +} + +func (fp *fractionProvider) NewActiveOld(name string) *active_old.Active { + return active_old.New( name, fp.activeIndexer, fp.readLimiter, @@ -55,8 +69,8 @@ func (fp *fractionProvider) NewActive(name string) *frac.Active { ) } -func (fp *fractionProvider) NewSealed(name string, cachedInfo *common.Info) *frac.Sealed { - return frac.NewSealed( +func (fp *fractionProvider) NewSealed(name string, cachedInfo *frac.Info) *sealed.Sealed { + return 
sealed.New( name, fp.readLimiter, fp.cacheProvider.CreateIndexCache(), @@ -66,8 +80,8 @@ func (fp *fractionProvider) NewSealed(name string, cachedInfo *common.Info) *fra ) } -func (fp *fractionProvider) NewSealedPreloaded(name string, preloadedData *sealed.PreloadedData) *frac.Sealed { - return frac.NewSealedPreloaded( +func (fp *fractionProvider) NewSealedPreloaded(name string, preloadedData *sealed.PreloadedData) *sealed.Sealed { + return sealed.NewPreloaded( name, preloadedData, // Data already loaded into memory fp.readLimiter, @@ -77,8 +91,8 @@ func (fp *fractionProvider) NewSealedPreloaded(name string, preloadedData *seale ) } -func (fp *fractionProvider) NewRemote(ctx context.Context, name string, cachedInfo *common.Info) *frac.Remote { - return frac.NewRemote( +func (fp *fractionProvider) NewRemote(ctx context.Context, name string, cachedInfo *frac.Info) *sealed.Remote { + return sealed.NewRemote( ctx, name, fp.readLimiter, @@ -99,7 +113,15 @@ func (fp *fractionProvider) nextFractionID() string { // CreateActive creates a new active fraction with auto-generated filename // Filename pattern: base_pattern + ULID -func (fp *fractionProvider) CreateActive() *frac.Active { +func (fp *fractionProvider) CreateActive() *active.Active { + filePath := fileBasePattern + fp.nextFractionID() + baseFilePath := filepath.Join(fp.config.DataDir, filePath) + return fp.NewActive(baseFilePath) +} + +// CreateActive creates a new active fraction with auto-generated filename +// Filename pattern: base_pattern + ULID +func (fp *fractionProvider) CreateActive2() *active1.Active { filePath := fileBasePattern + fp.nextFractionID() baseFilePath := filepath.Join(fp.config.DataDir, filePath) return fp.NewActive(baseFilePath) @@ -107,8 +129,8 @@ func (fp *fractionProvider) CreateActive() *frac.Active { // Seal converts an active fraction to a sealed one // Process includes sorting, indexing, and data optimization for reading -func (fp *fractionProvider) Seal(active *frac.Active) (*frac.Sealed, error) { - src, err := frac.NewActiveSealingSource(active, fp.config.SealParams) +func (fp *fractionProvider) Seal1(a *active.Active) (*sealed.Sealed, error) { + src, err := active.NewSealingSource(a, fp.config.SealParams) if err != nil { return nil, err } @@ -117,12 +139,27 @@ func (fp *fractionProvider) Seal(active *frac.Active) (*frac.Sealed, error) { return nil, err } - return fp.NewSealedPreloaded(active.BaseFileName, preloaded), nil + return fp.NewSealedPreloaded(a.BaseFileName, preloaded), nil +} + +// Seal converts an active fraction to a sealed one +// Process includes sorting, indexing, and data optimization for reading +func (fp *fractionProvider) Seal(a *active.Active) (*sealed.Sealed, error) { + src, err := active.NewSealingSource(a, fp.config.SealParams) + if err != nil { + return nil, err + } + _, err = sealing.Seal(src, fp.config.SealParams) + if err != nil { + return nil, err + } + + return fp.NewSealed(a.BaseFileName, nil), nil } // Offload uploads fraction to S3 storage and returns a remote fraction // IMPORTANT: context controls timeouts and operation cancellation -func (fp *fractionProvider) Offload(ctx context.Context, f frac.Fraction) (*frac.Remote, error) { +func (fp *fractionProvider) Offload(ctx context.Context, f *sealed.Sealed) (*sealed.Remote, error) { mustBeOffloaded, err := f.Offload(ctx, s3.NewUploader(fp.s3cli)) if err != nil { return nil, err diff --git a/fracmanager/fraction_provider_test.go b/fracmanager/fraction_provider_test.go index d5769bdf..9e175178 100644 --- 
a/fracmanager/fraction_provider_test.go +++ b/fracmanager/fraction_provider_test.go @@ -1,23 +1,48 @@ package fracmanager import ( + "fmt" + "math/rand" + "net/http/httptest" "testing" + "time" "github.com/alecthomas/units" + "github.com/johannesboyne/gofakes3" + "github.com/johannesboyne/gofakes3/backend/s3mem" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" - "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active_old" "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/storage/s3" ) +func setupS3Client(t testing.TB) (*s3.Client, func()) { + s3Backend := s3mem.New() + s3server := httptest.NewServer(gofakes3.New(s3Backend).Server()) + + bucketName := fmt.Sprintf("bucket_%s_%d_%d", t.Name(), time.Now().UnixMilli(), rand.Int()) + err := s3Backend.CreateBucket(bucketName) + require.NoError(t, err, "create bucket failed") + + s3cli, err := s3.NewClient(s3server.URL, "ACCESS_KEY", "SECRET_KEY", "eu-west-3", bucketName, 3) + require.NoError(t, err, "s3 client setup failed") + + return s3cli, s3server.Close +} + func setupFractionProvider(t testing.TB, cfg *Config) (*fractionProvider, func()) { cfg = setupDataDir(t, cfg) rl := storage.NewReadLimiter(1, nil) - idx := frac.NewActiveIndexer(1, 1) - idx.Start() + s3cli, stopS3 := setupS3Client(t) + idx, stopIdx := active_old.NewIndexer(1, 1) cache := NewCacheMaintainer(uint64(units.MB), uint64(units.MB), nil) - provider := newFractionProvider(cfg, nil, cache, rl, idx) - return provider, idx.Stop + provider := newFractionProvider(cfg, s3cli, cache, rl, idx) + return provider, func() { + stopIdx() + stopS3() + } } func TestFractionID(t *testing.T) { diff --git a/fracmanager/fraction_registry.go b/fracmanager/fraction_registry.go new file mode 100644 index 00000000..7d36ec0a --- /dev/null +++ b/fracmanager/fraction_registry.go @@ -0,0 +1,387 @@ +package fracmanager + +import ( + "errors" + "fmt" + "sync" + "time" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/sealed" +) + +// fractionRegistry manages fraction queues at different lifecycle stages. +// Tracks fractions through different stages: active → sealing → local → offloading → remote +// Ensures correct state transitions while maintaining chronological order. +// The entire structure is thread-safe due to internal synchronization. +// Lifecycle: Created once, persists through application lifetime. +type fractionRegistry struct { + mu sync.RWMutex // Main mutex for protecting registry state + + // Lifecycle queues (FIFO order, oldest at lower indexes) + sealing []*activeProxy // Fractions being sealed (0-5 typical) + locals []*sealedProxy // Local sealed fractions (can be thousands) + offloading []*sealedProxy // Fractions being offloaded (0-5 typical) + remotes []*remoteProxy // Offloaded fractions (can be thousands) + + stats registryStats // Size statistics for monitoring + oldestTotal uint64 // Creation time of oldest fraction + oldestLocal uint64 // Creation time of oldest fraction + + muAll sync.RWMutex // Mutex specifically for all fractions list + active *activeProxy // Currently active writable fraction + all []frac.Fraction // All fractions in creation order (read-only view) +} + +// NewFractionRegistry creates and initializes a new fraction registry instance. +// Populates the registry with existing active, local and remote fractions. +// Rebuilds the complete fractions list in chronological order. 
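+// Returns an error if the active fraction is nil.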
+func NewFractionRegistry(active *active.Active, locals []*sealed.Sealed, remotes []*sealed.Remote) (*fractionRegistry, error) { + if active == nil { + return nil, errors.New("active fraction must be specified") + } + + // Set current active fraction + r := fractionRegistry{ + active: &activeProxy{ + proxy: &fractionProxy{impl: active}, + instance: active, + }, + } + + // Initialize local sealed fractions + for _, sealed := range locals { + r.stats.locals.Add(sealed.Info()) + r.locals = append(r.locals, &sealedProxy{ + proxy: &fractionProxy{impl: sealed}, + instance: sealed, + }) + } + + // Initialize remote fractions + for _, remote := range remotes { + r.stats.remotes.Add(remote.Info()) + r.remotes = append(r.remotes, &remoteProxy{ + proxy: &fractionProxy{impl: remote}, + instance: remote, + }) + } + + // Init oldest local value + r.updateOldestLocal() + + // Rebuild complete fractions list in order + r.rebuildAllFractions() + + return &r, nil +} + +// Active returns the currently active writable fraction. +func (r *fractionRegistry) Active() *activeProxy { + r.muAll.RLock() + defer r.muAll.RUnlock() + return r.active +} + +// AllFractions returns a read-only view of all fractions in creation order. +func (r *fractionRegistry) AllFractions() []frac.Fraction { + r.muAll.RLock() + defer r.muAll.RUnlock() + return r.all +} + +// Stats returns current size statistics of the registry. +func (r *fractionRegistry) Stats() registryStats { + r.mu.RLock() + defer r.mu.RUnlock() + return r.stats +} + +// OldestTotal returns the creation time of the oldest fraction in the registry. +func (r *fractionRegistry) OldestTotal() uint64 { + r.muAll.RLock() + defer r.muAll.RUnlock() + return r.oldestTotal +} + +// OldestLocal returns the creation time of the oldest local fraction in the registry. +func (r *fractionRegistry) OldestLocal() uint64 { + r.mu.RLock() + defer r.mu.RUnlock() + return r.oldestLocal +} + +// RotateIfFull completes the current active fraction and starts a new one. +// Moves previous active fraction to sealing queue. +// Updates statistics and maintains chronological order. +// Should be called when creating a new fraction. +func (r *fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *activeProxy) (*activeProxy, func(), error) { + r.mu.Lock() + defer r.mu.Unlock() + + if r.active.instance.Info().DocsOnDisk <= maxSize { + return nil, nil, nil + } + + old := r.active + r.sealing = append(r.sealing, old) + r.addActive(newActive()) + + if err := old.Finalize(); err != nil { + return old, nil, err + } + + curInfo := old.instance.Info() + r.stats.sealing.Add(curInfo) + + wg := sync.WaitGroup{} + wg.Add(1) + // since old.WaitWriteIdle() can take some time, we don't want to do it under the lock + // we will do it asynchronously in a goroutine. + go func() { + defer wg.Done() + + old.WaitWriteIdle() // can be long enough + finalInfo := old.instance.Info() + + r.mu.Lock() + defer r.mu.Unlock() + + // curInfo and finalInfo differ because while we are waiting for old.WaitWriteIdle(), + // the latest data is being written to the active fraction index. + r.stats.sealing.Sub(curInfo) + r.stats.sealing.Add(finalInfo) + }() + + return old, wg.Wait, nil +} + +// addActive sets a new active fraction and updates the complete fractions list. +func (r *fractionRegistry) addActive(a *activeProxy) { + r.muAll.Lock() + defer r.muAll.Unlock() + + r.active = a + r.all = append(r.all, a.proxy) +} + +// trimAll removes the oldest fractions from the complete fractions list. 
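+// The count argument is the number of fractions to drop from the head of the list.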
+// Used when fractions are evicted or deleted from the system. +func (r *fractionRegistry) trimAll(count int) { + r.muAll.Lock() + defer r.muAll.Unlock() + + r.all = r.all[count:] + r.updateOldestTotal() +} + +// EvictLocal removes oldest local fractions to free disk space. +// If shouldOffload is true, moves fractions to offloading queue instead of deleting. +// Returns evicted fractions or error if insufficient space is released. +func (r *fractionRegistry) EvictLocal(shouldOffload bool, sizeLimit uint64) ([]*sealedProxy, error) { + r.mu.Lock() + defer r.mu.Unlock() + + var ( + count int + releasingSize uint64 + ) + + // Calculate total used disk space + totalUsedSize := r.stats.locals.totalSizeOnDisk + + r.stats.sealing.totalSizeOnDisk + + r.active.instance.Info().FullSize() + + // Determine how many oldest fractions need to be removed to meet size limit + for _, item := range r.locals { + if totalUsedSize-releasingSize <= sizeLimit { + break + } + info := item.instance.Info() + releasingSize += info.FullSize() + r.stats.locals.Sub(info) + count++ + } + + // Check if enough space will be freed + if totalUsedSize-releasingSize > sizeLimit { + return nil, fmt.Errorf("insufficient space released: need to free %d more bytes "+ + "(total: %d, releasing: %d, limit: %d)", + (totalUsedSize-releasingSize)-sizeLimit, totalUsedSize, releasingSize, sizeLimit) + } + + // Extract fractions to evict + evicted := r.locals[:count] + r.locals = r.locals[count:] + + // Either offload or completely remove the fractions + if shouldOffload { + for _, item := range evicted { + r.offloading = append(r.offloading, item) + r.stats.offloading.Add(item.instance.Info()) + } + } else { + r.trimAll(count) // Permanently remove + r.updateOldestLocal() // Oldest local can be changed here + } + + return evicted, nil +} + +// EvictRemote removes oldest remote fractions based on retention policy. +// Fractions older than retention period are permanently deleted. +// Returns removed fractions or empty slice if nothing to remove. +func (r *fractionRegistry) EvictRemote(retention time.Duration) []*remoteProxy { + r.mu.Lock() + defer r.mu.Unlock() + + count := 0 + // Find fractions older than retention period + for _, item := range r.remotes { + info := item.instance.Info() + if time.Since(time.UnixMilli(int64(info.CreationTime))) <= retention { + break // Stop at first fraction within retention + } + r.stats.remotes.Sub(info) + count++ + } + + evicted := r.remotes[:count] + r.remotes = r.remotes[count:] + r.trimAll(count) // Remove from complete list + + return evicted +} + +// PromoteToLocal moves fractions from sealing to local queue when sealing completes. +// Maintains strict ordering - younger fractions wait for older ones to seal first. +func (r *fractionRegistry) PromoteToLocal(active *activeProxy, sealed *sealed.Sealed) { + r.mu.Lock() + defer r.mu.Unlock() + + active.sealed = sealed + + promotedCount := 0 + // Process sealing queue in order, promoting completed fractions + for _, item := range r.sealing { + if item.sealed == nil { + break // Maintain order - wait for previous fractions to complete + } + promotedCount++ + r.locals = append(r.locals, &sealedProxy{ + proxy: item.proxy, + instance: item.sealed, + }) + r.stats.locals.Add(item.sealed.Info()) + r.stats.sealing.Sub(item.instance.Info()) + } + + // Remove promoted fractions from sealing queue + r.sealing = r.sealing[promotedCount:] +} + +// PromoteToRemote moves fractions from offloading to remote queue when offloading completes. 
+// Special case: Handles fractions that don't require offloading (remote == nil). +// Maintains strict ordering - younger fractions wait for older ones to offload. +func (r *fractionRegistry) PromoteToRemote(sealed *sealedProxy, remote *sealed.Remote) { + r.mu.Lock() + defer r.mu.Unlock() + + sealed.remote = remote + + // Special case: remote == nil means fraction doesn't require offloading + if remote == nil { + r.removeFromOffloading(sealed) + } + + promotedCount := 0 + // Process offloading queue in order, promoting completed fractions + for _, item := range r.offloading { + if item.remote == nil { + break // Maintain order - wait for previous fractions to complete + } + promotedCount++ + r.remotes = append(r.remotes, &remoteProxy{ + proxy: item.proxy, + instance: item.remote, + }) + + r.stats.remotes.Add(item.remote.Info()) + r.stats.offloading.Sub(item.instance.Info()) + } + if promotedCount > 0 { + // Remove promoted fractions from offloading queue + r.offloading = r.offloading[promotedCount:] + r.updateOldestLocal() + } +} + +// removeFromOffloading removes a specific fraction from offloading queue. +// O(n) operation that rebuilds the all fractions list. +func (r *fractionRegistry) removeFromOffloading(sealed *sealedProxy) { + count := 0 + // Filter out the target fraction + for _, item := range r.offloading { + if sealed != item { + r.offloading[count] = item + count++ + } + } + r.offloading = r.offloading[:count] + r.stats.offloading.Sub(sealed.instance.Info()) + + // Oldest local can be changed here + r.updateOldestLocal() + + // Rebuild complete list since we modified the middle of the queue + r.rebuildAllFractions() +} + +// rebuildAllFractions reconstructs the all fractions list in correct chronological order. +// Order: remote (oldest) → offloading → local → sealing → active (newest) +// Expensive O(n) operation used when direct list modification is insufficient. +func (r *fractionRegistry) rebuildAllFractions() { + all := make([]frac.Fraction, 0, len(r.all)) + + // Collect fractions in correct chronological order: from oldest (remote) to newest (active) + for _, remote := range r.remotes { + all = append(all, remote.proxy) + } + for _, offloaded := range r.offloading { + all = append(all, offloaded.proxy) + } + for _, sealed := range r.locals { + all = append(all, sealed.proxy) + } + for _, active := range r.sealing { + all = append(all, active.proxy) + } + all = append(all, r.active.proxy) + + r.muAll.Lock() + defer r.muAll.Unlock() + + r.all = all + r.updateOldestTotal() +} + +// updateOldestTotal recalculates the creation time of the oldest fraction. +// Called after modifications of the complete fractions list. +func (r *fractionRegistry) updateOldestTotal() { + r.oldestTotal = r.all[0].Info().CreationTime +} + +// updateOldestLocal recalculates the creation time of the oldest local fraction. +// Called after modifications of the local fractions list. 
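+// Precedence mirrors local data age: offloading entries are the oldest still-local fractions, then locals, then sealing, and finally the active fraction.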
+func (r *fractionRegistry) updateOldestLocal() { + if len(r.offloading) > 0 { + r.oldestLocal = r.offloading[0].proxy.Info().CreationTime + } else if len(r.locals) > 0 { + r.oldestLocal = r.locals[0].proxy.Info().CreationTime + } else if len(r.sealing) > 0 { + r.oldestLocal = r.sealing[0].proxy.Info().CreationTime + } else { + r.oldestLocal = r.active.proxy.Info().CreationTime + } +} diff --git a/fracmanager/lifecycle_manager.go b/fracmanager/lifecycle_manager.go new file mode 100644 index 00000000..345c5a83 --- /dev/null +++ b/fracmanager/lifecycle_manager.go @@ -0,0 +1,209 @@ +package fracmanager + +import ( + "context" + "path/filepath" + "sync" + "time" + + "go.uber.org/zap" + + "github.com/ozontech/seq-db/frac/sealed" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/util" +) + +// lifecycleManager manages the complete lifecycle of fractions. +// Handles rotation, sealing, offloading, and cleanup operations. +// Lifecycle: Created once, coordinates all fraction state transitions. +type lifecycleManager struct { + infoCache *fracInfoCache // Fraction metadata cache + provider *fractionProvider // Provider for fraction operations + flags *StateManager // Storage state flags + registry *fractionRegistry // Fraction state registry + + sealingWg sync.WaitGroup +} + +func newLifecycleManager( + infoCache *fracInfoCache, + provider *fractionProvider, + flags *StateManager, + registry *fractionRegistry, +) *lifecycleManager { + return &lifecycleManager{ + infoCache: infoCache, + provider: provider, + flags: flags, + registry: registry, + } +} + +// Maintain performs periodic lifecycle management tasks. +// It is a CORE method of lifecycleManager +// Coordinates rotation, offloading, cleanup based on configuration. +func (lc *lifecycleManager) Maintain(ctx context.Context, config *Config, wg *sync.WaitGroup) { + lc.Rotate(config.FracSize, wg) + if config.OffloadingEnabled { + lc.OffloadLocal(ctx, config.TotalSize, wg) + lc.CleanRemote(config.OffloadingRetention, wg) + } else { + lc.CleanLocal(config.TotalSize, wg) + } + lc.UpdateOldestMetric() + lc.SyncInfoCache() +} + +func (lc *lifecycleManager) SyncInfoCache() { + if err := lc.infoCache.SyncWithDisk(); err != nil { + logger.Error("can't sync info-cache", zap.Error(err)) + } +} + +// Seal converts an active fraction to sealed state +// Freezes writes, waits for pending operations, then seals the fraction. +func (lc *lifecycleManager) Seal(active *activeProxy) error { + now := time.Now() + sealed, err := lc.provider.Seal(active.instance) + if err != nil { + return err + } + sealsTotal.Inc() + sealingTime := time.Since(now) + sealsDoneSeconds.Observe(sealingTime.Seconds()) + + logger.Info( + "fraction sealed", + zap.String("fraction", filepath.Base(sealed.BaseFileName)), + zap.Float64("time_spent_s", util.DurationToUnit(sealingTime, "s")), + ) + + lc.infoCache.Add(sealed.Info()) + lc.registry.PromoteToLocal(active, sealed) + active.proxy.Redirect(sealed) + active.instance.Release() + return nil +} + +// Rotate checks if active fraction needs rotation based on size limit +// Creates new active fraction and starts sealing the previous one. 
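+// Sealing runs in a background goroutine: it first waits for in-flight writes of the rotated fraction to finish, so the maintenance loop itself is never blocked.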
+func (lc *lifecycleManager) Rotate(maxSize uint64, wg *sync.WaitGroup) { + activeToSeal, waitBeforeSealing, err := lc.registry.RotateIfFull(maxSize, func() *activeProxy { + return newActiveProxy(lc.provider.CreateActive()) + }) + if err != nil { + logger.Fatal("active fraction rotation error", zap.Error(err)) + } + if activeToSeal == nil { + return + } + + wg.Add(1) + lc.sealingWg.Add(1) + go func() { + defer wg.Done() + defer lc.sealingWg.Done() + + waitBeforeSealing() + if err := lc.Seal(activeToSeal); err != nil { + logger.Fatal("sealing error", zap.Error(err)) + } + }() +} + +// OffloadLocal starts offloading of local fractions to remote storage +// Selects fractions based on disk space usage and retention policy. +func (lc *lifecycleManager) OffloadLocal(ctx context.Context, sizeLimit uint64, wg *sync.WaitGroup) { + toOffload, err := lc.registry.EvictLocal(true, sizeLimit) + if err != nil { + logger.Fatal("error releasing old fractions:", zap.Error(err)) + } + for _, sealed := range toOffload { + wg.Add(1) + go func() { + defer wg.Done() + + remote, _ := lc.TryOffload(ctx, sealed.instance) + lc.registry.PromoteToRemote(sealed, remote) + + if remote == nil { + sealed.proxy.Redirect(emptyFraction{}) + lc.infoCache.Remove(sealed.instance.Info().Name()) + } else { + sealed.proxy.Redirect(remote) + } + + // Free up local resources + sealed.instance.Suicide() + maintenanceTruncateTotal.Add(1) + }() + } +} + +// TryOffload performs a single offload attempt and records metrics +// Measures offloading duration and tracks success/failure statistics. +func (lc *lifecycleManager) TryOffload(ctx context.Context, sealed *sealed.Sealed) (*sealed.Remote, error) { + now := time.Now() + remote, err := lc.provider.Offload(ctx, sealed) + offloadingDuration := time.Since(now).Seconds() + + if err != nil { + offloadingTotal.WithLabelValues("failure").Inc() + offloadingDurationSeconds.Observe(float64(offloadingDuration)) + return nil, err + } + + if remote != nil { + offloadingTotal.WithLabelValues("success").Inc() + offloadingDurationSeconds.Observe(float64(offloadingDuration)) + } + + return remote, nil +} + +// CleanRemote deletes outdated remote fractions based on retention policy +func (lc *lifecycleManager) CleanRemote(retention time.Duration, wg *sync.WaitGroup) { + if retention == 0 { + return + } + toDelete := lc.registry.EvictRemote(retention) + wg.Add(1) + go func() { + defer wg.Done() + for _, remote := range toDelete { + remote.proxy.Redirect(emptyFraction{}) + lc.infoCache.Remove(remote.instance.Info().Name()) + remote.instance.Suicide() + } + }() +} + +// CleanLocal deletes outdated local fractions when offloading is disabled +func (lc *lifecycleManager) CleanLocal(sizeLimit uint64, wg *sync.WaitGroup) { + toDelete, err := lc.registry.EvictLocal(false, sizeLimit) + if err != nil { + logger.Fatal("error releasing old fractions:", zap.Error(err)) + } + if len(toDelete) > 0 && !lc.flags.IsCapacityExceeded() { + if err := lc.flags.setCapacityExceeded(true); err != nil { + logger.Fatal("can't set capacity_exceeded flag", zap.Error(err)) + } + } + + wg.Add(1) + go func() { + defer wg.Done() + for _, sealed := range toDelete { + sealed.proxy.Redirect(emptyFraction{}) + lc.infoCache.Remove(sealed.instance.Info().Name()) + sealed.instance.Suicide() + maintenanceTruncateTotal.Add(1) + } + }() +} + +// UpdateOldestMetric updates the prometheus metric with oldest fraction timestamp +func (lc *lifecycleManager) UpdateOldestMetric() { + 
oldestFracTime.WithLabelValues("remote").Set((time.Duration(lc.registry.OldestTotal()) * time.Millisecond).Seconds()) + oldestFracTime.WithLabelValues("local").Set((time.Duration(lc.registry.OldestLocal()) * time.Millisecond).Seconds()) +} diff --git a/fracmanager/lifecycle_manager_test.go b/fracmanager/lifecycle_manager_test.go new file mode 100644 index 00000000..c277c98e --- /dev/null +++ b/fracmanager/lifecycle_manager_test.go @@ -0,0 +1,160 @@ +package fracmanager + +import ( + "math/rand" + "path/filepath" + "sync" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/ozontech/seq-db/consts" +) + +func setupLifecycle(t testing.TB, cfg *Config) (*lifecycleManager, func()) { + provider, tearDown := setupFractionProvider(t, cfg) + dataDir := provider.config.DataDir + infoCache := NewFracInfoCache(filepath.Join(dataDir, consts.FracCacheFileSuffix)) + + registry, err := NewFractionRegistry(provider.CreateActive(), nil, nil) + assert.NoError(t, err) + + storageState, err := NewStateManager(dataDir, defaultStorageState) + assert.NoError(t, err) + + lifecycle := newLifecycleManager(infoCache, provider, storageState, registry) + + return lifecycle, tearDown +} + +func TestFracInfoCache(t *testing.T) { + lc, tearDown := setupLifecycle(t, nil) + defer tearDown() + + var total uint64 + + fillRotateAndCheck := func(names map[string]struct{}) { + active := lc.registry.Active() + appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + + wg := sync.WaitGroup{} + lc.Rotate(0, &wg) + wg.Wait() + + info := active.proxy.Info() + _, ok := lc.infoCache.Get(info.Name()) + assert.True(t, ok) + + total += info.FullSize() + names[info.Name()] = struct{}{} + } + + first := map[string]struct{}{} + for range 10 { + fillRotateAndCheck(first) + } + halfSize := total + + second := map[string]struct{}{} + for range 10 { + fillRotateAndCheck(second) + } + + wg := sync.WaitGroup{} + lc.CleanLocal(total-halfSize, &wg) + wg.Wait() + + for n := range first { + _, ok := lc.infoCache.Get(n) + assert.False(t, ok, "expect the first part to be deleted") + } + + for n := range second { + _, ok := lc.infoCache.Get(n) + assert.True(t, ok, "expect the second part to still be present") + } +} + +func TestCapacityExceeded(t *testing.T) { + lc, tearDown := setupLifecycle(t, nil) + defer tearDown() + + const fracsCount = 10 + var total uint64 + + fillAndRotate := func() { + active := lc.registry.Active() + appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + + wg := sync.WaitGroup{} + lc.Rotate(0, &wg) + wg.Wait() + + info := active.proxy.Info() + total += info.FullSize() + } + + assert.False(t, lc.flags.IsCapacityExceeded(), "expect data dir is empty") + + // make some fracs + for range fracsCount { + fillAndRotate() + } + assert.False(t, lc.flags.IsCapacityExceeded(), "there should be no deletions and the flag is false") + + wg := sync.WaitGroup{} + lc.CleanLocal(total, &wg) + wg.Wait() + + assert.Equal(t, fracsCount, lc.registry.Stats().locals.count, "as much as was added, so much should be") + assert.False(t, lc.flags.IsCapacityExceeded(), "there should still be no deletions, and the flag is false") + + lc.CleanLocal(total-1, &wg) + wg.Wait() + + assert.Equal(t, fracsCount-1, lc.registry.Stats().locals.count, "expect one less") + assert.True(t, lc.flags.IsCapacityExceeded(), "the flag must be true now") +} + +func TestOldestMetrics(t *testing.T) { + lc, tearDown := setupLifecycle(t, nil) + defer tearDown() + + const fracsCount = 10 + var total uint64 + + fillAndRotate := func() { + active := 
lc.registry.Active() + appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + wg := sync.WaitGroup{} + lc.Rotate(0, &wg) + wg.Wait() + + info := active.proxy.Info() + total += info.FullSize() + } + + firstFracTime := lc.registry.Active().proxy.Info().CreationTime + for range fracsCount { + fillAndRotate() + } + + // Check state after initial rotations + assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should point to the very first fraction when all data is local") + assert.Equal(t, firstFracTime, lc.registry.OldestLocal(), "should point to the first fraction when nothing is offloaded") + + halfSize := total + halfwayFracTime := lc.registry.Active().proxy.Info().CreationTime + for range fracsCount { + fillAndRotate() + } + + wg := sync.WaitGroup{} + lc.OffloadLocal(t.Context(), total-halfSize, &wg) + wg.Wait() + + // Check state after offloading + assert.NotEqual(t, firstFracTime, halfwayFracTime, "expect different creation times") + assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should still reference the first fraction after offload") + assert.Equal(t, halfwayFracTime, lc.registry.OldestLocal(), "should point to the oldest remaining local fraction after offload") +} diff --git a/fracmanager/loader.go b/fracmanager/loader.go index 41322bbf..5229c040 100644 --- a/fracmanager/loader.go +++ b/fracmanager/loader.go @@ -9,7 +9,8 @@ import ( "go.uber.org/zap" "golang.org/x/sync/errgroup" - "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/logger" ) @@ -38,17 +39,17 @@ func NewLoader(config *Config, provider *fractionProvider, infoCache *fracInfoCa // Load is the main method for loading all fractions // Coordinates the entire process: discovery, validation, recovery, and ordering -func (l *Loader) Load(ctx context.Context) (*frac.Active, []*frac.Sealed, []*frac.Remote, error) { +func (l *Loader) Load(ctx context.Context) (*fractionRegistry, error) { // Stage 1: Discover all fractions in filesystem actives, locals, remotes, err := l.discover(ctx) if err != nil { - return nil, nil, nil, err + return nil, err } // Stage 2: Replay active fractions and seal them active, sealed, err := l.replayAndSeal(ctx, actives) if err != nil { - return nil, nil, nil, err + return nil, err } // Stage 3: Create new active fraction if no existing ones @@ -58,12 +59,12 @@ func (l *Loader) Load(ctx context.Context) (*frac.Active, []*frac.Sealed, []*fra // Stage 4: Combine all local fractions locals = append(locals, sealed...) 
- return active, locals, remotes, nil + return NewFractionRegistry(active, locals, remotes) } // replayAndSeal replays active fractions and seals old ones // Key method for ensuring data consistency during restart -func (l *Loader) replayAndSeal(ctx context.Context, actives []*frac.Active) (*frac.Active, []*frac.Sealed, error) { +func (l *Loader) replayAndSeal(ctx context.Context, actives []*active.Active) (*active.Active, []*sealed.Sealed, error) { if len(actives) == 0 { return nil, nil, nil } @@ -71,7 +72,7 @@ func (l *Loader) replayAndSeal(ctx context.Context, actives []*frac.Active) (*fr g, ctx := errgroup.WithContext(ctx) g.SetLimit(l.config.ReplayWorkers) - sealed := make([]*frac.Sealed, len(actives)-1) + sealed := make([]*sealed.Sealed, len(actives)-1) for i, a := range actives[:len(actives)-1] { g.Go(func() error { @@ -115,7 +116,7 @@ func (l *Loader) replayAndSeal(ctx context.Context, actives []*frac.Active) (*fr // discover discovers all fractions in filesystem // Returns fractions separated by type: active, local, remote -func (l *Loader) discover(ctx context.Context) ([]*frac.Active, []*frac.Sealed, []*frac.Remote, error) { +func (l *Loader) discover(ctx context.Context) ([]*active.Active, []*sealed.Sealed, []*sealed.Remote, error) { // Scan and analyze fraction files. Filter valid fractions manifests, err := analyzeFiles(l.scanFiles()) if err != nil { @@ -125,9 +126,9 @@ func (l *Loader) discover(ctx context.Context) ([]*frac.Active, []*frac.Sealed, total := len(manifests) logProgress := progressLogger(time.Millisecond * 500) - actives := make([]*frac.Active, 0) - locals := make([]*frac.Sealed, 0, total) - remotes := make([]*frac.Remote, 0, total) + actives := make([]*active.Active, 0) + locals := make([]*sealed.Sealed, 0, total) + remotes := make([]*sealed.Remote, 0, total) loadedInfoCache := NewFracInfoCacheFromDisk(l.infoCache.fullPath) @@ -153,7 +154,7 @@ func (l *Loader) discover(ctx context.Context) ([]*frac.Active, []*frac.Sealed, } // loadSealed loads a sealed fraction using cache -func (l *Loader) loadSealed(basePath string, loadedInfoCache *fracInfoCache) *frac.Sealed { +func (l *Loader) loadSealed(basePath string, loadedInfoCache *fracInfoCache) *sealed.Sealed { info, found := loadedInfoCache.Get(filepath.Base(basePath)) l.updateStats(found) @@ -163,7 +164,7 @@ func (l *Loader) loadSealed(basePath string, loadedInfoCache *fracInfoCache) *fr } // loadRemote loads a remote fraction -func (l *Loader) loadRemote(ctx context.Context, basePath string, loadedInfoCache *fracInfoCache) *frac.Remote { +func (l *Loader) loadRemote(ctx context.Context, basePath string, loadedInfoCache *fracInfoCache) *sealed.Remote { info, found := loadedInfoCache.Get(filepath.Base(basePath)) l.updateStats(found) diff --git a/fracmanager/loader_test.go b/fracmanager/loader_test.go index de92ad19..43a1e967 100644 --- a/fracmanager/loader_test.go +++ b/fracmanager/loader_test.go @@ -8,30 +8,29 @@ import ( "testing" "time" - insaneJSON "github.com/ozontech/insane-json" "github.com/stretchr/testify/assert" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/active" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/seq" ) func setupLoaderTest(t testing.TB, cfg *Config) (*fractionProvider, *Loader, func()) { fp, tearDown := setupFractionProvider(t, cfg) + cfg = fp.config ic := NewFracInfoCache(filepath.Join(cfg.DataDir, consts.FracCacheFileSuffix)) 
loader := NewLoader(cfg, fp, ic) return fp, loader, tearDown } -func appendDocs(t *testing.T, active *frac.Active, docCount int) { +func appendDocsToActive(t testing.TB, active *active.Active, docCount int) { dp := indexer.NewTestDocProvider() - for i := 0; i < docCount; i++ { + for i := 1; i <= docCount; i++ { doc := []byte("{\"timestamp\": 0, \"message\": \"msg\"}") - docRoot, err := insaneJSON.DecodeBytes(doc) - assert.NoError(t, err) - dp.Append(doc, docRoot, seq.SimpleID(i), "service:100500", "k8s_pod", "_all_:") + dp.Append(doc, seq.SimpleID(i), "service:100500", "k8s_pod", "_all_:") } docs, metas := dp.Provide() @@ -50,10 +49,10 @@ func TestReplayWithEmptyActive(t *testing.T) { defer tearDown() // fill data - actives := make([]*frac.Active, 0, fracCount) + actives := make([]*active.Active, 0, fracCount) for i := 0; i < fracCount; i++ { active := fp.CreateActive() - appendDocs(t, active, 500+rand.Intn(100)) + appendDocsToActive(t, active, 500+rand.Intn(100)) actives = append(actives, active) } actives = append(actives, fp.CreateActive()) // last active frac is now empty @@ -80,12 +79,12 @@ func TestReplayWithMultipleEmpty(t *testing.T) { defer tearDown() // fill data - nonEmpty := make([]*common.Info, 0) - actives := make([]*frac.Active, 0, fracCount) + nonEmpty := make([]*frac.Info, 0) + actives := make([]*active.Active, 0, fracCount) for i := 0; i < fracCount; i++ { active := fp.CreateActive() if i%3 == 0 { - appendDocs(t, active, 500+rand.Intn(100)) + appendDocsToActive(t, active, 500+rand.Intn(100)) nonEmpty = append(nonEmpty, active.Info()) } actives = append(actives, active) @@ -113,28 +112,30 @@ func TestReplayMultiple(t *testing.T) { defer tearDown() // fill data - actives := make([]*frac.Active, 0, fracCount) + actives := make([]*active.Active, 0, fracCount) // empty active fractions for replay for i := 0; i < fracCount; i++ { - active := fp.CreateActive() - appendDocs(t, active, 500+rand.Intn(100)) - actives = append(actives, active) + a := fp.CreateActive() + appendDocsToActive(t, a, 500+rand.Intn(100)) + actives = append(actives, fp.NewActive(a.BaseFileName)) } - active := fp.CreateActive() - appendDocs(t, active, 5) - actives = append(actives, active) + a := fp.CreateActive() + appendDocsToActive(t, a, 5) + actives = append(actives, fp.NewActive(a.BaseFileName)) // replay and seal - active, sealed, err := loader.replayAndSeal(t.Context(), actives) + a, s, err := loader.replayAndSeal(t.Context(), actives) assert.NoError(t, err) // checks - assert.Equal(t, len(actives), len(sealed)+1, "should replay same number of fractions") + assert.Equal(t, len(actives)-1, len(s), "should replay same number of fractions") for i := 0; i < fracCount; i++ { - assert.Equal(t, actives[i].Info().Name(), sealed[i].Info().Name(), "fraction %d should have the same name", i) - assert.Equal(t, actives[i].Info().DocsTotal, sealed[i].Info().DocsTotal, "fraction %d should have the same doc count", i) + assert.Equal(t, actives[i].Info().Name(), s[i].Info().Name(), "fraction %d should have the same name", i) + assert.Equal(t, actives[i].Info().DocsTotal, s[i].Info().DocsTotal, "fraction %d should have the same doc count", i) } - assert.Equal(t, actives[fracCount].Info().Name(), active.Info().Name(), "new active fraction should have the same name") - assert.Equal(t, uint32(5), active.Info().DocsTotal, "new active fraction should not be empty") + assert.Equal(t, actives[fracCount].Info().Name(), a.Info().Name(), "new active fraction should have the same name") + assert.Equal(t, uint32(5), 
a.Info().DocsTotal, + "new active fraction should have exact 5 docs but %d given", a.Info().DocsTotal, + ) } func TestReplaySingleEmpty(t *testing.T) { @@ -143,7 +144,7 @@ func TestReplaySingleEmpty(t *testing.T) { defer tearDown() // fill data: one empty fraction - actives := []*frac.Active{fp.CreateActive()} + actives := []*active.Active{fp.CreateActive()} // replay and seal active, sealed, err := loader.replayAndSeal(t.Context(), actives) @@ -163,10 +164,10 @@ func TestReplayContextCancel(t *testing.T) { defer tearDown() // fill data - actives := make([]*frac.Active, 0, fracCount) + actives := make([]*active.Active, 0, fracCount) for i := 0; i < fracCount; i++ { active := fp.CreateActive() - appendDocs(t, active, 500+rand.Intn(100)) + appendDocsToActive(t, active, 500+rand.Intn(100)) actives = append(actives, active) } actives = append(actives, fp.CreateActive()) @@ -188,8 +189,8 @@ func TestReplaySingleNonEmpty(t *testing.T) { defer tearDown() // fill data - actives := []*frac.Active{fp.CreateActive()} - appendDocs(t, actives[0], 500+rand.Intn(100)) + actives := []*active.Active{fp.CreateActive()} + appendDocsToActive(t, actives[0], 500+rand.Intn(100)) // replay and seal active, sealed, err := loader.replayAndSeal(t.Context(), actives) @@ -199,3 +200,79 @@ func TestReplaySingleNonEmpty(t *testing.T) { assert.Equal(t, active.Info().Name(), actives[0].Info().Name(), "should have the same name") assert.Equal(t, active.Info().DocsTotal, actives[0].Info().DocsTotal, "should have the same doc count for replayed frac") } + +func TestDiscover(t *testing.T) { + const fracCount = 16 + + // setup + fp, loader, tearDown := setupLoaderTest(t, nil) + defer tearDown() + + // make some sealed fracs + expectedSealed := map[string]*sealed.Sealed{} + for range fracCount { + a := fp.CreateActive() + appendDocsToActive(t, a, 10+rand.Intn(10)) + s, err := fp.Seal(a) + assert.NoError(t, err) + expectedSealed[s.Info().Name()] = s + } + + // make half sealed fracs remote + expectedRemote := map[string]*sealed.Remote{} + for n, s := range expectedSealed { + if rand.Intn(2) != 0 { + continue + } + r, err := fp.Offload(t.Context(), s) + assert.NoError(t, err) + expectedRemote[n] = r + s.Suicide() + delete(expectedSealed, n) + } + + // make half sealed fracs deleted + for n, s := range expectedSealed { + if rand.Intn(2) != 0 { + continue + } + s.Suicide() + delete(expectedSealed, n) + } + + // make half remote fracs deleted + for n, r := range expectedRemote { + if rand.Intn(2) != 0 { + continue + } + r.Suicide() + delete(expectedRemote, n) + } + + // make active + a := fp.CreateActive() + appendDocsToActive(t, a, 10+rand.Intn(10)) + + // discover from FS + actives, locals, remotes, err := loader.discover(t.Context()) + assert.NoError(t, err) + + // checks + for _, s := range locals { + n := s.Info().Name() + _, ok := expectedSealed[n] + delete(expectedSealed, n) + assert.True(t, ok, "not deleted sealed should be discovered") + } + for _, s := range remotes { + n := s.Info().Name() + _, ok := expectedRemote[n] + delete(expectedRemote, n) + assert.True(t, ok, "not deleted remote should be discovered %s", n) + } + + assert.Equal(t, 1, len(actives), "only one active should be discovered") + assert.Equal(t, a.BaseFileName, actives[0].BaseFileName, "must be the same name") + assert.Empty(t, expectedSealed, "we don't expect any more sealed fractions") + assert.Empty(t, expectedRemote, "we don't expect any more remote fractions") +} diff --git a/fracmanager/proxy_frac.go b/fracmanager/proxy_frac.go index 
11e9fc85..349c054a 100644 --- a/fracmanager/proxy_frac.go +++ b/fracmanager/proxy_frac.go @@ -3,234 +3,173 @@ package fracmanager import ( "context" "errors" - "fmt" + "math" "sync" "time" "go.uber.org/zap" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric" "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/util" ) -var ErrSealingFractionSuicided = errors.New("sealing fraction is suicided") - -/** - * Possible states (only 4): - * -------------------------------------------------------- - * | | f.active | f.sealed | f.readonly | - * -------------------------------------------------------- - * | Active & Writable | value | nil | false | - * -------------------------------------------------------- - * | Sealing | value | nil | true | - * -------------------------------------------------------- - * | Sealed | nil | value | true | - * -------------------------------------------------------- - * | Suicided | nil | nil | true | - * -------------------------------------------------------- - * All other states are impossible. - */ - -type proxyFrac struct { - fp *fractionProvider - - useMu sync.RWMutex - active *frac.Active - sealed *frac.Sealed - readonly bool - - name string - - indexWg sync.WaitGroup - sealWg sync.WaitGroup -} - -func newProxyFrac(active *frac.Active, fp *fractionProvider) *proxyFrac { - return &proxyFrac{ - fp: fp, - active: active, - name: active.BaseFileName, - } -} - -func (f *proxyFrac) cur() frac.Fraction { - f.useMu.RLock() - defer f.useMu.RUnlock() - - if f.active != nil { - return f.active - } +var ( + _ frac.Fraction = (*fractionProxy)(nil) + _ frac.Fraction = (*emptyFraction)(nil) - if f.sealed != nil { - metric.CountersTotal.WithLabelValues("use_sealed_from_active").Inc() - return f.sealed - } + ErrFractionNotWritable = errors.New("fraction is not writable") +) - metric.CountersTotal.WithLabelValues("use_empty_from_active").Inc() - return frac.EmptyFraction +// fractionProxy provides thread-safe access to a fraction with atomic replacement +// Used to switch fraction implementations (active → sealed → remote) without blocking readers. +// Lifecycle: Created for each fraction, persists through state transitions. 
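+//
+// Illustrative sketch of a transition (names are placeholders, not part of this change):
+//
+//	p := &fractionProxy{impl: activeFrac} // readers start on the active fraction
+//	p.Redirect(sealedFrac)                // after sealing, readers atomically switch to the sealed copy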
+type fractionProxy struct { + mu sync.RWMutex + impl frac.Fraction // Current fraction implementation } -func (f *proxyFrac) IsIntersecting(from, to seq.MID) bool { - return f.cur().IsIntersecting(from, to) +func (p *fractionProxy) Redirect(f frac.Fraction) { + p.mu.Lock() + defer p.mu.Unlock() + p.impl = f } -func (f *proxyFrac) Contains(mid seq.MID) bool { - return f.cur().Contains(mid) +func (p *fractionProxy) Info() *frac.Info { + p.mu.RLock() + defer p.mu.RUnlock() + return p.impl.Info() } -func (f *proxyFrac) Info() *common.Info { - return f.cur().Info() +func (p *fractionProxy) IsIntersecting(from, to seq.MID) bool { + p.mu.RLock() + defer p.mu.RUnlock() + return p.impl.IsIntersecting(from, to) } -func (f *proxyFrac) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - return f.cur().Fetch(ctx, ids) +func (p *fractionProxy) Contains(mid seq.MID) bool { + p.mu.RLock() + defer p.mu.RUnlock() + return p.impl.Contains(mid) } -func (f *proxyFrac) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - return f.cur().Search(ctx, params) +func (p *fractionProxy) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { + p.mu.RLock() + defer p.mu.RUnlock() + return p.impl.Fetch(ctx, ids) } -func (f *proxyFrac) Append(docs, meta []byte) error { - f.useMu.RLock() - if !f.isActiveState() { - f.useMu.RUnlock() - return errors.New("fraction is not writable") - } - active := f.active - f.indexWg.Add(1) // It's important to put wg.Add() inside a lock, otherwise we might call WaitWriteIdle() before it - f.useMu.RUnlock() - - return active.Append(docs, meta, &f.indexWg) +func (p *fractionProxy) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { + p.mu.RLock() + defer p.mu.RUnlock() + return p.impl.Search(ctx, params) } -func (f *proxyFrac) WaitWriteIdle() { - start := time.Now() - logger.Info("waiting fraction to stop write...", zap.String("name", f.name)) - f.indexWg.Wait() - waitTime := util.DurationToUnit(time.Since(start), "s") - logger.Info("write is stopped", zap.String("name", f.name), zap.Float64("time_wait_s", waitTime)) -} +// activeProxy manages an active (writable) fraction +// Tracks pending write operations and provides freeze capability. +// Lifecycle: Created when fraction becomes active, destroyed after sealing. 
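+// Append registers each write in the wait group before releasing the read lock, so Finalize followed by WaitWriteIdle observes every in-flight write.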
+type activeProxy struct { + proxy *fractionProxy // Thread-safe fraction access + instance *active.Active // Actual active fraction instance + sealed *sealed.Sealed // Sealed version (set after sealing) -func (f *proxyFrac) Seal() (*frac.Sealed, error) { - f.useMu.Lock() - if f.isSuicidedState() { - f.useMu.Unlock() - return nil, ErrSealingFractionSuicided - } - - if !f.isActiveState() { - f.useMu.Unlock() - return nil, errors.New("sealing fraction is not active") - } + mu sync.RWMutex // Protects readonly state + wg sync.WaitGroup // Tracks pending write operations - f.readonly = true - active := f.active - - f.sealWg.Add(1) // It's important to put wg.Add() inside a lock, otherwise we might call wg.Wait() before it - f.useMu.Unlock() - - f.WaitWriteIdle() - - sealed, err := f.fp.Seal(active) - if err != nil { - return nil, err - } - - f.useMu.Lock() - f.sealed = sealed - f.active = nil - f.useMu.Unlock() - - f.sealWg.Done() - - active.Release() - - return sealed, nil + finalized bool // Whether fraction is frozen for writes } -// trySetSuicided set suicided state if possible (if not sealing right now) -func (f *proxyFrac) trySetSuicided() (*frac.Active, *frac.Sealed, bool) { - f.useMu.Lock() - defer f.useMu.Unlock() - - sealed := f.sealed - active := f.active - - // We must compute `isSealing` before - // we change fraction to read-only. - isSealing := f.isSealingState() - - // If the object is in active state, switch to read-only mode - if f.isActiveState() { - f.readonly = true +func newActiveProxy(active *active.Active) *activeProxy { + return &activeProxy{ + proxy: &fractionProxy{impl: active}, + instance: active, } +} - // If sealing is not in progress, we can safely clear the state - if !isSealing { - f.sealed = nil - f.active = nil +// Append adds documents to the active fraction +func (p *activeProxy) Append(docs, meta []byte) error { + p.mu.RLock() + if p.finalized { + p.mu.RUnlock() + return ErrFractionNotWritable } + p.wg.Add(1) // Important: wg.Add() inside lock to prevent race with WaitWriteIdle() + p.mu.RUnlock() - return active, sealed, isSealing + return p.instance.Append(docs, meta, &p.wg) } -func (f *proxyFrac) Offload(ctx context.Context, u storage.Uploader) (bool, error) { - f.useMu.RLock() - - if f.isSealingState() { - f.useMu.RUnlock() - f.sealWg.Wait() - - if c := f.cur(); c != nil { - return c.Offload(ctx, u) - } +// WaitWriteIdle waits for all pending write operations to complete +// Used before sealing to ensure data consistency. +func (p *activeProxy) WaitWriteIdle() { + start := time.Now() + logger.Info("waiting fraction to stop write...", zap.String("name", p.instance.BaseFileName)) + p.wg.Wait() + waitTime := util.DurationToUnit(time.Since(start), "s") + logger.Info("write is stopped", + zap.String("name", p.instance.BaseFileName), + zap.Float64("time_wait_s", waitTime)) +} - return false, nil +// Finalize marks the fraction as read-only and prevents new writes from starting after finalize. +func (p *activeProxy) Finalize() error { + p.mu.Lock() + if p.finalized { + p.mu.Unlock() + return errors.New("fraction is already finalized") } + p.finalized = true + p.mu.Unlock() - f.useMu.RUnlock() - return f.cur().Offload(ctx, u) + return nil } -func (f *proxyFrac) Suicide() { - active, sealed, isSealing := f.trySetSuicided() +// sealedProxy represents a sealed fraction that may be offloaded +// Tracks both local sealed instance and remote version if offloaded. 
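+// The remote field stays nil until offloading finishes; PromoteToRemote uses it to decide when the fraction may leave the offloading queue.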
+type sealedProxy struct { + proxy *fractionProxy // Thread-safe fraction access + instance *sealed.Sealed // Local sealed fraction + remote *sealed.Remote // Remote version (if offloaded) +} - if isSealing { - f.sealWg.Wait() - // we can get `sealing` == true only once here - // next attempt after Wait() should be successful - active, sealed, _ = f.trySetSuicided() - } +// remoteProxy represents an offloaded fraction +type remoteProxy struct { + proxy *fractionProxy // Thread-safe fraction access + instance *sealed.Remote // Remote fraction instance +} - if active != nil { - // Wait for write operations to complete before suiciding - f.WaitWriteIdle() - active.Suicide() - } +// emptyFraction represents a missing or deleted fraction +// Returns empty results for all operations. +// Used as placeholder when fraction is removed but references still exist. +type emptyFraction struct { +} - if sealed != nil { - sealed.Suicide() +func (emptyFraction) Info() *frac.Info { + return &frac.Info{ + Path: "empty", + From: math.MaxUint64, + To: 0, } } -func (f *proxyFrac) String() string { - return fmt.Sprintf("%s", f.cur()) +func (emptyFraction) IsIntersecting(_, _ seq.MID) bool { + return false } -func (f *proxyFrac) isActiveState() bool { - return f.active != nil && f.sealed == nil && !f.readonly +func (emptyFraction) Contains(mid seq.MID) bool { + return false } -func (f *proxyFrac) isSealingState() bool { - return f.active != nil && f.sealed == nil && f.readonly +func (emptyFraction) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { + return nil, nil } -func (f *proxyFrac) isSuicidedState() bool { - return f.active == nil && f.sealed == nil +func (emptyFraction) Search(_ context.Context, params processor.SearchParams) (*seq.QPR, error) { + metric.CountersTotal.WithLabelValues("empty_data_provider").Inc() + return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil } diff --git a/fracmanager/sealer_test.go b/fracmanager/sealer_test.go index d4251b16..ca369959 100644 --- a/fracmanager/sealer_test.go +++ b/fracmanager/sealer_test.go @@ -13,12 +13,11 @@ import ( "time" "github.com/alecthomas/units" - insaneJSON "github.com/ozontech/insane-json" "github.com/pkg/profile" "github.com/stretchr/testify/assert" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/active" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" @@ -36,12 +35,9 @@ func TestMain(m *testing.M) { m.Run() } -func fillActiveFraction(active *frac.Active) error { +func fillActiveFraction(active *active.Active) error { const muliplier = 10 - docRoot := insaneJSON.Spawn() - defer insaneJSON.Release(docRoot) - file, err := os.Open(filepath.Join(testscommon.TestDataDir, "k8s.logs")) if err != nil { return err @@ -62,12 +58,8 @@ func fillActiveFraction(active *frac.Active) error { for scanner.Scan() { k++ doc := scanner.Bytes() - if err := docRoot.DecodeBytes(doc); err != nil { - return err - } - id := seq.NewID(time.Now(), uint64(rand.Int63())) - dp.Append(doc, docRoot, id, + dp.Append(doc, id, "_all_:", "service:service"+strconv.Itoa(rand.Intn(200)), "k8s_pod1:"+strconv.Itoa(k%100000), @@ -85,9 +77,9 @@ func fillActiveFraction(active *frac.Active) error { return nil } -func defaultSealingParams() common.SealParams { +func defaultSealingParams() frac.SealParams { const minZstdLevel = 1 - return common.SealParams{ + return frac.SealParams{ IDsZstdLevel: minZstdLevel, LIDsZstdLevel: 
minZstdLevel, TokenListZstdLevel: minZstdLevel, @@ -110,19 +102,19 @@ func runSealingBench(b *testing.B, cfg *frac.Config) { fp, tearDown := setupFractionProvider(b, &Config{Fraction: *cfg}) defer tearDown() - active := fp.CreateActive() - err := fillActiveFraction(active) + a := fp.CreateActive2() + err := fillActiveFraction(a) assert.NoError(b, err) - seal := func(active *frac.Active, params common.SealParams) (*sealed.PreloadedData, error) { - src, err := frac.NewActiveSealingSource(active, params) + seal := func(a *active.Active, params frac.SealParams) (*sealed.PreloadedData, error) { + src, err := active.NewSealingSource(a, params) assert.NoError(b, err) return sealing.Seal(src, params) } params := defaultSealingParams() // The first sealing will sort all the LIDs, so we take this load out of the measurement range - _, err = seal(active, params) + _, err = seal(a, params) assert.NoError(b, err) b.ReportAllocs() @@ -142,7 +134,7 @@ func runSealingBench(b *testing.B, cfg *frac.Config) { } for b.Loop() { - _, err = seal(active, params) + _, err = seal(a, params) assert.NoError(b, err) } } diff --git a/fracmanager/searcher_test.go b/fracmanager/searcher_test.go index e584e9a1..c9261a7e 100644 --- a/fracmanager/searcher_test.go +++ b/fracmanager/searcher_test.go @@ -10,15 +10,13 @@ import ( "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/parser" "github.com/ozontech/seq-db/seq" ) type testFakeFrac struct { - frac.Empty - info *common.Info + info *frac.Info qpr *seq.QPR searchesCount int fetchCount int @@ -26,7 +24,7 @@ type testFakeFrac struct { fetchError error } -func (f *testFakeFrac) Info() *common.Info { +func (f *testFakeFrac) Info() *frac.Info { return f.info } @@ -65,7 +63,7 @@ func (f *testFakeFrac) Search(context.Context, processor.SearchParams) (*seq.QPR func newFakeFrac(from, to seq.MID, qpr *seq.QPR) *testFakeFrac { return &testFakeFrac{ - info: &common.Info{From: from, To: to, DocsTotal: 1}, + info: &frac.Info{From: from, To: to, DocsTotal: 1}, qpr: qpr, documents: make(map[seq.ID][]byte), } @@ -73,14 +71,14 @@ func newFakeFrac(from, to seq.MID, qpr *seq.QPR) *testFakeFrac { func newFakeFracWithDocs(from, to seq.MID, documents map[seq.ID][]byte) *testFakeFrac { return &testFakeFrac{ - info: &common.Info{From: from, To: to, DocsTotal: uint32(len(documents))}, + info: &frac.Info{From: from, To: to, DocsTotal: uint32(len(documents))}, documents: documents, } } func newFakeFracWithFetchError(from, to seq.MID, fetchError error) *testFakeFrac { return &testFakeFrac{ - info: &common.Info{From: from, To: to, DocsTotal: 1}, + info: &frac.Info{From: from, To: to, DocsTotal: 1}, documents: make(map[seq.ID][]byte), fetchError: fetchError, } diff --git a/go.mod b/go.mod index 6b9439ec..ca86ec24 100644 --- a/go.mod +++ b/go.mod @@ -45,6 +45,7 @@ require ( ) require ( + github.com/RoaringBitmap/roaring/v2 v2.14.4 // indirect github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 // indirect @@ -61,11 +62,13 @@ require ( github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 // indirect github.com/aws/smithy-go v1.23.2 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/bits-and-blooms/bitset v1.24.2 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew 
v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/google/pprof v0.0.0-20250422154841-e1f9c1950416 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/mschoch/smat v0.2.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect diff --git a/go.sum b/go.sum index 6989060d..8803c552 100644 --- a/go.sum +++ b/go.sum @@ -27,6 +27,8 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/KimMachineGun/automemlimit v0.7.5 h1:RkbaC0MwhjL1ZuBKunGDjE/ggwAX43DwZrJqVwyveTk= github.com/KimMachineGun/automemlimit v0.7.5/go.mod h1:QZxpHaGOQoYvFhv/r4u3U0JTC2ZcOwbSr11UZF46UBM= +github.com/RoaringBitmap/roaring/v2 v2.14.4 h1:4aKySrrg9G/5oRtJ3TrZLObVqxgQ9f1znCRBwEwjuVw= +github.com/RoaringBitmap/roaring/v2 v2.14.4/go.mod h1:oMvV6omPWr+2ifRdeZvVJyaz+aoEUopyv5iH0u/+wbY= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafoB+tBA3gMyHYHrpOtNuDiK/uB5uXxq5wM= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b h1:mimo19zliBX/vSQ6PWWSL9lK8qwHozUj03+zLoEB8O0= @@ -71,6 +73,8 @@ github.com/aws/smithy-go v1.23.2 h1:Crv0eatJUQhaManss33hS5r40CG3ZFH+21XSkqMrIUM= github.com/aws/smithy-go v1.23.2/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bits-and-blooms/bitset v1.24.2 h1:M7/NzVbsytmtfHbumG+K2bremQPMJuqv1JD3vOaFxp0= +github.com/bits-and-blooms/bitset v1.24.2/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/c2h5oh/datasize v0.0.0-20200112174442-28bbd4740fee h1:BnPxIde0gjtTnc9Er7cxvBk8DHLWhEux0SxayC8dP6I= github.com/c2h5oh/datasize v0.0.0-20200112174442-28bbd4740fee/go.mod h1:S/7n9copUssQ56c7aAgHqftWO4LTf4xY6CGWt8Bc+3M= github.com/cactus/go-statsd-client v3.1.1+incompatible/go.mod h1:cMRcwZDklk7hXp+Law83urTHUiHMzCev/r4JMYr/zU0= @@ -196,6 +200,8 @@ github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= +github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/oklog/ulid/v2 v2.1.1 h1:suPZ4ARWLOJLegGFiZZ1dFAkqzhMjL3J1TzI+5wHz8s= diff --git a/indexer/meta_data.go b/indexer/meta_data.go index 241f219f..5d33f7b5 100644 --- a/indexer/meta_data.go +++ b/indexer/meta_data.go @@ -10,10 +10,11 @@ import ( ) type MetaData struct { - ID seq.ID - // Size of an uncompressed document in bytes. 
- Size uint32 - Tokens []tokenizer.MetaToken + ID seq.ID + Size uint32 // Size of an uncompressed document in bytes. + Tokens []tokenizer.MetaToken + tokensCount uint32 + tokensBin []byte } // String used in tests for human-readable output. @@ -72,6 +73,23 @@ func (m *MetaData) UnmarshalBinary(b []byte) error { } } +func (m *MetaData) UnmarshalBinaryLazy(b []byte) error { + if !IsItBinaryEncodedMetaData(b) { + return fmt.Errorf("invalid metadata magic bytes") + } + b = b[2:] + + version := binary.LittleEndian.Uint16(b) + b = b[2:] + + switch version { + case 1: + return m.unmarshalVersion1Lazy(b) + default: + return fmt.Errorf("unimplemented metadata version: %d", version) + } +} + func (m *MetaData) unmarshalVersion1(b []byte) error { // Decode seq.ID. m.ID.MID = seq.MID(binary.LittleEndian.Uint64(b)) @@ -101,3 +119,42 @@ func (m *MetaData) unmarshalVersion1(b []byte) error { } return nil } + +func (m *MetaData) unmarshalVersion1Lazy(b []byte) error { + // Decode seq.ID. + m.ID.MID = seq.MID(binary.LittleEndian.Uint64(b)) + b = b[8:] + m.ID.RID = seq.RID(binary.LittleEndian.Uint64(b)) + b = b[8:] + + // Decode uncompressed document size. + m.Size = binary.LittleEndian.Uint32(b) + b = b[4:] + + m.tokensCount = binary.LittleEndian.Uint32(b) + b = b[4:] + + m.tokensBin = b + + return nil +} + +func (m *MetaData) DecodeTokens(tokens []tokenizer.MetaToken) ([]tokenizer.MetaToken, error) { + b := m.tokensBin + + // Decode tokens. + tokens = tokens[:0] + tokens = slices.Grow(tokens, int(m.tokensCount))[:m.tokensCount] + + for i := range tokens { + var err error + if b, err = tokens[i].UnmarshalBinary(b); err != nil { + return nil, err + } + } + return tokens, nil +} + +func (m *MetaData) TokensCount() uint32 { + return m.tokensCount +} diff --git a/indexer/processor.go b/indexer/processor.go index dbf7c106..589b67b5 100644 --- a/indexer/processor.go +++ b/indexer/processor.go @@ -210,6 +210,8 @@ func (p *Processor) ProcessBulk( dstDocs = binary.LittleEndian.AppendUint32(dstDocs, uint32(len(doc))) dstDocs = append(dstDocs, doc...) for _, m := range meta { + // todo: it is possible to have a few equal tokens here + // todo: probably we need deduplicate it here dstMeta = marshalAppendMeta(dstMeta, m) } } diff --git a/indexer/test_doc_provider.go b/indexer/test_doc_provider.go index 316464d2..0af90dde 100644 --- a/indexer/test_doc_provider.go +++ b/indexer/test_doc_provider.go @@ -2,13 +2,8 @@ package indexer import ( "encoding/binary" - "math/rand" "strings" - "time" - insaneJSON "github.com/ozontech/insane-json" - - "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/tokenizer" @@ -45,14 +40,8 @@ func (dp *TestDocProvider) appendMeta(docLen int, id seq.ID, tokens []tokenizer. dp.Metas = append(dp.Metas, dp.buf...) } -func (dp *TestDocProvider) Append(doc []byte, docRoot *insaneJSON.Root, id seq.ID, tokensStr ...string) { +func (dp *TestDocProvider) Append(doc []byte, id seq.ID, tokensStr ...string) { tokens := stringsToTokens(tokensStr...) 
- if id.MID == 0 { - // this case runs only in the integration tests - t, _ := extractDocTimeForTest(docRoot) - id = seq.NewID(t, uint64(rand.Int63())) - } - dp.appendMeta(len(doc), id, tokens) dp.appendDoc(doc) } @@ -86,34 +75,6 @@ func encodeMeta(buf []byte, tokens []tokenizer.MetaToken, id seq.ID, size int) [ return md.MarshalBinaryTo(buf) } -// extractDocTimeForTest extracts timestamp from doc -// It searches by one of supported field name and parses by supported formats -// If no field was found or not parsable it returns time.Now() -func extractDocTimeForTest(docRoot *insaneJSON.Root) (time.Time, []string) { - var t time.Time - var f []string -top: - for _, field := range consts.TimeFields { - timeNode := docRoot.Dig(field...) - if timeNode == nil { - continue - } - timeVal := timeNode.AsString() - for _, format := range consts.TimeFormats { - if value, err := time.Parse(format, timeVal); err == nil { - t = value - f = field - break top - } - } - } - - if t.IsZero() { - t = time.Now() - } - return t, f -} - func stringsToTokens(tokens ...string) []tokenizer.MetaToken { r := make([]tokenizer.MetaToken, 0) for _, tokenStr := range tokens { diff --git a/resources/call_stack.go b/resources/call_stack.go new file mode 100644 index 00000000..50df25d9 --- /dev/null +++ b/resources/call_stack.go @@ -0,0 +1,16 @@ +package resources + +type CallStack struct { + stack []func() +} + +func (s *CallStack) Defer(f func()) { + s.stack = append(s.stack, f) +} + +func (s *CallStack) CallAll() { + for i := len(s.stack) - 1; i >= 0; i-- { + s.stack[i]() + } + s.stack = s.stack[:0] +} diff --git a/resources/global_pools.go b/resources/global_pools.go new file mode 100644 index 00000000..ff403952 --- /dev/null +++ b/resources/global_pools.go @@ -0,0 +1,9 @@ +package resources + +const poolBuckets = 24 + +var ( + BytesPool = NewSizedPool[byte](poolBuckets) + Uint32SlicesPool = NewSizedPool[[]uint32](poolBuckets) + BytesSlicesPool = NewSizedPool[[]byte](poolBuckets) +) diff --git a/resources/object_allocator.go b/resources/object_allocator.go new file mode 100644 index 00000000..c5da1952 --- /dev/null +++ b/resources/object_allocator.go @@ -0,0 +1,48 @@ +package resources + +type MapsPool[K comparable, V any] struct { + pool *TypedPool[map[K]V] + releases *CallStack +} + +func NewMapsPool[K comparable, V any](pool *TypedPool[map[K]V], releases *CallStack) MapsPool[K, V] { + return MapsPool[K, V]{ + pool: pool, + releases: releases, + } +} + +func (a MapsPool[K, V]) Alloc(size int) map[K]V { + obj, ok := a.pool.Get() + if !ok { + obj = make(map[K]V, size) + } + a.releases.Defer(func() { + clear(obj) + a.pool.Put(obj) + }) + return obj +} + +type ObjectsPool[T any] struct { + pool *TypedPool[*T] + releases *CallStack +} + +func NewObjectsPool[T any](pool *TypedPool[*T], releases *CallStack) ObjectsPool[T] { + return ObjectsPool[T]{ + pool: pool, + releases: releases, + } +} + +func (a ObjectsPool[T]) Get(newFn func() *T, resetFn func(*T)) *T { + obj, ok := a.pool.Get() + if ok { + resetFn(obj) + } else { + obj = newFn() + } + a.releases.Defer(func() { a.pool.Put(obj) }) + return obj +} diff --git a/resources/sized_pool.go b/resources/sized_pool.go new file mode 100644 index 00000000..8142334e --- /dev/null +++ b/resources/sized_pool.go @@ -0,0 +1,65 @@ +package resources + +import ( + "math/bits" + "sync" +) + +type TypedPool[T any] struct { + pool sync.Pool +} + +func (p *TypedPool[T]) Get() (T, bool) { + item := p.pool.Get() + var val T + if item == nil { + return val, false + } + val, ok := item.(T) + 
return val, ok +} + +func (p *TypedPool[T]) Put(item T) { + p.pool.Put(item) +} + +type SizedPool[T any] struct { + pools []TypedPool[[]T] +} + +func NewSizedPool[T any](buckets int) SizedPool[T] { + return SizedPool[T]{ + pools: make([]TypedPool[[]T], buckets), + } +} + +func index(size uint) (idx, leftBorder int) { + idx = bits.Len((size - 1) >> 8) + return idx, 1 << (idx + 8) +} + +func (p SizedPool[T]) Get(size int) []T { + idx, poolCapacity := index(uint(size)) + + if idx < len(p.pools) { + if data, ok := p.pools[idx].Get(); ok { + return data[:size] + } + } + + return make([]T, size, poolCapacity) +} + +func (p SizedPool[T]) Put(item []T) { + capacity := cap(item) + idx, leftBorder := index(uint(capacity)) + + if idx > 0 && capacity < leftBorder { + idx-- + } + + if idx < len(p.pools) { + item = item[:0] + p.pools[idx].Put(item) + } +} diff --git a/resources/slice_allocator.go b/resources/slice_allocator.go new file mode 100644 index 00000000..fcf267d9 --- /dev/null +++ b/resources/slice_allocator.go @@ -0,0 +1,31 @@ +package resources + +func NewBytes(releases *CallStack) SlicesPool[byte] { + return NewSlicesPool(&BytesPool, releases) +} + +func NewUint32Slices(releases *CallStack) SlicesPool[[]uint32] { + return NewSlicesPool(&Uint32SlicesPool, releases) +} + +func NewBytesSlices(releases *CallStack) SlicesPool[[]byte] { + return NewSlicesPool(&BytesSlicesPool, releases) +} + +type SlicesPool[T any] struct { + pool *SizedPool[T] + releases *CallStack +} + +func NewSlicesPool[T any](pool *SizedPool[T], releases *CallStack) SlicesPool[T] { + return SlicesPool[T]{ + pool: pool, + releases: releases, + } +} + +func (a SlicesPool[T]) GetSlice(size int) []T { + data := a.pool.Get(size) + a.releases.Defer(func() { a.pool.Put(data) }) + return data[:size] +} diff --git a/resources/slice_on_bytes.go b/resources/slice_on_bytes.go new file mode 100644 index 00000000..4b827cba --- /dev/null +++ b/resources/slice_on_bytes.go @@ -0,0 +1,36 @@ +package resources + +import ( + "unsafe" +) + +func NewUint32s(releases *CallStack) SliceOnBytes[uint32] { + return NewSliceOnBytes[uint32](releases) +} + +func NewUint64s(releases *CallStack) SliceOnBytes[uint64] { + return NewSliceOnBytes[uint64](releases) +} + +type SliceOnBytes[T any] struct { + pool *SizedPool[byte] + releases *CallStack +} + +func NewSliceOnBytes[T any](releases *CallStack) SliceOnBytes[T] { + return SliceOnBytes[T]{ + pool: &BytesPool, + releases: releases, + } +} + +func (a SliceOnBytes[T]) GetSlice(size int) []T { + var empty T + itemSize := int(unsafe.Sizeof(empty)) + + buf := a.pool.Get(size * itemSize) + capacity := cap(buf) / itemSize + data := unsafe.Slice((*T)(unsafe.Pointer(unsafe.SliceData(buf))), capacity)[:size] + a.releases.Defer(func() { a.pool.Put(buf) }) + return data +} diff --git a/seq/seq.go b/seq/seq.go index 8c56cab7..797cf2c3 100644 --- a/seq/seq.go +++ b/seq/seq.go @@ -1,9 +1,12 @@ package seq import ( + "cmp" "encoding/binary" "encoding/hex" "fmt" + "math" + "slices" "time" "github.com/ozontech/seq-db/util" @@ -19,7 +22,12 @@ type RID uint64 // random part of ID type LID uint32 // local id for a fraction func (m MID) Time() time.Time { - return time.UnixMilli(int64(m)) + if uint64(m) <= math.MaxInt64 { + return time.UnixMilli(int64(m)) + } else { + // since MaxInt64 is 292278994 year in milliseconds, so we assume this MID is "infinite future" + return time.UnixMilli(math.MaxInt64) + } } func (d ID) String() string { @@ -52,6 +60,13 @@ func (d ID) Bytes() []byte { return final } +func (d ID) AppendBinary(buf 
[]byte) []byte { + buf = slices.Grow(buf, 16) + buf = binary.LittleEndian.AppendUint64(buf, uint64(d.MID)) + buf = binary.LittleEndian.AppendUint64(buf, uint64(d.RID)) + return buf +} + func LessOrEqual(a, b ID) bool { if a.MID == b.MID { return a.RID <= b.RID @@ -59,6 +74,13 @@ func LessOrEqual(a, b ID) bool { return a.MID < b.MID } +func Compare(a, b ID) int { + return cmp.Or( + cmp.Compare(a.MID, b.MID), + cmp.Compare(a.RID, b.RID), + ) +} + func Less(a, b ID) bool { if a.MID == b.MID { return a.RID < b.RID diff --git a/storage/docs_reader.go b/storage/docs_reader.go index ad5edbd8..b0429ae8 100644 --- a/storage/docs_reader.go +++ b/storage/docs_reader.go @@ -45,7 +45,7 @@ func (r *DocsReader) ReadDocsFunc(blockOffset uint64, docOffsets []uint64, cb fu block, err := r.cache.GetWithError(uint32(blockOffset), func() ([]byte, int, error) { block, _, err := r.reader.ReadDocBlockPayload(int64(blockOffset)) if err != nil { - return nil, 0, fmt.Errorf("can't fetch doc at pos %d: %w", blockOffset, err) + return nil, 0, fmt.Errorf("can't fetch doc block at pos %d: %w", blockOffset, err) } return block, cap(block), nil }) diff --git a/storeapi/grpc_v1_test.go b/storeapi/grpc_v1_test.go index e8c69038..20688ee0 100644 --- a/storeapi/grpc_v1_test.go +++ b/storeapi/grpc_v1_test.go @@ -56,7 +56,7 @@ func makeBulkRequest(cnt int) *storeapi.BulkRequest { for i := 0; i < cnt; i++ { id := seq.SimpleID(i + 1) doc := []byte("document") - dp.Append(doc, nil, id, "_all_:", "service:100500", "k8s_pod:"+strconv.Itoa(i)) + dp.Append(doc, id, "_all_:", "service:100500", "k8s_pod:"+strconv.Itoa(i)) } req := &storeapi.BulkRequest{Count: int64(cnt)} req.Docs, req.Metas = dp.Provide() @@ -67,13 +67,12 @@ func getTestGrpc(t *testing.T) (*GrpcV1, func(), func()) { dataDir := common.GetTestTmpDir(t) common.RecreateDir(dataDir) - fm, err := fracmanager.New(t.Context(), &fracmanager.Config{ + fm, stop, err := fracmanager.New(t.Context(), &fracmanager.Config{ FracSize: 500, TotalSize: 5000, DataDir: dataDir, }, nil) assert.NoError(t, err) - fm.Start() config := APIConfig{ StoreMode: "", @@ -99,7 +98,7 @@ func getTestGrpc(t *testing.T) (*GrpcV1, func(), func()) { g := NewGrpcV1(config, fm, mappingProvider) release := func() { - fm.Stop() + stop() common.RemoveDir(dataDir) } diff --git a/storeapi/store.go b/storeapi/store.go index aa06f44f..857be4e2 100644 --- a/storeapi/store.go +++ b/storeapi/store.go @@ -26,7 +26,8 @@ type Store struct { grpcAddr string grpcServer *grpcServer - FracManager *fracmanager.FracManager + FracManager *fracmanager.FracManager + fracManagerStop func() isStopped atomic.Bool } @@ -51,19 +52,19 @@ func NewStore(ctx context.Context, c StoreConfig, s3cli *s3.Client, mappingProvi return nil, err } - fracManager, err := fracmanager.New(ctx, &c.FracManager, s3cli) + fracManager, stop, err := fracmanager.New(ctx, &c.FracManager, s3cli) if err != nil { return nil, fmt.Errorf("loading fractions error: %w", err) } - fracManager.Start() return &Store{ Config: c, // We will set grpcAddr later in Start() - grpcAddr: "", - grpcServer: newGRPCServer(c.API, fracManager, mappingProvider), - FracManager: fracManager, - isStopped: atomic.Bool{}, + grpcAddr: "", + grpcServer: newGRPCServer(c.API, fracManager, mappingProvider), + FracManager: fracManager, + fracManagerStop: stop, + isStopped: atomic.Bool{}, }, nil } @@ -86,8 +87,7 @@ func (s *Store) Stop() { defer cancel() s.grpcServer.Stop(ctx) - - s.FracManager.Stop() + s.fracManagerStop() logger.Info("store stopped") } diff --git a/tests/setup/env.go 
b/tests/setup/env.go index d7af955a..5f3172ef 100644 --- a/tests/setup/env.go +++ b/tests/setup/env.go @@ -22,7 +22,7 @@ import ( "github.com/ozontech/seq-db/buildinfo" "github.com/ozontech/seq-db/consts" - "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/fracmanager" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/mappingprovider" @@ -34,7 +34,7 @@ import ( "github.com/ozontech/seq-db/seq" seqs3 "github.com/ozontech/seq-db/storage/s3" "github.com/ozontech/seq-db/storeapi" - testscommon "github.com/ozontech/seq-db/tests/common" + "github.com/ozontech/seq-db/tests/common" ) type TestingEnvConfig struct { @@ -91,7 +91,7 @@ func (cfg *TestingEnvConfig) GetFracManagerConfig(replicaID string) fracmanager. c = fracmanager.FillConfigWithDefault(&fracmanager.Config{ FracSize: 256 * uint64(units.MiB), TotalSize: 1 * uint64(units.GiB), - SealParams: common.SealParams{ + SealParams: frac.SealParams{ IDsZstdLevel: fastestZstdLevel, LIDsZstdLevel: fastestZstdLevel, TokenListZstdLevel: fastestZstdLevel, @@ -264,7 +264,7 @@ func (cfg *TestingEnvConfig) MakeStores( for i := range confs { k := i / replicas - testscommon.CreateDir(confs[i].FracManager.DataDir) + common.CreateDir(confs[i].FracManager.DataDir) mappingProvider, err := mappingprovider.New( "", @@ -429,7 +429,7 @@ func (t *TestingEnv) IngestorFetchAddr() string { } func randomListener() (lis net.Listener) { - lis, err := net.Listen("tcp", fmt.Sprintf("%s:0", testscommon.Localhost)) + lis, err := net.Listen("tcp", fmt.Sprintf("%s:0", common.Localhost)) if err != nil { panic(err) } diff --git a/util/fs.go b/util/fs.go index 57fd7b17..1e1c94cc 100644 --- a/util/fs.go +++ b/util/fs.go @@ -6,6 +6,7 @@ package util import ( "errors" "os" + "path/filepath" "go.uber.org/zap" @@ -53,3 +54,21 @@ func RemoveFile(file string) { logger.Error("file removing error", zap.Error(err)) } } + +func MustOpenFile(name string, skipFsync bool) (*os.File, os.FileInfo) { + file, err := os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0o664) + if err != nil { + logger.Fatal("can't create docs file", zap.String("file", name), zap.Error(err)) + } + + if !skipFsync { + parentDirPath := filepath.Dir(name) + MustSyncPath(parentDirPath) + } + + stat, err := file.Stat() + if err != nil { + logger.Fatal("can't stat docs file", zap.String("file", name), zap.Error(err)) + } + return file, stat +} diff --git a/util/semaphore.go b/util/semaphore.go new file mode 100644 index 00000000..77b936ff --- /dev/null +++ b/util/semaphore.go @@ -0,0 +1,36 @@ +package util + +type Semaphore struct { + b chan struct{} +} + +func NewSemaphore(capacity int) *Semaphore { + return &Semaphore{ + b: make(chan struct{}, capacity), + } +} + +func (s *Semaphore) Capacity() int { + return cap(s.b) +} + +func (s *Semaphore) InProgress() int { + return len(s.b) +} + +func (s *Semaphore) TryToAcquire() bool { + select { + case s.b <- struct{}{}: + return true + default: + return false + } +} + +func (s *Semaphore) Acquire() { + s.b <- struct{}{} +} + +func (s *Semaphore) Release() { + <-s.b +}
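+
+// Usage sketch (illustrative only; the concurrency bound of 4 is arbitrary):
+//
+//	sem := NewSemaphore(4)
+//	if sem.TryToAcquire() {
+//		defer sem.Release()
+//		// bounded work goes here
+//	}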