diff --git a/CHANGELOG.md b/CHANGELOG.md index ce15cf7..81e87a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v2.2.0 + +Add: + +- Added `BasicDiskVectorDatabase` to provide a basic automatically disk persistent vector database, and associated `BasicDiskVectorStore` and `BasicDiskVocabularyStore`. + ## v2.1.3 Add: diff --git a/docs/docs/persistence/index.md b/docs/docs/persistence/index.md index 55559ac..cf5c3c7 100644 --- a/docs/docs/persistence/index.md +++ b/docs/docs/persistence/index.md @@ -5,6 +5,8 @@ title: Data Persistence The `Build5Nines.SharpVector` library provides easy-to-use methods for saving a memory-based vector database to a file or stream and loading it again later. This is particularly useful for caching indexed content between runs, deploying pre-built vector stores, or shipping databases with your application. +--- + ## :material-file: File Persistence `Build5Nines.SharpVector` supports persisting the vector database to a file. @@ -51,6 +53,8 @@ vdb.LoadFromFile(filePath); await vdb.LoadFromFileAsync(filePath); ``` +--- + ## :material-file-move: Persist to Stream The underlying methods used by `SaveToFile` and `LoadFromFile` methods for serializing the vector database to a `Stream` are available to use directly. This provides support for reading/writing to `MemoryStream` (or other streams) if the vector database needs to be persisted to something other than the local file system. @@ -92,3 +96,30 @@ vdb.DeserializeFromBinaryStream(stream); // deserialize asynchronously from JSON stream await vdb.DeserializeFromBinaryStreamAsync(stream); ``` + +--- + +## :material-file-database: BasicDiskVectorDatabase + +The `BasicDiskVectorDatabase` provides a basic vector database implementation that automatically stores the vector store and vocabulary store to disk. It's implmentation of vectorization is the same as the `BasicMemoryVectorDatabase`, but with the modification that it automatically persists the database to disk in the background to the specified folder path. + +Here's a basic example of using `BasicDiskVectorDatabase`: + +```csharp +// specify the folder where to persist the database data on disk +var vdb = new BasicDiskVectorDatabase("C:/data/content-db"); +foreach (var doc in documents) +{ + vdb.AddText(doc.Id, doc.Text); +} + +var results = vdb.Search("some text"); + +``` + +### Tips + +- Prefer absolute paths for the storage folder in production services. +- Place the folder on fast storage (SSD) for best indexing/query performance. +- Avoid sharing the same folder across multiple processes concurrently. +- Back up the folder regularly to preserve your vector store and vocabulary. diff --git a/src/Build5Nines.SharpVector/BasicDiskMemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector/BasicDiskMemoryVectorDatabaseBase.cs new file mode 100644 index 0000000..79d2d4e --- /dev/null +++ b/src/Build5Nines.SharpVector/BasicDiskMemoryVectorDatabaseBase.cs @@ -0,0 +1,29 @@ +using Build5Nines.SharpVector.Id; +using Build5Nines.SharpVector.Preprocessing; +using Build5Nines.SharpVector.Vocabulary; +using Build5Nines.SharpVector.Vectorization; +using Build5Nines.SharpVector.VectorCompare; +using Build5Nines.SharpVector.VectorStore; + +namespace Build5Nines.SharpVector; + +/// +/// Base class for an on-disk vector database. Mirrors MemoryVectorDatabaseBase generic composition +/// while using disk-backed stores for persistence. +/// +public abstract class BasicDiskMemoryVectorDatabaseBase + : VectorDatabaseBase + where TId : notnull + where TVocabularyKey : notnull + where TVocabularyValue : notnull + where TVectorStore : IVectorStoreWithVocabulary + where TVocabularyStore : IVocabularyStore + where TIdGenerator : IIdGenerator, new() + where TTextPreprocessor : ITextPreprocessor, new() + where TVectorizer : IVectorizer, new() + where TVectorComparer : IVectorComparer, new() +{ + protected BasicDiskMemoryVectorDatabaseBase(TVectorStore vectorStore) + : base(vectorStore) + { } +} diff --git a/src/Build5Nines.SharpVector/BasicDiskVectorDatabase.cs b/src/Build5Nines.SharpVector/BasicDiskVectorDatabase.cs new file mode 100644 index 0000000..bea8a64 --- /dev/null +++ b/src/Build5Nines.SharpVector/BasicDiskVectorDatabase.cs @@ -0,0 +1,47 @@ +using Build5Nines.SharpVector.Vocabulary; +using Build5Nines.SharpVector.Id; +using Build5Nines.SharpVector.Preprocessing; +using Build5Nines.SharpVector.Vectorization; +using Build5Nines.SharpVector.VectorCompare; +using Build5Nines.SharpVector.VectorStore; + +namespace Build5Nines.SharpVector; + +/// +/// A basic disk-backed vector database using Bag-of-Words, Cosine similarity, +/// disk-backed vector store and vocabulary store. Uses int IDs and string metadata. +/// +public class BasicDiskVectorDatabase + : BasicDiskMemoryVectorDatabaseBase< + int, + TMetadata, + BasicDiskVectorStore, string, int>, + BasicDiskVocabularyStore, + string, int, + IntIdGenerator, + BasicTextPreprocessor, + BagOfWordsVectorizer, + CosineSimilarityVectorComparer + >, IMemoryVectorDatabase, IVectorDatabase +{ + public BasicDiskVectorDatabase(string rootPath) + : base( + new BasicDiskVectorStore, string, int>( + rootPath, + new BasicDiskVocabularyStore(rootPath) + ) + ) + { } + + [Obsolete("Use DeserializeFromBinaryStreamAsync instead.")] + public override async Task DeserializeFromJsonStreamAsync(Stream stream) + { + await DeserializeFromBinaryStreamAsync(stream); + } + + [Obsolete("Use DeserializeFromBinaryStream instead.")] + public override void DeserializeFromJsonStream(Stream stream) + { + DeserializeFromBinaryStream(stream); + } +} diff --git a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj index 8183a8e..269cf34 100644 --- a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj +++ b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj @@ -9,7 +9,7 @@ Build5Nines.SharpVector https://sharpvector.build5nines.com https://github.com/Build5Nines/SharpVector - 2.1.3 + 2.2.0 Lightweight In-memory Vector Database to embed in any .NET Applications Copyright (c) 2025 Build5Nines LLC README.md diff --git a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs index c1bf17f..6d407d7 100644 --- a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs +++ b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs @@ -15,7 +15,6 @@ namespace Build5Nines.SharpVector; - /// /// Base class for a memory vector database. /// @@ -30,7 +29,7 @@ namespace Build5Nines.SharpVector; /// /// public abstract class MemoryVectorDatabaseBase - : IVectorDatabase + : VectorDatabaseBase where TId : notnull where TVocabularyKey : notnull where TVocabularyValue: notnull @@ -41,378 +40,11 @@ public abstract class MemoryVectorDatabaseBase, new() where TVectorComparer : IVectorComparer, new() { - protected TIdGenerator _idGenerator; - - private TTextPreprocessor _textPreprocessor; - - private TVectorizer _vectorizer; - - private TVectorComparer _vectorComparer; - - /// - /// The Vector Store used to store the text vectors of the database - /// - protected TVectorStore VectorStore { get; private set; } - - public MemoryVectorDatabaseBase(TVectorStore vectorStore) - { - VectorStore = vectorStore; - _idGenerator = new TIdGenerator(); - _textPreprocessor = new TTextPreprocessor(); - _vectorizer = new TVectorizer(); - _vectorComparer = new TVectorComparer(); - } - - /// - /// Get all the Ids for each text the database. - /// - /// - public IEnumerable GetIds() - { - return VectorStore.GetIds(); - } - - /// - /// Adds a new text with Metadata to the database and returns its ID - /// - /// - /// - /// - public TId AddText(TVocabularyKey text, TMetadata? metadata = default(TMetadata)) - { - return AddTextAsync(text, metadata).Result; - } - - /// - /// Adds a new text with Metadata to the database and returns its ID - /// - /// - /// - /// - public async Task AddTextAsync(TVocabularyKey text, TMetadata? metadata = default(TMetadata)) - { - // Perform preprocessing asynchronously - var tokens = await _textPreprocessor.TokenizeAndPreprocessAsync(text); - - // Update the vocabulary store asynchronously - await VectorStore.VocabularyStore.UpdateAsync(tokens); - - // Generate the vector asynchronously - float[] vector = await _vectorizer.GenerateVectorFromTokensAsync(VectorStore.VocabularyStore, tokens); - - // Generate the ID and store the vector text item asynchronously - TId id = _idGenerator.NewId(); - await VectorStore.SetAsync(id, new VectorTextItem(text, metadata, vector)); - - return id; - } - - public async Task> AddTextsAsync(IEnumerable<(TVocabularyKey text, TMetadata? metadata)> items) - { - if (items is null) throw new ArgumentNullException(nameof(items)); - - var ids = new List(); - - foreach(var item in items) - { - TId id = await AddTextAsync(item.text, item.metadata); - ids.Add(id); - } - - return ids; - } - - - /// - /// Retrieves a text and metadata by its ID - /// - /// - /// - /// - public IVectorTextItem GetText(TId id) - { - return VectorStore.Get(id); - } - - /// - /// Deletes a text by its ID - /// - /// - /// - public IVectorTextItem DeleteText(TId id) - { - return VectorStore.Delete(id); - } - - /// - /// Updates a text by its ID - /// - /// - /// - /// - public void UpdateText(TId id, TVocabularyKey text) - { - if (VectorStore.ContainsKey(id)) - { - var tokens = _textPreprocessor.TokenizeAndPreprocess(text); - VectorStore.VocabularyStore.Update(tokens); - float[] vector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, tokens); - var metadata = VectorStore.Get(id).Metadata; - VectorStore.Set(id, new VectorTextItem(text, metadata, vector)); - } - else - { - throw new KeyNotFoundException($"Text with ID {id} not found."); - } - } - - /// - /// Updates the Metadata of a Text by its ID - /// - /// - /// - /// - public void UpdateTextMetadata(TId id, TMetadata metadata) { - if (VectorStore.ContainsKey(id)) - { - var existing = VectorStore.Get(id); - - var item = new VectorTextItem( - existing.Text, - metadata, - existing.Vector - ); - - VectorStore.Set(id, item); - } - else - { - throw new KeyNotFoundException($"Text with ID {id} not found."); - } - } - - /// - /// Updates a Text by its ID with new text and metadata values - /// - /// - /// - /// - public void UpdateTextAndMetadata(TId id, TVocabularyKey text, TMetadata metadata) - { - if (VectorStore.ContainsKey(id)) - { - var tokens = _textPreprocessor.TokenizeAndPreprocess(text); - VectorStore.VocabularyStore.Update(tokens); - float[] vector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, tokens); - VectorStore.Set(id, new VectorTextItem(text, metadata, vector)); - } - else - { - throw new KeyNotFoundException($"Text with ID {id} not found."); - } - } - - /// - /// Performs a vector search to find the top N most similar texts to the given text - /// - /// The query prompt to search by. - /// The highest number of results to show. - /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all. - /// The page index of the search results. Default is 0. - /// The number of search results per page. Default is Null and returns all results. - /// A filter function to apply to the metadata of each result. - /// - public IVectorTextResult Search(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null) - { - Func>? filterToUse = null; - if (filter != null) - { - filterToUse = (metadata) => Task.FromResult(filter(metadata)); - } - return SearchAsync(queryText, threshold, pageIndex, pageCount, filterToUse).Result; - } - - /// - /// Performs an asynchronous search vector search to find the top N most similar texts to the given text - /// - /// The query prompt to search by. - /// The similarity threshold to filter by. - /// The page index of the search results. Default is 0. - /// The number of search results per page. Default is Null and returns all results. - /// A filter function to apply to the metadata of each result. - /// - public async Task> SearchAsync(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null) - { - var similarities = await CalculateVectorComparisonAsync(queryText, threshold, filter); - - similarities = await _vectorComparer.SortAsync(similarities); - - var totalCountFoundInSearch = similarities.Count(); - - IEnumerable> resultsToReturn; - if (pageCount != null && pageCount >= 0 && pageIndex >= 0) { - resultsToReturn = similarities.Skip(pageIndex * pageCount.Value).Take(pageCount.Value); - } else { - // no paging specified, return all results - resultsToReturn = similarities; - } - - return new VectorTextResult(totalCountFoundInSearch, pageIndex, pageCount.HasValue ? pageCount.Value : 1, resultsToReturn); - } - - private async Task>> CalculateVectorComparisonAsync(TVocabularyKey queryText, float? threshold = null, Func>? filter = null) - { - var queryTokens = _textPreprocessor.TokenizeAndPreprocess(queryText); - float[] queryVector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, queryTokens); - - // Method to get the maximum vector length in the database - var desiredLength = VectorStore.VocabularyStore.Count; - - if (VectorStore.Count == 0) - { - throw new InvalidOperationException("The database is empty."); - } - - var results = new ConcurrentBag>(); - await foreach (KeyValuePair> kvp in VectorStore) - { - if (filter == null || await filter(kvp.Value.Metadata)) - { - var item = kvp.Value; - float vectorComparisonValue = await _vectorComparer.CalculateAsync(_vectorizer.NormalizeVector(queryVector, desiredLength), _vectorizer.NormalizeVector(item.Vector, desiredLength)); - - if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue)) - { - var id = kvp.Key; - results.Add( - new VectorTextResultItem(id, item, vectorComparisonValue) - ); - } - } - } - return results; - } - - [Obsolete("Use SerializeToBinaryStreamAsync instead.")] - public virtual async Task SerializeToJsonStreamAsync(Stream stream) - { - await SerializeToBinaryStreamAsync(stream); - } - - /// - /// Serializes the Vector Database to a JSON stream - /// - /// - /// - /// - public virtual async Task SerializeToBinaryStreamAsync(Stream stream) - { - var streamVectorStore = new MemoryStream(); - var streamVocabularyStore = new MemoryStream(); - - var taskVectorStore = VectorStore.SerializeToJsonStreamAsync(streamVectorStore); - var taskVocabularyStore = VectorStore.VocabularyStore.SerializeToJsonStreamAsync(streamVocabularyStore); - - await Task.WhenAll(taskVectorStore, taskVocabularyStore); - - await DatabaseFile.SaveDatabaseToZipArchiveAsync( - stream, - new DatabaseInfo(this.GetType().FullName), - async (archive) => - { - var entryVectorStore = archive.CreateEntry(DatabaseFile.vectorStoreFilename); - using (var entryStream = entryVectorStore.Open()) - { - streamVectorStore.Position = 0; - await streamVectorStore.CopyToAsync(entryStream); - await entryStream.FlushAsync(); - } - - var entryVocabularyStore = archive.CreateEntry(DatabaseFile.vocabularyStoreFilename); - using (var entryStream = entryVocabularyStore.Open()) - { - streamVocabularyStore.Position = 0; - await streamVocabularyStore.CopyToAsync(entryStream); - await entryStream.FlushAsync(); - } - } - ); - await stream.FlushAsync(); - } - - [Obsolete("Use SerializeToBinaryStream instead.")] - public virtual void SerializeToJsonStream(Stream stream) - { - SerializeToBinaryStream(stream); - } - - public virtual void SerializeToBinaryStream(Stream stream) - { - if (stream == null) - { - throw new ArgumentNullException(nameof(stream)); - } - SerializeToBinaryStreamAsync(stream).Wait(); - } - - [Obsolete("Use DeserializeFromBinaryStreamAsync instead.")] - public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) { - await DeserializeFromBinaryStreamAsync(stream); - } - - public virtual async Task DeserializeFromBinaryStreamAsync(Stream stream) - { - await DatabaseFile.LoadDatabaseFromZipArchiveAsync( - stream, - this.GetType().FullName, - async (archive) => - { - await DatabaseFile.LoadVectorStoreAsync(archive, VectorStore); - await DatabaseFile.LoadVocabularyStoreAsync(archive, VectorStore.VocabularyStore); - - // Re-initialize the IdGenerator with the max Id value from the VectorStore if it supports sequential numeric IDs - if (_idGenerator is ISequentialIdGenerator seqIdGen) - { - // Re-seed the sequence only if there are existing IDs - var ids = VectorStore.GetIds(); - if (ids.Any()) - { - seqIdGen.SetMostRecent(ids.Max()!); - } - } - } - ); - } - - - [Obsolete("Use DeserializeFromBinaryStream instead")] - public virtual void DeserializeFromJsonStream(Stream stream) - { - DeserializeFromBinaryStream(stream); - } - - public virtual void DeserializeFromBinaryStream(Stream stream) - { - if (stream == null) - { - throw new ArgumentNullException(nameof(stream)); - } - DeserializeFromBinaryStreamAsync(stream).Wait(); - } - - public IEnumerator> GetEnumerator() - { - return VectorStore.Select(kvp => new VectorTextDatabaseItem(kvp.Key, kvp.Value.Text, kvp.Value.Metadata, kvp.Value.Vector)).GetEnumerator(); - } - - IEnumerator IEnumerable.GetEnumerator() - { - return GetEnumerator(); - } + protected MemoryVectorDatabaseBase(TVectorStore vectorStore) + : base(vectorStore) + { } } - - /// /// Base class for a memory vector database. /// @@ -422,398 +54,13 @@ IEnumerator IEnumerable.GetEnumerator() /// /// public abstract class MemoryVectorDatabaseBase - : IMemoryVectorDatabase, IVectorDatabase + : VectorDatabaseBase, IMemoryVectorDatabase where TId : notnull where TVectorStore : IVectorStore where TIdGenerator : IIdGenerator, new() where TVectorComparer : IVectorComparer, new() { - private TIdGenerator _idGenerator; - - private TVectorComparer _vectorComparer; - - /// - /// The Vector Store used to store the text vectors of the database - /// - protected TVectorStore VectorStore { get; private set; } - - protected IEmbeddingsGenerator EmbeddingsGenerator { get; private set; } - public MemoryVectorDatabaseBase(IEmbeddingsGenerator embeddingsGenerator, TVectorStore vectorStore) - { - EmbeddingsGenerator = embeddingsGenerator; - VectorStore = vectorStore; - _idGenerator = new TIdGenerator(); - _vectorComparer = new TVectorComparer(); - } - - /// - /// Get all the Ids for each text the database. - /// - /// - public IEnumerable GetIds() - { - return VectorStore.GetIds(); - } - - /// - /// Adds a new text with Metadata to the database and returns its ID - /// - /// - /// - /// - public TId AddText(string text, TMetadata? metadata = default(TMetadata)) - { - return AddTextAsync(text, metadata).Result; - } - - /// - /// Adds a new text with Metadata to the database and returns its ID - /// - /// - /// - /// - public async Task AddTextAsync(string text, TMetadata? metadata = default(TMetadata)) - { - // Generate the vector asynchronously - var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text); - - // Generate the ID and store the vector text item asynchronously - TId id = _idGenerator.NewId(); - await VectorStore.SetAsync(id, new VectorTextItem(text, metadata, vector)); - - return id; - } - - /// - /// Adds multiple texts with optional metadata to the database efficiently. - /// If the embeddings generator supports batching, this will generate vectors in a single multi-input call. - /// - /// Collection of (text, metadata) tuples to add. - /// List of generated IDs in the same order as inputs. - public async Task> AddTextsAsync(IEnumerable<(string text, TMetadata? metadata)> items) - { - if (items is null) throw new ArgumentNullException(nameof(items)); - - var list = items.ToList(); - if (list.Count == 0) return Array.Empty(); - - // Try batch embeddings if supported - float[][] vectors; - if (EmbeddingsGenerator is IBatchEmbeddingsGenerator batchGen) - { - var batch = await batchGen.GenerateEmbeddingsAsync(list.Select(i => i.text)); - vectors = batch.Select(v => v.ToArray()).ToArray(); - } - else - { - // Fallback to per-item embedding - vectors = new float[list.Count][]; - for (int i = 0; i < list.Count; i++) - { - vectors[i] = await EmbeddingsGenerator.GenerateEmbeddingsAsync(list[i].text); - } - } - - // Store items and produce IDs - var ids = new List(list.Count); - for (int i = 0; i < list.Count; i++) - { - TId id = _idGenerator.NewId(); - ids.Add(id); - await VectorStore.SetAsync(id, new VectorTextItem(list[i].text, list[i].metadata, vectors[i])); - } - - return ids; - } - - /// - /// Retrieves a text and metadata by its ID - /// - /// - /// - /// - public IVectorTextItem GetText(TId id) - { - return VectorStore.Get(id); - } - - /// - /// Deletes a text by its ID - /// - /// - /// - public IVectorTextItem DeleteText(TId id) - { - return VectorStore.Delete(id); - } - - /// - /// Updates a text by its ID - /// - /// - /// - /// - public async Task UpdateTextAsync(TId id, string text) - { - if (VectorStore.ContainsKey(id)) - { - var existing = VectorStore.Get(id); - var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text); - var metadata = existing.Metadata; - VectorStore.Set(id, new VectorTextItem(text, existing.Metadata, vector)); - } - else - { - throw new KeyNotFoundException($"Text with ID {id} not found."); - } - } - - /// - /// Updates a text by its ID - /// - /// - /// - public void UpdateText(TId id, string text) - { - var task = UpdateTextAsync(id, text); - task.Wait(); - } - - /// - /// Updates the Metadata of a Text by its ID - /// - /// - /// - /// - public void UpdateTextMetadata(TId id, TMetadata metadata) { - if (VectorStore.ContainsKey(id)) - { - var existing = VectorStore.Get(id); - - var item = new VectorTextItem( - existing.Text, - metadata, - existing.Vector - ); - - VectorStore.Set(id, item); - } - else - { - throw new KeyNotFoundException($"Text with ID {id} not found."); - } - } - - /// - /// Updates a Text by its ID with new text and metadata values - /// - /// - /// - /// - public async Task UpdateTextAndMetadataAsync(TId id, string text, TMetadata metadata) - { - if (VectorStore.ContainsKey(id)) - { - var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text); - VectorStore.Set(id, new VectorTextItem(text, metadata, vector)); - } - else - { - throw new KeyNotFoundException($"Text with ID {id} not found."); - } - } - - /// - /// Updates a Text by its ID with new text and metadata values - /// - /// - /// - /// - public void UpdateTextAndMetadata(TId id, string text, TMetadata metadata) - { - var task = UpdateTextAndMetadataAsync(id, text, metadata); - task.Wait(); - } - - /// - /// Performs a vector search to find the top N most similar texts to the given text - /// - /// The query prompt to search by. - /// The highest number of results to show. - /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all. - /// The page index of the search results. Default is 0. - /// The number of search results per page. Default is Null and returns all results. - /// A filter function to apply to the metadata of each result. - /// The search results as an IVectorTextResult object. - public IVectorTextResult Search(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null) - { - Func>? filterToUse = null; - if (filter != null) - { - filterToUse = (metadata) => Task.FromResult(filter(metadata)); - } - return SearchAsync(queryText, threshold, pageIndex, pageCount, filterToUse).Result; - } - - /// - /// Performs an asynchronous search vector search to find the top N most similar texts to the given text - /// - /// The query prompt to search by. - /// The similarity threshold to filter by. - /// The page index of the search results. Default is 0. - /// The number of search results per page. Default is Null and returns all results. - /// A filter function to apply to the metadata of each result. - /// The search results as an IVectorTextResult object. - public async Task> SearchAsync(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null) - { - var similarities = await CalculateVectorComparisonAsync(queryText, threshold, filter); - - similarities = await _vectorComparer.SortAsync(similarities); - - var totalCountFoundInSearch = similarities.Count(); - - IEnumerable> resultsToReturn; - if (pageCount != null && pageCount >= 0 && pageIndex >= 0) { - resultsToReturn = similarities.Skip(pageIndex * pageCount.Value).Take(pageCount.Value); - } else { - // no paging specified, return all results - resultsToReturn = similarities; - } - - return new VectorTextResult(totalCountFoundInSearch, pageIndex, pageCount.HasValue ? pageCount.Value : 1, resultsToReturn); - } - - private async Task>> CalculateVectorComparisonAsync(string queryText, float? threshold = null, Func>? filter = null) - { - var queryVector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(queryText); - - if (VectorStore.Count == 0) - { - throw new InvalidOperationException("The database is empty."); - } - - var results = new ConcurrentBag>(); - await foreach (var kvp in VectorStore) - { - if (filter == null || await filter(kvp.Value.Metadata)) - { - var item = kvp.Value; - - float vectorComparisonValue = await _vectorComparer.CalculateAsync(queryVector, item.Vector); - - if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue)) - { - var id = kvp.Key; - results.Add( - new VectorTextResultItem(id, item, vectorComparisonValue) - ); - } - } - } - return results; - } - - [Obsolete("Use SerializeToBinaryStreamAsync Instead")] - public virtual async Task SerializeToJsonStreamAsync(Stream stream) - { - await SerializeToBinaryStreamAsync(stream); - } - - /// - /// Serializes the Vector Database to a JSON stream - /// - /// - /// - /// - public virtual async Task SerializeToBinaryStreamAsync(Stream stream) - { - var streamVectorStore = new MemoryStream(); - await VectorStore.SerializeToJsonStreamAsync(streamVectorStore); - - await DatabaseFile.SaveDatabaseToZipArchiveAsync( - stream, - new DatabaseInfo(this.GetType().FullName), - async (archive) => - { - var entryVectorStore = archive.CreateEntry(DatabaseFile.vectorStoreFilename); - using (var entryStream = entryVectorStore.Open()) - { - streamVectorStore.Position = 0; - await streamVectorStore.CopyToAsync(entryStream); - await entryStream.FlushAsync(); - } - } - ); - await stream.FlushAsync(); - } - - [Obsolete("Use SerializeToBinaryStream Instead")] - public virtual void SerializeToJsonStream(Stream stream) - { - SerializeToBinaryStream(stream); - } - - public virtual void SerializeToBinaryStream(Stream stream) - { - if (stream == null) - { - throw new ArgumentNullException(nameof(stream)); - } - SerializeToBinaryStreamAsync(stream).Wait(); - } - - [Obsolete("Use DeserializeFromBinaryStreamAsync Instead")] - public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) - { - await DeserializeFromBinaryStreamAsync(stream); - } - - public virtual async Task DeserializeFromBinaryStreamAsync(Stream stream) - { - await DatabaseFile.LoadDatabaseFromZipArchiveAsync( - stream, - this.GetType().FullName, - async (archive) => - { - await DatabaseFile.LoadVectorStoreAsync(archive, VectorStore); - - // Re-initialize the IdGenerator with the max Id value from the VectorStore if it supports sequential numeric IDs - if (_idGenerator is ISequentialIdGenerator seqIdGen) - { - // Re-seed the sequence only if there are existing IDs - var ids = VectorStore.GetIds(); - if (ids.Any()) - { - seqIdGen.SetMostRecent(ids.Max()!); - } - } - } - ); - } - - [Obsolete("Use DeserializeFromBinaryStream Instead")] - public virtual void DeserializeFromJsonStream(Stream stream) - { - DeserializeFromBinaryStream(stream); - } - - public virtual void DeserializeFromBinaryStream(Stream stream) - { - if (stream == null) - { - throw new ArgumentNullException(nameof(stream)); - } - DeserializeFromBinaryStreamAsync(stream).Wait(); - } - - public IEnumerator> GetEnumerator() - { - return VectorStore.Select(kvp => new VectorTextDatabaseItem(kvp.Key, kvp.Value.Text, kvp.Value.Metadata, kvp.Value.Vector)).GetEnumerator(); - } - - System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() - { - return this.GetEnumerator(); - } -} \ No newline at end of file + : base(embeddingsGenerator, vectorStore) + { } +} diff --git a/src/Build5Nines.SharpVector/VectorDatabaseBase.cs b/src/Build5Nines.SharpVector/VectorDatabaseBase.cs new file mode 100644 index 0000000..203cb58 --- /dev/null +++ b/src/Build5Nines.SharpVector/VectorDatabaseBase.cs @@ -0,0 +1,818 @@ +using Build5Nines.SharpVector.Id; +using Build5Nines.SharpVector.Preprocessing; +using Build5Nines.SharpVector.Vocabulary; +using Build5Nines.SharpVector.Vectorization; +using Build5Nines.SharpVector.VectorCompare; +using Build5Nines.SharpVector.VectorStore; +using System.Collections.Concurrent; +using System.IO.Compression; +using System.Runtime.CompilerServices; +using System.Text.Json; +using Build5Nines.SharpVector.Embeddings; +using System.Runtime.ExceptionServices; +using System.Collections; +using System.Linq; + +namespace Build5Nines.SharpVector; + +/// +/// Base class for a vector database. +/// +/// +/// +/// +/// +/// +/// +/// +/// +/// +/// +public abstract class VectorDatabaseBase + : IVectorDatabase + where TId : notnull + where TVocabularyKey : notnull + where TVocabularyValue: notnull + where TVectorStore : IVectorStoreWithVocabulary + where TVocabularyStore : IVocabularyStore + where TIdGenerator : IIdGenerator, new() + where TTextPreprocessor : ITextPreprocessor, new() + where TVectorizer : IVectorizer, new() + where TVectorComparer : IVectorComparer, new() +{ + protected TIdGenerator _idGenerator; + + private TTextPreprocessor _textPreprocessor; + + private TVectorizer _vectorizer; + + private TVectorComparer _vectorComparer; + + /// + /// The Vector Store used to store the text vectors of the database + /// + protected TVectorStore VectorStore { get; private set; } + + public VectorDatabaseBase(TVectorStore vectorStore) + { + VectorStore = vectorStore; + _idGenerator = new TIdGenerator(); + _textPreprocessor = new TTextPreprocessor(); + _vectorizer = new TVectorizer(); + _vectorComparer = new TVectorComparer(); + } + + /// + /// Get all the Ids for each text the database. + /// + /// + public IEnumerable GetIds() + { + return VectorStore.GetIds(); + } + + /// + /// Adds a new text with Metadata to the database and returns its ID + /// + /// + /// + /// + public TId AddText(TVocabularyKey text, TMetadata? metadata = default(TMetadata)) + { + return AddTextAsync(text, metadata).Result; + } + + /// + /// Adds a new text with Metadata to the database and returns its ID + /// + /// + /// + /// + public async Task AddTextAsync(TVocabularyKey text, TMetadata? metadata = default(TMetadata)) + { + // Perform preprocessing asynchronously + var tokens = await _textPreprocessor.TokenizeAndPreprocessAsync(text); + + // Update the vocabulary store asynchronously + await VectorStore.VocabularyStore.UpdateAsync(tokens); + + // Generate the vector asynchronously + float[] vector = await _vectorizer.GenerateVectorFromTokensAsync(VectorStore.VocabularyStore, tokens); + + // Generate the ID and store the vector text item asynchronously + TId id = _idGenerator.NewId(); + await VectorStore.SetAsync(id, new VectorTextItem(text, metadata, vector)); + + return id; + } + + public async Task> AddTextsAsync(IEnumerable<(TVocabularyKey text, TMetadata? metadata)> items) + { + if (items is null) throw new ArgumentNullException(nameof(items)); + + var ids = new List(); + + foreach(var item in items) + { + TId id = await AddTextAsync(item.text, item.metadata); + ids.Add(id); + } + + return ids; + } + + + /// + /// Retrieves a text and metadata by its ID + /// + /// + /// + /// + public IVectorTextItem GetText(TId id) + { + return VectorStore.Get(id); + } + + /// + /// Deletes a text by its ID + /// + /// + /// + public IVectorTextItem DeleteText(TId id) + { + return VectorStore.Delete(id); + } + + /// + /// Updates a text by its ID + /// + /// + /// + /// + public void UpdateText(TId id, TVocabularyKey text) + { + if (VectorStore.ContainsKey(id)) + { + var tokens = _textPreprocessor.TokenizeAndPreprocess(text); + VectorStore.VocabularyStore.Update(tokens); + float[] vector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, tokens); + var metadata = VectorStore.Get(id).Metadata; + VectorStore.Set(id, new VectorTextItem(text, metadata, vector)); + } + else + { + throw new KeyNotFoundException($"Text with ID {id} not found."); + } + } + + /// + /// Updates the Metadata of a Text by its ID + /// + /// + /// + /// + public void UpdateTextMetadata(TId id, TMetadata metadata) { + if (VectorStore.ContainsKey(id)) + { + var existing = VectorStore.Get(id); + + var item = new VectorTextItem( + existing.Text, + metadata, + existing.Vector + ); + + VectorStore.Set(id, item); + } + else + { + throw new KeyNotFoundException($"Text with ID {id} not found."); + } + } + + /// + /// Updates a Text by its ID with new text and metadata values + /// + /// + /// + /// + public void UpdateTextAndMetadata(TId id, TVocabularyKey text, TMetadata metadata) + { + if (VectorStore.ContainsKey(id)) + { + var tokens = _textPreprocessor.TokenizeAndPreprocess(text); + VectorStore.VocabularyStore.Update(tokens); + float[] vector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, tokens); + VectorStore.Set(id, new VectorTextItem(text, metadata, vector)); + } + else + { + throw new KeyNotFoundException($"Text with ID {id} not found."); + } + } + + /// + /// Performs a vector search to find the top N most similar texts to the given text + /// + /// The query prompt to search by. + /// The highest number of results to show. + /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all. + /// The page index of the search results. Default is 0. + /// The number of search results per page. Default is Null and returns all results. + /// A filter function to apply to the metadata of each result. + /// + public IVectorTextResult Search(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null) + { + Func>? filterToUse = null; + if (filter != null) + { + filterToUse = (metadata) => Task.FromResult(filter(metadata)); + } + return SearchAsync(queryText, threshold, pageIndex, pageCount, filterToUse).Result; + } + + /// + /// Performs an asynchronous search vector search to find the top N most similar texts to the given text + /// + /// The query prompt to search by. + /// The similarity threshold to filter by. + /// The page index of the search results. Default is 0. + /// The number of search results per page. Default is Null and returns all results. + /// A filter function to apply to the metadata of each result. + /// + public async Task> SearchAsync(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null) + { + var similarities = await CalculateVectorComparisonAsync(queryText, threshold, filter); + + similarities = await _vectorComparer.SortAsync(similarities); + + var totalCountFoundInSearch = similarities.Count(); + + IEnumerable> resultsToReturn; + if (pageCount != null && pageCount >= 0 && pageIndex >= 0) { + resultsToReturn = similarities.Skip(pageIndex * pageCount.Value).Take(pageCount.Value); + } else { + // no paging specified, return all results + resultsToReturn = similarities; + } + + return new VectorTextResult(totalCountFoundInSearch, pageIndex, pageCount.HasValue ? pageCount.Value : 1, resultsToReturn); + } + + private async Task>> CalculateVectorComparisonAsync(TVocabularyKey queryText, float? threshold = null, Func>? filter = null) + { + var queryTokens = _textPreprocessor.TokenizeAndPreprocess(queryText); + float[] queryVector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, queryTokens); + + // Method to get the maximum vector length in the database + var desiredLength = VectorStore.VocabularyStore.Count; + + if (VectorStore.Count == 0) + { + throw new InvalidOperationException("The database is empty."); + } + + var results = new ConcurrentBag>(); + await foreach (KeyValuePair> kvp in VectorStore) + { + if (filter == null || await filter(kvp.Value.Metadata)) + { + var item = kvp.Value; + float vectorComparisonValue = await _vectorComparer.CalculateAsync(_vectorizer.NormalizeVector(queryVector, desiredLength), _vectorizer.NormalizeVector(item.Vector, desiredLength)); + + if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue)) + { + var id = kvp.Key; + results.Add( + new VectorTextResultItem(id, item, vectorComparisonValue) + ); + } + } + } + return results; + } + + [Obsolete("Use SerializeToBinaryStreamAsync instead.")] + public virtual async Task SerializeToJsonStreamAsync(Stream stream) + { + await SerializeToBinaryStreamAsync(stream); + } + + /// + /// Serializes the Vector Database to a JSON stream + /// + /// + /// + /// + public virtual async Task SerializeToBinaryStreamAsync(Stream stream) + { + var streamVectorStore = new MemoryStream(); + var streamVocabularyStore = new MemoryStream(); + + var taskVectorStore = VectorStore.SerializeToJsonStreamAsync(streamVectorStore); + var taskVocabularyStore = VectorStore.VocabularyStore.SerializeToJsonStreamAsync(streamVocabularyStore); + + await Task.WhenAll(taskVectorStore, taskVocabularyStore); + + await DatabaseFile.SaveDatabaseToZipArchiveAsync( + stream, + new DatabaseInfo(this.GetType().FullName), + async (archive) => + { + var entryVectorStore = archive.CreateEntry(DatabaseFile.vectorStoreFilename); + using (var entryStream = entryVectorStore.Open()) + { + streamVectorStore.Position = 0; + await streamVectorStore.CopyToAsync(entryStream); + await entryStream.FlushAsync(); + } + + var entryVocabularyStore = archive.CreateEntry(DatabaseFile.vocabularyStoreFilename); + using (var entryStream = entryVocabularyStore.Open()) + { + streamVocabularyStore.Position = 0; + await streamVocabularyStore.CopyToAsync(entryStream); + await entryStream.FlushAsync(); + } + } + ); + await stream.FlushAsync(); + } + + [Obsolete("Use SerializeToBinaryStream instead.")] + public virtual void SerializeToJsonStream(Stream stream) + { + SerializeToBinaryStream(stream); + } + + public virtual void SerializeToBinaryStream(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + SerializeToBinaryStreamAsync(stream).Wait(); + } + + [Obsolete("Use DeserializeFromBinaryStreamAsync instead.")] + public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) { + await DeserializeFromBinaryStreamAsync(stream); + } + + public virtual async Task DeserializeFromBinaryStreamAsync(Stream stream) + { + await DatabaseFile.LoadDatabaseFromZipArchiveAsync( + stream, + this.GetType().FullName, + async (archive) => + { + await DatabaseFile.LoadVectorStoreAsync(archive, VectorStore); + await DatabaseFile.LoadVocabularyStoreAsync(archive, VectorStore.VocabularyStore); + + // Re-initialize the IdGenerator with the max Id value from the VectorStore if it supports sequential numeric IDs + if (_idGenerator is ISequentialIdGenerator seqIdGen) + { + // Re-seed the sequence only if there are existing IDs + var ids = VectorStore.GetIds(); + if (ids.Any()) + { + seqIdGen.SetMostRecent(ids.Max()!); + } + } + } + ); + } + + + [Obsolete("Use DeserializeFromBinaryStream instead")] + public virtual void DeserializeFromJsonStream(Stream stream) + { + DeserializeFromBinaryStream(stream); + } + + public virtual void DeserializeFromBinaryStream(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + DeserializeFromBinaryStreamAsync(stream).Wait(); + } + + public IEnumerator> GetEnumerator() + { + return VectorStore.Select(kvp => new VectorTextDatabaseItem(kvp.Key, kvp.Value.Text, kvp.Value.Metadata, kvp.Value.Vector)).GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } +} + + + +/// +/// Base class for a vector database. +/// +/// +/// +/// +/// +/// +public abstract class VectorDatabaseBase + : IVectorDatabase + where TId : notnull + where TVectorStore : IVectorStore + where TIdGenerator : IIdGenerator, new() + where TVectorComparer : IVectorComparer, new() +{ + private TIdGenerator _idGenerator; + + private TVectorComparer _vectorComparer; + + /// + /// The Vector Store used to store the text vectors of the database + /// + protected TVectorStore VectorStore { get; private set; } + + protected IEmbeddingsGenerator EmbeddingsGenerator { get; private set; } + + public VectorDatabaseBase(IEmbeddingsGenerator embeddingsGenerator, TVectorStore vectorStore) + { + EmbeddingsGenerator = embeddingsGenerator; + VectorStore = vectorStore; + _idGenerator = new TIdGenerator(); + _vectorComparer = new TVectorComparer(); + } + + /// + /// Get all the Ids for each text the database. + /// + /// + public IEnumerable GetIds() + { + return VectorStore.GetIds(); + } + + /// + /// Adds a new text with Metadata to the database and returns its ID + /// + /// + /// + /// + public TId AddText(string text, TMetadata? metadata = default(TMetadata)) + { + return AddTextAsync(text, metadata).Result; + } + + /// + /// Adds a new text with Metadata to the database and returns its ID + /// + /// + /// + /// + public async Task AddTextAsync(string text, TMetadata? metadata = default(TMetadata)) + { + // Generate the vector asynchronously + var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text); + + // Generate the ID and store the vector text item asynchronously + TId id = _idGenerator.NewId(); + await VectorStore.SetAsync(id, new VectorTextItem(text, metadata, vector)); + + return id; + } + + /// + /// Adds multiple texts with optional metadata to the database efficiently. + /// If the embeddings generator supports batching, this will generate vectors in a single multi-input call. + /// + /// Collection of (text, metadata) tuples to add. + /// List of generated IDs in the same order as inputs. + public async Task> AddTextsAsync(IEnumerable<(string text, TMetadata? metadata)> items) + { + if (items is null) throw new ArgumentNullException(nameof(items)); + + var list = items.ToList(); + if (list.Count == 0) return Array.Empty(); + + // Try batch embeddings if supported + float[][] vectors; + if (EmbeddingsGenerator is IBatchEmbeddingsGenerator batchGen) + { + var batch = await batchGen.GenerateEmbeddingsAsync(list.Select(i => i.text)); + vectors = batch.Select(v => v.ToArray()).ToArray(); + } + else + { + // Fallback to per-item embedding + vectors = new float[list.Count][]; + for (int i = 0; i < list.Count; i++) + { + vectors[i] = await EmbeddingsGenerator.GenerateEmbeddingsAsync(list[i].text); + } + } + + // Store items and produce IDs + var ids = new List(list.Count); + for (int i = 0; i < list.Count; i++) + { + TId id = _idGenerator.NewId(); + ids.Add(id); + await VectorStore.SetAsync(id, new VectorTextItem(list[i].text, list[i].metadata, vectors[i])); + } + + return ids; + } + + /// + /// Retrieves a text and metadata by its ID + /// + /// + /// + /// + public IVectorTextItem GetText(TId id) + { + return VectorStore.Get(id); + } + + /// + /// Deletes a text by its ID + /// + /// + /// + public IVectorTextItem DeleteText(TId id) + { + return VectorStore.Delete(id); + } + + /// + /// Updates a text by its ID + /// + /// + /// + /// + public async Task UpdateTextAsync(TId id, string text) + { + if (VectorStore.ContainsKey(id)) + { + var existing = VectorStore.Get(id); + var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text); + var metadata = existing.Metadata; + VectorStore.Set(id, new VectorTextItem(text, existing.Metadata, vector)); + } + else + { + throw new KeyNotFoundException($"Text with ID {id} not found."); + } + } + + /// + /// Updates a text by its ID + /// + /// + /// + public void UpdateText(TId id, string text) + { + var task = UpdateTextAsync(id, text); + task.Wait(); + } + + /// + /// Updates the Metadata of a Text by its ID + /// + /// + /// + /// + public void UpdateTextMetadata(TId id, TMetadata metadata) { + if (VectorStore.ContainsKey(id)) + { + var existing = VectorStore.Get(id); + + var item = new VectorTextItem( + existing.Text, + metadata, + existing.Vector + ); + + VectorStore.Set(id, item); + } + else + { + throw new KeyNotFoundException($"Text with ID {id} not found."); + } + } + + /// + /// Updates a Text by its ID with new text and metadata values + /// + /// + /// + /// + public async Task UpdateTextAndMetadataAsync(TId id, string text, TMetadata metadata) + { + if (VectorStore.ContainsKey(id)) + { + var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text); + VectorStore.Set(id, new VectorTextItem(text, metadata, vector)); + } + else + { + throw new KeyNotFoundException($"Text with ID {id} not found."); + } + } + + /// + /// Updates a Text by its ID with new text and metadata values + /// + /// + /// + /// + public void UpdateTextAndMetadata(TId id, string text, TMetadata metadata) + { + var task = UpdateTextAndMetadataAsync(id, text, metadata); + task.Wait(); + } + + /// + /// Performs a vector search to find the top N most similar texts to the given text + /// + /// The query prompt to search by. + /// The highest number of results to show. + /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all. + /// The page index of the search results. Default is 0. + /// The number of search results per page. Default is Null and returns all results. + /// A filter function to apply to the metadata of each result. + /// The search results as an IVectorTextResult object. + public IVectorTextResult Search(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null) + { + Func>? filterToUse = null; + if (filter != null) + { + filterToUse = (metadata) => Task.FromResult(filter(metadata)); + } + return SearchAsync(queryText, threshold, pageIndex, pageCount, filterToUse).Result; + } + + /// + /// Performs an asynchronous search vector search to find the top N most similar texts to the given text + /// + /// The query prompt to search by. + /// The similarity threshold to filter by. + /// The page index of the search results. Default is 0. + /// The number of search results per page. Default is Null and returns all results. + /// A filter function to apply to the metadata of each result. + /// The search results as an IVectorTextResult object. + public async Task> SearchAsync(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null) + { + var similarities = await CalculateVectorComparisonAsync(queryText, threshold, filter); + + similarities = await _vectorComparer.SortAsync(similarities); + + var totalCountFoundInSearch = similarities.Count(); + + IEnumerable> resultsToReturn; + if (pageCount != null && pageCount >= 0 && pageIndex >= 0) { + resultsToReturn = similarities.Skip(pageIndex * pageCount.Value).Take(pageCount.Value); + } else { + // no paging specified, return all results + resultsToReturn = similarities; + } + + return new VectorTextResult(totalCountFoundInSearch, pageIndex, pageCount.HasValue ? pageCount.Value : 1, resultsToReturn); + } + + private async Task>> CalculateVectorComparisonAsync(string queryText, float? threshold = null, Func>? filter = null) + { + var queryVector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(queryText); + + if (VectorStore.Count == 0) + { + throw new InvalidOperationException("The database is empty."); + } + + var results = new ConcurrentBag>(); + await foreach (var kvp in VectorStore) + { + if (filter == null || await filter(kvp.Value.Metadata)) + { + var item = kvp.Value; + + float vectorComparisonValue = await _vectorComparer.CalculateAsync(queryVector, item.Vector); + + if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue)) + { + var id = kvp.Key; + results.Add( + new VectorTextResultItem(id, item, vectorComparisonValue) + ); + } + } + } + return results; + } + + [Obsolete("Use SerializeToBinaryStreamAsync Instead")] + public virtual async Task SerializeToJsonStreamAsync(Stream stream) + { + await SerializeToBinaryStreamAsync(stream); + } + + /// + /// Serializes the Vector Database to a JSON stream + /// + /// + /// + /// + public virtual async Task SerializeToBinaryStreamAsync(Stream stream) + { + var streamVectorStore = new MemoryStream(); + await VectorStore.SerializeToJsonStreamAsync(streamVectorStore); + + await DatabaseFile.SaveDatabaseToZipArchiveAsync( + stream, + new DatabaseInfo(this.GetType().FullName), + async (archive) => + { + var entryVectorStore = archive.CreateEntry(DatabaseFile.vectorStoreFilename); + using (var entryStream = entryVectorStore.Open()) + { + streamVectorStore.Position = 0; + await streamVectorStore.CopyToAsync(entryStream); + await entryStream.FlushAsync(); + } + } + ); + await stream.FlushAsync(); + } + + [Obsolete("Use SerializeToBinaryStream Instead")] + public virtual void SerializeToJsonStream(Stream stream) + { + SerializeToBinaryStream(stream); + } + + public virtual void SerializeToBinaryStream(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + SerializeToBinaryStreamAsync(stream).Wait(); + } + + [Obsolete("Use DeserializeFromBinaryStreamAsync Instead")] + public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) + { + await DeserializeFromBinaryStreamAsync(stream); + } + + public virtual async Task DeserializeFromBinaryStreamAsync(Stream stream) + { + await DatabaseFile.LoadDatabaseFromZipArchiveAsync( + stream, + this.GetType().FullName, + async (archive) => + { + await DatabaseFile.LoadVectorStoreAsync(archive, VectorStore); + + // Re-initialize the IdGenerator with the max Id value from the VectorStore if it supports sequential numeric IDs + if (_idGenerator is ISequentialIdGenerator seqIdGen) + { + // Re-seed the sequence only if there are existing IDs + var ids = VectorStore.GetIds(); + if (ids.Any()) + { + seqIdGen.SetMostRecent(ids.Max()!); + } + } + } + ); + } + + [Obsolete("Use DeserializeFromBinaryStream Instead")] + public virtual void DeserializeFromJsonStream(Stream stream) + { + DeserializeFromBinaryStream(stream); + } + + public virtual void DeserializeFromBinaryStream(Stream stream) + { + if (stream == null) + { + throw new ArgumentNullException(nameof(stream)); + } + DeserializeFromBinaryStreamAsync(stream).Wait(); + } + + public IEnumerator> GetEnumerator() + { + return VectorStore.Select(kvp => new VectorTextDatabaseItem(kvp.Key, kvp.Value.Text, kvp.Value.Metadata, kvp.Value.Vector)).GetEnumerator(); + } + + System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() + { + return this.GetEnumerator(); + } +} \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/VectorStore/BasicDiskVectorStore.cs b/src/Build5Nines.SharpVector/VectorStore/BasicDiskVectorStore.cs new file mode 100644 index 0000000..137d725 --- /dev/null +++ b/src/Build5Nines.SharpVector/VectorStore/BasicDiskVectorStore.cs @@ -0,0 +1,356 @@ +using System.Collections; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Text.Json; +using Build5Nines.SharpVector.VectorStore; +using Build5Nines.SharpVector.Vocabulary; + +namespace Build5Nines.SharpVector.VectorStore; + +/// +/// Disk-backed vector store. Persists items to files and supports streaming reads. +/// Uses the vocabulary key type as the document type to satisfy the IVectorStore contract. +/// +public class BasicDiskVectorStore + : IVectorStoreWithVocabulary, IDisposable + where TId : notnull + where TVocabularyKey : notnull + where TVocabularyValue : notnull + where TVocabularyStore : IVocabularyStore +{ + private readonly string _rootPath; + private readonly string _indexPath; + private readonly string _itemsPath; + private readonly string _walPath; + + private readonly ReaderWriterLockSlim _rwLock = new(LockRecursionPolicy.SupportsRecursion); + + private readonly ConcurrentDictionary _index = new(); + private readonly ConcurrentDictionary> _cache = new(); + + private readonly ConcurrentQueue<(TId id, VectorTextItem? item, bool isDelete)> _pending = new(); + private readonly CancellationTokenSource _cts = new(); + private readonly Task _backgroundFlushTask; + + public TVocabularyStore VocabularyStore { get; } + + public int Count => _cache.Count; + + public BasicDiskVectorStore(string rootPath, TVocabularyStore vocabularyStore) + { + _rootPath = rootPath; + _indexPath = Path.Combine(rootPath, "index.json"); + _itemsPath = Path.Combine(rootPath, "items.bin"); + _walPath = Path.Combine(rootPath, "wal.log"); + Directory.CreateDirectory(rootPath); + VocabularyStore = vocabularyStore; + RecoverFromWalOrIndex(); + _backgroundFlushTask = Task.Run(BackgroundFlusherAsync); + } + + public IEnumerable GetIds() => _cache.Keys; + + public IVectorTextItem Get(TId id) + { + // First check cache for fast read + if (_cache.TryGetValue(id, out var cached)) return cached; + + _rwLock.EnterReadLock(); + try + { + if (!_index.TryGetValue(id, out var offset)) throw new KeyNotFoundException(); + using var fs = File.OpenRead(_itemsPath); + fs.Seek(offset, SeekOrigin.Begin); + var item = ReadItem(fs); + _cache[id] = item; + return item; + } + finally + { + _rwLock.ExitReadLock(); + } + } + + public void Set(TId id, VectorTextItem item) + { + // Write-Ahead Log entry to ensure durability (A in ACID) + AppendWalRecord(id, item, isDelete: false); + + // Update memory state atomically + _rwLock.EnterWriteLock(); + try + { + _cache[id] = item; + _pending.Enqueue((id, item, false)); + } + finally + { + _rwLock.ExitWriteLock(); + } + } + + public async Task SetAsync(TId id, VectorTextItem item) + { + Set(id, item); + await Task.Yield(); + } + + public IVectorTextItem Delete(TId id) + { + var existing = Get(id); + + // WAL for delete + AppendWalRecord(id, item: null, isDelete: true); + + _rwLock.EnterWriteLock(); + try + { + _cache.TryRemove(id, out _); + _pending.Enqueue((id, null, true)); + } + finally + { + _rwLock.ExitWriteLock(); + } + + return existing; + } + + public bool ContainsKey(TId id) => _cache.ContainsKey(id); + + public async Task SerializeToJsonStreamAsync(Stream stream) + { + await JsonSerializer.SerializeAsync(stream, _index); + } + + public async Task DeserializeFromJsonStreamAsync(Stream stream) + { + var loaded = await JsonSerializer.DeserializeAsync>(stream); + if (loaded != null) + { + foreach (var kv in loaded) _index[kv.Key] = kv.Value; + } + } + + public IEnumerator>> GetEnumerator() + { + foreach (var key in _cache.Keys) + { + yield return new KeyValuePair>(key, (VectorTextItem)Get(key)); + } + } + + System.Collections.IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); + + public async IAsyncEnumerator>> GetAsyncEnumerator(CancellationToken cancellationToken = default) + { + foreach (var key in _cache.Keys) + { + yield return new KeyValuePair>(key, (VectorTextItem)Get(key)); + await Task.Yield(); + } + } + + private void PersistIndex() + { + const int maxRetries = 5; + int attempt = 0; + while (true) + { + try + { + using var fs = new FileStream(_indexPath, FileMode.Create, FileAccess.Write, FileShare.Read); + JsonSerializer.Serialize(fs, _index); + fs.Flush(true); // ensure durability of index checkpoint + break; + } + catch (IOException) + { + attempt++; + if (attempt >= maxRetries) throw; + Thread.Sleep(10 * attempt); + } + } + } + + private void LoadIndexIfExists() + { + if (!File.Exists(_indexPath)) return; + using var fs = File.OpenRead(_indexPath); + var loaded = JsonSerializer.Deserialize>(fs); + if (loaded != null) + { + foreach (var kv in loaded) _index[kv.Key] = kv.Value; + } + } + + private void RecoverFromWalOrIndex() + { + // Load index checkpoint if present + LoadIndexIfExists(); + + // Replay WAL to recover any operations after the last checkpoint + if (!File.Exists(_walPath)) return; + using var fs = new FileStream(_walPath, FileMode.Open, FileAccess.Read, FileShare.Read); + using var br = new BinaryReader(fs); + while (fs.Position < fs.Length) + { + bool isDelete = br.ReadBoolean(); + var idJson = br.ReadString(); + var id = JsonSerializer.Deserialize(idJson)!; + if (isDelete) + { + _index.TryRemove(id, out _); + _cache.TryRemove(id, out _); + } + else + { + var itemJson = br.ReadString(); + var item = JsonSerializer.Deserialize>(itemJson)!; + + // Append item to items file to bring storage up-to-date + using var ofs = new FileStream(_itemsPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read); + ofs.Seek(0, SeekOrigin.End); + var offset = ofs.Position; + WriteItem(ofs, item); + ofs.Flush(true); + _index[id] = offset; + _cache[id] = item; + } + } + // After successful replay, truncate WAL (commit) + File.WriteAllBytes(_walPath, Array.Empty()); + PersistIndex(); + } + + private void AppendWalRecord(TId id, VectorTextItem? item, bool isDelete) + { + Directory.CreateDirectory(_rootPath); + using var fs = new FileStream(_walPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read); + fs.Seek(0, SeekOrigin.End); + using var bw = new BinaryWriter(fs); + bw.Write(isDelete); + bw.Write(JsonSerializer.Serialize(id)); + if (!isDelete && item != null) + { + bw.Write(JsonSerializer.Serialize(item)); + } + bw.Flush(); + fs.Flush(true); // fsync for WAL to guarantee durability + } + + private async Task BackgroundFlusherAsync() + { + var token = _cts.Token; + while (!token.IsCancellationRequested) + { + try + { + // Batch flush pending operations + if (_pending.IsEmpty) + { + await Task.Delay(25, token); + continue; + } + + using var itemsFs = new FileStream(_itemsPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read); + while (_pending.TryDequeue(out var op)) + { + if (op.isDelete) + { + _index.TryRemove(op.id, out _); + } + else if (op.item is not null) + { + itemsFs.Seek(0, SeekOrigin.End); + var offset = itemsFs.Position; + WriteItem(itemsFs, op.item); + _index[op.id] = offset; + } + } + itemsFs.Flush(true); + PersistIndex(); + + // After index checkpoint, truncate WAL safely + File.WriteAllBytes(_walPath, Array.Empty()); + } + catch (OperationCanceledException) + { + // normal shutdown + } + catch + { + // best-effort: wait and retry + await Task.Delay(100, token); + } + } + } + + public void Dispose() + { + try + { + _cts.Cancel(); + _backgroundFlushTask.Wait(1500); + } + catch { } + finally + { + // Attempt a final flush of pending operations synchronously + try + { + using var itemsFs = new FileStream(_itemsPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read); + while (_pending.TryDequeue(out var op)) + { + if (op.isDelete) + { + _index.TryRemove(op.id, out _); + } + else if (op.item is not null) + { + itemsFs.Seek(0, SeekOrigin.End); + var offset = itemsFs.Position; + WriteItem(itemsFs, op.item); + _index[op.id] = offset; + } + } + itemsFs.Flush(true); + PersistIndex(); + File.WriteAllBytes(_walPath, Array.Empty()); + } + catch { } + + _cts.Dispose(); + _rwLock.Dispose(); + GC.SuppressFinalize(this); + } + } + + private static void WriteItem(FileStream fs, VectorTextItem item) + { + using var bw = new BinaryWriter(fs, System.Text.Encoding.UTF8, leaveOpen: true); + var json = JsonSerializer.Serialize(item); + var bytes = System.Text.Encoding.UTF8.GetBytes(json); + bw.Write(bytes.Length); + bw.Write(bytes); + } + + private static VectorTextItem ReadItem(FileStream fs) + { + using var br = new BinaryReader(fs, System.Text.Encoding.UTF8, leaveOpen: true); + int len = br.ReadInt32(); + var bytes = br.ReadBytes(len); + var json = System.Text.Encoding.UTF8.GetString(bytes); + var item = JsonSerializer.Deserialize>(json)!; + return item; + } +} + +public class BasicDiskVectorStore + : BasicDiskVectorStore, string, int> + where TId : notnull +{ + public BasicDiskVectorStore(string rootPath, IVocabularyStore vocabularyStore) + : base(rootPath, vocabularyStore) + { } +} diff --git a/src/Build5Nines.SharpVector/Vocabulary/BasicDiskVocabularyStore.cs b/src/Build5Nines.SharpVector/Vocabulary/BasicDiskVocabularyStore.cs new file mode 100644 index 0000000..2b1c6cd --- /dev/null +++ b/src/Build5Nines.SharpVector/Vocabulary/BasicDiskVocabularyStore.cs @@ -0,0 +1,223 @@ +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Text.Json; +using Build5Nines.SharpVector.Vocabulary; + +namespace Build5Nines.SharpVector.Vocabulary; + +/// +/// Disk-backed vocabulary store that persists token->index mapping. +/// +public class BasicDiskVocabularyStore : IVocabularyStore, IDisposable + where TKey : notnull +{ + private readonly string _rootPath; + private readonly string _indexPath; + private readonly string _walPath; + + private readonly ReaderWriterLockSlim _rwLock = new(LockRecursionPolicy.SupportsRecursion); + + private ConcurrentDictionary _vocab = new(); + private ConcurrentDictionary _cache = new(); + private readonly ConcurrentQueue> _pending = new(); + private readonly CancellationTokenSource _cts = new(); + private readonly Task _backgroundFlushTask; + + public int Count => _cache.Count; + + public BasicDiskVocabularyStore(string rootPath) + { + _rootPath = rootPath; + Directory.CreateDirectory(rootPath); + _indexPath = Path.Combine(rootPath, "vocabulary.json"); + _walPath = Path.Combine(rootPath, "vocabulary.wal"); + RecoverFromWalOrIndex(); + _backgroundFlushTask = Task.Run(BackgroundFlusherAsync); + } + + public void Update(IEnumerable tokens) + { + var tokenList = tokens as IList ?? tokens.ToList(); + AppendWalRecord(tokenList); + + _rwLock.EnterWriteLock(); + try + { + foreach (var token in tokenList) + { + if (!_cache.ContainsKey(token)) + { + _cache[token] = _cache.Count; + } + } + _pending.Enqueue(tokenList); + } + finally + { + _rwLock.ExitWriteLock(); + } + } + + public async Task UpdateAsync(IEnumerable tokens) + { + await Task.Run(() => Update(tokens)); + } + + public bool TryGetValue(TKey token, out int index) => _cache.TryGetValue(token, out index); + + public async Task SerializeToJsonStreamAsync(Stream stream) + { + await JsonSerializer.SerializeAsync(stream, _vocab); + } + + public async Task DeserializeFromJsonStreamAsync(Stream stream) + { + var loaded = await JsonSerializer.DeserializeAsync>(stream); + if (loaded != null) + { + _vocab = loaded; + } + } + + private void Persist() + { + const int maxRetries = 5; + int attempt = 0; + while (true) + { + try + { + using var fs = new FileStream(_indexPath, FileMode.Create, FileAccess.Write, FileShare.Read); + JsonSerializer.Serialize(fs, _vocab); + fs.Flush(true); + break; + } + catch (IOException) + { + attempt++; + if (attempt >= maxRetries) throw; + Thread.Sleep(10 * attempt); + } + } + } + + private void LoadIfExists() + { + if (!File.Exists(_indexPath)) return; + if (new FileInfo(_indexPath).Length == 0) return; + using var fs = File.OpenRead(_indexPath); + var loaded = JsonSerializer.Deserialize>(fs); + if (loaded != null) + { + _vocab = loaded; + _cache = new ConcurrentDictionary(loaded); + } + } + + private void RecoverFromWalOrIndex() + { + LoadIfExists(); + if (!File.Exists(_walPath)) return; + using var fs = new FileStream(_walPath, FileMode.Open, FileAccess.Read, FileShare.Read); + using var br = new BinaryReader(fs); + while (fs.Position < fs.Length) + { + int count = br.ReadInt32(); + for (int i = 0; i < count; i++) + { + var tokenJson = br.ReadString(); + var token = JsonSerializer.Deserialize(tokenJson)!; + if (!_vocab.ContainsKey(token)) + { + var idx = _vocab.Count; + _vocab[token] = idx; + _cache[token] = idx; + } + } + } + File.WriteAllBytes(_walPath, Array.Empty()); + Persist(); + } + + private void AppendWalRecord(IList tokens) + { + Directory.CreateDirectory(_rootPath); + using var fs = new FileStream(_walPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read); + fs.Seek(0, SeekOrigin.End); + using var bw = new BinaryWriter(fs); + bw.Write(tokens.Count); + foreach (var token in tokens) + { + bw.Write(JsonSerializer.Serialize(token)); + } + bw.Flush(); + fs.Flush(true); + } + + private async Task BackgroundFlusherAsync() + { + var token = _cts.Token; + while (!token.IsCancellationRequested) + { + try + { + if (_pending.IsEmpty) + { + await Task.Delay(25, token); + continue; + } + + var batch = new List(); + while (_pending.TryDequeue(out var tks)) + { + batch.AddRange(tks); + } + + // Apply to persistent vocab + foreach (var tk in batch) + { + if (!_vocab.ContainsKey(tk)) + { + _vocab[tk] = _vocab.Count; + } + } + + Persist(); + File.WriteAllBytes(_walPath, Array.Empty()); + } + catch (OperationCanceledException) { } + catch { await Task.Delay(100, token); } + } + } + + public void Dispose() + { + try + { + _cts.Cancel(); + _backgroundFlushTask.Wait(1500); + } + catch { } + finally + { + try + { + var batch = new List(); + while (_pending.TryDequeue(out var tks)) batch.AddRange(tks); + foreach (var tk in batch) + { + if (!_vocab.ContainsKey(tk)) + { + _vocab[tk] = _vocab.Count; + } + } + Persist(); + File.WriteAllBytes(_walPath, Array.Empty()); + } + catch { } + _cts.Dispose(); + _rwLock.Dispose(); + GC.SuppressFinalize(this); + } + } +} diff --git a/src/SharpVectorPerformance/DiskVectorDatabasePerformance.cs b/src/SharpVectorPerformance/DiskVectorDatabasePerformance.cs new file mode 100644 index 0000000..ad3b474 --- /dev/null +++ b/src/SharpVectorPerformance/DiskVectorDatabasePerformance.cs @@ -0,0 +1,73 @@ +namespace SharpVectorPerformance; + +using System.Diagnostics; +using Build5Nines.SharpVector; +using Build5Nines.SharpVector.Id; +using Build5Nines.SharpVector.Preprocessing; +using Build5Nines.SharpVector.VectorCompare; +using Build5Nines.SharpVector.Vectorization; +using Build5Nines.SharpVector.VectorStore; +using Build5Nines.SharpVector.Vocabulary; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; + +[MemoryDiagnoser] +public class DiskVectorDatabasePerformance +{ + private BasicDiskVectorDatabase? _db; + private string _rootPath = Path.Combine(Path.GetTempPath(), "SharpVectorPerf", Guid.NewGuid().ToString("N")); + + [GlobalSetup] + public void Setup() + { + Directory.CreateDirectory(_rootPath); + _db = new BasicDiskVectorDatabase(_rootPath); + } + + [GlobalCleanup] + public void Cleanup() + { + try { if (Directory.Exists(_rootPath)) Directory.Delete(_rootPath, recursive: true); } catch { } + } + + [Params(25)] + public int ItemCount; + + [Benchmark] + public async Task AddTexts() + { + var indices = Enumerable.Range(0, ItemCount); + await Parallel.ForEachAsync(indices, async (i, ct) => + { + var text = $"Sample text {i} fox {Random.Shared.Next(0, 100)}"; + await _db!.AddTextAsync(text, "meta"); + }); + } + + [Benchmark] + public async Task Search() + { + // Ensure some data + if (!_db!.GetIds().Any()) + { + var indices = Enumerable.Range(0, 500); + await Parallel.ForEachAsync(indices, async (i, ct) => + { + await _db.AddTextAsync($"quick brown fox {i}", null); + }); + } + var results = await _db.SearchAsync("quick fox"); + // Touch results to avoid dead-code elimination + _ = results.Texts.Take(10).Count(); + } + + [Benchmark] + public void DeleteIds() + { + var ids = _db!.GetIds().Take(Math.Min(50, _db.GetIds().Count())).ToList(); + foreach (var id in ids) + { + _db.DeleteText(id); + } + } +} \ No newline at end of file diff --git a/src/SharpVectorPerformance/Program.cs b/src/SharpVectorPerformance/Program.cs index 375e8d0..281466a 100644 --- a/src/SharpVectorPerformance/Program.cs +++ b/src/SharpVectorPerformance/Program.cs @@ -8,5 +8,6 @@ public class Program public static void Main(string[] args) { BenchmarkRunner.Run(); + BenchmarkRunner.Run(); } } \ No newline at end of file diff --git a/src/SharpVectorTest/DiskVectorDatabaseTests.cs b/src/SharpVectorTest/DiskVectorDatabaseTests.cs new file mode 100644 index 0000000..458831f --- /dev/null +++ b/src/SharpVectorTest/DiskVectorDatabaseTests.cs @@ -0,0 +1,71 @@ +namespace SharpVectorTest; + +using System; +using System.IO; +using System.Linq; +using System.Threading.Tasks; +using Build5Nines.SharpVector; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +[TestClass] +public class DiskVectorDatabaseTests +{ + private static string CreateTempDir() + { + var dir = Path.Combine(Path.GetTempPath(), "SharpVectorTests", Guid.NewGuid().ToString("N")); + Directory.CreateDirectory(dir); + return dir; + } + + [TestMethod] + public async Task AddAndGetText_PersistsToDisk() + { + var root = CreateTempDir(); + var db = new BasicDiskVectorDatabase(root); + + var id = await db.AddTextAsync("hello world", "meta1"); + var item = db.GetText(id); + Assert.AreEqual("hello world", item.Text); + Assert.AreEqual("meta1", item.Metadata); + + // Recreate DB and ensure data is still there + var db2 = new BasicDiskVectorDatabase(root); + var item2 = db2.GetText(id); + Assert.AreEqual("hello world", item2.Text); + Assert.AreEqual("meta1", item2.Metadata); + } + + [TestMethod] + public async Task Search_ReturnsSimilarResults() + { + var root = CreateTempDir(); + var db = new BasicDiskVectorDatabase(root); + + await db.AddTextAsync("The quick brown fox", "a"); + await db.AddTextAsync("Jumps over the lazy dog", "b"); + await db.AddTextAsync("An unrelated sentence", "c"); + + var results = await db.SearchAsync("quick fox", threshold: null, pageIndex: 0, pageCount: null); + Assert.IsTrue(results.Texts.Any()); + Assert.IsTrue(results.Texts.Any(r => r.Text.Contains("quick", StringComparison.OrdinalIgnoreCase))); + } + + [TestMethod] + public async Task Delete_RemovesFromIndexButKeepsFile() + { + var root = CreateTempDir(); + var db = new BasicDiskVectorDatabase(root); + var id = await db.AddTextAsync("to be deleted", "m"); + var existing = db.GetText(id); + Assert.AreEqual("to be deleted", existing.Text); + + db.DeleteText(id); + Assert.IsFalse(db.GetIds().Contains(id)); + Assert.ThrowsException(() => db.GetText(id)); + + // Reopen and ensure deletion persists + var db2 = new BasicDiskVectorDatabase(root); + Assert.IsFalse(db2.GetIds().Contains(id)); + Assert.ThrowsException(() => db2.GetText(id)); + } +} \ No newline at end of file