diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce15cf7..81e87a9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## v2.2.0
+
+Add:
+
+- Added `BasicDiskVectorDatabase` to provide a basic vector database that automatically persists to disk, along with the associated `BasicDiskVectorStore` and `BasicDiskVocabularyStore`.
+
## v2.1.3
Add:
diff --git a/docs/docs/persistence/index.md b/docs/docs/persistence/index.md
index 55559ac..cf5c3c7 100644
--- a/docs/docs/persistence/index.md
+++ b/docs/docs/persistence/index.md
@@ -5,6 +5,8 @@ title: Data Persistence
The `Build5Nines.SharpVector` library provides easy-to-use methods for saving a memory-based vector database to a file or stream and loading it again later. This is particularly useful for caching indexed content between runs, deploying pre-built vector stores, or shipping databases with your application.
+---
+
## :material-file: File Persistence
`Build5Nines.SharpVector` supports persisting the vector database to a file.
@@ -51,6 +53,8 @@ vdb.LoadFromFile(filePath);
await vdb.LoadFromFileAsync(filePath);
```
+---
+
## :material-file-move: Persist to Stream
The underlying methods used by `SaveToFile` and `LoadFromFile` methods for serializing the vector database to a `Stream` are available to use directly. This provides support for reading/writing to `MemoryStream` (or other streams) if the vector database needs to be persisted to something other than the local file system.
@@ -92,3 +96,30 @@ vdb.DeserializeFromBinaryStream(stream);
// deserialize asynchronously from JSON stream
await vdb.DeserializeFromBinaryStreamAsync(stream);
```
+
+---
+
+## :material-file-database: BasicDiskVectorDatabase
+
+The `BasicDiskVectorDatabase` provides a basic vector database implementation that automatically stores the vector store and vocabulary store to disk. Its implementation of vectorization is the same as the `BasicMemoryVectorDatabase`, but with the modification that it automatically persists the database to disk in the background to the specified folder path.
+
+Here's a basic example of using `BasicDiskVectorDatabase`:
+
+```csharp
+// specify the folder where to persist the database data on disk
+var vdb = new BasicDiskVectorDatabase("C:/data/content-db");
+foreach (var doc in documents)
+{
+ vdb.AddText(doc.Id, doc.Text);
+}
+
+var results = vdb.Search("some text");
+
+```
+
+### Tips
+
+- Prefer absolute paths for the storage folder in production services.
+- Place the folder on fast storage (SSD) for best indexing/query performance.
+- Avoid sharing the same folder across multiple processes concurrently.
+- Back up the folder regularly to preserve your vector store and vocabulary.
diff --git a/src/Build5Nines.SharpVector/BasicDiskMemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector/BasicDiskMemoryVectorDatabaseBase.cs
new file mode 100644
index 0000000..79d2d4e
--- /dev/null
+++ b/src/Build5Nines.SharpVector/BasicDiskMemoryVectorDatabaseBase.cs
@@ -0,0 +1,29 @@
+using Build5Nines.SharpVector.Id;
+using Build5Nines.SharpVector.Preprocessing;
+using Build5Nines.SharpVector.Vocabulary;
+using Build5Nines.SharpVector.Vectorization;
+using Build5Nines.SharpVector.VectorCompare;
+using Build5Nines.SharpVector.VectorStore;
+
+namespace Build5Nines.SharpVector;
+
+///
+/// Base class for an on-disk vector database. Mirrors MemoryVectorDatabaseBase generic composition
+/// while using disk-backed stores for persistence.
+///
+public abstract class BasicDiskMemoryVectorDatabaseBase
+ : VectorDatabaseBase
+ where TId : notnull
+ where TVocabularyKey : notnull
+ where TVocabularyValue : notnull
+ where TVectorStore : IVectorStoreWithVocabulary
+ where TVocabularyStore : IVocabularyStore
+ where TIdGenerator : IIdGenerator, new()
+ where TTextPreprocessor : ITextPreprocessor, new()
+ where TVectorizer : IVectorizer, new()
+ where TVectorComparer : IVectorComparer, new()
+{
+ protected BasicDiskMemoryVectorDatabaseBase(TVectorStore vectorStore)
+ : base(vectorStore)
+ { }
+}
diff --git a/src/Build5Nines.SharpVector/BasicDiskVectorDatabase.cs b/src/Build5Nines.SharpVector/BasicDiskVectorDatabase.cs
new file mode 100644
index 0000000..bea8a64
--- /dev/null
+++ b/src/Build5Nines.SharpVector/BasicDiskVectorDatabase.cs
@@ -0,0 +1,47 @@
+using Build5Nines.SharpVector.Vocabulary;
+using Build5Nines.SharpVector.Id;
+using Build5Nines.SharpVector.Preprocessing;
+using Build5Nines.SharpVector.Vectorization;
+using Build5Nines.SharpVector.VectorCompare;
+using Build5Nines.SharpVector.VectorStore;
+
+namespace Build5Nines.SharpVector;
+
+///
+/// A basic disk-backed vector database using Bag-of-Words, Cosine similarity,
+/// disk-backed vector store and vocabulary store. Uses int IDs and string metadata.
+///
+public class BasicDiskVectorDatabase
+ : BasicDiskMemoryVectorDatabaseBase<
+ int,
+ TMetadata,
+ BasicDiskVectorStore, string, int>,
+ BasicDiskVocabularyStore,
+ string, int,
+ IntIdGenerator,
+ BasicTextPreprocessor,
+ BagOfWordsVectorizer,
+ CosineSimilarityVectorComparer
+ >, IMemoryVectorDatabase, IVectorDatabase
+{
+ public BasicDiskVectorDatabase(string rootPath)
+ : base(
+ new BasicDiskVectorStore, string, int>(
+ rootPath,
+ new BasicDiskVocabularyStore(rootPath)
+ )
+ )
+ { }
+
+ [Obsolete("Use DeserializeFromBinaryStreamAsync instead.")]
+ public override async Task DeserializeFromJsonStreamAsync(Stream stream)
+ {
+ await DeserializeFromBinaryStreamAsync(stream);
+ }
+
+ [Obsolete("Use DeserializeFromBinaryStream instead.")]
+ public override void DeserializeFromJsonStream(Stream stream)
+ {
+ DeserializeFromBinaryStream(stream);
+ }
+}
diff --git a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj
index 8183a8e..269cf34 100644
--- a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj
+++ b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj
@@ -9,7 +9,7 @@
Build5Nines.SharpVector
https://sharpvector.build5nines.com
https://github.com/Build5Nines/SharpVector
- 2.1.3
+ 2.2.0
Lightweight In-memory Vector Database to embed in any .NET Applications
Copyright (c) 2025 Build5Nines LLC
README.md
diff --git a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs
index c1bf17f..6d407d7 100644
--- a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs
+++ b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs
@@ -15,7 +15,6 @@
namespace Build5Nines.SharpVector;
-
///
/// Base class for a memory vector database.
///
@@ -30,7 +29,7 @@ namespace Build5Nines.SharpVector;
///
///
public abstract class MemoryVectorDatabaseBase
- : IVectorDatabase
+ : VectorDatabaseBase
where TId : notnull
where TVocabularyKey : notnull
where TVocabularyValue: notnull
@@ -41,378 +40,11 @@ public abstract class MemoryVectorDatabaseBase, new()
where TVectorComparer : IVectorComparer, new()
{
- protected TIdGenerator _idGenerator;
-
- private TTextPreprocessor _textPreprocessor;
-
- private TVectorizer _vectorizer;
-
- private TVectorComparer _vectorComparer;
-
- ///
- /// The Vector Store used to store the text vectors of the database
- ///
- protected TVectorStore VectorStore { get; private set; }
-
- public MemoryVectorDatabaseBase(TVectorStore vectorStore)
- {
- VectorStore = vectorStore;
- _idGenerator = new TIdGenerator();
- _textPreprocessor = new TTextPreprocessor();
- _vectorizer = new TVectorizer();
- _vectorComparer = new TVectorComparer();
- }
-
- ///
- /// Get all the Ids for each text the database.
- ///
- ///
- public IEnumerable GetIds()
- {
- return VectorStore.GetIds();
- }
-
- ///
- /// Adds a new text with Metadata to the database and returns its ID
- ///
- ///
- ///
- ///
- public TId AddText(TVocabularyKey text, TMetadata? metadata = default(TMetadata))
- {
- return AddTextAsync(text, metadata).Result;
- }
-
- ///
- /// Adds a new text with Metadata to the database and returns its ID
- ///
- ///
- ///
- ///
- public async Task AddTextAsync(TVocabularyKey text, TMetadata? metadata = default(TMetadata))
- {
- // Perform preprocessing asynchronously
- var tokens = await _textPreprocessor.TokenizeAndPreprocessAsync(text);
-
- // Update the vocabulary store asynchronously
- await VectorStore.VocabularyStore.UpdateAsync(tokens);
-
- // Generate the vector asynchronously
- float[] vector = await _vectorizer.GenerateVectorFromTokensAsync(VectorStore.VocabularyStore, tokens);
-
- // Generate the ID and store the vector text item asynchronously
- TId id = _idGenerator.NewId();
- await VectorStore.SetAsync(id, new VectorTextItem(text, metadata, vector));
-
- return id;
- }
-
- public async Task> AddTextsAsync(IEnumerable<(TVocabularyKey text, TMetadata? metadata)> items)
- {
- if (items is null) throw new ArgumentNullException(nameof(items));
-
- var ids = new List();
-
- foreach(var item in items)
- {
- TId id = await AddTextAsync(item.text, item.metadata);
- ids.Add(id);
- }
-
- return ids;
- }
-
-
- ///
- /// Retrieves a text and metadata by its ID
- ///
- ///
- ///
- ///
- public IVectorTextItem GetText(TId id)
- {
- return VectorStore.Get(id);
- }
-
- ///
- /// Deletes a text by its ID
- ///
- ///
- ///
- public IVectorTextItem DeleteText(TId id)
- {
- return VectorStore.Delete(id);
- }
-
- ///
- /// Updates a text by its ID
- ///
- ///
- ///
- ///
- public void UpdateText(TId id, TVocabularyKey text)
- {
- if (VectorStore.ContainsKey(id))
- {
- var tokens = _textPreprocessor.TokenizeAndPreprocess(text);
- VectorStore.VocabularyStore.Update(tokens);
- float[] vector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, tokens);
- var metadata = VectorStore.Get(id).Metadata;
- VectorStore.Set(id, new VectorTextItem(text, metadata, vector));
- }
- else
- {
- throw new KeyNotFoundException($"Text with ID {id} not found.");
- }
- }
-
- ///
- /// Updates the Metadata of a Text by its ID
- ///
- ///
- ///
- ///
- public void UpdateTextMetadata(TId id, TMetadata metadata) {
- if (VectorStore.ContainsKey(id))
- {
- var existing = VectorStore.Get(id);
-
- var item = new VectorTextItem(
- existing.Text,
- metadata,
- existing.Vector
- );
-
- VectorStore.Set(id, item);
- }
- else
- {
- throw new KeyNotFoundException($"Text with ID {id} not found.");
- }
- }
-
- ///
- /// Updates a Text by its ID with new text and metadata values
- ///
- ///
- ///
- ///
- public void UpdateTextAndMetadata(TId id, TVocabularyKey text, TMetadata metadata)
- {
- if (VectorStore.ContainsKey(id))
- {
- var tokens = _textPreprocessor.TokenizeAndPreprocess(text);
- VectorStore.VocabularyStore.Update(tokens);
- float[] vector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, tokens);
- VectorStore.Set(id, new VectorTextItem(text, metadata, vector));
- }
- else
- {
- throw new KeyNotFoundException($"Text with ID {id} not found.");
- }
- }
-
- ///
- /// Performs a vector search to find the top N most similar texts to the given text
- ///
- /// The query prompt to search by.
- /// The highest number of results to show.
- /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all.
- /// The page index of the search results. Default is 0.
- /// The number of search results per page. Default is Null and returns all results.
- /// A filter function to apply to the metadata of each result.
- ///
- public IVectorTextResult Search(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null)
- {
- Func>? filterToUse = null;
- if (filter != null)
- {
- filterToUse = (metadata) => Task.FromResult(filter(metadata));
- }
- return SearchAsync(queryText, threshold, pageIndex, pageCount, filterToUse).Result;
- }
-
- ///
- /// Performs an asynchronous search vector search to find the top N most similar texts to the given text
- ///
- /// The query prompt to search by.
- /// The similarity threshold to filter by.
- /// The page index of the search results. Default is 0.
- /// The number of search results per page. Default is Null and returns all results.
- /// A filter function to apply to the metadata of each result.
- ///
- public async Task> SearchAsync(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null)
- {
- var similarities = await CalculateVectorComparisonAsync(queryText, threshold, filter);
-
- similarities = await _vectorComparer.SortAsync(similarities);
-
- var totalCountFoundInSearch = similarities.Count();
-
- IEnumerable> resultsToReturn;
- if (pageCount != null && pageCount >= 0 && pageIndex >= 0) {
- resultsToReturn = similarities.Skip(pageIndex * pageCount.Value).Take(pageCount.Value);
- } else {
- // no paging specified, return all results
- resultsToReturn = similarities;
- }
-
- return new VectorTextResult(totalCountFoundInSearch, pageIndex, pageCount.HasValue ? pageCount.Value : 1, resultsToReturn);
- }
-
- private async Task>> CalculateVectorComparisonAsync(TVocabularyKey queryText, float? threshold = null, Func>? filter = null)
- {
- var queryTokens = _textPreprocessor.TokenizeAndPreprocess(queryText);
- float[] queryVector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, queryTokens);
-
- // Method to get the maximum vector length in the database
- var desiredLength = VectorStore.VocabularyStore.Count;
-
- if (VectorStore.Count == 0)
- {
- throw new InvalidOperationException("The database is empty.");
- }
-
- var results = new ConcurrentBag>();
- await foreach (KeyValuePair> kvp in VectorStore)
- {
- if (filter == null || await filter(kvp.Value.Metadata))
- {
- var item = kvp.Value;
- float vectorComparisonValue = await _vectorComparer.CalculateAsync(_vectorizer.NormalizeVector(queryVector, desiredLength), _vectorizer.NormalizeVector(item.Vector, desiredLength));
-
- if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue))
- {
- var id = kvp.Key;
- results.Add(
- new VectorTextResultItem(id, item, vectorComparisonValue)
- );
- }
- }
- }
- return results;
- }
-
- [Obsolete("Use SerializeToBinaryStreamAsync instead.")]
- public virtual async Task SerializeToJsonStreamAsync(Stream stream)
- {
- await SerializeToBinaryStreamAsync(stream);
- }
-
- ///
- /// Serializes the Vector Database to a JSON stream
- ///
- ///
- ///
- ///
- public virtual async Task SerializeToBinaryStreamAsync(Stream stream)
- {
- var streamVectorStore = new MemoryStream();
- var streamVocabularyStore = new MemoryStream();
-
- var taskVectorStore = VectorStore.SerializeToJsonStreamAsync(streamVectorStore);
- var taskVocabularyStore = VectorStore.VocabularyStore.SerializeToJsonStreamAsync(streamVocabularyStore);
-
- await Task.WhenAll(taskVectorStore, taskVocabularyStore);
-
- await DatabaseFile.SaveDatabaseToZipArchiveAsync(
- stream,
- new DatabaseInfo(this.GetType().FullName),
- async (archive) =>
- {
- var entryVectorStore = archive.CreateEntry(DatabaseFile.vectorStoreFilename);
- using (var entryStream = entryVectorStore.Open())
- {
- streamVectorStore.Position = 0;
- await streamVectorStore.CopyToAsync(entryStream);
- await entryStream.FlushAsync();
- }
-
- var entryVocabularyStore = archive.CreateEntry(DatabaseFile.vocabularyStoreFilename);
- using (var entryStream = entryVocabularyStore.Open())
- {
- streamVocabularyStore.Position = 0;
- await streamVocabularyStore.CopyToAsync(entryStream);
- await entryStream.FlushAsync();
- }
- }
- );
- await stream.FlushAsync();
- }
-
- [Obsolete("Use SerializeToBinaryStream instead.")]
- public virtual void SerializeToJsonStream(Stream stream)
- {
- SerializeToBinaryStream(stream);
- }
-
- public virtual void SerializeToBinaryStream(Stream stream)
- {
- if (stream == null)
- {
- throw new ArgumentNullException(nameof(stream));
- }
- SerializeToBinaryStreamAsync(stream).Wait();
- }
-
- [Obsolete("Use DeserializeFromBinaryStreamAsync instead.")]
- public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) {
- await DeserializeFromBinaryStreamAsync(stream);
- }
-
- public virtual async Task DeserializeFromBinaryStreamAsync(Stream stream)
- {
- await DatabaseFile.LoadDatabaseFromZipArchiveAsync(
- stream,
- this.GetType().FullName,
- async (archive) =>
- {
- await DatabaseFile.LoadVectorStoreAsync(archive, VectorStore);
- await DatabaseFile.LoadVocabularyStoreAsync(archive, VectorStore.VocabularyStore);
-
- // Re-initialize the IdGenerator with the max Id value from the VectorStore if it supports sequential numeric IDs
- if (_idGenerator is ISequentialIdGenerator seqIdGen)
- {
- // Re-seed the sequence only if there are existing IDs
- var ids = VectorStore.GetIds();
- if (ids.Any())
- {
- seqIdGen.SetMostRecent(ids.Max()!);
- }
- }
- }
- );
- }
-
-
- [Obsolete("Use DeserializeFromBinaryStream instead")]
- public virtual void DeserializeFromJsonStream(Stream stream)
- {
- DeserializeFromBinaryStream(stream);
- }
-
- public virtual void DeserializeFromBinaryStream(Stream stream)
- {
- if (stream == null)
- {
- throw new ArgumentNullException(nameof(stream));
- }
- DeserializeFromBinaryStreamAsync(stream).Wait();
- }
-
- public IEnumerator> GetEnumerator()
- {
- return VectorStore.Select(kvp => new VectorTextDatabaseItem(kvp.Key, kvp.Value.Text, kvp.Value.Metadata, kvp.Value.Vector)).GetEnumerator();
- }
-
- IEnumerator IEnumerable.GetEnumerator()
- {
- return GetEnumerator();
- }
+ protected MemoryVectorDatabaseBase(TVectorStore vectorStore)
+ : base(vectorStore)
+ { }
}
-
-
///
/// Base class for a memory vector database.
///
@@ -422,398 +54,13 @@ IEnumerator IEnumerable.GetEnumerator()
///
///
public abstract class MemoryVectorDatabaseBase
- : IMemoryVectorDatabase, IVectorDatabase
+ : VectorDatabaseBase, IMemoryVectorDatabase
where TId : notnull
where TVectorStore : IVectorStore
where TIdGenerator : IIdGenerator, new()
where TVectorComparer : IVectorComparer, new()
{
- private TIdGenerator _idGenerator;
-
- private TVectorComparer _vectorComparer;
-
- ///
- /// The Vector Store used to store the text vectors of the database
- ///
- protected TVectorStore VectorStore { get; private set; }
-
- protected IEmbeddingsGenerator EmbeddingsGenerator { get; private set; }
-
public MemoryVectorDatabaseBase(IEmbeddingsGenerator embeddingsGenerator, TVectorStore vectorStore)
- {
- EmbeddingsGenerator = embeddingsGenerator;
- VectorStore = vectorStore;
- _idGenerator = new TIdGenerator();
- _vectorComparer = new TVectorComparer();
- }
-
- ///
- /// Get all the Ids for each text the database.
- ///
- ///
- public IEnumerable GetIds()
- {
- return VectorStore.GetIds();
- }
-
- ///
- /// Adds a new text with Metadata to the database and returns its ID
- ///
- ///
- ///
- ///
- public TId AddText(string text, TMetadata? metadata = default(TMetadata))
- {
- return AddTextAsync(text, metadata).Result;
- }
-
- ///
- /// Adds a new text with Metadata to the database and returns its ID
- ///
- ///
- ///
- ///
- public async Task AddTextAsync(string text, TMetadata? metadata = default(TMetadata))
- {
- // Generate the vector asynchronously
- var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text);
-
- // Generate the ID and store the vector text item asynchronously
- TId id = _idGenerator.NewId();
- await VectorStore.SetAsync(id, new VectorTextItem(text, metadata, vector));
-
- return id;
- }
-
- ///
- /// Adds multiple texts with optional metadata to the database efficiently.
- /// If the embeddings generator supports batching, this will generate vectors in a single multi-input call.
- ///
- /// Collection of (text, metadata) tuples to add.
- /// List of generated IDs in the same order as inputs.
- public async Task> AddTextsAsync(IEnumerable<(string text, TMetadata? metadata)> items)
- {
- if (items is null) throw new ArgumentNullException(nameof(items));
-
- var list = items.ToList();
- if (list.Count == 0) return Array.Empty();
-
- // Try batch embeddings if supported
- float[][] vectors;
- if (EmbeddingsGenerator is IBatchEmbeddingsGenerator batchGen)
- {
- var batch = await batchGen.GenerateEmbeddingsAsync(list.Select(i => i.text));
- vectors = batch.Select(v => v.ToArray()).ToArray();
- }
- else
- {
- // Fallback to per-item embedding
- vectors = new float[list.Count][];
- for (int i = 0; i < list.Count; i++)
- {
- vectors[i] = await EmbeddingsGenerator.GenerateEmbeddingsAsync(list[i].text);
- }
- }
-
- // Store items and produce IDs
- var ids = new List(list.Count);
- for (int i = 0; i < list.Count; i++)
- {
- TId id = _idGenerator.NewId();
- ids.Add(id);
- await VectorStore.SetAsync(id, new VectorTextItem(list[i].text, list[i].metadata, vectors[i]));
- }
-
- return ids;
- }
-
- ///
- /// Retrieves a text and metadata by its ID
- ///
- ///
- ///
- ///
- public IVectorTextItem GetText(TId id)
- {
- return VectorStore.Get(id);
- }
-
- ///
- /// Deletes a text by its ID
- ///
- ///
- ///
- public IVectorTextItem DeleteText(TId id)
- {
- return VectorStore.Delete(id);
- }
-
- ///
- /// Updates a text by its ID
- ///
- ///
- ///
- ///
- public async Task UpdateTextAsync(TId id, string text)
- {
- if (VectorStore.ContainsKey(id))
- {
- var existing = VectorStore.Get(id);
- var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text);
- var metadata = existing.Metadata;
- VectorStore.Set(id, new VectorTextItem(text, existing.Metadata, vector));
- }
- else
- {
- throw new KeyNotFoundException($"Text with ID {id} not found.");
- }
- }
-
- ///
- /// Updates a text by its ID
- ///
- ///
- ///
- public void UpdateText(TId id, string text)
- {
- var task = UpdateTextAsync(id, text);
- task.Wait();
- }
-
- ///
- /// Updates the Metadata of a Text by its ID
- ///
- ///
- ///
- ///
- public void UpdateTextMetadata(TId id, TMetadata metadata) {
- if (VectorStore.ContainsKey(id))
- {
- var existing = VectorStore.Get(id);
-
- var item = new VectorTextItem(
- existing.Text,
- metadata,
- existing.Vector
- );
-
- VectorStore.Set(id, item);
- }
- else
- {
- throw new KeyNotFoundException($"Text with ID {id} not found.");
- }
- }
-
- ///
- /// Updates a Text by its ID with new text and metadata values
- ///
- ///
- ///
- ///
- public async Task UpdateTextAndMetadataAsync(TId id, string text, TMetadata metadata)
- {
- if (VectorStore.ContainsKey(id))
- {
- var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text);
- VectorStore.Set(id, new VectorTextItem(text, metadata, vector));
- }
- else
- {
- throw new KeyNotFoundException($"Text with ID {id} not found.");
- }
- }
-
- ///
- /// Updates a Text by its ID with new text and metadata values
- ///
- ///
- ///
- ///
- public void UpdateTextAndMetadata(TId id, string text, TMetadata metadata)
- {
- var task = UpdateTextAndMetadataAsync(id, text, metadata);
- task.Wait();
- }
-
- ///
- /// Performs a vector search to find the top N most similar texts to the given text
- ///
- /// The query prompt to search by.
- /// The highest number of results to show.
- /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all.
- /// The page index of the search results. Default is 0.
- /// The number of search results per page. Default is Null and returns all results.
- /// A filter function to apply to the metadata of each result.
- /// The search results as an IVectorTextResult object.
- public IVectorTextResult Search(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null)
- {
- Func>? filterToUse = null;
- if (filter != null)
- {
- filterToUse = (metadata) => Task.FromResult(filter(metadata));
- }
- return SearchAsync(queryText, threshold, pageIndex, pageCount, filterToUse).Result;
- }
-
- ///
- /// Performs an asynchronous search vector search to find the top N most similar texts to the given text
- ///
- /// The query prompt to search by.
- /// The similarity threshold to filter by.
- /// The page index of the search results. Default is 0.
- /// The number of search results per page. Default is Null and returns all results.
- /// A filter function to apply to the metadata of each result.
- /// The search results as an IVectorTextResult object.
- public async Task> SearchAsync(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null)
- {
- var similarities = await CalculateVectorComparisonAsync(queryText, threshold, filter);
-
- similarities = await _vectorComparer.SortAsync(similarities);
-
- var totalCountFoundInSearch = similarities.Count();
-
- IEnumerable> resultsToReturn;
- if (pageCount != null && pageCount >= 0 && pageIndex >= 0) {
- resultsToReturn = similarities.Skip(pageIndex * pageCount.Value).Take(pageCount.Value);
- } else {
- // no paging specified, return all results
- resultsToReturn = similarities;
- }
-
- return new VectorTextResult(totalCountFoundInSearch, pageIndex, pageCount.HasValue ? pageCount.Value : 1, resultsToReturn);
- }
-
- private async Task>> CalculateVectorComparisonAsync(string queryText, float? threshold = null, Func>? filter = null)
- {
- var queryVector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(queryText);
-
- if (VectorStore.Count == 0)
- {
- throw new InvalidOperationException("The database is empty.");
- }
-
- var results = new ConcurrentBag>();
- await foreach (var kvp in VectorStore)
- {
- if (filter == null || await filter(kvp.Value.Metadata))
- {
- var item = kvp.Value;
-
- float vectorComparisonValue = await _vectorComparer.CalculateAsync(queryVector, item.Vector);
-
- if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue))
- {
- var id = kvp.Key;
- results.Add(
- new VectorTextResultItem(id, item, vectorComparisonValue)
- );
- }
- }
- }
- return results;
- }
-
- [Obsolete("Use SerializeToBinaryStreamAsync Instead")]
- public virtual async Task SerializeToJsonStreamAsync(Stream stream)
- {
- await SerializeToBinaryStreamAsync(stream);
- }
-
- ///
- /// Serializes the Vector Database to a JSON stream
- ///
- ///
- ///
- ///
- public virtual async Task SerializeToBinaryStreamAsync(Stream stream)
- {
- var streamVectorStore = new MemoryStream();
- await VectorStore.SerializeToJsonStreamAsync(streamVectorStore);
-
- await DatabaseFile.SaveDatabaseToZipArchiveAsync(
- stream,
- new DatabaseInfo(this.GetType().FullName),
- async (archive) =>
- {
- var entryVectorStore = archive.CreateEntry(DatabaseFile.vectorStoreFilename);
- using (var entryStream = entryVectorStore.Open())
- {
- streamVectorStore.Position = 0;
- await streamVectorStore.CopyToAsync(entryStream);
- await entryStream.FlushAsync();
- }
- }
- );
- await stream.FlushAsync();
- }
-
- [Obsolete("Use SerializeToBinaryStream Instead")]
- public virtual void SerializeToJsonStream(Stream stream)
- {
- SerializeToBinaryStream(stream);
- }
-
- public virtual void SerializeToBinaryStream(Stream stream)
- {
- if (stream == null)
- {
- throw new ArgumentNullException(nameof(stream));
- }
- SerializeToBinaryStreamAsync(stream).Wait();
- }
-
- [Obsolete("Use DeserializeFromBinaryStreamAsync Instead")]
- public virtual async Task DeserializeFromJsonStreamAsync(Stream stream)
- {
- await DeserializeFromBinaryStreamAsync(stream);
- }
-
- public virtual async Task DeserializeFromBinaryStreamAsync(Stream stream)
- {
- await DatabaseFile.LoadDatabaseFromZipArchiveAsync(
- stream,
- this.GetType().FullName,
- async (archive) =>
- {
- await DatabaseFile.LoadVectorStoreAsync(archive, VectorStore);
-
- // Re-initialize the IdGenerator with the max Id value from the VectorStore if it supports sequential numeric IDs
- if (_idGenerator is ISequentialIdGenerator seqIdGen)
- {
- // Re-seed the sequence only if there are existing IDs
- var ids = VectorStore.GetIds();
- if (ids.Any())
- {
- seqIdGen.SetMostRecent(ids.Max()!);
- }
- }
- }
- );
- }
-
- [Obsolete("Use DeserializeFromBinaryStream Instead")]
- public virtual void DeserializeFromJsonStream(Stream stream)
- {
- DeserializeFromBinaryStream(stream);
- }
-
- public virtual void DeserializeFromBinaryStream(Stream stream)
- {
- if (stream == null)
- {
- throw new ArgumentNullException(nameof(stream));
- }
- DeserializeFromBinaryStreamAsync(stream).Wait();
- }
-
- public IEnumerator> GetEnumerator()
- {
- return VectorStore.Select(kvp => new VectorTextDatabaseItem(kvp.Key, kvp.Value.Text, kvp.Value.Metadata, kvp.Value.Vector)).GetEnumerator();
- }
-
- System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
- {
- return this.GetEnumerator();
- }
-}
\ No newline at end of file
+ : base(embeddingsGenerator, vectorStore)
+ { }
+}
diff --git a/src/Build5Nines.SharpVector/VectorDatabaseBase.cs b/src/Build5Nines.SharpVector/VectorDatabaseBase.cs
new file mode 100644
index 0000000..203cb58
--- /dev/null
+++ b/src/Build5Nines.SharpVector/VectorDatabaseBase.cs
@@ -0,0 +1,818 @@
+using Build5Nines.SharpVector.Id;
+using Build5Nines.SharpVector.Preprocessing;
+using Build5Nines.SharpVector.Vocabulary;
+using Build5Nines.SharpVector.Vectorization;
+using Build5Nines.SharpVector.VectorCompare;
+using Build5Nines.SharpVector.VectorStore;
+using System.Collections.Concurrent;
+using System.IO.Compression;
+using System.Runtime.CompilerServices;
+using System.Text.Json;
+using Build5Nines.SharpVector.Embeddings;
+using System.Runtime.ExceptionServices;
+using System.Collections;
+using System.Linq;
+
+namespace Build5Nines.SharpVector;
+
+///
+/// Base class for a vector database.
+///
+///
+///
+///
+///
+///
+///
+///
+///
+///
+///
+public abstract class VectorDatabaseBase
+ : IVectorDatabase
+ where TId : notnull
+ where TVocabularyKey : notnull
+ where TVocabularyValue: notnull
+ where TVectorStore : IVectorStoreWithVocabulary
+ where TVocabularyStore : IVocabularyStore
+ where TIdGenerator : IIdGenerator, new()
+ where TTextPreprocessor : ITextPreprocessor, new()
+ where TVectorizer : IVectorizer, new()
+ where TVectorComparer : IVectorComparer, new()
+{
+ protected TIdGenerator _idGenerator;
+
+ private TTextPreprocessor _textPreprocessor;
+
+ private TVectorizer _vectorizer;
+
+ private TVectorComparer _vectorComparer;
+
+ ///
+ /// The Vector Store used to store the text vectors of the database
+ ///
+ protected TVectorStore VectorStore { get; private set; }
+
+ public VectorDatabaseBase(TVectorStore vectorStore)
+ {
+ VectorStore = vectorStore;
+ _idGenerator = new TIdGenerator();
+ _textPreprocessor = new TTextPreprocessor();
+ _vectorizer = new TVectorizer();
+ _vectorComparer = new TVectorComparer();
+ }
+
+ ///
+    /// Get all the Ids for each text in the database.
+ ///
+ ///
+ public IEnumerable GetIds()
+ {
+ return VectorStore.GetIds();
+ }
+
+ ///
+ /// Adds a new text with Metadata to the database and returns its ID
+ ///
+ ///
+ ///
+ ///
+ public TId AddText(TVocabularyKey text, TMetadata? metadata = default(TMetadata))
+ {
+ return AddTextAsync(text, metadata).Result;
+ }
+
+ ///
+ /// Adds a new text with Metadata to the database and returns its ID
+ ///
+ ///
+ ///
+ ///
+ public async Task AddTextAsync(TVocabularyKey text, TMetadata? metadata = default(TMetadata))
+ {
+ // Perform preprocessing asynchronously
+ var tokens = await _textPreprocessor.TokenizeAndPreprocessAsync(text);
+
+ // Update the vocabulary store asynchronously
+ await VectorStore.VocabularyStore.UpdateAsync(tokens);
+
+ // Generate the vector asynchronously
+ float[] vector = await _vectorizer.GenerateVectorFromTokensAsync(VectorStore.VocabularyStore, tokens);
+
+ // Generate the ID and store the vector text item asynchronously
+ TId id = _idGenerator.NewId();
+ await VectorStore.SetAsync(id, new VectorTextItem(text, metadata, vector));
+
+ return id;
+ }
+
+ public async Task> AddTextsAsync(IEnumerable<(TVocabularyKey text, TMetadata? metadata)> items)
+ {
+ if (items is null) throw new ArgumentNullException(nameof(items));
+
+ var ids = new List();
+
+ foreach(var item in items)
+ {
+ TId id = await AddTextAsync(item.text, item.metadata);
+ ids.Add(id);
+ }
+
+ return ids;
+ }
+
+
+ ///
+ /// Retrieves a text and metadata by its ID
+ ///
+ ///
+ ///
+ ///
+ public IVectorTextItem GetText(TId id)
+ {
+ return VectorStore.Get(id);
+ }
+
+ ///
+ /// Deletes a text by its ID
+ ///
+ ///
+ ///
+ public IVectorTextItem DeleteText(TId id)
+ {
+ return VectorStore.Delete(id);
+ }
+
+ ///
+ /// Updates a text by its ID
+ ///
+ ///
+ ///
+ ///
+ public void UpdateText(TId id, TVocabularyKey text)
+ {
+ if (VectorStore.ContainsKey(id))
+ {
+ var tokens = _textPreprocessor.TokenizeAndPreprocess(text);
+ VectorStore.VocabularyStore.Update(tokens);
+ float[] vector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, tokens);
+ var metadata = VectorStore.Get(id).Metadata;
+ VectorStore.Set(id, new VectorTextItem(text, metadata, vector));
+ }
+ else
+ {
+ throw new KeyNotFoundException($"Text with ID {id} not found.");
+ }
+ }
+
+ ///
+ /// Updates the Metadata of a Text by its ID
+ ///
+ ///
+ ///
+ ///
+ public void UpdateTextMetadata(TId id, TMetadata metadata) {
+ if (VectorStore.ContainsKey(id))
+ {
+ var existing = VectorStore.Get(id);
+
+ var item = new VectorTextItem(
+ existing.Text,
+ metadata,
+ existing.Vector
+ );
+
+ VectorStore.Set(id, item);
+ }
+ else
+ {
+ throw new KeyNotFoundException($"Text with ID {id} not found.");
+ }
+ }
+
+ ///
+ /// Updates a Text by its ID with new text and metadata values
+ ///
+ ///
+ ///
+ ///
+ public void UpdateTextAndMetadata(TId id, TVocabularyKey text, TMetadata metadata)
+ {
+ if (VectorStore.ContainsKey(id))
+ {
+ var tokens = _textPreprocessor.TokenizeAndPreprocess(text);
+ VectorStore.VocabularyStore.Update(tokens);
+ float[] vector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, tokens);
+ VectorStore.Set(id, new VectorTextItem(text, metadata, vector));
+ }
+ else
+ {
+ throw new KeyNotFoundException($"Text with ID {id} not found.");
+ }
+ }
+
+ ///
+ /// Performs a vector search to find the top N most similar texts to the given text
+ ///
+ /// The query prompt to search by.
+ /// The highest number of results to show.
+ /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all.
+ /// The page index of the search results. Default is 0.
+ /// The number of search results per page. Default is Null and returns all results.
+ /// A filter function to apply to the metadata of each result.
+ ///
+ public IVectorTextResult Search(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null)
+ {
+ Func>? filterToUse = null;
+ if (filter != null)
+ {
+ filterToUse = (metadata) => Task.FromResult(filter(metadata));
+ }
+ return SearchAsync(queryText, threshold, pageIndex, pageCount, filterToUse).Result;
+ }
+
+ ///
+    /// Performs an asynchronous vector search to find the top N most similar texts to the given text
+ ///
+ /// The query prompt to search by.
+ /// The similarity threshold to filter by.
+ /// The page index of the search results. Default is 0.
+ /// The number of search results per page. Default is Null and returns all results.
+ /// A filter function to apply to the metadata of each result.
+ ///
+ public async Task> SearchAsync(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null)
+ {
+ var similarities = await CalculateVectorComparisonAsync(queryText, threshold, filter);
+
+ similarities = await _vectorComparer.SortAsync(similarities);
+
+ var totalCountFoundInSearch = similarities.Count();
+
+ IEnumerable> resultsToReturn;
+ if (pageCount != null && pageCount >= 0 && pageIndex >= 0) {
+ resultsToReturn = similarities.Skip(pageIndex * pageCount.Value).Take(pageCount.Value);
+ } else {
+ // no paging specified, return all results
+ resultsToReturn = similarities;
+ }
+
+ return new VectorTextResult(totalCountFoundInSearch, pageIndex, pageCount.HasValue ? pageCount.Value : 1, resultsToReturn);
+ }
+
+ private async Task>> CalculateVectorComparisonAsync(TVocabularyKey queryText, float? threshold = null, Func>? filter = null)
+ {
+ var queryTokens = _textPreprocessor.TokenizeAndPreprocess(queryText);
+ float[] queryVector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, queryTokens);
+
+        // Use the vocabulary size as the desired vector length for normalizing vectors before comparison
+ var desiredLength = VectorStore.VocabularyStore.Count;
+
+ if (VectorStore.Count == 0)
+ {
+ throw new InvalidOperationException("The database is empty.");
+ }
+
+ var results = new ConcurrentBag>();
+ await foreach (KeyValuePair> kvp in VectorStore)
+ {
+ if (filter == null || await filter(kvp.Value.Metadata))
+ {
+ var item = kvp.Value;
+ float vectorComparisonValue = await _vectorComparer.CalculateAsync(_vectorizer.NormalizeVector(queryVector, desiredLength), _vectorizer.NormalizeVector(item.Vector, desiredLength));
+
+ if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue))
+ {
+ var id = kvp.Key;
+ results.Add(
+ new VectorTextResultItem(id, item, vectorComparisonValue)
+ );
+ }
+ }
+ }
+ return results;
+ }
+
+ [Obsolete("Use SerializeToBinaryStreamAsync instead.")]
+ public virtual async Task SerializeToJsonStreamAsync(Stream stream)
+ {
+ await SerializeToBinaryStreamAsync(stream);
+ }
+
+ ///
+    /// Serializes the Vector Database to a binary stream
+ ///
+ ///
+ ///
+ ///
+ public virtual async Task SerializeToBinaryStreamAsync(Stream stream)
+ {
+ var streamVectorStore = new MemoryStream();
+ var streamVocabularyStore = new MemoryStream();
+
+ var taskVectorStore = VectorStore.SerializeToJsonStreamAsync(streamVectorStore);
+ var taskVocabularyStore = VectorStore.VocabularyStore.SerializeToJsonStreamAsync(streamVocabularyStore);
+
+ await Task.WhenAll(taskVectorStore, taskVocabularyStore);
+
+ await DatabaseFile.SaveDatabaseToZipArchiveAsync(
+ stream,
+ new DatabaseInfo(this.GetType().FullName),
+ async (archive) =>
+ {
+ var entryVectorStore = archive.CreateEntry(DatabaseFile.vectorStoreFilename);
+ using (var entryStream = entryVectorStore.Open())
+ {
+ streamVectorStore.Position = 0;
+ await streamVectorStore.CopyToAsync(entryStream);
+ await entryStream.FlushAsync();
+ }
+
+ var entryVocabularyStore = archive.CreateEntry(DatabaseFile.vocabularyStoreFilename);
+ using (var entryStream = entryVocabularyStore.Open())
+ {
+ streamVocabularyStore.Position = 0;
+ await streamVocabularyStore.CopyToAsync(entryStream);
+ await entryStream.FlushAsync();
+ }
+ }
+ );
+ await stream.FlushAsync();
+ }
+
+ [Obsolete("Use SerializeToBinaryStream instead.")]
+ public virtual void SerializeToJsonStream(Stream stream)
+ {
+ SerializeToBinaryStream(stream);
+ }
+
+ public virtual void SerializeToBinaryStream(Stream stream)
+ {
+ if (stream == null)
+ {
+ throw new ArgumentNullException(nameof(stream));
+ }
+ SerializeToBinaryStreamAsync(stream).Wait();
+ }
+
+ [Obsolete("Use DeserializeFromBinaryStreamAsync instead.")]
+ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) {
+ await DeserializeFromBinaryStreamAsync(stream);
+ }
+
+ public virtual async Task DeserializeFromBinaryStreamAsync(Stream stream)
+ {
+ await DatabaseFile.LoadDatabaseFromZipArchiveAsync(
+ stream,
+ this.GetType().FullName,
+ async (archive) =>
+ {
+ await DatabaseFile.LoadVectorStoreAsync(archive, VectorStore);
+ await DatabaseFile.LoadVocabularyStoreAsync(archive, VectorStore.VocabularyStore);
+
+ // Re-initialize the IdGenerator with the max Id value from the VectorStore if it supports sequential numeric IDs
+ if (_idGenerator is ISequentialIdGenerator seqIdGen)
+ {
+ // Re-seed the sequence only if there are existing IDs
+ var ids = VectorStore.GetIds();
+ if (ids.Any())
+ {
+ seqIdGen.SetMostRecent(ids.Max()!);
+ }
+ }
+ }
+ );
+ }
+
+
+ [Obsolete("Use DeserializeFromBinaryStream instead")]
+ public virtual void DeserializeFromJsonStream(Stream stream)
+ {
+ DeserializeFromBinaryStream(stream);
+ }
+
+ public virtual void DeserializeFromBinaryStream(Stream stream)
+ {
+ if (stream == null)
+ {
+ throw new ArgumentNullException(nameof(stream));
+ }
+ DeserializeFromBinaryStreamAsync(stream).Wait();
+ }
+
+ public IEnumerator> GetEnumerator()
+ {
+ return VectorStore.Select(kvp => new VectorTextDatabaseItem(kvp.Key, kvp.Value.Text, kvp.Value.Metadata, kvp.Value.Vector)).GetEnumerator();
+ }
+
+ IEnumerator IEnumerable.GetEnumerator()
+ {
+ return GetEnumerator();
+ }
+}
+
+
+
+///
+/// Base class for a vector database.
+///
+///
+///
+///
+///
+///
+public abstract class VectorDatabaseBase
+ : IVectorDatabase
+ where TId : notnull
+ where TVectorStore : IVectorStore
+ where TIdGenerator : IIdGenerator, new()
+ where TVectorComparer : IVectorComparer, new()
+{
+ private TIdGenerator _idGenerator;
+
+ private TVectorComparer _vectorComparer;
+
+ ///
+ /// The Vector Store used to store the text vectors of the database
+ ///
+ protected TVectorStore VectorStore { get; private set; }
+
+ protected IEmbeddingsGenerator EmbeddingsGenerator { get; private set; }
+
+ public VectorDatabaseBase(IEmbeddingsGenerator embeddingsGenerator, TVectorStore vectorStore)
+ {
+ EmbeddingsGenerator = embeddingsGenerator;
+ VectorStore = vectorStore;
+ _idGenerator = new TIdGenerator();
+ _vectorComparer = new TVectorComparer();
+ }
+
+ ///
+    /// Get all the Ids for each text in the database.
+ ///
+ ///
+ public IEnumerable GetIds()
+ {
+ return VectorStore.GetIds();
+ }
+
+ ///
+ /// Adds a new text with Metadata to the database and returns its ID
+ ///
+ ///
+ ///
+ ///
+ public TId AddText(string text, TMetadata? metadata = default(TMetadata))
+ {
+ return AddTextAsync(text, metadata).Result;
+ }
+
+ ///
+ /// Adds a new text with Metadata to the database and returns its ID
+ ///
+ ///
+ ///
+ ///
+ public async Task AddTextAsync(string text, TMetadata? metadata = default(TMetadata))
+ {
+ // Generate the vector asynchronously
+ var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text);
+
+ // Generate the ID and store the vector text item asynchronously
+ TId id = _idGenerator.NewId();
+ await VectorStore.SetAsync(id, new VectorTextItem(text, metadata, vector));
+
+ return id;
+ }
+
+ ///
+ /// Adds multiple texts with optional metadata to the database efficiently.
+ /// If the embeddings generator supports batching, this will generate vectors in a single multi-input call.
+ ///
+ /// Collection of (text, metadata) tuples to add.
+ /// List of generated IDs in the same order as inputs.
+ public async Task> AddTextsAsync(IEnumerable<(string text, TMetadata? metadata)> items)
+ {
+ if (items is null) throw new ArgumentNullException(nameof(items));
+
+ var list = items.ToList();
+ if (list.Count == 0) return Array.Empty();
+
+ // Try batch embeddings if supported
+ float[][] vectors;
+ if (EmbeddingsGenerator is IBatchEmbeddingsGenerator batchGen)
+ {
+ var batch = await batchGen.GenerateEmbeddingsAsync(list.Select(i => i.text));
+ vectors = batch.Select(v => v.ToArray()).ToArray();
+ }
+ else
+ {
+ // Fallback to per-item embedding
+ vectors = new float[list.Count][];
+ for (int i = 0; i < list.Count; i++)
+ {
+ vectors[i] = await EmbeddingsGenerator.GenerateEmbeddingsAsync(list[i].text);
+ }
+ }
+
+ // Store items and produce IDs
+ var ids = new List(list.Count);
+ for (int i = 0; i < list.Count; i++)
+ {
+ TId id = _idGenerator.NewId();
+ ids.Add(id);
+ await VectorStore.SetAsync(id, new VectorTextItem(list[i].text, list[i].metadata, vectors[i]));
+ }
+
+ return ids;
+ }
+
+ ///
+ /// Retrieves a text and metadata by its ID
+ ///
+ ///
+ ///
+ ///
+ public IVectorTextItem GetText(TId id)
+ {
+ return VectorStore.Get(id);
+ }
+
+ ///
+ /// Deletes a text by its ID
+ ///
+ ///
+ ///
+ public IVectorTextItem DeleteText(TId id)
+ {
+ return VectorStore.Delete(id);
+ }
+
+ ///
+ /// Updates a text by its ID
+ ///
+ ///
+ ///
+ ///
+ public async Task UpdateTextAsync(TId id, string text)
+ {
+ if (VectorStore.ContainsKey(id))
+ {
+ var existing = VectorStore.Get(id);
+ var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text);
+ var metadata = existing.Metadata;
+ VectorStore.Set(id, new VectorTextItem(text, existing.Metadata, vector));
+ }
+ else
+ {
+ throw new KeyNotFoundException($"Text with ID {id} not found.");
+ }
+ }
+
+ ///
+ /// Updates a text by its ID
+ ///
+ ///
+ ///
+ public void UpdateText(TId id, string text)
+ {
+ var task = UpdateTextAsync(id, text);
+ task.Wait();
+ }
+
+ ///
+ /// Updates the Metadata of a Text by its ID
+ ///
+ ///
+ ///
+ ///
+ public void UpdateTextMetadata(TId id, TMetadata metadata) {
+ if (VectorStore.ContainsKey(id))
+ {
+ var existing = VectorStore.Get(id);
+
+ var item = new VectorTextItem(
+ existing.Text,
+ metadata,
+ existing.Vector
+ );
+
+ VectorStore.Set(id, item);
+ }
+ else
+ {
+ throw new KeyNotFoundException($"Text with ID {id} not found.");
+ }
+ }
+
+ ///
+ /// Updates a Text by its ID with new text and metadata values
+ ///
+ ///
+ ///
+ ///
+ public async Task UpdateTextAndMetadataAsync(TId id, string text, TMetadata metadata)
+ {
+ if (VectorStore.ContainsKey(id))
+ {
+ var vector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(text);
+ VectorStore.Set(id, new VectorTextItem(text, metadata, vector));
+ }
+ else
+ {
+ throw new KeyNotFoundException($"Text with ID {id} not found.");
+ }
+ }
+
+ ///
+ /// Updates a Text by its ID with new text and metadata values
+ ///
+ ///
+ ///
+ ///
+ public void UpdateTextAndMetadata(TId id, string text, TMetadata metadata)
+ {
+ var task = UpdateTextAndMetadataAsync(id, text, metadata);
+ task.Wait();
+ }
+
+ ///
+ /// Performs a vector search to find the top N most similar texts to the given text
+ ///
+ /// The query prompt to search by.
+ /// The highest number of results to show.
+ /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all.
+ /// The page index of the search results. Default is 0.
+ /// The number of search results per page. Default is Null and returns all results.
+ /// A filter function to apply to the metadata of each result.
+ /// The search results as an IVectorTextResult object.
+ public IVectorTextResult Search(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null)
+ {
+ Func>? filterToUse = null;
+ if (filter != null)
+ {
+ filterToUse = (metadata) => Task.FromResult(filter(metadata));
+ }
+ return SearchAsync(queryText, threshold, pageIndex, pageCount, filterToUse).Result;
+ }
+
+ ///
+    /// Performs an asynchronous vector search to find the top N most similar texts to the given text
+ ///
+ /// The query prompt to search by.
+ /// The similarity threshold to filter by.
+ /// The page index of the search results. Default is 0.
+ /// The number of search results per page. Default is Null and returns all results.
+ /// A filter function to apply to the metadata of each result.
+ /// The search results as an IVectorTextResult object.
+ public async Task> SearchAsync(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null)
+ {
+ var similarities = await CalculateVectorComparisonAsync(queryText, threshold, filter);
+
+ similarities = await _vectorComparer.SortAsync(similarities);
+
+ var totalCountFoundInSearch = similarities.Count();
+
+ IEnumerable> resultsToReturn;
+ if (pageCount != null && pageCount >= 0 && pageIndex >= 0) {
+ resultsToReturn = similarities.Skip(pageIndex * pageCount.Value).Take(pageCount.Value);
+ } else {
+ // no paging specified, return all results
+ resultsToReturn = similarities;
+ }
+
+ return new VectorTextResult(totalCountFoundInSearch, pageIndex, pageCount.HasValue ? pageCount.Value : 1, resultsToReturn);
+ }
+
+ private async Task>> CalculateVectorComparisonAsync(string queryText, float? threshold = null, Func>? filter = null)
+ {
+ var queryVector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(queryText);
+
+ if (VectorStore.Count == 0)
+ {
+ throw new InvalidOperationException("The database is empty.");
+ }
+
+ var results = new ConcurrentBag>();
+ await foreach (var kvp in VectorStore)
+ {
+ if (filter == null || await filter(kvp.Value.Metadata))
+ {
+ var item = kvp.Value;
+
+ float vectorComparisonValue = await _vectorComparer.CalculateAsync(queryVector, item.Vector);
+
+ if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue))
+ {
+ var id = kvp.Key;
+ results.Add(
+ new VectorTextResultItem(id, item, vectorComparisonValue)
+ );
+ }
+ }
+ }
+ return results;
+ }
+
+ [Obsolete("Use SerializeToBinaryStreamAsync Instead")]
+ public virtual async Task SerializeToJsonStreamAsync(Stream stream)
+ {
+ await SerializeToBinaryStreamAsync(stream);
+ }
+
+ ///
+    /// Serializes the Vector Database to a binary stream
+ ///
+ ///
+ ///
+ ///
+ public virtual async Task SerializeToBinaryStreamAsync(Stream stream)
+ {
+ var streamVectorStore = new MemoryStream();
+ await VectorStore.SerializeToJsonStreamAsync(streamVectorStore);
+
+ await DatabaseFile.SaveDatabaseToZipArchiveAsync(
+ stream,
+ new DatabaseInfo(this.GetType().FullName),
+ async (archive) =>
+ {
+ var entryVectorStore = archive.CreateEntry(DatabaseFile.vectorStoreFilename);
+ using (var entryStream = entryVectorStore.Open())
+ {
+ streamVectorStore.Position = 0;
+ await streamVectorStore.CopyToAsync(entryStream);
+ await entryStream.FlushAsync();
+ }
+ }
+ );
+ await stream.FlushAsync();
+ }
+
+ [Obsolete("Use SerializeToBinaryStream Instead")]
+ public virtual void SerializeToJsonStream(Stream stream)
+ {
+ SerializeToBinaryStream(stream);
+ }
+
+ public virtual void SerializeToBinaryStream(Stream stream)
+ {
+ if (stream == null)
+ {
+ throw new ArgumentNullException(nameof(stream));
+ }
+ SerializeToBinaryStreamAsync(stream).Wait();
+ }
+
+ [Obsolete("Use DeserializeFromBinaryStreamAsync Instead")]
+ public virtual async Task DeserializeFromJsonStreamAsync(Stream stream)
+ {
+ await DeserializeFromBinaryStreamAsync(stream);
+ }
+
+ public virtual async Task DeserializeFromBinaryStreamAsync(Stream stream)
+ {
+ await DatabaseFile.LoadDatabaseFromZipArchiveAsync(
+ stream,
+ this.GetType().FullName,
+ async (archive) =>
+ {
+ await DatabaseFile.LoadVectorStoreAsync(archive, VectorStore);
+
+ // Re-initialize the IdGenerator with the max Id value from the VectorStore if it supports sequential numeric IDs
+ if (_idGenerator is ISequentialIdGenerator seqIdGen)
+ {
+ // Re-seed the sequence only if there are existing IDs
+ var ids = VectorStore.GetIds();
+ if (ids.Any())
+ {
+ seqIdGen.SetMostRecent(ids.Max()!);
+ }
+ }
+ }
+ );
+ }
+
+ [Obsolete("Use DeserializeFromBinaryStream Instead")]
+ public virtual void DeserializeFromJsonStream(Stream stream)
+ {
+ DeserializeFromBinaryStream(stream);
+ }
+
+ public virtual void DeserializeFromBinaryStream(Stream stream)
+ {
+ if (stream == null)
+ {
+ throw new ArgumentNullException(nameof(stream));
+ }
+ DeserializeFromBinaryStreamAsync(stream).Wait();
+ }
+
+ public IEnumerator> GetEnumerator()
+ {
+ return VectorStore.Select(kvp => new VectorTextDatabaseItem(kvp.Key, kvp.Value.Text, kvp.Value.Metadata, kvp.Value.Vector)).GetEnumerator();
+ }
+
+ System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
+ {
+ return this.GetEnumerator();
+ }
+}
\ No newline at end of file
diff --git a/src/Build5Nines.SharpVector/VectorStore/BasicDiskVectorStore.cs b/src/Build5Nines.SharpVector/VectorStore/BasicDiskVectorStore.cs
new file mode 100644
index 0000000..137d725
--- /dev/null
+++ b/src/Build5Nines.SharpVector/VectorStore/BasicDiskVectorStore.cs
@@ -0,0 +1,356 @@
+using System.Collections;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Text.Json;
+using Build5Nines.SharpVector.VectorStore;
+using Build5Nines.SharpVector.Vocabulary;
+
+namespace Build5Nines.SharpVector.VectorStore;
+
+///
+/// Disk-backed vector store. Persists items to files and supports streaming reads.
+/// Uses the vocabulary key type as the document type to satisfy the IVectorStore contract.
+///
+public class BasicDiskVectorStore
+ : IVectorStoreWithVocabulary, IDisposable
+ where TId : notnull
+ where TVocabularyKey : notnull
+ where TVocabularyValue : notnull
+ where TVocabularyStore : IVocabularyStore
+{
+ private readonly string _rootPath;
+ private readonly string _indexPath;
+ private readonly string _itemsPath;
+ private readonly string _walPath;
+
+ private readonly ReaderWriterLockSlim _rwLock = new(LockRecursionPolicy.SupportsRecursion);
+
+ private readonly ConcurrentDictionary _index = new();
+ private readonly ConcurrentDictionary> _cache = new();
+
+ private readonly ConcurrentQueue<(TId id, VectorTextItem? item, bool isDelete)> _pending = new();
+ private readonly CancellationTokenSource _cts = new();
+ private readonly Task _backgroundFlushTask;
+
+ public TVocabularyStore VocabularyStore { get; }
+
+ public int Count => _cache.Count;
+
+ public BasicDiskVectorStore(string rootPath, TVocabularyStore vocabularyStore)
+ {
+ _rootPath = rootPath;
+ _indexPath = Path.Combine(rootPath, "index.json");
+ _itemsPath = Path.Combine(rootPath, "items.bin");
+ _walPath = Path.Combine(rootPath, "wal.log");
+ Directory.CreateDirectory(rootPath);
+ VocabularyStore = vocabularyStore;
+ RecoverFromWalOrIndex();
+ _backgroundFlushTask = Task.Run(BackgroundFlusherAsync);
+ }
+
+ public IEnumerable GetIds() => _cache.Keys;
+
+ public IVectorTextItem Get(TId id)
+ {
+ // First check cache for fast read
+ if (_cache.TryGetValue(id, out var cached)) return cached;
+
+ _rwLock.EnterReadLock();
+ try
+ {
+ if (!_index.TryGetValue(id, out var offset)) throw new KeyNotFoundException();
+ using var fs = File.OpenRead(_itemsPath);
+ fs.Seek(offset, SeekOrigin.Begin);
+ var item = ReadItem(fs);
+ _cache[id] = item;
+ return item;
+ }
+ finally
+ {
+ _rwLock.ExitReadLock();
+ }
+ }
+
+ public void Set(TId id, VectorTextItem item)
+ {
+        // Write-Ahead Log entry to ensure durability (D in ACID)
+ AppendWalRecord(id, item, isDelete: false);
+
+ // Update memory state atomically
+ _rwLock.EnterWriteLock();
+ try
+ {
+ _cache[id] = item;
+ _pending.Enqueue((id, item, false));
+ }
+ finally
+ {
+ _rwLock.ExitWriteLock();
+ }
+ }
+
+ public async Task SetAsync(TId id, VectorTextItem item)
+ {
+ Set(id, item);
+ await Task.Yield();
+ }
+
+ public IVectorTextItem Delete(TId id)
+ {
+ var existing = Get(id);
+
+ // WAL for delete
+ AppendWalRecord(id, item: null, isDelete: true);
+
+ _rwLock.EnterWriteLock();
+ try
+ {
+ _cache.TryRemove(id, out _);
+ _pending.Enqueue((id, null, true));
+ }
+ finally
+ {
+ _rwLock.ExitWriteLock();
+ }
+
+ return existing;
+ }
+
+ public bool ContainsKey(TId id) => _cache.ContainsKey(id);
+
+ public async Task SerializeToJsonStreamAsync(Stream stream)
+ {
+ await JsonSerializer.SerializeAsync(stream, _index);
+ }
+
+ public async Task DeserializeFromJsonStreamAsync(Stream stream)
+ {
+ var loaded = await JsonSerializer.DeserializeAsync>(stream);
+ if (loaded != null)
+ {
+ foreach (var kv in loaded) _index[kv.Key] = kv.Value;
+ }
+ }
+
+ public IEnumerator>> GetEnumerator()
+ {
+ foreach (var key in _cache.Keys)
+ {
+ yield return new KeyValuePair>(key, (VectorTextItem)Get(key));
+ }
+ }
+
+ System.Collections.IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
+
+ public async IAsyncEnumerator>> GetAsyncEnumerator(CancellationToken cancellationToken = default)
+ {
+ foreach (var key in _cache.Keys)
+ {
+ yield return new KeyValuePair>(key, (VectorTextItem)Get(key));
+ await Task.Yield();
+ }
+ }
+
+ private void PersistIndex()
+ {
+ const int maxRetries = 5;
+ int attempt = 0;
+ while (true)
+ {
+ try
+ {
+ using var fs = new FileStream(_indexPath, FileMode.Create, FileAccess.Write, FileShare.Read);
+ JsonSerializer.Serialize(fs, _index);
+ fs.Flush(true); // ensure durability of index checkpoint
+ break;
+ }
+ catch (IOException)
+ {
+ attempt++;
+ if (attempt >= maxRetries) throw;
+ Thread.Sleep(10 * attempt);
+ }
+ }
+ }
+
+ private void LoadIndexIfExists()
+ {
+ if (!File.Exists(_indexPath)) return;
+ using var fs = File.OpenRead(_indexPath);
+ var loaded = JsonSerializer.Deserialize>(fs);
+ if (loaded != null)
+ {
+ foreach (var kv in loaded) _index[kv.Key] = kv.Value;
+ }
+ }
+
+ private void RecoverFromWalOrIndex()
+ {
+ // Load index checkpoint if present
+ LoadIndexIfExists();
+
+ // Replay WAL to recover any operations after the last checkpoint
+ if (!File.Exists(_walPath)) return;
+ using var fs = new FileStream(_walPath, FileMode.Open, FileAccess.Read, FileShare.Read);
+ using var br = new BinaryReader(fs);
+ while (fs.Position < fs.Length)
+ {
+ bool isDelete = br.ReadBoolean();
+ var idJson = br.ReadString();
+ var id = JsonSerializer.Deserialize(idJson)!;
+ if (isDelete)
+ {
+ _index.TryRemove(id, out _);
+ _cache.TryRemove(id, out _);
+ }
+ else
+ {
+ var itemJson = br.ReadString();
+ var item = JsonSerializer.Deserialize>(itemJson)!;
+
+ // Append item to items file to bring storage up-to-date
+ using var ofs = new FileStream(_itemsPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read);
+ ofs.Seek(0, SeekOrigin.End);
+ var offset = ofs.Position;
+ WriteItem(ofs, item);
+ ofs.Flush(true);
+ _index[id] = offset;
+ _cache[id] = item;
+ }
+ }
+ // After successful replay, truncate WAL (commit)
+ File.WriteAllBytes(_walPath, Array.Empty());
+ PersistIndex();
+ }
+
+ private void AppendWalRecord(TId id, VectorTextItem? item, bool isDelete)
+ {
+ Directory.CreateDirectory(_rootPath);
+ using var fs = new FileStream(_walPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read);
+ fs.Seek(0, SeekOrigin.End);
+ using var bw = new BinaryWriter(fs);
+ bw.Write(isDelete);
+ bw.Write(JsonSerializer.Serialize(id));
+ if (!isDelete && item != null)
+ {
+ bw.Write(JsonSerializer.Serialize(item));
+ }
+ bw.Flush();
+ fs.Flush(true); // fsync for WAL to guarantee durability
+ }
+
+ private async Task BackgroundFlusherAsync()
+ {
+ var token = _cts.Token;
+ while (!token.IsCancellationRequested)
+ {
+ try
+ {
+ // Batch flush pending operations
+ if (_pending.IsEmpty)
+ {
+ await Task.Delay(25, token);
+ continue;
+ }
+
+ using var itemsFs = new FileStream(_itemsPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read);
+ while (_pending.TryDequeue(out var op))
+ {
+ if (op.isDelete)
+ {
+ _index.TryRemove(op.id, out _);
+ }
+ else if (op.item is not null)
+ {
+ itemsFs.Seek(0, SeekOrigin.End);
+ var offset = itemsFs.Position;
+ WriteItem(itemsFs, op.item);
+ _index[op.id] = offset;
+ }
+ }
+ itemsFs.Flush(true);
+ PersistIndex();
+
+ // After index checkpoint, truncate WAL safely
+ File.WriteAllBytes(_walPath, Array.Empty());
+ }
+ catch (OperationCanceledException)
+ {
+ // normal shutdown
+ }
+ catch
+ {
+ // best-effort: wait and retry
+ await Task.Delay(100, token);
+ }
+ }
+ }
+
+ public void Dispose()
+ {
+ try
+ {
+ _cts.Cancel();
+ _backgroundFlushTask.Wait(1500);
+ }
+ catch { }
+ finally
+ {
+ // Attempt a final flush of pending operations synchronously
+ try
+ {
+ using var itemsFs = new FileStream(_itemsPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read);
+ while (_pending.TryDequeue(out var op))
+ {
+ if (op.isDelete)
+ {
+ _index.TryRemove(op.id, out _);
+ }
+ else if (op.item is not null)
+ {
+ itemsFs.Seek(0, SeekOrigin.End);
+ var offset = itemsFs.Position;
+ WriteItem(itemsFs, op.item);
+ _index[op.id] = offset;
+ }
+ }
+ itemsFs.Flush(true);
+ PersistIndex();
+ File.WriteAllBytes(_walPath, Array.Empty());
+ }
+ catch { }
+
+ _cts.Dispose();
+ _rwLock.Dispose();
+ GC.SuppressFinalize(this);
+ }
+ }
+
+ private static void WriteItem(FileStream fs, VectorTextItem item)
+ {
+ using var bw = new BinaryWriter(fs, System.Text.Encoding.UTF8, leaveOpen: true);
+ var json = JsonSerializer.Serialize(item);
+ var bytes = System.Text.Encoding.UTF8.GetBytes(json);
+ bw.Write(bytes.Length);
+ bw.Write(bytes);
+ }
+
+ private static VectorTextItem ReadItem(FileStream fs)
+ {
+ using var br = new BinaryReader(fs, System.Text.Encoding.UTF8, leaveOpen: true);
+ int len = br.ReadInt32();
+ var bytes = br.ReadBytes(len);
+ var json = System.Text.Encoding.UTF8.GetString(bytes);
+ var item = JsonSerializer.Deserialize>(json)!;
+ return item;
+ }
+}
+
+public class BasicDiskVectorStore
+ : BasicDiskVectorStore, string, int>
+ where TId : notnull
+{
+ public BasicDiskVectorStore(string rootPath, IVocabularyStore vocabularyStore)
+ : base(rootPath, vocabularyStore)
+ { }
+}
diff --git a/src/Build5Nines.SharpVector/Vocabulary/BasicDiskVocabularyStore.cs b/src/Build5Nines.SharpVector/Vocabulary/BasicDiskVocabularyStore.cs
new file mode 100644
index 0000000..2b1c6cd
--- /dev/null
+++ b/src/Build5Nines.SharpVector/Vocabulary/BasicDiskVocabularyStore.cs
@@ -0,0 +1,223 @@
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Text.Json;
+using Build5Nines.SharpVector.Vocabulary;
+
+namespace Build5Nines.SharpVector.Vocabulary;
+
+/// <summary>
+/// Disk-backed vocabulary store that persists the token-to-index mapping.
+/// </summary>
+public class BasicDiskVocabularyStore : IVocabularyStore, IDisposable
+ where TKey : notnull
+{
+ // NOTE(review): generic type arguments appear stripped throughout this patch
+ // text (e.g. bare ConcurrentDictionary, Array.Empty(), Deserialize>) —
+ // verify all of them against the committed source before applying.
+
+ // On-disk layout: vocabulary.json holds the full snapshot; vocabulary.wal is
+ // an append-only write-ahead log replayed by RecoverFromWalOrIndex().
+ private readonly string _rootPath;
+ private readonly string _indexPath;
+ private readonly string _walPath;
+
+ private readonly ReaderWriterLockSlim _rwLock = new(LockRecursionPolicy.SupportsRecursion);
+
+ // _vocab is the persisted token->index map (maintained by the flusher);
+ // _cache is the read path (TryGetValue/Count) and is updated eagerly.
+ private ConcurrentDictionary _vocab = new();
+ private ConcurrentDictionary _cache = new();
+ // Token batches queued by Update() awaiting background persistence.
+ private readonly ConcurrentQueue> _pending = new();
+ private readonly CancellationTokenSource _cts = new();
+ private readonly Task _backgroundFlushTask;
+
+ public int Count => _cache.Count;
+
+ // Ensures the folder exists, replays any WAL left behind by a crash, then
+ // starts the background flush loop.
+ public BasicDiskVocabularyStore(string rootPath)
+ {
+ _rootPath = rootPath;
+ Directory.CreateDirectory(rootPath);
+ _indexPath = Path.Combine(rootPath, "vocabulary.json");
+ _walPath = Path.Combine(rootPath, "vocabulary.wal");
+ RecoverFromWalOrIndex();
+ _backgroundFlushTask = Task.Run(BackgroundFlusherAsync);
+ }
+
+ // Registers unseen tokens, assigning indices in insertion order. The batch
+ // is appended to the WAL before in-memory state changes, so a crash after
+ // this call is recoverable on the next start.
+ public void Update(IEnumerable tokens)
+ {
+ var tokenList = tokens as IList ?? tokens.ToList();
+ AppendWalRecord(tokenList);
+
+ _rwLock.EnterWriteLock();
+ try
+ {
+ foreach (var token in tokenList)
+ {
+ if (!_cache.ContainsKey(token))
+ {
+ // Next index = current size; the flusher assigns from _vocab.Count
+ // in the same FIFO batch order, so both maps should agree.
+ _cache[token] = _cache.Count;
+ }
+ }
+ _pending.Enqueue(tokenList);
+ }
+ finally
+ {
+ _rwLock.ExitWriteLock();
+ }
+ }
+
+ // Offloads the synchronous (WAL-appending) Update to the thread pool.
+ public async Task UpdateAsync(IEnumerable tokens)
+ {
+ await Task.Run(() => Update(tokens));
+ }
+
+ public bool TryGetValue(TKey token, out int index) => _cache.TryGetValue(token, out index);
+
+ // Serializes the persisted map. NOTE(review): _vocab may lag _cache until
+ // the background flusher catches up, so very recent tokens can be missed.
+ public async Task SerializeToJsonStreamAsync(Stream stream)
+ {
+ await JsonSerializer.SerializeAsync(stream, _vocab);
+ }
+
+ public async Task DeserializeFromJsonStreamAsync(Stream stream)
+ {
+ var loaded = await JsonSerializer.DeserializeAsync>(stream);
+ if (loaded != null)
+ {
+ // Fix: refresh BOTH maps, mirroring LoadIfExists. TryGetValue and
+ // Count read _cache, so replacing only _vocab left lookups stale
+ // after a deserialize.
+ _vocab = loaded;
+ _cache = new ConcurrentDictionary(loaded);
+ }
+ }
+
+ // Rewrites the snapshot file, retrying briefly on IO sharing violations
+ // from concurrent readers; rethrows after maxRetries attempts.
+ private void Persist()
+ {
+ const int maxRetries = 5;
+ int attempt = 0;
+ while (true)
+ {
+ try
+ {
+ using var fs = new FileStream(_indexPath, FileMode.Create, FileAccess.Write, FileShare.Read);
+ JsonSerializer.Serialize(fs, _vocab);
+ fs.Flush(true);
+ break;
+ }
+ catch (IOException)
+ {
+ attempt++;
+ if (attempt >= maxRetries) throw;
+ Thread.Sleep(10 * attempt);
+ }
+ }
+ }
+
+ // Loads the snapshot into both maps if a non-empty file exists.
+ private void LoadIfExists()
+ {
+ if (!File.Exists(_indexPath)) return;
+ if (new FileInfo(_indexPath).Length == 0) return;
+ using var fs = File.OpenRead(_indexPath);
+ var loaded = JsonSerializer.Deserialize>(fs);
+ if (loaded != null)
+ {
+ _vocab = loaded;
+ _cache = new ConcurrentDictionary(loaded);
+ }
+ }
+
+ // Startup recovery: load the snapshot, replay any WAL records on top of
+ // it, then truncate the WAL and persist a fresh snapshot.
+ private void RecoverFromWalOrIndex()
+ {
+ LoadIfExists();
+ if (!File.Exists(_walPath)) return;
+ using var fs = new FileStream(_walPath, FileMode.Open, FileAccess.Read, FileShare.Read);
+ using var br = new BinaryReader(fs);
+ while (fs.Position < fs.Length)
+ {
+ // Each WAL record: Int32 token count, then that many JSON-encoded tokens.
+ int count = br.ReadInt32();
+ for (int i = 0; i < count; i++)
+ {
+ var tokenJson = br.ReadString();
+ var token = JsonSerializer.Deserialize(tokenJson)!;
+ if (!_vocab.ContainsKey(token))
+ {
+ var idx = _vocab.Count;
+ _vocab[token] = idx;
+ _cache[token] = idx;
+ }
+ }
+ }
+ File.WriteAllBytes(_walPath, Array.Empty());
+ Persist();
+ }
+
+ // Durably appends one token batch to the WAL before it is applied.
+ private void AppendWalRecord(IList tokens)
+ {
+ Directory.CreateDirectory(_rootPath);
+ using var fs = new FileStream(_walPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read);
+ fs.Seek(0, SeekOrigin.End);
+ using var bw = new BinaryWriter(fs);
+ bw.Write(tokens.Count);
+ foreach (var token in tokens)
+ {
+ bw.Write(JsonSerializer.Serialize(token));
+ }
+ bw.Flush();
+ // Flush(true) forces a write-through to disk (durability of the WAL).
+ fs.Flush(true);
+ }
+
+ // Background loop: drains queued batches into _vocab, rewrites the
+ // snapshot, then truncates the WAL. Polls every 25ms when idle.
+ private async Task BackgroundFlusherAsync()
+ {
+ var token = _cts.Token;
+ while (!token.IsCancellationRequested)
+ {
+ try
+ {
+ if (_pending.IsEmpty)
+ {
+ await Task.Delay(25, token);
+ continue;
+ }
+
+ var batch = new List();
+ while (_pending.TryDequeue(out var tks))
+ {
+ batch.AddRange(tks);
+ }
+
+ // Apply to persistent vocab
+ foreach (var tk in batch)
+ {
+ if (!_vocab.ContainsKey(tk))
+ {
+ _vocab[tk] = _vocab.Count;
+ }
+ }
+
+ Persist();
+ File.WriteAllBytes(_walPath, Array.Empty());
+ }
+ catch (OperationCanceledException) { }
+ // NOTE(review): this Delay uses the same token and can itself throw on
+ // cancellation, faulting the task — Dispose tolerates that via Wait/catch.
+ catch { await Task.Delay(100, token); }
+ }
+ }
+
+ // Cancels the flusher (bounded wait), applies any still-queued batches
+ // synchronously, persists a final snapshot, then releases primitives.
+ // Best-effort: exceptions are swallowed so Dispose never throws.
+ public void Dispose()
+ {
+ try
+ {
+ _cts.Cancel();
+ _backgroundFlushTask.Wait(1500);
+ }
+ catch { }
+ finally
+ {
+ try
+ {
+ var batch = new List();
+ while (_pending.TryDequeue(out var tks)) batch.AddRange(tks);
+ foreach (var tk in batch)
+ {
+ if (!_vocab.ContainsKey(tk))
+ {
+ _vocab[tk] = _vocab.Count;
+ }
+ }
+ Persist();
+ File.WriteAllBytes(_walPath, Array.Empty());
+ }
+ catch { }
+ _cts.Dispose();
+ _rwLock.Dispose();
+ GC.SuppressFinalize(this);
+ }
+ }
+}
diff --git a/src/SharpVectorPerformance/DiskVectorDatabasePerformance.cs b/src/SharpVectorPerformance/DiskVectorDatabasePerformance.cs
new file mode 100644
index 0000000..ad3b474
--- /dev/null
+++ b/src/SharpVectorPerformance/DiskVectorDatabasePerformance.cs
@@ -0,0 +1,73 @@
+namespace SharpVectorPerformance;
+
+using System.Diagnostics;
+using Build5Nines.SharpVector;
+using Build5Nines.SharpVector.Id;
+using Build5Nines.SharpVector.Preprocessing;
+using Build5Nines.SharpVector.VectorCompare;
+using Build5Nines.SharpVector.Vectorization;
+using Build5Nines.SharpVector.VectorStore;
+using Build5Nines.SharpVector.Vocabulary;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Running;
+
+// BenchmarkDotNet suite for the disk-backed vector database: insert, search,
+// and delete workloads against a throwaway temp directory.
+[MemoryDiagnoser]
+public class DiskVectorDatabasePerformance
+{
+ private BasicDiskVectorDatabase? _db;
+ // Unique temp folder per run so concurrent/repeated runs don't collide.
+ private string _rootPath = Path.Combine(Path.GetTempPath(), "SharpVectorPerf", Guid.NewGuid().ToString("N"));
+
+ [GlobalSetup]
+ public void Setup()
+ {
+ Directory.CreateDirectory(_rootPath);
+ _db = new BasicDiskVectorDatabase(_rootPath);
+ }
+
+ // NOTE(review): _db is not disposed here; if BasicDiskVectorDatabase is
+ // IDisposable its background flusher may still hold file handles, making
+ // the Directory.Delete below fail (silently swallowed) — verify.
+ [GlobalCleanup]
+ public void Cleanup()
+ {
+ try { if (Directory.Exists(_rootPath)) Directory.Delete(_rootPath, recursive: true); } catch { }
+ }
+
+ [Params(25)]
+ public int ItemCount;
+
+ // Measures parallel inserts of small text items.
+ // NOTE(review): Random.Shared.Next makes the workload non-deterministic
+ // between runs — consider a fixed seed for comparable benchmark numbers.
+ [Benchmark]
+ public async Task AddTexts()
+ {
+ var indices = Enumerable.Range(0, ItemCount);
+ await Parallel.ForEachAsync(indices, async (i, ct) =>
+ {
+ var text = $"Sample text {i} fox {Random.Shared.Next(0, 100)}";
+ await _db!.AddTextAsync(text, "meta");
+ });
+ }
+
+ // Measures a similarity search; lazily seeds 500 rows on first invocation,
+ // so the first measured iteration includes that seeding cost.
+ [Benchmark]
+ public async Task Search()
+ {
+ // Ensure some data
+ if (!_db!.GetIds().Any())
+ {
+ var indices = Enumerable.Range(0, 500);
+ await Parallel.ForEachAsync(indices, async (i, ct) =>
+ {
+ await _db.AddTextAsync($"quick brown fox {i}", null);
+ });
+ }
+ var results = await _db.SearchAsync("quick fox");
+ // Touch results to avoid dead-code elimination
+ _ = results.Texts.Take(10).Count();
+ }
+
+ // Measures deletion of up to 50 currently-existing ids.
+ [Benchmark]
+ public void DeleteIds()
+ {
+ var ids = _db!.GetIds().Take(Math.Min(50, _db.GetIds().Count())).ToList();
+ foreach (var id in ids)
+ {
+ _db.DeleteText(id);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/SharpVectorPerformance/Program.cs b/src/SharpVectorPerformance/Program.cs
index 375e8d0..281466a 100644
--- a/src/SharpVectorPerformance/Program.cs
+++ b/src/SharpVectorPerformance/Program.cs
@@ -8,5 +8,6 @@ public class Program
public static void Main(string[] args)
{
BenchmarkRunner.Run();
+ // NOTE(review): the generic benchmark-class argument appears stripped from
+ // this patch text — verify this call names DiskVectorDatabasePerformance.
+ BenchmarkRunner.Run();
}
}
\ No newline at end of file
diff --git a/src/SharpVectorTest/DiskVectorDatabaseTests.cs b/src/SharpVectorTest/DiskVectorDatabaseTests.cs
new file mode 100644
index 0000000..458831f
--- /dev/null
+++ b/src/SharpVectorTest/DiskVectorDatabaseTests.cs
@@ -0,0 +1,71 @@
+namespace SharpVectorTest;
+
+using System;
+using System.IO;
+using System.Linq;
+using System.Threading.Tasks;
+using Build5Nines.SharpVector;
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+
+// MSTest coverage for BasicDiskVectorDatabase: durability across reopen,
+// similarity search, and deletion persistence.
+// NOTE(review): generic type arguments appear stripped from this patch text
+// (e.g. Assert.ThrowsException(...)) — verify against the committed source.
+[TestClass]
+public class DiskVectorDatabaseTests
+{
+ // Fresh unique temp directory per test to isolate on-disk state.
+ private static string CreateTempDir()
+ {
+ var dir = Path.Combine(Path.GetTempPath(), "SharpVectorTests", Guid.NewGuid().ToString("N"));
+ Directory.CreateDirectory(dir);
+ return dir;
+ }
+
+ [TestMethod]
+ public async Task AddAndGetText_PersistsToDisk()
+ {
+ var root = CreateTempDir();
+ var db = new BasicDiskVectorDatabase(root);
+
+ var id = await db.AddTextAsync("hello world", "meta1");
+ var item = db.GetText(id);
+ Assert.AreEqual("hello world", item.Text);
+ Assert.AreEqual("meta1", item.Metadata);
+
+ // Recreate DB and ensure data is still there
+ // NOTE(review): the first db is not disposed before reopening the same
+ // folder, so its background flusher may still be running — consider
+ // disposing first to make the reopen deterministic.
+ var db2 = new BasicDiskVectorDatabase(root);
+ var item2 = db2.GetText(id);
+ Assert.AreEqual("hello world", item2.Text);
+ Assert.AreEqual("meta1", item2.Metadata);
+ }
+
+ [TestMethod]
+ public async Task Search_ReturnsSimilarResults()
+ {
+ var root = CreateTempDir();
+ var db = new BasicDiskVectorDatabase(root);
+
+ await db.AddTextAsync("The quick brown fox", "a");
+ await db.AddTextAsync("Jumps over the lazy dog", "b");
+ await db.AddTextAsync("An unrelated sentence", "c");
+
+ // No threshold/paging: just assert the overlapping-term document surfaces.
+ var results = await db.SearchAsync("quick fox", threshold: null, pageIndex: 0, pageCount: null);
+ Assert.IsTrue(results.Texts.Any());
+ Assert.IsTrue(results.Texts.Any(r => r.Text.Contains("quick", StringComparison.OrdinalIgnoreCase)));
+ }
+
+ [TestMethod]
+ public async Task Delete_RemovesFromIndexButKeepsFile()
+ {
+ var root = CreateTempDir();
+ var db = new BasicDiskVectorDatabase(root);
+ var id = await db.AddTextAsync("to be deleted", "m");
+ var existing = db.GetText(id);
+ Assert.AreEqual("to be deleted", existing.Text);
+
+ db.DeleteText(id);
+ Assert.IsFalse(db.GetIds().Contains(id));
+ Assert.ThrowsException(() => db.GetText(id));
+
+ // Reopen and ensure deletion persists
+ var db2 = new BasicDiskVectorDatabase(root);
+ Assert.IsFalse(db2.GetIds().Contains(id));
+ Assert.ThrowsException(() => db2.GetText(id));
+ }
+}
\ No newline at end of file