diff --git a/.github/workflows/ghpages-mkdocs.yml b/.github/workflows/ghpages-mkdocs.yml index 0efa2c0..e26d799 100644 --- a/.github/workflows/ghpages-mkdocs.yml +++ b/.github/workflows/ghpages-mkdocs.yml @@ -1,9 +1,10 @@ -name: Deploy MKDocs site to Pages +name: Build MKDocs Site on: - push: + pull_request: branches: - main + - dev paths: - .github/workflows/ghpages-mkdocs.yml - docs/** @@ -63,14 +64,3 @@ jobs: uses: actions/upload-pages-artifact@v3 with: path: ./docs/_site - - deploy: - runs-on: ubuntu-latest - needs: build - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 diff --git a/.github/workflows/mkdocs-build.yml b/.github/workflows/mkdocs-build.yml new file mode 100644 index 0000000..6419427 --- /dev/null +++ b/.github/workflows/mkdocs-build.yml @@ -0,0 +1,76 @@ +name: Deploy MKDocs Site to Pages + +on: + push: + branches: + - main + paths: + - .github/workflows/ghpages-mkdocs.yml + - docs/** + - mkdocs.yml + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + with: + fetch-depth: 0 # Fetch all history for all branches and tags, not just the default branch. + # This is needed to ensure that the commit SHA is available for the deployment. 
+ # See + sparse-checkout: | + docs + mkdocs.yml + .github/workflows/ghpages-mkdocs.yml + + - name: Setup pages + id: pages + uses: actions/configure-pages@v5 + + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' # Use the latest version of Python 3 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r ./docs/requirements.txt + + - name: Build documentation + # Outputs to the './_site' directory by default + run: | + mkdocs build --site-dir ./_site --config-file ./docs/mkdocs.yml + + - name: Upload artifact + # Automatically upload an artifact from the './_site' directory by default + uses: actions/upload-pages-artifact@v3 + with: + path: ./docs/_site + + deploy: + runs-on: ubuntu-latest + needs: build + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index 188dbca..868ef80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v2.1.1 + +Add: + +- Add optional `filter` parameter to `.Search` and `.SearchAsync` methods that is of type `Func` that is called for each text item in the database for more advanced filtering prior to performing vector similarity search and returning results. If undefined or `null` it's ignored. + ## v2.1.0 Add: diff --git a/docs/docs/concepts/index.md b/docs/docs/concepts/index.md index 51a6f1d..638e924 100644 --- a/docs/docs/concepts/index.md +++ b/docs/docs/concepts/index.md @@ -2,7 +2,7 @@ title: Concepts description: Understand the core concepts behind SharpVector, from vector similarity to embedding strategies and in-memory architecture. 
--- -# Concepts +# :octicons-light-bulb-24: Concepts ## What is a Vector Database? diff --git a/docs/docs/embeddings/index.md b/docs/docs/embeddings/index.md index 48f4bab..09d1a9d 100644 --- a/docs/docs/embeddings/index.md +++ b/docs/docs/embeddings/index.md @@ -1,4 +1,7 @@ -# Embeddings +--- +title: Embeddings +--- +# :fontawesome-solid-square-binary: Embeddings `Build5Nines.SharpVector` includes the following support for using AI Models to generate the text embeddings for the vector database instead of generating them locally. The use of an AI Embeddings model can greatly increase the quality of the semantic search. diff --git a/docs/docs/get-started/data-management/index.md b/docs/docs/get-started/data-management/index.md index 3e0e78e..ef45075 100644 --- a/docs/docs/get-started/data-management/index.md +++ b/docs/docs/get-started/data-management/index.md @@ -2,7 +2,7 @@ title: Data Management --- -# Data Management +# :material-database-edit-outline: Data Management Since `Build5Nines.SharpVector` is a database, it also has data management methods available. These methods enable you to add, remove, and update the text documents that are vectorized and indexed within the semantic database. diff --git a/docs/docs/get-started/index.md b/docs/docs/get-started/index.md index 5fbb4e9..ebd5d0a 100644 --- a/docs/docs/get-started/index.md +++ b/docs/docs/get-started/index.md @@ -2,7 +2,7 @@ title: Get Started description: Get up and running with SharpVector in minutes. Learn how to install, initialize, and begin storing and searching vectorized text data. --- -# Get Started +# :octicons-rocket-24: Get Started It's really easy to get started with using `Build5Nines.SharpVector`. Simply follow the below steps. 
diff --git a/docs/docs/get-started/metadata/index.md b/docs/docs/get-started/metadata/index.md index 6ee019b..d648380 100644 --- a/docs/docs/get-started/metadata/index.md +++ b/docs/docs/get-started/metadata/index.md @@ -1,7 +1,7 @@ --- title: Metadata --- -# Metadata +# :material-database-cog-outline: Metadata The `Build5Nines.SharpVector` vector database enables semantic search for `Text` that is stored in the database. Being able to semantically search text is an extremely useful way to lookup more information related to the text. For this purpose, `Metadata` is stored alongside the `Text` within the vector database. This way, when `Text` is found when performing a semantic search, then the matching `Metadata` is also retrieved. diff --git a/docs/docs/get-started/search/index.md b/docs/docs/get-started/search/index.md new file mode 100644 index 0000000..8980b07 --- /dev/null +++ b/docs/docs/get-started/search/index.md @@ -0,0 +1,106 @@ +--- +title: Semantic Search +--- +# :material-file-search: Semantic Search + +Once text items and their associated metadata have been added to the vector database, the database can be used for semantic search to find matching text items for a given query. + +The `BasicMemoryVectorDatabase` and `MemoryVectorDatabase<>` classes both contain `.Search` and `.SearchAsync` methods that can be used to perform semantic search on the database: + +=== "Sync" + + ```csharp + var query = "some text to search"; + var results = vdb.Search(query); + ``` + +=== "Async" + + ```csharp + var query = "some text to search"; + var results = await vdb.SearchAsync(query); + ``` + +## Metadata Filters + +The `.Search` and `.SearchAsync` methods also include the ability to pre-filter the search results based on a boolean evaluation of the `Metadata` for the text item. This check is run before the vector similarity search is performed, and can help increase search performance on large datasets. 
+ +Here are a couple examples of using the `filter` parameter to perform `Metadata` filtering when performing semantic searches: + +=== "Sync" + + ```csharp + var vdb = new BasicMemoryVectorDatabase(); + + // load text and metadata into database + + var query = "some text to search"; + var results = vdb.Search( + query, + filter: (metadata) => { + // perform some operation to check metadata + // return true or false + return metadata.Contains("B59"); + } + ); + ``` + +=== "Async" + + ```csharp + var vdb = new MemoryVectorDatabase(); + + // load text and metadata into database + + var query = "some text to search"; + var results = vdb.SearchAsync( + query, + filter: async (metadata) => { + // perform some operation to check metadata + // return true or false + return metadata.LastName == "Pietschmann"; + } + ); + ``` + +!!! info "OpenAI and Ollama Support" + + This functionality works the same with both [:simple-openai: OpenAI and :simple-ollama: Ollama supported vector databases](../../embeddings/index.md) too. + +## Paging + +The `.Search` and `.SearchAsync` methods also include the ability to perform paging on the text items returned from the semantic search. This is performed after the similarity search and the `filter` has been applied to the search results. This is done using the optional `pageCount` and `pageIndex` parameters. 
+ +Here are a couple examples of using the `pageCount` and `pageIndex` parameters to perform paging with the semantic search results: + +=== "Sync" + + ```csharp + var vdb = new BasicMemoryVectorDatabase(); + + // load text and metadata into database + + var query = "some text to search"; + var results = vdb.Search( + query, + pageIndex: 0, // return first page of results (default: 0) + pageCount: 6 // limit length of this page of results (default: unlimited) + ); + ``` + +=== "Async" + + ```csharp + var vdb = new MemoryVectorDatabase(); + + // load text and metadata into database + + var query = "some text to search"; + var results = vdb.SearchAsync( + query, + pageIndex: 0, // return first page of results (default: 0) + pageCount: 6 // limit length of this page of results (default: unlimited) + ); + ``` + +The `pageCount` and `pageIndex` parameters are optional, and can be used individually or together. diff --git a/docs/docs/index.md b/docs/docs/index.md index 0ddf706..2b32c5c 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -2,7 +2,7 @@ title: Discover description: The lightweight, in-memory, semantic search, text vector database for .NET that powers intelligent search and recommendation features. --- -# Discover Build5Nines.SharpVector +# :fontawesome-regular-compass: Discover Build5Nines.SharpVector **Build5Nines.SharpVector** is the lightweight, in-memory, semantic search, text vector database built for .NET applications. It enables fast and flexible vector-based similarity search for text data — ideal for search engines, recommendation systems, semantic analysis, and AI-enhanced features. 
@@ -27,6 +27,8 @@ Vector databases are used with Semantic Search and [Generative AI](https://build While there are lots of large databases that can be used to build Vector Databases (like Azure CosmosDB, PostgreSQL w/ pgvector, Azure AI Search, Elasticsearch, and more), there are not many options for a lightweight vector database that can be embedded into any .NET application. Build5Nines SharpVector is the lightweight in-memory Text Vector Database for use in any .NET application that you're looking for! + + --- > "For the in-memory vector database, we're using Build5Nines.SharpVector, an excellent open-source project by Chris Pietschmann. SharpVector makes it easy to store and retrieve vectorized data, making it an ideal choice for our sample RAG implementation." diff --git a/docs/docs/license/index.md b/docs/docs/license/index.md index cfa5b8b..868d949 100644 --- a/docs/docs/license/index.md +++ b/docs/docs/license/index.md @@ -4,7 +4,7 @@ description: Review the MIT license terms for using and contributing to the Shar date: 2025-04-13 --- -# License +# :octicons-file-badge-24: License ```text MIT License diff --git a/docs/docs/persistence/index.md b/docs/docs/persistence/index.md index c2d4c73..55559ac 100644 --- a/docs/docs/persistence/index.md +++ b/docs/docs/persistence/index.md @@ -1,4 +1,7 @@ -# Data Persistence +--- +title: Data Persistence +--- +# :octicons-file-24: Data Persistence The `Build5Nines.SharpVector` library provides easy-to-use methods for saving a memory-based vector database to a file or stream and loading it again later. This is particularly useful for caching indexed content between runs, deploying pre-built vector stores, or shipping databases with your application. 
diff --git a/docs/docs/resources/index.md b/docs/docs/resources/index.md index 106d493..acd27e2 100644 --- a/docs/docs/resources/index.md +++ b/docs/docs/resources/index.md @@ -3,7 +3,7 @@ title: Resources description: Dive deeper with curated resources, links, and tools for working with vector databases, semantic search, and SharpVector. --- -# Resources +# :octicons-link-24: Resources ## Tutorials diff --git a/docs/docs/samples/index.md b/docs/docs/samples/index.md index 953a906..509f8f5 100644 --- a/docs/docs/samples/index.md +++ b/docs/docs/samples/index.md @@ -3,7 +3,7 @@ title: Samples description: Explore real-world code samples to see SharpVector in action. Build search engines, intelligent note apps, and more. --- -# Samples +# :material-run-fast: Samples ## Sample Console App diff --git a/docs/docs/text-chunking/index.md b/docs/docs/text-chunking/index.md index f961381..36468aa 100644 --- a/docs/docs/text-chunking/index.md +++ b/docs/docs/text-chunking/index.md @@ -2,7 +2,7 @@ title: Text Chunking description: Learn how to break large documents into smaller chunks to improve vector search relevance and optimize embedding performance. --- -# Text Chunking +# :material-script-text: Text Chunking **Text chunking** is the process of breaking up large documents into smaller segments ("chunks") before embedding and storing them in a vector database. This allows for more accurate semantic search and improves performance in applications that deal with large bodies of text. @@ -124,7 +124,10 @@ The `TextDataLoader` class can be used to load documents into the } ``` -> 🧠 Tip: Use chunking method and size that best aligns with your content type and retrieval goals. +!!! info "Optimization Tip" + Use the chunking method and size that best align with your content type and retrieval goals. For larger documents, text chunking will be required to get the best semantic search results returned from the vector database. 
+ + Some experimentation on your data set may be required to find the text chunking strategy that works best for your solution. --- diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 103ad8c..34241dd 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -64,7 +64,7 @@ markdown_extensions: - md_in_html - toc: permalink: true - toc_depth: 3 + toc_depth: 2 - pymdownx.critic - pymdownx.caret - pymdownx.keys @@ -98,6 +98,11 @@ plugins: - git-committers: # mkdocs-git-committers-plugin-2 plugin to show contributors on footer of repository: Build5Nines/SharpVector branch: main + # - with-pdf: + # cover_subtitle: "Lightweight, In-memory, Semantic Search, Text Vector Database to embed in any .NET Application" + # author: Build5Nines LLC + # output_path: build5nines-sharpvector.pdf + extra: # consent: @@ -137,18 +142,10 @@ nav: - Prerequisites: get-started/#prerequisites - Install Nuget Package: get-started/#install-nuget-package - Basic Example: get-started/#basic-example - - Metadata: - - get-started/metadata/index.md - - Adding Metadata: get-started/metadata/#adding-metadata - - JSON and String Metadata: get-started/metadata/#json-and-string-metadata - - Custom Metadata Type: get-started/metadata/#custom-metadata-type - - - Data Management: - - get-started/data-management/index.md - - Get Text Item Id: get-started/data-management/#get-text-item-id - - Get Item By Id: get-started/data-management/#get - - Update Item: get-started/data-management/#update - - Delete Item: get-started/data-management/#delete + - Metadata: get-started/metadata/index.md + - Semantic search: get-started/search/index.md + - Data Management: get-started/data-management/index.md + - Concepts: - concepts/index.md - What is a Vector Database?: concepts/#what-is-a-vector-database diff --git a/docs/requirements.txt b/docs/requirements.txt index da7e448..9d201f4 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,3 +5,4 @@ pymdown-extensions markdown-include 
mkdocs-git-committers-plugin-2 mkdocs-git-revision-date-localized-plugin +mkdocs-with-pdf diff --git a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj index 9cc4a28..2ac8e59 100644 --- a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj +++ b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj @@ -9,7 +9,7 @@ Build5Nines.SharpVector https://sharpvector.build5nines.com https://github.com/Build5Nines/SharpVector - 2.1.0 + 2.1.1 Lightweight In-memory Vector Database to embed in any .NET Applications Copyright (c) 2025 Build5Nines LLC README.md diff --git a/src/Build5Nines.SharpVector/IVectorDatabase.cs b/src/Build5Nines.SharpVector/IVectorDatabase.cs index 51c12d2..47c163e 100644 --- a/src/Build5Nines.SharpVector/IVectorDatabase.cs +++ b/src/Build5Nines.SharpVector/IVectorDatabase.cs @@ -80,8 +80,9 @@ public interface IVectorDatabase /// The similarity threshold to filter by. /// The page index of the search results. Default is 0. /// The number of search results per page. Default is Null and returns all results. - /// - IVectorTextResult Search(TDocument queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null); + /// A filter function to apply to the metadata of the results. + /// The search results as an IVectorTextResult object. + IVectorTextResult Search(TDocument queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null); /// /// Performs an asynchronous search vector search to find the top N most similar texts to the given text @@ -90,8 +91,9 @@ public interface IVectorDatabase /// The similarity threshold to filter by. /// The page index of the search results. Default is 0. /// The number of search results per page. Default is Null and returns all results. - /// - Task> SearchAsync(TDocument queryText, float? threshold = null, int pageIndex = 0, int? 
pageCount = null); + /// A filter function to apply to the metadata of the results. + /// The search results as an IVectorTextResult object. + Task> SearchAsync(TDocument queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null); [Obsolete("Use SerializeToBinaryStreamAsync Instead")] diff --git a/src/Build5Nines.SharpVector/MemoryVectorDatabase.cs b/src/Build5Nines.SharpVector/MemoryVectorDatabase.cs index 3d84ec9..e08b425 100644 --- a/src/Build5Nines.SharpVector/MemoryVectorDatabase.cs +++ b/src/Build5Nines.SharpVector/MemoryVectorDatabase.cs @@ -50,6 +50,11 @@ public override void DeserializeFromJsonStream(Stream stream) DeserializeFromBinaryStream(stream); } + /// + /// Deserializes the database from a binary stream. + /// + /// + /// public override async Task DeserializeFromBinaryStreamAsync(Stream stream) { await base.DeserializeFromBinaryStreamAsync(stream); @@ -58,6 +63,10 @@ public override async Task DeserializeFromBinaryStreamAsync(Stream stream) _idGenerator = new IntIdGenerator(VectorStore.GetIds().Max()); } + /// + /// Deserializes the database from a binary stream. + /// + /// public override void DeserializeFromBinaryStream(Stream stream) { base.DeserializeFromBinaryStream(stream); diff --git a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs index 8498a10..53cee62 100644 --- a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs +++ b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs @@ -15,6 +15,19 @@ namespace Build5Nines.SharpVector; +/// +/// Base class for a memory vector database. +/// +/// +/// +/// +/// +/// +/// +/// +/// +/// +/// public abstract class MemoryVectorDatabaseBase : IVectorDatabase where TId : notnull @@ -188,10 +201,18 @@ public void UpdateTextAndMetadata(TId id, TVocabularyKey text, TMetadata metadat /// The query prompt to search by. /// The highest number of results to show. /// The similarity threshold. 
Only return items greater or equal to the threshold. Null returns all. + /// The page index of the search results. Default is 0. + /// The number of search results per page. Default is Null and returns all results. + /// A filter function to apply to the metadata of each result. /// - public IVectorTextResult Search(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null) + public IVectorTextResult Search(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null) { - return SearchAsync(queryText, threshold, pageIndex, pageCount).Result; + Func>? filterToUse = null; + if (filter != null) + { + filterToUse = (metadata) => Task.FromResult(filter(metadata)); + } + return SearchAsync(queryText, threshold, pageIndex, pageCount, filterToUse).Result; } /// @@ -201,10 +222,11 @@ public IVectorTextResult Search(TVocabularyKey q /// The similarity threshold to filter by. /// The page index of the search results. Default is 0. /// The number of search results per page. Default is Null and returns all results. + /// A filter function to apply to the metadata of each result. /// - public async Task> SearchAsync(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null) + public async Task> SearchAsync(TVocabularyKey queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null) { - var similarities = await CalculateVectorComparisonAsync(queryText, threshold); + var similarities = await CalculateVectorComparisonAsync(queryText, threshold, filter); similarities = await _vectorComparer.SortAsync(similarities); @@ -221,7 +243,7 @@ public async Task> SearchAsync return new VectorTextResult(totalCountFoundInSearch, pageIndex, pageCount.HasValue ? pageCount.Value : 1, resultsToReturn); } - private async Task>> CalculateVectorComparisonAsync(TVocabularyKey queryText, float? 
threshold = null) + private async Task>> CalculateVectorComparisonAsync(TVocabularyKey queryText, float? threshold = null, Func>? filter = null) { var queryTokens = _textPreprocessor.TokenizeAndPreprocess(queryText); float[] queryVector = _vectorizer.GenerateVectorFromTokens(VectorStore.VocabularyStore, queryTokens); @@ -237,15 +259,18 @@ private async Task>(); await foreach (KeyValuePair> kvp in VectorStore) { - var item = kvp.Value; - float vectorComparisonValue = await _vectorComparer.CalculateAsync(_vectorizer.NormalizeVector(queryVector, desiredLength), _vectorizer.NormalizeVector(item.Vector, desiredLength)); - - if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue)) + if (filter == null || await filter(kvp.Value.Metadata)) { - var id = kvp.Key; - results.Add( - new VectorTextResultItem(id, item, vectorComparisonValue) - ); + var item = kvp.Value; + float vectorComparisonValue = await _vectorComparer.CalculateAsync(_vectorizer.NormalizeVector(queryVector, desiredLength), _vectorizer.NormalizeVector(item.Vector, desiredLength)); + + if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue)) + { + var id = kvp.Key; + results.Add( + new VectorTextResultItem(id, item, vectorComparisonValue) + ); + } } } return results; @@ -361,7 +386,14 @@ IEnumerator IEnumerable.GetEnumerator() - +/// +/// Base class for a memory vector database. +/// +/// +/// +/// +/// +/// public abstract class MemoryVectorDatabaseBase : IMemoryVectorDatabase, IVectorDatabase where TId : notnull @@ -541,10 +573,18 @@ public void UpdateTextAndMetadata(TId id, string text, TMetadata metadata) /// The query prompt to search by. /// The highest number of results to show. /// The similarity threshold. Only return items greater or equal to the threshold. Null returns all. - /// - public IVectorTextResult Search(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null) + /// The page index of the search results. Default is 0. 
+ /// The number of search results per page. Default is Null and returns all results. + /// A filter function to apply to the metadata of each result. + /// The search results as an IVectorTextResult object. + public IVectorTextResult Search(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func? filter = null) { - return SearchAsync(queryText, threshold, pageIndex, pageCount).Result; + Func>? filterToUse = null; + if (filter != null) + { + filterToUse = (metadata) => Task.FromResult(filter(metadata)); + } + return SearchAsync(queryText, threshold, pageIndex, pageCount, filterToUse).Result; } /// @@ -554,10 +594,11 @@ public IVectorTextResult Search(string queryText, float? /// The similarity threshold to filter by. /// The page index of the search results. Default is 0. /// The number of search results per page. Default is Null and returns all results. - /// - public async Task> SearchAsync(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null) + /// A filter function to apply to the metadata of each result. + /// The search results as an IVectorTextResult object. + public async Task> SearchAsync(string queryText, float? threshold = null, int pageIndex = 0, int? pageCount = null, Func>? filter = null) { - var similarities = await CalculateVectorComparisonAsync(queryText, threshold); + var similarities = await CalculateVectorComparisonAsync(queryText, threshold, filter); similarities = await _vectorComparer.SortAsync(similarities); @@ -574,7 +615,7 @@ public async Task> SearchAsync(string return new VectorTextResult(totalCountFoundInSearch, pageIndex, pageCount.HasValue ? pageCount.Value : 1, resultsToReturn); } - private async Task>> CalculateVectorComparisonAsync(string queryText, float? threshold = null) + private async Task>> CalculateVectorComparisonAsync(string queryText, float? threshold = null, Func>? 
filter = null) { var queryVector = await EmbeddingsGenerator.GenerateEmbeddingsAsync(queryText); @@ -586,16 +627,19 @@ private async Task>> C var results = new ConcurrentBag>(); await foreach (var kvp in VectorStore) { - var item = kvp.Value; + if (filter == null || await filter(kvp.Value.Metadata)) + { + var item = kvp.Value; - float vectorComparisonValue = await _vectorComparer.CalculateAsync(queryVector, item.Vector); + float vectorComparisonValue = await _vectorComparer.CalculateAsync(queryVector, item.Vector); - if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue)) - { - var id = kvp.Key; - results.Add( - new VectorTextResultItem(id, item, vectorComparisonValue) - ); + if (_vectorComparer.IsWithinThreshold(threshold, vectorComparisonValue)) + { + var id = kvp.Key; + results.Add( + new VectorTextResultItem(id, item, vectorComparisonValue) + ); + } } } return results; diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs index e15f183..c7117c7 100644 --- a/src/SharpVectorTest/VectorDatabaseTests.cs +++ b/src/SharpVectorTest/VectorDatabaseTests.cs @@ -1024,6 +1024,77 @@ public void BasicMemoryVectorDatabase_LoopThroughAllTexts_01() vdb.UpdateText(item.Id, item.Text + " - Updated"); } } + + [TestMethod] + public void BasicMemoryVectorDatabase_Search_01() + { + var vdb = new BasicMemoryVectorDatabase(); + + // // Load Vector Database with some sample text + vdb.AddText("The 👑 King", "metadata1"); + vdb.AddText("It's 🔥 Fire", "metadata2"); + vdb.AddText("👑🔥 🏕️", "metadata3"); + + var results = vdb.Search("🔥👑🏕️", pageCount: 1, filter: (metadata) => { + return metadata == "metadata1"; + }); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.AreEqual("The 👑 King", results.Texts.First().Text); + Assert.AreEqual(1, results.Texts.First().Id); + Assert.AreEqual("metadata1", results.Texts.First().Metadata); + } + + [TestMethod] + public async Task BasicMemoryVectorDatabase_SearchAsync_01() + { + var vdb 
= new BasicMemoryVectorDatabase(); + + // // Load Vector Database with some sample text + vdb.AddText("The 👑 King", "metadata1"); + vdb.AddText("It's 🔥 Fire", "metadata2"); + vdb.AddText("👑🔥 🏕️", "metadata3"); + +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + var results = await vdb.SearchAsync("🔥👑🏕️", pageCount: 1, filter: async (metadata) => { + return metadata == "metadata1"; + }); +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + + Assert.AreEqual(1, results.Texts.Count()); + Assert.AreEqual("The 👑 King", results.Texts.First().Text); + Assert.AreEqual(1, results.Texts.First().Id); + Assert.AreEqual("metadata1", results.Texts.First().Metadata); + } + + [TestMethod] + public async Task BasicMemoryVectorDatabase_SearchAsync_02() + { + var vdb = new MemoryVectorDatabase(); + + // // Load Vector Database with some sample text + vdb.AddText("The 👑 King", "metadata1"); + vdb.AddText("It's 🔥 Fire", "metadata2"); + vdb.AddText("👑🔥 🏕️", "metadata3"); + + var query = "🔥👑🏕️"; + var results = await vdb.SearchAsync( + query, + filter: BasicMemoryVectorDatabase_SearchAsync_02_Filter + ); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.AreEqual("The 👑 King", results.Texts.First().Text); + Assert.AreEqual(1, results.Texts.First().Id); + Assert.AreEqual("metadata1", results.Texts.First().Metadata); + } + +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + private async Task BasicMemoryVectorDatabase_SearchAsync_02_Filter(string? metadata) +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + { + return metadata == "metadata1"; + } } public class MockMemoryVectorDatabase