From 2bfd909967a410923aded6200e67b587d5dcae0b Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Thu, 22 Jan 2026 11:13:39 +0000 Subject: [PATCH 01/11] feat(indexers): Added configurable scoring metric attribute to VectorStore --- src/classifai/indexers/main.py | 36 +++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 6f87380..aada746 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -32,6 +32,7 @@ import shutil import time import uuid +from typing import Literal import numpy as np import polars as pl @@ -53,6 +54,21 @@ logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING) +def metricvalid(metric: str): + """Test that the given metric is a valid option. + + Args: + metric (str): The selected metric for the VectorStore + + Raises: + ValueError: If value is not in ["cosine", "dotprod", "l2"] + + """ + valid_metrics = ["cosine", "dotprod", "l2"] + if metric not in valid_metrics: + raise ValueError(f"The scoring metric input '{metric}' is not in the valid metrics {valid_metrics}") + + class VectorStore: """A class to model and create 'VectorStore' objects for building and searching vector databases from CSV text files. @@ -60,6 +76,7 @@ class VectorStore: file_name (str): the original file with the knowledgebase to build the vector store data_type (str): the data type of the original file (curently only csv supported) vectoriser (object): A Vectoriser object from the corresponding ClassifAI Pacakge module + scoring_metric(Literal["cosine", "dotprod", "l2"]): The metric to use for scoring batch_size (int): the batch size to pass to the vectoriser when embedding meta_data (dict[str:type]): key-value pairs of metadata to extract from the input file and their correpsonding types output_dir (str): the path to the output directory where the VectorStore will be saved @@ -75,6 +92,7 @@ def __init__( # noqa: PLR0913 file_name, data_type, vectoriser, + scoring_metric: Literal["cosine", "dotprod", "l2"] = "cosine", batch_size=8, meta_data=None, output_dir=None, @@ -89,6 +107,7 @@ def __init__( # noqa: PLR0913 data_type (str): The type of input data (currently supports only "csv"). vectoriser (object): The vectoriser object used to transform text into vector embeddings. + scoring_metric(Literal["cosine", "dotprod", "l2"]): The metric to use for scoring batch_size (int, optional): The batch size for processing the input file and batching to vectoriser. Defaults to 8. meta_data (dict, optional): key,value pair metadata column names to extract from the input file and their types. @@ -107,6 +126,7 @@ def __init__( # noqa: PLR0913 self.file_name = file_name self.data_type = data_type self.vectoriser = vectoriser + self.scoring_metric = scoring_metric self.batch_size = batch_size self.meta_data = meta_data if meta_data is not None else {} self.output_dir = output_dir @@ -119,6 +139,9 @@ def __init__( # noqa: PLR0913 if self.data_type not in ["csv"]: raise ValueError(f"Data type '{self.data_type}' not supported. 
Choose from ['csv'].") + ## validate scoring metric + metricvalid(self.scoring_metric) + if self.output_dir is None: logging.info("No output directory specified, attempting to use input file name as output folder name.") @@ -390,6 +413,12 @@ def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> V # Convert the current batch of queries to vectors query_vectors = self.vectoriser.transform(query_text_batch) + ## determine proper metric to use + if self.scoring_metric == "dotprod": + print("scoring with dotprod") + if self.scoring_metric == "cosine": + print("scoring with cosine") + # Compute cosine similarity between the query batch and document vectors cosine = query_vectors @ self.vectors["embeddings"].to_numpy().T @@ -461,7 +490,7 @@ def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> V return result_df @classmethod - def from_filespace(cls, folder_path, vectoriser): + def from_filespace(cls, folder_path, vectoriser, scoring_metric: Literal["cosine", "dotprod", "l2"] = "cosine"): """Creates a `VectorStore` instance from stored metadata and Parquet files. This method reads the metadata and vectors from the specified folder, validates the contents, and initializes a `VectorStore` object with the @@ -475,6 +504,7 @@ def from_filespace(cls, folder_path, vectoriser): Args: folder_path (str): The folder path containing the metadata and Parquet files. vectoriser (object): The vectoriser object used to transform text into vector embeddings. + scoring_metric(Literal["cosine", "dotprod", "l2"]): The metric to use for scoring Returns: VectorStore: An instance of the `VectorStore` class. @@ -491,6 +521,9 @@ def from_filespace(cls, folder_path, vectoriser): with open(metadata_path, encoding="utf-8") as f: metadata = json.load(f) + ## validate scoring metric + metricvalid(scoring_metric) + # check that the correct keys exist in metadata required_keys = [ "vectoriser_class", @@ -544,6 +577,7 @@ def from_filespace(cls, folder_path, vectoriser): vector_store.file_name = None vector_store.data_type = None vector_store.vectoriser = vectoriser + vector_store.scoring_metric = scoring_metric vector_store.batch_size = None vector_store.meta_data = deserialized_column_meta_data vector_store.vectors = df From b2a0d9575920f6d3de58ae81dcfe62a02e704a84 Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Thu, 22 Jan 2026 11:19:33 +0000 Subject: [PATCH 02/11] chore: added required metrics list --- src/classifai/indexers/main.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index aada746..0b46f1a 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -61,10 +61,10 @@ def metricvalid(metric: str): metric (str): The selected metric for the VectorStore Raises: - ValueError: If value is not in ["cosine", "dotprod", "l2"] + ValueError: If value is not in ["cosine", "dotprod", "cosinel2", "dotprodl2"] """ - valid_metrics = ["cosine", "dotprod", "l2"] + valid_metrics = ["cosine", "dotprod", "cosinel2", "dotprodl2"] if metric not in valid_metrics: raise ValueError(f"The scoring metric input '{metric}' is not in the valid metrics {valid_metrics}") @@ -76,7 +76,7 @@ class VectorStore: file_name (str): the original file with the knowledgebase to build the vector store data_type (str): the data type of the original file (curently only csv supported) vectoriser (object): A Vectoriser object 
from the corresponding ClassifAI Pacakge module - scoring_metric(Literal["cosine", "dotprod", "l2"]): The metric to use for scoring + scoring_metric(Literal["cosine", "dotprod", "cosinel2", "dotprodl2"]): The metric to use for scoring batch_size (int): the batch size to pass to the vectoriser when embedding meta_data (dict[str:type]): key-value pairs of metadata to extract from the input file and their correpsonding types output_dir (str): the path to the output directory where the VectorStore will be saved @@ -92,7 +92,7 @@ def __init__( # noqa: PLR0913 file_name, data_type, vectoriser, - scoring_metric: Literal["cosine", "dotprod", "l2"] = "cosine", + scoring_metric: Literal["cosine", "dotprod", "cosinel2", "dotprodl2"] = "cosine", batch_size=8, meta_data=None, output_dir=None, @@ -107,7 +107,7 @@ def __init__( # noqa: PLR0913 data_type (str): The type of input data (currently supports only "csv"). vectoriser (object): The vectoriser object used to transform text into vector embeddings. - scoring_metric(Literal["cosine", "dotprod", "l2"]): The metric to use for scoring + scoring_metric(Literal["cosine", "dotprod", "cosinel2", "dotprodl2"]): The metric to use for scoring batch_size (int, optional): The batch size for processing the input file and batching to vectoriser. Defaults to 8. meta_data (dict, optional): key,value pair metadata column names to extract from the input file and their types. @@ -414,10 +414,15 @@ def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> V query_vectors = self.vectoriser.transform(query_text_batch) ## determine proper metric to use - if self.scoring_metric == "dotprod": - print("scoring with dotprod") - if self.scoring_metric == "cosine": - print("scoring with cosine") + # match self.scoring_metric: + # case "dotprod": + # print("scoring with dotprod") + # case "cosine": + # print("scoring with cosine") + # case "dotprodl2": + # print("scoring with dotprodl2") + # case "cosinel2": + # print("scoring with cosinel2") # Compute cosine similarity between the query batch and document vectors cosine = query_vectors @ self.vectors["embeddings"].to_numpy().T @@ -490,7 +495,9 @@ def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> V return result_df @classmethod - def from_filespace(cls, folder_path, vectoriser, scoring_metric: Literal["cosine", "dotprod", "l2"] = "cosine"): + def from_filespace( + cls, folder_path, vectoriser, scoring_metric: Literal["cosine", "dotprod", "cosinel2", "dotprodl2"] = "cosine" + ): """Creates a `VectorStore` instance from stored metadata and Parquet files. This method reads the metadata and vectors from the specified folder, validates the contents, and initializes a `VectorStore` object with the @@ -504,7 +511,7 @@ def from_filespace(cls, folder_path, vectoriser, scoring_metric: Literal["cosine Args: folder_path (str): The folder path containing the metadata and Parquet files. vectoriser (object): The vectoriser object used to transform text into vector embeddings. - scoring_metric(Literal["cosine", "dotprod", "l2"]): The metric to use for scoring + scoring_metric(Literal["cosine", "dotprod", "cosinel2", "dotprodl2"]): The metric to use for scoring Returns: VectorStore: An instance of the `VectorStore` class. 
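At this point in the series the chosen metric is validated but not yet applied: the dispatch sketched above is still commented out and scoring is unchanged. The l2-style options lean on a standard identity, namely that for unit-length vectors the squared Euclidean distance equals 2 * (1 - cosine similarity); that is where the `2 * (1 - scores)` and `np.sqrt(scores)` steps in the next patch's `score` method come from, and why a later patch adds a normalisation step for cosine-based metrics. A minimal NumPy sketch of the identity, using made-up vectors rather than project data:

import numpy as np

rng = np.random.default_rng(0)
query = rng.normal(size=(1, 4))            # one made-up query embedding
docs = rng.normal(size=(3, 4))             # three made-up document embeddings

# Unit-normalise both sides, as the cosine-based metrics expect.
query = query / np.linalg.norm(query, axis=1, keepdims=True)
docs = docs / np.linalg.norm(docs, axis=1, keepdims=True)

cosine = query @ docs.T                    # cosine similarities, shape (1, 3)
l2_squared = 2.0 * (1.0 - cosine)          # squared L2 distance between unit vectors
l2 = np.sqrt(l2_squared)                   # plain L2 distance

# Cross-check against the direct definition of Euclidean distance.
direct = np.linalg.norm(query[:, None, :] - docs[None, :, :], axis=-1)
assert np.allclose(l2, direct)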
From 4e177ee2c66123e546a33d6589a1879c3a98cef2 Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Fri, 23 Jan 2026 11:29:34 +0000 Subject: [PATCH 03/11] chore: refactored scoring to a method and changed metric literal to type alias --- src/classifai/indexers/__init__.py | 2 + src/classifai/indexers/main.py | 117 ++++++++++++++++------------- src/classifai/indexers/types.py | 5 ++ 3 files changed, 72 insertions(+), 52 deletions(-) create mode 100644 src/classifai/indexers/types.py diff --git a/src/classifai/indexers/__init__.py b/src/classifai/indexers/__init__.py index 83ad3fa..1ca851e 100644 --- a/src/classifai/indexers/__init__.py +++ b/src/classifai/indexers/__init__.py @@ -9,6 +9,7 @@ VectorStoreSearchOutput, ) from .main import VectorStore +from .types import metric_settings __all__ = [ "VectorStore", @@ -18,4 +19,5 @@ "VectorStoreReverseSearchOutput", "VectorStoreSearchInput", "VectorStoreSearchOutput", + "metric_settings", ] diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 0b46f1a..db0acb2 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -32,12 +32,13 @@ import shutil import time import uuid -from typing import Literal +from typing import get_args import numpy as np import polars as pl from tqdm.autonotebook import tqdm +from ..vectorisers import VectoriserBase from .dataclasses import ( VectorStoreEmbedInput, VectorStoreEmbedOutput, @@ -46,6 +47,7 @@ VectorStoreSearchInput, VectorStoreSearchOutput, ) +from .types import metric_settings # Configure logging for your application logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s") @@ -54,17 +56,17 @@ logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING) -def metricvalid(metric: str): +def metricvalid(metric: metric_settings): """Test that the given metric is a valid option. 
Args: metric (str): The selected metric for the VectorStore Raises: - ValueError: If value is not in ["cosine", "dotprod", "cosinel2", "dotprodl2"] + ValueError: If value is not in ["cosine", "dotprod", "cosinel2", "dotprodl2", "cosinel2squared", "dotprodl2squared"] """ - valid_metrics = ["cosine", "dotprod", "cosinel2", "dotprodl2"] + valid_metrics = get_args(metric_settings) if metric not in valid_metrics: raise ValueError(f"The scoring metric input '{metric}' is not in the valid metrics {valid_metrics}") @@ -75,8 +77,8 @@ class VectorStore: Attributes: file_name (str): the original file with the knowledgebase to build the vector store data_type (str): the data type of the original file (curently only csv supported) - vectoriser (object): A Vectoriser object from the corresponding ClassifAI Pacakge module - scoring_metric(Literal["cosine", "dotprod", "cosinel2", "dotprodl2"]): The metric to use for scoring + vectoriser (VectoriserBase): A Vectoriser object from the corresponding ClassifAI Pacakge module + scoring_metric(metric_settings): The metric to use for scoring batch_size (int): the batch size to pass to the vectoriser when embedding meta_data (dict[str:type]): key-value pairs of metadata to extract from the input file and their correpsonding types output_dir (str): the path to the output directory where the VectorStore will be saved @@ -91,8 +93,8 @@ def __init__( # noqa: PLR0913 self, file_name, data_type, - vectoriser, - scoring_metric: Literal["cosine", "dotprod", "cosinel2", "dotprodl2"] = "cosine", + vectoriser: VectoriserBase, + scoring_metric: metric_settings = "cosine", batch_size=8, meta_data=None, output_dir=None, @@ -105,9 +107,9 @@ def __init__( # noqa: PLR0913 Args: file_name (str): The name of the input CSV file. data_type (str): The type of input data (currently supports only "csv"). - vectoriser (object): The vectoriser object used to transform text into + vectoriser (VectoriserBase): The vectoriser object used to transform text into vector embeddings. - scoring_metric(Literal["cosine", "dotprod", "cosinel2", "dotprodl2"]): The metric to use for scoring + scoring_metric(metric_settings): The metric to use for scoring batch_size (int, optional): The batch size for processing the input file and batching to vectoriser. Defaults to 8. meta_data (dict, optional): key,value pair metadata column names to extract from the input file and their types. @@ -370,6 +372,54 @@ def reverse_search(self, query: VectorStoreReverseSearchInput, n_results=100) -> return result_df + def score( + self, query: np.ndarray, n_results: int, query_ids_batch: list[str], query_text_batch: list[str] + ) -> tuple[pl.DataFrame, np.ndarray]: + """Perform Scoring and return Top Values. 
+ + Args: + query(np.ndarray): query for search + n_results(int): number of results to return + query_ids_batch(list[str]): ids of query batch + query_text_batch(list[str]): source text of query batch + + Returns: + pl.DataFrame: The Polars DataFrame containing the top n most similar results to the query + """ + if self.scoring_metric.startswith("cosine"): + query = query / np.linalg.norm(query, axis=1, keepdims=True) + + result = query @ self.vectors["embeddings"].to_numpy().T + + # Get the top n_results indices for each query in the batch + idx = np.argpartition(result, -n_results, axis=1)[:, -n_results:] + + # Sort top n_results indices by their scores in descending order + idx_sorted = np.zeros_like(idx) + scores = np.zeros_like(idx, dtype=float) + + for j in range(idx.shape[0]): + row_scores = result[j, idx[j]] + sorted_indices = np.argsort(row_scores)[::-1] + idx_sorted[j] = idx[j, sorted_indices] + scores[j] = row_scores[sorted_indices] + + if "l2" in self.scoring_metric: + scores = 2 * (1 - scores) + if not self.scoring_metric.endswith("squared"): + scores = np.sqrt(scores) + + # Build a DataFrame for the current batch results + result_df = pl.DataFrame( + { + "query_id": np.repeat(query_ids_batch, n_results), + "query_text": np.repeat(query_text_batch, n_results), + "rank": np.tile(np.arange(n_results), len(query_text_batch)), + "score": scores.flatten(), + } + ) + return result_df, idx_sorted + def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> VectorStoreSearchOutput: """Searches the vector store using queries from a VectorStoreSearchInput object and returns ranked results in VectorStoreSearchOutput object. In batches, converts users text queries into vector embeddings, @@ -409,46 +459,11 @@ def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> V # Get the current batch of queries query_text_batch = query.query.to_list()[i : i + batch_size] query_ids_batch = query.id.to_list()[i : i + batch_size] - # Convert the current batch of queries to vectors query_vectors = self.vectoriser.transform(query_text_batch) - ## determine proper metric to use - # match self.scoring_metric: - # case "dotprod": - # print("scoring with dotprod") - # case "cosine": - # print("scoring with cosine") - # case "dotprodl2": - # print("scoring with dotprodl2") - # case "cosinel2": - # print("scoring with cosinel2") - - # Compute cosine similarity between the query batch and document vectors - cosine = query_vectors @ self.vectors["embeddings"].to_numpy().T - - # Get the top n_results indices for each query in the batch - idx = np.argpartition(cosine, -n_results, axis=1)[:, -n_results:] - - # Sort top n_results indices by their scores in descending order - idx_sorted = np.zeros_like(idx) - scores = np.zeros_like(idx, dtype=float) - - for j in range(idx.shape[0]): - row_scores = cosine[j, idx[j]] - sorted_indices = np.argsort(row_scores)[::-1] - idx_sorted[j] = idx[j, sorted_indices] - scores[j] = row_scores[sorted_indices] - - # Build a DataFrame for the current batch results - result_df = pl.DataFrame( - { - "query_id": np.repeat(query_ids_batch, n_results), - "query_text": np.repeat(query_text_batch, n_results), - "rank": np.tile(np.arange(n_results), len(query_text_batch)), - "score": scores.flatten(), - } - ) + # perform scoring and return frame and ids + result_df, idx_sorted = self.score(query_vectors, n_results, query_ids_batch, query_text_batch) # Get the vector store results for the current batch ranked_docs = 
self.vectors[idx_sorted.flatten().tolist()].select(["id", "text", *self.meta_data.keys()]) @@ -495,9 +510,7 @@ def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> V return result_df @classmethod - def from_filespace( - cls, folder_path, vectoriser, scoring_metric: Literal["cosine", "dotprod", "cosinel2", "dotprodl2"] = "cosine" - ): + def from_filespace(cls, folder_path, vectoriser: VectoriserBase, scoring_metric: metric_settings = "cosine"): """Creates a `VectorStore` instance from stored metadata and Parquet files. This method reads the metadata and vectors from the specified folder, validates the contents, and initializes a `VectorStore` object with the @@ -510,8 +523,8 @@ def from_filespace( Args: folder_path (str): The folder path containing the metadata and Parquet files. - vectoriser (object): The vectoriser object used to transform text into vector embeddings. - scoring_metric(Literal["cosine", "dotprod", "cosinel2", "dotprodl2"]): The metric to use for scoring + vectoriser (VectoriserBase): The vectoriser object used to transform text into vector embeddings. + scoring_metric(metric_settings): The metric to use for scoring Returns: VectorStore: An instance of the `VectorStore` class. diff --git a/src/classifai/indexers/types.py b/src/classifai/indexers/types.py new file mode 100644 index 0000000..66ce000 --- /dev/null +++ b/src/classifai/indexers/types.py @@ -0,0 +1,5 @@ +from typing import Literal, TypeAlias + +metric_settings: TypeAlias = Literal[ + "cosine", "dotprod", "cosinel2", "dotprodl2", "cosinel2squared", "dotprodl2squared" +] From 4414401507711e9071d18b6a3557c1c5743b9fc6 Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Fri, 23 Jan 2026 11:44:30 +0000 Subject: [PATCH 04/11] chore: set default gcpvectorizer task type to classification as mentioned in docstring --- src/classifai/indexers/main.py | 4 +++- src/classifai/vectorisers/gcp.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index db0acb2..0ea3e8c 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -420,7 +420,9 @@ def score( ) return result_df, idx_sorted - def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> VectorStoreSearchOutput: + def search( + self, query: VectorStoreSearchInput, n_results: int = 10, batch_size: int = 8 + ) -> VectorStoreSearchOutput: """Searches the vector store using queries from a VectorStoreSearchInput object and returns ranked results in VectorStoreSearchOutput object. In batches, converts users text queries into vector embeddings, computes cosine similarity with stored document vectors, and retrieves the top results. diff --git a/src/classifai/vectorisers/gcp.py b/src/classifai/vectorisers/gcp.py index c7824d1..1a30a9b 100644 --- a/src/classifai/vectorisers/gcp.py +++ b/src/classifai/vectorisers/gcp.py @@ -28,7 +28,7 @@ def __init__( project_id, location="europe-west2", model_name="text-embedding-004", - task_type="RETRIEVAL_DOCUMENT", + task_type="CLASSIFICATION", ): """Initializes the GcpVectoriser with the specified project ID, location, and model name. 
From a84575fe07a9f7e5d7f38df7bb91a290b6cbd606 Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Fri, 23 Jan 2026 12:02:31 +0000 Subject: [PATCH 05/11] added normalization step for when using cosine distance --- src/classifai/indexers/main.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 0ea3e8c..0b0a030 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -171,7 +171,7 @@ def __init__( # noqa: PLR0913 os.makedirs(self.output_dir, exist_ok=True) self._create_vector_store_index() - + self._check_norm_vdb() logging.info("Gathering metadata and saving vector store / metadata...") self.vector_shape = self.vectors["embeddings"].to_numpy().shape[1] @@ -372,6 +372,14 @@ def reverse_search(self, query: VectorStoreReverseSearchInput, n_results=100) -> return result_df + def _check_norm_vdb(self): + """Normalise Vdb if using cosine similarity.""" + if "cosine" in self.scoring_metric: + embeddings = self.vectors["embeddings"].to_numpy() + embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True) + + self.vectors.with_columns(pl.Series("embeddings", embeddings)) + def score( self, query: np.ndarray, n_results: int, query_ids_batch: list[str], query_text_batch: list[str] ) -> tuple[pl.DataFrame, np.ndarray]: @@ -607,5 +615,5 @@ def from_filespace(cls, folder_path, vectoriser: VectoriserBase, scoring_metric: vector_store.num_vectors = metadata["num_vectors"] vector_store.vectoriser_class = metadata["vectoriser_class"] vector_store.hooks = {} - + vector_store._check_norm_vdb() return vector_store From c0a217044d7b1ae8b7023e964a01c4665e472abb Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Wed, 28 Jan 2026 09:43:50 +0000 Subject: [PATCH 06/11] chore: refactored types to enum --- src/classifai/indexers/__init__.py | 4 ++-- src/classifai/indexers/main.py | 24 ++++++++++++------------ src/classifai/indexers/types.py | 13 +++++++++---- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/classifai/indexers/__init__.py b/src/classifai/indexers/__init__.py index 1ca851e..7c127af 100644 --- a/src/classifai/indexers/__init__.py +++ b/src/classifai/indexers/__init__.py @@ -9,9 +9,10 @@ VectorStoreSearchOutput, ) from .main import VectorStore -from .types import metric_settings +from .types import MetricSettings __all__ = [ + "MetricSettings", "VectorStore", "VectorStoreEmbedInput", "VectorStoreEmbedOutput", @@ -19,5 +20,4 @@ "VectorStoreReverseSearchOutput", "VectorStoreSearchInput", "VectorStoreSearchOutput", - "metric_settings", ] diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 0b0a030..7b38425 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -32,7 +32,6 @@ import shutil import time import uuid -from typing import get_args import numpy as np import polars as pl @@ -47,7 +46,7 @@ VectorStoreSearchInput, VectorStoreSearchOutput, ) -from .types import metric_settings +from .types import MetricSettings # Configure logging for your application logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s") @@ -56,19 +55,18 @@ logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING) -def metricvalid(metric: metric_settings): +def metricvalid(metric: MetricSettings): """Test that the given metric is a valid option. 
Args: metric (str): The selected metric for the VectorStore Raises: - ValueError: If value is not in ["cosine", "dotprod", "cosinel2", "dotprodl2", "cosinel2squared", "dotprodl2squared"] + ValueError: If value is not in MetricSettings """ - valid_metrics = get_args(metric_settings) - if metric not in valid_metrics: - raise ValueError(f"The scoring metric input '{metric}' is not in the valid metrics {valid_metrics}") + if metric not in MetricSettings: + raise ValueError(f"The scoring metric input '{metric}' is not in the valid metrics {list(MetricSettings)}") class VectorStore: @@ -78,7 +76,7 @@ class VectorStore: file_name (str): the original file with the knowledgebase to build the vector store data_type (str): the data type of the original file (curently only csv supported) vectoriser (VectoriserBase): A Vectoriser object from the corresponding ClassifAI Pacakge module - scoring_metric(metric_settings): The metric to use for scoring + scoring_metric(MetricSettings): The metric to use for scoring batch_size (int): the batch size to pass to the vectoriser when embedding meta_data (dict[str:type]): key-value pairs of metadata to extract from the input file and their correpsonding types output_dir (str): the path to the output directory where the VectorStore will be saved @@ -94,7 +92,7 @@ def __init__( # noqa: PLR0913 file_name, data_type, vectoriser: VectoriserBase, - scoring_metric: metric_settings = "cosine", + scoring_metric: MetricSettings | str = MetricSettings.COSINE, batch_size=8, meta_data=None, output_dir=None, @@ -109,7 +107,7 @@ def __init__( # noqa: PLR0913 data_type (str): The type of input data (currently supports only "csv"). vectoriser (VectoriserBase): The vectoriser object used to transform text into vector embeddings. - scoring_metric(metric_settings): The metric to use for scoring + scoring_metric(MetricSettings): The metric to use for scoring batch_size (int, optional): The batch size for processing the input file and batching to vectoriser. Defaults to 8. meta_data (dict, optional): key,value pair metadata column names to extract from the input file and their types. @@ -520,7 +518,9 @@ def search( return result_df @classmethod - def from_filespace(cls, folder_path, vectoriser: VectoriserBase, scoring_metric: metric_settings = "cosine"): + def from_filespace( + cls, folder_path, vectoriser: VectoriserBase, scoring_metric: MetricSettings | str = MetricSettings.COSINE + ): """Creates a `VectorStore` instance from stored metadata and Parquet files. This method reads the metadata and vectors from the specified folder, validates the contents, and initializes a `VectorStore` object with the @@ -534,7 +534,7 @@ def from_filespace(cls, folder_path, vectoriser: VectoriserBase, scoring_metric: Args: folder_path (str): The folder path containing the metadata and Parquet files. vectoriser (VectoriserBase): The vectoriser object used to transform text into vector embeddings. - scoring_metric(metric_settings): The metric to use for scoring + scoring_metric(MetricSettings): The metric to use for scoring Returns: VectorStore: An instance of the `VectorStore` class. 
diff --git a/src/classifai/indexers/types.py b/src/classifai/indexers/types.py index 66ce000..0942ec6 100644 --- a/src/classifai/indexers/types.py +++ b/src/classifai/indexers/types.py @@ -1,5 +1,10 @@ -from typing import Literal, TypeAlias +from enum import Enum -metric_settings: TypeAlias = Literal[ - "cosine", "dotprod", "cosinel2", "dotprodl2", "cosinel2squared", "dotprodl2squared" -] + +class MetricSettings(str, Enum): + COSINE = "cosine" + DOTPROD = "dotprod" + COSINE_L2 = "cosinel2" + DOTPROD_L2 = "dotprodl2" + COSINE_L2_SQUARED = "cosinel2squared" + DOTPROD_L2_SQUARED = "dotprodl2squared" From 293d25d65e77defac961936ad3d40e0284c2ee32 Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Wed, 28 Jan 2026 09:50:15 +0000 Subject: [PATCH 07/11] chore: moved type imports to conditionals --- src/classifai/indexers/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 7b38425..9bd4605 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -32,12 +32,14 @@ import shutil import time import uuid +from typing import TYPE_CHECKING import numpy as np import polars as pl from tqdm.autonotebook import tqdm -from ..vectorisers import VectoriserBase +if TYPE_CHECKING: + from ..vectorisers import VectoriserBase from .dataclasses import ( VectorStoreEmbedInput, VectorStoreEmbedOutput, From a464b35f6974dea9d0a2b0594655a4ac5b1a8ec6 Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Wed, 28 Jan 2026 10:37:38 +0000 Subject: [PATCH 08/11] chore: added types as annotations, setup better norm logic --- src/classifai/indexers/main.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 9bd4605..e195697 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -26,6 +26,8 @@ vector databases from your own text data. """ +from __future__ import annotations + import json import logging import os @@ -87,6 +89,7 @@ class VectorStore: num_vectors (int): how many vectors are in the vector store vectoriser_class (str): the type of vectoriser used to create embeddings hooks (dict): A dictionary of user-defined hooks for preprocessing and postprocessing. + normalize(bool): Flag to choose if to normalize vectors. """ def __init__( # noqa: PLR0913 @@ -100,6 +103,7 @@ def __init__( # noqa: PLR0913 output_dir=None, overwrite=False, hooks=None, + normalize=False, ): """Initializes the VectorStore object by processing the input CSV file and generating vector embeddings. @@ -118,7 +122,7 @@ def __init__( # noqa: PLR0913 Defaults to None, where input file name will be used. overwrite (bool, optional): If True, allows overwriting existing folders with the same name. Defaults to false to prevent accidental overwrites. hooks (dict, optional): A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to None. - + normalize(bool, optional): A flag to make vectorstore normalize its vdb Raises: ValueError: If the data type is not supported or if the folder name conflicts with an existing folder. 
@@ -137,6 +141,7 @@ def __init__( # noqa: PLR0913 self.num_vectors = None self.vectoriser_class = vectoriser.__class__.__name__ self.hooks = {} if hooks is None else hooks + self.normalize = normalize if self.data_type not in ["csv"]: raise ValueError(f"Data type '{self.data_type}' not supported. Choose from ['csv'].") @@ -171,7 +176,14 @@ def __init__( # noqa: PLR0913 os.makedirs(self.output_dir, exist_ok=True) self._create_vector_store_index() - self._check_norm_vdb() + + ## init normalization + if normalize: + embeddings = self.vectors["embeddings"].to_numpy() + embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True) + + self.vectors.with_columns(pl.Series("embeddings", embeddings)) + logging.info("Gathering metadata and saving vector store / metadata...") self.vector_shape = self.vectors["embeddings"].to_numpy().shape[1] @@ -182,6 +194,8 @@ def __init__( # noqa: PLR0913 self._save_metadata(os.path.join(self.output_dir, "metadata.json")) logging.info("Vector Store created - files saved to %s", self.output_dir) + ## will norm in memory if using cosine metrics + self._check_norm_vdb() def _save_metadata(self, path): """Saves metadata about the vector store to a JSON file. @@ -205,6 +219,7 @@ def _save_metadata(self, path): "num_vectors": self.num_vectors, "created_at": time.time(), "meta_data": serializable_column_meta_data, + "normalized": self.normalize, } with open(path, "w", encoding="utf-8") as f: @@ -374,7 +389,10 @@ def reverse_search(self, query: VectorStoreReverseSearchInput, n_results=100) -> def _check_norm_vdb(self): """Normalise Vdb if using cosine similarity.""" - if "cosine" in self.scoring_metric: + if "cosine" in self.scoring_metric and self.normalize: + logging.warning( + "Note: you are using metrics that require norms with un-normed vdb data, this will be normed for search but vdb file will not be changed" + ) embeddings = self.vectors["embeddings"].to_numpy() embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True) @@ -557,13 +575,7 @@ def from_filespace( metricvalid(scoring_metric) # check that the correct keys exist in metadata - required_keys = [ - "vectoriser_class", - "vector_shape", - "num_vectors", - "created_at", - "meta_data", - ] + required_keys = ["vectoriser_class", "vector_shape", "num_vectors", "created_at", "meta_data", "normalized"] for key in required_keys: if key not in metadata: raise ValueError(f"Metadata file is missing required key: {key}") @@ -616,6 +628,7 @@ def from_filespace( vector_store.vector_shape = metadata["vector_shape"] vector_store.num_vectors = metadata["num_vectors"] vector_store.vectoriser_class = metadata["vectoriser_class"] + vector_store.normalize = metadata["normalized"] vector_store.hooks = {} vector_store._check_norm_vdb() return vector_store From f982f4cb2de814afb5202b14cb7e8ea07b8f4463 Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Wed, 28 Jan 2026 11:22:44 +0000 Subject: [PATCH 09/11] fix: added not to normalize check --- src/classifai/indexers/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index e195697..5e048bd 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -389,7 +389,7 @@ def reverse_search(self, query: VectorStoreReverseSearchInput, n_results=100) -> def _check_norm_vdb(self): """Normalise Vdb if using cosine similarity.""" - if "cosine" in self.scoring_metric and self.normalize: + if "cosine" in 
self.scoring_metric and not self.normalize: logging.warning( "Note: you are using metrics that require norms with un-normed vdb data, this will be normed for search but vdb file will not be changed" ) From df4e8db59f7d4a0b23feeb13cf671d50345bb265 Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Thu, 29 Jan 2026 17:14:18 +0000 Subject: [PATCH 10/11] chore: added l2 dist --- src/classifai/indexers/main.py | 32 +++++++++++++++++++++----------- src/classifai/indexers/types.py | 8 ++------ 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 80bc94d..1f96b54 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -97,7 +97,7 @@ def __init__( # noqa: PLR0913 file_name, data_type, vectoriser: VectoriserBase, - scoring_metric: MetricSettings | str = MetricSettings.COSINE, + scoring_metric: MetricSettings | str = MetricSettings.INNER_PRODUCT, batch_size=8, meta_data=None, output_dir=None, @@ -412,10 +412,25 @@ def score( Returns: pl.DataFrame: The Polars DataFrame containing the top n most similar results to the query """ - if self.scoring_metric.startswith("cosine"): - query = query / np.linalg.norm(query, axis=1, keepdims=True) + docs = self.vectors["embeddings"].to_numpy() + if self.scoring_metric == MetricSettings.INNER_PRODUCT: + result = query @ docs.T + elif self.scoring_metric == MetricSettings.L2_DISTANCE: + # Dot products (n_queries, n_docs) + dots = query @ docs.T - result = query @ self.vectors["embeddings"].to_numpy().T + # Squared norms + q_sq = np.sum(query * query, axis=1, keepdims=True) # (n_queries, 1) + d_sq = np.sum(docs * docs, axis=1, keepdims=True).T # (1, n_docs) + + # Squared distances + dist_sq = q_sq + d_sq - 2.0 * dots + + # Numerical safety: tiny negatives -> 0 + np.maximum(dist_sq, 0.0, out=dist_sq) + + # True L2 distances + result = np.sqrt(dist_sq) # (n_queries, n_docs) # Get the top n_results indices for each query in the batch idx = np.argpartition(result, -n_results, axis=1)[:, -n_results:] @@ -430,11 +445,6 @@ def score( idx_sorted[j] = idx[j, sorted_indices] scores[j] = row_scores[sorted_indices] - if "l2" in self.scoring_metric: - scores = 2 * (1 - scores) - if not self.scoring_metric.endswith("squared"): - scores = np.sqrt(scores) - # Build a DataFrame for the current batch results result_df = pl.DataFrame( { @@ -451,7 +461,7 @@ def search( ) -> VectorStoreSearchOutput: """Searches the vector store using queries from a VectorStoreSearchInput object and returns ranked results in VectorStoreSearchOutput object. In batches, converts users text queries into vector embeddings, - computes cosine similarity with stored document vectors, and retrieves the top results. + computes similarity scoring with stored document vectors, and retrieves the top results. Args: query (VectorStoreSearchInput): A VectoreStoreSearchInput object containing the text query or list of queries to search for with ids. @@ -542,7 +552,7 @@ def from_filespace( cls, folder_path, vectoriser: VectoriserBase, - scoring_metric: MetricSettings | str = MetricSettings.COSINE, + scoring_metric: MetricSettings | str = MetricSettings.INNER_PRODUCT, hooks: dict | None = None, ): """Creates a `VectorStore` instance from stored metadata and Parquet files. 
diff --git a/src/classifai/indexers/types.py b/src/classifai/indexers/types.py index 0942ec6..f06461f 100644 --- a/src/classifai/indexers/types.py +++ b/src/classifai/indexers/types.py @@ -2,9 +2,5 @@ class MetricSettings(str, Enum): - COSINE = "cosine" - DOTPROD = "dotprod" - COSINE_L2 = "cosinel2" - DOTPROD_L2 = "dotprodl2" - COSINE_L2_SQUARED = "cosinel2squared" - DOTPROD_L2_SQUARED = "dotprodl2squared" + INNER_PRODUCT = "inner_product" + L2_DISTANCE = "L2_distance" From cb9c561714e3fbdd4b360e04763e8548b08d975b Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Thu, 29 Jan 2026 17:30:24 +0000 Subject: [PATCH 11/11] chore: renamed metrics again --- src/classifai/indexers/main.py | 6 +++--- src/classifai/indexers/types.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 1f96b54..1c4057e 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -97,7 +97,7 @@ def __init__( # noqa: PLR0913 file_name, data_type, vectoriser: VectoriserBase, - scoring_metric: MetricSettings | str = MetricSettings.INNER_PRODUCT, + scoring_metric: MetricSettings | str = MetricSettings.DOT_PRODUCT, batch_size=8, meta_data=None, output_dir=None, @@ -413,7 +413,7 @@ def score( pl.DataFrame: The Polars DataFrame containing the top n most similar results to the query """ docs = self.vectors["embeddings"].to_numpy() - if self.scoring_metric == MetricSettings.INNER_PRODUCT: + if self.scoring_metric == MetricSettings.DOT_PRODUCT: result = query @ docs.T elif self.scoring_metric == MetricSettings.L2_DISTANCE: # Dot products (n_queries, n_docs) @@ -552,7 +552,7 @@ def from_filespace( cls, folder_path, vectoriser: VectoriserBase, - scoring_metric: MetricSettings | str = MetricSettings.INNER_PRODUCT, + scoring_metric: MetricSettings | str = MetricSettings.DOT_PRODUCT, hooks: dict | None = None, ): """Creates a `VectorStore` instance from stored metadata and Parquet files. diff --git a/src/classifai/indexers/types.py b/src/classifai/indexers/types.py index f06461f..4289310 100644 --- a/src/classifai/indexers/types.py +++ b/src/classifai/indexers/types.py @@ -2,5 +2,5 @@ class MetricSettings(str, Enum): - INNER_PRODUCT = "inner_product" + DOT_PRODUCT = "dot_product" L2_DISTANCE = "L2_distance"
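The L2 branch of the final `score` method avoids building a full query-minus-document difference tensor by expanding ||q - d||^2 = ||q||^2 + ||d||^2 - 2 * (q . d) and clamping tiny negative round-off before the square root; note that with distances, smaller scores mean closer matches. A quick self-check of that expansion against the direct definition, using random illustrative arrays:

import numpy as np

rng = np.random.default_rng(1)
query = rng.normal(size=(2, 5))   # two made-up query embeddings
docs = rng.normal(size=(7, 5))    # seven made-up stored embeddings

dots = query @ docs.T                                  # (2, 7) dot products
q_sq = np.sum(query * query, axis=1, keepdims=True)    # (2, 1) squared query norms
d_sq = np.sum(docs * docs, axis=1, keepdims=True).T    # (1, 7) squared document norms

dist_sq = q_sq + d_sq - 2.0 * dots
np.maximum(dist_sq, 0.0, out=dist_sq)                  # numerical safety, as in the patch
expanded = np.sqrt(dist_sq)

# Same distances computed directly from the definition.
direct = np.linalg.norm(query[:, None, :] - docs[None, :, :], axis=-1)
assert np.allclose(expanded, direct)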
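Taken together, the series leaves the scoring metric as a two-member MetricSettings enum chosen at construction or load time. A minimal usage sketch of the final API follows; it assumes the GcpVectoriser defined in src/classifai/vectorisers/gcp.py is importable from that module, and the CSV file, project id and output folder names are placeholders rather than values taken from the patches:

from classifai.indexers import MetricSettings, VectorStore
from classifai.vectorisers.gcp import GcpVectoriser

# Placeholder inputs: swap in a real GCP project and CSV knowledgebase.
vectoriser = GcpVectoriser(project_id="my-project")

# Build a store that scores with true L2 distance over unit-normalised embeddings.
store = VectorStore(
    file_name="knowledgebase.csv",
    data_type="csv",
    vectoriser=vectoriser,
    scoring_metric=MetricSettings.L2_DISTANCE,
    normalize=True,
    output_dir="kb_store",
    overwrite=True,
)

# Reload the saved store later, choosing the metric again at load time;
# metricvalid() rejects anything that is not a MetricSettings member.
reloaded = VectorStore.from_filespace(
    "kb_store", vectoriser, scoring_metric=MetricSettings.DOT_PRODUCT
)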