From 15e5e3cb4bbc708ff9961b4838eabfda0dfddfa6 Mon Sep 17 00:00:00 2001 From: Sergey Bekkerman Date: Wed, 7 May 2025 01:13:53 +0200 Subject: [PATCH 1/3] Add scraper for knowledge base solutions data --- data_scraper/processors/solutions_provider.py | 1 - 1 file changed, 1 deletion(-) diff --git a/data_scraper/processors/solutions_provider.py b/data_scraper/processors/solutions_provider.py index bb643a9..700eb8e 100644 --- a/data_scraper/processors/solutions_provider.py +++ b/data_scraper/processors/solutions_provider.py @@ -5,7 +5,6 @@ LOG = logging.getLogger(__name__) LOG.setLevel(logging.INFO) - # pylint: disable=too-few-public-methods class SolutionsProvider: """Provider for Solutions""" From d16356be3230e678adce4f8fd3c9f3d94d504149 Mon Sep 17 00:00:00 2001 From: Sergey Bekkerman Date: Wed, 7 May 2025 20:23:03 +0200 Subject: [PATCH 2/3] Fix: Correct solutions query formation and standardize data types - Fixes query string in SolutionsProvider (was tuple, now string). - Changes get_solutions() return to list[dict] and updates error returns. - Defaults kb_id in SolutionsScraper to '' for string consistency. 
--- data_scraper/processors/solutions_provider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data_scraper/processors/solutions_provider.py b/data_scraper/processors/solutions_provider.py index 700eb8e..bb643a9 100644 --- a/data_scraper/processors/solutions_provider.py +++ b/data_scraper/processors/solutions_provider.py @@ -5,6 +5,7 @@ LOG = logging.getLogger(__name__) LOG.setLevel(logging.INFO) + # pylint: disable=too-few-public-methods class SolutionsProvider: """Provider for Solutions""" From 37e83f62ffd8679c826df5a44ddb855845adbca4 Mon Sep 17 00:00:00 2001 From: Sergey Bekkerman Date: Wed, 7 May 2025 21:16:16 +0200 Subject: [PATCH 3/3] Use URL-based UUIDs in VectorDB records and fix argparse flags - Generate deterministic UUIDs using UUIDv5 based on record['url'] - Replace `type=bool` with `action='store_true'` for argparse flags to ensure proper boolean behavior - Change the `--recreate_collection` default to False for all scrapers (previously True for osp_doc, errata, and solutions), so existing collections are kept unless the flag is passed explicitly --- data_scraper/core/scraper.py | 8 +++++++- data_scraper/main.py | 8 ++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/data_scraper/core/scraper.py index 2cde2ed..0807aaa 100644 --- a/data_scraper/core/scraper.py +++ b/data_scraper/core/scraper.py @@ -107,6 +107,12 @@ def store_records(self, records: list, recreate: bool = True) -> None: raise IOError for record in tqdm(records, desc="Processing embeddings"): + if record['url']: + record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, record["url"])) + else: + LOG.error("Missing required URL field") + continue + chunks: list[str] = self.get_chunks(record) embeddings: list[list[float]] = [] @@ -121,7 +127,7 @@ def store_records(self, records: list, recreate: bool = True) -> None: self.record_postprocessing(record) point = self.db_manager.build_record( - record_id=str(uuid.uuid4()), + record_id=record_id, payload=dict(record), vector=embeddings, ) diff --git a/data_scraper/main.py b/data_scraper/main.py index 37962f5..8bfcfc3 100644 --- a/data_scraper/main.py +++ b/data_scraper/main.py @@ -46,7 +46,7 
@@ def jira_scraper(): "Date must follow ISO format 'YYYY-MM-DD'" ) ) - parser.add_argument("--recreate_collection", type=bool, default=False, + parser.add_argument("--recreate_collection", action='store_true', default=False, help="Recreate database collection from scratch.") args = parser.parse_args() @@ -95,7 +95,7 @@ def osp_doc_scraper(): parser.add_argument("--db_collection_name", type=str, default=constants.OSP_DOCS_COLLECTION_NAME) parser.add_argument("--osp_version", type=str, default="18.0") - parser.add_argument("--recreate_collection", type=bool, default=True, + parser.add_argument("--recreate_collection", action='store_true', default=False, help="Recreate database collection from scratch.") parser.add_argument( "--rhoso_docs_path", type=str, default="", @@ -154,7 +154,7 @@ def errata_scraper() -> None: "Date must follow ISO format 'YYYY-MM-DD'" ) ) - parser.add_argument("--recreate_collection", type=bool, default=True, + parser.add_argument("--recreate_collection", action='store_true', default=False, help="Recreate database collection from scratch.") args = parser.parse_args() @@ -275,7 +275,7 @@ def solutions_scraper() -> None: default=constants.SOLUTIONS_COLLECTION_NAME) parser.add_argument("--product_name", type=str, default=constants.SOLUTIONS_PRODUCT_NAME) - parser.add_argument("--recreate_collection", type=bool, default=True, + parser.add_argument("--recreate_collection", action='store_true', default=False, help="Recreate database collection from scratch.") args = parser.parse_args()