From 418c066e8a531fbcf37089ba903e330c155e981e Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Tue, 1 Apr 2025 10:16:48 +0330
Subject: [PATCH 1/2] feat: enhance crawling process with improved data extraction!

Extracting one url and its routes at a time, and then merging the results.
---
 hivemind_etl/website/website_etl.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/hivemind_etl/website/website_etl.py b/hivemind_etl/website/website_etl.py
index 8de8397..0f2853c 100644
--- a/hivemind_etl/website/website_etl.py
+++ b/hivemind_etl/website/website_etl.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any
 
 from hivemind_etl.website.crawlee_client import CrawleeClient
@@ -47,7 +48,13 @@ async def extract(
         """
         if not urls:
             raise ValueError("No URLs provided for crawling")
-        extracted_data = await self.crawlee_client.crawl(urls)
+
+        extracted_data = []
+        for url in urls:
+            logging.info(f"Crawling {url} and its routes!")
+            extracted_data.extend(await self.crawlee_client.crawl(links=[url]))
+
+        logging.info(f"Extracted {len(extracted_data)} documents!")
 
         if not extracted_data:
             raise ValueError(f"No data extracted from URLs: {urls}")

From cad10ef8f54d624715e85724188eb6b930cc5e1c Mon Sep 17 00:00:00 2001
From: Mohammad Amin
Date: Tue, 1 Apr 2025 10:33:14 +0330
Subject: [PATCH 2/2] fix: align test case with new codes!

---
 tests/unit/test_website_etl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_website_etl.py b/tests/unit/test_website_etl.py
index 412aa2f..e1a6e92 100644
--- a/tests/unit/test_website_etl.py
+++ b/tests/unit/test_website_etl.py
@@ -34,7 +34,7 @@ async def test_extract(self):
         extracted_data = await self.website_etl.extract(urls)
 
         self.assertEqual(extracted_data, mocked_data)
-        self.website_etl.crawlee_client.crawl.assert_awaited_once_with(urls)
+        self.website_etl.crawlee_client.crawl.assert_awaited_once_with(links=urls)
 
     def test_transform(self):
         """
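
Reviewer note: below is a minimal, self-contained sketch of the behaviour PATCH 1/2 introduces, with each seed URL crawled (together with its routes) in its own call and the per-URL results merged into one list. The StubCrawleeClient and the document shape it returns are illustrative assumptions; only the links keyword argument of CrawleeClient.crawl and the extract() control flow are taken from the diff above.

import asyncio
import logging
from typing import Any

logging.basicConfig(level=logging.INFO)


class StubCrawleeClient:
    """Hypothetical stand-in for CrawleeClient; returns one canned document per seed URL."""

    async def crawl(self, links: list[str]) -> list[dict[str, Any]]:
        # Pretend each seed URL (and its routes) yields a single document.
        return [{"url": link, "text": f"content from {link}"} for link in links]


async def extract(crawlee_client: StubCrawleeClient, urls: list[str]) -> list[dict[str, Any]]:
    # Mirrors the patched WebsiteETL.extract: crawl one seed URL (plus its routes)
    # at a time, then merge the per-URL results into a single list.
    if not urls:
        raise ValueError("No URLs provided for crawling")

    extracted_data: list[dict[str, Any]] = []
    for url in urls:
        logging.info(f"Crawling {url} and its routes!")
        extracted_data.extend(await crawlee_client.crawl(links=[url]))

    logging.info(f"Extracted {len(extracted_data)} documents!")
    if not extracted_data:
        raise ValueError(f"No data extracted from URLs: {urls}")
    return extracted_data


if __name__ == "__main__":
    docs = asyncio.run(
        extract(StubCrawleeClient(), ["https://example.com", "https://example.org"])
    )
    print(len(docs))  # expected: 2

One practical consequence of this per-URL loop, compared with the previous single batched crawl call, is that a failure while crawling one seed URL surfaces with that URL already logged, and the documents gathered from earlier URLs have already been accumulated in extracted_data.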