From 3f59b920e06785d3e24ec0ece8fb07eda56a0f63 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 2 Apr 2025 12:19:16 +0330 Subject: [PATCH 1/2] fix: changing request_queue_id in every init! the reason is, crawlee is caching all the requests and every time it's being re-initialized, it would re-fetch previously enqueued links and not the new ones; no good way of removing the request_queue is provided by the crawlee community yet. --- hivemind_etl/website/crawlee_client.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hivemind_etl/website/crawlee_client.py b/hivemind_etl/website/crawlee_client.py index b9541de..965e238 100644 --- a/hivemind_etl/website/crawlee_client.py +++ b/hivemind_etl/website/crawlee_client.py @@ -1,5 +1,6 @@ import asyncio from typing import Any +import uuid from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext from defusedxml import ElementTree as ET @@ -21,6 +22,12 @@ def __init__( # do not persist crawled data to local storage self.crawler._configuration.persist_storage = False self.crawler._configuration.write_metadata = False + self.crawler._configuration.purge_on_start = True + + # changing the id each time so it wouldn't continue + # fetching the previous links + config = self.crawler._configuration.get_global_configuration() + config.default_request_queue_id = uuid.uuid4().hex @self.crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: From 2f0fcc77210231c8deffe01b946454df77acfe9b Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 2 Apr 2025 12:20:41 +0330 Subject: [PATCH 2/2] fix: black linter issues! 
--- tests/unit/test_website_etl.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_website_etl.py b/tests/unit/test_website_etl.py index 825570d..d00838b 100644 --- a/tests/unit/test_website_etl.py +++ b/tests/unit/test_website_etl.py @@ -30,15 +30,17 @@ async def test_extract(self): "title": "Example", } ] - + # Mock the CrawleeClient class instead of the instance - with patch('hivemind_etl.website.website_etl.CrawleeClient') as MockCrawleeClient: + with patch( + "hivemind_etl.website.website_etl.CrawleeClient" + ) as MockCrawleeClient: mock_client_instance = AsyncMock() mock_client_instance.crawl.return_value = mocked_data MockCrawleeClient.return_value = mock_client_instance - + extracted_data = await self.website_etl.extract(urls) - + self.assertEqual(extracted_data, mocked_data) MockCrawleeClient.assert_called_once() mock_client_instance.crawl.assert_awaited_once_with(links=urls)