diff --git a/hivemind_etl/website/crawlee_client.py b/hivemind_etl/website/crawlee_client.py index b9541de..965e238 100644 --- a/hivemind_etl/website/crawlee_client.py +++ b/hivemind_etl/website/crawlee_client.py @@ -1,5 +1,6 @@ import asyncio from typing import Any +import uuid from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext from defusedxml import ElementTree as ET @@ -21,6 +22,12 @@ def __init__( # do not persist crawled data to local storage self.crawler._configuration.persist_storage = False self.crawler._configuration.write_metadata = False + self.crawler._configuration.purge_on_start = True + + # changing the id each time so it wouldn't continue + # fetching the previous links + config = self.crawler._configuration.get_global_configuration() + config.default_request_queue_id = uuid.uuid4().hex @self.crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: diff --git a/tests/unit/test_website_etl.py b/tests/unit/test_website_etl.py index 825570d..d00838b 100644 --- a/tests/unit/test_website_etl.py +++ b/tests/unit/test_website_etl.py @@ -30,15 +30,17 @@ async def test_extract(self): "title": "Example", } ] - + # Mock the CrawleeClient class instead of the instance - with patch('hivemind_etl.website.website_etl.CrawleeClient') as MockCrawleeClient: + with patch( + "hivemind_etl.website.website_etl.CrawleeClient" + ) as MockCrawleeClient: mock_client_instance = AsyncMock() mock_client_instance.crawl.return_value = mocked_data MockCrawleeClient.return_value = mock_client_instance - + extracted_data = await self.website_etl.extract(urls) - + self.assertEqual(extracted_data, mocked_data) MockCrawleeClient.assert_called_once() mock_client_instance.crawl.assert_awaited_once_with(links=urls)