From 3f59b920e06785d3e24ec0ece8fb07eda56a0f63 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 2 Apr 2025 12:19:16 +0330 Subject: [PATCH 1/2] fix: changing request_queue_id in every init! the reason is, crawlee is caching all the requests and every time it's being re-initialized, it would re-fetch previously enqueued links and not the new ones; no good way of removing the request_queue is provided by the crawlee community yet. --- hivemind_etl/website/crawlee_client.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hivemind_etl/website/crawlee_client.py b/hivemind_etl/website/crawlee_client.py index b9541de..965e238 100644 --- a/hivemind_etl/website/crawlee_client.py +++ b/hivemind_etl/website/crawlee_client.py @@ -1,5 +1,6 @@ import asyncio from typing import Any +import uuid from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext from defusedxml import ElementTree as ET @@ -21,6 +22,12 @@ def __init__( # do not persist crawled data to local storage self.crawler._configuration.persist_storage = False self.crawler._configuration.write_metadata = False + self.crawler._configuration.purge_on_start = True + + # changing the id each time so it wouldn't continue + # fetching the previous links + config = self.crawler._configuration.get_global_configuration() + config.default_request_queue_id = uuid.uuid4().hex @self.crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: From 2f0fcc77210231c8deffe01b946454df77acfe9b Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 2 Apr 2025 12:20:41 +0330 Subject: [PATCH 2/2] fix: black linter issues! 
--- tests/unit/test_website_etl.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_website_etl.py b/tests/unit/test_website_etl.py index 825570d..d00838b 100644 --- a/tests/unit/test_website_etl.py +++ b/tests/unit/test_website_etl.py @@ -30,15 +30,17 @@ async def test_extract(self): "title": "Example", } ] - + # Mock the CrawleeClient class instead of the instance - with patch('hivemind_etl.website.website_etl.CrawleeClient') as MockCrawleeClient: + with patch( + "hivemind_etl.website.website_etl.CrawleeClient" + ) as MockCrawleeClient: mock_client_instance = AsyncMock() mock_client_instance.crawl.return_value = mocked_data MockCrawleeClient.return_value = mock_client_instance - + extracted_data = await self.website_etl.extract(urls) - + self.assertEqual(extracted_data, mocked_data) MockCrawleeClient.assert_called_once() mock_client_instance.crawl.assert_awaited_once_with(links=urls)