From af1579390af35b1772900ba27832ada7485efaed Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 15 Apr 2025 11:44:18 +0330 Subject: [PATCH] fix: Remove incomplete dumped data so we don't face errors when crawling is interrupted --- hivemind_etl/mediawiki/etl.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py index 256c83a..07be3bf 100644 --- a/hivemind_etl/mediawiki/etl.py +++ b/hivemind_etl/mediawiki/etl.py @@ -34,7 +34,14 @@ def extract(self, api_url: str, dump_dir: str | None = None) -> None: else: self.dump_dir = dump_dir - self.wikiteam_crawler.crawl(api_url, dump_dir) + try: + self.wikiteam_crawler.crawl(api_url, dump_dir) + except Exception as e: + logging.error(f"Error crawling {api_url}: {e}") + logging.warning("Removing incomplete dumped data if available!") + if os.path.exists(dump_dir): + shutil.rmtree(dump_dir) + raise e def transform(self) -> list[Document]: pages = parse_mediawiki_xml(file_dir=self.dump_dir)