From 62ee10f32d0ea9a1047cdc67b10b65e969b40ef7 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Tue, 15 Apr 2025 19:43:41 +0330
Subject: [PATCH 1/2] feat: add spacy model download and some logs!

---
 Dockerfile                    | 1 +
 hivemind_etl/mediawiki/etl.py | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 6bf502d..38bf556 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,6 +2,7 @@ FROM python:3.11-bullseye AS base
 WORKDIR /project
 COPY . .
 RUN pip install --no-cache-dir -r requirements.txt
+RUN python -m spacy download en_core_web_sm
 
 FROM base AS test
 RUN chmod +x docker-entrypoint.sh
diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py
index 07be3bf..ee415c9 100644
--- a/hivemind_etl/mediawiki/etl.py
+++ b/hivemind_etl/mediawiki/etl.py
@@ -94,10 +94,13 @@ def transform(self) -> list[Document]:
         return documents
 
     def load(self, documents: list[Document]) -> None:
+        logging.info(f"Loading {len(documents)} documents into Qdrant!")
         ingestion_pipeline = CustomIngestionPipeline(
             self.community_id, collection_name="mediawiki"
         )
         ingestion_pipeline.run_pipeline(documents)
+        logging.info(f"Loaded {len(documents)} documents into Qdrant!")
 
         if self.delete_dump_after_load:
+            logging.info(f"Removing dump directory {self.dump_dir}!")
             shutil.rmtree(self.dump_dir)

From 42d5d06733fb19aa7c1da60ce6d8b3cc4f7378c8 Mon Sep 17 00:00:00 2001
From: Mohammad Amin <dadgaramin96@gmail.com>
Date: Tue, 15 Apr 2025 21:33:56 +0330
Subject: [PATCH 2/2] fix: document conversion is now added!

---
 hivemind_etl/mediawiki/activities.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hivemind_etl/mediawiki/activities.py b/hivemind_etl/mediawiki/activities.py
index 319d899..69047b1 100644
--- a/hivemind_etl/mediawiki/activities.py
+++ b/hivemind_etl/mediawiki/activities.py
@@ -96,8 +96,11 @@ async def load_mediawiki_data(mediawiki_platform: dict[str, Any]) -> None:
     """Load the transformed MediaWiki data into the database."""
     community_id = mediawiki_platform["community_id"]
     namespaces = mediawiki_platform["namespaces"]
+
     try:
-        documents = mediawiki_platform["documents"]
+        documents_dict = mediawiki_platform["documents"]
+        # Temporal converted the documents to dicts, so we need to convert them back to Document objects
+        documents = [Document.from_dict(doc) for doc in documents_dict]
 
         logging.info(f"Starting data load for community {community_id}")
         mediawiki_etl = MediawikiETL(community_id=community_id, namespaces=namespaces)