From 62ee10f32d0ea9a1047cdc67b10b65e969b40ef7 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 15 Apr 2025 19:43:41 +0330 Subject: [PATCH 1/2] feat: add spacy model download and some logs! --- Dockerfile | 1 + hivemind_etl/mediawiki/etl.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/Dockerfile b/Dockerfile index 6bf502d..38bf556 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM python:3.11-bullseye AS base WORKDIR /project COPY . . RUN pip install --no-cache-dir -r requirements.txt +RUN python -m spacy download en_core_web_sm FROM base AS test RUN chmod +x docker-entrypoint.sh diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py index 07be3bf..ee415c9 100644 --- a/hivemind_etl/mediawiki/etl.py +++ b/hivemind_etl/mediawiki/etl.py @@ -94,10 +94,13 @@ def transform(self) -> list[Document]: return documents def load(self, documents: list[Document]) -> None: + logging.info(f"Loading {len(documents)} documents into Qdrant!") ingestion_pipeline = CustomIngestionPipeline( self.community_id, collection_name="mediawiki" ) ingestion_pipeline.run_pipeline(documents) + logging.info(f"Loaded {len(documents)} documents into Qdrant!") if self.delete_dump_after_load: + logging.info(f"Removing dump directory {self.dump_dir}!") shutil.rmtree(self.dump_dir) From 42d5d06733fb19aa7c1da60ce6d8b3cc4f7378c8 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 15 Apr 2025 21:33:56 +0330 Subject: [PATCH 2/2] fix: document conversion is now added! 
--- hivemind_etl/mediawiki/activities.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hivemind_etl/mediawiki/activities.py b/hivemind_etl/mediawiki/activities.py index 319d899..69047b1 100644 --- a/hivemind_etl/mediawiki/activities.py +++ b/hivemind_etl/mediawiki/activities.py @@ -96,8 +96,11 @@ async def load_mediawiki_data(mediawiki_platform: dict[str, Any]) -> None: """Load the transformed MediaWiki data into the database.""" community_id = mediawiki_platform["community_id"] namespaces = mediawiki_platform["namespaces"] + try: - documents = mediawiki_platform["documents"] + documents_dict = mediawiki_platform["documents"] + # Temporal converted the documents to dicts, so we need to convert them back to Document objects + documents = [Document.from_dict(doc) for doc in documents_dict] logging.info(f"Starting data load for community {community_id}") mediawiki_etl = MediawikiETL(community_id=community_id, namespaces=namespaces)