diff --git a/Dockerfile b/Dockerfile index 6bf502d..38bf556 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM python:3.11-bullseye AS base WORKDIR /project COPY . . RUN pip install --no-cache-dir -r requirements.txt +RUN python -m spacy download en_core_web_sm FROM base AS test RUN chmod +x docker-entrypoint.sh diff --git a/hivemind_etl/mediawiki/activities.py b/hivemind_etl/mediawiki/activities.py index 319d899..69047b1 100644 --- a/hivemind_etl/mediawiki/activities.py +++ b/hivemind_etl/mediawiki/activities.py @@ -96,8 +96,11 @@ async def load_mediawiki_data(mediawiki_platform: dict[str, Any]) -> None: """Load the transformed MediaWiki data into the database.""" community_id = mediawiki_platform["community_id"] namespaces = mediawiki_platform["namespaces"] + try: - documents = mediawiki_platform["documents"] + documents_dict = mediawiki_platform["documents"] + # Temporal converts the documents to dicts, so convert them back to Document objects + documents = [Document.from_dict(doc) for doc in documents_dict] logging.info(f"Starting data load for community {community_id}") mediawiki_etl = MediawikiETL(community_id=community_id, namespaces=namespaces) diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py index 07be3bf..ee415c9 100644 --- a/hivemind_etl/mediawiki/etl.py +++ b/hivemind_etl/mediawiki/etl.py @@ -94,10 +94,13 @@ def transform(self) -> list[Document]: return documents def load(self, documents: list[Document]) -> None: + logging.info(f"Loading {len(documents)} documents into Qdrant!") ingestion_pipeline = CustomIngestionPipeline( self.community_id, collection_name="mediawiki" ) ingestion_pipeline.run_pipeline(documents) + logging.info(f"Loaded {len(documents)} documents into Qdrant!") if self.delete_dump_after_load: + logging.info(f"Removing dump directory {self.dump_dir}!") shutil.rmtree(self.dump_dir)