From 9f2e06a6e87386afc8f42793f3c4fcc4647ddcbc Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 15 Apr 2025 11:22:08 +0330 Subject: [PATCH] feat: trying to fix the missing ref_doc_id error on loading documents! --- hivemind_etl/mediawiki/etl.py | 5 ++++- requirements.txt | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py index fafa950..256c83a 100644 --- a/hivemind_etl/mediawiki/etl.py +++ b/hivemind_etl/mediawiki/etl.py @@ -42,9 +42,11 @@ def transform(self) -> list[Document]: documents: list[Document] = [] for page in pages: try: + # Generate a ref_doc_id if needed for newer llama-index versions + doc_id = page.page_id documents.append( Document( - doc_id=page.page_id, + doc_id=doc_id, text=page.revision.text, metadata={ "title": page.title, @@ -57,6 +59,7 @@ def transform(self) -> list[Document]: "contributor_user_id": page.revision.contributor.user_id, "sha1": page.revision.sha1, "model": page.revision.model, + "ref_doc_id": doc_id, # Add ref_doc_id to metadata }, excluded_embed_metadata_keys=[ "namespace", diff --git a/requirements.txt b/requirements.txt index 6bdc60d..144219e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ python-dotenv>=1.0.0, <2.0.0 -tc-hivemind-backend==1.4.0 +tc-hivemind-backend==1.4.2.post2 llama-index-storage-docstore-redis==0.1.2 llama-index-storage-docstore-mongodb==0.1.3 crawlee[playwright]==0.3.8