NVIDIA · charlesbluca · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026
@@ -14,25 +14,50 @@ authors = [
     {name = "Jeremy Dyer", email = "jdyer@nvidia.com"}
 ]
 license = {file = "LICENSE"}
+requires-python = ">=3.11"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
 dependencies = [
+    "aiohttp",
     "backoff==2.2.1",
+    "diskcache>=5.6.3",
+    "fastparquet>=2024.11.0",
+    "ffmpeg-python==0.2.0",
+    "fsspec>=2025.5.1",
+    "glom",
+    "grpcio",
+    "langdetect>=1.0.9",
+    "librosa==0.10.2",
+    "markitdown",
+    "minio>=7.2.12",
+    "numpy",
+    "nvidia-riva-client==2.20.0",
+    "opencv-python",
     "pandas>=2.0",
+    "Pillow",
     "pydantic>2.0.0",
-    "pydantic-settings>2.0.0",
-    "fsspec>=2025.5.1",
+    "pymilvus>=2.5.10",
+    "pypdfium2>=4.30.0,<5",  # TODO: migrate get_pos() -> get_bounds() and audit other v5 API changes
+    "python-dateutil",
+    "python-docx>=1.1.2",
+    "python-pptx>=1.0.2",
+    "redis>=5.2.1",
+    "requests>=2.28.2",
+    "scikit-learn>=1.6.0",
+    "scipy>=1.15.1",
+    "tqdm>=4.67.1",
+    "transformers>=4.47.0",
+    "tritonclient[grpc]",
     "universal_pathlib>=0.2.6",
-    "ffmpeg-python==0.2.0",
-    "tritonclient",
-    "glom",
-    "pypdfium2>=4.30.0",
+    "unstructured-client",
 ]
 
 [project.optional-dependencies]
+gpu = ["cudf"]
+llm = ["openai"]
 test = [
     "moviepy==2.2.1",
 ]

@@ -291,7 +291,7 @@ def extract_simple_images_from_pdfium_page(page, max_depth):
             # Attempt to retrieve the image bitmap
             image_numpy: np.ndarray = pdfium_try_get_bitmap_as_numpy(obj)  # noqa
             image_base64: str = numpy_to_base64(image_numpy, format=YOLOX_PAGE_IMAGE_FORMAT)
-            image_bbox = obj.get_pos()
+            image_bbox = obj.get_pos()  # TODO: pypdfium2 v5 renames get_pos() -> get_bounds(); update when pinning is relaxed
             image_size = obj.get_size()
             if image_size[0] < 10 and image_size[1] < 10:
                 continue
@@ -336,7 +336,7 @@ def extract_merged_images_from_pdfium_page(page, merge=True, **kwargs):
         filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,),
         max_depth=1,
     ):
-        image_bbox = convert_pdfium_position(obj.get_pos(), page_width, page_height)
+        image_bbox = convert_pdfium_position(obj.get_pos(), page_width, page_height)  # TODO: pypdfium2 v5 renames get_pos() -> get_bounds(); update when pinning is relaxed
         image_bboxes.append(image_bbox)
 
     # If no merging is requested or no bounding boxes exist, return the list as is

@@ -20,17 +20,20 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "build>=1.2.2",
     "charset-normalizer>=3.4.1",
     "click>=8.1.8",
     "fsspec>=2025.2.0",
     "httpx>=0.28.1",
+    "lancedb>=0.25.3",
+    "numpy",
+    "opensearch-py",
+    "pandas>=2.0",
+    "Pillow",
+    "pyarrow",
     "pydantic>2.0.0",
-    "pydantic-settings>2.0.0",
     "requests>=2.28.2",
-    "setuptools>=78.1.1",
+    "scipy",
     "tqdm>=4.67.1",
-    "lancedb>=0.25.3",
 ]
 
 [project.optional-dependencies]

@@ -27,44 +27,42 @@ requirements:
     - python=3.12.11
     - setuptools>=78.1.1
   run:
-    - azure-core>=1.32.0
     - click>=8.1.7
     - fastapi>=0.115.6
-    - fastparquet>=2024.11.0
     - fsspec>=2024.10.0
+    - gunicorn
+    - h11>=0.16.0  # Must pin at or above 0.16.0 for CVE mitigation
     - httpx>=0.28.1
-    - isodate>=0.7.2
-    - langdetect>=1.0.9
-    - minio>=7.2.12
+    - nv-ingest-api
+    - nv-ingest-client
     - opentelemetry-api>=1.27.0
     - opentelemetry-exporter-otlp>=1.27.0
     - opentelemetry-sdk>=1.27.0
+    - pandas>=2.0
+    - prometheus-client
     - protobuf>=5.29.3
+    - psutil>=7.1.0
+    - pyarrow
     - pydantic>=2.0.0
-    - pypdfium2>=4.30.0
-    - pytest>=8.0.2
-    - pytest-mock>=3.14.0
+    - pydantic-settings>=2.0.0
+    - pymilvus>=2.5.10
+    - pypdfium2==4.30.0
+    - pyyaml
     - python>=3.12.11
-    - python-docx>=1.1.2
-    - python-dotenv>=1.0.1
-    - python-magic>=0.4.27
-    - python-pptx>=1.0.2
     - ray-all>=2.52.0
     - redis-py>=5.2.1
     - requests>=2.28.2
-    - scipy>=1.15.1
+    - rich
+    - s3fs>=2024.10.0
     - setuptools>=78.1.1
-    - tabulate>=0.9.0
-    - tqdm>=4.67.1
-    - transformers>=4.57.3
-    # - unstructured-client>=0.25.9
     - uvicorn
-    - universal_pathlib
   pip:
-    - nvidia-riva-client==2.20.0
-    - markitdown
+    - pymilvus[bulk_writer,model]>=2.5.10
 
   test:
+    requires:
+      - pytest>=8.0.2
+      - pytest-mock>=3.14.0
     commands:
       - pytest ./tests
 

@@ -27,30 +27,48 @@ requirements:
     - python=3.12.11
     - setuptools>=78.1.1
   run:
-    - azure-core>=1.32.0
+    - aiohttp
+    - backoff==2.2.1
+    - diskcache>=5.6.3
     - fastparquet>=2024.11.0
-    - fsspec>=2024.10.0
-    - httpx>=0.28.1
-    - isodate>=0.7.2
+    - ffmpeg-python==0.2.0
+    - fsspec>=2025.5.1
+    - glom
+    - grpcio
     - langdetect>=1.0.9
-    - protobuf=5.29.3
+    - librosa==0.10.2
+    - minio>=7.2.12
+    - numpy
+    - pandas>=2.0
+    - pillow
+    - protobuf>=5.29.3  # intentional lower bound (relaxed from the previous =5.29.3 pin)
     - pydantic>=2.0.0
-    - pypdfium2>=4.30.0
-    - pytest>=8.0.2
-    - pytest-mock>=3.14.0
+    - py-opencv
+    - pypdfium2>=4.30.0,<5  # TODO: migrate get_pos() -> get_bounds() and audit other v5 API changes
     - python>=3.12.11
+    - python-dateutil
     - python-docx>=1.1.2
-    - python-dotenv>=1.0.1
-    - python-magic>=0.4.27
     - python-pptx>=1.0.2
+    - redis-py>=5.2.1
     - requests>=2.28.2
-    - setuptools>=78.1.1
-    - tabulate>=0.9.0
-    - transformers>=4.57.3
-    - universal-pathlib
-    # - unstructured-client>=0.25.9
+    - scikit-learn>=1.6.0
+    - scipy>=1.15.1
+    - tqdm>=4.67.1
+    - transformers>=4.47.0
+    - universal-pathlib>=0.2.6
+  pip:
+    - nvidia-riva-client==2.20.0
+    - markitdown
+    - pymilvus>=2.5.10
+    - tritonclient[grpc]
+    - unstructured-client
+  # Note: gpu extras (cudf) and llm extras (openai) are not declared in this conda recipe; install them separately (e.g. via pip)
 
   test:
+    requires:
+      - pytest>=8.0.2
+      - pytest-mock>=3.14.0
+      - moviepy==2.2.1
     commands:
       - pytest ./tests
 

@@ -28,16 +28,21 @@ requirements:
     - setuptools>=78.1.1
   run:
     - click>=8.1.7
-    - fsspec>=2024.10.0
+    - fsspec>=2025.2.0
     - httpx>=0.28.1
+    - numpy
+    - pandas>=2.0
+    - pillow
+    - pyarrow
     - pydantic>=2.0.0
-    - pypdfium2>=4.30.0
     - python>=3.12
-    - python-docx>=1.1.2
-    - python-pptx>=1.0.2
     - requests>=2.28.2
-    - setuptools>=78.1.1
+    - scipy
     - tqdm>=4.67.1
+  pip:
+    - lancedb>=0.25.3
+    - opensearch-py
+  # Note: milvus extras (pymilvus) and minio extras are optional; install via pip
 
   test:
     commands:

@@ -14,60 +14,49 @@ authors = [
     {name = "Jeremy Dyer", email = "jdyer@nvidia.com"}
 ]
 license = {file = "LICENSE"}
+requires-python = ">=3.11"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "azure-core>=1.32.0",
     "click>=8.1.7",
-    "diskcache>=5.6.3",
     "fastapi>=0.115.6",
-    "fastparquet>=2024.11.0",
     "fsspec>=2024.10.0",
-    "universal_pathlib>=0.2.6",
-    "s3fs>=2024.10.0",
     "gunicorn",
     "h11>=0.16.0", # Must pin at or above 0.16.0 for CVE mitigation
     "httpx>=0.28.1",
-    "isodate>=0.7.2",
-    "langdetect>=1.0.9",
-    "minio>=7.2.12",
-    "librosa==0.10.2",
+    "nv-ingest-api",
+    "nv-ingest-client",
     "opentelemetry-api>=1.27.0",
     "opentelemetry-exporter-otlp>=1.27.0",
     "opentelemetry-sdk>=1.27.0",
+    "pandas>=2.0",
+    "prometheus-client",
     "psutil>=7.1.0",
+    "pyarrow",
     "pydantic>2.0.0",
     "pydantic-settings>2.0.0",
+    "pymilvus>=2.5.10",
+    "pymilvus[bulk_writer, model]",
     "pypdfium2==4.30.0",
-    "pytest>=8.0.2",
-    "pytest-mock>=3.14.0",
-    "pytest-cov>=6.0.0",
-    "build>=1.2.2",
-    "python-docx>=1.1.2",
-    "python-dotenv>=1.0.1",
-    "python-pptx>=1.0.2",
-    "prometheus-client",
+    "PyYAML",
     "ray[all]>=2.49.0",
     "redis>=5.2.1",
     "requests>=2.28.2",
-    "scikit-learn>=1.6.0",
-    "scipy>=1.15.1",
+    "rich",
+    "s3fs>=2024.10.0",
     "setuptools>=78.1.1",
-    "tabulate>=0.9.0",
-    "transformers>=4.47.0",
-    "tqdm>=4.67.1",
     "uvicorn",
-    "pip",
-    "opencv-python", # For some reason conda cant solve our req set with py-opencv so we need to use pip
-    "pymilvus>=2.5.10",
-    "pymilvus[bulk_writer, model]",
-    "tritonclient",
-    "nvidia-riva-client==2.20.0",
-    "unstructured-client",
-    "markitdown",
+]
+
+[project.optional-dependencies]
+test = [
+    "build>=1.2.2",
+    "pytest>=8.0.2",
+    "pytest-cov>=6.0.0",
+    "pytest-mock>=3.14.0",
 ]
 
 [project.urls]