Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 31 additions & 6 deletions api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,50 @@ authors = [
{name = "Jeremy Dyer", email = "jdyer@nvidia.com"}
]
license = {file = "LICENSE"}
requires-python = ">=3.11"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"aiohttp",
"backoff==2.2.1",
"diskcache>=5.6.3",
"fastparquet>=2024.11.0",
"ffmpeg-python==0.2.0",
"fsspec>=2025.5.1",
"glom",
"grpcio",
"langdetect>=1.0.9",
"librosa==0.10.2",
"markitdown",
"minio>=7.2.12",
"numpy",
"nvidia-riva-client==2.20.0",
"opencv-python",
"pandas>=2.0",
"Pillow",
"pydantic>2.0.0",
"pydantic-settings>2.0.0",
"fsspec>=2025.5.1",
"pymilvus>=2.5.10",
"pypdfium2>=4.30.0,<5", # TODO: migrate get_pos() -> get_bounds() and audit other v5 API changes
"python-dateutil",
"python-docx>=1.1.2",
"python-pptx>=1.0.2",
"redis>=5.2.1",
"requests>=2.28.2",
"scikit-learn>=1.6.0",
"scipy>=1.15.1",
"tqdm>=4.67.1",
"transformers>=4.47.0",
"tritonclient[grpc]",
"universal_pathlib>=0.2.6",
"ffmpeg-python==0.2.0",
"tritonclient",
"glom",
"pypdfium2>=4.30.0",
"unstructured-client",
]

[project.optional-dependencies]
gpu = ["cudf"]
llm = ["openai"]
test = [
"moviepy==2.2.1",
]
Expand Down
4 changes: 2 additions & 2 deletions api/src/nv_ingest_api/util/pdf/pdfium.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def extract_simple_images_from_pdfium_page(page, max_depth):
# Attempt to retrieve the image bitmap
image_numpy: np.ndarray = pdfium_try_get_bitmap_as_numpy(obj) # noqa
image_base64: str = numpy_to_base64(image_numpy, format=YOLOX_PAGE_IMAGE_FORMAT)
image_bbox = obj.get_pos()
image_bbox = obj.get_pos() # TODO: pypdfium2 v5 renames get_pos() -> get_bounds(); update when pinning is relaxed
image_size = obj.get_size()
if image_size[0] < 10 and image_size[1] < 10:
continue
Expand Down Expand Up @@ -336,7 +336,7 @@ def extract_merged_images_from_pdfium_page(page, merge=True, **kwargs):
filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,),
max_depth=1,
):
image_bbox = convert_pdfium_position(obj.get_pos(), page_width, page_height)
image_bbox = convert_pdfium_position(obj.get_pos(), page_width, page_height) # TODO: pypdfium2 v5 renames get_pos() -> get_bounds(); update when pinning is relaxed
image_bboxes.append(image_bbox)

# If no merging is requested or no bounding boxes exist, return the list as is
Expand Down
2,883 changes: 2,845 additions & 38 deletions api/uv.lock

Large diffs are not rendered by default.

11 changes: 7 additions & 4 deletions client/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,20 @@ classifiers = [
"Operating System :: OS Independent",
]
dependencies = [
"build>=1.2.2",
"charset-normalizer>=3.4.1",
"click>=8.1.8",
"fsspec>=2025.2.0",
"httpx>=0.28.1",
"lancedb>=0.25.3",
"numpy",
"opensearch-py",
"pandas>=2.0",
"Pillow",
"pyarrow",
"pydantic>2.0.0",
"pydantic-settings>2.0.0",
"requests>=2.28.2",
"setuptools>=78.1.1",
"scipy",
"tqdm>=4.67.1",
"lancedb>=0.25.3",
]

[project.optional-dependencies]
Expand Down
166 changes: 123 additions & 43 deletions client/uv.lock

Large diffs are not rendered by default.

38 changes: 18 additions & 20 deletions conda/packages/nv_ingest/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,44 +27,42 @@ requirements:
- python=3.12.11
- setuptools>=78.1.1
run:
- azure-core>=1.32.0
- click>=8.1.7
- fastapi>=0.115.6
- fastparquet>=2024.11.0
- fsspec>=2024.10.0
- gunicorn
- h11>=0.16.0 # Must pin at or above 0.16.0 for CVE mitigation
- httpx>=0.28.1
- isodate>=0.7.2
- langdetect>=1.0.9
- minio>=7.2.12
- nv-ingest-api
- nv-ingest-client
- opentelemetry-api>=1.27.0
- opentelemetry-exporter-otlp>=1.27.0
- opentelemetry-sdk>=1.27.0
- pandas>=2.0
- prometheus-client
- protobuf>=5.29.3
- psutil>=7.1.0
- pyarrow
- pydantic>=2.0.0
- pypdfium2>=4.30.0
- pytest>=8.0.2
- pytest-mock>=3.14.0
- pydantic-settings>=2.0.0
- pymilvus>=2.5.10
- pypdfium2==4.30.0
- pyyaml
- python>=3.12.11
- python-docx>=1.1.2
- python-dotenv>=1.0.1
- python-magic>=0.4.27
- python-pptx>=1.0.2
- ray-all>=2.52.0
- redis-py>=5.2.1
- requests>=2.28.2
- scipy>=1.15.1
- rich
- s3fs>=2024.10.0
- setuptools>=78.1.1
- tabulate>=0.9.0
- tqdm>=4.67.1
- transformers>=4.57.3
# - unstructured-client>=0.25.9
- uvicorn
- universal_pathlib
pip:
- nvidia-riva-client==2.20.0
- markitdown
- pymilvus[bulk_writer,model]>=2.5.10

test:
requires:
- pytest>=8.0.2
- pytest-mock>=3.14.0
commands:
- pytest ./tests

Expand Down
48 changes: 33 additions & 15 deletions conda/packages/nv_ingest_api/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,30 +27,48 @@ requirements:
- python=3.12.11
- setuptools>=78.1.1
run:
- azure-core>=1.32.0
- aiohttp
- backoff==2.2.1
- diskcache>=5.6.3
- fastparquet>=2024.11.0
- fsspec>=2024.10.0
- httpx>=0.28.1
- isodate>=0.7.2
- ffmpeg-python==0.2.0
- fsspec>=2025.5.1
- glom
- grpcio
- langdetect>=1.0.9
- protobuf=5.29.3
- librosa==0.10.2
- minio>=7.2.12
- numpy
- pandas>=2.0
- pillow
- protobuf>=5.29.3 # intentional lower bound (minimum version, not an exact pin)
- pydantic>=2.0.0
- pypdfium2>=4.30.0
- pytest>=8.0.2
- pytest-mock>=3.14.0
- py-opencv
- pypdfium2>=4.30.0,<5 # TODO: migrate get_pos() -> get_bounds() and audit other v5 API changes
- python>=3.12.11
- python-dateutil
- python-docx>=1.1.2
- python-dotenv>=1.0.1
- python-magic>=0.4.27
- python-pptx>=1.0.2
- redis-py>=5.2.1
- requests>=2.28.2
- setuptools>=78.1.1
- tabulate>=0.9.0
- transformers>=4.57.3
- universal-pathlib
# - unstructured-client>=0.25.9
- scikit-learn>=1.6.0
- scipy>=1.15.1
- tqdm>=4.67.1
- transformers>=4.47.0
- universal-pathlib>=0.2.6
pip:
- nvidia-riva-client==2.20.0
- markitdown
- pymilvus>=2.5.10
- tritonclient[grpc]
- unstructured-client
# Note: gpu extras (cudf) and llm extras (openai) are not available via conda; install via pip

test:
requires:
- pytest>=8.0.2
- pytest-mock>=3.14.0
- moviepy==2.2.1
commands:
- pytest ./tests

Expand Down
15 changes: 10 additions & 5 deletions conda/packages/nv_ingest_client/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,21 @@ requirements:
- setuptools>=78.1.1
run:
- click>=8.1.7
- fsspec>=2024.10.0
- fsspec>=2025.2.0
- httpx>=0.28.1
- numpy
- pandas>=2.0
- pillow
- pyarrow
- pydantic>=2.0.0
- pypdfium2>=4.30.0
- python>=3.12
- python-docx>=1.1.2
- python-pptx>=1.0.2
- requests>=2.28.2
- setuptools>=78.1.1
- scipy
- tqdm>=4.67.1
pip:
- lancedb>=0.25.3
- opensearch-py
# Note: milvus extras (pymilvus) and minio extras are optional; install via pip

test:
commands:
Expand Down
49 changes: 19 additions & 30 deletions src/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,60 +14,49 @@ authors = [
{name = "Jeremy Dyer", email = "jdyer@nvidia.com"}
]
license = {file = "LICENSE"}
requires-python = ">=3.11"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"azure-core>=1.32.0",
"click>=8.1.7",
"diskcache>=5.6.3",
"fastapi>=0.115.6",
"fastparquet>=2024.11.0",
"fsspec>=2024.10.0",
"universal_pathlib>=0.2.6",
"s3fs>=2024.10.0",
"gunicorn",
"h11>=0.16.0", # Must pin at or above 0.16.0 for CVE mitigation
"httpx>=0.28.1",
"isodate>=0.7.2",
"langdetect>=1.0.9",
"minio>=7.2.12",
"librosa==0.10.2",
"nv-ingest-api",
"nv-ingest-client",
"opentelemetry-api>=1.27.0",
"opentelemetry-exporter-otlp>=1.27.0",
"opentelemetry-sdk>=1.27.0",
"pandas>=2.0",
"prometheus-client",
"psutil>=7.1.0",
"pyarrow",
"pydantic>2.0.0",
"pydantic-settings>2.0.0",
"pymilvus>=2.5.10",
"pymilvus[bulk_writer, model]",
"pypdfium2==4.30.0",
"pytest>=8.0.2",
"pytest-mock>=3.14.0",
"pytest-cov>=6.0.0",
"build>=1.2.2",
"python-docx>=1.1.2",
"python-dotenv>=1.0.1",
"python-pptx>=1.0.2",
"prometheus-client",
"PyYAML",
"ray[all]>=2.49.0",
"redis>=5.2.1",
"requests>=2.28.2",
"scikit-learn>=1.6.0",
"scipy>=1.15.1",
"rich",
"s3fs>=2024.10.0",
"setuptools>=78.1.1",
"tabulate>=0.9.0",
"transformers>=4.47.0",
"tqdm>=4.67.1",
"uvicorn",
"pip",
"opencv-python", # For some reason conda can't solve our requirement set with py-opencv, so we need to use pip
"pymilvus>=2.5.10",
"pymilvus[bulk_writer, model]",
"tritonclient",
"nvidia-riva-client==2.20.0",
"unstructured-client",
"markitdown",
]

[project.optional-dependencies]
test = [
"build>=1.2.2",
"pytest>=8.0.2",
"pytest-cov>=6.0.0",
"pytest-mock>=3.14.0",
]

[project.urls]
Expand Down
Loading
Loading