From 4298fc320e7abe01e183a9ad372abd9c908f1736 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Tue, 20 Jan 2026 14:14:24 -0800 Subject: [PATCH 1/2] working with pipeline models Signed-off-by: Vibhu Jawa --- .../internal/extract/image/chart_extractor.py | 9 ++++----- .../extract/image/infographic_extractor.py | 9 ++++----- .../internal/extract/image/ocr_extractor.py | 9 ++++----- .../internal/extract/image/table_extractor.py | 9 ++++----- .../primitives/nim/model_interface/ocr.py | 18 +++++++++++++++++- 5 files changed, 33 insertions(+), 21 deletions(-) diff --git a/api/src/nv_ingest_api/internal/extract/image/chart_extractor.py b/api/src/nv_ingest_api/internal/extract/image/chart_extractor.py index 7e2fea053..508a6c090 100644 --- a/api/src/nv_ingest_api/internal/extract/image/chart_extractor.py +++ b/api/src/nv_ingest_api/internal/extract/image/chart_extractor.py @@ -98,7 +98,7 @@ def _run_chart_inference( model_name="paddle", max_batch_size=1 if ocr_client.protocol == "grpc" else 2, ) - elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}: + elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python", "pipeline"}: future_ocr_kwargs.update( model_name=ocr_model_name, input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"], @@ -239,9 +239,10 @@ def _create_ocr_client( ocr_model_name: str, auth_token: str, ) -> NimClient: + nemo_retriever_ocr_models = {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python", "pipeline"} ocr_model_interface = ( NemoRetrieverOCRModelInterface() - if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} + if ocr_model_name in nemo_retriever_ocr_models else PaddleOCRModelInterface() ) @@ -250,9 +251,7 @@ def _create_ocr_client( model_interface=ocr_model_interface, auth_token=auth_token, infer_protocol=ocr_protocol, - enable_dynamic_batching=( - True if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} else 
False - ), + enable_dynamic_batching=(True if ocr_model_name in nemo_retriever_ocr_models else False), dynamic_batch_memory_budget_mb=32, ) diff --git a/api/src/nv_ingest_api/internal/extract/image/infographic_extractor.py b/api/src/nv_ingest_api/internal/extract/image/infographic_extractor.py index 048ff979c..13521804d 100644 --- a/api/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +++ b/api/src/nv_ingest_api/internal/extract/image/infographic_extractor.py @@ -107,7 +107,7 @@ def _update_infographic_metadata( model_name="paddle", max_batch_size=1 if ocr_client.protocol == "grpc" else 2, ) - elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}: + elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python", "pipeline"}: infer_kwargs.update( model_name=ocr_model_name, input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"], @@ -151,9 +151,10 @@ def _create_ocr_client( ocr_model_name: str, auth_token: str, ) -> NimClient: + nemo_retriever_ocr_models = {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python", "pipeline"} ocr_model_interface = ( NemoRetrieverOCRModelInterface() - if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} + if ocr_model_name in nemo_retriever_ocr_models else PaddleOCRModelInterface() ) @@ -162,9 +163,7 @@ def _create_ocr_client( model_interface=ocr_model_interface, auth_token=auth_token, infer_protocol=ocr_protocol, - enable_dynamic_batching=( - True if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} else False - ), + enable_dynamic_batching=(True if ocr_model_name in nemo_retriever_ocr_models else False), dynamic_batch_memory_budget_mb=32, ) diff --git a/api/src/nv_ingest_api/internal/extract/image/ocr_extractor.py b/api/src/nv_ingest_api/internal/extract/image/ocr_extractor.py index 31e69c481..6e4f47851 100644 --- a/api/src/nv_ingest_api/internal/extract/image/ocr_extractor.py +++ 
b/api/src/nv_ingest_api/internal/extract/image/ocr_extractor.py @@ -107,7 +107,7 @@ def _update_text_metadata( model_name="paddle", max_batch_size=1 if ocr_client.protocol == "grpc" else 2, ) - elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}: + elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python", "pipeline"}: infer_kwargs.update( model_name=ocr_model_name, input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"], @@ -141,9 +141,10 @@ def _create_ocr_client( ocr_model_name: str, auth_token: str, ) -> NimClient: + nemo_retriever_ocr_models = {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python", "pipeline"} ocr_model_interface = ( NemoRetrieverOCRModelInterface() - if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} + if ocr_model_name in nemo_retriever_ocr_models else PaddleOCRModelInterface() ) @@ -152,9 +153,7 @@ def _create_ocr_client( model_interface=ocr_model_interface, auth_token=auth_token, infer_protocol=ocr_protocol, - enable_dynamic_batching=( - True if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} else False - ), + enable_dynamic_batching=(True if ocr_model_name in nemo_retriever_ocr_models else False), dynamic_batch_memory_budget_mb=32, ) diff --git a/api/src/nv_ingest_api/internal/extract/image/table_extractor.py b/api/src/nv_ingest_api/internal/extract/image/table_extractor.py index ad188de5d..74f4ed3c8 100644 --- a/api/src/nv_ingest_api/internal/extract/image/table_extractor.py +++ b/api/src/nv_ingest_api/internal/extract/image/table_extractor.py @@ -100,7 +100,7 @@ def _run_inference( model_name="paddle", max_batch_size=1 if ocr_client.protocol == "grpc" else 2, ) - elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}: + elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python", "pipeline"}: future_ocr_kwargs.update( 
model_name=ocr_model_name, input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"], @@ -248,9 +248,10 @@ def _create_ocr_client( ocr_model_name: str, auth_token: str, ) -> NimClient: + nemo_retriever_ocr_models = {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python", "pipeline"} ocr_model_interface = ( NemoRetrieverOCRModelInterface() - if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} + if ocr_model_name in nemo_retriever_ocr_models else PaddleOCRModelInterface() ) @@ -259,9 +260,7 @@ def _create_ocr_client( model_interface=ocr_model_interface, auth_token=auth_token, infer_protocol=ocr_protocol, - enable_dynamic_batching=( - True if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} else False - ), + enable_dynamic_batching=(True if ocr_model_name in nemo_retriever_ocr_models else False), dynamic_batch_memory_budget_mb=32, ) diff --git a/api/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py b/api/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py index 6dc1cf089..1b70ec59f 100644 --- a/api/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +++ b/api/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py @@ -24,6 +24,7 @@ NEMORETRIEVER_OCR_MODEL_NAME = "scene_text_wrapper" NEMORETRIEVER_OCR_ENSEMBLE_MODEL_NAME = "scene_text_ensemble" NEMORETRIEVER_OCR_BLS_MODEL_NAME = "scene_text_python" +NEMORETRIEVER_OCR_PIPELINE_MODEL_NAME = "pipeline" logger = logging.getLogger(__name__) @@ -234,11 +235,26 @@ def _extract_content_from_ocr_grpc_response( if not isinstance(response, np.ndarray): raise ValueError("Unexpected response format: response is not a NumPy array.") - if model_name in [ + # Handle different response formats from OCR models + if model_name == NEMORETRIEVER_OCR_PIPELINE_MODEL_NAME: + # Pipeline model returns flat array (N*3,) with interleaved data: + # [bbox0, text0, conf0, bbox1, text1, conf1, ...] 
+ # Reshape to (3, N) format + if response.ndim == 1: + if response.shape[0] % 3 != 0: + raise ValueError( + f"Pipeline response length {response.shape[0]} is not divisible by 3. " + "Expected format: [bbox0, text0, conf0, bbox1, text1, conf1, ...]" + ) + batch_size = response.shape[0] // 3 + # Reshape from (N*3,) to (N, 3) then transpose to (3, N) + response = response.reshape(batch_size, 3).transpose((1, 0)) + elif model_name in [ NEMORETRIEVER_OCR_MODEL_NAME, NEMORETRIEVER_OCR_ENSEMBLE_MODEL_NAME, NEMORETRIEVER_OCR_BLS_MODEL_NAME, ]: + # Other NemoRetriever models return (batch_size, 3), transpose to (3, batch_size) response = response.transpose((1, 0)) # If we have shape (3,), convert to (3, 1) From 27c3243a335ee0b744b1453dd12a890c3f891e10 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Tue, 20 Jan 2026 23:32:08 -0800 Subject: [PATCH 2/2] add a multi gpu yaml that i use for benchmarking --- docker-compose.multigpu.yaml | 95 ++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 docker-compose.multigpu.yaml diff --git a/docker-compose.multigpu.yaml b/docker-compose.multigpu.yaml new file mode 100644 index 000000000..1c18046c7 --- /dev/null +++ b/docker-compose.multigpu.yaml @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Docker Compose Override File +# This file allows you to customize environment variables for NIM services +# and override the nv-ingest-ms-runtime inference protocols to use HTTP. 
+# +# Usage: docker compose -f docker-compose.yaml -f docker-compose.multigpu.yaml up + +services: + # NV-Ingest Runtime - Mount config and set custom pipeline + nv-ingest-ms-runtime: + volumes: + - ./config:/workspace/config + environment: + - INGEST_CONFIG_PATH=/workspace/config/default_pipeline.yaml + + # Page Elements NIM - Customize environment variables as needed + page-elements: + environment: + - CUDA_VISIBLE_DEVICES=0 + - NIM_TRITON_PIPELINE_MAX_BATCH_SIZE=64 + - NIM_TRITON_PIPELINE_MAX_QUEUE_DELAY_MICROSECONDS=5000 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=16 + - NIM_TRITON_MODEL_MAX_BATCH_SIZE=16 + - NIM_TRITON_MODEL_MAX_QUEUE_DELAY_MICROSECONDS=5000 + - NIM_TRITON_WORKER_INSTANCE_COUNT=8 + - NIM_TRITON_ENABLE_PIPELINE_TIMING=true + - NIM_TRITON_RATE_LIMIT=256 # Override base value of 3 to allow high concurrency + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["${PAGE_ELEMENTS_GPU_ID:-0}"] + capabilities: [gpu] + # Graphic Elements NIM - Customize environment variables as needed + graphic-elements: + environment: + - CUDA_VISIBLE_DEVICES=0 + - NIM_TRITON_PIPELINE_MAX_BATCH_SIZE=64 + - NIM_TRITON_PIPELINE_MAX_QUEUE_DELAY_MICROSECONDS=5000 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=16 + - NIM_TRITON_MODEL_MAX_BATCH_SIZE=16 + - NIM_TRITON_MODEL_MAX_QUEUE_DELAY_MICROSECONDS=5000 + - NIM_TRITON_WORKER_INSTANCE_COUNT=8 + - NIM_TRITON_ENABLE_PIPELINE_TIMING=true + - NIM_TRITON_RATE_LIMIT=256 # Override base value of 3 to allow high concurrency + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["${GRAPHIC_ELEMENTS_GPU_ID:-0}"] + capabilities: [gpu] + # Table Structure NIM - Customize environment variables as needed + table-structure: + environment: + - CUDA_VISIBLE_DEVICES=0 + - NIM_TRITON_PIPELINE_MAX_BATCH_SIZE=64 + - NIM_TRITON_PIPELINE_MAX_QUEUE_DELAY_MICROSECONDS=5000 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=16 + - NIM_TRITON_MODEL_MAX_BATCH_SIZE=16 + - NIM_TRITON_MODEL_MAX_QUEUE_DELAY_MICROSECONDS=5000 + - 
NIM_TRITON_WORKER_INSTANCE_COUNT=8 + - NIM_TRITON_ENABLE_PIPELINE_TIMING=true + - NIM_TRITON_RATE_LIMIT=256 # Override base value of 3 to allow high concurrency + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["${TABLE_STRUCTURE_GPU_ID:-0}"] + capabilities: [gpu] + + # OCR NIM - Customize environment variables as needed + ocr: + environment: + - CUDA_VISIBLE_DEVICES=0 + - NIM_TRITON_PIPELINE_MAX_BATCH_SIZE=64 + - NIM_TRITON_PIPELINE_MAX_QUEUE_DELAY_MICROSECONDS=5000 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=16 + - NIM_TRITON_MODEL_MAX_BATCH_SIZE=16 + - NIM_TRITON_MODEL_MAX_QUEUE_DELAY_MICROSECONDS=5000 + - NIM_TRITON_WORKER_INSTANCE_COUNT=8 + - NIM_TRITON_ENABLE_PIPELINE_TIMING=true + - NIM_TRITON_RATE_LIMIT=256 # Override base value of 3 to allow high concurrency + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["${OCR_GPU_ID:-1}"] + capabilities: [gpu] \ No newline at end of file