diff --git a/api/api_tests/internal/schemas/transform/test_image_caption_schema.py b/api/api_tests/internal/schemas/transform/test_image_caption_schema.py index 12f24c358..6dbfd3784 100644 --- a/api/api_tests/internal/schemas/transform/test_image_caption_schema.py +++ b/api/api_tests/internal/schemas/transform/test_image_caption_schema.py @@ -14,6 +14,8 @@ def test_image_caption_extraction_schema_defaults(): assert schema.endpoint_url.startswith("https://") assert schema.prompt.startswith("Caption") assert schema.model_name.startswith("nvidia/") + assert schema.context_text_max_chars == 0 + assert schema.temperature == 1.0 assert schema.raise_on_failure is False @@ -40,6 +42,26 @@ def test_image_caption_extraction_schema_accepts_truthy_values(): assert schema.raise_on_failure is False +def test_image_caption_extraction_schema_context_text_max_chars_custom(): + schema = ImageCaptionExtractionSchema(context_text_max_chars=512) + assert schema.context_text_max_chars == 512 + + +def test_image_caption_extraction_schema_context_text_max_chars_none_coerced(): + schema = ImageCaptionExtractionSchema(context_text_max_chars=None) + assert schema.context_text_max_chars == 0 + + +def test_image_caption_extraction_schema_temperature_custom(): + schema = ImageCaptionExtractionSchema(temperature=0.5) + assert schema.temperature == 0.5 + + +def test_image_caption_extraction_schema_temperature_none_coerced(): + schema = ImageCaptionExtractionSchema(temperature=None) + assert schema.temperature == 1.0 + + def test_image_caption_extraction_schema_rejects_extra_fields(): with pytest.raises(ValidationError) as excinfo: ImageCaptionExtractionSchema(extra_field="oops") diff --git a/api/api_tests/internal/transform/test_caption_image.py b/api/api_tests/internal/transform/test_caption_image.py index 49e014405..a3d8327f7 100644 --- a/api/api_tests/internal/transform/test_caption_image.py +++ b/api/api_tests/internal/transform/test_caption_image.py @@ -72,6 +72,7 @@ def 
test_transform_image_create_vlm_caption_internal_happy_path( dummy_task_config["api_key"], dummy_task_config["endpoint_url"], dummy_task_config["model_name"], + temperature=1.0, ) # Assert captions updated correctly in the DataFrame @@ -125,6 +126,7 @@ def test_transform_image_create_vlm_caption_internal_uses_fallback_config( dummy_transform_config.api_key, dummy_transform_config.endpoint_url, dummy_transform_config.model_name, + temperature=1.0, ) # Assert captions updated correctly @@ -236,7 +238,7 @@ def test_generate_captions_happy_path(mock_scale, mock_create_client): # Assert infer called with correct data expected_payload = {"base64_images": ["scaled_b64img1", "scaled_b64img2"], "prompt": "describe this"} - mock_client.infer.assert_called_once_with(expected_payload, model_name="test_model") + mock_client.infer.assert_called_once_with(expected_payload, model_name="test_model", temperature=1.0) # Result matches mock captions assert result == ["Caption 1", "Caption 2"] @@ -280,6 +282,216 @@ def test_generate_captions_empty_images_returns_empty_list(mock_scale, mock_crea model_name="test_model", ) - mock_client.infer.assert_called_once_with({"base64_images": [], "prompt": "describe this"}, model_name="test_model") + mock_client.infer.assert_called_once_with( + {"base64_images": [], "prompt": "describe this"}, model_name="test_model", temperature=1.0 + ) assert result == [] + + +# --- _gather_context_text_for_image tests --- + + +def test_gather_context_text_page_match(): + """Page text is returned when page_number matches.""" + image_meta = { + "content_metadata": { + "type": "image", + "page_number": 3, + }, + } + page_text_map = {3: ["page three text", "more text"]} + result = module_under_test._gather_context_text_for_image(image_meta, page_text_map, 200) + assert result == "page three text more text" + + +def test_gather_context_text_truncation(): + """Text is truncated to max_chars.""" + image_meta = { + "content_metadata": { + "type": "image", + 
"page_number": 0, + }, + } + page_text_map = {0: ["a" * 500]} + result = module_under_test._gather_context_text_for_image(image_meta, page_text_map, 10) + assert len(result) == 10 + + +def test_gather_context_text_safety_cap(): + """Text is capped at _MAX_CONTEXT_TEXT_CHARS even if max_chars is larger.""" + image_meta = { + "content_metadata": { + "type": "image", + "page_number": 0, + }, + } + big_text = "x" * 10000 + page_text_map = {0: [big_text]} + result = module_under_test._gather_context_text_for_image(image_meta, page_text_map, 99999) + assert len(result) == module_under_test._MAX_CONTEXT_TEXT_CHARS + + +def test_gather_context_text_no_text(): + """Returns empty string when no text is available.""" + image_meta = { + "content_metadata": { + "type": "image", + "page_number": 5, + }, + } + result = module_under_test._gather_context_text_for_image(image_meta, {}, 200) + assert result == "" + + +def test_gather_context_text_wrong_page(): + """Returns empty string when page number doesn't match any text.""" + image_meta = { + "content_metadata": { + "type": "image", + "page_number": 99, + }, + } + page_text_map = {0: ["some text"]} + result = module_under_test._gather_context_text_for_image(image_meta, page_text_map, 200) + assert result == "" + + +# --- _build_prompt_with_context tests --- + + +def test_build_prompt_with_context(): + result = module_under_test._build_prompt_with_context("Caption this:", "nearby text") + assert result == "Text near this image:\n---\nnearby text\n---\n\nCaption this:" + + +def test_build_prompt_with_empty_context(): + result = module_under_test._build_prompt_with_context("Caption this:", "") + assert result == "Caption this:" + + +# --- _build_page_text_map tests --- + + +def test_build_page_text_map(): + df = pd.DataFrame( + [ + { + "metadata": { + "content": "text on page 0", + "content_metadata": {"type": "text", "page_number": 0}, + } + }, + { + "metadata": { + "content": "more on page 0", + "content_metadata": {"type": 
"text", "page_number": 0}, + } + }, + { + "metadata": { + "content": "image content", + "content_metadata": {"type": "image", "page_number": 0}, + } + }, + { + "metadata": { + "content": "page 1 text", + "content_metadata": {"type": "text", "page_number": 1}, + } + }, + ] + ) + result = module_under_test._build_page_text_map(df) + assert result == {0: ["text on page 0", "more on page 0"], 1: ["page 1 text"]} + + +# --- Context-enabled integration tests --- + + +@patch(f"{MODULE_UNDER_TEST}._generate_captions") +def test_transform_context_enabled_per_image_calls(mock_generate, dummy_transform_config): + """With context enabled, each image gets its own VLM call with enriched prompt.""" + df = pd.DataFrame( + [ + { + "metadata": { + "content": "b64_img1", + "content_metadata": {"type": "image", "page_number": 0}, + "image_metadata": {}, + } + }, + { + "metadata": { + "content": "page zero text", + "content_metadata": {"type": "text", "page_number": 0}, + "image_metadata": {}, + } + }, + ] + ) + mock_generate.return_value = ["caption_with_context"] + + task_config = { + "api_key": "key", + "prompt": "Caption this:", + "system_prompt": "sys", + "endpoint_url": "https://url", + "model_name": "model", + "context_text_max_chars": 500, + } + + result = transform_image_create_vlm_caption_internal(df.copy(), task_config, dummy_transform_config) + + # Should be called once (one image) + assert mock_generate.call_count == 1 + call_args = mock_generate.call_args + # The prompt should be enriched with context + assert "Text near this image:" in call_args[0][1] + assert "page zero text" in call_args[0][1] + assert "Caption this:" in call_args[0][1] + # The image should be passed individually + assert call_args[0][0] == ["b64_img1"] + # Caption should be set + assert result.iloc[0]["metadata"]["image_metadata"]["caption"] == "caption_with_context" + + +@patch(f"{MODULE_UNDER_TEST}._generate_captions") +def test_transform_temperature_forwarded(mock_generate, dummy_df_with_images, 
dummy_transform_config): + """Temperature from task_config is forwarded to _generate_captions.""" + mock_generate.return_value = ["c1", "c2"] + + task_config = { + "api_key": "key", + "prompt": "Describe", + "system_prompt": "sys", + "endpoint_url": "https://url", + "model_name": "model", + "temperature": 0.7, + } + + transform_image_create_vlm_caption_internal(dummy_df_with_images.copy(), task_config, dummy_transform_config) + + mock_generate.assert_called_once() + _, kwargs = mock_generate.call_args + assert kwargs["temperature"] == 0.7 + + +@patch(f"{MODULE_UNDER_TEST}._generate_captions") +def test_transform_context_disabled_batch_preserved( + mock_generate, dummy_df_with_images, dummy_task_config, dummy_transform_config +): + """With context disabled (default), batch behavior is unchanged.""" + mock_generate.return_value = ["c1", "c2"] + + _ = transform_image_create_vlm_caption_internal( + dummy_df_with_images.copy(), dummy_task_config, dummy_transform_config + ) + + # Should be called once in batch mode + mock_generate.assert_called_once() + call_args = mock_generate.call_args + # All images passed at once + assert call_args[0][0] == ["base64_image_1", "base64_image_2"] + # Prompt should NOT be enriched + assert "Text near this image:" not in call_args[0][1] diff --git a/api/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py b/api/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py index 84d78551f..458b58a77 100644 --- a/api/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +++ b/api/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py @@ -113,6 +113,8 @@ class IngestTaskCaptionSchema(BaseModelNoExt): prompt: Optional[str] = None system_prompt: Optional[str] = None model_name: Optional[str] = None + context_text_max_chars: Optional[int] = None + temperature: Optional[float] = None class IngestTaskFilterParamsSchema(BaseModelNoExt): diff --git 
# Hard upper bound on how much surrounding text may be injected into a single
# VLM prompt, regardless of what the caller requests.
_MAX_CONTEXT_TEXT_CHARS = 4096


def _gather_context_text_for_image(
    image_meta: Dict[str, Any],
    page_text_map: Dict[int, List[str]],
    max_chars: int,
) -> str:
    """
    Gather same-page text for an image to provide as VLM prompt context.

    Parameters
    ----------
    image_meta : dict
        The full metadata dict for the image row. Only
        ``content_metadata.page_number`` is read.
    page_text_map : dict
        Mapping of page number -> list of text strings, precomputed from the
        DataFrame's text rows (see ``_build_page_text_map``).
    max_chars : int
        Maximum number of characters to return. Clamped to the range
        ``[0, _MAX_CONTEXT_TEXT_CHARS]``; ``None`` and non-positive values
        yield an empty string.

    Returns
    -------
    str
        The page's text fragments joined by single spaces and truncated to the
        effective limit, or an empty string if nothing is available.
    """
    # Clamp on BOTH sides. Without the lower bound a negative ``max_chars``
    # would become ``combined[:-k]``, i.e. "everything except the last k
    # characters", which silently exceeds the requested budget.
    effective_max = max(0, min(max_chars or 0, _MAX_CONTEXT_TEXT_CHARS))
    if effective_max == 0:
        return ""

    # ``content_metadata`` may be absent or explicitly None.
    content_meta = (image_meta or {}).get("content_metadata") or {}
    page_num = content_meta.get("page_number", -1)

    page_texts = page_text_map.get(page_num)
    if not page_texts:
        return ""

    return " ".join(page_texts)[:effective_max]


def _build_prompt_with_context(base_prompt: str, context_text: str) -> str:
    """
    Prepend surrounding-text context to the base VLM prompt.

    Parameters
    ----------
    base_prompt : str
        The caption prompt that would be sent without context.
    context_text : str
        Text found near the image. If empty, no context header is added.

    Returns
    -------
    str
        ``base_prompt`` unchanged when ``context_text`` is empty; otherwise the
        context wrapped in ``---`` fences followed by ``base_prompt``.
    """
    if not context_text:
        return base_prompt
    return f"Text near this image:\n---\n{context_text}\n---\n\n{base_prompt}"


def _build_page_text_map(df: pd.DataFrame) -> Dict[int, List[str]]:
    """
    Build a mapping of page number -> list of text content strings.

    Only rows whose ``metadata.content_metadata.type`` equals ``"text"`` and
    whose ``metadata.content`` is non-empty contribute. Rows without a
    ``page_number`` are grouped under ``-1``. The map is computed once per
    call so that per-image lookups do not rescan the DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with a ``metadata`` column holding dict-like objects.

    Returns
    -------
    dict
        Page number -> text fragments, in DataFrame row order.
    """
    page_text_map: Dict[int, List[str]] = {}
    if "metadata" not in df.columns:
        return page_text_map

    # Iterate the single column directly; ``iterrows`` would build a Series
    # for every row just to read one cell.
    for meta in df["metadata"]:
        # Skip missing cells (None / NaN) and anything that is not dict-like.
        if meta is None or not hasattr(meta, "get"):
            continue
        cm = meta.get("content_metadata") or {}
        if cm.get("type") != "text":
            continue
        content = meta.get("content", "")
        if not content:
            continue
        page_text_map.setdefault(cm.get("page_number", -1), []).append(content)
    return page_text_map
@@ -116,7 +186,7 @@ def _generate_captions( ) # Perform inference to generate captions. - captions: List[str] = nim_client.infer(data, model_name=model_name) + captions: List[str] = nim_client.infer(data, model_name=model_name, temperature=temperature) return captions except Exception as e: @@ -182,6 +252,14 @@ def transform_image_create_vlm_caption_internal( endpoint_url: str = task_config.get("endpoint_url") or transform_config.endpoint_url model_name: str = task_config.get("model_name") or transform_config.model_name + # Context text: task config overrides pipeline default. + context_text_max_chars: int = task_config.get("context_text_max_chars") or getattr( + transform_config, "context_text_max_chars", 0 + ) + + # Temperature: task config overrides pipeline default. + temperature: float = task_config.get("temperature") or getattr(transform_config, "temperature", 1.0) + # Create a mask for rows where the content type is "image". df_mask: pd.Series = df_transform_ledger["metadata"].apply( lambda meta: meta.get("content_metadata", {}).get("type") == "image" @@ -191,26 +269,50 @@ def transform_image_create_vlm_caption_internal( if not df_mask.any(): return df_transform_ledger - # Collect base64-encoded images from the rows where the content type is "image". - base64_images: List[str] = df_transform_ledger.loc[df_mask, "metadata"].apply(lambda meta: meta["content"]).tolist() - - # Generate captions for the collected images. 
- captions: List[str] = _generate_captions( - base64_images, - prompt, - system_prompt, - api_key, - endpoint_url, - model_name, - ) + if context_text_max_chars and context_text_max_chars > 0: + page_text_map = _build_page_text_map(df_transform_ledger) + + for idx in df_transform_ledger.loc[df_mask].index: + meta: Dict[str, Any] = df_transform_ledger.at[idx, "metadata"] + base64_image: str = meta["content"] + context_text = _gather_context_text_for_image(meta, page_text_map, context_text_max_chars) + enriched_prompt = _build_prompt_with_context(prompt, context_text) + + captions: List[str] = _generate_captions( + [base64_image], + enriched_prompt, + system_prompt, + api_key, + endpoint_url, + model_name, + temperature=temperature, + ) + + image_meta: Dict[str, Any] = meta.get("image_metadata", {}) + image_meta["caption"] = captions[0] if captions else "" + meta["image_metadata"] = image_meta + df_transform_ledger.at[idx, "metadata"] = meta + else: + base64_images: List[str] = ( + df_transform_ledger.loc[df_mask, "metadata"].apply(lambda meta: meta["content"]).tolist() + ) + + captions: List[str] = _generate_captions( + base64_images, + prompt, + system_prompt, + api_key, + endpoint_url, + model_name, + temperature=temperature, + ) - # Update the DataFrame: assign each generated caption to the corresponding row. 
- for idx, caption in zip(df_transform_ledger.loc[df_mask].index, captions): - meta: Dict[str, Any] = df_transform_ledger.at[idx, "metadata"] - image_meta: Dict[str, Any] = meta.get("image_metadata", {}) - image_meta["caption"] = caption - meta["image_metadata"] = image_meta - df_transform_ledger.at[idx, "metadata"] = meta + for idx, caption in zip(df_transform_ledger.loc[df_mask].index, captions): + meta: Dict[str, Any] = df_transform_ledger.at[idx, "metadata"] + image_meta: Dict[str, Any] = meta.get("image_metadata", {}) + image_meta["caption"] = caption + meta["image_metadata"] = image_meta + df_transform_ledger.at[idx, "metadata"] = meta logger.debug("Image content captioning complete") result, execution_trace_log = df_transform_ledger, {} diff --git a/client/client_tests/primitives/tasks/test_caption.py b/client/client_tests/primitives/tasks/test_caption.py index dd20be95c..34a47d31b 100644 --- a/client/client_tests/primitives/tasks/test_caption.py +++ b/client/client_tests/primitives/tasks/test_caption.py @@ -113,6 +113,110 @@ def test_caption_task_to_dict_empty_fields(): assert task_dict == {"type": "caption", "task_properties": {}} +def test_caption_task_temperature_init(): + """Test initializing CaptionTask with temperature.""" + task = CaptionTask(temperature=0.7) + assert task._temperature == 0.7 + + +def test_caption_task_temperature_default(): + """Test that temperature defaults to None.""" + task = CaptionTask() + assert task._temperature is None + + +def test_caption_task_temperature_to_dict(): + """Test to_dict includes temperature when set.""" + task = CaptionTask(temperature=0.5) + task_dict = task.to_dict() + assert task_dict["task_properties"]["temperature"] == 0.5 + + +def test_caption_task_temperature_to_dict_unset(): + """Test to_dict excludes temperature when not set.""" + task = CaptionTask() + task_dict = task.to_dict() + assert "temperature" not in task_dict["task_properties"] + + +def test_caption_task_temperature_str(): + """Test 
__str__ includes temperature when set.""" + task = CaptionTask(temperature=0.3) + task_str = str(task) + assert "temperature: 0.3" in task_str + + +def test_caption_task_temperature_str_unset(): + """Test __str__ omits temperature when not set.""" + task = CaptionTask() + task_str = str(task) + assert "temperature" not in task_str + + +def test_caption_task_context_text_max_chars_init(): + """Test initializing CaptionTask with context_text_max_chars.""" + task = CaptionTask(context_text_max_chars=512) + assert task._context_text_max_chars == 512 + + +def test_caption_task_context_text_max_chars_default(): + """Test that context_text_max_chars defaults to None.""" + task = CaptionTask() + assert task._context_text_max_chars is None + + +def test_caption_task_context_text_max_chars_to_dict(): + """Test to_dict includes context_text_max_chars when set.""" + task = CaptionTask(context_text_max_chars=256) + task_dict = task.to_dict() + assert task_dict["task_properties"]["context_text_max_chars"] == 256 + + +def test_caption_task_context_text_max_chars_to_dict_unset(): + """Test to_dict excludes context_text_max_chars when not set.""" + task = CaptionTask() + task_dict = task.to_dict() + assert "context_text_max_chars" not in task_dict["task_properties"] + + +def test_caption_task_context_text_max_chars_str(): + """Test __str__ includes context_text_max_chars when set.""" + task = CaptionTask(context_text_max_chars=1024) + task_str = str(task) + assert "context_text_max_chars: 1024" in task_str + + +def test_caption_task_context_text_max_chars_str_unset(): + """Test __str__ omits context_text_max_chars when not set.""" + task = CaptionTask() + task_str = str(task) + assert "context_text_max_chars" not in task_str + + +def test_caption_task_schema_context_text_max_chars(): + """Test IngestTaskCaptionSchema accepts context_text_max_chars.""" + schema = IngestTaskCaptionSchema(context_text_max_chars=100) + assert schema.context_text_max_chars == 100 + + +def 
test_caption_task_schema_context_text_max_chars_default(): + """Test IngestTaskCaptionSchema context_text_max_chars defaults to None.""" + schema = IngestTaskCaptionSchema() + assert schema.context_text_max_chars is None + + +def test_caption_task_schema_temperature(): + """Test IngestTaskCaptionSchema accepts temperature.""" + schema = IngestTaskCaptionSchema(temperature=0.5) + assert schema.temperature == 0.5 + + +def test_caption_task_schema_temperature_default(): + """Test IngestTaskCaptionSchema temperature defaults to None.""" + schema = IngestTaskCaptionSchema() + assert schema.temperature is None + + # Execute tests if __name__ == "__main__": test_caption_task_schema_valid_all_fields() @@ -127,4 +231,12 @@ def test_caption_task_to_dict_empty_fields(): test_caption_task_to_dict_all_fields() test_caption_task_to_dict_partial_fields() test_caption_task_to_dict_empty_fields() + test_caption_task_context_text_max_chars_init() + test_caption_task_context_text_max_chars_default() + test_caption_task_context_text_max_chars_to_dict() + test_caption_task_context_text_max_chars_to_dict_unset() + test_caption_task_context_text_max_chars_str() + test_caption_task_context_text_max_chars_str_unset() + test_caption_task_schema_context_text_max_chars() + test_caption_task_schema_context_text_max_chars_default() print("All tests passed.") diff --git a/client/src/nv_ingest_client/client/interface.py b/client/src/nv_ingest_client/client/interface.py index de423d295..c991eb8b5 100644 --- a/client/src/nv_ingest_client/client/interface.py +++ b/client/src/nv_ingest_client/client/interface.py @@ -1524,6 +1524,8 @@ def caption(self, **kwargs: Any) -> "Ingestor": "prompt": task_options.prompt, "system_prompt": task_options.system_prompt, "model_name": task_options.model_name, + "context_text_max_chars": task_options.context_text_max_chars, + "temperature": task_options.temperature, } caption_task = CaptionTask(**caption_params) self._job_specs.add_task(caption_task) diff --git 
a/client/src/nv_ingest_client/primitives/tasks/caption.py b/client/src/nv_ingest_client/primitives/tasks/caption.py index 93c08a2b4..5d10a5056 100644 --- a/client/src/nv_ingest_client/primitives/tasks/caption.py +++ b/client/src/nv_ingest_client/primitives/tasks/caption.py @@ -24,6 +24,8 @@ def __init__( prompt: str = None, system_prompt: str = None, model_name: str = None, + context_text_max_chars: int = None, + temperature: float = None, ) -> None: super().__init__() @@ -34,6 +36,8 @@ def __init__( prompt=prompt, system_prompt=system_prompt, model_name=model_name, + context_text_max_chars=context_text_max_chars, + temperature=temperature, ) self._api_key = validated_data.api_key @@ -41,6 +45,8 @@ def __init__( self._prompt = validated_data.prompt self._system_prompt = validated_data.system_prompt self._model_name = validated_data.model_name + self._context_text_max_chars = validated_data.context_text_max_chars + self._temperature = validated_data.temperature def __str__(self) -> str: """ @@ -59,6 +65,10 @@ def __str__(self) -> str: info += f" system_prompt: {self._system_prompt}\n" if self._model_name: info += f" model_name: {self._model_name}\n" + if self._context_text_max_chars: + info += f" context_text_max_chars: {self._context_text_max_chars}\n" + if self._temperature is not None: + info += f" temperature: {self._temperature}\n" return info @@ -83,4 +93,10 @@ def to_dict(self) -> Dict: if self._model_name: task_properties["model_name"] = self._model_name + if self._context_text_max_chars: + task_properties["context_text_max_chars"] = self._context_text_max_chars + + if self._temperature is not None: + task_properties["temperature"] = self._temperature + return {"type": "caption", "task_properties": task_properties} diff --git a/docs/docs/extraction/vlm-embed.md b/docs/docs/extraction/vlm-embed.md index ed36a7fcd..123c9bf33 100644 --- a/docs/docs/extraction/vlm-embed.md +++ b/docs/docs/extraction/vlm-embed.md @@ -217,4 +217,4 @@ results = ingestor.ingest() - 
[Support Matrix](support-matrix.md) - [Troubleshoot Nemo Retriever Extraction](troubleshoot.md) - [Use the NV-Ingest Python API](nv-ingest-python-api.md) -- [Extract Captions from Images](nv-ingest-python-api.md#extract-captions-from-images) \ No newline at end of file +- [Extract Captions from Images](nv-ingest-python-api.md#extract-captions-from-images)