From 4c548e9ecf730762c6f05ccf1209fe312c050c08 Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 14 Jan 2026 21:53:30 -0800 Subject: [PATCH 1/5] Add text_image modality support for text elements in multimodal embedding --- .../internal/extract/image/ocr_extractor.py | 5 ++ .../internal/transform/embed_text.py | 53 ++++++++++++++++--- 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/api/src/nv_ingest_api/internal/extract/image/ocr_extractor.py b/api/src/nv_ingest_api/internal/extract/image/ocr_extractor.py index 31e69c481..324139d99 100644 --- a/api/src/nv_ingest_api/internal/extract/image/ocr_extractor.py +++ b/api/src/nv_ingest_api/internal/extract/image/ocr_extractor.py @@ -257,6 +257,11 @@ def _process_page_elements(df_to_process: pd.DataFrame, ocr_results: List[Tuple] return df_to_process for result_idx, df_idx in enumerate(valid_indices): + # Preserve the original base64 image before overwriting with OCR text. + # This enables text_image modality for multimodal embeddings. + original_image = df_to_process.loc[df_idx, "metadata"]["content"] + df_to_process.loc[df_idx, "metadata"]["text_metadata"]["source_image"] = original_image + # Unpack result: (bounding_boxes, text_predictions, confidence_scores) bboxes, texts, _ = ocr_results[result_idx] if not bboxes or not texts: diff --git a/api/src/nv_ingest_api/internal/transform/embed_text.py b/api/src/nv_ingest_api/internal/transform/embed_text.py index d6ee06c4b..084a2b4cc 100644 --- a/api/src/nv_ingest_api/internal/transform/embed_text.py +++ b/api/src/nv_ingest_api/internal/transform/embed_text.py @@ -23,7 +23,10 @@ logging.getLogger("httpcore").setLevel(logging.ERROR) -MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"] +MULTI_MODAL_MODELS = [ + "llama-3.2-nemoretriever-1b-vlm-embed-v1", + "nvidia/llama-nemotron-embed-vl-1b-v2", +] # ------------------------------------------------------------------------------ @@ -326,19 +329,29 @@ def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optiona def _get_pandas_text_content(row, modality="text"): """ - Extracts text content from a DataFrame row. + Extracts text content from a DataFrame row's metadata. Parameters ---------- - row : pandas.Series - A row containing the 'content' key. + row : dict + The metadata dictionary containing 'content' and optionally 'text_metadata.source_image'. Returns ------- str - The text content from the row. + The text content, image content, or combined text+image content based on modality. """ - return row["content"] + if modality == "text": + content = row.get("content") + elif modality == "image": + source_image = row.get("text_metadata", {}).get("source_image") + content = _format_image_input_string(source_image) + elif modality == "text_image": + text = row.get("content") + source_image = row.get("text_metadata", {}).get("source_image") + content = _format_text_image_pair_input_string(text, source_image) + + return content def _get_pandas_table_content(row, modality="text"): @@ -425,6 +438,30 @@ def _get_pandas_custom_content(row, custom_content_field): return None +def _cleanup_source_images(row): + """ + Removes source_image from text_metadata to reduce metadata size. + + The source_image field is used during embedding for text_image modality + but should be removed afterward to avoid exceeding storage limits + (e.g., Milvus JSON field 64KB limit). + + Parameters + ---------- + row : pandas.Series + A DataFrame row containing 'metadata'. + + Returns + ------- + pandas.Series + The row with source_image removed from text_metadata. + """ + text_metadata = row.get("metadata", {}).get("text_metadata") + if text_metadata and "source_image" in text_metadata: + del text_metadata["source_image"] + return row + + # ------------------------------------------------------------------------------ # Batch Processing Utilities # ------------------------------------------------------------------------------ @@ -699,4 +736,8 @@ def _content_type_getter(row): _add_custom_embeddings, embeddings=custom_embeddings_dict, result_target_field=result_target_field, axis=1 ) + # Clean up source_image from text_metadata to avoid exceeding Milvus JSON field limits. + # The source_image is only needed during embedding and can be safely removed afterward. + combined_df = combined_df.apply(_cleanup_source_images, axis=1) + return combined_df, {"trace_info": execution_trace_log} From d4fb345c3017b75e114c56fabd88e1df890fe47b Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 15 Jan 2026 18:11:01 -0800 Subject: [PATCH 2/5] check image format --- api/src/nv_ingest_api/internal/transform/embed_text.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/api/src/nv_ingest_api/internal/transform/embed_text.py b/api/src/nv_ingest_api/internal/transform/embed_text.py index 084a2b4cc..e82013790 100644 --- a/api/src/nv_ingest_api/internal/transform/embed_text.py +++ b/api/src/nv_ingest_api/internal/transform/embed_text.py @@ -318,7 +318,9 @@ def _add_custom_embeddings(row, embeddings, result_target_field): def _format_image_input_string(image_b64: Optional[str]) -> str: if not image_b64: return - return f"data:image/png;base64,{image_b64}" + # Detect format from base64 magic bytes: JPEG starts with /9j/, PNG starts with iVBORw + mime_type = "image/jpeg" if image_b64.startswith("/9j/") else "image/png" + return f"data:{mime_type};base64,{image_b64}" def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str: From 92f907d4a186689184f6425f1a022b60e70995a1 Mon Sep 17 00:00:00 2001 From: edknv Date: Fri, 16 Jan 2026 10:02:00 -0800 Subject: [PATCH 3/5] aggregate contents for page image --- .../internal/extract/pdf/engines/pdfium.py | 7 +- .../schemas/meta/ingest_job_schema.py | 5 + .../transform_text_embedding_schema.py | 5 + .../internal/transform/embed_text.py | 97 +++++++++++++++++++ .../primitives/tasks/embed.py | 40 ++++++++ .../primitives/tasks/extract.py | 4 + 6 files changed, 157 insertions(+), 1 deletion(-) diff --git a/api/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py b/api/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py index 5ddf13707..4baf07d83 100644 --- a/api/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +++ b/api/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py @@ -435,6 +435,7 @@ def pdfium_extractor( text_extraction_method = extractor_config.get("extract_method", "pdfium") extract_images_method = extractor_config.get("extract_images_method", "group") extract_images_params = extractor_config.get("extract_images_params", {}) + page_image_max_dimension = extractor_config.get("page_image_max_dimension", 1024) # Extract metadata_column metadata_column = extractor_config.get("metadata_column", "metadata") @@ -555,7 +556,11 @@ def pdfium_extractor( page_text = "" else: page_text = _extract_page_text(page) - image, _ = pdfium_pages_to_numpy([page], scale_tuple=(16384, 16384), trace_info=execution_trace_log) + image, _ = pdfium_pages_to_numpy( + [page], + scale_tuple=(page_image_max_dimension, page_image_max_dimension), + trace_info=execution_trace_log, + ) base64_image = numpy_to_base64(image[0]) if len(base64_image) > 2**24 - 1: base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1) diff --git a/api/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py b/api/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py index 7756c861c..45ec72850 100644 --- a/api/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +++ b/api/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py @@ -143,8 +143,13 @@ class IngestTaskEmbedSchema(BaseModelNoExt): model_name: Optional[str] = None api_key: Optional[str] = Field(default=None, repr=False) filter_errors: bool = False + embed_text_elements: Optional[bool] = None + embed_structured_elements: Optional[bool] = None + embed_image_elements: Optional[bool] = None + embed_audio_elements: Optional[bool] = None text_elements_modality: Optional[str] = None image_elements_modality: Optional[str] = None + image_elements_aggregate_page_content: Optional[bool] = None structured_elements_modality: Optional[str] = None audio_elements_modality: Optional[str] = None custom_content_field: Optional[str] = None diff --git a/api/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py b/api/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py index d6595c579..ae63cd6b5 100644 --- a/api/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +++ b/api/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py @@ -24,8 +24,13 @@ class TextEmbeddingSchema(BaseModel): input_type: str = Field(default="passage") raise_on_failure: bool = Field(default=False) truncate: str = Field(default="END") + embed_text_elements: bool = Field(default=True) + embed_structured_elements: bool = Field(default=True) + embed_image_elements: bool = Field(default=True) + embed_audio_elements: bool = Field(default=True) text_elements_modality: str = Field(default="text") image_elements_modality: str = Field(default="text") + image_elements_aggregate_page_content: bool = Field(default=False) structured_elements_modality: str = Field(default="text") audio_elements_modality: str = Field(default="text") custom_content_field: Optional[str] = None diff --git a/api/src/nv_ingest_api/internal/transform/embed_text.py b/api/src/nv_ingest_api/internal/transform/embed_text.py index e82013790..973773c8c 100644 --- a/api/src/nv_ingest_api/internal/transform/embed_text.py +++ b/api/src/nv_ingest_api/internal/transform/embed_text.py @@ -464,6 +464,72 @@ def _cleanup_source_images(row): return row +def _aggregate_page_content(df: pd.DataFrame) -> pd.DataFrame: + """ + Aggregates text content from TEXT and STRUCTURED elements into PAGE_IMAGE entries. + + For each page, collects text from: + - TEXT elements: content field + - STRUCTURED elements (tables/charts): table_metadata.table_content field + + The aggregated text is stored in image_metadata.text for PAGE_IMAGE entries, + enabling text_image modality embedding with full page context. + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing extracted content with metadata. + + Returns + ------- + pd.DataFrame + DataFrame with PAGE_IMAGE entries updated to include aggregated page text. + """ + # Build mapping of page_number -> list of text content + page_text_map: Dict[int, List[str]] = {} + + for _, row in df.iterrows(): + metadata = row.get("metadata", {}) + content_metadata = metadata.get("content_metadata", {}) + content_type = content_metadata.get("type") + page_number = content_metadata.get("page_number") + + if page_number is None: + continue + + if page_number not in page_text_map: + page_text_map[page_number] = [] + + # Collect text from TEXT elements + if content_type == ContentTypeEnum.TEXT.value: + text_content = metadata.get("content") + if text_content and isinstance(text_content, str) and text_content.strip(): + page_text_map[page_number].append(text_content.strip()) + + # Collect text from STRUCTURED elements (tables, charts) + elif content_type == ContentTypeEnum.STRUCTURED.value: + table_content = metadata.get("table_metadata", {}).get("table_content") + if table_content and isinstance(table_content, str) and table_content.strip(): + page_text_map[page_number].append(table_content.strip()) + + # Apply aggregated text to PAGE_IMAGE entries + for idx, row in df.iterrows(): + metadata = row.get("metadata", {}) + content_metadata = metadata.get("content_metadata", {}) + + if ( + content_metadata.get("type") == ContentTypeEnum.IMAGE.value + and content_metadata.get("subtype") == ContentTypeEnum.PAGE_IMAGE.value + ): + page_number = content_metadata.get("page_number") + if page_number in page_text_map and page_text_map[page_number]: + aggregated_text = "\n\n".join(page_text_map[page_number]) + image_metadata = metadata.get("image_metadata", {}) + image_metadata["text"] = aggregated_text + + return df + + # ------------------------------------------------------------------------------ # Batch Processing Utilities # ------------------------------------------------------------------------------ @@ -612,6 +678,15 @@ def transform_create_text_embeddings_internal( if df_transform_ledger.empty: return df_transform_ledger, {"trace_info": execution_trace_log} + # Aggregate text content from TEXT and STRUCTURED elements into PAGE_IMAGE entries + image_elements_aggregate_page_content = ( + task_config.get("image_elements_aggregate_page_content") + or transform_config.image_elements_aggregate_page_content + ) + if image_elements_aggregate_page_content: + df_transform_ledger = _aggregate_page_content(df_transform_ledger) + logger.debug("Aggregated page content into PAGE_IMAGE entries for text_image embedding") + embedding_dataframes = [] content_masks = [] @@ -632,6 +707,23 @@ def transform_create_text_embeddings_internal( ContentTypeEnum.VIDEO: lambda x: None, # Not supported yet. } + # Determine which content types to embed + def _get_embed_flag(content_type: ContentTypeEnum) -> bool: + flag_map = { + ContentTypeEnum.TEXT: task_config.get("embed_text_elements"), + ContentTypeEnum.STRUCTURED: task_config.get("embed_structured_elements"), + ContentTypeEnum.IMAGE: task_config.get("embed_image_elements"), + ContentTypeEnum.AUDIO: task_config.get("embed_audio_elements"), + } + default_map = { + ContentTypeEnum.TEXT: transform_config.embed_text_elements, + ContentTypeEnum.STRUCTURED: transform_config.embed_structured_elements, + ContentTypeEnum.IMAGE: transform_config.embed_image_elements, + ContentTypeEnum.AUDIO: transform_config.embed_audio_elements, + } + task_flag = flag_map.get(content_type) + return task_flag if task_flag is not None else default_map.get(content_type, True) + def _content_type_getter(row): return row["content_metadata"]["type"] @@ -640,6 +732,11 @@ def _content_type_getter(row): logger.warning(f"Skipping text_embedding generation for unsupported content type: {content_type}") continue + # Check if this content type should be embedded + if not _get_embed_flag(content_type): + logger.debug(f"Skipping embedding for content type {content_type} (disabled by configuration)") + continue + # Get rows matching the content type content_mask = df_transform_ledger["metadata"].apply(_content_type_getter) == content_type.value if not content_mask.any(): diff --git a/client/src/nv_ingest_client/primitives/tasks/embed.py b/client/src/nv_ingest_client/primitives/tasks/embed.py index edc403fdf..40fa1ba70 100644 --- a/client/src/nv_ingest_client/primitives/tasks/embed.py +++ b/client/src/nv_ingest_client/primitives/tasks/embed.py @@ -32,8 +32,13 @@ def __init__( text: Optional[bool] = None, tables: Optional[bool] = None, filter_errors: bool = False, + embed_text_elements: Optional[bool] = None, + embed_structured_elements: Optional[bool] = None, + embed_image_elements: Optional[bool] = None, + embed_audio_elements: Optional[bool] = None, text_elements_modality: Optional[str] = None, image_elements_modality: Optional[str] = None, + image_elements_aggregate_page_content: Optional[bool] = None, structured_elements_modality: Optional[str] = None, audio_elements_modality: Optional[str] = None, custom_content_field: Optional[str] = None, @@ -75,8 +80,13 @@ def __init__( model_name=model_name, api_key=api_key, filter_errors=filter_errors, + embed_text_elements=embed_text_elements, + embed_structured_elements=embed_structured_elements, + embed_image_elements=embed_image_elements, + embed_audio_elements=embed_audio_elements, text_elements_modality=text_elements_modality, image_elements_modality=image_elements_modality, + image_elements_aggregate_page_content=image_elements_aggregate_page_content, structured_elements_modality=structured_elements_modality, audio_elements_modality=audio_elements_modality, custom_content_field=custom_content_field, @@ -88,8 +98,13 @@ def __init__( self._model_name = validated_data.model_name self._api_key = validated_data.api_key self._filter_errors = validated_data.filter_errors + self._embed_text_elements = validated_data.embed_text_elements + self._embed_structured_elements = validated_data.embed_structured_elements + self._embed_image_elements = validated_data.embed_image_elements + self._embed_audio_elements = validated_data.embed_audio_elements self._text_elements_modality = validated_data.text_elements_modality self._image_elements_modality = validated_data.image_elements_modality + self._image_elements_aggregate_page_content = validated_data.image_elements_aggregate_page_content self._structured_elements_modality = validated_data.structured_elements_modality self._audio_elements_modality = validated_data.audio_elements_modality self._custom_content_field = validated_data.custom_content_field @@ -115,10 +130,20 @@ def __str__(self) -> str: if self._api_key: info += " api_key: [redacted]\n" info += f" filter_errors: {self._filter_errors}\n" + if self._embed_text_elements is not None: + info += f" embed_text_elements: {self._embed_text_elements}\n" + if self._embed_structured_elements is not None: + info += f" embed_structured_elements: {self._embed_structured_elements}\n" + if self._embed_image_elements is not None: + info += f" embed_image_elements: {self._embed_image_elements}\n" + if self._embed_audio_elements is not None: + info += f" embed_audio_elements: {self._embed_audio_elements}\n" if self._text_elements_modality: info += f" text_elements_modality: {self._text_elements_modality}\n" if self._image_elements_modality: info += f" image_elements_modality: {self._image_elements_modality}\n" + if self._image_elements_aggregate_page_content: + info += f" image_elements_aggregate_page_content: {self._image_elements_aggregate_page_content}\n" if self._structured_elements_modality: info += f" structured_elements_modality: {self._structured_elements_modality}\n" if self._audio_elements_modality: @@ -152,12 +177,27 @@ def to_dict(self) -> Dict[str, Any]: if self._api_key: task_properties["api_key"] = self._api_key + if self._embed_text_elements is not None: + task_properties["embed_text_elements"] = self._embed_text_elements + + if self._embed_structured_elements is not None: + task_properties["embed_structured_elements"] = self._embed_structured_elements + + if self._embed_image_elements is not None: + task_properties["embed_image_elements"] = self._embed_image_elements + + if self._embed_audio_elements is not None: + task_properties["embed_audio_elements"] = self._embed_audio_elements + if self._text_elements_modality: task_properties["text_elements_modality"] = self._text_elements_modality if self._image_elements_modality: task_properties["image_elements_modality"] = self._image_elements_modality + if self._image_elements_aggregate_page_content: + task_properties["image_elements_aggregate_page_content"] = self._image_elements_aggregate_page_content + if self._structured_elements_modality: task_properties["structured_elements_modality"] = self._structured_elements_modality diff --git a/client/src/nv_ingest_client/primitives/tasks/extract.py b/client/src/nv_ingest_client/primitives/tasks/extract.py index e5a2563ee..7c1101ada 100644 --- a/client/src/nv_ingest_client/primitives/tasks/extract.py +++ b/client/src/nv_ingest_client/primitives/tasks/extract.py @@ -99,6 +99,7 @@ def __init__( extract_tables_method: _Type_Extract_Tables_Method_PDF = "yolox", extract_infographics: bool = False, extract_page_as_image: bool = False, + page_image_max_dimension: int = 1024, text_depth: str = "document", paddle_output_format: str = "pseudo_markdown", table_output_format: str = "markdown", @@ -143,6 +144,7 @@ def __init__( "extract_charts": extract_charts, "extract_infographics": extract_infographics, "extract_page_as_image": extract_page_as_image, + "page_image_max_dimension": page_image_max_dimension, "text_depth": text_depth, "table_output_format": table_output_format, } @@ -172,6 +174,7 @@ def __init__( self._extract_charts = extract_charts self._extract_infographics = extract_infographics self._extract_page_as_image = extract_page_as_image + self._page_image_max_dimension = page_image_max_dimension self._extract_text = extract_text self._text_depth = text_depth self._paddle_output_format = paddle_output_format @@ -191,6 +194,7 @@ def __str__(self) -> str: info += f" extract_charts: {self._extract_charts}\n" info += f" extract_infographics: {self._extract_infographics}\n" info += f" extract_page_as_image: {self._extract_page_as_image}\n" + info += f" page_image_max_dimension: {self._page_image_max_dimension}\n" info += f" text_depth: {self._text_depth}\n" info += f" table_output_format: {self._table_output_format}\n" return info From eaf38c249e9acb0d0e3f96f4351f9d0ec873365b Mon Sep 17 00:00:00 2001 From: edknv Date: Fri, 16 Jan 2026 10:58:27 -0800 Subject: [PATCH 4/5] simplify page image usage --- .../internal/transform/embed_text.py | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/api/src/nv_ingest_api/internal/transform/embed_text.py b/api/src/nv_ingest_api/internal/transform/embed_text.py index 973773c8c..fddddd30e 100644 --- a/api/src/nv_ingest_api/internal/transform/embed_text.py +++ b/api/src/nv_ingest_api/internal/transform/embed_text.py @@ -678,11 +678,40 @@ def transform_create_text_embeddings_internal( if df_transform_ledger.empty: return df_transform_ledger, {"trace_info": execution_trace_log} - # Aggregate text content from TEXT and STRUCTURED elements into PAGE_IMAGE entries + # Determine if page content aggregation should be enabled + image_elements_modality = ( + task_config.get("image_elements_modality") or transform_config.image_elements_modality + ) + + # Check if user explicitly set the aggregation flag + explicit_aggregate_setting = task_config.get("image_elements_aggregate_page_content") image_elements_aggregate_page_content = ( - task_config.get("image_elements_aggregate_page_content") - or transform_config.image_elements_aggregate_page_content + explicit_aggregate_setting + if explicit_aggregate_setting is not None + else transform_config.image_elements_aggregate_page_content ) + + # Auto-enable aggregation when using text_image modality with PAGE_IMAGE entries + # Only auto-enable if user hasn't explicitly set the flag + if explicit_aggregate_setting is None and not image_elements_aggregate_page_content: + if image_elements_modality == "text_image": + # Check if PAGE_IMAGE entries exist + def _has_page_images(df): + for _, row in df.iterrows(): + metadata = row.get("metadata", {}) + content_metadata = metadata.get("content_metadata", {}) + if ( + content_metadata.get("type") == ContentTypeEnum.IMAGE.value + and content_metadata.get("subtype") == ContentTypeEnum.PAGE_IMAGE.value + ): + return True + return False + + if _has_page_images(df_transform_ledger): + image_elements_aggregate_page_content = True + logger.debug("Auto-enabled page content aggregation for text_image modality with PAGE_IMAGE entries") + + # Aggregate text content from TEXT and STRUCTURED elements into PAGE_IMAGE entries if image_elements_aggregate_page_content: df_transform_ledger = _aggregate_page_content(df_transform_ledger) logger.debug("Aggregated page content into PAGE_IMAGE entries for text_image embedding") @@ -708,6 +737,7 @@ def transform_create_text_embeddings_internal( } # Determine which content types to embed + # When aggregating page content, automatically skip TEXT and STRUCTURED unless explicitly set def _get_embed_flag(content_type: ContentTypeEnum) -> bool: flag_map = { ContentTypeEnum.TEXT: task_config.get("embed_text_elements"), @@ -722,7 +752,16 @@ def _get_embed_flag(content_type: ContentTypeEnum) -> bool: ContentTypeEnum.AUDIO: transform_config.embed_audio_elements, } task_flag = flag_map.get(content_type) - return task_flag if task_flag is not None else default_map.get(content_type, True) + if task_flag is not None: + return task_flag + # When aggregating page content, skip TEXT and STRUCTURED by default + # since their content is already included in PAGE_IMAGE entries + if image_elements_aggregate_page_content and content_type in ( + ContentTypeEnum.TEXT, + ContentTypeEnum.STRUCTURED, + ): + return False + return default_map.get(content_type, True) def _content_type_getter(row): return row["content_metadata"]["type"] From 33fdc0cbe12ba42898c915aa2b448c0bf8a63102 Mon Sep 17 00:00:00 2001 From: edknv Date: Tue, 20 Jan 2026 10:03:11 -0800 Subject: [PATCH 5/5] lint --- api/src/nv_ingest_api/internal/transform/embed_text.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/api/src/nv_ingest_api/internal/transform/embed_text.py b/api/src/nv_ingest_api/internal/transform/embed_text.py index fddddd30e..6ad538884 100644 --- a/api/src/nv_ingest_api/internal/transform/embed_text.py +++ b/api/src/nv_ingest_api/internal/transform/embed_text.py @@ -679,9 +679,7 @@ def transform_create_text_embeddings_internal( return df_transform_ledger, {"trace_info": execution_trace_log} # Determine if page content aggregation should be enabled - image_elements_modality = ( - task_config.get("image_elements_modality") or transform_config.image_elements_modality - ) + image_elements_modality = task_config.get("image_elements_modality") or transform_config.image_elements_modality # Check if user explicitly set the aggregation flag explicit_aggregate_setting = task_config.get("image_elements_aggregate_page_content")