diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 99706cfd..0a2aa995 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -176,7 +176,7 @@ class MultiModalTextToTextExportableModule(torch.nn.Module): Args: model (torch.nn.Module): The multimodal model to export. modality (str): The input modality type ("audio" or "vision"). - encoder_name (str): Name of the encoder attribute in the model. + encoder_model (str): The encoder model within the mutlimodal model. processor_config (dict, optional): Preprocessor configuration loaded from preprocessor_config.json. use_custom_kv_cache (bool): Whether to use custom key-value caching for optimization. use_custom_sdpa (bool): Whether to use custom scaled dot-product attention. @@ -186,7 +186,7 @@ def __init__( self, model: torch.nn.Module, modality: str, - encoder_name: str, + encoder_model: torch.nn.Module, max_seq_len: int, processor_config: dict = None, use_custom_kv_cache: bool = False, @@ -194,13 +194,10 @@ def __init__( ): super().__init__() - if not hasattr(model, encoder_name): - raise ValueError(f'Model does not contain encoder "{encoder_name}".') - self.model = model self.config = model.config self.modality = modality - self.encoder_name = encoder_name + self.encoder_model = encoder_model self.processor_config = processor_config self.use_custom_kv_cache = use_custom_kv_cache self.use_custom_sdpa = use_custom_sdpa @@ -370,7 +367,7 @@ def export( # 3. Export encoder. if self.use_custom_sdpa: - getattr(self.model, self.encoder_name).config._attn_implementation = "custom_sdpa" + self.encoder_model.config._attn_implementation = "custom_sdpa" if self.modality == "audio": encoder = AudioExportableModule(self.model) diff --git a/optimum/exporters/executorch/tasks/multimodal_text_to_text.py b/optimum/exporters/executorch/tasks/multimodal_text_to_text.py index d48f8290..cdb26b9f 100644 --- a/optimum/exporters/executorch/tasks/multimodal_text_to_text.py +++ b/optimum/exporters/executorch/tasks/multimodal_text_to_text.py @@ -25,68 +25,6 @@ from ..task_registry import register_task -def _validate_multimodal_components(model): - """ - Validates that the multimodal model has required decoder and encoder components. - - Args: - model: The loaded model instance - - Returns: - tuple: (decoder_name, audio_encoder_name, vision_encoder_name) - """ - POTENTIAL_DECODER_NAMES = [ - "language_model", - "text_model", - ] - POTENTIAL_AUDIO_ENCODER_NAMES = [ - "encoder", # Here mainly for Granite Speech. - "audio_tower", - "audio_model", - ] - POTENTIAL_VISION_ENCODER_NAMES = [ - "vision_tower", - "vision_model", - ] - - # Find decoder component - decoder_name = None - for name in POTENTIAL_DECODER_NAMES: - if hasattr(model, name): - decoder_name = name - break - - if decoder_name is None: - raise ValueError( - "The model does not have any of the expected decoder attributes: " - f"{POTENTIAL_DECODER_NAMES}. This is required for multimodal text-to-text models." - ) - - # Find encoder components - audio_encoder_name = None - for name in POTENTIAL_AUDIO_ENCODER_NAMES: - if hasattr(model, name): - audio_encoder_name = name - break - - vision_encoder_name = None - for name in POTENTIAL_VISION_ENCODER_NAMES: - if hasattr(model, name): - vision_encoder_name = name - break - - if (audio_encoder_name is None) == (vision_encoder_name is None): - raise ValueError( - "The model does not have one of the expected encoder attributes: " - f"{POTENTIAL_AUDIO_ENCODER_NAMES + POTENTIAL_VISION_ENCODER_NAMES}. " - "This is required for multimodal text-to-text models." - "Currently only a maximum of 1 modality is supported, so there can only be one of these" - "encoders in the model." - ) - - return decoder_name, audio_encoder_name, vision_encoder_name - - # NOTE: It's important to map the registered task name to the pipeline name in https://github.com/huggingface/transformers/blob/main/utils/update_metadata.py. # This will streamline using inferred task names and make exporting models to Hugging Face pipelines easier. @register_task("image-text-to-text") @@ -181,13 +119,22 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): "device": device, }, ) - decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model) - encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name + + # Find the modality (only one modality of either image/audio is supported at the moment). + if len(eager_model.input_modalities) != 2: + raise AttributeError( + "Only one modality is supported for multimodal models at the moment. The input modalities must be either ['text', 'image'] or ['text, 'audio']" + ) + for input_modality in eager_model.input_modalities: + if input_modality == "text": + continue + modality = input_modality + eager_encoder = eager_model.get_encoder(modality) # Need to do this since apparently when nested modules (e.g. model.language_model) access the .property # config, it always comes from the generation_config.json file, not the `generation_config` override # from from_pretrained(). - getattr(eager_model, decoder_name).generation_config = eager_model.generation_config + eager_model.get_decoder().generation_config = eager_model.generation_config # Must disable gradient when exporting a model with a prequantized checkpoint, # e.g. "pytorch/Phi-4-mini-instruct-8da4w". @@ -210,7 +157,7 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): if qlinear_config: logging.info("Quantizing decoder linears...") quantize_decoder_kwargs = { - "eager_model": getattr(eager_model, decoder_name), + "eager_model": eager_model.get_decoder(), "qlinear_config": qlinear_config, } if qlinear_group_size is not None: @@ -225,13 +172,15 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): # self.decoder = ... # self.lm_head = ... # lm_head is not part of the decoder instance # ... - if not hasattr(getattr(eager_model, decoder_name), "lm_head"): - if not hasattr(eager_model, "lm_head"): + if not hasattr(eager_model.get_decoder(), "lm_head"): + # Voxtral specifically is weird since you need to specifically do eager_model.language_model.lm_head. + lm_head = getattr(eager_model, "lm_head", None) or getattr(eager_model.language_model, "lm_head", None) + if not lm_head: raise AttributeError( f"Could not find `lm_head` for {model_name_or_path} has no `lm_head`, please double check if this is expected." ) quantize_lm_head_kwargs = { - "eager_model": eager_model.lm_head, + "eager_model": lm_head, "qlinear_config": qlinear_config, } quantize_model_(**quantize_lm_head_kwargs) @@ -240,7 +189,7 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): if qlinear_encoder_config: logging.info("Quantizing encoder linears...") quantize_encoder_kwargs = { - "eager_model": getattr(eager_model, encoder_name), + "eager_model": eager_encoder, "qlinear_config": qlinear_encoder_config, } if qlinear_encoder_group_size is not None: @@ -253,7 +202,7 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): if qembedding_config: logging.info("Quantizing embeddings...") quantize_decoder_embedding_kwargs = { - "eager_model": getattr(eager_model, decoder_name), + "eager_model": eager_model.get_decoder(), "qembedding_config": qembedding_config, } if qembedding_group_size is not None: @@ -264,7 +213,7 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): if qembedding_encoder_config: logging.info("Quantizing embeddings...") quantize_encoder_embedding_kwargs = { - "eager_model": getattr(eager_model, encoder_name), + "eager_model": eager_encoder, "qembedding_config": qembedding_encoder_config, } if qembedding_encoder_group_size is not None: @@ -273,8 +222,10 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs): return MultiModalTextToTextExportableModule( model=eager_model, - modality="audio" if audio_encoder_name else "vision", - encoder_name=audio_encoder_name if audio_encoder_name else vision_encoder_name, + modality="vision" + if modality == "image" + else modality, # TODO: hack since downstream uses "vision" atm. Change this to match Transformers. + encoder_model=eager_encoder, max_seq_len=max_length, processor_config=processor_config, use_custom_kv_cache=use_custom_kv_cache, diff --git a/tests/models/test_modeling_gemma3.py b/tests/models/test_modeling_gemma3.py index 83dc72fa..da8cfb2b 100644 --- a/tests/models/test_modeling_gemma3.py +++ b/tests/models/test_modeling_gemma3.py @@ -350,5 +350,5 @@ def test_gemma3_image_vision_with_custom_sdpa_kv_cache_8da4w_8we(self): self.assertTrue("serene" in generated_text) self.assertTrue("lake" in generated_text) self.assertTrue( - check_multimodal_output_quality(model_id, generated_tokens, conversation, max_perplexity_threshold=5) + check_multimodal_output_quality(model_id, generated_tokens, conversation, max_perplexity_threshold=10) ) diff --git a/tests/models/test_modeling_voxtral.py b/tests/models/test_modeling_voxtral.py index 1f7f6b34..d66b26e6 100644 --- a/tests/models/test_modeling_voxtral.py +++ b/tests/models/test_modeling_voxtral.py @@ -326,7 +326,7 @@ def test_voxtral_audio_text_to_text_generation_with_custom_sdpa_kv_cache_8da4w_8 # likely friends or acquaintances, who are discussing tattoos.' self.assertTrue("tattoo" in generated_text) self.assertTrue( - check_multimodal_output_quality(model_id, generated_tokens, conversation, max_perplexity_threshold=5) + check_multimodal_output_quality(model_id, generated_tokens, conversation, max_perplexity_threshold=10) ) @slow