From e2f26fc25fb6fbaf08e690a6acff94c8c5df81f1 Mon Sep 17 00:00:00 2001
From: lilacheden
Date: Sun, 7 Dec 2025 15:51:51 +0200
Subject: [PATCH 1/4] make OllamaInferenceEngine handle return_meta_data

Signed-off-by: lilacheden
---
 src/unitxt/inference.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py
index c31d612a3a..2b05ff4a6b 100644
--- a/src/unitxt/inference.py
+++ b/src/unitxt/inference.py
@@ -1461,7 +1461,18 @@ def _infer(
                 options=args,
             )
             results.append(response)
-
+        if return_meta_data:
+            return [
+                TextGenerationInferenceOutput(
+                    prediction=element["message"]["content"],
+                    generated_text=element["message"]["content"],
+                    input_tokens=element.get("prompt_eval_count", 0),
+                    output_tokens=element.get("eval_count", 0),
+                    model_name=self.model,
+                    inference_type=self.label,
+                )
+                for element in results
+            ]
         return [element["message"]["content"] for element in results]
 
 

From ea09bab53fc952b5e10823ee089f0902e3d0c9b4 Mon Sep 17 00:00:00 2001
From: lilacheden
Date: Mon, 8 Dec 2025 16:23:18 +0200
Subject: [PATCH 2/4] fix WML inference tests to use supported models

Signed-off-by: lilacheden
---
 tests/inference/test_inference_engine.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py
index 48261b8f01..b412159b23 100644
--- a/tests/inference/test_inference_engine.py
+++ b/tests/inference/test_inference_engine.py
@@ -159,7 +159,7 @@ def test_llava_inference_engine(self):
 
     def test_watsonx_inference(self):
         model = WMLInferenceEngineGeneration(
-            model_name="google/flan-t5-xl",
+            model_name="ibm/granite-3-8b-instruct",
             data_classification_policy=["public"],
             random_seed=111,
             min_new_tokens=1,
@@ -193,7 +193,7 @@ def test_watsonx_inference_with_external_client(self):
         from ibm_watsonx_ai.client import APIClient, Credentials
 
         model = WMLInferenceEngineGeneration(
-            model_name="google/flan-t5-xl",
+            model_name="ibm/granite-3-8b-instruct",
             data_classification_policy=["public"],
             random_seed=111,
             min_new_tokens=1,
@@ -279,7 +279,7 @@ def test_option_selecting_by_log_prob_inference_engines(self):
         ]
 
         watsonx_engine = WMLInferenceEngineGeneration(
-            model_name="meta-llama/llama-3-2-1b-instruct"
+            model_name="ibm/granite-3-8b-instruct"
         )
 
         for engine in [watsonx_engine]:
@@ -383,7 +383,7 @@ def test_lite_llm_inference_engine(self):
 
     def test_lite_llm_inference_engine_without_task_data_not_failing(self):
         LiteLLMInferenceEngine(
-            model="watsonx/meta-llama/llama-3-2-1b-instruct",
+            model="watsonx/meta-llama/llama-3-2-11b-instruct",
             max_tokens=2,
             temperature=0,
             top_p=1,

From b3f86684b5ece22a5a8e395e57aa9fda002bd11f Mon Sep 17 00:00:00 2001
From: lilacheden
Date: Mon, 8 Dec 2025 16:32:52 +0200
Subject: [PATCH 3/4] more model fixes to test_inference_engine

Signed-off-by: lilacheden
---
 tests/inference/test_inference_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py
index b412159b23..779c90c1f1 100644
--- a/tests/inference/test_inference_engine.py
+++ b/tests/inference/test_inference_engine.py
@@ -383,7 +383,7 @@ def test_lite_llm_inference_engine(self):
 
     def test_lite_llm_inference_engine_without_task_data_not_failing(self):
         LiteLLMInferenceEngine(
-            model="watsonx/meta-llama/llama-3-2-11b-instruct",
+            model="watsonx/meta-llama/llama-3-2-11b-vision-instruct",
             max_tokens=2,
             temperature=0,
             top_p=1,
@@ -464,7 +464,7 @@ def test_ollama_inference_engine(self):
             {"source": "Answer in one word only. What is the capital of Canada"},
         ]
 
-        engine = OllamaInferenceEngine(model="llama3.2:1b", temperature=0.0)
+        engine = OllamaInferenceEngine(model="llama3:8b", temperature=0.0)
 
         predictions = engine.infer(dataset)
         self.assertTrue("Ottawa" in predictions[0], predictions[0])

From 4b4fe74b2750ba027bc2df992af3944e8d805536 Mon Sep 17 00:00:00 2001
From: lilacheden
Date: Mon, 8 Dec 2025 17:05:26 +0200
Subject: [PATCH 4/4] allow small diff in metric test

Signed-off-by: lilacheden
---
 tests/inference/test_inference_engine.py | 2 +-
 tests/library/test_metrics.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py
index 779c90c1f1..f70a3be1c0 100644
--- a/tests/inference/test_inference_engine.py
+++ b/tests/inference/test_inference_engine.py
@@ -464,7 +464,7 @@ def test_ollama_inference_engine(self):
             {"source": "Answer in one word only. What is the capital of Canada"},
         ]
 
-        engine = OllamaInferenceEngine(model="llama3:8b", temperature=0.0)
+        engine = OllamaInferenceEngine(model="llama3.2:1b", temperature=0.0)
 
         predictions = engine.infer(dataset)
         self.assertTrue("Ottawa" in predictions[0], predictions[0])
diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py
index 160b9d2543..097856d267 100644
--- a/tests/library/test_metrics.py
+++ b/tests/library/test_metrics.py
@@ -2708,7 +2708,7 @@ def test_perplexity(self):
             metric=perplexity_question, predictions=prediction, references=references
         )
         self.assertAlmostEqual(
-            first_instance_target, outputs[0]["score"]["instance"]["score"]
+            first_instance_target, outputs[0]["score"]["instance"]["score"], places=5
         )
 
     def test_fuzzyner(self):
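
Usage note for [PATCH 1/4]: a minimal caller-side sketch of the new return_meta_data path, assuming infer() forwards the flag to _infer() as unitxt does for its other engines. The model name is taken from the tests above, and the printed fields come from the TextGenerationInferenceOutput constructed in the patch; treat the exact values as illustrative.

    from unitxt.inference import OllamaInferenceEngine

    dataset = [
        {"source": "Answer in one word only. What is the capital of Canada"},
    ]
    engine = OllamaInferenceEngine(model="llama3.2:1b", temperature=0.0)

    # Default behavior is unchanged: a plain list of generated strings.
    predictions = engine.infer(dataset)

    # With return_meta_data=True the engine now returns TextGenerationInferenceOutput
    # objects, mapping Ollama's prompt_eval_count/eval_count to token counts.
    outputs = engine.infer(dataset, return_meta_data=True)
    for out in outputs:
        print(out.prediction, out.input_tokens, out.output_tokens, out.model_name)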