From e2f26fc25fb6fbaf08e690a6acff94c8c5df81f1 Mon Sep 17 00:00:00 2001
From: lilacheden
Date: Sun, 7 Dec 2025 15:51:51 +0200
Subject: [PATCH 1/4] make OllamaInferenceEngine handle return_meta_data

Signed-off-by: lilacheden
---
 src/unitxt/inference.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py
index c31d612a3a..2b05ff4a6b 100644
--- a/src/unitxt/inference.py
+++ b/src/unitxt/inference.py
@@ -1461,7 +1461,18 @@ def _infer(
                 options=args,
             )
             results.append(response)
-
+        if return_meta_data:
+            return [
+                TextGenerationInferenceOutput(
+                    prediction=element["message"]["content"],
+                    generated_text=element["message"]["content"],
+                    input_tokens=element.get("prompt_eval_count", 0),
+                    output_tokens=element.get("eval_count", 0),
+                    model_name=self.model,
+                    inference_type=self.label,
+                )
+                for element in results
+            ]
         return [element["message"]["content"] for element in results]
 
 

From ea09bab53fc952b5e10823ee089f0902e3d0c9b4 Mon Sep 17 00:00:00 2001
From: lilacheden
Date: Mon, 8 Dec 2025 16:23:18 +0200
Subject: [PATCH 2/4] fix WML inference tests to use supported models

Signed-off-by: lilacheden
---
 tests/inference/test_inference_engine.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py
index 48261b8f01..b412159b23 100644
--- a/tests/inference/test_inference_engine.py
+++ b/tests/inference/test_inference_engine.py
@@ -159,7 +159,7 @@ def test_llava_inference_engine(self):
 
     def test_watsonx_inference(self):
         model = WMLInferenceEngineGeneration(
-            model_name="google/flan-t5-xl",
+            model_name="ibm/granite-3-8b-instruct",
             data_classification_policy=["public"],
             random_seed=111,
             min_new_tokens=1,
@@ -193,7 +193,7 @@ def test_watsonx_inference_with_external_client(self):
         from ibm_watsonx_ai.client import APIClient, Credentials
 
         model = WMLInferenceEngineGeneration(
-            model_name="google/flan-t5-xl",
+            model_name="ibm/granite-3-8b-instruct",
             data_classification_policy=["public"],
             random_seed=111,
             min_new_tokens=1,
@@ -279,7 +279,7 @@ def test_option_selecting_by_log_prob_inference_engines(self):
         ]
 
         watsonx_engine = WMLInferenceEngineGeneration(
-            model_name="meta-llama/llama-3-2-1b-instruct"
+            model_name="ibm/granite-3-8b-instruct"
         )
 
         for engine in [watsonx_engine]:
@@ -383,7 +383,7 @@ def test_lite_llm_inference_engine(self):
 
     def test_lite_llm_inference_engine_without_task_data_not_failing(self):
         LiteLLMInferenceEngine(
-            model="watsonx/meta-llama/llama-3-2-1b-instruct",
+            model="watsonx/meta-llama/llama-3-2-11b-instruct",
             max_tokens=2,
             temperature=0,
             top_p=1,

From b3f86684b5ece22a5a8e395e57aa9fda002bd11f Mon Sep 17 00:00:00 2001
From: lilacheden
Date: Mon, 8 Dec 2025 16:32:52 +0200
Subject: [PATCH 3/4] more model fixes to test_inference_engine

Signed-off-by: lilacheden
---
 tests/inference/test_inference_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py
index b412159b23..779c90c1f1 100644
--- a/tests/inference/test_inference_engine.py
+++ b/tests/inference/test_inference_engine.py
@@ -383,7 +383,7 @@ def test_lite_llm_inference_engine(self):
 
     def test_lite_llm_inference_engine_without_task_data_not_failing(self):
         LiteLLMInferenceEngine(
-            model="watsonx/meta-llama/llama-3-2-11b-instruct",
+            model="watsonx/meta-llama/llama-3-2-11b-vision-instruct",
             max_tokens=2,
             temperature=0,
             top_p=1,
@@ -464,7 +464,7 @@ def test_ollama_inference_engine(self):
             {"source": "Answer in one word only. What is the capital of Canada"},
         ]
 
-        engine = OllamaInferenceEngine(model="llama3.2:1b", temperature=0.0)
+        engine = OllamaInferenceEngine(model="llama3:8b", temperature=0.0)
 
         predictions = engine.infer(dataset)
         self.assertTrue("Ottawa" in predictions[0], predictions[0])

From 4b4fe74b2750ba027bc2df992af3944e8d805536 Mon Sep 17 00:00:00 2001
From: lilacheden
Date: Mon, 8 Dec 2025 17:05:26 +0200
Subject: [PATCH 4/4] allow small diff in metric test

Signed-off-by: lilacheden
---
 tests/inference/test_inference_engine.py | 2 +-
 tests/library/test_metrics.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py
index 779c90c1f1..f70a3be1c0 100644
--- a/tests/inference/test_inference_engine.py
+++ b/tests/inference/test_inference_engine.py
@@ -464,7 +464,7 @@ def test_ollama_inference_engine(self):
             {"source": "Answer in one word only. What is the capital of Canada"},
         ]
 
-        engine = OllamaInferenceEngine(model="llama3:8b", temperature=0.0)
+        engine = OllamaInferenceEngine(model="llama3.2:1b", temperature=0.0)
 
         predictions = engine.infer(dataset)
         self.assertTrue("Ottawa" in predictions[0], predictions[0])
diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py
index 160b9d2543..097856d267 100644
--- a/tests/library/test_metrics.py
+++ b/tests/library/test_metrics.py
@@ -2708,7 +2708,7 @@ def test_perplexity(self):
             metric=perplexity_question, predictions=prediction, references=references
         )
         self.assertAlmostEqual(
-            first_instance_target, outputs[0]["score"]["instance"]["score"]
+            first_instance_target, outputs[0]["score"]["instance"]["score"], places=5
         )
 
     def test_fuzzyner(self):
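
Usage note for [PATCH 1/4]: a minimal caller-side sketch of the new return_meta_data path, assuming infer() forwards the flag to _infer() as unitxt does for its other engines. The model name is taken from the tests above, and the printed fields come from the TextGenerationInferenceOutput constructed in the patch; treat the exact values as illustrative.

    from unitxt.inference import OllamaInferenceEngine

    dataset = [
        {"source": "Answer in one word only. What is the capital of Canada"},
    ]
    engine = OllamaInferenceEngine(model="llama3.2:1b", temperature=0.0)

    # Default behavior is unchanged: a plain list of generated strings.
    predictions = engine.infer(dataset)

    # With return_meta_data=True the engine now returns TextGenerationInferenceOutput
    # objects, mapping Ollama's prompt_eval_count/eval_count to token counts.
    outputs = engine.infer(dataset, return_meta_data=True)
    for out in outputs:
        print(out.prediction, out.input_tokens, out.output_tokens, out.model_name)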