From d0b5adb1b50f35f789783cdce81e12eea90c7cc4 Mon Sep 17 00:00:00 2001
From: "s.malakhov"
Date: Fri, 6 Feb 2026 16:01:34 +0300
Subject: [PATCH] [quantization] Introduce a script for LLM evaluation

This PR introduces a script that evaluates an LLM on a set of
benchmark tasks using the `lm_eval` package.

TICO-DCO-1.0-Signed-off-by: s.malakhov
---
 .../evaluation/script/llm_tasks_eval.py       | 101 +++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 tico/quantization/evaluation/script/llm_tasks_eval.py

diff --git a/tico/quantization/evaluation/script/llm_tasks_eval.py b/tico/quantization/evaluation/script/llm_tasks_eval.py
new file mode 100644
index 00000000..b75ec9f3
--- /dev/null
+++ b/tico/quantization/evaluation/script/llm_tasks_eval.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+from typing import Any
+
+from lm_eval import evaluator
+from lm_eval.models.huggingface import HFLM
+from lm_eval.utils import make_table
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def evaluate_llm_on_tasks(
+    model: AutoModelForCausalLM, tokenizer: AutoTokenizer, tasks: str
+) -> dict[str, Any]:
+    """Evaluate `model` on a comma-separated string of lm_eval task names."""
+    # Wrap the already-loaded HF model so lm_eval can drive it directly.
+    model_to_evaluate = HFLM(pretrained=model, backend="causal", tokenizer=tokenizer)
+    tasks_list: list[str] = tasks.split(",")
+    return evaluator.simple_evaluate(model_to_evaluate, tasks=tasks_list)
+
+
+def main():
+    ap = argparse.ArgumentParser(description="Evaluate an LLM on lm_eval tasks.")
+    ap.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Model name or path (e.g. 'TinyLlama/TinyLlama-1.1B-Chat-v1.0')",
+    )
+    ap.add_argument(
+        "--eval_tasks",
+        type=str,
+        default="arc_easy",
+        help="Comma-separated list of tasks to evaluate, e.g. "
+        "`winogrande,arc_easy,arc_challenge,openbookqa,mmlu_pro,ifeval,bbh`",
+    )
+    ap.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Device to run the model on (e.g. 'cuda' or 'cpu').",
+    )
+    ap.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Enable only if you trust the model repo code.",
+    )
+    ap.add_argument(
+        "--hf-token",
+        type=str,
+        default=None,
+        help="Optional HF token for gated/private repos.",
+    )
+    ap.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="Cache directory to use for model loading.",
+    )
+
+    args = ap.parse_args()
+
+    print("Loading FP model …")
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model,
+        trust_remote_code=args.trust_remote_code,
+        token=args.hf_token,
+        cache_dir=args.cache_dir,
+    )
+    model = (
+        AutoModelForCausalLM.from_pretrained(
+            args.model,
+            trust_remote_code=args.trust_remote_code,
+            token=args.hf_token,
+            cache_dir=args.cache_dir,
+        )
+        .to(args.device)
+        .eval()
+    )
+
+    results = evaluate_llm_on_tasks(model, tokenizer, args.eval_tasks)
+
+    print(f"Results of {args.model} evaluation:")
+    print(make_table(results))
+
+
+if __name__ == "__main__":
+    main()
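
Usage note (reviewer aid, not part of the patch): besides the CLI, the new
helper can be called programmatically, which is handy for evaluating a model
that is already in memory (e.g. after quantization). A minimal sketch,
assuming the script directory is importable as the package path
tico.quantization.evaluation.script.llm_tasks_eval, and using
'TinyLlama/TinyLlama-1.1B-Chat-v1.0' purely as a placeholder checkpoint:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    from tico.quantization.evaluation.script.llm_tasks_eval import (
        evaluate_llm_on_tasks,
    )

    # Placeholder checkpoint; substitute any causal-LM repo you have access to.
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).eval()

    # Run a single lightweight task. simple_evaluate() returns a dict whose
    # "results" key maps each task name to its metric dict.
    results = evaluate_llm_on_tasks(model, tokenizer, tasks="arc_easy")
    print(results["results"]["arc_easy"])

Equivalently, from the command line:
python tico/quantization/evaluation/script/llm_tasks_eval.py --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --eval_tasks arc_easy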