Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
ac8bef8
judge llm
Feb 2, 2026
85aa1fa
Merge branch 'master_center' into dev
Feb 2, 2026
16a9848
reconstruct the judgedatasets
Feb 5, 2026
df6bc4d
reconstruct judgedataset
Feb 11, 2026
312bb1d
support gedit infer
Feb 13, 2026
aae408c
add qwen image edit dep
Feb 13, 2026
ce167ed
add qwen image edit dep
Feb 13, 2026
898385c
llm eval
Feb 14, 2026
af1240f
base judge ds class generalize
Feb 14, 2026
e18209c
llm eval
Feb 24, 2026
c543451
fix judge worker bug
Feb 24, 2026
5724e58
fix judge worker bug
Feb 24, 2026
77c8fc5
lmm eval fix
Feb 25, 2026
68a1596
support multi judge dataset tasks
Feb 25, 2026
04fa57c
support multi judge dataset tasks
Feb 25, 2026
7355dcb
fix custom config
Feb 25, 2026
963a3b4
support multi judge dataset tasks
Feb 25, 2026
f42cdb7
support multi judge dataset tasks
Feb 25, 2026
9d26569
judge fix
Feb 25, 2026
5c92fd7
async process predictions
Feb 25, 2026
dcfc50e
fast trans to dataset
Feb 25, 2026
880bc8c
fast trans to dataset
Feb 25, 2026
6d13d2a
add a gedit display tool
Feb 27, 2026
6ecd383
add task_state_manager to base dataset
Feb 27, 2026
bcb61ec
add task_state_manager to base dataset
Feb 27, 2026
fcee08f
add process bar to task state manager
Feb 27, 2026
d4861fe
add base jdg process bar to task state manager
Feb 27, 2026
d7be38b
add lmm jdg process bar to task state manager
Feb 27, 2026
d0279e8
self task state manager in api infer task
Feb 27, 2026
b3291c0
load function from static to member
Feb 27, 2026
5d66eb9
update function fix
Feb 27, 2026
d714b03
task manager effect in jdg class
Feb 27, 2026
ace5d27
fix status
Feb 27, 2026
c00788a
mv third party
Feb 27, 2026
2fd6ca8
mv tool to inner
Feb 27, 2026
e1027dc
adaptor new third party path
Feb 27, 2026
5375295
add result converter
Feb 28, 2026
3b367e6
add result converter
Feb 28, 2026
c6328cb
add result converter
Feb 28, 2026
ee78867
add result converter
Feb 28, 2026
f82836e
add result converter
Feb 28, 2026
bd04af1
useful config
Mar 2, 2026
7238077
mv org third party
Mar 3, 2026
e845ca0
fix conflict
Mar 3, 2026
af1e570
process description fix
Mar 3, 2026
8da5705
remove copy from org result
Mar 3, 2026
8de36e4
fix config device
Mar 3, 2026
6d24a50
fix
Mar 3, 2026
2346a52
fix
Mar 3, 2026
3186fcb
fix ut
Mar 3, 2026
4ed77d3
fix ut
Mar 3, 2026
9284ee0
fix ut
Mar 3, 2026
01c16a5
delete unused dataset config
Mar 4, 2026
38a071e
Merge branch 'master_center' into edit_dev_eval
Mar 4, 2026
0a5bc1f
Update ais_bench/benchmark/datasets/base.py
SJTUyh Mar 4, 2026
d0b9df1
Update ais_bench/benchmark/openicl/icl_inferencer/output_handler/gen_…
SJTUyh Mar 4, 2026
fefb51a
Update ais_bench/benchmark/datasets/utils/llm_judge.py
SJTUyh Mar 4, 2026
8b91ac6
Update ais_bench/benchmark/utils/file/file.py
SJTUyh Mar 4, 2026
a845479
fix
Mar 4, 2026
fb442eb
fix review
Mar 4, 2026
f886815
fix review
Mar 4, 2026
d80b428
fix review
Mar 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ais_bench/benchmark/cli/argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _base_parser(self):
help='Running mode. Choose "perf" for performance evaluation, "infer" to run inference only, '
'"eval" to evaluate existing inference results, or "viz" to visualize the results. '
'The default mode is "all", which runs all steps.',
choices=['all', 'infer', 'eval', 'viz', 'perf', 'perf_viz'],
choices=['all', 'infer', 'eval', 'viz', 'perf', 'perf_viz', 'judge', 'infer_judge'],
default='all',
type=str
)
Expand Down
3 changes: 1 addition & 2 deletions ais_bench/benchmark/cli/config_manager.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

import os
import os.path as osp
import tabulate
Expand Down Expand Up @@ -104,7 +103,7 @@ def load_config(self, workflow):
self._update_cfg_of_workflow(workflow)
self._dump_and_reload_config()
return self.cfg

def _fill_dataset_configs(self):
for dataset_cfg in self.cfg["datasets"]:
fill_test_range_use_num_prompts(self.cfg["cli_args"].get("num_prompts"), dataset_cfg)
Expand Down
176 changes: 170 additions & 6 deletions ais_bench/benchmark/cli/workers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import os.path as osp
import copy
import shutil
from abc import ABC, abstractmethod
from collections import defaultdict

Expand All @@ -8,12 +10,15 @@
from ais_bench.benchmark.registry import PARTITIONERS, RUNNERS, build_from_cfg
from ais_bench.benchmark.utils.config.run import get_config_type
from ais_bench.benchmark.utils.logging.logger import AISLogger
from ais_bench.benchmark.utils.logging.exceptions import PredictionInvalidException
from ais_bench.benchmark.utils.logging.error_codes import TMAN_CODES
from ais_bench.benchmark.partitioners import NaivePartitioner
from ais_bench.benchmark.runners import LocalRunner
from ais_bench.benchmark.tasks import OpenICLEvalTask, OpenICLApiInferTask, OpenICLInferTask
from ais_bench.benchmark.summarizers import DefaultSummarizer, DefaultPerfSummarizer
from ais_bench.benchmark.calculators import DefaultPerfMetricCalculator
from ais_bench.benchmark.cli.utils import fill_model_path_if_datasets_need
from ais_bench.benchmark.utils.file.file import load_jsonl, dump_jsonl

logger = AISLogger()

Expand Down Expand Up @@ -108,6 +113,133 @@ def _update_tasks_cfg(self, tasks, cfg: ConfigDict):
task.attack = cfg.attack


class JudgeInfer(BaseWorker):
    """Run LLM-as-judge inference over previously generated predictions.

    For every dataset carrying a ``judge_infer_cfg``, this worker launches a
    second inference pass in which a judge model grades the target model's
    predictions, then rewrites the judge output back into the normal
    predictions format so the downstream ``Eval`` stage can consume it
    unchanged.
    """

    def update_cfg(self, cfg: ConfigDict) -> ConfigDict:
        """Inject the ``judge_infer`` partitioner/runner section into ``cfg``.

        Mutates ``cfg`` in place and returns it.
        """
        def get_task_type() -> str:
            # NOTE(review): only the first dataset's judge model decides the
            # task type for every task — confirm that mixed service/local
            # judge configurations are intentionally unsupported.
            if cfg["datasets"][0]["judge_infer_cfg"]["judge_model"]["attr"] == "service":
                return get_config_type(OpenICLApiInferTask)
            else:
                return get_config_type(OpenICLInferTask)

        new_cfg = dict(
            judge_infer=dict(
                partitioner=dict(type=get_config_type(NaivePartitioner)),
                runner=dict(
                    max_num_workers=self.args.max_num_workers,
                    max_workers_per_gpu=self.args.max_workers_per_gpu,
                    debug=self.args.debug,
                    task=dict(type=get_task_type()),
                    type=get_config_type(LocalRunner),
                ),
            ),
        )

        cfg.merge_from_dict(new_cfg)
        if cfg.cli_args.debug:
            cfg.judge_infer.runner.debug = True
        # Judge predictions are written alongside the target model's
        # predictions under work_dir/predictions/.
        cfg.judge_infer.partitioner["out_dir"] = osp.join(cfg["work_dir"], "predictions/")
        return cfg

    def do_work(self, cfg: ConfigDict):
        """Partition, filter, run and post-process the judge inference tasks."""
        partitioner = PARTITIONERS.build(cfg.judge_infer.partitioner)
        logger.info("Starting inference tasks...")
        self._cfg_pre_process(cfg)
        tasks = partitioner(cfg)

        # delete the tasks without judge_infer_cfg
        new_tasks = []
        for task in tasks:
            if task["datasets"][0][0].get("judge_infer_cfg"):
                new_tasks.append(task)
        tasks = new_tasks
        if len(tasks) == 0:
            return

        # update tasks cfg before run
        self._update_tasks_cfg(tasks, cfg)

        if (
            cfg.get("cli_args", {}).get("merge_ds", False)
            or cfg.get("cli_args", {}).get("mode") == "perf"  # performance mode will enable merge datasets by default
        ):
            logger.info("Merging datasets with the same model and inferencer...")
            tasks = self._merge_datasets(tasks)

        runner = RUNNERS.build(cfg.judge_infer.runner)
        runner(tasks)
        self._result_post_process(tasks, cfg)
        logger.info("Inference tasks completed.")

    def _merge_datasets(self, tasks):
        # merge datasets with the same model, dataset type and inferencer:
        # group by a composite string key, then fold each group into a deep
        # copy of its first task with the remaining datasets appended.
        task_groups = defaultdict(list)
        for task in tasks:
            key = (
                task["models"][0]["abbr"]  # same model
                + "_"
                + str(task['datasets'][0][0]['type'])  # same dataset type
                + "_"
                + str(task["datasets"][0][0]["infer_cfg"]["inferencer"])  # same inferencer with the same args
            )
            task_groups[key].append(task)
        new_tasks = []
        for key, task_group in task_groups.items():
            new_task = copy.deepcopy(task_group[0])
            if len(task_group) > 1:
                for t in task_group[1:]:
                    new_task["datasets"][0].extend(t["datasets"][0])
            new_tasks.append(new_task)
        return new_tasks

    def _cfg_pre_process(self, cfg: ConfigDict) -> ConfigDict:
        """Rename judge-enabled datasets to ``<abbr>-<judge model abbr>``.

        Keeps a reverse map in ``self.org_dataset_abbrs`` (new abbr -> old
        abbr) so later steps can locate the target model's original
        prediction files. Mutates ``cfg`` in place and returns it.
        """
        self.org_dataset_abbrs = {}
        def change_judge_dataset_abbr(item):
            # Only datasets with a judge_infer_cfg take part in the judge pass.
            if item.get("judge_infer_cfg"):
                org_dataset_abbr = item["abbr"]
                new_dataset_abbr = f'{item["abbr"]}-{item["judge_infer_cfg"]["judge_model"]["abbr"]}'
                item["abbr"] = new_dataset_abbr
                self.org_dataset_abbrs[new_dataset_abbr] = org_dataset_abbr
        if cfg.get('model_dataset_combinations', None) is not None:
            for item in cfg.model_dataset_combinations:
                for dataset in item["datasets"]:
                    change_judge_dataset_abbr(dataset)
        for dataset in cfg.datasets:
            change_judge_dataset_abbr(dataset)
        return cfg

    def _update_tasks_cfg(self, tasks, cfg: ConfigDict):
        # update parameters to correct sub cfg
        if hasattr(cfg, "attack"):
            for task in tasks:
                cfg.attack.dataset = task.datasets[0][0].abbr
                task.attack = cfg.attack

        # update judge cfgs to model cfgs and data: each task is rewritten so
        # the judge model/dataset replace the target model/dataset, while the
        # target model's predictions are handed over via `predictions_path`.
        for task in tasks:
            task["datasets"][0][0]["predictions_path"] = osp.join(cfg.judge_infer.partitioner.out_dir, task["models"][0]["abbr"], f'{self.org_dataset_abbrs[task["datasets"][0][0]["abbr"]]}.jsonl')
            if not osp.exists(task["datasets"][0][0]["predictions_path"]):
                raise PredictionInvalidException(TMAN_CODES.UNKNOWN_ERROR, f"Predictions path {task['datasets'][0][0]['predictions_path']} does not exist.")
            model_abbr = task["models"][0]["abbr"]
            # Swap in the judge model but keep the target model's abbr so the
            # output directory layout stays keyed by the target model.
            task["models"][0] = task["datasets"][0][0]["judge_infer_cfg"].pop("judge_model")
            task["models"][0]["abbr"] = model_abbr
            task["datasets"][0][0]["type"] = task["datasets"][0][0]["judge_infer_cfg"].pop("judge_dataset_type")
            task["datasets"][0][0]["reader_cfg"] = task["datasets"][0][0]["judge_infer_cfg"].pop("judge_reader_cfg")
            task["datasets"][0][0]["infer_cfg"] = task["datasets"][0][0].pop("judge_infer_cfg")

    def _result_post_process(self, tasks, cfg: ConfigDict):
        # Reconstruct the judge infer predictions to normal predictions format
        for task in tasks:
            model_org_prediction_path = task["datasets"][0][0]["predictions_path"]
            model_preds: dict = {item["uuid"]: item for item in load_jsonl(model_org_prediction_path)}
            judge_org_prediction_path = osp.join(cfg.judge_infer.partitioner.out_dir, task["models"][0]["abbr"], f'{task["datasets"][0][0]["abbr"]}.jsonl')
            judge_preds: list = load_jsonl(judge_org_prediction_path)
            for i, pred in enumerate(judge_preds):
                # The judge record's "gold" field carries the uuid of the
                # target prediction it graded — presumably written by the JDG
                # dataset builder; verify against that code.
                uuid = pred["gold"]
                judge_preds[i]["id"] = model_preds[uuid]["id"]
            # NOTE(review): the original file is removed before the rewrite is
            # dumped — if dump_jsonl fails midway the judge results are lost;
            # consider writing to a temp file and renaming instead.
            os.remove(judge_org_prediction_path)
            dump_jsonl(judge_preds, judge_org_prediction_path)


class Eval(BaseWorker):
def update_cfg(self, cfg: ConfigDict) -> None:
new_cfg = dict(
Expand Down Expand Up @@ -136,9 +268,11 @@ def update_cfg(self, cfg: ConfigDict) -> None:
def do_work(self, cfg: ConfigDict):
partitioner = PARTITIONERS.build(cfg.eval.partitioner)
logger.info("Starting evaluation tasks...")
self._cfg_pre_process(cfg)

tasks = partitioner(cfg)

# update tasks cfg before run
# Update tasks cfg before run
self._update_tasks_cfg(tasks, cfg)

runner = RUNNERS.build(cfg.eval.runner)
Expand All @@ -150,9 +284,28 @@ def do_work(self, cfg: ConfigDict):
runner(tasks)
logger.info("Evaluation tasks completed.")

def _cfg_pre_process(self, cfg: ConfigDict) -> None:
self.org_dataset_abbrs = {}
def change_eval_dataset_abbr(item):
if item.get("judge_infer_cfg"):
org_dataset_abbr = item["abbr"]
new_dataset_abbr = f'{item["abbr"]}-{item["judge_infer_cfg"]["judge_model"]["abbr"]}'
item["abbr"] = new_dataset_abbr
self.org_dataset_abbrs[new_dataset_abbr] = org_dataset_abbr
if cfg.get('model_dataset_combinations', None) is not None:
for item in cfg.model_dataset_combinations:
for dataset in item["datasets"]:
change_eval_dataset_abbr(dataset)
for dataset in cfg.datasets:
change_eval_dataset_abbr(dataset)
return cfg
Comment on lines +287 to +301
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The _cfg_pre_process method in the Eval class is identical to the one in the JudgeInfer class (lines 194-208). This code duplication can make maintenance harder. Consider refactoring this logic into a shared helper function or a method in a common base class to improve code reuse and maintainability.


def _update_tasks_cfg(self, tasks, cfg: ConfigDict):
# update parameters to correct sub cfg
pass
# Replace default model config to judge model config
self.judge_result_paths = {}
for task in tasks:
if task["datasets"][0][0].get("judge_infer_cfg"):
task["datasets"][0][0].pop("judge_infer_cfg")


class AccViz(BaseWorker):
Expand All @@ -171,6 +324,7 @@ def update_cfg(self, cfg: ConfigDict) -> None:
def do_work(self, cfg: ConfigDict) -> int:
logger.info("Summarizing evaluation results...")
summarizer_cfg = cfg.get("summarizer", {})
cfg = self._cfg_pre_process(cfg)

# For subjective summarizer
if summarizer_cfg.get("function", None):
Expand Down Expand Up @@ -203,6 +357,13 @@ def do_work(self, cfg: ConfigDict) -> int:
summarizer = build_from_cfg(summarizer_cfg)
summarizer.summarize(time_str=self.args.cfg_time_str)

def _cfg_pre_process(self, cfg: ConfigDict) -> None:
for i, dataset in enumerate(cfg.datasets):
if dataset.get("judge_infer_cfg"):
cfg.datasets[i]["abbr"] = f'{cfg.datasets[i]["abbr"]}-{cfg.datasets[i]["judge_infer_cfg"]["judge_model"]["abbr"]}'
cfg.datasets[i].pop("judge_infer_cfg")
return cfg


class PerfViz(BaseWorker):
def update_cfg(self, cfg: ConfigDict) -> None:
Expand Down Expand Up @@ -233,9 +394,11 @@ def do_work(self, cfg: ConfigDict) -> int:


WORK_FLOW = dict(
all=[Infer, Eval, AccViz],
all=[Infer, JudgeInfer, Eval, AccViz],
infer=[Infer],
eval=[Eval, AccViz],
judge=[JudgeInfer],
infer_judge=[Infer, JudgeInfer],
eval=[JudgeInfer, Eval, AccViz],
viz=[AccViz],
perf=[Infer, PerfViz],
perf_viz=[PerfViz],
Expand All @@ -249,4 +412,5 @@ def __init__(self, cfg, workflow) -> None:

def execute(self) -> None:
for worker in self.workflow:
worker.do_work(self.cfg)
cfg = copy.deepcopy(self.cfg)
worker.do_work(cfg)
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from ais_bench.benchmark.openicl.icl_prompt_template import PromptTemplate
from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
from ais_bench.benchmark.models import VLLMCustomAPIChat
from ais_bench.benchmark.utils.postprocess.model_postprocessors import extract_non_reasoning_content
from ais_bench.benchmark.datasets import (
Aime2025Dataset,
Aime2025JDGDataset,
)
from ais_bench.benchmark.datasets.utils.llm_judge import get_a_or_b, LLMJudgeCorrectEvaluator


# Reader: the dataset supplies a "question" input column; "answer" is the gold
# output column used later by the judge pass.
aime2025_reader_cfg = dict(input_columns=["question"], output_column="answer")


# Inference config for the target model: zero-shot generation of a single
# HUMAN turn that asks for the final answer inside \boxed{}.
aime2025_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role="HUMAN",
                    prompt="{question}\nRemember to put your final answer within \\boxed{}.",
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),  # no in-context examples
    inferencer=dict(type=GenInferencer),
)

# Prompt for the judge model. The judge sees the original question, the gold
# answer and the candidate answer, and must reply with a single letter:
# "A" (consistent with the gold answer) or "B" (inconsistent) — the contract
# that the `get_a_or_b` post-processor in aime2025_eval_cfg relies on.
# (A leftover "reply with either CORRECT, INCORRECT" instruction that
# contradicted the A/B grading scheme has been removed.)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
6. If the candidate's answer is semantically incomplete at the end, please judge it as inconsistent.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: Means the answer is consistent with the standard answer.
B: Means the answer is inconsistent with the standard answer.
Just return the letters "A" or "B", with no text around it.

<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{model_answer}\n<Predicted End>\n\n

Judging the correctness of candidates' answers, please return the letters "A" or "B" first before your thinking:
""".strip()

# Judge-pass configuration: a service-hosted chat model grades each candidate
# answer using GRADER_TEMPLATE. The presence of this key on a dataset config
# is what enables the judge stage for that dataset.
aime2025_judge_infer_cfg = dict(
    # Judge dataset reader: inputs are the original question, the gold answer
    # and the candidate answer; the output column carries the uuid that links
    # each judge record back to the target model's prediction.
    judge_reader_cfg = dict(input_columns=["question", "answer", "model_answer"], output_column="model_pred_uuid"),
    judge_model=dict(
        attr="service",  # "service" routes the judge through the API infer task
        type=VLLMCustomAPIChat,
        abbr="judge", # Be added after dataset abbr
        path="",
        model="",
        stream=True,
        request_rate=0,
        use_timestamp=False,
        retry=2,
        api_key="",
        host_ip="localhost",
        host_port=8080,
        url="",
        max_out_len=512,
        batch_size=1,
        trust_remote_code=False,
        generation_kwargs=dict(
            temperature=0.01,  # near-greedy decoding for stable grading
            ignore_eos=False,
        ),
        # Strip reasoning content so only the final verdict text remains.
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    judge_dataset_type=Aime2025JDGDataset,
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                )
            ],
            round=[
                dict(role='HUMAN', prompt=GRADER_TEMPLATE),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Evaluation consumes the judge verdicts: get_a_or_b extracts the letter from
# the judge's reply and LLMJudgeCorrectEvaluator scores it.
aime2025_eval_cfg = dict(
    evaluator=dict(type=LLMJudgeCorrectEvaluator),
    pred_postprocessor=dict(type=get_a_or_b),
)

aime2025_datasets = [
    dict(
        abbr="aime2025",
        type=Aime2025Dataset,
        path="ais_bench/datasets/aime2025/aime2025.jsonl",
        reader_cfg=aime2025_reader_cfg,
        infer_cfg=aime2025_infer_cfg,
        judge_infer_cfg=aime2025_judge_infer_cfg,  # enables the LLM-judge stage
        eval_cfg=aime2025_eval_cfg,
    )
]
Loading