11 changes: 11 additions & 0 deletions ais_bench/benchmark/cli/task_manager.py
@@ -29,6 +29,17 @@ def run(self) -> None:
f"and performance metrics will be loaded from the reuse work dir."
)
run_mode = "perf_viz"
if self.args.config and run_mode == "all":
try:
from mmengine.config import Config
peek_cfg = Config.fromfile(self.args.config, format_python_code=False)
if "infer" not in peek_cfg:
run_mode = "eval"
self.logger.info(
f"Config has no infer section, defaulting to mode '{run_mode}'"
)
except Exception:
pass
self.workflow = [worker_class(self.args) for worker_class in WORK_FLOW.get(run_mode)]

# load config
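For context, a minimal sketch of an eval-only config that would trigger the new fallback. The file name and field values are hypothetical; the dataset fields follow the ones named in the VBenchDataset docstring later in this PR:

# vbench_eval_only.py (hypothetical): defines datasets and eval but no
# top-level "infer" section, so TaskManager.run() switches run_mode to "eval".
datasets = [
    dict(
        abbr="vbench_custom_subject_consistency",
        type="VBenchDataset",
        path="outputs/videos",
    ),
]
eval = dict(
    runner=dict(
        task=dict(type="VBenchEvalTask"),
    ),
)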
14 changes: 12 additions & 2 deletions ais_bench/benchmark/cli/workers.py
@@ -10,7 +10,11 @@
from ais_bench.benchmark.utils.logging.logger import AISLogger
from ais_bench.benchmark.partitioners import NaivePartitioner
from ais_bench.benchmark.runners import LocalRunner
from ais_bench.benchmark.tasks import OpenICLEvalTask, OpenICLApiInferTask, OpenICLInferTask
from ais_bench.benchmark.tasks import (
    OpenICLEvalTask,
    OpenICLApiInferTask,
    OpenICLInferTask,
)
from ais_bench.benchmark.summarizers import DefaultSummarizer, DefaultPerfSummarizer
from ais_bench.benchmark.calculators import DefaultPerfMetricCalculator
from ais_bench.benchmark.cli.utils import fill_model_path_if_datasets_need
@@ -110,13 +114,19 @@ def _update_tasks_cfg(self, tasks, cfg: ConfigDict):

class Eval(BaseWorker):
    def update_cfg(self, cfg: ConfigDict) -> None:
        existing_task = cfg.get("eval", {}).get("runner", {}).get("task")
        if existing_task and existing_task.get("type") is not None:
            t = existing_task["type"]
            eval_task_type = t if isinstance(t, str) else get_config_type(t)
        else:
            eval_task_type = get_config_type(OpenICLEvalTask)
        new_cfg = dict(
            eval=dict(
                partitioner=dict(type=get_config_type(NaivePartitioner)),
                runner=dict(
                    max_num_workers=self.args.max_num_workers,
                    debug=self.args.debug,
                    task=dict(type=get_config_type(OpenICLEvalTask)),
                    task=dict(type=eval_task_type),
                ),
            ),
        )
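The effect of this change, sketched below: a user config can now pin the eval task type (for example, to the VBenchEvalTask added in this PR), and Eval.update_cfg respects it instead of always forcing OpenICLEvalTask. The cfg literal is illustrative:

from mmengine.config import ConfigDict

from ais_bench.benchmark.tasks import VBenchEvalTask

# Illustrative user config with eval.runner.task.type already set.
cfg = ConfigDict(
    eval=dict(
        runner=dict(
            task=dict(type=VBenchEvalTask),
        ),
    ),
)
# With the new branch above, eval_task_type resolves to
# get_config_type(VBenchEvalTask) rather than being overwritten.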
7 changes: 7 additions & 0 deletions ais_bench/benchmark/configs/summarizers/vbench.py
@@ -0,0 +1,7 @@
from ais_bench.benchmark.summarizers import VBenchSummarizer

summarizer = dict(
    attr='accuracy',
    type=VBenchSummarizer,
    summary_groups=[],
)
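Assuming ais_bench follows the OpenCompass-style pure-Python config composition that mmengine provides, a top-level benchmark config could pull this summarizer in roughly as follows; the read_base usage and import path are assumptions, not confirmed by this PR:

from mmengine.config import read_base

with read_base():
    # Hypothetical composition: reuse the summarizer dict defined above.
    from ais_bench.benchmark.configs.summarizers.vbench import summarizer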
1 change: 1 addition & 0 deletions ais_bench/benchmark/datasets/__init__.py
@@ -26,6 +26,7 @@
from ais_bench.benchmark.datasets.race import *
from ais_bench.benchmark.datasets.textvqa import *
from ais_bench.benchmark.datasets.videobench import *
from ais_bench.benchmark.datasets.vbench import *
from ais_bench.benchmark.datasets.vocalsound import *
from ais_bench.benchmark.datasets.lambada import * # noqa: F401, F403
from ais_bench.benchmark.datasets.lcsts import * # noqa: F401, F403
21 changes: 21 additions & 0 deletions ais_bench/benchmark/datasets/vbench.py
@@ -0,0 +1,21 @@
"""VBench 1.0 dataset config type for video/image quality evaluation (eval-only, no loader)."""
from datasets import Dataset

from ais_bench.benchmark.registry import LOAD_DATASET
from ais_bench.benchmark.datasets.base import BaseDataset


@LOAD_DATASET.register_module()
class VBenchDataset(BaseDataset):
    """Placeholder dataset for VBench evaluation.

    VBench evaluation uses only dataset config (path/videos_path, dimension_list,
    full_json_dir, eval_cfg). This class provides a minimal load() so that
    LOAD_DATASET.build(dataset_cfg) does not fail if ever called; the actual
    evaluation is done in VBenchEvalTask which reads the config directly.
    """

    @staticmethod
    def load(path: str, **kwargs):
        """Return a minimal placeholder dataset. VBench eval uses config only."""
        return Dataset.from_list([{"dummy": 0}])
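A minimal sketch of the guard described in the docstring, assuming LOAD_DATASET.build follows the usual mmengine registry pattern of instantiating the registered type from the remaining config keys:

from ais_bench.benchmark.registry import LOAD_DATASET

# Illustrative config; in the VBench flow this is normally never built, since
# VBenchEvalTask reads the dataset config fields directly.
dataset_cfg = dict(type="VBenchDataset", path="outputs/videos")
placeholder = LOAD_DATASET.build(dataset_cfg)  # yields the one-row dummy dataset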
3 changes: 2 additions & 1 deletion ais_bench/benchmark/summarizers/__init__.py
@@ -1,4 +1,5 @@
# flake8: noqa: F401, E501
from ais_bench.benchmark.summarizers.default import DefaultSummarizer # noqa: F401
from ais_bench.benchmark.summarizers.default_subjective import DefaultSubjectiveSummarizer # noqa: F401
from ais_bench.benchmark.summarizers.default_perf import DefaultPerfSummarizer # noqa: F401
from ais_bench.benchmark.summarizers.default_perf import DefaultPerfSummarizer # noqa: F401
from ais_bench.benchmark.summarizers.vbench import VBenchSummarizer # noqa: F401
168 changes: 168 additions & 0 deletions ais_bench/benchmark/summarizers/vbench.py
@@ -0,0 +1,168 @@
# flake8: noqa
# yapf: disable
"""VBench summarizer with official normalization and aggregation logic."""
import re
from typing import Dict

from ais_bench.benchmark.summarizers.default import DefaultSummarizer

# VBench official constants from scripts/constant.py
NORMALIZE_DIC = {
    "subject consistency": {"Min": 0.1462, "Max": 1.0},
    "background consistency": {"Min": 0.2615, "Max": 1.0},
    "temporal flickering": {"Min": 0.6293, "Max": 1.0},
    "motion smoothness": {"Min": 0.706, "Max": 0.9975},
    "dynamic degree": {"Min": 0.0, "Max": 1.0},
    "aesthetic quality": {"Min": 0.0, "Max": 1.0},
    "imaging quality": {"Min": 0.0, "Max": 1.0},
    "object class": {"Min": 0.0, "Max": 1.0},
    "multiple objects": {"Min": 0.0, "Max": 1.0},
    "human action": {"Min": 0.0, "Max": 1.0},
    "color": {"Min": 0.0, "Max": 1.0},
    "spatial relationship": {"Min": 0.0, "Max": 1.0},
    "scene": {"Min": 0.0, "Max": 0.8222},
    "appearance style": {"Min": 0.0009, "Max": 0.2855},
    "temporal style": {"Min": 0.0, "Max": 0.364},
    "overall consistency": {"Min": 0.0, "Max": 0.364},
}
DIM_WEIGHT = {
    "subject consistency": 1,
    "background consistency": 1,
    "temporal flickering": 1,
    "motion smoothness": 1,
    "aesthetic quality": 1,
    "imaging quality": 1,
    "dynamic degree": 0.5,
    "object class": 1,
    "multiple objects": 1,
    "human action": 1,
    "color": 1,
    "spatial relationship": 1,
    "scene": 1,
    "appearance style": 1,
    "temporal style": 1,
    "overall consistency": 1,
}
QUALITY_LIST = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",
]
SEMANTIC_LIST = [
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]
QUALITY_WEIGHT = 4
SEMANTIC_WEIGHT = 1

# Known dimension names (underscore form) for regex extraction from abbr
_DIM_PATTERN = re.compile(
    r'(subject_consistency|background_consistency|temporal_flickering|'
    r'motion_smoothness|dynamic_degree|aesthetic_quality|imaging_quality|'
    r'object_class|multiple_objects|human_action|color|spatial_relationship|'
    r'scene|appearance_style|temporal_style|overall_consistency)$'
)


def _abbr_to_const_key(abbr: str) -> str:
    """Extract dimension from abbr, e.g. vbench_custom_subject_consistency -> subject consistency."""
    m = _DIM_PATTERN.search(abbr)
    if m:
        return m.group(1).replace('_', ' ')
    if abbr.startswith('vbench_'):
        return abbr[7:].replace('_', ' ')
    return abbr.replace('_', ' ')


def _get_normalized_score(raw_score: float, const_key: str) -> float:
    """Normalize and apply DIM_WEIGHT per cal_final_score.py."""
    if const_key not in NORMALIZE_DIC or const_key not in DIM_WEIGHT:
        return 0.0
    raw = raw_score / 100.0 if raw_score > 1 else raw_score
    min_val = NORMALIZE_DIC[const_key]['Min']
    max_val = NORMALIZE_DIC[const_key]['Max']
    span = max_val - min_val
    if span <= 0:
        norm = 1.0 if raw >= max_val else 0.0
    else:
        norm = (raw - min_val) / span
    return norm * DIM_WEIGHT[const_key]


class VBenchSummarizer(DefaultSummarizer):
    """VBench summarizer using official cal_final_score.py logic.

    Computes Quality Score, Semantic Score, Total Score with:
    - Per-dimension normalization: (score - Min) / (Max - Min) * DIM_WEIGHT
    - Quality = weighted avg of QUALITY_LIST dims
    - Semantic = weighted avg of SEMANTIC_LIST dims
    - Total = (Quality * 4 + Semantic * 1) / 5
    """

    def _calculate_group_metrics(
        self,
        raw_results: Dict,
        parsed_results: Dict,
        dataset_metrics: Dict,
        dataset_eval_mode: Dict,
    ):
        """Compute vbench Quality, Semantic, Total using official formula."""
        for model_abbr in self.model_abbrs:
            model_results = parsed_results.get(model_abbr, {})
            vbench_scores = {}
            for abbr, data in model_results.items():
                if not abbr.startswith('vbench_'):
                    continue
                acc = data.get('accuracy')
                if acc is None or not isinstance(acc, (int, float)):
                    continue
                const_key = _abbr_to_const_key(abbr)
                vbench_scores[const_key] = acc

            if not vbench_scores:
                continue

            normalized = {
                k: _get_normalized_score(v, k)
                for k, v in vbench_scores.items()
            }

            quality_num = sum(normalized.get(k, 0) for k in QUALITY_LIST)
            quality_denom = sum(DIM_WEIGHT.get(k, 0) for k in QUALITY_LIST)
            quality_score = (
                quality_num / quality_denom if quality_denom else 0.0
            )

            semantic_num = sum(normalized.get(k, 0) for k in SEMANTIC_LIST)
            semantic_denom = sum(DIM_WEIGHT.get(k, 0) for k in SEMANTIC_LIST)
            semantic_score = (
                semantic_num / semantic_denom if semantic_denom else 0.0
            )

            total_score = (
                quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT
            ) / (QUALITY_WEIGHT + SEMANTIC_WEIGHT)

            for name, score in [
                ('vbench_quality', quality_score * 100),
                ('vbench_semantic', semantic_score * 100),
                ('vbench_total', total_score * 100),
            ]:
                raw_results[model_abbr].setdefault(name, {})['accuracy'] = score
                parsed_results[model_abbr].setdefault(name, {})['accuracy'] = score
                if name not in dataset_metrics:
                    dataset_metrics[name] = ['accuracy']
                dataset_eval_mode[name] = 'gen'

        return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
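To make the normalization concrete, a short worked sketch using the module's own constants and helpers; the raw scores are made up for illustration:

# motion smoothness: raw 0.95 -> (0.95 - 0.706) / (0.9975 - 0.706) ≈ 0.8371,
# times DIM_WEIGHT 1 -> ≈ 0.8371.
# dynamic degree:    raw 0.60 -> (0.60 - 0.0) / (1.0 - 0.0) = 0.60,
# times DIM_WEIGHT 0.5 -> 0.30.
raw_scores = {"motion smoothness": 0.95, "dynamic degree": 0.60}
normalized = {k: _get_normalized_score(v, k) for k, v in raw_scores.items()}

quality = sum(normalized.get(k, 0) for k in QUALITY_LIST) / sum(
    DIM_WEIGHT[k] for k in QUALITY_LIST
)
# Unscored quality dims contribute 0 to the numerator while their weights stay
# in the denominator (6.5 total), matching the summarizer's aggregation above.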
1 change: 1 addition & 0 deletions ais_bench/benchmark/tasks/__init__.py
@@ -1,3 +1,4 @@
from ais_bench.benchmark.tasks.openicl_eval import * # noqa: F401, F403
from ais_bench.benchmark.tasks.openicl_infer import * # noqa: F401, F403
from ais_bench.benchmark.tasks.openicl_api_infer import OpenICLApiInferTask
from ais_bench.benchmark.tasks.vbench_eval import VBenchEvalTask # noqa: F401