11 changes: 11 additions & 0 deletions ais_bench/benchmark/cli/task_manager.py
@@ -29,6 +29,17 @@ def run(self) -> None:
f"and performance metrics will be loaded from the reuse work dir."
)
run_mode = "perf_viz"
if self.args.config and run_mode == "all":
try:
from mmengine.config import Config
peek_cfg = Config.fromfile(self.args.config, format_python_code=False)
if "infer" not in peek_cfg:
run_mode = "eval"
self.logger.info(
f"Config has no infer section, defaulting to mode '{run_mode}'"
)
except Exception:
pass
self.workflow = [worker_class(self.args) for worker_class in WORK_FLOW.get(run_mode)]

# load config
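For context, a minimal sketch of an eval-only config that would trigger the new fallback. The file name and field values are hypothetical; the dataset fields follow the ones named in the VBenchDataset docstring later in this PR:

# vbench_eval_only.py (hypothetical): defines datasets and eval but no
# top-level "infer" section, so TaskManager.run() switches run_mode to "eval".
datasets = [
    dict(
        abbr="vbench_custom_subject_consistency",
        type="VBenchDataset",
        path="outputs/videos",
    ),
]
eval = dict(
    runner=dict(
        task=dict(type="VBenchEvalTask"),
    ),
)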
14 changes: 12 additions & 2 deletions ais_bench/benchmark/cli/workers.py
@@ -10,7 +10,11 @@
from ais_bench.benchmark.utils.logging.logger import AISLogger
from ais_bench.benchmark.partitioners import NaivePartitioner
from ais_bench.benchmark.runners import LocalRunner
from ais_bench.benchmark.tasks import OpenICLEvalTask, OpenICLApiInferTask, OpenICLInferTask
from ais_bench.benchmark.tasks import (
    OpenICLEvalTask,
    OpenICLApiInferTask,
    OpenICLInferTask,
)
from ais_bench.benchmark.summarizers import DefaultSummarizer, DefaultPerfSummarizer
from ais_bench.benchmark.calculators import DefaultPerfMetricCalculator
from ais_bench.benchmark.cli.utils import fill_model_path_if_datasets_need
@@ -110,13 +114,19 @@ def _update_tasks_cfg(self, tasks, cfg: ConfigDict):

class Eval(BaseWorker):
    def update_cfg(self, cfg: ConfigDict) -> None:
        existing_task = cfg.get("eval", {}).get("runner", {}).get("task")
        if existing_task and existing_task.get("type") is not None:
            t = existing_task["type"]
            eval_task_type = t if isinstance(t, str) else get_config_type(t)
        else:
            eval_task_type = get_config_type(OpenICLEvalTask)
        new_cfg = dict(
            eval=dict(
                partitioner=dict(type=get_config_type(NaivePartitioner)),
                runner=dict(
                    max_num_workers=self.args.max_num_workers,
                    debug=self.args.debug,
                    task=dict(type=get_config_type(OpenICLEvalTask)),
                    task=dict(type=eval_task_type),
                ),
            ),
        )
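The effect of this change, sketched below: a user config can now pin the eval task type (for example, to the VBenchEvalTask added in this PR), and Eval.update_cfg respects it instead of always forcing OpenICLEvalTask. The cfg literal is illustrative:

from mmengine.config import ConfigDict

from ais_bench.benchmark.tasks import VBenchEvalTask

# Illustrative user config with eval.runner.task.type already set.
cfg = ConfigDict(
    eval=dict(
        runner=dict(
            task=dict(type=VBenchEvalTask),
        ),
    ),
)
# With the new branch above, eval_task_type resolves to
# get_config_type(VBenchEvalTask) rather than being overwritten.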
7 changes: 7 additions & 0 deletions ais_bench/benchmark/configs/summarizers/vbench.py
@@ -0,0 +1,7 @@
from ais_bench.benchmark.summarizers import VBenchSummarizer

summarizer = dict(
    attr='accuracy',
    type=VBenchSummarizer,
    summary_groups=[],
)
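Assuming ais_bench follows the OpenCompass-style pure-Python config composition that mmengine provides, a top-level benchmark config could pull this summarizer in roughly as follows; the read_base usage and import path are assumptions, not confirmed by this PR:

from mmengine.config import read_base

with read_base():
    # Hypothetical composition: reuse the summarizer dict defined above.
    from ais_bench.benchmark.configs.summarizers.vbench import summarizer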
1 change: 1 addition & 0 deletions ais_bench/benchmark/datasets/__init__.py
@@ -26,6 +26,7 @@
from ais_bench.benchmark.datasets.race import *
from ais_bench.benchmark.datasets.textvqa import *
from ais_bench.benchmark.datasets.videobench import *
from ais_bench.benchmark.datasets.vbench import *
from ais_bench.benchmark.datasets.vocalsound import *
from ais_bench.benchmark.datasets.lambada import * # noqa: F401, F403
from ais_bench.benchmark.datasets.lcsts import * # noqa: F401, F403
21 changes: 21 additions & 0 deletions ais_bench/benchmark/datasets/vbench.py
@@ -0,0 +1,21 @@
"""VBench 1.0 dataset config type for video/image quality evaluation (eval-only, no loader)."""
from datasets import Dataset

from ais_bench.benchmark.registry import LOAD_DATASET
from ais_bench.benchmark.datasets.base import BaseDataset


@LOAD_DATASET.register_module()
class VBenchDataset(BaseDataset):
    """Placeholder dataset for VBench evaluation.

    VBench evaluation uses only dataset config (path/videos_path, dimension_list,
    full_json_dir, eval_cfg). This class provides a minimal load() so that
    LOAD_DATASET.build(dataset_cfg) does not fail if ever called; the actual
    evaluation is done in VBenchEvalTask which reads the config directly.
    """

    @staticmethod
    def load(path: str, **kwargs):
        """Return a minimal placeholder dataset. VBench eval uses config only."""
        return Dataset.from_list([{"dummy": 0}])
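A minimal sketch of the guard described in the docstring, assuming LOAD_DATASET.build follows the usual mmengine registry pattern of instantiating the registered type from the remaining config keys:

from ais_bench.benchmark.registry import LOAD_DATASET

# Illustrative config; in the VBench flow this is normally never built, since
# VBenchEvalTask reads the dataset config fields directly.
dataset_cfg = dict(type="VBenchDataset", path="outputs/videos")
placeholder = LOAD_DATASET.build(dataset_cfg)  # yields the one-row dummy dataset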
3 changes: 2 additions & 1 deletion ais_bench/benchmark/summarizers/__init__.py
@@ -1,4 +1,5 @@
# flake8: noqa: F401, E501
from ais_bench.benchmark.summarizers.default import DefaultSummarizer # noqa: F401
from ais_bench.benchmark.summarizers.default_subjective import DefaultSubjectiveSummarizer # noqa: F401
from ais_bench.benchmark.summarizers.default_perf import DefaultPerfSummarizer # noqa: F401
from ais_bench.benchmark.summarizers.default_perf import DefaultPerfSummarizer # noqa: F401
from ais_bench.benchmark.summarizers.vbench import VBenchSummarizer # noqa: F401
168 changes: 168 additions & 0 deletions ais_bench/benchmark/summarizers/vbench.py
@@ -0,0 +1,168 @@
# flake8: noqa
# yapf: disable
"""VBench summarizer with official normalization and aggregation logic."""
import re
from typing import Dict

from ais_bench.benchmark.summarizers.default import DefaultSummarizer

# VBench official constants from scripts/constant.py
NORMALIZE_DIC = {
    "subject consistency": {"Min": 0.1462, "Max": 1.0},
    "background consistency": {"Min": 0.2615, "Max": 1.0},
    "temporal flickering": {"Min": 0.6293, "Max": 1.0},
    "motion smoothness": {"Min": 0.706, "Max": 0.9975},
    "dynamic degree": {"Min": 0.0, "Max": 1.0},
    "aesthetic quality": {"Min": 0.0, "Max": 1.0},
    "imaging quality": {"Min": 0.0, "Max": 1.0},
    "object class": {"Min": 0.0, "Max": 1.0},
    "multiple objects": {"Min": 0.0, "Max": 1.0},
    "human action": {"Min": 0.0, "Max": 1.0},
    "color": {"Min": 0.0, "Max": 1.0},
    "spatial relationship": {"Min": 0.0, "Max": 1.0},
    "scene": {"Min": 0.0, "Max": 0.8222},
    "appearance style": {"Min": 0.0009, "Max": 0.2855},
    "temporal style": {"Min": 0.0, "Max": 0.364},
    "overall consistency": {"Min": 0.0, "Max": 0.364},
}
DIM_WEIGHT = {
    "subject consistency": 1,
    "background consistency": 1,
    "temporal flickering": 1,
    "motion smoothness": 1,
    "aesthetic quality": 1,
    "imaging quality": 1,
    "dynamic degree": 0.5,
    "object class": 1,
    "multiple objects": 1,
    "human action": 1,
    "color": 1,
    "spatial relationship": 1,
    "scene": 1,
    "appearance style": 1,
    "temporal style": 1,
    "overall consistency": 1,
}
QUALITY_LIST = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",
]
SEMANTIC_LIST = [
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]
QUALITY_WEIGHT = 4
SEMANTIC_WEIGHT = 1

# Known dimension names (underscore form) for regex extraction from abbr
_DIM_PATTERN = re.compile(
    r'(subject_consistency|background_consistency|temporal_flickering|'
    r'motion_smoothness|dynamic_degree|aesthetic_quality|imaging_quality|'
    r'object_class|multiple_objects|human_action|color|spatial_relationship|'
    r'scene|appearance_style|temporal_style|overall_consistency)$'
)


def _abbr_to_const_key(abbr: str) -> str:
    """Extract dimension from abbr, e.g. vbench_custom_subject_consistency -> subject consistency."""
    m = _DIM_PATTERN.search(abbr)
    if m:
        return m.group(1).replace('_', ' ')
    if abbr.startswith('vbench_'):
        return abbr[7:].replace('_', ' ')
    return abbr.replace('_', ' ')


def _get_normalized_score(raw_score: float, const_key: str) -> float:
    """Normalize and apply DIM_WEIGHT per cal_final_score.py."""
    if const_key not in NORMALIZE_DIC or const_key not in DIM_WEIGHT:
        return 0.0
    raw = raw_score / 100.0 if raw_score > 1 else raw_score
    min_val = NORMALIZE_DIC[const_key]['Min']
    max_val = NORMALIZE_DIC[const_key]['Max']
    span = max_val - min_val
    if span <= 0:
        norm = 1.0 if raw >= max_val else 0.0
    else:
        norm = (raw - min_val) / span
    return norm * DIM_WEIGHT[const_key]


class VBenchSummarizer(DefaultSummarizer):
    """VBench summarizer using official cal_final_score.py logic.

    Computes Quality Score, Semantic Score, Total Score with:
    - Per-dimension normalization: (score - Min) / (Max - Min) * DIM_WEIGHT
    - Quality = weighted avg of QUALITY_LIST dims
    - Semantic = weighted avg of SEMANTIC_LIST dims
    - Total = (Quality * 4 + Semantic * 1) / 5
    """

    def _calculate_group_metrics(
        self,
        raw_results: Dict,
        parsed_results: Dict,
        dataset_metrics: Dict,
        dataset_eval_mode: Dict,
    ):
        """Compute vbench Quality, Semantic, Total using official formula."""
        for model_abbr in self.model_abbrs:
            model_results = parsed_results.get(model_abbr, {})
            vbench_scores = {}
            for abbr, data in model_results.items():
                if not abbr.startswith('vbench_'):
                    continue
                acc = data.get('accuracy')
                if acc is None or not isinstance(acc, (int, float)):
                    continue
                const_key = _abbr_to_const_key(abbr)
                vbench_scores[const_key] = acc

            if not vbench_scores:
                continue

            normalized = {
                k: _get_normalized_score(v, k)
                for k, v in vbench_scores.items()
            }

            quality_num = sum(normalized.get(k, 0) for k in QUALITY_LIST)
            quality_denom = sum(DIM_WEIGHT.get(k, 0) for k in QUALITY_LIST)
            quality_score = (
                quality_num / quality_denom if quality_denom else 0.0
            )

            semantic_num = sum(normalized.get(k, 0) for k in SEMANTIC_LIST)
            semantic_denom = sum(DIM_WEIGHT.get(k, 0) for k in SEMANTIC_LIST)
            semantic_score = (
                semantic_num / semantic_denom if semantic_denom else 0.0
            )

            total_score = (
                quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT
            ) / (QUALITY_WEIGHT + SEMANTIC_WEIGHT)

            for name, score in [
                ('vbench_quality', quality_score * 100),
                ('vbench_semantic', semantic_score * 100),
                ('vbench_total', total_score * 100),
            ]:
                raw_results[model_abbr].setdefault(name, {})['accuracy'] = score
                parsed_results[model_abbr].setdefault(name, {})['accuracy'] = score
                if name not in dataset_metrics:
                    dataset_metrics[name] = ['accuracy']
                dataset_eval_mode[name] = 'gen'

        return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
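To make the normalization concrete, a short worked sketch using the module's own constants and helpers; the raw scores are made up for illustration:

# motion smoothness: raw 0.95 -> (0.95 - 0.706) / (0.9975 - 0.706) ≈ 0.8371,
# times DIM_WEIGHT 1 -> ≈ 0.8371.
# dynamic degree:    raw 0.60 -> (0.60 - 0.0) / (1.0 - 0.0) = 0.60,
# times DIM_WEIGHT 0.5 -> 0.30.
raw_scores = {"motion smoothness": 0.95, "dynamic degree": 0.60}
normalized = {k: _get_normalized_score(v, k) for k, v in raw_scores.items()}

quality = sum(normalized.get(k, 0) for k in QUALITY_LIST) / sum(
    DIM_WEIGHT[k] for k in QUALITY_LIST
)
# Unscored quality dims contribute 0 to the numerator while their weights stay
# in the denominator (6.5 total), matching the summarizer's aggregation above.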
1 change: 1 addition & 0 deletions ais_bench/benchmark/tasks/__init__.py
@@ -1,3 +1,4 @@
from ais_bench.benchmark.tasks.openicl_eval import * # noqa: F401, F403
from ais_bench.benchmark.tasks.openicl_infer import * # noqa: F401, F403
from ais_bench.benchmark.tasks.openicl_api_infer import OpenICLApiInferTask
from ais_bench.benchmark.tasks.vbench_eval import VBenchEvalTask # noqa: F401