From 5fb843fda4c33ab667fc3f6469dcac54ca841cab Mon Sep 17 00:00:00 2001 From: ChenLi Date: Wed, 4 Feb 2026 01:03:50 +0800 Subject: [PATCH 1/9] feat(evm): enable basic performance check in ci --- .ci/run_test_suite.sh | 60 +++++ .github/workflows/dtvm_evm_test_x86.yml | 106 ++++++++ tools/check_performance_regression.py | 345 ++++++++++++++++++++++++ 3 files changed, 511 insertions(+) create mode 100755 tools/check_performance_regression.py diff --git a/.ci/run_test_suite.sh b/.ci/run_test_suite.sh index fdce39cf..62e7c55e 100644 --- a/.ci/run_test_suite.sh +++ b/.ci/run_test_suite.sh @@ -77,6 +77,9 @@ case $TestSuite in "evmfallbacksuite") CMAKE_OPTIONS="$CMAKE_OPTIONS -DZEN_ENABLE_SPEC_TEST=ON -DZEN_ENABLE_ASSEMBLYSCRIPT_TEST=ON -DZEN_ENABLE_EVM=ON -DZEN_ENABLE_LIBEVM=ON -DZEN_ENABLE_JIT_FALLBACK_TEST=ON" ;; + "benchmarksuite") + CMAKE_OPTIONS="$CMAKE_OPTIONS -DZEN_ENABLE_EVM=ON -DZEN_ENABLE_LIBEVM=ON -DZEN_ENABLE_SINGLEPASS_JIT=OFF -DZEN_ENABLE_MULTIPASS_JIT=ON" + ;; esac case $CPU_EXCEPTION_TYPE in @@ -97,6 +100,10 @@ if [[ $TestSuite == "evmonetestsuite" ]]; then STACK_TYPES=("-DZEN_ENABLE_VIRTUAL_STACK=ON") fi +if [[ $TestSuite == "benchmarksuite" ]]; then + STACK_TYPES=("-DZEN_ENABLE_VIRTUAL_STACK=ON") +fi + export PATH=$PATH:$PWD/build CMAKE_OPTIONS_ORIGIN="$CMAKE_OPTIONS" @@ -163,5 +170,58 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do python3 tools/run_evm_tests.py -r build/dtvm $EXTRA_EXE_OPTIONS ./build/evmFallbackExecutionTests ;; + "benchmarksuite") + # Clone evmone and run performance regression check + EVMONE_DIR="evmone" + if [ ! 
-d "$EVMONE_DIR" ]; then + git clone --depth 1 --recurse-submodules -b for_test https://github.com/DTVMStack/evmone.git $EVMONE_DIR + fi + + # Set default values for benchmark + BENCHMARK_THRESHOLD=${BENCHMARK_THRESHOLD:-0.10} + BENCHMARK_MODE=${BENCHMARK_MODE:-multipass} + + # Copy DTVM library to evmone directory + cp build/lib/* $EVMONE_DIR/ + + cd $EVMONE_DIR + + # Copy check_performance_regression.py from DTVM repo + cp ../tools/check_performance_regression.py scripts/ + + # Build evmone if not already built + if [ ! -f "build/bin/evmone-bench" ]; then + cmake -S . -B build -DEVMONE_TESTING=ON -DCMAKE_BUILD_TYPE=Release + cmake --build build --parallel -j 16 + fi + + # Run performance check based on mode + if [ -n "$BENCHMARK_SAVE_BASELINE" ]; then + echo "Saving performance baseline..." + python3 scripts/check_performance_regression.py \ + --save-baseline "$BENCHMARK_SAVE_BASELINE" \ + --lib ./libdtvmapi.so \ + --mode "$BENCHMARK_MODE" \ + --benchmark-dir test/evm-benchmarks/benchmarks + elif [ -n "$BENCHMARK_BASELINE_FILE" ]; then + echo "Checking performance regression against baseline..." + python3 scripts/check_performance_regression.py \ + --baseline "$BENCHMARK_BASELINE_FILE" \ + --threshold "$BENCHMARK_THRESHOLD" \ + --lib ./libdtvmapi.so \ + --mode "$BENCHMARK_MODE" \ + --benchmark-dir test/evm-benchmarks/benchmarks + else + echo "Running benchmark suite without comparison..." + python3 scripts/check_performance_regression.py \ + --save-baseline benchmark_results.json \ + --lib ./libdtvmapi.so \ + --mode "$BENCHMARK_MODE" \ + --benchmark-dir test/evm-benchmarks/benchmarks + cat benchmark_results.json + fi + + cd .. 
+ ;; esac done diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml index 2070e2fa..32525fff 100644 --- a/.github/workflows/dtvm_evm_test_x86.yml +++ b/.github/workflows/dtvm_evm_test_x86.yml @@ -201,3 +201,109 @@ jobs: export ENABLE_GAS_METER=true bash .ci/run_test_suite.sh + + performance_regression_check: + name: Performance Regression Check (10% threshold) + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + container: + image: dtvmdev1/dtvm-dev-x64:main + steps: + - name: Check out code + uses: actions/checkout@v3 + with: + submodules: "true" + fetch-depth: 0 + + - name: Code Format Check + run: | + ./tools/format.sh check + + - name: Build baseline (${{ github.base_ref }}) + run: | + echo "Building baseline on branch: ${{ github.base_ref }}" + export LLVM_SYS_150_PREFIX=/opt/llvm15 + export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm + export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH + + # Save current state + git stash + git checkout ${{ github.base_ref }} + + # Build baseline + export CMAKE_BUILD_TARGET=Release + export ENABLE_ASAN=false + export RUN_MODE=multipass + export ENABLE_LAZY=false + export ENABLE_MULTITHREAD=true + export TestSuite=benchmarksuite + export CPU_EXCEPTION_TYPE='cpu' + export BENCHMARK_MODE=multipass + export BENCHMARK_SAVE_BASELINE=/tmp/perf_baseline.json + + bash .ci/run_test_suite.sh + + - name: Build current PR and check regression + id: perf-check + run: | + echo "Building PR branch: ${{ github.sha }}" + export LLVM_SYS_150_PREFIX=/opt/llvm15 + export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm + export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH + + # Switch back to PR branch + git checkout ${{ github.sha }} + git stash pop || true + + # Clean and rebuild for current PR + rm -rf build evmone + + # Build and check + export CMAKE_BUILD_TARGET=Release + export ENABLE_ASAN=false + export RUN_MODE=multipass + export ENABLE_LAZY=false + export ENABLE_MULTITHREAD=true + export 
TestSuite=benchmarksuite + export CPU_EXCEPTION_TYPE='cpu' + export BENCHMARK_MODE=multipass + export BENCHMARK_THRESHOLD=0.10 + export BENCHMARK_BASELINE_FILE=/tmp/perf_baseline.json + + set +e + bash .ci/run_test_suite.sh + EXIT_CODE=$? + set -e + echo "exit_code=$EXIT_CODE" >> $GITHUB_OUTPUT + exit $EXIT_CODE + continue-on-error: true + + - name: Comment on PR (Success) + if: steps.perf-check.outputs.exit_code == '0' + uses: actions/github-script@v6 + with: + script: | + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: '✅ **Performance Check Passed**\n\nNo significant performance regression detected (< 10% threshold).' + }) + + - name: Comment on PR (Regression) + if: steps.perf-check.outputs.exit_code == '1' + uses: actions/github-script@v6 + with: + script: | + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: '⚠️ **Performance Regression Detected**\n\nSome benchmarks show >10% performance degradation compared to `${{ github.base_ref }}`. Please check the workflow logs for details.' + }) + + - name: Fail on regression + if: steps.perf-check.outputs.exit_code == '1' + run: | + echo "::error::Performance regression detected. See logs for details." + exit 1 diff --git a/tools/check_performance_regression.py b/tools/check_performance_regression.py new file mode 100755 index 00000000..33042cd0 --- /dev/null +++ b/tools/check_performance_regression.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python3 +""" +Performance regression checker for evmone benchmarks. 
+ +Usage: + # Save baseline results + python check_performance_regression.py --save-baseline baseline.json + + # Check for regressions against baseline + python check_performance_regression.py --baseline baseline.json + + # Check with custom threshold (default 10%) + python check_performance_regression.py --baseline baseline.json --threshold 0.15 + +Exit codes: + 0 - No significant regression detected + 1 - Performance regression detected (> threshold) + 2 - Script error (execution failed, file not found, etc.) +""" + +import argparse +import json +import subprocess +import sys +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + + +@dataclass +class BenchmarkResult: + name: str + time_ns: float # Time in nanoseconds + cpu_time_ns: float + iterations: int + + +def run_benchmark( + lib_path: str, + mode: str, + benchmark_dir: str, + extra_args: Optional[List[str]] = None, +) -> List[BenchmarkResult]: + """Run benchmark and parse JSON output.""" + env = {"EVMONE_EXTERNAL_OPTIONS": f"{lib_path},mode={mode}"} + + cmd = [ + "./build/bin/evmone-bench", + benchmark_dir, + "--benchmark_filter=external/*", + "--benchmark_format=json", + ] + + if extra_args: + cmd.extend(extra_args) + + print(f"Running: {' '.join(cmd)}") + print(f"Environment: EVMONE_EXTERNAL_OPTIONS={env['EVMONE_EXTERNAL_OPTIONS']}") + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env={**subprocess.os.environ, **env}, + ) + + if result.returncode != 0: + print(f"Benchmark execution failed with code {result.returncode}") + print(f"stderr: {result.stderr}") + sys.exit(2) + + return parse_benchmark_json(result.stdout) + + +def parse_benchmark_json(json_output: str) -> List[BenchmarkResult]: + """Parse Google Benchmark JSON output.""" + try: + data = json.loads(json_output) + except json.JSONDecodeError as e: + print(f"Failed to parse JSON: {e}") + sys.exit(2) + + results = [] + for benchmark in data.get("benchmarks", []): + # Skip aggregates like mean, 
median, stddev + if benchmark.get("run_type") != "iteration": + continue + + results.append( + BenchmarkResult( + name=benchmark["name"], + time_ns=benchmark.get("real_time", 0), + cpu_time_ns=benchmark.get("cpu_time", 0), + iterations=benchmark.get("iterations", 1), + ) + ) + + return results + + +def load_baseline(path: str) -> List[BenchmarkResult]: + """Load baseline results from JSON file.""" + try: + with open(path, "r") as f: + data = json.load(f) + except FileNotFoundError: + print(f"::error::Baseline file not found: {path}") + sys.exit(2) + except json.JSONDecodeError as e: + print(f"::error::Failed to parse baseline JSON: {e}") + sys.exit(2) + + results = [] + for item in data: + results.append( + BenchmarkResult( + name=item["name"], + time_ns=item["time_ns"], + cpu_time_ns=item["cpu_time_ns"], + iterations=item["iterations"], + ) + ) + + return results + + +def save_baseline(results: List[BenchmarkResult], path: str) -> None: + """Save baseline results to JSON file.""" + data = [] + for r in results: + data.append({ + "name": r.name, + "time_ns": r.time_ns, + "cpu_time_ns": r.cpu_time_ns, + "iterations": r.iterations, + }) + + with open(path, "w") as f: + json.dump(data, f, indent=2) + + print(f"Saved {len(results)} benchmark results to {path}") + + +def compare_benchmarks( + current: List[BenchmarkResult], + baseline: List[BenchmarkResult], + threshold: float, +) -> Tuple[bool, List[dict]]: + """ + Compare current results against baseline. 
+ + Returns: + (has_regression, comparison_details) + """ + baseline_map = {b.name: b for b in baseline} + current_map = {c.name: c for c in current} + + # Find missing and new benchmarks + baseline_names = set(baseline_map.keys()) + current_names = set(current_map.keys()) + + missing = baseline_names - current_names + new = current_names - baseline_names + + if missing: + print(f"::warning::Missing benchmarks (in baseline but not in current): {missing}") + if new: + print(f"::notice::New benchmarks (in current but not in baseline): {new}") + + # Compare common benchmarks + comparisons = [] + has_regression = False + + for name in sorted(baseline_names & current_names): + b = baseline_map[name] + c = current_map[name] + + # Calculate percentage change (positive = slower/regression) + time_change = (c.time_ns - b.time_ns) / b.time_ns + cpu_change = (c.cpu_time_ns - b.cpu_time_ns) / b.cpu_time_ns + + # Use the worse of real_time or cpu_time change + max_change = max(time_change, cpu_change) + + is_regression = max_change > threshold + if is_regression: + has_regression = True + + comparisons.append({ + "name": name, + "baseline_time_ns": b.time_ns, + "current_time_ns": c.time_ns, + "time_change": time_change, + "cpu_change": cpu_change, + "max_change": max_change, + "is_regression": is_regression, + }) + + return has_regression, comparisons + + +def print_comparison_table(comparisons: List[dict], threshold: float) -> None: + """Print a formatted comparison table.""" + if not comparisons: + print("No benchmarks to compare.") + return + + # GitHub Actions annotation messages + print("\n" + "=" * 100) + print(f"{'Benchmark':<60} {'Baseline(μs)':<15} {'Current(μs)':<15} {'Change':<12} {'Status'}") + print("=" * 100) + + regression_count = 0 + for comp in comparisons: + name = comp["name"] + baseline_us = comp["baseline_time_ns"] / 1000 + current_us = comp["current_time_ns"] / 1000 + change_pct = comp["max_change"] * 100 + status = "✓ PASS" if not comp["is_regression"] 
else "✗ FAIL" + + # Truncate long names + display_name = name if len(name) < 60 else name[:57] + "..." + + print(f"{display_name:<60} {baseline_us:<15.2f} {current_us:<15.2f} {change_pct:>+10.1f}% {status}") + + if comp["is_regression"]: + regression_count += 1 + # GitHub Actions warning annotation + print(f"::warning title=Performance Regression::{name} regressed by {change_pct:.1f}% (threshold: {threshold*100:.0f}%)") + + print("=" * 100) + print(f"\nTotal benchmarks: {len(comparisons)}") + print(f"Regressions (> {threshold*100:.0f}%): {regression_count}") + + +def main(): + parser = argparse.ArgumentParser( + description="Check for performance regressions in evmone benchmarks", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Save baseline after a known-good commit + python check_performance_regression.py --save-baseline baseline.json + + # Check current commit against baseline in CI + python check_performance_regression.py --baseline baseline.json + + # Check with custom threshold (15% instead of default 10%) + python check_performance_regression.py --baseline baseline.json --threshold 0.15 + + # Specify different library or benchmark directory + python check_performance_regression.py --baseline baseline.json --lib ./other.so --mode jit +""", + ) + + parser.add_argument( + "--baseline", + metavar="PATH", + help="Path to baseline JSON file for comparison", + ) + parser.add_argument( + "--save-baseline", + metavar="PATH", + help="Run benchmarks and save results to file (use this to create baseline)", + ) + parser.add_argument( + "--threshold", + type=float, + default=0.10, + help="Regression threshold as ratio (default: 0.10 = 10%%)", + ) + parser.add_argument( + "--lib", + default="./libdtvmapi.so", + help="Path to the library to benchmark (default: ./libdtvmapi.so)", + ) + parser.add_argument( + "--mode", + default="interpreter", + help="Mode for the library (default: interpreter)", + ) + parser.add_argument( + 
"--benchmark-dir", + default="test/evm-benchmarks/benchmarks", + help="Path to benchmark directory (default: test/evm-benchmarks/benchmarks)", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Verbose output", + ) + + args = parser.parse_args() + + if not args.baseline and not args.save_baseline: + parser.error("Either --baseline or --save-baseline must be specified") + + # Run benchmarks + try: + current_results = run_benchmark( + lib_path=args.lib, + mode=args.mode, + benchmark_dir=args.benchmark_dir, + ) + except Exception as e: + print(f"::error::Failed to run benchmarks: {e}") + sys.exit(2) + + if not current_results: + print("::error::No benchmark results found") + sys.exit(2) + + print(f"Collected {len(current_results)} benchmark results") + + # Save baseline mode + if args.save_baseline: + save_baseline(current_results, args.save_baseline) + return 0 + + # Compare mode + baseline_results = load_baseline(args.baseline) + print(f"Loaded {len(baseline_results)} baseline results from {args.baseline}") + + has_regression, comparisons = compare_benchmarks( + current_results, + baseline_results, + args.threshold, + ) + + print_comparison_table(comparisons, args.threshold) + + # Summary for GitHub Actions + print("\n" + "=" * 100) + if has_regression: + print(f"::error::Performance regression detected! 
Some benchmarks exceeded {args.threshold*100:.0f}% threshold.") + print("RESULT: FAIL") + return 1 + else: + print("::notice::No significant performance regression detected.") + print("RESULT: PASS") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From c4821334e8a5de7c97cbe625a8181d911b34436d Mon Sep 17 00:00:00 2001 From: cl507523 Date: Thu, 5 Feb 2026 06:21:31 +0000 Subject: [PATCH 2/9] fix: ci workflow --- .ci/run_test_suite.sh | 8 ++++---- .github/workflows/dtvm_evm_test_x86.yml | 14 ++++++++++++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.ci/run_test_suite.sh b/.ci/run_test_suite.sh index 62e7c55e..f5ce6d6b 100644 --- a/.ci/run_test_suite.sh +++ b/.ci/run_test_suite.sh @@ -187,7 +187,7 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do cd $EVMONE_DIR # Copy check_performance_regression.py from DTVM repo - cp ../tools/check_performance_regression.py scripts/ + cp ../tools/check_performance_regression.py ./ # Build evmone if not already built if [ ! -f "build/bin/evmone-bench" ]; then @@ -198,14 +198,14 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do # Run performance check based on mode if [ -n "$BENCHMARK_SAVE_BASELINE" ]; then echo "Saving performance baseline..." - python3 scripts/check_performance_regression.py \ + python3 check_performance_regression.py \ --save-baseline "$BENCHMARK_SAVE_BASELINE" \ --lib ./libdtvmapi.so \ --mode "$BENCHMARK_MODE" \ --benchmark-dir test/evm-benchmarks/benchmarks elif [ -n "$BENCHMARK_BASELINE_FILE" ]; then echo "Checking performance regression against baseline..." - python3 scripts/check_performance_regression.py \ + python3 check_performance_regression.py \ --baseline "$BENCHMARK_BASELINE_FILE" \ --threshold "$BENCHMARK_THRESHOLD" \ --lib ./libdtvmapi.so \ @@ -213,7 +213,7 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do --benchmark-dir test/evm-benchmarks/benchmarks else echo "Running benchmark suite without comparison..." 
- python3 scripts/check_performance_regression.py \ + python3 check_performance_regression.py \ --save-baseline benchmark_results.json \ --lib ./libdtvmapi.so \ --mode "$BENCHMARK_MODE" \ diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml index 32525fff..cd002eb5 100644 --- a/.github/workflows/dtvm_evm_test_x86.yml +++ b/.github/workflows/dtvm_evm_test_x86.yml @@ -206,6 +206,9 @@ jobs: name: Performance Regression Check (10% threshold) if: github.event_name == 'pull_request' runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write container: image: dtvmdev1/dtvm-dev-x64:main steps: @@ -215,6 +218,11 @@ jobs: submodules: "true" fetch-depth: 0 + - name: Setup git safe directory + run: | + echo "Configuring git safe directory: ${{ github.workspace }}" + git config --global --add safe.directory /__w/DTVM/DTVM + - name: Code Format Check run: | ./tools/format.sh check @@ -222,12 +230,13 @@ jobs: - name: Build baseline (${{ github.base_ref }}) run: | echo "Building baseline on branch: ${{ github.base_ref }}" + export LLVM_SYS_150_PREFIX=/opt/llvm15 export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH - # Save current state - git stash + # Save current state (including untracked files) + git stash push -u -m "perf-check-stash" git checkout ${{ github.base_ref }} # Build baseline @@ -247,6 +256,7 @@ jobs: id: perf-check run: | echo "Building PR branch: ${{ github.sha }}" + export LLVM_SYS_150_PREFIX=/opt/llvm15 export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH From 65019a836abddedf192540747f926a2da150e9ba Mon Sep 17 00:00:00 2001 From: cl507523 Date: Mon, 9 Feb 2026 06:29:43 +0000 Subject: [PATCH 3/9] fix: add summary --- .ci/run_test_suite.sh | 6 ++ .github/workflows/dtvm_evm_test_x86.yml | 41 +++++------ tools/check_performance_regression.py | 92 +++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 22 
deletions(-) diff --git a/.ci/run_test_suite.sh b/.ci/run_test_suite.sh index f5ce6d6b..c5bcf40c 100644 --- a/.ci/run_test_suite.sh +++ b/.ci/run_test_suite.sh @@ -195,11 +195,15 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do cmake --build build --parallel -j 16 fi + # Default summary output path (can be overridden via env) + BENCHMARK_SUMMARY_FILE=${BENCHMARK_SUMMARY_FILE:-/tmp/perf_summary.md} + # Run performance check based on mode if [ -n "$BENCHMARK_SAVE_BASELINE" ]; then echo "Saving performance baseline..." python3 check_performance_regression.py \ --save-baseline "$BENCHMARK_SAVE_BASELINE" \ + --output-summary "$BENCHMARK_SUMMARY_FILE" \ --lib ./libdtvmapi.so \ --mode "$BENCHMARK_MODE" \ --benchmark-dir test/evm-benchmarks/benchmarks @@ -208,6 +212,7 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do python3 check_performance_regression.py \ --baseline "$BENCHMARK_BASELINE_FILE" \ --threshold "$BENCHMARK_THRESHOLD" \ + --output-summary "$BENCHMARK_SUMMARY_FILE" \ --lib ./libdtvmapi.so \ --mode "$BENCHMARK_MODE" \ --benchmark-dir test/evm-benchmarks/benchmarks @@ -215,6 +220,7 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do echo "Running benchmark suite without comparison..." python3 check_performance_regression.py \ --save-baseline benchmark_results.json \ + --output-summary "$BENCHMARK_SUMMARY_FILE" \ --lib ./libdtvmapi.so \ --mode "$BENCHMARK_MODE" \ --benchmark-dir test/evm-benchmarks/benchmarks diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml index cd002eb5..8203acd8 100644 --- a/.github/workflows/dtvm_evm_test_x86.yml +++ b/.github/workflows/dtvm_evm_test_x86.yml @@ -279,41 +279,38 @@ jobs: export BENCHMARK_MODE=multipass export BENCHMARK_THRESHOLD=0.10 export BENCHMARK_BASELINE_FILE=/tmp/perf_baseline.json + export BENCHMARK_SUMMARY_FILE=/tmp/perf_summary.md - set +e bash .ci/run_test_suite.sh - EXIT_CODE=$? 
- set -e - echo "exit_code=$EXIT_CODE" >> $GITHUB_OUTPUT - exit $EXIT_CODE continue-on-error: true - - name: Comment on PR (Success) - if: steps.perf-check.outputs.exit_code == '0' + - name: Comment on PR + if: always() uses: actions/github-script@v6 with: script: | + const fs = require('fs'); + const passed = '${{ steps.perf-check.outcome }}' === 'success'; + let summary = ''; + try { + summary = fs.readFileSync('/tmp/perf_summary.md', 'utf8'); + } catch (e) { + summary = '_No benchmark summary available._'; + } + const icon = passed ? '✅' : '⚠️'; + const title = passed + ? 'Performance Check Passed' + : 'Performance Regression Detected'; + const body = `${icon} **${title}**\n\n${summary}`; github.rest.issues.createComment({ issue_number: context.issue.number, owner: context.repo.owner, repo: context.repo.repo, - body: '✅ **Performance Check Passed**\n\nNo significant performance regression detected (< 10% threshold).' - }) - - - name: Comment on PR (Regression) - if: steps.perf-check.outputs.exit_code == '1' - uses: actions/github-script@v6 - with: - script: | - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: '⚠️ **Performance Regression Detected**\n\nSome benchmarks show >10% performance degradation compared to `${{ github.base_ref }}`. Please check the workflow logs for details.' - }) + body: body + }); - name: Fail on regression - if: steps.perf-check.outputs.exit_code == '1' + if: steps.perf-check.outcome == 'failure' run: | echo "::error::Performance regression detected. See logs for details." 
exit 1 diff --git a/tools/check_performance_regression.py b/tools/check_performance_regression.py index 33042cd0..571ce371 100755 --- a/tools/check_performance_regression.py +++ b/tools/check_performance_regression.py @@ -232,6 +232,79 @@ def print_comparison_table(comparisons: List[dict], threshold: float) -> None: print(f"Regressions (> {threshold*100:.0f}%): {regression_count}") +def _short_name(name: str) -> str: + """Extract a short display name from the full benchmark name. + + Benchmark names typically look like 'external/some_case/variant'. + We strip the leading 'external/' prefix to keep the table compact. + """ + if name.startswith("external/"): + return name[len("external/"):] + return name + + +def generate_markdown_summary( + comparisons: List[dict], + threshold: float, + has_regression: bool, +) -> str: + """Generate a concise Markdown summary of benchmark comparison results.""" + lines: List[str] = [] + + regression_count = sum(1 for c in comparisons if c["is_regression"]) + + lines.append( + f"**Performance Benchmark Results** (threshold: {threshold*100:.0f}%)" + ) + lines.append("") + + if not comparisons: + lines.append("_No benchmarks to compare._") + return "\n".join(lines) + + # Markdown table header + lines.append("| Benchmark | Baseline (us) | Current (us) | Change | Status |") + lines.append("|-----------|--------------|-------------|--------|--------|") + + for comp in comparisons: + name = _short_name(comp["name"]) + baseline_us = comp["baseline_time_ns"] / 1000 + current_us = comp["current_time_ns"] / 1000 + change_pct = comp["max_change"] * 100 + status = "PASS" if not comp["is_regression"] else "**REGRESSED**" + + lines.append( + f"| {name} | {baseline_us:.2f} | {current_us:.2f} " + f"| {change_pct:+.1f}% | {status} |" + ) + + lines.append("") + lines.append( + f"**Summary**: {len(comparisons)} benchmarks, " + f"{regression_count} regressions" + ) + + return "\n".join(lines) + + +def generate_baseline_summary(results: 
List[BenchmarkResult]) -> str: + """Generate a concise Markdown summary for a baseline-save run.""" + lines: List[str] = [] + lines.append("**Baseline Benchmark Results**") + lines.append("") + lines.append("| Benchmark | Time (us) |") + lines.append("|-----------|----------|") + + for r in results: + name = _short_name(r.name) + time_us = r.time_ns / 1000 + lines.append(f"| {name} | {time_us:.2f} |") + + lines.append("") + lines.append(f"**Total**: {len(results)} benchmarks collected") + return "\n".join(lines) + + def main(): parser = argparse.ArgumentParser( description="Check for performance regressions in evmone benchmarks", @@ -283,6 +356,11 @@ def main(): default="test/evm-benchmarks/benchmarks", help="Path to benchmark directory (default: test/evm-benchmarks/benchmarks)", ) + parser.add_argument( + "--output-summary", + metavar="PATH", + help="Write a concise Markdown summary to the given file (for PR comments)", + ) parser.add_argument( "--verbose", "-v", @@ -315,6 +393,11 @@ def main(): # Save baseline mode if args.save_baseline: save_baseline(current_results, args.save_baseline) + if args.output_summary: + summary_md = generate_baseline_summary(current_results) + with open(args.output_summary, "w") as f: + f.write(summary_md) + print(f"Wrote baseline summary to {args.output_summary}") return 0 # Compare mode @@ -329,6 +412,15 @@ def main(): print_comparison_table(comparisons, args.threshold) + # Write Markdown summary for PR comments + if args.output_summary: + summary_md = generate_markdown_summary( + comparisons, args.threshold, has_regression + ) + with open(args.output_summary, "w") as f: + f.write(summary_md) + print(f"Wrote comparison summary to {args.output_summary}") + # Summary for GitHub Actions print("\n" + "=" * 100) if has_regression: From 36fd85db2ef00e63b9e2f8432e4bc48d2afdbef6 Mon Sep 17 00:00:00 2001 From: cl507523 Date: Mon, 9 Feb 2026 07:15:44 +0000 Subject: [PATCH 4/9] fix: only interpreter --- .github/workflows/dtvm_evm_test_x86.yml 
| 4 +-- tools/check_performance_regression.py | 39 +++++++++++++++++++++---- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml index 8203acd8..e817b16f 100644 --- a/.github/workflows/dtvm_evm_test_x86.yml +++ b/.github/workflows/dtvm_evm_test_x86.yml @@ -247,7 +247,7 @@ jobs: export ENABLE_MULTITHREAD=true export TestSuite=benchmarksuite export CPU_EXCEPTION_TYPE='cpu' - export BENCHMARK_MODE=multipass + export BENCHMARK_MODE=interpreter export BENCHMARK_SAVE_BASELINE=/tmp/perf_baseline.json bash .ci/run_test_suite.sh @@ -276,7 +276,7 @@ jobs: export ENABLE_MULTITHREAD=true export TestSuite=benchmarksuite export CPU_EXCEPTION_TYPE='cpu' - export BENCHMARK_MODE=multipass + export BENCHMARK_MODE=interpreter export BENCHMARK_THRESHOLD=0.10 export BENCHMARK_BASELINE_FILE=/tmp/perf_baseline.json export BENCHMARK_SUMMARY_FILE=/tmp/perf_summary.md diff --git a/tools/check_performance_regression.py b/tools/check_performance_regression.py index 571ce371..12c0eb2f 100755 --- a/tools/check_performance_regression.py +++ b/tools/check_performance_regression.py @@ -20,8 +20,10 @@ import argparse import json +import os import subprocess import sys +import tempfile from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -40,14 +42,26 @@ def run_benchmark( benchmark_dir: str, extra_args: Optional[List[str]] = None, ) -> List[BenchmarkResult]: - """Run benchmark and parse JSON output.""" + """Run benchmark and parse JSON output. + + Uses --benchmark_out to write JSON results to a temporary file so that + the human-readable benchmark progress streams to stdout/stderr in real + time (important for CI visibility). + """ env = {"EVMONE_EXTERNAL_OPTIONS": f"{lib_path},mode={mode}"} + # Write JSON results to a temp file instead of capturing stdout. 
+ # This lets Google Benchmark's normal console output (one line per + # completed case) stream directly to the CI log in real time. + fd, json_out_path = tempfile.mkstemp(suffix=".json") + os.close(fd) + cmd = [ "./build/bin/evmone-bench", benchmark_dir, "--benchmark_filter=external/*", - "--benchmark_format=json", + f"--benchmark_out={json_out_path}", + "--benchmark_out_format=json", ] if extra_args: @@ -55,20 +69,33 @@ def run_benchmark( print(f"Running: {' '.join(cmd)}") print(f"Environment: EVMONE_EXTERNAL_OPTIONS={env['EVMONE_EXTERNAL_OPTIONS']}") + sys.stdout.flush() result = subprocess.run( cmd, - capture_output=True, - text=True, env={**subprocess.os.environ, **env}, ) if result.returncode != 0: print(f"Benchmark execution failed with code {result.returncode}") - print(f"stderr: {result.stderr}") + # Clean up temp file on failure + try: + os.unlink(json_out_path) + except OSError: + pass sys.exit(2) - return parse_benchmark_json(result.stdout) + # Read JSON results from the temp file + try: + with open(json_out_path, "r") as f: + json_data = f.read() + finally: + try: + os.unlink(json_out_path) + except OSError: + pass + + return parse_benchmark_json(json_data) def parse_benchmark_json(json_output: str) -> List[BenchmarkResult]: From 71fb23522eee1e0b0dbf65153e7c53ab2ec9e504 Mon Sep 17 00:00:00 2001 From: cl507523 Date: Wed, 25 Feb 2026 07:10:50 +0000 Subject: [PATCH 5/9] feat(ci): add multipass mode to performance regression check workflow - Add matrix strategy (interpreter, multipass) to the performance_regression_check job so both modes run in parallel; fail-fast: false so a failure in one doesn't cancel the other. - Baseline and summary files are namespaced by mode (perf_baseline_{mode}.json, perf_summary_{mode}.md) to avoid collisions between matrix runs. - PR comments include the mode name in the title. 
- Add ZEN_ENABLE_JIT_PRECOMPILE_FALLBACK=ON to benchmarksuite CMake options so the JIT suitability checker is active and prevents infinite compilation hangs on pathological synth bytecode. - Add --benchmark-filter option to check_performance_regression.py to allow callers to override the default external/* filter. Co-authored-by: Cursor --- .ci/run_test_suite.sh | 2 +- .github/workflows/dtvm_evm_test_x86.yml | 25 +++++++++++++++---------- tools/check_performance_regression.py | 14 +++++++++++++- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/.ci/run_test_suite.sh b/.ci/run_test_suite.sh index c5bcf40c..5eb986b1 100644 --- a/.ci/run_test_suite.sh +++ b/.ci/run_test_suite.sh @@ -78,7 +78,7 @@ case $TestSuite in CMAKE_OPTIONS="$CMAKE_OPTIONS -DZEN_ENABLE_SPEC_TEST=ON -DZEN_ENABLE_ASSEMBLYSCRIPT_TEST=ON -DZEN_ENABLE_EVM=ON -DZEN_ENABLE_LIBEVM=ON -DZEN_ENABLE_JIT_FALLBACK_TEST=ON" ;; "benchmarksuite") - CMAKE_OPTIONS="$CMAKE_OPTIONS -DZEN_ENABLE_EVM=ON -DZEN_ENABLE_LIBEVM=ON -DZEN_ENABLE_SINGLEPASS_JIT=OFF -DZEN_ENABLE_MULTIPASS_JIT=ON" + CMAKE_OPTIONS="$CMAKE_OPTIONS -DZEN_ENABLE_EVM=ON -DZEN_ENABLE_LIBEVM=ON -DZEN_ENABLE_SINGLEPASS_JIT=OFF -DZEN_ENABLE_MULTIPASS_JIT=ON -DZEN_ENABLE_JIT_PRECOMPILE_FALLBACK=ON" ;; esac diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml index e817b16f..fe48b533 100644 --- a/.github/workflows/dtvm_evm_test_x86.yml +++ b/.github/workflows/dtvm_evm_test_x86.yml @@ -203,12 +203,16 @@ jobs: bash .ci/run_test_suite.sh performance_regression_check: - name: Performance Regression Check (10% threshold) + name: Performance Regression Check (${{ matrix.mode }}, 10% threshold) if: github.event_name == 'pull_request' runs-on: ubuntu-latest permissions: contents: read pull-requests: write + strategy: + fail-fast: false + matrix: + mode: [interpreter, multipass] container: image: dtvmdev1/dtvm-dev-x64:main steps: @@ -247,8 +251,8 @@ jobs: export ENABLE_MULTITHREAD=true export 
TestSuite=benchmarksuite export CPU_EXCEPTION_TYPE='cpu' - export BENCHMARK_MODE=interpreter - export BENCHMARK_SAVE_BASELINE=/tmp/perf_baseline.json + export BENCHMARK_MODE=${{ matrix.mode }} + export BENCHMARK_SAVE_BASELINE=/tmp/perf_baseline_${{ matrix.mode }}.json bash .ci/run_test_suite.sh @@ -276,10 +280,10 @@ jobs: export ENABLE_MULTITHREAD=true export TestSuite=benchmarksuite export CPU_EXCEPTION_TYPE='cpu' - export BENCHMARK_MODE=interpreter + export BENCHMARK_MODE=${{ matrix.mode }} export BENCHMARK_THRESHOLD=0.10 - export BENCHMARK_BASELINE_FILE=/tmp/perf_baseline.json - export BENCHMARK_SUMMARY_FILE=/tmp/perf_summary.md + export BENCHMARK_BASELINE_FILE=/tmp/perf_baseline_${{ matrix.mode }}.json + export BENCHMARK_SUMMARY_FILE=/tmp/perf_summary_${{ matrix.mode }}.md bash .ci/run_test_suite.sh continue-on-error: true @@ -290,17 +294,18 @@ jobs: with: script: | const fs = require('fs'); + const mode = '${{ matrix.mode }}'; const passed = '${{ steps.perf-check.outcome }}' === 'success'; let summary = ''; try { - summary = fs.readFileSync('/tmp/perf_summary.md', 'utf8'); + summary = fs.readFileSync(`/tmp/perf_summary_${mode}.md`, 'utf8'); } catch (e) { summary = '_No benchmark summary available._'; } const icon = passed ? '✅' : '⚠️'; const title = passed - ? 'Performance Check Passed' - : 'Performance Regression Detected'; + ? `Performance Check Passed (${mode})` + : `Performance Regression Detected (${mode})`; const body = `${icon} **${title}**\n\n${summary}`; github.rest.issues.createComment({ issue_number: context.issue.number, @@ -312,5 +317,5 @@ jobs: - name: Fail on regression if: steps.perf-check.outcome == 'failure' run: | - echo "::error::Performance regression detected. See logs for details." + echo "::error::Performance regression detected in ${{ matrix.mode }} mode. See logs for details." 
exit 1 diff --git a/tools/check_performance_regression.py b/tools/check_performance_regression.py index 12c0eb2f..b18ce692 100755 --- a/tools/check_performance_regression.py +++ b/tools/check_performance_regression.py @@ -59,7 +59,6 @@ def run_benchmark( cmd = [ "./build/bin/evmone-bench", benchmark_dir, - "--benchmark_filter=external/*", f"--benchmark_out={json_out_path}", "--benchmark_out_format=json", ] @@ -67,6 +66,9 @@ def run_benchmark( if extra_args: cmd.extend(extra_args) + if not any(arg.startswith("--benchmark_filter") for arg in cmd): + cmd.append("--benchmark_filter=external/*") + print(f"Running: {' '.join(cmd)}") print(f"Environment: EVMONE_EXTERNAL_OPTIONS={env['EVMONE_EXTERNAL_OPTIONS']}") sys.stdout.flush() @@ -394,18 +396,28 @@ def main(): action="store_true", help="Verbose output", ) + parser.add_argument( + "--benchmark-filter", + default=None, + help="Custom regex filter forwarded to evmone-bench --benchmark_filter (default: external/*)", + ) args = parser.parse_args() if not args.baseline and not args.save_baseline: parser.error("Either --baseline or --save-baseline must be specified") + bench_extra = None + if args.benchmark_filter: + bench_extra = [f"--benchmark_filter={args.benchmark_filter}"] + # Run benchmarks try: current_results = run_benchmark( lib_path=args.lib, mode=args.mode, benchmark_dir=args.benchmark_dir, + extra_args=bench_extra, ) except Exception as e: print(f"::error::Failed to run benchmarks: {e}") From 248058f9357ddee6cd3ef43e51a2c003b0b33f6c Mon Sep 17 00:00:00 2001 From: cl507523 Date: Wed, 25 Feb 2026 07:41:21 +0000 Subject: [PATCH 6/9] fix(ci): add issues: write permission for PR comment step Co-authored-by: Cursor --- .github/workflows/dtvm_evm_test_x86.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml index fe48b533..2f859380 100644 --- a/.github/workflows/dtvm_evm_test_x86.yml +++ b/.github/workflows/dtvm_evm_test_x86.yml @@ -209,6 
+209,7 @@ jobs: permissions: contents: read pull-requests: write + issues: write strategy: fail-fast: false matrix: From b4f0bd2b167412e2a1a87bf33d1fb5dd69f56faa Mon Sep 17 00:00:00 2001 From: cl507523 Date: Thu, 26 Feb 2026 12:02:15 +0000 Subject: [PATCH 7/9] fix(ci): handle fork PR token permissions in performance check For fork PRs, the GITHUB_TOKEN in pull_request events is always read-only, causing the "Comment on PR" step to fail with 403. - Write results to $GITHUB_STEP_SUMMARY for always-visible output - Upload benchmark artifacts for cross-workflow access - Add continue-on-error and try/catch to the PR comment step - Add workflow_run-triggered workflow to post PR comments with write permissions (works for fork PRs after merge to main) Made-with: Cursor --- .github/workflows/dtvm_evm_test_x86.yml | 52 ++++++++++++-- .github/workflows/perf_pr_comment.yml | 96 +++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/perf_pr_comment.yml diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml index 2f859380..fbad94b7 100644 --- a/.github/workflows/dtvm_evm_test_x86.yml +++ b/.github/workflows/dtvm_evm_test_x86.yml @@ -289,9 +289,45 @@ jobs: bash .ci/run_test_suite.sh continue-on-error: true + - name: Write Performance Summary + if: always() + run: | + MODE="${{ matrix.mode }}" + OUTCOME="${{ steps.perf-check.outcome }}" + SUMMARY_FILE="/tmp/perf_summary_${MODE}.md" + if [ "$OUTCOME" = "success" ]; then + echo "✅ **Performance Check Passed (${MODE})**" >> $GITHUB_STEP_SUMMARY + else + echo "⚠️ **Performance Regression Detected (${MODE})**" >> $GITHUB_STEP_SUMMARY + fi + echo "" >> $GITHUB_STEP_SUMMARY + if [ -f "$SUMMARY_FILE" ]; then + cat "$SUMMARY_FILE" >> $GITHUB_STEP_SUMMARY + else + echo "_No benchmark summary available._" >> $GITHUB_STEP_SUMMARY + fi + + - name: Save performance artifacts + if: always() + run: | + mkdir -p /tmp/perf-artifacts + echo "${{ 
github.event.pull_request.number }}" > /tmp/perf-artifacts/pr_number + echo "${{ steps.perf-check.outcome }}" > /tmp/perf-artifacts/outcome + cp "/tmp/perf_summary_${{ matrix.mode }}.md" /tmp/perf-artifacts/summary.md 2>/dev/null || \ + echo "_No benchmark summary available._" > /tmp/perf-artifacts/summary.md + + - name: Upload performance results + if: always() + uses: actions/upload-artifact@v4 + with: + name: perf-results-${{ matrix.mode }} + path: /tmp/perf-artifacts/ + retention-days: 7 + - name: Comment on PR if: always() uses: actions/github-script@v6 + continue-on-error: true with: script: | const fs = require('fs'); @@ -308,12 +344,16 @@ jobs: ? `Performance Check Passed (${mode})` : `Performance Regression Detected (${mode})`; const body = `${icon} **${title}**\n\n${summary}`; - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: body - }); + try { + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }); + } catch (error) { + core.warning(`Could not comment on PR (expected for fork PRs): ${error.message}. 
Results are available in the job summary above.`); + } - name: Fail on regression if: steps.perf-check.outcome == 'failure' diff --git a/.github/workflows/perf_pr_comment.yml b/.github/workflows/perf_pr_comment.yml new file mode 100644 index 00000000..43a44eff --- /dev/null +++ b/.github/workflows/perf_pr_comment.yml @@ -0,0 +1,96 @@ +name: Post Performance Check Results + +on: + workflow_run: + workflows: ["DTVM-EVM test CI in x86-64"] + types: + - completed + +permissions: + pull-requests: write + actions: read + +jobs: + comment: + if: github.event.workflow_run.event == 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Download interpreter results + uses: actions/download-artifact@v4 + with: + name: perf-results-interpreter + path: /tmp/perf-interpreter + github-token: ${{ secrets.GITHUB_TOKEN }} + run-id: ${{ github.event.workflow_run.id }} + continue-on-error: true + + - name: Download multipass results + uses: actions/download-artifact@v4 + with: + name: perf-results-multipass + path: /tmp/perf-multipass + github-token: ${{ secrets.GITHUB_TOKEN }} + run-id: ${{ github.event.workflow_run.id }} + continue-on-error: true + + - name: Post PR comment + uses: actions/github-script@v6 + with: + script: | + const fs = require('fs'); + let prNumber = null; + let body = '\n## ⚡ Performance Regression Check Results\n\n'; + let hasResults = false; + + for (const mode of ['interpreter', 'multipass']) { + const dir = `/tmp/perf-${mode}`; + try { + if (!prNumber) { + prNumber = parseInt(fs.readFileSync(`${dir}/pr_number`, 'utf8').trim()); + } + const outcome = fs.readFileSync(`${dir}/outcome`, 'utf8').trim(); + const summary = fs.readFileSync(`${dir}/summary.md`, 'utf8'); + const passed = outcome === 'success'; + const icon = passed ? '✅' : '⚠️'; + const title = passed + ? 
`Performance Check Passed (${mode})`
+                  : `Performance Regression Detected (${mode})`;
+                body += `### ${icon} ${title}\n\n${summary}\n\n---\n\n`;
+                hasResults = true;
+              } catch (e) {
+                core.info(`No results for ${mode}: ${e.message}`);
+              }
+            }
+
+            if (!prNumber || !hasResults) {
+              core.info('No performance results to post');
+              return;
+            }
+
+            const { data: comments } = await github.rest.issues.listComments({
+              issue_number: prNumber,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+            });
+
+            const existing = comments.find(c =>
+              c.body && c.body.includes('## ⚡ Performance Regression Check Results')
+            );
+
+            if (existing) {
+              await github.rest.issues.updateComment({
+                comment_id: existing.id,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: body.trim()
+              });
+              core.info(`Updated existing comment ${existing.id}`);
+            } else {
+              await github.rest.issues.createComment({
+                issue_number: prNumber,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: body.trim()
+              });
+              core.info('Created new PR comment');
+            }

From bf6afd54551d44a48105a5a8f143e3bb8d4210f4 Mon Sep 17 00:00:00 2001
From: cl507523
Date: Thu, 26 Feb 2026 12:34:14 +0000
Subject: [PATCH 8/9] fix(ci): fix baseline generation and reduce benchmark noise in perf check

The baseline step checked out main and ran run_test_suite.sh, but main
has no benchmarksuite case, so no baseline file was produced. The PR
step then failed with "Baseline file not found".

Fix: build the DTVM library on the base branch and save only the
binary. Benchmarks for both baseline and PR libraries run from the PR
branch's infrastructure via a new BENCHMARK_BASELINE_LIB env var.
Also mitigate CI runner variance: - Raise default threshold from 10% to 15% - Require at least 3 benchmarks to exceed threshold before flagging regression (--min-regressions), filtering out single-outlier noise Made-with: Cursor --- .ci/run_test_suite.sh | 32 ++++++++++++----- .github/workflows/dtvm_evm_test_x86.yml | 46 ++++++++++++------------- tools/check_performance_regression.py | 43 ++++++++++++++++++----- 3 files changed, 80 insertions(+), 41 deletions(-) diff --git a/.ci/run_test_suite.sh b/.ci/run_test_suite.sh index 5eb986b1..ab066c2f 100644 --- a/.ci/run_test_suite.sh +++ b/.ci/run_test_suite.sh @@ -177,29 +177,43 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do git clone --depth 1 --recurse-submodules -b for_test https://github.com/DTVMStack/evmone.git $EVMONE_DIR fi - # Set default values for benchmark - BENCHMARK_THRESHOLD=${BENCHMARK_THRESHOLD:-0.10} + BENCHMARK_THRESHOLD=${BENCHMARK_THRESHOLD:-0.15} BENCHMARK_MODE=${BENCHMARK_MODE:-multipass} + BENCHMARK_SUMMARY_FILE=${BENCHMARK_SUMMARY_FILE:-/tmp/perf_summary.md} - # Copy DTVM library to evmone directory cp build/lib/* $EVMONE_DIR/ cd $EVMONE_DIR - # Copy check_performance_regression.py from DTVM repo cp ../tools/check_performance_regression.py ./ - # Build evmone if not already built if [ ! -f "build/bin/evmone-bench" ]; then cmake -S . -B build -DEVMONE_TESTING=ON -DCMAKE_BUILD_TYPE=Release cmake --build build --parallel -j 16 fi - # Default summary output path (can be overridden via env) - BENCHMARK_SUMMARY_FILE=${BENCHMARK_SUMMARY_FILE:-/tmp/perf_summary.md} + if [ -n "$BENCHMARK_BASELINE_LIB" ]; then + # Run baseline benchmarks with the pre-built baseline library, + # then run current benchmarks with the PR library and compare. + # This avoids depending on the base branch having benchmark scripts. + echo "Running baseline benchmarks with library from base branch..." 
+ cp "$BENCHMARK_BASELINE_LIB"/libdtvmapi.so ./libdtvmapi.so + python3 check_performance_regression.py \ + --save-baseline /tmp/perf_baseline.json \ + --lib ./libdtvmapi.so \ + --mode "$BENCHMARK_MODE" \ + --benchmark-dir test/evm-benchmarks/benchmarks - # Run performance check based on mode - if [ -n "$BENCHMARK_SAVE_BASELINE" ]; then + echo "Running current benchmarks with PR library..." + cp ../build/lib/libdtvmapi.so ./libdtvmapi.so + python3 check_performance_regression.py \ + --baseline /tmp/perf_baseline.json \ + --threshold "$BENCHMARK_THRESHOLD" \ + --output-summary "$BENCHMARK_SUMMARY_FILE" \ + --lib ./libdtvmapi.so \ + --mode "$BENCHMARK_MODE" \ + --benchmark-dir test/evm-benchmarks/benchmarks + elif [ -n "$BENCHMARK_SAVE_BASELINE" ]; then echo "Saving performance baseline..." python3 check_performance_regression.py \ --save-baseline "$BENCHMARK_SAVE_BASELINE" \ diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml index fbad94b7..4d08bb1b 100644 --- a/.github/workflows/dtvm_evm_test_x86.yml +++ b/.github/workflows/dtvm_evm_test_x86.yml @@ -203,7 +203,7 @@ jobs: bash .ci/run_test_suite.sh performance_regression_check: - name: Performance Regression Check (${{ matrix.mode }}, 10% threshold) + name: Performance Regression Check (${{ matrix.mode }}) if: github.event_name == 'pull_request' runs-on: ubuntu-latest permissions: @@ -232,30 +232,34 @@ jobs: run: | ./tools/format.sh check - - name: Build baseline (${{ github.base_ref }}) + - name: Build baseline library (${{ github.base_ref }}) run: | - echo "Building baseline on branch: ${{ github.base_ref }}" + echo "Building baseline library from branch: ${{ github.base_ref }}" export LLVM_SYS_150_PREFIX=/opt/llvm15 export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH - # Save current state (including untracked files) git stash push -u -m "perf-check-stash" git checkout ${{ github.base_ref }} - # Build baseline - export 
CMAKE_BUILD_TARGET=Release - export ENABLE_ASAN=false - export RUN_MODE=multipass - export ENABLE_LAZY=false - export ENABLE_MULTITHREAD=true - export TestSuite=benchmarksuite - export CPU_EXCEPTION_TYPE='cpu' - export BENCHMARK_MODE=${{ matrix.mode }} - export BENCHMARK_SAVE_BASELINE=/tmp/perf_baseline_${{ matrix.mode }}.json - - bash .ci/run_test_suite.sh + cmake -S . -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DZEN_ENABLE_SINGLEPASS_JIT=OFF \ + -DZEN_ENABLE_MULTIPASS_JIT=ON \ + -DZEN_ENABLE_EVM=ON \ + -DZEN_ENABLE_LIBEVM=ON \ + -DZEN_ENABLE_JIT_PRECOMPILE_FALLBACK=ON \ + -DZEN_ENABLE_CPU_EXCEPTION=ON \ + -DZEN_ENABLE_VIRTUAL_STACK=ON + cmake --build build -j 16 + + mkdir -p /tmp/baseline_lib + cp build/lib/* /tmp/baseline_lib/ + + rm -rf build + git checkout ${{ github.sha }} + git stash pop || true - name: Build current PR and check regression id: perf-check @@ -266,14 +270,8 @@ jobs: export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH - # Switch back to PR branch - git checkout ${{ github.sha }} - git stash pop || true - - # Clean and rebuild for current PR rm -rf build evmone - # Build and check export CMAKE_BUILD_TARGET=Release export ENABLE_ASAN=false export RUN_MODE=multipass @@ -282,8 +280,8 @@ jobs: export TestSuite=benchmarksuite export CPU_EXCEPTION_TYPE='cpu' export BENCHMARK_MODE=${{ matrix.mode }} - export BENCHMARK_THRESHOLD=0.10 - export BENCHMARK_BASELINE_FILE=/tmp/perf_baseline_${{ matrix.mode }}.json + export BENCHMARK_THRESHOLD=0.15 + export BENCHMARK_BASELINE_LIB=/tmp/baseline_lib export BENCHMARK_SUMMARY_FILE=/tmp/perf_summary_${{ matrix.mode }}.md bash .ci/run_test_suite.sh diff --git a/tools/check_performance_regression.py b/tools/check_performance_regression.py index b18ce692..4c83a249 100755 --- a/tools/check_performance_regression.py +++ b/tools/check_performance_regression.py @@ -173,17 +173,21 @@ def compare_benchmarks( current: List[BenchmarkResult], baseline: List[BenchmarkResult], 
threshold: float, + min_regressions: int = 3, ) -> Tuple[bool, List[dict]]: """ Compare current results against baseline. + A regression is only flagged if at least ``min_regressions`` individual + benchmarks exceed the threshold. This prevents CI noise on shared + runners from causing false positives when a single outlier spikes. + Returns: (has_regression, comparison_details) """ baseline_map = {b.name: b for b in baseline} current_map = {c.name: c for c in current} - # Find missing and new benchmarks baseline_names = set(baseline_map.keys()) current_names = set(current_map.keys()) @@ -195,24 +199,21 @@ def compare_benchmarks( if new: print(f"::notice::New benchmarks (in current but not in baseline): {new}") - # Compare common benchmarks comparisons = [] - has_regression = False + regression_count = 0 for name in sorted(baseline_names & current_names): b = baseline_map[name] c = current_map[name] - # Calculate percentage change (positive = slower/regression) time_change = (c.time_ns - b.time_ns) / b.time_ns cpu_change = (c.cpu_time_ns - b.cpu_time_ns) / b.cpu_time_ns - # Use the worse of real_time or cpu_time change max_change = max(time_change, cpu_change) is_regression = max_change > threshold if is_regression: - has_regression = True + regression_count += 1 comparisons.append({ "name": name, @@ -224,6 +225,14 @@ def compare_benchmarks( "is_regression": is_regression, }) + has_regression = regression_count >= min_regressions + + if regression_count > 0 and not has_regression: + print( + f"::notice::{regression_count} benchmark(s) exceeded threshold but " + f"below min_regressions={min_regressions}; treating as noise." + ) + return has_regression, comparisons @@ -401,6 +410,13 @@ def main(): default=None, help="Custom regex filter forwarded to evmone-bench --benchmark_filter (default: external/*)", ) + parser.add_argument( + "--min-regressions", + type=int, + default=3, + help="Minimum number of regressed benchmarks before flagging overall failure (default: 3). 
" + "Prevents CI noise from causing false positives.", + ) args = parser.parse_args() @@ -447,6 +463,7 @@ def main(): current_results, baseline_results, args.threshold, + min_regressions=args.min_regressions, ) print_comparison_table(comparisons, args.threshold) @@ -460,13 +477,23 @@ def main(): f.write(summary_md) print(f"Wrote comparison summary to {args.output_summary}") - # Summary for GitHub Actions + regression_count = sum(1 for c in comparisons if c["is_regression"]) + print("\n" + "=" * 100) if has_regression: - print(f"::error::Performance regression detected! Some benchmarks exceeded {args.threshold*100:.0f}% threshold.") + print( + f"::error::Performance regression detected! " + f"{regression_count} benchmarks exceeded {args.threshold*100:.0f}% threshold " + f"(min required: {args.min_regressions})." + ) print("RESULT: FAIL") return 1 else: + if regression_count > 0: + print( + f"::notice::{regression_count} benchmark(s) exceeded threshold " + f"but below minimum of {args.min_regressions}; treated as CI noise." + ) print("::notice::No significant performance regression detected.") print("RESULT: PASS") return 0 From d5438be8da8616386779fa53cced161745698de9 Mon Sep 17 00:00:00 2001 From: cl507523 Date: Fri, 27 Feb 2026 02:51:13 +0000 Subject: [PATCH 9/9] perf(ci): stabilize benchmarks with repetitions/median and cache baseline Reduce CI benchmark variance on shared GitHub Actions runners: - Run each benchmark 3x and use the median (--benchmark_repetitions) - Pin benchmarks to CPU 0 via taskset when available - Filter to external/total/* to avoid execute/total double-counting - Skip micro-benchmarks (<5us) that are dominated by noise - Raise threshold from 15% to 20% and require >=5 regressions Cache baseline results with actions/cache keyed by base branch SHA so the baseline build and benchmark run are skipped on subsequent pushes to the same PR. 
On cache miss the baseline library is built once and benchmarked; the resulting JSON is cached for future runs. Made-with: Cursor --- .ci/run_test_suite.sh | 24 ++++-- .github/workflows/dtvm_evm_test_x86.yml | 11 ++- tools/check_performance_regression.py | 97 ++++++++++++++++++++----- 3 files changed, 105 insertions(+), 27 deletions(-) diff --git a/.ci/run_test_suite.sh b/.ci/run_test_suite.sh index ab066c2f..a690deea 100644 --- a/.ci/run_test_suite.sh +++ b/.ci/run_test_suite.sh @@ -192,14 +192,26 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do cmake --build build --parallel -j 16 fi - if [ -n "$BENCHMARK_BASELINE_LIB" ]; then - # Run baseline benchmarks with the pre-built baseline library, - # then run current benchmarks with the PR library and compare. - # This avoids depending on the base branch having benchmark scripts. + BASELINE_CACHE=${BENCHMARK_BASELINE_CACHE:-} + + if [ -n "$BASELINE_CACHE" ] && [ -f "$BASELINE_CACHE" ]; then + # Cached baseline available -- only run current benchmarks. + echo "Using cached baseline: $BASELINE_CACHE" + python3 check_performance_regression.py \ + --baseline "$BASELINE_CACHE" \ + --threshold "$BENCHMARK_THRESHOLD" \ + --output-summary "$BENCHMARK_SUMMARY_FILE" \ + --lib ./libdtvmapi.so \ + --mode "$BENCHMARK_MODE" \ + --benchmark-dir test/evm-benchmarks/benchmarks + elif [ -n "$BENCHMARK_BASELINE_LIB" ]; then + # No cache -- run baseline benchmarks with the pre-built + # baseline library, then run current benchmarks and compare. echo "Running baseline benchmarks with library from base branch..." 
cp "$BENCHMARK_BASELINE_LIB"/libdtvmapi.so ./libdtvmapi.so + SAVE_PATH=${BASELINE_CACHE:-/tmp/perf_baseline.json} python3 check_performance_regression.py \ - --save-baseline /tmp/perf_baseline.json \ + --save-baseline "$SAVE_PATH" \ --lib ./libdtvmapi.so \ --mode "$BENCHMARK_MODE" \ --benchmark-dir test/evm-benchmarks/benchmarks @@ -207,7 +219,7 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do echo "Running current benchmarks with PR library..." cp ../build/lib/libdtvmapi.so ./libdtvmapi.so python3 check_performance_regression.py \ - --baseline /tmp/perf_baseline.json \ + --baseline "$SAVE_PATH" \ --threshold "$BENCHMARK_THRESHOLD" \ --output-summary "$BENCHMARK_SUMMARY_FILE" \ --lib ./libdtvmapi.so \ diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml index 4d08bb1b..d4c32f76 100644 --- a/.github/workflows/dtvm_evm_test_x86.yml +++ b/.github/workflows/dtvm_evm_test_x86.yml @@ -232,7 +232,15 @@ jobs: run: | ./tools/format.sh check + - name: Restore baseline cache + id: baseline-cache + uses: actions/cache@v4 + with: + path: /tmp/perf_baseline_${{ matrix.mode }}.json + key: perf-baseline-${{ matrix.mode }}-${{ github.event.pull_request.base.sha }} + - name: Build baseline library (${{ github.base_ref }}) + if: steps.baseline-cache.outputs.cache-hit != 'true' run: | echo "Building baseline library from branch: ${{ github.base_ref }}" @@ -280,7 +288,8 @@ jobs: export TestSuite=benchmarksuite export CPU_EXCEPTION_TYPE='cpu' export BENCHMARK_MODE=${{ matrix.mode }} - export BENCHMARK_THRESHOLD=0.15 + export BENCHMARK_THRESHOLD=0.20 + export BENCHMARK_BASELINE_CACHE=/tmp/perf_baseline_${{ matrix.mode }}.json export BENCHMARK_BASELINE_LIB=/tmp/baseline_lib export BENCHMARK_SUMMARY_FILE=/tmp/perf_summary_${{ matrix.mode }}.md diff --git a/tools/check_performance_regression.py b/tools/check_performance_regression.py index 4c83a249..d20525ab 100755 --- a/tools/check_performance_regression.py +++ b/tools/check_performance_regression.py @@ 
-21,6 +21,7 @@ import argparse import json import os +import shutil import subprocess import sys import tempfile @@ -41,33 +42,44 @@ def run_benchmark( mode: str, benchmark_dir: str, extra_args: Optional[List[str]] = None, + repetitions: int = 3, ) -> List[BenchmarkResult]: """Run benchmark and parse JSON output. Uses --benchmark_out to write JSON results to a temporary file so that the human-readable benchmark progress streams to stdout/stderr in real time (important for CI visibility). + + When *repetitions* > 1, each benchmark runs N times and only the + median aggregate is kept, significantly reducing noise from ASLR and + shared-runner contention. """ env = {"EVMONE_EXTERNAL_OPTIONS": f"{lib_path},mode={mode}"} - # Write JSON results to a temp file instead of capturing stdout. - # This lets Google Benchmark's normal console output (one line per - # completed case) stream directly to the CI log in real time. fd, json_out_path = tempfile.mkstemp(suffix=".json") os.close(fd) - cmd = [ + cmd: List[str] = [] + + if shutil.which("taskset"): + cmd.extend(["taskset", "-c", "0"]) + + cmd.extend([ "./build/bin/evmone-bench", benchmark_dir, f"--benchmark_out={json_out_path}", "--benchmark_out_format=json", - ] + ]) + + if repetitions > 1: + cmd.append(f"--benchmark_repetitions={repetitions}") + cmd.append("--benchmark_report_aggregates_only=true") if extra_args: cmd.extend(extra_args) if not any(arg.startswith("--benchmark_filter") for arg in cmd): - cmd.append("--benchmark_filter=external/*") + cmd.append("--benchmark_filter=external/total/*") print(f"Running: {' '.join(cmd)}") print(f"Environment: EVMONE_EXTERNAL_OPTIONS={env['EVMONE_EXTERNAL_OPTIONS']}") @@ -80,14 +92,12 @@ def run_benchmark( if result.returncode != 0: print(f"Benchmark execution failed with code {result.returncode}") - # Clean up temp file on failure try: os.unlink(json_out_path) except OSError: pass sys.exit(2) - # Read JSON results from the temp file try: with open(json_out_path, "r") as f: 
json_data = f.read() @@ -101,22 +111,38 @@ def run_benchmark( def parse_benchmark_json(json_output: str) -> List[BenchmarkResult]: - """Parse Google Benchmark JSON output.""" + """Parse Google Benchmark JSON output. + + When the output contains aggregate entries (from ``--benchmark_repetitions``), + only the **median** aggregate is kept and the ``_median`` suffix is stripped + from names so they match the non-repetition format. Otherwise individual + iteration results are returned. + """ try: data = json.loads(json_output) except json.JSONDecodeError as e: print(f"Failed to parse JSON: {e}") sys.exit(2) + benchmarks = data.get("benchmarks", []) + has_aggregates = any(b.get("run_type") == "aggregate" for b in benchmarks) + results = [] - for benchmark in data.get("benchmarks", []): - # Skip aggregates like mean, median, stddev - if benchmark.get("run_type") != "iteration": - continue + for benchmark in benchmarks: + if has_aggregates: + if benchmark.get("aggregate_name") != "median": + continue + name = benchmark["name"] + if name.endswith("_median"): + name = name[:-len("_median")] + else: + if benchmark.get("run_type") != "iteration": + continue + name = benchmark["name"] results.append( BenchmarkResult( - name=benchmark["name"], + name=name, time_ns=benchmark.get("real_time", 0), cpu_time_ns=benchmark.get("cpu_time", 0), iterations=benchmark.get("iterations", 1), @@ -173,7 +199,8 @@ def compare_benchmarks( current: List[BenchmarkResult], baseline: List[BenchmarkResult], threshold: float, - min_regressions: int = 3, + min_regressions: int = 5, + min_time_ns: float = 5000.0, ) -> Tuple[bool, List[dict]]: """ Compare current results against baseline. @@ -182,6 +209,10 @@ def compare_benchmarks( benchmarks exceed the threshold. This prevents CI noise on shared runners from causing false positives when a single outlier spikes. 
+ Benchmarks whose baseline time is below ``min_time_ns`` are excluded + from regression counting because percentage changes on sub-microsecond + timings are dominated by measurement overhead and ASLR noise. + Returns: (has_regression, comparison_details) """ @@ -201,6 +232,7 @@ def compare_benchmarks( comparisons = [] regression_count = 0 + skipped_small = 0 for name in sorted(baseline_names & current_names): b = baseline_map[name] @@ -211,9 +243,12 @@ def compare_benchmarks( max_change = max(time_change, cpu_change) - is_regression = max_change > threshold + too_small = b.time_ns < min_time_ns + is_regression = max_change > threshold and not too_small if is_regression: regression_count += 1 + if too_small and max_change > threshold: + skipped_small += 1 comparisons.append({ "name": name, @@ -227,6 +262,11 @@ def compare_benchmarks( has_regression = regression_count >= min_regressions + if skipped_small > 0: + print( + f"::notice::{skipped_small} micro-benchmark(s) exceeded threshold but " + f"excluded (baseline < {min_time_ns/1000:.1f}us)." + ) if regression_count > 0 and not has_regression: print( f"::notice::{regression_count} benchmark(s) exceeded threshold but " @@ -376,8 +416,8 @@ def main(): parser.add_argument( "--threshold", type=float, - default=0.10, - help="Regression threshold as ratio (default: 0.10 = 10%%)", + default=0.15, + help="Regression threshold as ratio (default: 0.15 = 15%%)", ) parser.add_argument( "--lib", @@ -413,10 +453,25 @@ def main(): parser.add_argument( "--min-regressions", type=int, - default=3, - help="Minimum number of regressed benchmarks before flagging overall failure (default: 3). " + default=5, + help="Minimum number of regressed benchmarks before flagging overall failure (default: 5). 
" "Prevents CI noise from causing false positives.", ) + parser.add_argument( + "--min-time-ns", + type=float, + default=5000.0, + help="Exclude benchmarks whose baseline time is below this value (in nanoseconds) " + "from regression counting. Sub-microsecond timings are dominated by " + "measurement noise. (default: 5000 = 5us)", + ) + parser.add_argument( + "--benchmark-repetitions", + type=int, + default=3, + help="Run each benchmark N times and use the median. " + "Reduces ASLR and shared-runner noise. (default: 3)", + ) args = parser.parse_args() @@ -434,6 +489,7 @@ def main(): mode=args.mode, benchmark_dir=args.benchmark_dir, extra_args=bench_extra, + repetitions=args.benchmark_repetitions, ) except Exception as e: print(f"::error::Failed to run benchmarks: {e}") @@ -464,6 +520,7 @@ def main(): baseline_results, args.threshold, min_regressions=args.min_regressions, + min_time_ns=args.min_time_ns, ) print_comparison_table(comparisons, args.threshold)