diff --git a/.ci/run_test_suite.sh b/.ci/run_test_suite.sh
index fdce39cf..a690deea 100644
--- a/.ci/run_test_suite.sh
+++ b/.ci/run_test_suite.sh
@@ -77,6 +77,9 @@ case $TestSuite in
     "evmfallbacksuite")
         CMAKE_OPTIONS="$CMAKE_OPTIONS -DZEN_ENABLE_SPEC_TEST=ON -DZEN_ENABLE_ASSEMBLYSCRIPT_TEST=ON -DZEN_ENABLE_EVM=ON -DZEN_ENABLE_LIBEVM=ON -DZEN_ENABLE_JIT_FALLBACK_TEST=ON"
         ;;
+    "benchmarksuite")  # EVM + libevm build with the multipass JIT only (singlepass off), used by the performance regression run
+        CMAKE_OPTIONS="$CMAKE_OPTIONS -DZEN_ENABLE_EVM=ON -DZEN_ENABLE_LIBEVM=ON -DZEN_ENABLE_SINGLEPASS_JIT=OFF -DZEN_ENABLE_MULTIPASS_JIT=ON -DZEN_ENABLE_JIT_PRECOMPILE_FALLBACK=ON"
+        ;;
 esac
 
 case $CPU_EXCEPTION_TYPE in
@@ -97,6 +100,10 @@ if [[ $TestSuite == "evmonetestsuite" ]]; then
     STACK_TYPES=("-DZEN_ENABLE_VIRTUAL_STACK=ON")
 fi
 
+if [[ $TestSuite == "benchmarksuite" ]]; then  # benchmarks are built and run for one stack configuration only: virtual stack on
+    STACK_TYPES=("-DZEN_ENABLE_VIRTUAL_STACK=ON")
+fi
+
 export PATH=$PATH:$PWD/build
 CMAKE_OPTIONS_ORIGIN="$CMAKE_OPTIONS"
 
@@ -163,5 +170,90 @@ for STACK_TYPE in ${STACK_TYPES[@]}; do
             python3 tools/run_evm_tests.py -r build/dtvm $EXTRA_EXE_OPTIONS
             ./build/evmFallbackExecutionTests
             ;;
+        "benchmarksuite")
+            # Performance regression check: build evmone's benchmark driver,
+            # then run tools/check_performance_regression.py against the
+            # libdtvmapi.so copied from build/lib.  Optional environment inputs:
+            #   BENCHMARK_THRESHOLD       regression ratio        (default 0.15)
+            #   BENCHMARK_MODE            library mode            (default multipass)
+            #   BENCHMARK_SUMMARY_FILE    Markdown summary output (default /tmp/perf_summary.md)
+            #   BENCHMARK_BASELINE_CACHE  baseline JSON, reused when the file exists
+            #   BENCHMARK_BASELINE_LIB    directory holding the baseline libdtvmapi.so
+            #   BENCHMARK_SAVE_BASELINE   only record a baseline at this path
+            #   BENCHMARK_BASELINE_FILE   compare against this existing baseline
+            EVMONE_DIR="evmone"
+            if [[ ! -d "$EVMONE_DIR" ]]; then
+                git clone --depth 1 --recurse-submodules -b for_test \
+                    https://github.com/DTVMStack/evmone.git "$EVMONE_DIR"
+            fi
+
+            BENCHMARK_THRESHOLD=${BENCHMARK_THRESHOLD:-0.15}
+            BENCHMARK_MODE=${BENCHMARK_MODE:-multipass}
+            BENCHMARK_SUMMARY_FILE=${BENCHMARK_SUMMARY_FILE:-/tmp/perf_summary.md}
+            BASELINE_CACHE=${BENCHMARK_BASELINE_CACHE:-}
+
+            cp build/lib/* "$EVMONE_DIR"/
+            cp tools/check_performance_regression.py "$EVMONE_DIR"/
+
+            # Without this guard a failed cd would run everything below (and
+            # the final "cd ..") relative to the wrong directory.
+            cd "$EVMONE_DIR" || exit 1
+
+            if [[ ! -f "build/bin/evmone-bench" ]]; then
+                cmake -S . -B build -DEVMONE_TESTING=ON -DCMAKE_BUILD_TYPE=Release
+                cmake --build build --parallel 16
+            fi
+
+            # Command and arguments shared by every invocation of the checker.
+            PERF_CHECK=(python3 check_performance_regression.py
+                --lib ./libdtvmapi.so
+                --mode "$BENCHMARK_MODE"
+                --benchmark-dir test/evm-benchmarks/benchmarks)
+
+            # NOTE(review): a detected regression is signalled only through the
+            # checker's non-zero exit status; this arm relies on the script
+            # running under `set -e` for that to fail the job -- confirm.
+            if [[ -n "$BASELINE_CACHE" && -f "$BASELINE_CACHE" ]]; then
+                # Cached baseline available -- only run current benchmarks.
+                echo "Using cached baseline: $BASELINE_CACHE"
+                "${PERF_CHECK[@]}" \
+                    --baseline "$BASELINE_CACHE" \
+                    --threshold "$BENCHMARK_THRESHOLD" \
+                    --output-summary "$BENCHMARK_SUMMARY_FILE"
+            elif [[ -n "${BENCHMARK_BASELINE_LIB:-}" ]]; then
+                # No cache -- run baseline benchmarks with the pre-built
+                # baseline library, then run current benchmarks and compare.
+                echo "Running baseline benchmarks with library from base branch..."
+                cp "$BENCHMARK_BASELINE_LIB"/libdtvmapi.so ./libdtvmapi.so
+                SAVE_PATH=${BASELINE_CACHE:-/tmp/perf_baseline.json}
+                "${PERF_CHECK[@]}" --save-baseline "$SAVE_PATH"
+
+                echo "Running current benchmarks with PR library..."
+                cp ../build/lib/libdtvmapi.so ./libdtvmapi.so
+                "${PERF_CHECK[@]}" \
+                    --baseline "$SAVE_PATH" \
+                    --threshold "$BENCHMARK_THRESHOLD" \
+                    --output-summary "$BENCHMARK_SUMMARY_FILE"
+            elif [[ -n "${BENCHMARK_SAVE_BASELINE:-}" ]]; then
+                echo "Saving performance baseline..."
+                "${PERF_CHECK[@]}" \
+                    --save-baseline "$BENCHMARK_SAVE_BASELINE" \
+                    --output-summary "$BENCHMARK_SUMMARY_FILE"
+            elif [[ -n "${BENCHMARK_BASELINE_FILE:-}" ]]; then
+                echo "Checking performance regression against baseline..."
+                "${PERF_CHECK[@]}" \
+                    --baseline "$BENCHMARK_BASELINE_FILE" \
+                    --threshold "$BENCHMARK_THRESHOLD" \
+                    --output-summary "$BENCHMARK_SUMMARY_FILE"
+            else
+                echo "Running benchmark suite without comparison..."
+                "${PERF_CHECK[@]}" \
+                    --save-baseline benchmark_results.json \
+                    --output-summary "$BENCHMARK_SUMMARY_FILE"
+                cat benchmark_results.json
+            fi
+
+            cd ..
+            ;;
     esac
 done
diff --git a/.github/workflows/dtvm_evm_test_x86.yml b/.github/workflows/dtvm_evm_test_x86.yml
index 2070e2fa..d4c32f76 100644
--- a/.github/workflows/dtvm_evm_test_x86.yml
+++ b/.github/workflows/dtvm_evm_test_x86.yml
@@ -201,3 +201,169 @@ jobs:
           export ENABLE_GAS_METER=true
 
           bash .ci/run_test_suite.sh
+
+  performance_regression_check:
+    name: Performance Regression Check (${{ matrix.mode }})
+    if: github.event_name == 'pull_request'  # the baseline comes from the PR's base, so only PR events apply
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write
+      issues: write
+    strategy:
+      fail-fast: false  # let each mode finish even if the other one fails
+      matrix:
+        mode: [interpreter, multipass]  # exported to the benchmark script as BENCHMARK_MODE
+    container:
+      image: dtvmdev1/dtvm-dev-x64:main
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3
+        with:
+          submodules: "true"
+          fetch-depth: 0  # full history, so the base revision can be checked out for the baseline build
+
+      - name: Setup git safe directory
+        run: |
+          echo "Configuring git safe directory: ${{ github.workspace }}"
+          git config --global --add safe.directory /__w/DTVM/DTVM  # NOTE(review): path is hard-coded rather than taken from github.workspace
+
+      - name: Code Format Check
+        run: |
+          ./tools/format.sh check
+
+      - name: Restore baseline cache
+        id: baseline-cache
+        uses: actions/cache@v4
+        with:
+          path: /tmp/perf_baseline_${{ matrix.mode }}.json
+          key: perf-baseline-${{ matrix.mode }}-${{ github.event.pull_request.base.sha }}  # one cached baseline per mode and base commit
+      - name: Build baseline library (${{ github.base_ref }})
+        if: steps.baseline-cache.outputs.cache-hit != 'true'
+        env:
+          BASE_SHA: ${{ github.event.pull_request.base.sha }}
+        run: |
+          # Build exactly the commit named in the cache key, so a cached
+          # baseline always describes the commit it is stored under.
+          echo "Building baseline library from ${{ github.base_ref }} @ ${BASE_SHA}"
+
+          export LLVM_SYS_150_PREFIX=/opt/llvm15
+          export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm
+          export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH
+
+          git stash push -u -m "perf-check-stash"
+          git checkout --detach "$BASE_SHA"
+
+          cmake -S . -B build -DCMAKE_BUILD_TYPE=Release \
+            -DZEN_ENABLE_SINGLEPASS_JIT=OFF -DZEN_ENABLE_MULTIPASS_JIT=ON \
+            -DZEN_ENABLE_EVM=ON -DZEN_ENABLE_LIBEVM=ON \
+            -DZEN_ENABLE_JIT_PRECOMPILE_FALLBACK=ON \
+            -DZEN_ENABLE_CPU_EXCEPTION=ON -DZEN_ENABLE_VIRTUAL_STACK=ON
+          cmake --build build -j 16
+
+          mkdir -p /tmp/baseline_lib
+          cp build/lib/* /tmp/baseline_lib/
+
+          rm -rf build
+          git checkout ${{ github.sha }}
+          git stash pop || true
+
+      - name: Build current PR and check regression
+        id: perf-check
+        run: |
+          echo "Building PR branch: ${{ github.sha }}"
+
+          export LLVM_SYS_150_PREFIX=/opt/llvm15
+          export LLVM_DIR=$LLVM_SYS_150_PREFIX/lib/cmake/llvm
+          export PATH=$LLVM_SYS_150_PREFIX/bin:$PATH
+
+          rm -rf build evmone
+
+          export CMAKE_BUILD_TARGET=Release
+          export ENABLE_ASAN=false
+          export RUN_MODE=multipass
+          export ENABLE_LAZY=false
+          export ENABLE_MULTITHREAD=true
+          export TestSuite=benchmarksuite
+          export CPU_EXCEPTION_TYPE='cpu'
+          export BENCHMARK_MODE=${{ matrix.mode }}
+          export BENCHMARK_THRESHOLD=0.20
+          export BENCHMARK_BASELINE_CACHE=/tmp/perf_baseline_${{ matrix.mode }}.json
+          export BENCHMARK_BASELINE_LIB=/tmp/baseline_lib
+          export BENCHMARK_SUMMARY_FILE=/tmp/perf_summary_${{ matrix.mode }}.md
+
+          bash .ci/run_test_suite.sh
+        continue-on-error: true  # later steps still run; the "Fail on regression" step turns a failure back into a job failure
+
+      - name: Write Performance Summary
+        if: always()
+        run: |
+          MODE="${{ matrix.mode }}"
+          OUTCOME="${{ steps.perf-check.outcome }}"
+          SUMMARY_FILE="/tmp/perf_summary_${MODE}.md"
+          if [ "$OUTCOME" = "success" ]; then
+            echo "✅ **Performance Check Passed (${MODE})**" >> $GITHUB_STEP_SUMMARY
+          else  # NOTE(review): reached for any failed run (build error, checker exit code 2), not only a detected regression
+            echo "⚠️ **Performance Regression Detected (${MODE})**" >> $GITHUB_STEP_SUMMARY
+          fi
+          echo "" >> $GITHUB_STEP_SUMMARY
+          if [ -f "$SUMMARY_FILE" ]; then
+            cat "$SUMMARY_FILE" >> $GITHUB_STEP_SUMMARY
+          else
+            echo "_No benchmark summary available._" >> $GITHUB_STEP_SUMMARY
+          fi
+
+      - name: Save performance artifacts
+        if: always()
+        run: |
+          mkdir -p /tmp/perf-artifacts
+          echo "${{ github.event.pull_request.number }}" > /tmp/perf-artifacts/pr_number
+          echo "${{ steps.perf-check.outcome }}" > /tmp/perf-artifacts/outcome
+          cp "/tmp/perf_summary_${{ matrix.mode }}.md" /tmp/perf-artifacts/summary.md 2>/dev/null || \
+            echo "_No benchmark summary available._" > /tmp/perf-artifacts/summary.md
+
+      - name: Upload performance results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: perf-results-${{ matrix.mode }}  # the artifact name perf_pr_comment.yml downloads
+          path: /tmp/perf-artifacts/
+          retention-days: 7
+
+      - name: Comment on PR  # NOTE(review): perf_pr_comment.yml posts the same results via workflow_run -- confirm both comments are wanted
+        if: always()
+        uses: actions/github-script@v6
+        continue-on-error: true
+        with:
+          script: |
+            const fs = require('fs');
+            const mode = '${{ matrix.mode }}';
+            const passed = '${{ steps.perf-check.outcome }}' === 'success';
+            let summary = '';
+            try {
+              summary = fs.readFileSync(`/tmp/perf_summary_${mode}.md`, 'utf8');
+            } catch (e) {
+              summary = '_No benchmark summary available._';
+            }
+            const icon = passed ? '✅' : '⚠️';
+            const title = passed
+              ? `Performance Check Passed (${mode})`
+              : `Performance Regression Detected (${mode})`;
+            const body = `${icon} **${title}**\n\n${summary}`;
+            try {
+              await github.rest.issues.createComment({
+                issue_number: context.issue.number,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: body
+              });
+            } catch (error) {
+              core.warning(`Could not comment on PR (expected for fork PRs): ${error.message}. Results are available in the job summary above.`);
+            }
+
+      - name: Fail on regression
+        if: steps.perf-check.outcome == 'failure'
+        run: |
+          echo "::error::Performance regression detected in ${{ matrix.mode }} mode. See logs for details."
+          exit 1
diff --git a/.github/workflows/perf_pr_comment.yml b/.github/workflows/perf_pr_comment.yml
new file mode 100644
index 00000000..43a44eff
--- /dev/null
+++ b/.github/workflows/perf_pr_comment.yml
@@ -0,0 +1,96 @@
+name: Post Performance Check Results
+
+on:
+  workflow_run:
+    workflows: ["DTVM-EVM test CI in x86-64"]  # NOTE(review): must equal the `name:` of dtvm_evm_test_x86.yml -- confirm
+    types:
+      - completed  # NOTE(review): presumably fires for failed runs as well as successful ones -- confirm
+
+permissions:
+  pull-requests: write  # the job below creates or updates a PR comment
+  actions: read  # presumably required to download artifacts of the triggering run -- confirm
+
+jobs:
+  comment:
+    if: github.event.workflow_run.event == 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download interpreter results
+        uses: actions/download-artifact@v4
+        with:
+          name: perf-results-interpreter
+          path: /tmp/perf-interpreter
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          run-id: ${{ github.event.workflow_run.id }}
+        continue-on-error: true  # a missing artifact is handled by the script below
+
+      - name: Download multipass results
+        uses: actions/download-artifact@v4
+        with:
+          name: perf-results-multipass
+          path: /tmp/perf-multipass
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          run-id: ${{ github.event.workflow_run.id }}
+        continue-on-error: true
+
+      - name: Post PR comment
+        uses: actions/github-script@v6
+        with:
+          script: |
+            // Merge the per-mode results into one comment; a hidden marker lets later runs update it.
+            const fs = require('fs');
+            const marker = '<!-- perf-check-results -->';
+            let prNumber = null;
+            let body = `${marker}\n## ⚡ Performance Regression Check Results\n\n`;
+            let hasResults = false;
+
+            for (const mode of ['interpreter', 'multipass']) {
+              const dir = `/tmp/perf-${mode}`;
+              try {
+                if (!prNumber) prNumber = parseInt(fs.readFileSync(`${dir}/pr_number`, 'utf8').trim(), 10);
+                const outcome = fs.readFileSync(`${dir}/outcome`, 'utf8').trim();
+                const summary = fs.readFileSync(`${dir}/summary.md`, 'utf8');
+                const passed = outcome === 'success';
+                const icon = passed ? '✅' : '⚠️';
+                const title = passed
+                  ? `Performance Check Passed (${mode})`
+                  : `Performance Regression Detected (${mode})`;
+                body += `### ${icon} ${title}\n\n${summary}\n\n---\n\n`;
+                hasResults = true;
+              } catch (e) {
+                core.info(`No results for ${mode}: ${e.message}`);
+              }
+            }
+
+            if (!prNumber || !hasResults) {
+              core.info('No performance results to post');
+              return;
+            }
+
+            // listComments returns one page (30 items by default); paginate so the
+            // marker is still found on busy PRs instead of posting a duplicate.
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              issue_number: prNumber,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              per_page: 100,
+            });
+            const existing = comments.find(c => c.body && c.body.includes(marker));
+
+            if (existing) {
+              await github.rest.issues.updateComment({
+                comment_id: existing.id,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: body.trim()
+              });
+              core.info(`Updated existing comment ${existing.id}`);
+            } else {
+              await github.rest.issues.createComment({
+                issue_number: prNumber,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: body.trim()
+              });
+              core.info('Created new PR comment');
+            }
diff --git a/tools/check_performance_regression.py b/tools/check_performance_regression.py
new file mode 100755
index 00000000..d20525ab
--- /dev/null
+++ b/tools/check_performance_regression.py
@@ -0,0 +1,560 @@
+#!/usr/bin/env python3
+"""
+Performance regression checker for evmone benchmarks.
+
+Usage:
+  # Save baseline results
+  python check_performance_regression.py --save-baseline baseline.json
+
+  # Check for regressions against baseline
+  python check_performance_regression.py --baseline baseline.json
+
+  # Check with custom threshold (default 15%)
+  python check_performance_regression.py --baseline baseline.json --threshold 0.20
+
+Exit codes:
+  0 - No significant regression detected
+  1 - Performance regression detected (> threshold)
+  2 - Script error (execution failed, file not found, etc.)
+"""
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+
+@dataclass
+class BenchmarkResult:  # one benchmark's timing, built from evmone-bench JSON output or from a saved baseline
+    name: str  # benchmark name; the key used to match current results against the baseline
+    time_ns: float  # wall-clock time (JSON "real_time"), intended to be nanoseconds
+    cpu_time_ns: float  # CPU time (JSON "cpu_time"), intended to be nanoseconds
+    iterations: int  # JSON "iterations" value
+
+
+def run_benchmark(
+    lib_path: str,
+    mode: str,
+    benchmark_dir: str,
+    extra_args: Optional[List[str]] = None,
+    repetitions: int = 3,
+) -> List[BenchmarkResult]:
+    """Run evmone-bench against *lib_path* and return the parsed results.
+
+    The JSON report is written to a temporary file via ``--benchmark_out``
+    so the human-readable progress still streams to stdout/stderr in real
+    time (important for CI visibility).  The temporary file is removed on
+    every exit path, including a failure to launch the benchmark binary.
+
+    Args:
+        lib_path: Library to benchmark; passed to evmone-bench through the
+            ``EVMONE_EXTERNAL_OPTIONS`` environment variable.
+        mode: Execution mode appended to that variable as ``mode=<mode>``.
+        benchmark_dir: Benchmark directory handed to evmone-bench.
+        extra_args: Extra command-line arguments appended verbatim.  If none
+            of them is a ``--benchmark_filter``, ``external/total/*`` is used.
+        repetitions: When > 1, each benchmark is repeated that many times and
+            only aggregates are reported (the parser keeps the median).
+
+    Raises:
+        SystemExit: With code 2 when evmone-bench exits with a non-zero status.
+    """
+    env = {"EVMONE_EXTERNAL_OPTIONS": f"{lib_path},mode={mode}"}
+
+    cmd: List[str] = []
+    # Pin the run to CPU 0 when taskset is available.
+    if shutil.which("taskset"):
+        cmd.extend(["taskset", "-c", "0"])
+
+    fd, json_out_path = tempfile.mkstemp(suffix=".json")
+    os.close(fd)
+    try:
+        cmd.extend([
+            "./build/bin/evmone-bench",
+            benchmark_dir,
+            f"--benchmark_out={json_out_path}",
+            "--benchmark_out_format=json",
+        ])
+
+        if repetitions > 1:
+            cmd.append(f"--benchmark_repetitions={repetitions}")
+            cmd.append("--benchmark_report_aggregates_only=true")
+        if extra_args:
+            cmd.extend(extra_args)
+        if not any(arg.startswith("--benchmark_filter") for arg in cmd):
+            cmd.append("--benchmark_filter=external/total/*")
+
+        print(f"Running: {' '.join(cmd)}")
+        print(f"Environment: EVMONE_EXTERNAL_OPTIONS={env['EVMONE_EXTERNAL_OPTIONS']}")
+        sys.stdout.flush()
+
+        result = subprocess.run(cmd, env={**os.environ, **env})
+        if result.returncode != 0:
+            print(f"Benchmark execution failed with code {result.returncode}")
+            sys.exit(2)
+
+        with open(json_out_path, "r") as f:
+            json_data = f.read()
+    finally:
+        # Best-effort cleanup; the file may already be gone.
+        try:
+            os.unlink(json_out_path)
+        except OSError:
+            pass
+
+    return parse_benchmark_json(json_data)
+
+
+def parse_benchmark_json(json_output: str) -> List[BenchmarkResult]:
+    """Parse Google Benchmark JSON output into results expressed in nanoseconds.
+
+    When the output contains aggregate entries (from ``--benchmark_repetitions``),
+    only the **median** aggregate is kept and the ``_median`` suffix is stripped
+    from names so they match the non-repetition format.  Otherwise individual
+    iteration results are returned.
+
+    Each entry's ``time_unit`` (``ns`` when absent or unrecognised) is
+    honoured, so timings reported in us/ms/s are scaled to the nanoseconds
+    the ``*_ns`` fields promise.  Entries flagged ``error_occurred`` are
+    skipped.  Exits with code 2 if *json_output* is not valid JSON.
+    """
+    try:
+        data = json.loads(json_output)
+    except json.JSONDecodeError as e:
+        print(f"Failed to parse JSON: {e}")
+        sys.exit(2)
+
+    ns_per_unit = {"ns": 1.0, "us": 1e3, "ms": 1e6, "s": 1e9}
+    benchmarks = data.get("benchmarks", [])
+    has_aggregates = any(b.get("run_type") == "aggregate" for b in benchmarks)
+    wanted = ("aggregate_name", "median") if has_aggregates else ("run_type", "iteration")
+    results = []
+    for benchmark in benchmarks:
+        if benchmark.get(wanted[0]) != wanted[1] or benchmark.get("error_occurred"):
+            continue
+        name = benchmark["name"]
+        if has_aggregates and name.endswith("_median"):
+            name = name[: -len("_median")]
+        scale = ns_per_unit.get(benchmark.get("time_unit", "ns"), 1.0)
+        results.append(
+            BenchmarkResult(
+                name=name,
+                time_ns=benchmark.get("real_time", 0) * scale,
+                cpu_time_ns=benchmark.get("cpu_time", 0) * scale,
+                iterations=benchmark.get("iterations", 1),
+            )
+        )
+    return results
+
+
+def load_baseline(path: str) -> List[BenchmarkResult]:
+    """Read a baseline file in the format written by ``save_baseline``.
+
+    Args:
+        path: Location of the baseline JSON file.
+
+    Returns:
+        One ``BenchmarkResult`` per entry, in file order.
+
+    A missing file or malformed JSON is reported as a GitHub Actions
+    ``::error::`` annotation and terminates the process with exit code 2.
+    """
+    try:
+        with open(path, "r") as handle:
+            entries = json.load(handle)
+    except FileNotFoundError:
+        print(f"::error::Baseline file not found: {path}")
+        sys.exit(2)
+    except json.JSONDecodeError as err:
+        print(f"::error::Failed to parse baseline JSON: {err}")
+        sys.exit(2)
+
+    fields = ("name", "time_ns", "cpu_time_ns", "iterations")
+    return [BenchmarkResult(**{key: entry[key] for key in fields}) for entry in entries]
+
+
+def save_baseline(results: List[BenchmarkResult], path: str) -> None:
+    """Write *results* to *path* as a JSON array (indent 2) and report the count.
+
+    Each element holds the ``name``, ``time_ns``, ``cpu_time_ns`` and
+    ``iterations`` of one result, in that key order.
+    """
+    fields = ("name", "time_ns", "cpu_time_ns", "iterations")
+    payload = [
+        {field: getattr(result, field) for field in fields}
+        for result in results
+    ]
+    with open(path, "w") as out:
+        json.dump(payload, out, indent=2)
+
+    print(f"Saved {len(results)} benchmark results to {path}")
+
+
+def compare_benchmarks(
+    current: List[BenchmarkResult],
+    baseline: List[BenchmarkResult],
+    threshold: float,
+    min_regressions: int = 5,
+    min_time_ns: float = 5000.0,
+) -> Tuple[bool, List[dict]]:
+    """
+    Compare current results against baseline, matching benchmarks by name.
+
+    A regression is only flagged if at least ``min_regressions`` individual
+    benchmarks exceed ``threshold`` (a ratio: 0.15 means 15% slower).  This
+    keeps a single noisy outlier on a shared runner from failing the check.
+
+    Benchmarks whose baseline time is below ``min_time_ns`` are excluded
+    from regression counting because percentage changes on such short
+    timings are dominated by measurement overhead and ASLR noise.
+
+    A baseline timing that is zero or negative (for example a field that was
+    missing from the benchmark report) has no meaningful relative change;
+    it is treated as a change of 0.0 instead of raising ZeroDivisionError.
+
+    Returns:
+        (has_regression, comparison_details): one detail dict per benchmark
+        present in both inputs, sorted by name.
+    """
+    def relative_change(now: float, before: float) -> float:
+        return (now - before) / before if before > 0 else 0.0
+
+    baseline_map = {b.name: b for b in baseline}
+    current_map = {c.name: c for c in current}
+
+    missing = baseline_map.keys() - current_map.keys()
+    new = current_map.keys() - baseline_map.keys()
+    if missing:
+        print(f"::warning::Missing benchmarks (in baseline but not in current): {missing}")
+    if new:
+        print(f"::notice::New benchmarks (in current but not in baseline): {new}")
+
+    comparisons = []
+    regression_count = 0
+    skipped_small = 0
+
+    for name in sorted(baseline_map.keys() & current_map.keys()):
+        b = baseline_map[name]
+        c = current_map[name]
+
+        time_change = relative_change(c.time_ns, b.time_ns)
+        cpu_change = relative_change(c.cpu_time_ns, b.cpu_time_ns)
+        max_change = max(time_change, cpu_change)
+
+        too_small = b.time_ns < min_time_ns
+        is_regression = max_change > threshold and not too_small
+        if is_regression:
+            regression_count += 1
+        if too_small and max_change > threshold:
+            skipped_small += 1
+
+        comparisons.append({
+            "name": name,
+            "baseline_time_ns": b.time_ns, "current_time_ns": c.time_ns,
+            "time_change": time_change, "cpu_change": cpu_change,
+            "max_change": max_change, "is_regression": is_regression,
+        })
+
+    has_regression = regression_count >= min_regressions
+
+    if skipped_small > 0:
+        print(
+            f"::notice::{skipped_small} micro-benchmark(s) exceeded threshold but "
+            f"excluded (baseline < {min_time_ns/1000:.1f}us)."
+        )
+    if regression_count > 0 and not has_regression:
+        print(
+            f"::notice::{regression_count} benchmark(s) exceeded threshold but "
+            f"below min_regressions={min_regressions}; treating as noise."
+        )
+
+    return has_regression, comparisons
+
+
+def print_comparison_table(comparisons: List[dict], threshold: float) -> None:
+    """Print the comparison as a fixed-width console table, one row per benchmark.
+
+    Regressed rows are followed by a GitHub Actions ``::warning`` annotation; totals come last.
+    """
+    if not comparisons:
+        print("No benchmarks to compare.")
+        return
+
+    rule = "=" * 100
+    threshold_pct = threshold * 100
+    print("\n" + rule)
+    print(f"{'Benchmark':<60} {'Baseline(μs)':<15} {'Current(μs)':<15} {'Change':<12} {'Status'}")
+    print(rule)
+    flagged = 0
+    for row in comparisons:
+        full_name = row["name"]
+        regressed = row["is_regression"]
+        change_pct = row["max_change"] * 100
+        shown = full_name if len(full_name) < 60 else full_name[:57] + "..."
+        status = "✗ FAIL" if regressed else "✓ PASS"
+        print(
+            f"{shown:<60} {row['baseline_time_ns'] / 1000:<15.2f} "
+            f"{row['current_time_ns'] / 1000:<15.2f} {change_pct:>+10.1f}%  {status}"
+        )
+        if regressed:
+            flagged += 1
+            print(f"::warning title=Performance Regression::{full_name} regressed by {change_pct:.1f}% (threshold: {threshold_pct:.0f}%)")
+
+    print(rule)
+    print(f"\nTotal benchmarks: {len(comparisons)}")
+    print(f"Regressions (> {threshold_pct:.0f}%): {flagged}")
+
+
+def _short_name(name: str) -> str:
+    """Return *name* without a leading ``external/`` prefix, if it has one.
+
+    Keeps the Markdown tables compact; names without the prefix are returned
+    unchanged.
+    """
+    prefix = "external/"
+    has_prefix = name.startswith(prefix)
+    return name[len(prefix):] if has_prefix else name
+
+
+def generate_markdown_summary(
+    comparisons: List[dict],
+    threshold: float,
+    has_regression: bool,
+) -> str:
+    """Generate a concise Markdown summary of benchmark comparison results.
+
+    Args:
+        comparisons: Detail dicts as produced by ``compare_benchmarks``.
+        threshold: Regression ratio, shown as a percentage in the heading.
+        has_regression: Overall verdict, stated in the summary line because rows
+            can read **REGRESSED** while the check as a whole still passes.
+
+    Returns:
+        The Markdown text: heading, one table row per benchmark, summary line.
+    """
+    lines: List[str] = [
+        f"**Performance Benchmark Results** (threshold: {threshold*100:.0f}%)",
+        "",
+    ]
+    if not comparisons:
+        lines.append("_No benchmarks to compare._")
+        return "\n".join(lines)
+
+    lines.append("| Benchmark | Baseline (us) | Current (us) | Change | Status |")
+    lines.append("|-----------|--------------|-------------|--------|--------|")
+    for comp in comparisons:
+        status = "**REGRESSED**" if comp["is_regression"] else "PASS"
+        lines.append(
+            f"| {_short_name(comp['name'])} | {comp['baseline_time_ns'] / 1000:.2f} "
+            f"| {comp['current_time_ns'] / 1000:.2f} "
+            f"| {comp['max_change'] * 100:+.1f}% | {status} |"
+        )
+
+    regression_count = sum(1 for c in comparisons if c["is_regression"])
+    verdict = "FAIL" if has_regression else "PASS"
+    lines.append("")
+    lines.append(
+        f"**Summary**: {len(comparisons)} benchmarks, "
+        f"{regression_count} regressions -- overall **{verdict}**"
+    )
+    return "\n".join(lines)
+
+
+def generate_baseline_summary(results: List[BenchmarkResult]) -> str:
+    """Render the results of a baseline-save run as a Markdown table.
+
+    The table has one row per result (short name, ``time_ns`` divided by
+    1000) and is followed by the number of benchmarks collected.
+    """
+    header = [
+        "**Baseline Benchmark Results**",
+        "",
+        "| Benchmark | Time (us) |",
+        "|-----------|----------|",
+    ]
+    rows = [f"| {_short_name(r.name)} | {r.time_ns / 1000:.2f} |" for r in results]
+    footer = ["", f"**Total**: {len(results)} benchmarks collected"]
+
+    return "\n".join(header + rows + footer)
+
+
+def main():
+    """Command-line entry point.
+
+    Runs the benchmarks, then either saves them as a baseline
+    (``--save-baseline``) or compares them with ``--baseline``.  Returns the
+    process exit code: 0 for success/no regression, 1 for a regression.
+    """
+    parser = argparse.ArgumentParser(
+        description="Check for performance regressions in evmone benchmarks",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Save baseline after a known-good commit
+  python check_performance_regression.py --save-baseline baseline.json
+
+  # Check current commit against baseline in CI
+  python check_performance_regression.py --baseline baseline.json
+
+  # Check with custom threshold (20% instead of default 15%)
+  python check_performance_regression.py --baseline baseline.json --threshold 0.20
+
+  # Specify a different library and mode
+  python check_performance_regression.py --baseline baseline.json --lib ./other.so --mode multipass
+""",
+    )
+
+    parser.add_argument(
+        "--baseline",
+        metavar="PATH",
+        help="Path to baseline JSON file for comparison",
+    )
+    parser.add_argument(
+        "--save-baseline",
+        metavar="PATH",
+        help="Run benchmarks and save results to file (use this to create baseline)",
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        default=0.15,
+        help="Regression threshold as ratio (default: 0.15 = 15%%)",
+    )
+    parser.add_argument(
+        "--lib",
+        default="./libdtvmapi.so",
+        help="Path to the library to benchmark (default: ./libdtvmapi.so)",
+    )
+    parser.add_argument(
+        "--mode",
+        default="interpreter",
+        help="Mode for the library (default: interpreter)",
+    )
+    parser.add_argument(
+        "--benchmark-dir",
+        default="test/evm-benchmarks/benchmarks",
+        help="Path to benchmark directory (default: test/evm-benchmarks/benchmarks)",
+    )
+    parser.add_argument(
+        "--output-summary",
+        metavar="PATH",
+        help="Write a concise Markdown summary to the given file (for PR comments)",
+    )
+    parser.add_argument(
+        "--verbose", "-v",
+        action="store_true",
+        help="Verbose output",
+    )
+    parser.add_argument(
+        "--benchmark-filter",
+        default=None,
+        help="Custom regex filter forwarded to evmone-bench --benchmark_filter (default: external/total/*)",
+    )
+    parser.add_argument(
+        "--min-regressions",
+        type=int,
+        default=5,
+        help="Minimum number of regressed benchmarks before flagging overall failure (default: 5). "
+             "Prevents CI noise from causing false positives.",
+    )
+    parser.add_argument(
+        "--min-time-ns",
+        type=float,
+        default=5000.0,
+        help="Exclude benchmarks whose baseline time is below this value (in nanoseconds) "
+             "from regression counting. Very short timings are dominated by "
+             "measurement noise. (default: 5000 = 5us)",
+    )
+    parser.add_argument(
+        "--benchmark-repetitions",
+        type=int,
+        default=3,
+        help="Run each benchmark N times and use the median. "
+             "Reduces ASLR and shared-runner noise. (default: 3)",
+    )
+
+    args = parser.parse_args()
+
+    if not args.baseline and not args.save_baseline:
+        parser.error("Either --baseline or --save-baseline must be specified")
+
+    bench_extra = None
+    if args.benchmark_filter:
+        bench_extra = [f"--benchmark_filter={args.benchmark_filter}"]
+
+    # Run benchmarks
+    try:
+        current_results = run_benchmark(
+            lib_path=args.lib,
+            mode=args.mode,
+            benchmark_dir=args.benchmark_dir,
+            extra_args=bench_extra,
+            repetitions=args.benchmark_repetitions,
+        )
+    except Exception as e:
+        print(f"::error::Failed to run benchmarks: {e}")
+        sys.exit(2)
+
+    if not current_results:
+        print("::error::No benchmark results found")
+        sys.exit(2)
+
+    print(f"Collected {len(current_results)} benchmark results")
+
+    # Save baseline mode (takes precedence when --baseline is given as well)
+    if args.save_baseline:
+        save_baseline(current_results, args.save_baseline)
+        if args.output_summary:
+            summary_md = generate_baseline_summary(current_results)
+            with open(args.output_summary, "w") as f:
+                f.write(summary_md)
+            print(f"Wrote baseline summary to {args.output_summary}")
+        return 0
+
+    # Compare mode
+    baseline_results = load_baseline(args.baseline)
+    print(f"Loaded {len(baseline_results)} baseline results from {args.baseline}")
+
+    has_regression, comparisons = compare_benchmarks(
+        current_results,
+        baseline_results,
+        args.threshold,
+        min_regressions=args.min_regressions,
+        min_time_ns=args.min_time_ns,
+    )
+
+    print_comparison_table(comparisons, args.threshold)
+
+    # Write Markdown summary for PR comments
+    if args.output_summary:
+        summary_md = generate_markdown_summary(
+            comparisons, args.threshold, has_regression
+        )
+        with open(args.output_summary, "w") as f:
+            f.write(summary_md)
+        print(f"Wrote comparison summary to {args.output_summary}")
+
+    regression_count = sum(1 for c in comparisons if c["is_regression"])
+
+    print("\n" + "=" * 100)
+    if has_regression:
+        print(
+            f"::error::Performance regression detected! "
+            f"{regression_count} benchmarks exceeded {args.threshold*100:.0f}% threshold "
+            f"(min required: {args.min_regressions})."
+        )
+        print("RESULT: FAIL")
+        return 1
+    else:
+        print("::notice::No significant performance regression detected.")
+        print("RESULT: PASS")
+        return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main() returns 0 or 1; that value becomes the process exit status