From ebe69f4a6a539bd51233283553e314ac4cd11e80 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 7 Feb 2026 09:00:37 +0000 Subject: [PATCH 1/3] feat: add MCP contract testing for external server interface drift detection Adds the ability to snapshot external MCP server tool definitions and detect breaking changes (removed tools, new required params, type changes) before running tests. This addresses Scenario 2 of distributed AI evaluation: when you don't own the MCP server code. New commands: evalview mcp snapshot/check/list/show/delete New flag: evalview run --contracts --fail-on CONTRACT_DRIFT New CI status: CONTRACT_DRIFT (joins REGRESSION, TOOLS_CHANGED, OUTPUT_CHANGED) https://claude.ai/code/session_019CvwYcAoNoitWBdhEYUbxV --- action.yml | 11 +- docs/MCP_CONTRACTS.md | 228 ++++++++++++++ evalview/adapters/mcp_adapter.py | 88 ++++++ evalview/cli.py | 335 +++++++++++++++++++- evalview/core/contract_diff.py | 246 +++++++++++++++ evalview/core/diff.py | 1 + evalview/core/mcp_contract.py | 152 +++++++++ examples/mcp/contract-testing.yaml | 42 +++ tests/test_mcp_contracts.py | 475 +++++++++++++++++++++++++++++ 9 files changed, 1573 insertions(+), 5 deletions(-) create mode 100644 docs/MCP_CONTRACTS.md create mode 100644 evalview/core/contract_diff.py create mode 100644 evalview/core/mcp_contract.py create mode 100644 examples/mcp/contract-testing.yaml create mode 100644 tests/test_mcp_contracts.py diff --git a/action.yml b/action.yml index 6d59925..640ef5b 100644 --- a/action.yml +++ b/action.yml @@ -37,9 +37,13 @@ inputs: required: false default: 'false' fail-on: - description: 'Comma-separated statuses that fail CI (REGRESSION, TOOLS_CHANGED, OUTPUT_CHANGED)' + description: 'Comma-separated statuses that fail CI (REGRESSION, TOOLS_CHANGED, OUTPUT_CHANGED, CONTRACT_DRIFT)' required: false default: 'REGRESSION' + contracts: + description: 'Check MCP contracts for interface drift before running tests' + required: false + default: 'false' generate-report: description: 'Generate HTML report' required: false @@ -105,6 +109,11 @@ runs: CMD="$CMD --diff --fail-on ${{ inputs.fail-on }}" fi + # MCP contract drift detection + if [ "${{ inputs.contracts }}" = "true" ]; then + CMD="$CMD --contracts --fail-on ${{ inputs.fail-on }}" + fi + if [ "${{ inputs.generate-report }}" = "true" ]; then CMD="$CMD --html-report evalview-report.html" fi diff --git a/docs/MCP_CONTRACTS.md b/docs/MCP_CONTRACTS.md new file mode 100644 index 0000000..dc58f1a --- /dev/null +++ b/docs/MCP_CONTRACTS.md @@ -0,0 +1,228 @@ +# MCP Contract Testing + +Detect when external MCP servers change their interface before your agent breaks. + +## The Problem + +When you use MCP servers you don't own (Scenario 2), the server can change its +tool definitions at any time: rename parameters, remove tools, add required fields. +Your agent tests pass today and fail tomorrow — not because your code changed, +but because the server did. + +## The Solution + +MCP contract testing captures a snapshot of a server's tool definitions and diffs +against it on every CI run. If the interface changed, you know immediately — before +running your full test suite. + +This mirrors EvalView's golden baseline system: +- **Golden traces** detect when your agent's *behavior* drifts +- **MCP contracts** detect when an external server's *interface* drifts + +## Quick Start + +### 1. Snapshot a server + +```bash +evalview mcp snapshot "npx:@modelcontextprotocol/server-github" --name server-github +``` + +Output: +``` +Snapshot saved: .evalview/contracts/server-github.contract.json +Tools discovered: 8 + - create_issue + - list_issues + - create_pull_request + - ... +``` + +### 2. Check for drift + +```bash +evalview mcp check server-github +``` + +If the server changed: +``` +CONTRACT_DRIFT - 2 breaking change(s) + REMOVED: create_pull_request - tool 'create_pull_request' no longer available + CHANGED: list_issues - new required parameter 'owner' +``` + +### 3. Use in CI + +```bash +evalview run tests/ --contracts --fail-on "REGRESSION,CONTRACT_DRIFT" +``` + +The `--contracts` flag checks all saved contracts *before* running tests. +If any contract drifted, the run aborts immediately — no point testing against +a broken interface. + +## CLI Reference + +### `evalview mcp snapshot` + +Capture tool definitions from an MCP server. + +```bash +evalview mcp snapshot --name [--notes "..."] [--timeout 30] +``` + +| Argument | Description | +|----------|-------------| +| `endpoint` | MCP server endpoint (e.g., `npx:@modelcontextprotocol/server-github`) | +| `--name` | Human-readable identifier for this contract (required) | +| `--notes` | Optional notes about this snapshot | +| `--timeout` | Connection timeout in seconds (default: 30) | + +Supports all MCP transport types: +- **stdio**: `"npx:@modelcontextprotocol/server-filesystem /tmp"` +- **HTTP**: `"http://localhost:8080"` +- **Command**: `"stdio:python my_server.py"` + +### `evalview mcp check` + +Compare current server interface against a saved contract. + +```bash +evalview mcp check [--endpoint ] [--timeout 30] +``` + +| Argument | Description | +|----------|-------------| +| `name` | Contract name (from `--name` in snapshot) | +| `--endpoint` | Override endpoint (default: use endpoint from snapshot) | + +Exit codes: +- `0` — No breaking changes +- `1` — Breaking changes detected (CONTRACT_DRIFT) +- `2` — Could not connect to server + +### `evalview mcp list` + +List all saved contracts. + +```bash +evalview mcp list +``` + +### `evalview mcp show` + +Show full details of a contract including all tool schemas. + +```bash +evalview mcp show +``` + +### `evalview mcp delete` + +Remove a contract. + +```bash +evalview mcp delete [--force] +``` + +## Integration with `evalview run` + +The `--contracts` flag adds a pre-flight check to any test run: + +```bash +evalview run tests/ --contracts +``` + +This checks all contracts in `.evalview/contracts/` before running tests. +Combine with `--fail-on CONTRACT_DRIFT` to fail CI on drift: + +```bash +evalview run tests/ --contracts --fail-on "REGRESSION,CONTRACT_DRIFT" +``` + +Or use `--strict` (now includes CONTRACT_DRIFT): + +```bash +evalview run tests/ --contracts --strict +``` + +## GitHub Actions + +```yaml +- name: Run EvalView + uses: hidai25/eval-view@v0.2.1 + with: + diff: true + contracts: true + fail-on: 'REGRESSION,CONTRACT_DRIFT' +``` + +## What Gets Detected + +### Breaking changes (trigger CONTRACT_DRIFT) + +| Change | Example | +|--------|---------| +| Tool removed | `create_pull_request` no longer exists | +| Required parameter added | New required param `owner` on `list_issues` | +| Parameter removed | `repo` param no longer accepted | +| Parameter type changed | `limit` changed from `string` to `integer` | +| Parameter became required | `owner` was optional, now required | + +### Informational changes (logged, don't fail) + +| Change | Example | +|--------|---------| +| New tool added | `merge_pull_request` now available | +| Optional parameter added | New optional param `labels` on `create_issue` | +| Description changed | Tool description updated | + +## Contract File Format + +Contracts are stored as JSON in `.evalview/contracts/`: + +```json +{ + "metadata": { + "server_name": "server-github", + "endpoint": "npx:@modelcontextprotocol/server-github", + "snapshot_at": "2026-02-07T10:30:00", + "protocol_version": "2024-11-05", + "tool_count": 8, + "schema_hash": "a1b2c3d4e5f67890" + }, + "tools": [ + { + "name": "create_issue", + "description": "Create a new issue in a GitHub repository", + "inputSchema": { + "type": "object", + "properties": { + "repo": { "type": "string" }, + "title": { "type": "string" }, + "body": { "type": "string" } + }, + "required": ["repo", "title"] + } + } + ] +} +``` + +Commit these files to your repo so CI can use them. + +## Best Practices + +1. **Snapshot after verifying** — Run your tests first, confirm everything works, + then snapshot. The contract represents a known-good interface. + +2. **Refresh periodically** — If a contract is >30 days old, `evalview mcp check` + will warn you. Re-snapshot to accept intentional changes. + +3. **One contract per server** — Name contracts after the server, not the tools. + `server-github` not `create-issue-tool`. + +4. **Commit contracts** — Store `.evalview/contracts/` in git. They're small JSON + files and CI needs them. + +5. **Check before testing** — Use `--contracts` on `evalview run` so drift is + caught before wasting time on tests that will fail anyway. diff --git a/evalview/adapters/mcp_adapter.py b/evalview/adapters/mcp_adapter.py index 6f49865..3a3e1fa 100644 --- a/evalview/adapters/mcp_adapter.py +++ b/evalview/adapters/mcp_adapter.py @@ -449,6 +449,94 @@ def _create_error_trace( trace_context=tracer.build_trace_context(), ) + async def discover_tools(self) -> List[Dict[str, Any]]: + """Discover available tools from an MCP server. + + Connects to the server, initializes the session, and calls tools/list + to retrieve all available tool definitions (name, description, inputSchema). + + Returns: + List of tool definition dicts from the MCP server. + + Raises: + Exception: If connection or discovery fails. + """ + transport, target = self._parse_endpoint() + + if transport == "stdio": + return await self._discover_tools_stdio(target) + else: + return await self._discover_tools_http(target) + + async def _discover_tools_stdio(self, command: str) -> List[Dict[str, Any]]: + """Discover tools via stdio transport.""" + import shlex + + cmd_parts = shlex.split(command) + process = await asyncio.create_subprocess_exec( + *cmd_parts, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + try: + await self._send_request( + process, + "initialize", + { + "protocolVersion": "2024-11-05", + "capabilities": {}, + "clientInfo": {"name": "evalview", "version": "0.1.7"}, + }, + ) + await self._send_notification(process, "notifications/initialized", {}) + + result = await self._send_request(process, "tools/list", {}) + return result.get("tools", []) + finally: + process.terminate() + await process.wait() + + async def _discover_tools_http(self, url: str) -> List[Dict[str, Any]]: + """Discover tools via HTTP transport.""" + import httpx + + async with httpx.AsyncClient(timeout=self.timeout) as client: + # Initialize session + self._request_id += 1 + await client.post( + url, + json={ + "jsonrpc": "2.0", + "id": self._request_id, + "method": "initialize", + "params": { + "protocolVersion": "2024-11-05", + "capabilities": {}, + "clientInfo": {"name": "evalview", "version": "0.1.7"}, + }, + }, + ) + + # Request tool list + self._request_id += 1 + response = await client.post( + url, + json={ + "jsonrpc": "2.0", + "id": self._request_id, + "method": "tools/list", + "params": {}, + }, + ) + result = response.json() + + if "error" in result: + raise Exception(f"MCP tools/list error: {result['error']}") + + return result.get("result", {}).get("tools", []) + async def health_check(self) -> bool: """Check if MCP server is reachable.""" transport, target = self._parse_endpoint() diff --git a/evalview/cli.py b/evalview/cli.py index c74e777..fd837f7 100644 --- a/evalview/cli.py +++ b/evalview/cli.py @@ -1360,7 +1360,7 @@ async def _init_wizard_async(dir: str): "--fail-on", type=str, default=None, - help="Comma-separated diff statuses that cause exit code 1 (default: REGRESSION, or from ci.fail_on in config.yaml)", + help="Comma-separated statuses that cause exit code 1: REGRESSION, TOOLS_CHANGED, OUTPUT_CHANGED, CONTRACT_DRIFT (default: REGRESSION)", ) @click.option( "--warn-on", @@ -1401,6 +1401,11 @@ async def _init_wizard_async(dir: str): default=None, help="Filter tests by difficulty level.", ) +@click.option( + "--contracts", + is_flag=True, + help="Check MCP contracts for interface drift before running tests. Fails fast if external servers changed.", +) def run( path: Optional[str], pattern: str, @@ -1432,6 +1437,7 @@ def run( runs: Optional[int], pass_rate: float, difficulty: Optional[str], + contracts: bool, ): """Run test cases against the agent. @@ -1448,7 +1454,7 @@ def run( # Handle --strict flag (overrides config and CLI) if strict: - fail_on = "REGRESSION,TOOLS_CHANGED,OUTPUT_CHANGED" + fail_on = "REGRESSION,TOOLS_CHANGED,OUTPUT_CHANGED,CONTRACT_DRIFT" warn_on = "" asyncio.run(_run_async( @@ -1456,7 +1462,8 @@ def run( sequential, max_workers, max_retries, retry_delay, watch, html_report, summary, coverage, adapter_override=adapter, diff=diff, diff_report=diff_report, fail_on=fail_on, warn_on=warn_on, trace=trace, trace_out=trace_out, - runs=runs, pass_rate=pass_rate, difficulty_filter=difficulty + runs=runs, pass_rate=pass_rate, difficulty_filter=difficulty, + contracts=contracts, )) @@ -1488,6 +1495,7 @@ async def _run_async( runs: Optional[int] = None, pass_rate: float = 0.8, difficulty_filter: Optional[str] = None, + contracts: bool = False, ): """Async implementation of run command.""" import fnmatch @@ -1611,6 +1619,53 @@ async def _run_async( console.print("[yellow]⚠️ Watch mode requires watchdog. Install with: pip install watchdog[/yellow]") watch = False + # --- MCP Contract Check (runs before tests, fail fast) --- + contract_drifts = [] + if contracts: + from evalview.core.mcp_contract import ContractStore + from evalview.core.contract_diff import diff_contract, ContractDriftStatus + from evalview.adapters.mcp_adapter import MCPAdapter + + contract_store = ContractStore() + all_contracts = contract_store.list_contracts() + + if all_contracts: + console.print("[cyan]━━━ MCP Contract Check ━━━[/cyan]\n") + + for meta in all_contracts: + contract = contract_store.load_contract(meta.server_name) + if not contract: + continue + + adapter = MCPAdapter(endpoint=contract.metadata.endpoint, timeout=30.0) + try: + current_tools = await asyncio.get_event_loop().run_in_executor( + None, lambda: asyncio.run(adapter.discover_tools()) + ) + except Exception as e: + console.print(f" [yellow]WARN: {meta.server_name}[/yellow] - could not connect: {e}") + continue + + result = diff_contract(contract, current_tools) + + if result.status == ContractDriftStatus.CONTRACT_DRIFT: + contract_drifts.append(result) + console.print(f" [red]CONTRACT_DRIFT: {meta.server_name}[/red] - {result.summary()}") + for change in result.breaking_changes: + console.print(f" [red]{change.kind.value}: {change.tool_name}[/red] - {change.detail}") + else: + console.print(f" [green]PASSED: {meta.server_name}[/green]") + + console.print() + + # Check if we should fail fast + if contract_drifts and fail_on and "CONTRACT_DRIFT" in (fail_on or "").upper(): + console.print("[bold red]Aborting: MCP contract drift detected. Fix contracts before running tests.[/bold red]") + console.print("[dim]Accept changes: evalview mcp snapshot --name [/dim]\n") + raise SystemExit(1) + else: + console.print("[dim]--contracts: No contracts found. Create one: evalview mcp snapshot --name [/dim]\n") + console.print("[blue]Running test cases...[/blue]\n") # Load config - check path directory first, then current directory @@ -2807,7 +2862,7 @@ async def update_display(): # Parse fail_on and warn_on into sets fail_statuses = set() warn_statuses = set() - valid_statuses = {"REGRESSION", "TOOLS_CHANGED", "OUTPUT_CHANGED", "PASSED"} + valid_statuses = {"REGRESSION", "TOOLS_CHANGED", "OUTPUT_CHANGED", "PASSED", "CONTRACT_DRIFT"} for s in fail_on.upper().split(","): s = s.strip() @@ -5976,6 +6031,278 @@ def golden_show(test_name: str): console.print() +# ============================================================================ +# MCP Contract Commands +# ============================================================================ + + +@main.group() +def mcp(): + """Manage MCP contracts (detect external server interface drift). + + MCP contracts are snapshots of an external MCP server's tool definitions. + Use them with `evalview run --contracts` to detect when servers change + their interface before your tests break. + + Examples: + evalview mcp snapshot "npx:@modelcontextprotocol/server-github" --name server-github + evalview mcp check server-github + evalview mcp list + """ + pass + + +@mcp.command("snapshot") +@click.argument("endpoint") +@click.option("--name", "-n", required=True, help="Server name (used as contract identifier)") +@click.option("--notes", help="Notes about this snapshot") +@click.option("--timeout", default=30.0, type=float, help="Connection timeout in seconds") +def mcp_snapshot(endpoint: str, name: str, notes: str, timeout: float): + """Snapshot an MCP server's tool definitions as a contract. + + ENDPOINT is the MCP server endpoint (e.g., "npx:@modelcontextprotocol/server-github"). + + Examples: + evalview mcp snapshot "npx:@modelcontextprotocol/server-filesystem /tmp" --name fs-server + evalview mcp snapshot "http://localhost:8080" --name my-server --notes "v2.1 release" + """ + import asyncio + from evalview.adapters.mcp_adapter import MCPAdapter + from evalview.core.mcp_contract import ContractStore + + console.print(f"\n[cyan]━━━ MCP Contract Snapshot ━━━[/cyan]\n") + console.print(f" Server: [bold]{name}[/bold]") + console.print(f" Endpoint: {endpoint}") + console.print() + + adapter = MCPAdapter(endpoint=endpoint, timeout=timeout) + + try: + tools = asyncio.run(adapter.discover_tools()) + except Exception as e: + console.print(f"[red]Failed to connect to MCP server: {e}[/red]") + console.print("[dim]Check that the server is running and the endpoint is correct.[/dim]\n") + raise SystemExit(1) + + if not tools: + console.print("[yellow]Server returned no tools.[/yellow]\n") + raise SystemExit(1) + + store = ContractStore() + + if store.has_contract(name): + if not click.confirm( + f"Contract '{name}' already exists. Overwrite?", + default=False, + ): + console.print("[dim]Cancelled[/dim]\n") + return + + path = store.save_contract( + server_name=name, + endpoint=endpoint, + tools=tools, + notes=notes, + ) + + console.print(f"[green]Snapshot saved: {path}[/green]") + console.print(f" Tools discovered: [bold]{len(tools)}[/bold]") + for tool in tools: + desc = tool.get("description", "") + if len(desc) > 60: + desc = desc[:57] + "..." + console.print(f" [dim]- {tool['name']}[/dim] {desc}") + console.print() + console.print("[dim]Check for drift: evalview mcp check " + name + "[/dim]") + console.print("[dim]Use in CI: evalview run --contracts --fail-on CONTRACT_DRIFT[/dim]\n") + + +@mcp.command("check") +@click.argument("name") +@click.option("--endpoint", help="Override endpoint (default: use endpoint from snapshot)") +@click.option("--timeout", default=30.0, type=float, help="Connection timeout in seconds") +def mcp_check(name: str, endpoint: str, timeout: float): + """Check an MCP server for contract drift. + + NAME is the contract name (from `evalview mcp snapshot --name`). + + Examples: + evalview mcp check server-github + evalview mcp check my-server --endpoint "http://new-host:8080" + """ + import asyncio + from evalview.adapters.mcp_adapter import MCPAdapter + from evalview.core.mcp_contract import ContractStore + from evalview.core.contract_diff import diff_contract, ContractDriftStatus + + store = ContractStore() + contract = store.load_contract(name) + + if not contract: + console.print(f"\n[red]No contract found: {name}[/red]") + console.print("[dim]Create one with: evalview mcp snapshot --name " + name + "[/dim]\n") + raise SystemExit(1) + + target_endpoint = endpoint or contract.metadata.endpoint + adapter = MCPAdapter(endpoint=target_endpoint, timeout=timeout) + + console.print(f"\n[cyan]━━━ MCP Contract Check ━━━[/cyan]\n") + console.print(f" Contract: [bold]{name}[/bold]") + console.print(f" Endpoint: {target_endpoint}") + + # Show snapshot age + from datetime import datetime + age = datetime.now() - contract.metadata.snapshot_at + age_days = age.days + if age_days > 30: + console.print(f" Snapshot age: [yellow]{age_days} days (consider refreshing)[/yellow]") + else: + console.print(f" Snapshot age: [dim]{age_days} day(s)[/dim]") + console.print() + + try: + current_tools = asyncio.run(adapter.discover_tools()) + except Exception as e: + console.print(f"[red]Failed to connect to MCP server: {e}[/red]") + console.print("[dim]The server may be down. Use --endpoint to try a different host.[/dim]\n") + raise SystemExit(2) + + result = diff_contract(contract, current_tools) + + if result.status == ContractDriftStatus.PASSED: + if result.informational_changes: + console.print(f"[green]PASSED[/green] - No breaking changes ({result.summary()})") + console.print() + for change in result.informational_changes: + console.print(f" [dim]+ {change.tool_name}: {change.detail}[/dim]") + else: + console.print("[green]PASSED[/green] - Interface matches snapshot exactly") + console.print() + else: + console.print(f"[red]CONTRACT_DRIFT[/red] - {result.summary()}\n") + + for change in result.breaking_changes: + if change.kind.value == "removed": + console.print(f" [red]REMOVED: {change.tool_name}[/red] - {change.detail}") + else: + console.print(f" [red]CHANGED: {change.tool_name}[/red] - {change.detail}") + + if result.informational_changes: + console.print() + for change in result.informational_changes: + console.print(f" [dim]INFO: {change.tool_name} - {change.detail}[/dim]") + + console.print() + console.print("[dim]To accept the new interface: evalview mcp snapshot " + target_endpoint + " --name " + name + "[/dim]\n") + raise SystemExit(1) + + +@mcp.command("list") +def mcp_list(): + """List all MCP contract snapshots. + + Shows all saved contracts with metadata. + """ + from evalview.core.mcp_contract import ContractStore + from datetime import datetime + + store = ContractStore() + contracts = store.list_contracts() + + if not contracts: + console.print("\n[yellow]No MCP contracts found.[/yellow]") + console.print("[dim]Create one: evalview mcp snapshot --name [/dim]\n") + return + + console.print("\n[cyan]━━━ MCP Contracts ━━━[/cyan]\n") + + for c in sorted(contracts, key=lambda x: x.server_name): + age = datetime.now() - c.snapshot_at + age_str = f"{age.days}d ago" if age.days > 0 else "today" + + console.print(f" [bold]{c.server_name}[/bold]") + console.print(f" [dim]Endpoint: {c.endpoint}[/dim]") + console.print(f" [dim]Tools: {c.tool_count} | Snapshot: {age_str}[/dim]") + if c.notes: + console.print(f" [dim]Notes: {c.notes}[/dim]") + console.print() + + console.print(f"[dim]Total: {len(contracts)} contract(s)[/dim]") + console.print("[dim]Check for drift: evalview mcp check [/dim]\n") + + +@mcp.command("delete") +@click.argument("name") +@click.option("--force", "-f", is_flag=True, help="Skip confirmation") +def mcp_delete(name: str, force: bool): + """Delete an MCP contract snapshot. + + NAME is the contract name to delete. + """ + from evalview.core.mcp_contract import ContractStore + + store = ContractStore() + + if not store.has_contract(name): + console.print(f"\n[yellow]No contract found: {name}[/yellow]\n") + return + + if not force: + if not click.confirm(f"Delete contract '{name}'?", default=False): + console.print("[dim]Cancelled[/dim]") + return + + store.delete_contract(name) + console.print(f"\n[green]Deleted contract: {name}[/green]\n") + + +@mcp.command("show") +@click.argument("name") +def mcp_show(name: str): + """Show details of an MCP contract snapshot. + + NAME is the contract name. + """ + from evalview.core.mcp_contract import ContractStore + from rich.panel import Panel + from datetime import datetime + + store = ContractStore() + contract = store.load_contract(name) + + if not contract: + console.print(f"\n[yellow]No contract found: {name}[/yellow]") + console.print("[dim]Create one: evalview mcp snapshot --name " + name + "[/dim]\n") + return + + meta = contract.metadata + age = datetime.now() - meta.snapshot_at + + console.print(f"\n[cyan]━━━ MCP Contract: {meta.server_name} ━━━[/cyan]\n") + console.print(f" Endpoint: {meta.endpoint}") + console.print(f" Snapshot: {meta.snapshot_at.strftime('%Y-%m-%d %H:%M')} ({age.days}d ago)") + console.print(f" Protocol: {meta.protocol_version}") + console.print(f" Schema hash: {meta.schema_hash}") + if meta.notes: + console.print(f" Notes: {meta.notes}") + console.print() + + console.print(f"[bold]Tools ({meta.tool_count}):[/bold]\n") + + for tool in contract.tools: + console.print(f" [bold]{tool.name}[/bold]") + if tool.description: + console.print(f" {tool.description}") + if tool.inputSchema.get("properties"): + props = tool.inputSchema["properties"] + required = set(tool.inputSchema.get("required", [])) + for pname, pdef in props.items(): + ptype = pdef.get("type", "any") + req_marker = " [red]*[/red]" if pname in required else "" + console.print(f" [dim]- {pname}: {ptype}{req_marker}[/dim]") + console.print() + + @main.command() @click.option( "--provider", diff --git a/evalview/core/contract_diff.py b/evalview/core/contract_diff.py new file mode 100644 index 0000000..37d535d --- /dev/null +++ b/evalview/core/contract_diff.py @@ -0,0 +1,246 @@ +"""Contract diff engine for detecting MCP server interface drift. + +Compares a saved contract snapshot against the current tool definitions +from an MCP server. Detects: + - REMOVED tools (breaking) + - ADDED tools (informational) + - CHANGED schemas: new required params, renamed params, type changes (breaking) + - Description-only changes (informational, ignored by default) + +This is the mirror of diff.py: diff.py compares agent behavior traces, +contract_diff.py compares server interface schemas. +""" + +from dataclasses import dataclass, field +from enum import Enum +from typing import List, Dict, Any, Optional, Set +import logging + +from evalview.core.mcp_contract import MCPContract, ToolSchema + +logger = logging.getLogger(__name__) + + +class ContractDriftStatus(Enum): + """Result of comparing current tools against a contract snapshot. + + Two states: + - PASSED: Interface matches snapshot (no breaking changes). + - CONTRACT_DRIFT: Breaking changes detected (tools removed, schemas changed). + """ + + PASSED = "passed" + CONTRACT_DRIFT = "contract_drift" + + +class ChangeKind(Enum): + """Kind of change detected in a tool schema.""" + + REMOVED = "removed" # Tool no longer exists (breaking) + ADDED = "added" # New tool available (informational) + PARAM_ADDED_REQ = "param_added_required" # New required param (breaking) + PARAM_REMOVED = "param_removed" # Param removed (breaking) + PARAM_TYPE_CHANGED = "param_type_changed" # Param type changed (breaking) + PARAM_ADDED_OPT = "param_added_optional" # New optional param (safe) + DESCRIPTION_CHANGED = "description_changed" # Description changed (info) + + +# Changes that constitute a breaking contract drift +BREAKING_CHANGES: Set[ChangeKind] = { + ChangeKind.REMOVED, + ChangeKind.PARAM_ADDED_REQ, + ChangeKind.PARAM_REMOVED, + ChangeKind.PARAM_TYPE_CHANGED, +} + + +@dataclass +class ToolChange: + """A single change detected in a tool's schema.""" + + tool_name: str + kind: ChangeKind + detail: str + + @property + def is_breaking(self) -> bool: + return self.kind in BREAKING_CHANGES + + +@dataclass +class ContractDiff: + """Complete diff between a contract snapshot and current server tools.""" + + server_name: str + changes: List[ToolChange] = field(default_factory=list) + snapshot_tool_count: int = 0 + current_tool_count: int = 0 + + @property + def has_breaking_changes(self) -> bool: + return any(c.is_breaking for c in self.changes) + + @property + def status(self) -> ContractDriftStatus: + if self.has_breaking_changes: + return ContractDriftStatus.CONTRACT_DRIFT + return ContractDriftStatus.PASSED + + @property + def breaking_changes(self) -> List[ToolChange]: + return [c for c in self.changes if c.is_breaking] + + @property + def informational_changes(self) -> List[ToolChange]: + return [c for c in self.changes if not c.is_breaking] + + def summary(self) -> str: + if not self.changes: + return "No changes" + + breaking = len(self.breaking_changes) + info = len(self.informational_changes) + parts = [] + if breaking: + parts.append(f"{breaking} breaking change(s)") + if info: + parts.append(f"{info} informational change(s)") + return ", ".join(parts) + + +def _get_required_params(schema: Dict[str, Any]) -> Set[str]: + """Extract required parameter names from a JSON Schema.""" + return set(schema.get("required", [])) + + +def _get_all_params(schema: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: + """Extract all parameter definitions from a JSON Schema.""" + return schema.get("properties", {}) + + +def _diff_tool_schema( + tool_name: str, + snapshot: ToolSchema, + current: ToolSchema, +) -> List[ToolChange]: + """Compare two versions of the same tool's schema.""" + changes: List[ToolChange] = [] + + # Description change (informational) + if snapshot.description != current.description: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.DESCRIPTION_CHANGED, + detail=( + f"description changed from " + f"'{snapshot.description[:60]}' to '{current.description[:60]}'" + ), + )) + + snap_schema = snapshot.inputSchema + curr_schema = current.inputSchema + + snap_params = _get_all_params(snap_schema) + curr_params = _get_all_params(curr_schema) + snap_required = _get_required_params(snap_schema) + curr_required = _get_required_params(curr_schema) + + snap_names = set(snap_params.keys()) + curr_names = set(curr_params.keys()) + + # Removed parameters (breaking) + for name in snap_names - curr_names: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.PARAM_REMOVED, + detail=f"parameter '{name}' removed", + )) + + # Added parameters + for name in curr_names - snap_names: + if name in curr_required: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.PARAM_ADDED_REQ, + detail=f"new required parameter '{name}'", + )) + else: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.PARAM_ADDED_OPT, + detail=f"new optional parameter '{name}'", + )) + + # Changed parameters (type changes) + for name in snap_names & curr_names: + snap_type = snap_params[name].get("type") + curr_type = curr_params[name].get("type") + if snap_type != curr_type: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.PARAM_TYPE_CHANGED, + detail=f"parameter '{name}' type changed from '{snap_type}' to '{curr_type}'", + )) + + # Parameter became required (breaking - it's a new constraint) + if name not in snap_required and name in curr_required: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.PARAM_ADDED_REQ, + detail=f"parameter '{name}' became required", + )) + + return changes + + +def diff_contract( + contract: MCPContract, + current_tools: List[Dict[str, Any]], +) -> ContractDiff: + """Compare a saved contract against current tool definitions. + + Args: + contract: The saved contract snapshot. + current_tools: Current tool definitions from the MCP server. + + Returns: + ContractDiff with all detected changes. + """ + current_schemas = { + t["name"]: ToolSchema.model_validate(t) for t in current_tools + } + snapshot_schemas = {t.name: t for t in contract.tools} + + changes: List[ToolChange] = [] + + # Removed tools (in snapshot but not in current) + for name in snapshot_schemas: + if name not in current_schemas: + changes.append(ToolChange( + tool_name=name, + kind=ChangeKind.REMOVED, + detail=f"tool '{name}' no longer available", + )) + + # Added tools (in current but not in snapshot) + for name in current_schemas: + if name not in snapshot_schemas: + changes.append(ToolChange( + tool_name=name, + kind=ChangeKind.ADDED, + detail=f"new tool '{name}' available", + )) + + # Changed tools (in both) + for name in snapshot_schemas.keys() & current_schemas.keys(): + tool_changes = _diff_tool_schema( + name, snapshot_schemas[name], current_schemas[name] + ) + changes.extend(tool_changes) + + return ContractDiff( + server_name=contract.metadata.server_name, + changes=changes, + snapshot_tool_count=len(snapshot_schemas), + current_tool_count=len(current_schemas), + ) diff --git a/evalview/core/diff.py b/evalview/core/diff.py index 95f2f0a..c8a74f6 100644 --- a/evalview/core/diff.py +++ b/evalview/core/diff.py @@ -36,6 +36,7 @@ class DiffStatus(Enum): TOOLS_CHANGED = "tools_changed" # Tool sequence differs from golden OUTPUT_CHANGED = "output_changed" # Output differs beyond similarity threshold REGRESSION = "regression" # Score dropped >5 points from golden + CONTRACT_DRIFT = "contract_drift" # External MCP server interface changed # Alias for backwards compatibility diff --git a/evalview/core/mcp_contract.py b/evalview/core/mcp_contract.py new file mode 100644 index 0000000..56aa890 --- /dev/null +++ b/evalview/core/mcp_contract.py @@ -0,0 +1,152 @@ +"""MCP contract storage and management. + +MCP contracts are snapshots of an external MCP server's tool definitions. +When running tests with --contracts, current tool definitions are compared +against the snapshot to detect interface drift before tests execute. + +This is the mirror of golden traces: golden traces detect when YOUR agent +drifts, contracts detect when an EXTERNAL server drifts underneath you. + +Storage format: + .evalview/contracts/ + .contract.json # The tool schema snapshot +""" + +import json +import hashlib +from pathlib import Path +from datetime import datetime +from typing import Optional, List, Dict, Any +from pydantic import BaseModel, Field +import logging + +logger = logging.getLogger(__name__) + + +class ToolSchema(BaseModel): + """Schema for a single MCP tool.""" + + name: str + description: str = "" + inputSchema: Dict[str, Any] = Field(default_factory=dict) + + +class ContractMetadata(BaseModel): + """Metadata about a contract snapshot.""" + + server_name: str + endpoint: str + snapshot_at: datetime + protocol_version: str = "2024-11-05" + tool_count: int = 0 + notes: Optional[str] = None + schema_hash: str = "" # Hash of tool schemas for quick comparison + + +class MCPContract(BaseModel): + """A contract snapshot from an MCP server.""" + + metadata: ContractMetadata + tools: List[ToolSchema] = Field(default_factory=list) + + @property + def tool_names(self) -> List[str]: + return [t.name for t in self.tools] + + +class ContractStore: + """Manages MCP contract storage and retrieval.""" + + def __init__(self, base_path: Optional[Path] = None): + self.base_path = base_path or Path(".") + self.contracts_dir = self.base_path / ".evalview" / "contracts" + + def _safe_name(self, server_name: str) -> str: + return "".join(c if c.isalnum() or c in "._-" else "_" for c in server_name) + + def _get_contract_path(self, server_name: str) -> Path: + return self.contracts_dir / f"{self._safe_name(server_name)}.contract.json" + + def _hash_schemas(self, tools: List[Dict[str, Any]]) -> str: + """Hash tool schemas for quick drift comparison.""" + canonical = json.dumps(tools, sort_keys=True, default=str) + return hashlib.sha256(canonical.encode()).hexdigest()[:16] + + def save_contract( + self, + server_name: str, + endpoint: str, + tools: List[Dict[str, Any]], + notes: Optional[str] = None, + ) -> Path: + """Save a tool schema snapshot as a contract. + + Args: + server_name: Human-readable server identifier. + endpoint: The MCP server endpoint used for discovery. + tools: Raw tool definitions from tools/list response. + notes: Optional notes about this snapshot. + + Returns: + Path to saved contract file. + """ + self.contracts_dir.mkdir(parents=True, exist_ok=True) + + tool_schemas = [ToolSchema.model_validate(t) for t in tools] + + contract = MCPContract( + metadata=ContractMetadata( + server_name=server_name, + endpoint=endpoint, + snapshot_at=datetime.now(), + tool_count=len(tool_schemas), + notes=notes, + schema_hash=self._hash_schemas(tools), + ), + tools=tool_schemas, + ) + + contract_path = self._get_contract_path(server_name) + with open(contract_path, "w") as f: + f.write(contract.model_dump_json(indent=2)) + + logger.info(f"Saved contract: {contract_path}") + return contract_path + + def load_contract(self, server_name: str) -> Optional[MCPContract]: + """Load a contract by server name.""" + contract_path = self._get_contract_path(server_name) + if not contract_path.exists(): + return None + + with open(contract_path) as f: + data = json.load(f) + + return MCPContract.model_validate(data) + + def has_contract(self, server_name: str) -> bool: + return self._get_contract_path(server_name).exists() + + def list_contracts(self) -> List[ContractMetadata]: + """List all saved contracts.""" + if not self.contracts_dir.exists(): + return [] + + results = [] + for path in self.contracts_dir.glob("*.contract.json"): + try: + with open(path) as f: + data = json.load(f) + results.append(ContractMetadata.model_validate(data["metadata"])) + except Exception as e: + logger.warning(f"Failed to load contract {path}: {e}") + + return results + + def delete_contract(self, server_name: str) -> bool: + """Delete a contract.""" + contract_path = self._get_contract_path(server_name) + if contract_path.exists(): + contract_path.unlink() + return True + return False diff --git a/examples/mcp/contract-testing.yaml b/examples/mcp/contract-testing.yaml new file mode 100644 index 0000000..7caad1d --- /dev/null +++ b/examples/mcp/contract-testing.yaml @@ -0,0 +1,42 @@ +# MCP Contract Testing Example +# +# This demonstrates evaluating an agent that uses an external MCP server +# you don't control (Scenario 2). The workflow: +# +# 1. Snapshot the server: +# evalview mcp snapshot "npx:@modelcontextprotocol/server-filesystem /tmp" --name fs-server +# +# 2. Run this test with contract checking: +# evalview run examples/mcp/contract-testing.yaml --contracts --fail-on "REGRESSION,CONTRACT_DRIFT" +# +# 3. If the server's tools change, the run fails before this test even executes. +# +# This test validates that an AI agent correctly uses filesystem tools. +# The contract ensures those tools still exist with the expected interface. + +name: "Agent uses filesystem MCP tools" +description: "Test that the agent calls the right MCP tools for a file read task" + +adapter: mcp +endpoint: "npx:@modelcontextprotocol/server-filesystem /tmp" + +# Reference the contract for this server (checked by --contracts flag) +# If the contract drifted, this test won't even run — saving time and +# giving a clear error message instead of a confusing tool-not-found failure. + +input: + query: "read_file" + context: + arguments: + path: "/tmp/evalview-contract-test.txt" + +expected: + tools: + - "read_file" + output: + contains: + - "hello" + +thresholds: + min_score: 50 + max_latency: 10000 diff --git a/tests/test_mcp_contracts.py b/tests/test_mcp_contracts.py new file mode 100644 index 0000000..a35355f --- /dev/null +++ b/tests/test_mcp_contracts.py @@ -0,0 +1,475 @@ +"""Tests for MCP contract testing: storage, diff engine, and integration.""" + +import json +import pytest +from datetime import datetime +from pathlib import Path + +from evalview.core.mcp_contract import ( + ContractStore, + MCPContract, + ContractMetadata, + ToolSchema, +) +from evalview.core.contract_diff import ( + diff_contract, + ContractDiff, + ContractDriftStatus, + ChangeKind, + ToolChange, + BREAKING_CHANGES, + _diff_tool_schema, +) + + +# ============================================================================ +# Test Data +# ============================================================================ + +SAMPLE_TOOLS = [ + { + "name": "create_issue", + "description": "Create a new issue in a GitHub repository", + "inputSchema": { + "type": "object", + "properties": { + "repo": {"type": "string"}, + "title": {"type": "string"}, + "body": {"type": "string"}, + }, + "required": ["repo", "title"], + }, + }, + { + "name": "list_issues", + "description": "List issues in a repository", + "inputSchema": { + "type": "object", + "properties": { + "repo": {"type": "string"}, + "state": {"type": "string"}, + }, + "required": ["repo"], + }, + }, + { + "name": "read_file", + "description": "Read a file from the filesystem", + "inputSchema": { + "type": "object", + "properties": { + "path": {"type": "string"}, + }, + "required": ["path"], + }, + }, +] + + +@pytest.fixture +def contract_store(tmp_path): + """Create a ContractStore using a temp directory.""" + return ContractStore(base_path=tmp_path) + + +@pytest.fixture +def saved_contract(contract_store): + """Create and save a sample contract.""" + contract_store.save_contract( + server_name="test-server", + endpoint="npx:@test/server", + tools=SAMPLE_TOOLS, + notes="Test snapshot", + ) + return contract_store.load_contract("test-server") + + +# ============================================================================ +# ContractStore Tests +# ============================================================================ + + +class TestContractStore: + """Tests for MCP contract storage.""" + + def test_save_and_load(self, contract_store): + """Save a contract and load it back.""" + path = contract_store.save_contract( + server_name="my-server", + endpoint="http://localhost:8080", + tools=SAMPLE_TOOLS, + notes="Initial snapshot", + ) + + assert path.exists() + assert path.suffix == ".json" + + loaded = contract_store.load_contract("my-server") + assert loaded is not None + assert loaded.metadata.server_name == "my-server" + assert loaded.metadata.endpoint == "http://localhost:8080" + assert loaded.metadata.tool_count == 3 + assert loaded.metadata.notes == "Initial snapshot" + assert len(loaded.tools) == 3 + + def test_tool_names(self, saved_contract): + """Contract exposes tool names.""" + assert saved_contract.tool_names == ["create_issue", "list_issues", "read_file"] + + def test_load_nonexistent(self, contract_store): + """Loading a nonexistent contract returns None.""" + assert contract_store.load_contract("does-not-exist") is None + + def test_has_contract(self, contract_store): + """has_contract returns correct boolean.""" + assert not contract_store.has_contract("my-server") + + contract_store.save_contract( + server_name="my-server", + endpoint="http://localhost:8080", + tools=SAMPLE_TOOLS, + ) + + assert contract_store.has_contract("my-server") + + def test_list_contracts(self, contract_store): + """List all saved contracts.""" + assert contract_store.list_contracts() == [] + + contract_store.save_contract("server-a", "http://a", SAMPLE_TOOLS[:1]) + contract_store.save_contract("server-b", "http://b", SAMPLE_TOOLS[:2]) + + contracts = contract_store.list_contracts() + assert len(contracts) == 2 + names = {c.server_name for c in contracts} + assert names == {"server-a", "server-b"} + + def test_delete_contract(self, contract_store): + """Delete a contract.""" + contract_store.save_contract("my-server", "http://localhost", SAMPLE_TOOLS) + assert contract_store.has_contract("my-server") + + result = contract_store.delete_contract("my-server") + assert result is True + assert not contract_store.has_contract("my-server") + + def test_delete_nonexistent(self, contract_store): + """Deleting a nonexistent contract returns False.""" + assert contract_store.delete_contract("nope") is False + + def test_overwrite(self, contract_store): + """Overwriting a contract replaces it.""" + contract_store.save_contract("my-server", "http://old", SAMPLE_TOOLS[:1]) + contract_store.save_contract("my-server", "http://new", SAMPLE_TOOLS[:2]) + + loaded = contract_store.load_contract("my-server") + assert loaded.metadata.endpoint == "http://new" + assert loaded.metadata.tool_count == 2 + + def test_schema_hash_changes_on_different_tools(self, contract_store): + """Different tool sets produce different schema hashes.""" + contract_store.save_contract("server-a", "http://a", SAMPLE_TOOLS[:1]) + contract_store.save_contract("server-b", "http://b", SAMPLE_TOOLS[:2]) + + a = contract_store.load_contract("server-a") + b = contract_store.load_contract("server-b") + assert a.metadata.schema_hash != b.metadata.schema_hash + + def test_safe_name_sanitization(self, contract_store): + """Server names with special chars are sanitized for filesystem.""" + contract_store.save_contract( + "my server/with:special chars!", + "http://x", + SAMPLE_TOOLS[:1], + ) + assert contract_store.has_contract("my server/with:special chars!") + + def test_metadata_timestamp(self, contract_store): + """Saved contract has a valid timestamp.""" + contract_store.save_contract("ts-test", "http://x", SAMPLE_TOOLS[:1]) + loaded = contract_store.load_contract("ts-test") + assert isinstance(loaded.metadata.snapshot_at, datetime) + + +# ============================================================================ +# Contract Diff Engine Tests +# ============================================================================ + + +class TestContractDiff: + """Tests for the schema diff engine.""" + + def test_no_changes(self, saved_contract): + """Identical tools produce no changes.""" + result = diff_contract(saved_contract, SAMPLE_TOOLS) + + assert result.status == ContractDriftStatus.PASSED + assert result.changes == [] + assert not result.has_breaking_changes + assert result.summary() == "No changes" + + def test_tool_removed(self, saved_contract): + """Removing a tool is a breaking change.""" + current = [t for t in SAMPLE_TOOLS if t["name"] != "read_file"] + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert result.has_breaking_changes + + removed = [c for c in result.changes if c.kind == ChangeKind.REMOVED] + assert len(removed) == 1 + assert removed[0].tool_name == "read_file" + assert removed[0].is_breaking + + def test_tool_added(self, saved_contract): + """Adding a tool is informational (not breaking).""" + current = SAMPLE_TOOLS + [{ + "name": "delete_file", + "description": "Delete a file", + "inputSchema": {"type": "object", "properties": {}}, + }] + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.PASSED # Not breaking + assert len(result.changes) == 1 + assert result.changes[0].kind == ChangeKind.ADDED + assert result.changes[0].tool_name == "delete_file" + assert not result.changes[0].is_breaking + + def test_required_param_added(self, saved_contract): + """Adding a new required parameter is breaking.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + current[0]["inputSchema"]["properties"]["owner"] = {"type": "string"} + current[0]["inputSchema"]["required"].append("owner") + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + breaking = result.breaking_changes + assert any( + c.kind == ChangeKind.PARAM_ADDED_REQ and "owner" in c.detail + for c in breaking + ) + + def test_optional_param_added(self, saved_contract): + """Adding a new optional parameter is not breaking.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + current[0]["inputSchema"]["properties"]["labels"] = {"type": "array"} + # Not added to "required" + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.PASSED + info = result.informational_changes + assert any(c.kind == ChangeKind.PARAM_ADDED_OPT for c in info) + + def test_param_removed(self, saved_contract): + """Removing a parameter is breaking.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + del current[0]["inputSchema"]["properties"]["body"] + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert any( + c.kind == ChangeKind.PARAM_REMOVED and "body" in c.detail + for c in result.breaking_changes + ) + + def test_param_type_changed(self, saved_contract): + """Changing a parameter type is breaking.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + current[1]["inputSchema"]["properties"]["repo"]["type"] = "integer" + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert any( + c.kind == ChangeKind.PARAM_TYPE_CHANGED + for c in result.breaking_changes + ) + + def test_description_changed(self, saved_contract): + """Description change is informational.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + current[0]["description"] = "Updated description for create_issue" + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.PASSED + assert any(c.kind == ChangeKind.DESCRIPTION_CHANGED for c in result.changes) + + def test_param_became_required(self, saved_contract): + """A parameter becoming required is breaking.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + # "body" was optional, make it required + current[0]["inputSchema"]["required"].append("body") + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert any( + c.kind == ChangeKind.PARAM_ADDED_REQ and "body" in c.detail + and "became required" in c.detail + for c in result.breaking_changes + ) + + def test_multiple_changes(self, saved_contract): + """Multiple changes across tools are all detected.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + # Remove read_file + current = [t for t in current if t["name"] != "read_file"] + # Add required param to list_issues + current[1]["inputSchema"]["properties"]["owner"] = {"type": "string"} + current[1]["inputSchema"]["required"].append("owner") + # Add new tool + current.append({ + "name": "merge_pr", + "description": "Merge a pull request", + "inputSchema": {"type": "object", "properties": {}}, + }) + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert result.snapshot_tool_count == 3 + assert result.current_tool_count == 3 # removed 1, added 1 + + kinds = {c.kind for c in result.changes} + assert ChangeKind.REMOVED in kinds + assert ChangeKind.ADDED in kinds + assert ChangeKind.PARAM_ADDED_REQ in kinds + + def test_summary_no_changes(self, saved_contract): + """Summary for no changes.""" + result = diff_contract(saved_contract, SAMPLE_TOOLS) + assert result.summary() == "No changes" + + def test_summary_with_changes(self, saved_contract): + """Summary includes counts.""" + current = [t for t in SAMPLE_TOOLS if t["name"] != "read_file"] + result = diff_contract(saved_contract, current) + summary = result.summary() + assert "breaking" in summary + + def test_empty_snapshot_vs_tools(self): + """Empty snapshot vs populated tools shows all as added.""" + contract = MCPContract( + metadata=ContractMetadata( + server_name="empty", + endpoint="http://x", + snapshot_at=datetime.now(), + ), + tools=[], + ) + result = diff_contract(contract, SAMPLE_TOOLS) + + assert result.status == ContractDriftStatus.PASSED # All added = not breaking + assert all(c.kind == ChangeKind.ADDED for c in result.changes) + assert len(result.changes) == 3 + + def test_all_tools_removed(self, saved_contract): + """All tools removed is breaking.""" + result = diff_contract(saved_contract, []) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert all(c.kind == ChangeKind.REMOVED for c in result.changes) + assert len(result.changes) == 3 + + +# ============================================================================ +# Breaking Changes Classification Tests +# ============================================================================ + + +class TestBreakingChanges: + """Test that the right change kinds are classified as breaking.""" + + def test_breaking_set(self): + """Verify the breaking changes set.""" + assert ChangeKind.REMOVED in BREAKING_CHANGES + assert ChangeKind.PARAM_ADDED_REQ in BREAKING_CHANGES + assert ChangeKind.PARAM_REMOVED in BREAKING_CHANGES + assert ChangeKind.PARAM_TYPE_CHANGED in BREAKING_CHANGES + + def test_non_breaking_set(self): + """Verify non-breaking changes.""" + assert ChangeKind.ADDED not in BREAKING_CHANGES + assert ChangeKind.PARAM_ADDED_OPT not in BREAKING_CHANGES + assert ChangeKind.DESCRIPTION_CHANGED not in BREAKING_CHANGES + + def test_tool_change_is_breaking(self): + """ToolChange.is_breaking reflects its kind.""" + breaking = ToolChange("t", ChangeKind.REMOVED, "gone") + assert breaking.is_breaking + + safe = ToolChange("t", ChangeKind.ADDED, "new") + assert not safe.is_breaking + + +# ============================================================================ +# DiffStatus Integration Tests +# ============================================================================ + + +class TestDiffStatusIntegration: + """Test that CONTRACT_DRIFT integrates with the existing DiffStatus.""" + + def test_contract_drift_in_diff_status(self): + """CONTRACT_DRIFT is available in the main DiffStatus enum.""" + from evalview.core.diff import DiffStatus + + assert hasattr(DiffStatus, "CONTRACT_DRIFT") + assert DiffStatus.CONTRACT_DRIFT.value == "contract_drift" + + def test_contract_drift_status_enum(self): + """ContractDriftStatus has the expected values.""" + assert ContractDriftStatus.PASSED.value == "passed" + assert ContractDriftStatus.CONTRACT_DRIFT.value == "contract_drift" + + +# ============================================================================ +# Tool Schema Diff Tests (unit level) +# ============================================================================ + + +class TestToolSchemaDiff: + """Unit tests for _diff_tool_schema.""" + + def test_identical_schemas(self): + """No changes for identical schemas.""" + schema = ToolSchema( + name="test", + description="A test tool", + inputSchema={ + "type": "object", + "properties": {"x": {"type": "string"}}, + "required": ["x"], + }, + ) + changes = _diff_tool_schema("test", schema, schema) + assert changes == [] + + def test_empty_schemas(self): + """No changes for two empty schemas.""" + schema = ToolSchema(name="test", description="", inputSchema={}) + changes = _diff_tool_schema("test", schema, schema) + assert changes == [] + + def test_schema_with_no_properties(self): + """Handle schemas without properties gracefully.""" + snap = ToolSchema( + name="test", + description="old", + inputSchema={"type": "object"}, + ) + curr = ToolSchema( + name="test", + description="new", + inputSchema={"type": "object", "properties": {"x": {"type": "string"}}}, + ) + changes = _diff_tool_schema("test", snap, curr) + # Should detect description change and new optional param + kinds = {c.kind for c in changes} + assert ChangeKind.DESCRIPTION_CHANGED in kinds + assert ChangeKind.PARAM_ADDED_OPT in kinds From 75de2a240d2d136d253bf8b4f69d05e5a86103f9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 7 Feb 2026 09:17:56 +0000 Subject: [PATCH 2/3] fix: address 5 bugs found in self-review of MCP contracts feature 1. BUG: asyncio.run() nested inside run_in_executor in _run_async (already in an async context). Fixed to just await directly. 2. BUG: Contract check ran before fail_on resolved from config.yaml defaults, so config-based fail_on: [CONTRACT_DRIFT] was ignored. Moved contract check after config loading. 3. BUG: _discover_tools_http missing notifications/initialized after init (inconsistent with stdio path, violates MCP protocol). 4. BUG: _discover_tools_http didn't check init response for errors. 5. BUG: DiffStatus docstring said "Four states" but now has five. Also fixed: unused Optional import, duplicate datetime imports, variable shadowing (adapter/result in _run_async contract block). https://claude.ai/code/session_019CvwYcAoNoitWBdhEYUbxV --- evalview/adapters/mcp_adapter.py | 16 +++++- evalview/cli.py | 96 +++++++++++++++----------------- evalview/core/contract_diff.py | 2 +- evalview/core/diff.py | 3 +- 4 files changed, 64 insertions(+), 53 deletions(-) diff --git a/evalview/adapters/mcp_adapter.py b/evalview/adapters/mcp_adapter.py index 3a3e1fa..4abe502 100644 --- a/evalview/adapters/mcp_adapter.py +++ b/evalview/adapters/mcp_adapter.py @@ -505,7 +505,7 @@ async def _discover_tools_http(self, url: str) -> List[Dict[str, Any]]: async with httpx.AsyncClient(timeout=self.timeout) as client: # Initialize session self._request_id += 1 - await client.post( + init_response = await client.post( url, json={ "jsonrpc": "2.0", @@ -518,6 +518,20 @@ async def _discover_tools_http(self, url: str) -> List[Dict[str, Any]]: }, }, ) + init_result = init_response.json() + if "error" in init_result: + raise Exception(f"MCP initialize error: {init_result['error']}") + + # Send initialized notification (required by MCP protocol) + self._request_id += 1 + await client.post( + url, + json={ + "jsonrpc": "2.0", + "method": "notifications/initialized", + "params": {}, + }, + ) # Request tool list self._request_id += 1 diff --git a/evalview/cli.py b/evalview/cli.py index fd837f7..6797921 100644 --- a/evalview/cli.py +++ b/evalview/cli.py @@ -1619,53 +1619,6 @@ async def _run_async( console.print("[yellow]⚠️ Watch mode requires watchdog. Install with: pip install watchdog[/yellow]") watch = False - # --- MCP Contract Check (runs before tests, fail fast) --- - contract_drifts = [] - if contracts: - from evalview.core.mcp_contract import ContractStore - from evalview.core.contract_diff import diff_contract, ContractDriftStatus - from evalview.adapters.mcp_adapter import MCPAdapter - - contract_store = ContractStore() - all_contracts = contract_store.list_contracts() - - if all_contracts: - console.print("[cyan]━━━ MCP Contract Check ━━━[/cyan]\n") - - for meta in all_contracts: - contract = contract_store.load_contract(meta.server_name) - if not contract: - continue - - adapter = MCPAdapter(endpoint=contract.metadata.endpoint, timeout=30.0) - try: - current_tools = await asyncio.get_event_loop().run_in_executor( - None, lambda: asyncio.run(adapter.discover_tools()) - ) - except Exception as e: - console.print(f" [yellow]WARN: {meta.server_name}[/yellow] - could not connect: {e}") - continue - - result = diff_contract(contract, current_tools) - - if result.status == ContractDriftStatus.CONTRACT_DRIFT: - contract_drifts.append(result) - console.print(f" [red]CONTRACT_DRIFT: {meta.server_name}[/red] - {result.summary()}") - for change in result.breaking_changes: - console.print(f" [red]{change.kind.value}: {change.tool_name}[/red] - {change.detail}") - else: - console.print(f" [green]PASSED: {meta.server_name}[/green]") - - console.print() - - # Check if we should fail fast - if contract_drifts and fail_on and "CONTRACT_DRIFT" in (fail_on or "").upper(): - console.print("[bold red]Aborting: MCP contract drift detected. Fix contracts before running tests.[/bold red]") - console.print("[dim]Accept changes: evalview mcp snapshot --name [/dim]\n") - raise SystemExit(1) - else: - console.print("[dim]--contracts: No contracts found. Create one: evalview mcp snapshot --name [/dim]\n") - console.print("[blue]Running test cases...[/blue]\n") # Load config - check path directory first, then current directory @@ -1710,6 +1663,52 @@ async def _run_async( else: warn_on = str(config_warn_on) + # --- MCP Contract Check (runs before tests, fail fast) --- + # Placed after fail_on/warn_on resolution so config.yaml defaults are available. + contract_drifts = [] + if contracts: + from evalview.core.mcp_contract import ContractStore + from evalview.core.contract_diff import diff_contract, ContractDriftStatus + from evalview.adapters.mcp_adapter import MCPAdapter as MCPContractAdapter + + contract_store = ContractStore() + all_contracts = contract_store.list_contracts() + + if all_contracts: + console.print("[cyan]━━━ MCP Contract Check ━━━[/cyan]\n") + + for meta in all_contracts: + contract = contract_store.load_contract(meta.server_name) + if not contract: + continue + + mcp_adapter = MCPContractAdapter(endpoint=contract.metadata.endpoint, timeout=30.0) + try: + current_tools = await mcp_adapter.discover_tools() + except Exception as e: + console.print(f" [yellow]WARN: {meta.server_name}[/yellow] - could not connect: {e}") + continue + + contract_result = diff_contract(contract, current_tools) + + if contract_result.status == ContractDriftStatus.CONTRACT_DRIFT: + contract_drifts.append(contract_result) + console.print(f" [red]CONTRACT_DRIFT: {meta.server_name}[/red] - {contract_result.summary()}") + for change in contract_result.breaking_changes: + console.print(f" [red]{change.kind.value}: {change.tool_name}[/red] - {change.detail}") + else: + console.print(f" [green]PASSED: {meta.server_name}[/green]") + + console.print() + + # Fail fast if contract drift detected and fail_on includes it + if contract_drifts and "CONTRACT_DRIFT" in fail_on.upper(): + console.print("[bold red]Aborting: MCP contract drift detected. Fix contracts before running tests.[/bold red]") + console.print("[dim]Accept changes: evalview mcp snapshot --name [/dim]\n") + raise SystemExit(1) + else: + console.print("[dim]--contracts: No contracts found. Create one: evalview mcp snapshot --name [/dim]\n") + # Extract model config (can be string or dict) model_config = config.get("model", {}) if verbose and model_config: @@ -6151,7 +6150,6 @@ def mcp_check(name: str, endpoint: str, timeout: float): console.print(f" Endpoint: {target_endpoint}") # Show snapshot age - from datetime import datetime age = datetime.now() - contract.metadata.snapshot_at age_days = age.days if age_days > 30: @@ -6204,7 +6202,6 @@ def mcp_list(): Shows all saved contracts with metadata. """ from evalview.core.mcp_contract import ContractStore - from datetime import datetime store = ContractStore() contracts = store.list_contracts() @@ -6265,7 +6262,6 @@ def mcp_show(name: str): """ from evalview.core.mcp_contract import ContractStore from rich.panel import Panel - from datetime import datetime store = ContractStore() contract = store.load_contract(name) diff --git a/evalview/core/contract_diff.py b/evalview/core/contract_diff.py index 37d535d..f43dc49 100644 --- a/evalview/core/contract_diff.py +++ b/evalview/core/contract_diff.py @@ -13,7 +13,7 @@ from dataclasses import dataclass, field from enum import Enum -from typing import List, Dict, Any, Optional, Set +from typing import List, Dict, Any, Set import logging from evalview.core.mcp_contract import MCPContract, ToolSchema diff --git a/evalview/core/diff.py b/evalview/core/diff.py index c8a74f6..c623bf5 100644 --- a/evalview/core/diff.py +++ b/evalview/core/diff.py @@ -25,11 +25,12 @@ class DiffStatus(Enum): A test may have additional pass/fail criteria (cost limits, latency thresholds) beyond the diff status. - Four states with clear developer-friendly terminology: + Five states with clear developer-friendly terminology: - PASSED: Output and tools match within tolerance - safe to ship - TOOLS_CHANGED: Tool sequence differs - agent behavior shifted, review before deploy - OUTPUT_CHANGED: Same tools but output differs beyond threshold - review before deploy - REGRESSION: Score dropped significantly - likely a bug, fix before deploy + - CONTRACT_DRIFT: External MCP server interface changed - fix integration before deploy """ PASSED = "passed" # Output and tools match within tolerance From a3c0046d2ef773293290e280de10af84be2d0ea9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 7 Feb 2026 09:39:52 +0000 Subject: [PATCH 3/3] fix: second audit - action.yml double --fail-on, spurious request_id, test gaps 1. action.yml: When both diff=true and contracts=true, --fail-on was appended twice. Refactored to set it once after both flags. 2. mcp_adapter: _discover_tools_http incremented _request_id before the notification. JSON-RPC notifications don't carry an id, so incrementing is semantically wrong (wastes an id for the next call). 3. Tests: Removed unused Path import. Added test for summary() with mixed breaking + informational changes. Added test for duplicate tool names in current_tools (edge case). https://claude.ai/code/session_019CvwYcAoNoitWBdhEYUbxV --- action.yml | 9 +++++++-- evalview/adapters/mcp_adapter.py | 1 - tests/test_mcp_contracts.py | 28 ++++++++++++++++++++++++---- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/action.yml b/action.yml index 640ef5b..aa7b0b5 100644 --- a/action.yml +++ b/action.yml @@ -106,12 +106,17 @@ runs: # Regression detection mode if [ "${{ inputs.diff }}" = "true" ]; then - CMD="$CMD --diff --fail-on ${{ inputs.fail-on }}" + CMD="$CMD --diff" fi # MCP contract drift detection if [ "${{ inputs.contracts }}" = "true" ]; then - CMD="$CMD --contracts --fail-on ${{ inputs.fail-on }}" + CMD="$CMD --contracts" + fi + + # Set fail-on (applies to both diff and contract statuses) + if [ "${{ inputs.diff }}" = "true" ] || [ "${{ inputs.contracts }}" = "true" ]; then + CMD="$CMD --fail-on ${{ inputs.fail-on }}" fi if [ "${{ inputs.generate-report }}" = "true" ]; then diff --git a/evalview/adapters/mcp_adapter.py b/evalview/adapters/mcp_adapter.py index 4abe502..6a68f4b 100644 --- a/evalview/adapters/mcp_adapter.py +++ b/evalview/adapters/mcp_adapter.py @@ -523,7 +523,6 @@ async def _discover_tools_http(self, url: str) -> List[Dict[str, Any]]: raise Exception(f"MCP initialize error: {init_result['error']}") # Send initialized notification (required by MCP protocol) - self._request_id += 1 await client.post( url, json={ diff --git a/tests/test_mcp_contracts.py b/tests/test_mcp_contracts.py index a35355f..5f51c9d 100644 --- a/tests/test_mcp_contracts.py +++ b/tests/test_mcp_contracts.py @@ -3,7 +3,6 @@ import json import pytest from datetime import datetime -from pathlib import Path from evalview.core.mcp_contract import ( ContractStore, @@ -345,12 +344,33 @@ def test_summary_no_changes(self, saved_contract): result = diff_contract(saved_contract, SAMPLE_TOOLS) assert result.summary() == "No changes" - def test_summary_with_changes(self, saved_contract): - """Summary includes counts.""" + def test_summary_with_breaking_only(self, saved_contract): + """Summary with breaking changes only.""" current = [t for t in SAMPLE_TOOLS if t["name"] != "read_file"] result = diff_contract(saved_contract, current) + assert result.summary() == "1 breaking change(s)" + + def test_summary_with_mixed_changes(self, saved_contract): + """Summary with both breaking and informational changes.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + # Remove a tool (breaking) and add one (informational) + current = [t for t in current if t["name"] != "read_file"] + current.append({ + "name": "new_tool", + "description": "A new tool", + "inputSchema": {"type": "object", "properties": {}}, + }) + result = diff_contract(saved_contract, current) summary = result.summary() - assert "breaking" in summary + assert "1 breaking change(s)" in summary + assert "1 informational change(s)" in summary + + def test_duplicate_tool_names_in_current(self, saved_contract): + """Duplicate tool names in current tools are deduplicated silently.""" + current = SAMPLE_TOOLS + [SAMPLE_TOOLS[0]] # Duplicate create_issue + result = diff_contract(saved_contract, current) + # Dict comprehension deduplicates - should still pass + assert result.status == ContractDriftStatus.PASSED def test_empty_snapshot_vs_tools(self): """Empty snapshot vs populated tools shows all as added."""