diff --git a/action.yml b/action.yml index 6d59925..aa7b0b5 100644 --- a/action.yml +++ b/action.yml @@ -37,9 +37,13 @@ inputs: required: false default: 'false' fail-on: - description: 'Comma-separated statuses that fail CI (REGRESSION, TOOLS_CHANGED, OUTPUT_CHANGED)' + description: 'Comma-separated statuses that fail CI (REGRESSION, TOOLS_CHANGED, OUTPUT_CHANGED, CONTRACT_DRIFT)' required: false default: 'REGRESSION' + contracts: + description: 'Check MCP contracts for interface drift before running tests' + required: false + default: 'false' generate-report: description: 'Generate HTML report' required: false @@ -102,7 +106,17 @@ runs: # Regression detection mode if [ "${{ inputs.diff }}" = "true" ]; then - CMD="$CMD --diff --fail-on ${{ inputs.fail-on }}" + CMD="$CMD --diff" + fi + + # MCP contract drift detection + if [ "${{ inputs.contracts }}" = "true" ]; then + CMD="$CMD --contracts" + fi + + # Set fail-on (applies to both diff and contract statuses) + if [ "${{ inputs.diff }}" = "true" ] || [ "${{ inputs.contracts }}" = "true" ]; then + CMD="$CMD --fail-on ${{ inputs.fail-on }}" fi if [ "${{ inputs.generate-report }}" = "true" ]; then diff --git a/docs/MCP_CONTRACTS.md b/docs/MCP_CONTRACTS.md new file mode 100644 index 0000000..dc58f1a --- /dev/null +++ b/docs/MCP_CONTRACTS.md @@ -0,0 +1,228 @@ +# MCP Contract Testing + +Detect when external MCP servers change their interface before your agent breaks. + +## The Problem + +When you use MCP servers you don't own (Scenario 2), the server can change its +tool definitions at any time: rename parameters, remove tools, add required fields. +Your agent tests pass today and fail tomorrow — not because your code changed, +but because the server did. + +## The Solution + +MCP contract testing captures a snapshot of a server's tool definitions and diffs +against it on every CI run. If the interface changed, you know immediately — before +running your full test suite. + +This mirrors EvalView's golden baseline system: +- **Golden traces** detect when your agent's *behavior* drifts +- **MCP contracts** detect when an external server's *interface* drifts + +## Quick Start + +### 1. Snapshot a server + +```bash +evalview mcp snapshot "npx:@modelcontextprotocol/server-github" --name server-github +``` + +Output: +``` +Snapshot saved: .evalview/contracts/server-github.contract.json +Tools discovered: 8 + - create_issue + - list_issues + - create_pull_request + - ... +``` + +### 2. Check for drift + +```bash +evalview mcp check server-github +``` + +If the server changed: +``` +CONTRACT_DRIFT - 2 breaking change(s) + REMOVED: create_pull_request - tool 'create_pull_request' no longer available + CHANGED: list_issues - new required parameter 'owner' +``` + +### 3. Use in CI + +```bash +evalview run tests/ --contracts --fail-on "REGRESSION,CONTRACT_DRIFT" +``` + +The `--contracts` flag checks all saved contracts *before* running tests. +If any contract drifted, the run aborts immediately — no point testing against +a broken interface. + +## CLI Reference + +### `evalview mcp snapshot` + +Capture tool definitions from an MCP server. + +```bash +evalview mcp snapshot --name [--notes "..."] [--timeout 30] +``` + +| Argument | Description | +|----------|-------------| +| `endpoint` | MCP server endpoint (e.g., `npx:@modelcontextprotocol/server-github`) | +| `--name` | Human-readable identifier for this contract (required) | +| `--notes` | Optional notes about this snapshot | +| `--timeout` | Connection timeout in seconds (default: 30) | + +Supports all MCP transport types: +- **stdio**: `"npx:@modelcontextprotocol/server-filesystem /tmp"` +- **HTTP**: `"http://localhost:8080"` +- **Command**: `"stdio:python my_server.py"` + +### `evalview mcp check` + +Compare current server interface against a saved contract. + +```bash +evalview mcp check [--endpoint ] [--timeout 30] +``` + +| Argument | Description | +|----------|-------------| +| `name` | Contract name (from `--name` in snapshot) | +| `--endpoint` | Override endpoint (default: use endpoint from snapshot) | + +Exit codes: +- `0` — No breaking changes +- `1` — Breaking changes detected (CONTRACT_DRIFT) +- `2` — Could not connect to server + +### `evalview mcp list` + +List all saved contracts. + +```bash +evalview mcp list +``` + +### `evalview mcp show` + +Show full details of a contract including all tool schemas. + +```bash +evalview mcp show +``` + +### `evalview mcp delete` + +Remove a contract. + +```bash +evalview mcp delete [--force] +``` + +## Integration with `evalview run` + +The `--contracts` flag adds a pre-flight check to any test run: + +```bash +evalview run tests/ --contracts +``` + +This checks all contracts in `.evalview/contracts/` before running tests. +Combine with `--fail-on CONTRACT_DRIFT` to fail CI on drift: + +```bash +evalview run tests/ --contracts --fail-on "REGRESSION,CONTRACT_DRIFT" +``` + +Or use `--strict` (now includes CONTRACT_DRIFT): + +```bash +evalview run tests/ --contracts --strict +``` + +## GitHub Actions + +```yaml +- name: Run EvalView + uses: hidai25/eval-view@v0.2.1 + with: + diff: true + contracts: true + fail-on: 'REGRESSION,CONTRACT_DRIFT' +``` + +## What Gets Detected + +### Breaking changes (trigger CONTRACT_DRIFT) + +| Change | Example | +|--------|---------| +| Tool removed | `create_pull_request` no longer exists | +| Required parameter added | New required param `owner` on `list_issues` | +| Parameter removed | `repo` param no longer accepted | +| Parameter type changed | `limit` changed from `string` to `integer` | +| Parameter became required | `owner` was optional, now required | + +### Informational changes (logged, don't fail) + +| Change | Example | +|--------|---------| +| New tool added | `merge_pull_request` now available | +| Optional parameter added | New optional param `labels` on `create_issue` | +| Description changed | Tool description updated | + +## Contract File Format + +Contracts are stored as JSON in `.evalview/contracts/`: + +```json +{ + "metadata": { + "server_name": "server-github", + "endpoint": "npx:@modelcontextprotocol/server-github", + "snapshot_at": "2026-02-07T10:30:00", + "protocol_version": "2024-11-05", + "tool_count": 8, + "schema_hash": "a1b2c3d4e5f67890" + }, + "tools": [ + { + "name": "create_issue", + "description": "Create a new issue in a GitHub repository", + "inputSchema": { + "type": "object", + "properties": { + "repo": { "type": "string" }, + "title": { "type": "string" }, + "body": { "type": "string" } + }, + "required": ["repo", "title"] + } + } + ] +} +``` + +Commit these files to your repo so CI can use them. + +## Best Practices + +1. **Snapshot after verifying** — Run your tests first, confirm everything works, + then snapshot. The contract represents a known-good interface. + +2. **Refresh periodically** — If a contract is >30 days old, `evalview mcp check` + will warn you. Re-snapshot to accept intentional changes. + +3. **One contract per server** — Name contracts after the server, not the tools. + `server-github` not `create-issue-tool`. + +4. **Commit contracts** — Store `.evalview/contracts/` in git. They're small JSON + files and CI needs them. + +5. **Check before testing** — Use `--contracts` on `evalview run` so drift is + caught before wasting time on tests that will fail anyway. diff --git a/evalview/adapters/mcp_adapter.py b/evalview/adapters/mcp_adapter.py index 6f49865..6a68f4b 100644 --- a/evalview/adapters/mcp_adapter.py +++ b/evalview/adapters/mcp_adapter.py @@ -449,6 +449,107 @@ def _create_error_trace( trace_context=tracer.build_trace_context(), ) + async def discover_tools(self) -> List[Dict[str, Any]]: + """Discover available tools from an MCP server. + + Connects to the server, initializes the session, and calls tools/list + to retrieve all available tool definitions (name, description, inputSchema). + + Returns: + List of tool definition dicts from the MCP server. + + Raises: + Exception: If connection or discovery fails. + """ + transport, target = self._parse_endpoint() + + if transport == "stdio": + return await self._discover_tools_stdio(target) + else: + return await self._discover_tools_http(target) + + async def _discover_tools_stdio(self, command: str) -> List[Dict[str, Any]]: + """Discover tools via stdio transport.""" + import shlex + + cmd_parts = shlex.split(command) + process = await asyncio.create_subprocess_exec( + *cmd_parts, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + try: + await self._send_request( + process, + "initialize", + { + "protocolVersion": "2024-11-05", + "capabilities": {}, + "clientInfo": {"name": "evalview", "version": "0.1.7"}, + }, + ) + await self._send_notification(process, "notifications/initialized", {}) + + result = await self._send_request(process, "tools/list", {}) + return result.get("tools", []) + finally: + process.terminate() + await process.wait() + + async def _discover_tools_http(self, url: str) -> List[Dict[str, Any]]: + """Discover tools via HTTP transport.""" + import httpx + + async with httpx.AsyncClient(timeout=self.timeout) as client: + # Initialize session + self._request_id += 1 + init_response = await client.post( + url, + json={ + "jsonrpc": "2.0", + "id": self._request_id, + "method": "initialize", + "params": { + "protocolVersion": "2024-11-05", + "capabilities": {}, + "clientInfo": {"name": "evalview", "version": "0.1.7"}, + }, + }, + ) + init_result = init_response.json() + if "error" in init_result: + raise Exception(f"MCP initialize error: {init_result['error']}") + + # Send initialized notification (required by MCP protocol) + await client.post( + url, + json={ + "jsonrpc": "2.0", + "method": "notifications/initialized", + "params": {}, + }, + ) + + # Request tool list + self._request_id += 1 + response = await client.post( + url, + json={ + "jsonrpc": "2.0", + "id": self._request_id, + "method": "tools/list", + "params": {}, + }, + ) + result = response.json() + + if "error" in result: + raise Exception(f"MCP tools/list error: {result['error']}") + + return result.get("result", {}).get("tools", []) + async def health_check(self) -> bool: """Check if MCP server is reachable.""" transport, target = self._parse_endpoint() diff --git a/evalview/cli.py b/evalview/cli.py index c74e777..6797921 100644 --- a/evalview/cli.py +++ b/evalview/cli.py @@ -1360,7 +1360,7 @@ async def _init_wizard_async(dir: str): "--fail-on", type=str, default=None, - help="Comma-separated diff statuses that cause exit code 1 (default: REGRESSION, or from ci.fail_on in config.yaml)", + help="Comma-separated statuses that cause exit code 1: REGRESSION, TOOLS_CHANGED, OUTPUT_CHANGED, CONTRACT_DRIFT (default: REGRESSION)", ) @click.option( "--warn-on", @@ -1401,6 +1401,11 @@ async def _init_wizard_async(dir: str): default=None, help="Filter tests by difficulty level.", ) +@click.option( + "--contracts", + is_flag=True, + help="Check MCP contracts for interface drift before running tests. Fails fast if external servers changed.", +) def run( path: Optional[str], pattern: str, @@ -1432,6 +1437,7 @@ def run( runs: Optional[int], pass_rate: float, difficulty: Optional[str], + contracts: bool, ): """Run test cases against the agent. @@ -1448,7 +1454,7 @@ def run( # Handle --strict flag (overrides config and CLI) if strict: - fail_on = "REGRESSION,TOOLS_CHANGED,OUTPUT_CHANGED" + fail_on = "REGRESSION,TOOLS_CHANGED,OUTPUT_CHANGED,CONTRACT_DRIFT" warn_on = "" asyncio.run(_run_async( @@ -1456,7 +1462,8 @@ def run( sequential, max_workers, max_retries, retry_delay, watch, html_report, summary, coverage, adapter_override=adapter, diff=diff, diff_report=diff_report, fail_on=fail_on, warn_on=warn_on, trace=trace, trace_out=trace_out, - runs=runs, pass_rate=pass_rate, difficulty_filter=difficulty + runs=runs, pass_rate=pass_rate, difficulty_filter=difficulty, + contracts=contracts, )) @@ -1488,6 +1495,7 @@ async def _run_async( runs: Optional[int] = None, pass_rate: float = 0.8, difficulty_filter: Optional[str] = None, + contracts: bool = False, ): """Async implementation of run command.""" import fnmatch @@ -1655,6 +1663,52 @@ async def _run_async( else: warn_on = str(config_warn_on) + # --- MCP Contract Check (runs before tests, fail fast) --- + # Placed after fail_on/warn_on resolution so config.yaml defaults are available. + contract_drifts = [] + if contracts: + from evalview.core.mcp_contract import ContractStore + from evalview.core.contract_diff import diff_contract, ContractDriftStatus + from evalview.adapters.mcp_adapter import MCPAdapter as MCPContractAdapter + + contract_store = ContractStore() + all_contracts = contract_store.list_contracts() + + if all_contracts: + console.print("[cyan]━━━ MCP Contract Check ━━━[/cyan]\n") + + for meta in all_contracts: + contract = contract_store.load_contract(meta.server_name) + if not contract: + continue + + mcp_adapter = MCPContractAdapter(endpoint=contract.metadata.endpoint, timeout=30.0) + try: + current_tools = await mcp_adapter.discover_tools() + except Exception as e: + console.print(f" [yellow]WARN: {meta.server_name}[/yellow] - could not connect: {e}") + continue + + contract_result = diff_contract(contract, current_tools) + + if contract_result.status == ContractDriftStatus.CONTRACT_DRIFT: + contract_drifts.append(contract_result) + console.print(f" [red]CONTRACT_DRIFT: {meta.server_name}[/red] - {contract_result.summary()}") + for change in contract_result.breaking_changes: + console.print(f" [red]{change.kind.value}: {change.tool_name}[/red] - {change.detail}") + else: + console.print(f" [green]PASSED: {meta.server_name}[/green]") + + console.print() + + # Fail fast if contract drift detected and fail_on includes it + if contract_drifts and "CONTRACT_DRIFT" in fail_on.upper(): + console.print("[bold red]Aborting: MCP contract drift detected. Fix contracts before running tests.[/bold red]") + console.print("[dim]Accept changes: evalview mcp snapshot --name [/dim]\n") + raise SystemExit(1) + else: + console.print("[dim]--contracts: No contracts found. Create one: evalview mcp snapshot --name [/dim]\n") + # Extract model config (can be string or dict) model_config = config.get("model", {}) if verbose and model_config: @@ -2807,7 +2861,7 @@ async def update_display(): # Parse fail_on and warn_on into sets fail_statuses = set() warn_statuses = set() - valid_statuses = {"REGRESSION", "TOOLS_CHANGED", "OUTPUT_CHANGED", "PASSED"} + valid_statuses = {"REGRESSION", "TOOLS_CHANGED", "OUTPUT_CHANGED", "PASSED", "CONTRACT_DRIFT"} for s in fail_on.upper().split(","): s = s.strip() @@ -5976,6 +6030,275 @@ def golden_show(test_name: str): console.print() +# ============================================================================ +# MCP Contract Commands +# ============================================================================ + + +@main.group() +def mcp(): + """Manage MCP contracts (detect external server interface drift). + + MCP contracts are snapshots of an external MCP server's tool definitions. + Use them with `evalview run --contracts` to detect when servers change + their interface before your tests break. + + Examples: + evalview mcp snapshot "npx:@modelcontextprotocol/server-github" --name server-github + evalview mcp check server-github + evalview mcp list + """ + pass + + +@mcp.command("snapshot") +@click.argument("endpoint") +@click.option("--name", "-n", required=True, help="Server name (used as contract identifier)") +@click.option("--notes", help="Notes about this snapshot") +@click.option("--timeout", default=30.0, type=float, help="Connection timeout in seconds") +def mcp_snapshot(endpoint: str, name: str, notes: str, timeout: float): + """Snapshot an MCP server's tool definitions as a contract. + + ENDPOINT is the MCP server endpoint (e.g., "npx:@modelcontextprotocol/server-github"). + + Examples: + evalview mcp snapshot "npx:@modelcontextprotocol/server-filesystem /tmp" --name fs-server + evalview mcp snapshot "http://localhost:8080" --name my-server --notes "v2.1 release" + """ + import asyncio + from evalview.adapters.mcp_adapter import MCPAdapter + from evalview.core.mcp_contract import ContractStore + + console.print(f"\n[cyan]━━━ MCP Contract Snapshot ━━━[/cyan]\n") + console.print(f" Server: [bold]{name}[/bold]") + console.print(f" Endpoint: {endpoint}") + console.print() + + adapter = MCPAdapter(endpoint=endpoint, timeout=timeout) + + try: + tools = asyncio.run(adapter.discover_tools()) + except Exception as e: + console.print(f"[red]Failed to connect to MCP server: {e}[/red]") + console.print("[dim]Check that the server is running and the endpoint is correct.[/dim]\n") + raise SystemExit(1) + + if not tools: + console.print("[yellow]Server returned no tools.[/yellow]\n") + raise SystemExit(1) + + store = ContractStore() + + if store.has_contract(name): + if not click.confirm( + f"Contract '{name}' already exists. Overwrite?", + default=False, + ): + console.print("[dim]Cancelled[/dim]\n") + return + + path = store.save_contract( + server_name=name, + endpoint=endpoint, + tools=tools, + notes=notes, + ) + + console.print(f"[green]Snapshot saved: {path}[/green]") + console.print(f" Tools discovered: [bold]{len(tools)}[/bold]") + for tool in tools: + desc = tool.get("description", "") + if len(desc) > 60: + desc = desc[:57] + "..." + console.print(f" [dim]- {tool['name']}[/dim] {desc}") + console.print() + console.print("[dim]Check for drift: evalview mcp check " + name + "[/dim]") + console.print("[dim]Use in CI: evalview run --contracts --fail-on CONTRACT_DRIFT[/dim]\n") + + +@mcp.command("check") +@click.argument("name") +@click.option("--endpoint", help="Override endpoint (default: use endpoint from snapshot)") +@click.option("--timeout", default=30.0, type=float, help="Connection timeout in seconds") +def mcp_check(name: str, endpoint: str, timeout: float): + """Check an MCP server for contract drift. + + NAME is the contract name (from `evalview mcp snapshot --name`). + + Examples: + evalview mcp check server-github + evalview mcp check my-server --endpoint "http://new-host:8080" + """ + import asyncio + from evalview.adapters.mcp_adapter import MCPAdapter + from evalview.core.mcp_contract import ContractStore + from evalview.core.contract_diff import diff_contract, ContractDriftStatus + + store = ContractStore() + contract = store.load_contract(name) + + if not contract: + console.print(f"\n[red]No contract found: {name}[/red]") + console.print("[dim]Create one with: evalview mcp snapshot --name " + name + "[/dim]\n") + raise SystemExit(1) + + target_endpoint = endpoint or contract.metadata.endpoint + adapter = MCPAdapter(endpoint=target_endpoint, timeout=timeout) + + console.print(f"\n[cyan]━━━ MCP Contract Check ━━━[/cyan]\n") + console.print(f" Contract: [bold]{name}[/bold]") + console.print(f" Endpoint: {target_endpoint}") + + # Show snapshot age + age = datetime.now() - contract.metadata.snapshot_at + age_days = age.days + if age_days > 30: + console.print(f" Snapshot age: [yellow]{age_days} days (consider refreshing)[/yellow]") + else: + console.print(f" Snapshot age: [dim]{age_days} day(s)[/dim]") + console.print() + + try: + current_tools = asyncio.run(adapter.discover_tools()) + except Exception as e: + console.print(f"[red]Failed to connect to MCP server: {e}[/red]") + console.print("[dim]The server may be down. Use --endpoint to try a different host.[/dim]\n") + raise SystemExit(2) + + result = diff_contract(contract, current_tools) + + if result.status == ContractDriftStatus.PASSED: + if result.informational_changes: + console.print(f"[green]PASSED[/green] - No breaking changes ({result.summary()})") + console.print() + for change in result.informational_changes: + console.print(f" [dim]+ {change.tool_name}: {change.detail}[/dim]") + else: + console.print("[green]PASSED[/green] - Interface matches snapshot exactly") + console.print() + else: + console.print(f"[red]CONTRACT_DRIFT[/red] - {result.summary()}\n") + + for change in result.breaking_changes: + if change.kind.value == "removed": + console.print(f" [red]REMOVED: {change.tool_name}[/red] - {change.detail}") + else: + console.print(f" [red]CHANGED: {change.tool_name}[/red] - {change.detail}") + + if result.informational_changes: + console.print() + for change in result.informational_changes: + console.print(f" [dim]INFO: {change.tool_name} - {change.detail}[/dim]") + + console.print() + console.print("[dim]To accept the new interface: evalview mcp snapshot " + target_endpoint + " --name " + name + "[/dim]\n") + raise SystemExit(1) + + +@mcp.command("list") +def mcp_list(): + """List all MCP contract snapshots. + + Shows all saved contracts with metadata. + """ + from evalview.core.mcp_contract import ContractStore + + store = ContractStore() + contracts = store.list_contracts() + + if not contracts: + console.print("\n[yellow]No MCP contracts found.[/yellow]") + console.print("[dim]Create one: evalview mcp snapshot --name [/dim]\n") + return + + console.print("\n[cyan]━━━ MCP Contracts ━━━[/cyan]\n") + + for c in sorted(contracts, key=lambda x: x.server_name): + age = datetime.now() - c.snapshot_at + age_str = f"{age.days}d ago" if age.days > 0 else "today" + + console.print(f" [bold]{c.server_name}[/bold]") + console.print(f" [dim]Endpoint: {c.endpoint}[/dim]") + console.print(f" [dim]Tools: {c.tool_count} | Snapshot: {age_str}[/dim]") + if c.notes: + console.print(f" [dim]Notes: {c.notes}[/dim]") + console.print() + + console.print(f"[dim]Total: {len(contracts)} contract(s)[/dim]") + console.print("[dim]Check for drift: evalview mcp check [/dim]\n") + + +@mcp.command("delete") +@click.argument("name") +@click.option("--force", "-f", is_flag=True, help="Skip confirmation") +def mcp_delete(name: str, force: bool): + """Delete an MCP contract snapshot. + + NAME is the contract name to delete. + """ + from evalview.core.mcp_contract import ContractStore + + store = ContractStore() + + if not store.has_contract(name): + console.print(f"\n[yellow]No contract found: {name}[/yellow]\n") + return + + if not force: + if not click.confirm(f"Delete contract '{name}'?", default=False): + console.print("[dim]Cancelled[/dim]") + return + + store.delete_contract(name) + console.print(f"\n[green]Deleted contract: {name}[/green]\n") + + +@mcp.command("show") +@click.argument("name") +def mcp_show(name: str): + """Show details of an MCP contract snapshot. + + NAME is the contract name. + """ + from evalview.core.mcp_contract import ContractStore + from rich.panel import Panel + + store = ContractStore() + contract = store.load_contract(name) + + if not contract: + console.print(f"\n[yellow]No contract found: {name}[/yellow]") + console.print("[dim]Create one: evalview mcp snapshot --name " + name + "[/dim]\n") + return + + meta = contract.metadata + age = datetime.now() - meta.snapshot_at + + console.print(f"\n[cyan]━━━ MCP Contract: {meta.server_name} ━━━[/cyan]\n") + console.print(f" Endpoint: {meta.endpoint}") + console.print(f" Snapshot: {meta.snapshot_at.strftime('%Y-%m-%d %H:%M')} ({age.days}d ago)") + console.print(f" Protocol: {meta.protocol_version}") + console.print(f" Schema hash: {meta.schema_hash}") + if meta.notes: + console.print(f" Notes: {meta.notes}") + console.print() + + console.print(f"[bold]Tools ({meta.tool_count}):[/bold]\n") + + for tool in contract.tools: + console.print(f" [bold]{tool.name}[/bold]") + if tool.description: + console.print(f" {tool.description}") + if tool.inputSchema.get("properties"): + props = tool.inputSchema["properties"] + required = set(tool.inputSchema.get("required", [])) + for pname, pdef in props.items(): + ptype = pdef.get("type", "any") + req_marker = " [red]*[/red]" if pname in required else "" + console.print(f" [dim]- {pname}: {ptype}{req_marker}[/dim]") + console.print() + + @main.command() @click.option( "--provider", diff --git a/evalview/core/contract_diff.py b/evalview/core/contract_diff.py new file mode 100644 index 0000000..f43dc49 --- /dev/null +++ b/evalview/core/contract_diff.py @@ -0,0 +1,246 @@ +"""Contract diff engine for detecting MCP server interface drift. + +Compares a saved contract snapshot against the current tool definitions +from an MCP server. Detects: + - REMOVED tools (breaking) + - ADDED tools (informational) + - CHANGED schemas: new required params, renamed params, type changes (breaking) + - Description-only changes (informational, ignored by default) + +This is the mirror of diff.py: diff.py compares agent behavior traces, +contract_diff.py compares server interface schemas. +""" + +from dataclasses import dataclass, field +from enum import Enum +from typing import List, Dict, Any, Set +import logging + +from evalview.core.mcp_contract import MCPContract, ToolSchema + +logger = logging.getLogger(__name__) + + +class ContractDriftStatus(Enum): + """Result of comparing current tools against a contract snapshot. + + Two states: + - PASSED: Interface matches snapshot (no breaking changes). + - CONTRACT_DRIFT: Breaking changes detected (tools removed, schemas changed). + """ + + PASSED = "passed" + CONTRACT_DRIFT = "contract_drift" + + +class ChangeKind(Enum): + """Kind of change detected in a tool schema.""" + + REMOVED = "removed" # Tool no longer exists (breaking) + ADDED = "added" # New tool available (informational) + PARAM_ADDED_REQ = "param_added_required" # New required param (breaking) + PARAM_REMOVED = "param_removed" # Param removed (breaking) + PARAM_TYPE_CHANGED = "param_type_changed" # Param type changed (breaking) + PARAM_ADDED_OPT = "param_added_optional" # New optional param (safe) + DESCRIPTION_CHANGED = "description_changed" # Description changed (info) + + +# Changes that constitute a breaking contract drift +BREAKING_CHANGES: Set[ChangeKind] = { + ChangeKind.REMOVED, + ChangeKind.PARAM_ADDED_REQ, + ChangeKind.PARAM_REMOVED, + ChangeKind.PARAM_TYPE_CHANGED, +} + + +@dataclass +class ToolChange: + """A single change detected in a tool's schema.""" + + tool_name: str + kind: ChangeKind + detail: str + + @property + def is_breaking(self) -> bool: + return self.kind in BREAKING_CHANGES + + +@dataclass +class ContractDiff: + """Complete diff between a contract snapshot and current server tools.""" + + server_name: str + changes: List[ToolChange] = field(default_factory=list) + snapshot_tool_count: int = 0 + current_tool_count: int = 0 + + @property + def has_breaking_changes(self) -> bool: + return any(c.is_breaking for c in self.changes) + + @property + def status(self) -> ContractDriftStatus: + if self.has_breaking_changes: + return ContractDriftStatus.CONTRACT_DRIFT + return ContractDriftStatus.PASSED + + @property + def breaking_changes(self) -> List[ToolChange]: + return [c for c in self.changes if c.is_breaking] + + @property + def informational_changes(self) -> List[ToolChange]: + return [c for c in self.changes if not c.is_breaking] + + def summary(self) -> str: + if not self.changes: + return "No changes" + + breaking = len(self.breaking_changes) + info = len(self.informational_changes) + parts = [] + if breaking: + parts.append(f"{breaking} breaking change(s)") + if info: + parts.append(f"{info} informational change(s)") + return ", ".join(parts) + + +def _get_required_params(schema: Dict[str, Any]) -> Set[str]: + """Extract required parameter names from a JSON Schema.""" + return set(schema.get("required", [])) + + +def _get_all_params(schema: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: + """Extract all parameter definitions from a JSON Schema.""" + return schema.get("properties", {}) + + +def _diff_tool_schema( + tool_name: str, + snapshot: ToolSchema, + current: ToolSchema, +) -> List[ToolChange]: + """Compare two versions of the same tool's schema.""" + changes: List[ToolChange] = [] + + # Description change (informational) + if snapshot.description != current.description: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.DESCRIPTION_CHANGED, + detail=( + f"description changed from " + f"'{snapshot.description[:60]}' to '{current.description[:60]}'" + ), + )) + + snap_schema = snapshot.inputSchema + curr_schema = current.inputSchema + + snap_params = _get_all_params(snap_schema) + curr_params = _get_all_params(curr_schema) + snap_required = _get_required_params(snap_schema) + curr_required = _get_required_params(curr_schema) + + snap_names = set(snap_params.keys()) + curr_names = set(curr_params.keys()) + + # Removed parameters (breaking) + for name in snap_names - curr_names: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.PARAM_REMOVED, + detail=f"parameter '{name}' removed", + )) + + # Added parameters + for name in curr_names - snap_names: + if name in curr_required: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.PARAM_ADDED_REQ, + detail=f"new required parameter '{name}'", + )) + else: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.PARAM_ADDED_OPT, + detail=f"new optional parameter '{name}'", + )) + + # Changed parameters (type changes) + for name in snap_names & curr_names: + snap_type = snap_params[name].get("type") + curr_type = curr_params[name].get("type") + if snap_type != curr_type: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.PARAM_TYPE_CHANGED, + detail=f"parameter '{name}' type changed from '{snap_type}' to '{curr_type}'", + )) + + # Parameter became required (breaking - it's a new constraint) + if name not in snap_required and name in curr_required: + changes.append(ToolChange( + tool_name=tool_name, + kind=ChangeKind.PARAM_ADDED_REQ, + detail=f"parameter '{name}' became required", + )) + + return changes + + +def diff_contract( + contract: MCPContract, + current_tools: List[Dict[str, Any]], +) -> ContractDiff: + """Compare a saved contract against current tool definitions. + + Args: + contract: The saved contract snapshot. + current_tools: Current tool definitions from the MCP server. + + Returns: + ContractDiff with all detected changes. + """ + current_schemas = { + t["name"]: ToolSchema.model_validate(t) for t in current_tools + } + snapshot_schemas = {t.name: t for t in contract.tools} + + changes: List[ToolChange] = [] + + # Removed tools (in snapshot but not in current) + for name in snapshot_schemas: + if name not in current_schemas: + changes.append(ToolChange( + tool_name=name, + kind=ChangeKind.REMOVED, + detail=f"tool '{name}' no longer available", + )) + + # Added tools (in current but not in snapshot) + for name in current_schemas: + if name not in snapshot_schemas: + changes.append(ToolChange( + tool_name=name, + kind=ChangeKind.ADDED, + detail=f"new tool '{name}' available", + )) + + # Changed tools (in both) + for name in snapshot_schemas.keys() & current_schemas.keys(): + tool_changes = _diff_tool_schema( + name, snapshot_schemas[name], current_schemas[name] + ) + changes.extend(tool_changes) + + return ContractDiff( + server_name=contract.metadata.server_name, + changes=changes, + snapshot_tool_count=len(snapshot_schemas), + current_tool_count=len(current_schemas), + ) diff --git a/evalview/core/diff.py b/evalview/core/diff.py index 95f2f0a..c623bf5 100644 --- a/evalview/core/diff.py +++ b/evalview/core/diff.py @@ -25,17 +25,19 @@ class DiffStatus(Enum): A test may have additional pass/fail criteria (cost limits, latency thresholds) beyond the diff status. - Four states with clear developer-friendly terminology: + Five states with clear developer-friendly terminology: - PASSED: Output and tools match within tolerance - safe to ship - TOOLS_CHANGED: Tool sequence differs - agent behavior shifted, review before deploy - OUTPUT_CHANGED: Same tools but output differs beyond threshold - review before deploy - REGRESSION: Score dropped significantly - likely a bug, fix before deploy + - CONTRACT_DRIFT: External MCP server interface changed - fix integration before deploy """ PASSED = "passed" # Output and tools match within tolerance TOOLS_CHANGED = "tools_changed" # Tool sequence differs from golden OUTPUT_CHANGED = "output_changed" # Output differs beyond similarity threshold REGRESSION = "regression" # Score dropped >5 points from golden + CONTRACT_DRIFT = "contract_drift" # External MCP server interface changed # Alias for backwards compatibility diff --git a/evalview/core/mcp_contract.py b/evalview/core/mcp_contract.py new file mode 100644 index 0000000..56aa890 --- /dev/null +++ b/evalview/core/mcp_contract.py @@ -0,0 +1,152 @@ +"""MCP contract storage and management. + +MCP contracts are snapshots of an external MCP server's tool definitions. +When running tests with --contracts, current tool definitions are compared +against the snapshot to detect interface drift before tests execute. + +This is the mirror of golden traces: golden traces detect when YOUR agent +drifts, contracts detect when an EXTERNAL server drifts underneath you. + +Storage format: + .evalview/contracts/ + .contract.json # The tool schema snapshot +""" + +import json +import hashlib +from pathlib import Path +from datetime import datetime +from typing import Optional, List, Dict, Any +from pydantic import BaseModel, Field +import logging + +logger = logging.getLogger(__name__) + + +class ToolSchema(BaseModel): + """Schema for a single MCP tool.""" + + name: str + description: str = "" + inputSchema: Dict[str, Any] = Field(default_factory=dict) + + +class ContractMetadata(BaseModel): + """Metadata about a contract snapshot.""" + + server_name: str + endpoint: str + snapshot_at: datetime + protocol_version: str = "2024-11-05" + tool_count: int = 0 + notes: Optional[str] = None + schema_hash: str = "" # Hash of tool schemas for quick comparison + + +class MCPContract(BaseModel): + """A contract snapshot from an MCP server.""" + + metadata: ContractMetadata + tools: List[ToolSchema] = Field(default_factory=list) + + @property + def tool_names(self) -> List[str]: + return [t.name for t in self.tools] + + +class ContractStore: + """Manages MCP contract storage and retrieval.""" + + def __init__(self, base_path: Optional[Path] = None): + self.base_path = base_path or Path(".") + self.contracts_dir = self.base_path / ".evalview" / "contracts" + + def _safe_name(self, server_name: str) -> str: + return "".join(c if c.isalnum() or c in "._-" else "_" for c in server_name) + + def _get_contract_path(self, server_name: str) -> Path: + return self.contracts_dir / f"{self._safe_name(server_name)}.contract.json" + + def _hash_schemas(self, tools: List[Dict[str, Any]]) -> str: + """Hash tool schemas for quick drift comparison.""" + canonical = json.dumps(tools, sort_keys=True, default=str) + return hashlib.sha256(canonical.encode()).hexdigest()[:16] + + def save_contract( + self, + server_name: str, + endpoint: str, + tools: List[Dict[str, Any]], + notes: Optional[str] = None, + ) -> Path: + """Save a tool schema snapshot as a contract. + + Args: + server_name: Human-readable server identifier. + endpoint: The MCP server endpoint used for discovery. + tools: Raw tool definitions from tools/list response. + notes: Optional notes about this snapshot. + + Returns: + Path to saved contract file. + """ + self.contracts_dir.mkdir(parents=True, exist_ok=True) + + tool_schemas = [ToolSchema.model_validate(t) for t in tools] + + contract = MCPContract( + metadata=ContractMetadata( + server_name=server_name, + endpoint=endpoint, + snapshot_at=datetime.now(), + tool_count=len(tool_schemas), + notes=notes, + schema_hash=self._hash_schemas(tools), + ), + tools=tool_schemas, + ) + + contract_path = self._get_contract_path(server_name) + with open(contract_path, "w") as f: + f.write(contract.model_dump_json(indent=2)) + + logger.info(f"Saved contract: {contract_path}") + return contract_path + + def load_contract(self, server_name: str) -> Optional[MCPContract]: + """Load a contract by server name.""" + contract_path = self._get_contract_path(server_name) + if not contract_path.exists(): + return None + + with open(contract_path) as f: + data = json.load(f) + + return MCPContract.model_validate(data) + + def has_contract(self, server_name: str) -> bool: + return self._get_contract_path(server_name).exists() + + def list_contracts(self) -> List[ContractMetadata]: + """List all saved contracts.""" + if not self.contracts_dir.exists(): + return [] + + results = [] + for path in self.contracts_dir.glob("*.contract.json"): + try: + with open(path) as f: + data = json.load(f) + results.append(ContractMetadata.model_validate(data["metadata"])) + except Exception as e: + logger.warning(f"Failed to load contract {path}: {e}") + + return results + + def delete_contract(self, server_name: str) -> bool: + """Delete a contract.""" + contract_path = self._get_contract_path(server_name) + if contract_path.exists(): + contract_path.unlink() + return True + return False diff --git a/examples/mcp/contract-testing.yaml b/examples/mcp/contract-testing.yaml new file mode 100644 index 0000000..7caad1d --- /dev/null +++ b/examples/mcp/contract-testing.yaml @@ -0,0 +1,42 @@ +# MCP Contract Testing Example +# +# This demonstrates evaluating an agent that uses an external MCP server +# you don't control (Scenario 2). The workflow: +# +# 1. Snapshot the server: +# evalview mcp snapshot "npx:@modelcontextprotocol/server-filesystem /tmp" --name fs-server +# +# 2. Run this test with contract checking: +# evalview run examples/mcp/contract-testing.yaml --contracts --fail-on "REGRESSION,CONTRACT_DRIFT" +# +# 3. If the server's tools change, the run fails before this test even executes. +# +# This test validates that an AI agent correctly uses filesystem tools. +# The contract ensures those tools still exist with the expected interface. + +name: "Agent uses filesystem MCP tools" +description: "Test that the agent calls the right MCP tools for a file read task" + +adapter: mcp +endpoint: "npx:@modelcontextprotocol/server-filesystem /tmp" + +# Reference the contract for this server (checked by --contracts flag) +# If the contract drifted, this test won't even run — saving time and +# giving a clear error message instead of a confusing tool-not-found failure. + +input: + query: "read_file" + context: + arguments: + path: "/tmp/evalview-contract-test.txt" + +expected: + tools: + - "read_file" + output: + contains: + - "hello" + +thresholds: + min_score: 50 + max_latency: 10000 diff --git a/tests/test_mcp_contracts.py b/tests/test_mcp_contracts.py new file mode 100644 index 0000000..5f51c9d --- /dev/null +++ b/tests/test_mcp_contracts.py @@ -0,0 +1,495 @@ +"""Tests for MCP contract testing: storage, diff engine, and integration.""" + +import json +import pytest +from datetime import datetime + +from evalview.core.mcp_contract import ( + ContractStore, + MCPContract, + ContractMetadata, + ToolSchema, +) +from evalview.core.contract_diff import ( + diff_contract, + ContractDiff, + ContractDriftStatus, + ChangeKind, + ToolChange, + BREAKING_CHANGES, + _diff_tool_schema, +) + + +# ============================================================================ +# Test Data +# ============================================================================ + +SAMPLE_TOOLS = [ + { + "name": "create_issue", + "description": "Create a new issue in a GitHub repository", + "inputSchema": { + "type": "object", + "properties": { + "repo": {"type": "string"}, + "title": {"type": "string"}, + "body": {"type": "string"}, + }, + "required": ["repo", "title"], + }, + }, + { + "name": "list_issues", + "description": "List issues in a repository", + "inputSchema": { + "type": "object", + "properties": { + "repo": {"type": "string"}, + "state": {"type": "string"}, + }, + "required": ["repo"], + }, + }, + { + "name": "read_file", + "description": "Read a file from the filesystem", + "inputSchema": { + "type": "object", + "properties": { + "path": {"type": "string"}, + }, + "required": ["path"], + }, + }, +] + + +@pytest.fixture +def contract_store(tmp_path): + """Create a ContractStore using a temp directory.""" + return ContractStore(base_path=tmp_path) + + +@pytest.fixture +def saved_contract(contract_store): + """Create and save a sample contract.""" + contract_store.save_contract( + server_name="test-server", + endpoint="npx:@test/server", + tools=SAMPLE_TOOLS, + notes="Test snapshot", + ) + return contract_store.load_contract("test-server") + + +# ============================================================================ +# ContractStore Tests +# ============================================================================ + + +class TestContractStore: + """Tests for MCP contract storage.""" + + def test_save_and_load(self, contract_store): + """Save a contract and load it back.""" + path = contract_store.save_contract( + server_name="my-server", + endpoint="http://localhost:8080", + tools=SAMPLE_TOOLS, + notes="Initial snapshot", + ) + + assert path.exists() + assert path.suffix == ".json" + + loaded = contract_store.load_contract("my-server") + assert loaded is not None + assert loaded.metadata.server_name == "my-server" + assert loaded.metadata.endpoint == "http://localhost:8080" + assert loaded.metadata.tool_count == 3 + assert loaded.metadata.notes == "Initial snapshot" + assert len(loaded.tools) == 3 + + def test_tool_names(self, saved_contract): + """Contract exposes tool names.""" + assert saved_contract.tool_names == ["create_issue", "list_issues", "read_file"] + + def test_load_nonexistent(self, contract_store): + """Loading a nonexistent contract returns None.""" + assert contract_store.load_contract("does-not-exist") is None + + def test_has_contract(self, contract_store): + """has_contract returns correct boolean.""" + assert not contract_store.has_contract("my-server") + + contract_store.save_contract( + server_name="my-server", + endpoint="http://localhost:8080", + tools=SAMPLE_TOOLS, + ) + + assert contract_store.has_contract("my-server") + + def test_list_contracts(self, contract_store): + """List all saved contracts.""" + assert contract_store.list_contracts() == [] + + contract_store.save_contract("server-a", "http://a", SAMPLE_TOOLS[:1]) + contract_store.save_contract("server-b", "http://b", SAMPLE_TOOLS[:2]) + + contracts = contract_store.list_contracts() + assert len(contracts) == 2 + names = {c.server_name for c in contracts} + assert names == {"server-a", "server-b"} + + def test_delete_contract(self, contract_store): + """Delete a contract.""" + contract_store.save_contract("my-server", "http://localhost", SAMPLE_TOOLS) + assert contract_store.has_contract("my-server") + + result = contract_store.delete_contract("my-server") + assert result is True + assert not contract_store.has_contract("my-server") + + def test_delete_nonexistent(self, contract_store): + """Deleting a nonexistent contract returns False.""" + assert contract_store.delete_contract("nope") is False + + def test_overwrite(self, contract_store): + """Overwriting a contract replaces it.""" + contract_store.save_contract("my-server", "http://old", SAMPLE_TOOLS[:1]) + contract_store.save_contract("my-server", "http://new", SAMPLE_TOOLS[:2]) + + loaded = contract_store.load_contract("my-server") + assert loaded.metadata.endpoint == "http://new" + assert loaded.metadata.tool_count == 2 + + def test_schema_hash_changes_on_different_tools(self, contract_store): + """Different tool sets produce different schema hashes.""" + contract_store.save_contract("server-a", "http://a", SAMPLE_TOOLS[:1]) + contract_store.save_contract("server-b", "http://b", SAMPLE_TOOLS[:2]) + + a = contract_store.load_contract("server-a") + b = contract_store.load_contract("server-b") + assert a.metadata.schema_hash != b.metadata.schema_hash + + def test_safe_name_sanitization(self, contract_store): + """Server names with special chars are sanitized for filesystem.""" + contract_store.save_contract( + "my server/with:special chars!", + "http://x", + SAMPLE_TOOLS[:1], + ) + assert contract_store.has_contract("my server/with:special chars!") + + def test_metadata_timestamp(self, contract_store): + """Saved contract has a valid timestamp.""" + contract_store.save_contract("ts-test", "http://x", SAMPLE_TOOLS[:1]) + loaded = contract_store.load_contract("ts-test") + assert isinstance(loaded.metadata.snapshot_at, datetime) + + +# ============================================================================ +# Contract Diff Engine Tests +# ============================================================================ + + +class TestContractDiff: + """Tests for the schema diff engine.""" + + def test_no_changes(self, saved_contract): + """Identical tools produce no changes.""" + result = diff_contract(saved_contract, SAMPLE_TOOLS) + + assert result.status == ContractDriftStatus.PASSED + assert result.changes == [] + assert not result.has_breaking_changes + assert result.summary() == "No changes" + + def test_tool_removed(self, saved_contract): + """Removing a tool is a breaking change.""" + current = [t for t in SAMPLE_TOOLS if t["name"] != "read_file"] + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert result.has_breaking_changes + + removed = [c for c in result.changes if c.kind == ChangeKind.REMOVED] + assert len(removed) == 1 + assert removed[0].tool_name == "read_file" + assert removed[0].is_breaking + + def test_tool_added(self, saved_contract): + """Adding a tool is informational (not breaking).""" + current = SAMPLE_TOOLS + [{ + "name": "delete_file", + "description": "Delete a file", + "inputSchema": {"type": "object", "properties": {}}, + }] + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.PASSED # Not breaking + assert len(result.changes) == 1 + assert result.changes[0].kind == ChangeKind.ADDED + assert result.changes[0].tool_name == "delete_file" + assert not result.changes[0].is_breaking + + def test_required_param_added(self, saved_contract): + """Adding a new required parameter is breaking.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + current[0]["inputSchema"]["properties"]["owner"] = {"type": "string"} + current[0]["inputSchema"]["required"].append("owner") + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + breaking = result.breaking_changes + assert any( + c.kind == ChangeKind.PARAM_ADDED_REQ and "owner" in c.detail + for c in breaking + ) + + def test_optional_param_added(self, saved_contract): + """Adding a new optional parameter is not breaking.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + current[0]["inputSchema"]["properties"]["labels"] = {"type": "array"} + # Not added to "required" + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.PASSED + info = result.informational_changes + assert any(c.kind == ChangeKind.PARAM_ADDED_OPT for c in info) + + def test_param_removed(self, saved_contract): + """Removing a parameter is breaking.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + del current[0]["inputSchema"]["properties"]["body"] + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert any( + c.kind == ChangeKind.PARAM_REMOVED and "body" in c.detail + for c in result.breaking_changes + ) + + def test_param_type_changed(self, saved_contract): + """Changing a parameter type is breaking.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + current[1]["inputSchema"]["properties"]["repo"]["type"] = "integer" + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert any( + c.kind == ChangeKind.PARAM_TYPE_CHANGED + for c in result.breaking_changes + ) + + def test_description_changed(self, saved_contract): + """Description change is informational.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + current[0]["description"] = "Updated description for create_issue" + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.PASSED + assert any(c.kind == ChangeKind.DESCRIPTION_CHANGED for c in result.changes) + + def test_param_became_required(self, saved_contract): + """A parameter becoming required is breaking.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + # "body" was optional, make it required + current[0]["inputSchema"]["required"].append("body") + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert any( + c.kind == ChangeKind.PARAM_ADDED_REQ and "body" in c.detail + and "became required" in c.detail + for c in result.breaking_changes + ) + + def test_multiple_changes(self, saved_contract): + """Multiple changes across tools are all detected.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + # Remove read_file + current = [t for t in current if t["name"] != "read_file"] + # Add required param to list_issues + current[1]["inputSchema"]["properties"]["owner"] = {"type": "string"} + current[1]["inputSchema"]["required"].append("owner") + # Add new tool + current.append({ + "name": "merge_pr", + "description": "Merge a pull request", + "inputSchema": {"type": "object", "properties": {}}, + }) + + result = diff_contract(saved_contract, current) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert result.snapshot_tool_count == 3 + assert result.current_tool_count == 3 # removed 1, added 1 + + kinds = {c.kind for c in result.changes} + assert ChangeKind.REMOVED in kinds + assert ChangeKind.ADDED in kinds + assert ChangeKind.PARAM_ADDED_REQ in kinds + + def test_summary_no_changes(self, saved_contract): + """Summary for no changes.""" + result = diff_contract(saved_contract, SAMPLE_TOOLS) + assert result.summary() == "No changes" + + def test_summary_with_breaking_only(self, saved_contract): + """Summary with breaking changes only.""" + current = [t for t in SAMPLE_TOOLS if t["name"] != "read_file"] + result = diff_contract(saved_contract, current) + assert result.summary() == "1 breaking change(s)" + + def test_summary_with_mixed_changes(self, saved_contract): + """Summary with both breaking and informational changes.""" + current = json.loads(json.dumps(SAMPLE_TOOLS)) + # Remove a tool (breaking) and add one (informational) + current = [t for t in current if t["name"] != "read_file"] + current.append({ + "name": "new_tool", + "description": "A new tool", + "inputSchema": {"type": "object", "properties": {}}, + }) + result = diff_contract(saved_contract, current) + summary = result.summary() + assert "1 breaking change(s)" in summary + assert "1 informational change(s)" in summary + + def test_duplicate_tool_names_in_current(self, saved_contract): + """Duplicate tool names in current tools are deduplicated silently.""" + current = SAMPLE_TOOLS + [SAMPLE_TOOLS[0]] # Duplicate create_issue + result = diff_contract(saved_contract, current) + # Dict comprehension deduplicates - should still pass + assert result.status == ContractDriftStatus.PASSED + + def test_empty_snapshot_vs_tools(self): + """Empty snapshot vs populated tools shows all as added.""" + contract = MCPContract( + metadata=ContractMetadata( + server_name="empty", + endpoint="http://x", + snapshot_at=datetime.now(), + ), + tools=[], + ) + result = diff_contract(contract, SAMPLE_TOOLS) + + assert result.status == ContractDriftStatus.PASSED # All added = not breaking + assert all(c.kind == ChangeKind.ADDED for c in result.changes) + assert len(result.changes) == 3 + + def test_all_tools_removed(self, saved_contract): + """All tools removed is breaking.""" + result = diff_contract(saved_contract, []) + + assert result.status == ContractDriftStatus.CONTRACT_DRIFT + assert all(c.kind == ChangeKind.REMOVED for c in result.changes) + assert len(result.changes) == 3 + + +# ============================================================================ +# Breaking Changes Classification Tests +# ============================================================================ + + +class TestBreakingChanges: + """Test that the right change kinds are classified as breaking.""" + + def test_breaking_set(self): + """Verify the breaking changes set.""" + assert ChangeKind.REMOVED in BREAKING_CHANGES + assert ChangeKind.PARAM_ADDED_REQ in BREAKING_CHANGES + assert ChangeKind.PARAM_REMOVED in BREAKING_CHANGES + assert ChangeKind.PARAM_TYPE_CHANGED in BREAKING_CHANGES + + def test_non_breaking_set(self): + """Verify non-breaking changes.""" + assert ChangeKind.ADDED not in BREAKING_CHANGES + assert ChangeKind.PARAM_ADDED_OPT not in BREAKING_CHANGES + assert ChangeKind.DESCRIPTION_CHANGED not in BREAKING_CHANGES + + def test_tool_change_is_breaking(self): + """ToolChange.is_breaking reflects its kind.""" + breaking = ToolChange("t", ChangeKind.REMOVED, "gone") + assert breaking.is_breaking + + safe = ToolChange("t", ChangeKind.ADDED, "new") + assert not safe.is_breaking + + +# ============================================================================ +# DiffStatus Integration Tests +# ============================================================================ + + +class TestDiffStatusIntegration: + """Test that CONTRACT_DRIFT integrates with the existing DiffStatus.""" + + def test_contract_drift_in_diff_status(self): + """CONTRACT_DRIFT is available in the main DiffStatus enum.""" + from evalview.core.diff import DiffStatus + + assert hasattr(DiffStatus, "CONTRACT_DRIFT") + assert DiffStatus.CONTRACT_DRIFT.value == "contract_drift" + + def test_contract_drift_status_enum(self): + """ContractDriftStatus has the expected values.""" + assert ContractDriftStatus.PASSED.value == "passed" + assert ContractDriftStatus.CONTRACT_DRIFT.value == "contract_drift" + + +# ============================================================================ +# Tool Schema Diff Tests (unit level) +# ============================================================================ + + +class TestToolSchemaDiff: + """Unit tests for _diff_tool_schema.""" + + def test_identical_schemas(self): + """No changes for identical schemas.""" + schema = ToolSchema( + name="test", + description="A test tool", + inputSchema={ + "type": "object", + "properties": {"x": {"type": "string"}}, + "required": ["x"], + }, + ) + changes = _diff_tool_schema("test", schema, schema) + assert changes == [] + + def test_empty_schemas(self): + """No changes for two empty schemas.""" + schema = ToolSchema(name="test", description="", inputSchema={}) + changes = _diff_tool_schema("test", schema, schema) + assert changes == [] + + def test_schema_with_no_properties(self): + """Handle schemas without properties gracefully.""" + snap = ToolSchema( + name="test", + description="old", + inputSchema={"type": "object"}, + ) + curr = ToolSchema( + name="test", + description="new", + inputSchema={"type": "object", "properties": {"x": {"type": "string"}}}, + ) + changes = _diff_tool_schema("test", snap, curr) + # Should detect description change and new optional param + kinds = {c.kind for c in changes} + assert ChangeKind.DESCRIPTION_CHANGED in kinds + assert ChangeKind.PARAM_ADDED_OPT in kinds