diff --git a/docs/hooks.md b/docs/hooks.md index ed9e73d..9fdd905 100644 --- a/docs/hooks.md +++ b/docs/hooks.md @@ -19,7 +19,7 @@ Hooks fire synchronously during the agent loop and can: | `BeforeLLMCall` | Before each LLM API call | `Messages`, `TaskID`, `CorrelationID` | | `AfterLLMCall` | After each LLM API call | `Messages`, `Response`, `TaskID`, `CorrelationID` | | `BeforeToolExec` | Before each tool execution | `ToolName`, `ToolInput`, `TaskID`, `CorrelationID` | -| `AfterToolExec` | After each tool execution | `ToolName`, `ToolInput`, `ToolOutput`, `Error`, `TaskID`, `CorrelationID` | +| `AfterToolExec` | After each tool execution | `ToolName`, `ToolInput`, `ToolOutput` (mutable), `Error`, `TaskID`, `CorrelationID` | | `OnError` | When an LLM call fails | `Error`, `TaskID`, `CorrelationID` | | `OnProgress` | During tool execution | `Phase`, `ToolName`, `StatusMessage` | @@ -73,6 +73,19 @@ hooks.Register(engine.BeforeToolExec, func(ctx context.Context, hctx *engine.Hoo }) ``` +## Output Redaction + +`AfterToolExec` hooks can modify `hctx.ToolOutput` to redact sensitive content before it enters the LLM context. The agent loop reads back `ToolOutput` from the `HookContext` after all hooks fire. + +The runner registers a guardrail hook that scans tool output for secrets and PII patterns. See [Tool Output Scanning](security/guardrails.md#tool-output-scanning) for details. + +```go +hooks.Register(engine.AfterToolExec, func(ctx context.Context, hctx *engine.HookContext) error { + hctx.ToolOutput = strings.ReplaceAll(hctx.ToolOutput, secret, "[REDACTED]") + return nil +}) +``` + ## Audit Logging The runner registers `AfterLLMCall` hooks that emit structured audit events for each LLM interaction. 
Audit fields include: diff --git a/docs/memory.md b/docs/memory.md index a2e8762..868ee03 100644 --- a/docs/memory.md +++ b/docs/memory.md @@ -17,6 +17,18 @@ memory: - Sessions are saved as JSON files with atomic writes (temp file + fsync + rename) - Automatic cleanup of sessions older than 7 days at startup - Session recovery on subsequent requests (disk snapshot supersedes task history) +- **Session max age** (default 30 minutes): stale sessions are discarded on recovery to prevent poisoned error context from blocking tool retries. When an LLM accumulates repeated tool failures in a session, it may stop retrying altogether. The max age ensures these poisoned sessions expire, giving the agent a fresh start. + +Configure via `forge.yaml` or environment variable: + +```yaml +memory: + session_max_age: "30m" # default; use "1h", "15m", etc. +``` + +```bash +export FORGE_SESSION_MAX_AGE=1h +``` ## Context Window Management @@ -89,6 +101,7 @@ Full memory configuration in `forge.yaml`: memory: persistence: true sessions_dir: ".forge/sessions" + session_max_age: "30m" # discard sessions idle longer than this char_budget: 200000 trigger_ratio: 0.6 long_term: false @@ -105,6 +118,7 @@ Environment variables: | Variable | Description | |----------|-------------| | `FORGE_MEMORY_PERSISTENCE` | Set `false` to disable session persistence | +| `FORGE_SESSION_MAX_AGE` | Session idle timeout, e.g. 
`30m`, `1h` (default: `30m`) | | `FORGE_MEMORY_LONG_TERM` | Set `true` to enable long-term memory | | `FORGE_EMBEDDING_PROVIDER` | Override embedding provider | diff --git a/docs/runtime.md b/docs/runtime.md index 58413d7..88b9990 100644 --- a/docs/runtime.md +++ b/docs/runtime.md @@ -148,7 +148,8 @@ forge run --host 0.0.0.0 --shutdown-timeout 30s | `--model` | — | Override model name | | `--provider` | — | Override LLM provider | | `--env` | `.env` | Path to env file | -| `--enforce-guardrails` | `false` | Enforce guardrail violations as errors | +| `--enforce-guardrails` | `true` | Enforce guardrail violations as errors | +| `--no-guardrails` | `false` | Disable all guardrail enforcement | ### `forge serve` — Background Daemon @@ -202,6 +203,8 @@ For details on session persistence, context window management, compaction, and l The engine fires hooks at key points in the loop. See [Hooks](hooks.md) for details. +The runner registers four hook groups: logging, audit, progress, and guardrail hooks. The guardrail `AfterToolExec` hook scans tool output for secrets and PII, redacting or blocking before results enter the LLM context. See [Tool Output Scanning](security/guardrails.md#tool-output-scanning). + ## Streaming The current implementation (v1) runs the full tool-calling loop non-streaming. `ExecuteStream` calls `Execute` internally and emits the final response as a single message on a channel. True word-by-word streaming during tool loops is planned for v2. 
diff --git a/docs/security/guardrails.md b/docs/security/guardrails.md index c2428ff..e67a960 100644 --- a/docs/security/guardrails.md +++ b/docs/security/guardrails.md @@ -11,6 +11,7 @@ The guardrail engine checks inbound and outbound messages against configurable p | `content_filter` | Inbound + Outbound | Blocks messages containing configured blocked words | | `no_pii` | Outbound | Detects email addresses, phone numbers, and SSNs via regex | | `jailbreak_protection` | Inbound | Detects common jailbreak phrases ("ignore previous instructions", etc.) | +| `no_secrets` | Outbound | Detects API keys, tokens, and private keys (OpenAI, Anthropic, AWS, GitHub, Slack, Telegram, etc.) | ## Modes @@ -37,6 +38,9 @@ Custom guardrail rules can be added to the policy scaffold: }, "jailbreak_protection": { "mode": "warn" + }, + "no_secrets": { + "mode": "enforce" } } } @@ -45,13 +49,67 @@ Custom guardrail rules can be added to the policy scaffold: ## Runtime ```bash -# Run with guardrails enforced -forge run --enforce-guardrails - -# Default: warn mode (log only) +# Default: guardrails enforced (all built-in guardrails active) forge run + +# Explicitly disable guardrail enforcement +forge run --no-guardrails ``` +All four built-in guardrails (`content_filter`, `no_pii`, `jailbreak_protection`, `no_secrets`) are active by default, even without running `forge build`. Use `--no-guardrails` to opt out. + +## Tool Output Scanning + +The guardrail engine scans tool output via an `AfterToolExec` hook, catching secrets and PII before they enter the LLM context or outbound messages. 
+ +| Guardrail | What it detects in tool output | +|-----------|-------------------------------| +| `no_secrets` | API keys, tokens, private keys (same patterns as outbound message scanning) | +| `no_pii` | Email addresses, phone numbers, SSNs | + +**Behavior by mode:** + +| Mode | Behavior | +|------|----------| +| `enforce` | Returns a generic error (`"tool output blocked by content policy"`), blocking the result from entering the LLM context. The error message intentionally omits which guardrail matched to avoid leaking security internals to the LLM or channel. | +| `warn` | Replaces matched patterns with `[REDACTED]`, logs a warning, and allows the redacted output through | + +The hook writes the redacted text back to `HookContext.ToolOutput`, which the agent loop reads after all hooks fire. This is backwards-compatible — existing hooks that don't modify `ToolOutput` leave it unchanged. + +## Path Containment + +The `cli_execute` tool confines filesystem path arguments to the agent's working directory. This prevents social-engineering attacks where an LLM is tricked into listing or reading files outside the project. + +### Shell Interpreter Denylist + +Shell interpreters (`bash`, `sh`, `zsh`, `dash`, `ksh`, `csh`, `tcsh`, `fish`) are **unconditionally blocked**, even if they appear in `allowed_binaries`. Shells defeat the no-shell `exec.Command` security model by reintroducing argument interpretation and bypassing all path validation (e.g., `bash -c "ls ~/Library/Keychains"`). + +### HOME Override + +When `workDir` is configured, `$HOME` in the subprocess environment is overridden to `workDir`. This prevents `~` expansion inside subprocesses from reaching the real home directory. 
+ +### Path Argument Validation + +**Rules:** +- Arguments that look like paths (`/`, `~/`, `./`, `../`) are resolved and checked +- If a resolved path is inside `$HOME` but outside `workDir` → **blocked** +- System paths outside `$HOME` (e.g., `/tmp`, `/etc`) → allowed +- Non-path arguments (e.g., `get`, `pods`, `--namespace=default`) → allowed +- Flag arguments (e.g., `--kubeconfig=~/.kube/config`) → not detected as paths, allowed + +Additionally, `cmd.Dir` is set to `workDir` so relative paths in subprocess execution resolve within the agent directory. + +**Examples:** + +| Command | Result | +|---------|--------| +| `kubectl get pods` | Allowed — no path args | +| `bash -c "ls ~/"` | Blocked — `bash` is a denied shell interpreter | +| `ls ~/Library/Keychains/` | Blocked — inside `$HOME`, outside workDir | +| `cat ../../.ssh/id_rsa` | Blocked — resolves inside `$HOME`, outside workDir | +| `jq '.' /tmp/data.json` | Allowed — system path outside `$HOME` | +| `ls ./data/` | Allowed — within workDir | + ## Audit Events Guardrail evaluations are logged as structured audit events: diff --git a/docs/skills.md b/docs/skills.md index 2bdfaf3..d74eaaf 100644 --- a/docs/skills.md +++ b/docs/skills.md @@ -168,6 +168,7 @@ forge skills list --tags kubernetes,incident-response | `tavily-search` | 🔍 | research | Search the web using Tavily AI search API | `tavily-search.sh` | | `tavily-research` | 🔬 | research | Deep multi-source research via Tavily API | `tavily-research.sh`, `tavily-research-poll.sh` | | `k8s-incident-triage` | ☸️ | sre | Read-only Kubernetes incident triage using kubectl | — (binary-backed) | +| `k8s-cost-visibility` | 💰 | sre | Estimate K8s infrastructure costs (compute, storage, LoadBalancer) with cost attribution reports | `k8s-cost-visibility.sh` | | `k8s-pod-rightsizer` | ⚖️ | sre | Analyze workload metrics and produce CPU/memory rightsizing recommendations | — (binary-backed) | | `code-review` | 🔎 | developer | AI-powered code review for diffs and 
files | `code-review-diff.sh`, `code-review-file.sh` | | `code-review-standards` | 📏 | developer | Initialize and manage code review standards | — (template-based) | @@ -265,6 +266,37 @@ This skill operates in three modes: Requires: `bash`, `kubectl`, `jq`, `curl`. Optional: `KUBECONFIG`, `K8S_API_DOMAIN`, `PROMETHEUS_URL`, `PROMETHEUS_TOKEN`, `POLICY_FILE`, `DEFAULT_NAMESPACE`. +### Kubernetes Cost Visibility Skill + +The `k8s-cost-visibility` skill estimates Kubernetes infrastructure costs by querying cluster node, pod, PVC/PV, and LoadBalancer data via `kubectl`, applying cloud pricing models, and producing cost attribution reports: + +```bash +forge skills add k8s-cost-visibility +``` + +This registers a single tool: + +| Tool | Purpose | Behavior | +|------|---------|----------| +| `k8s_cost_visibility` | Estimate cluster costs and produce attribution reports | Queries nodes, pods, PVCs, PVs, and services; applies pricing; returns cost breakdown | + +**Cost dimensions tracked:** + +| Dimension | Source | Default Rate | +|-----------|--------|-------------| +| Compute (CPU + memory) | Node instance types, pod resource requests | Auto-detected from cloud CLI or $0.031611/vCPU-hr | +| Storage (PVC/PV) | PVC capacities, storage classes | $0.10/GiB/month | +| LoadBalancer | Services with `type: LoadBalancer` | $18.25/month each | +| Waste | Unbound Persistent Volumes | Flagged with estimated monthly waste | + +**Grouping modes:** `namespace` (includes storage + LB columns), `workload`, `node`, `label:`, `annotation:`. + +**Pricing modes:** `auto` (detect cloud CLI), `aws`, `gcp`, `azure`, `static` (built-in rates), `custom:` (user-provided rates). + +**Safety:** This skill is strictly read-only. It only uses `kubectl get` commands (nodes, pods, pvc, pv, svc) — never `apply`, `delete`, `patch`, `exec`, or `scale`. + +Requires: `kubectl`, `jq`, `awk`, `bc`. 
Optional: `KUBECONFIG`, `K8S_API_DOMAIN`, `DEFAULT_NAMESPACE`, `AWS_REGION`, `AZURE_SUBSCRIPTION_ID`, `GCP_PROJECT`. + ### Codegen React Skill The `codegen-react` skill scaffolds and iterates on **Vite + React** applications with Tailwind CSS: diff --git a/docs/tools.md b/docs/tools.md index 751f2f4..4856b87 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -59,7 +59,7 @@ Provider selection: `WEB_SEARCH_PROVIDER` env var, or auto-detect from available ## CLI Execute -The `cli_execute` tool provides security-hardened command execution with 7 security layers: +The `cli_execute` tool provides security-hardened command execution with 10 security layers: ```yaml tools: @@ -73,13 +73,16 @@ tools: | # | Layer | Detail | |---|-------|--------| -| 1 | **Binary allowlist** | Only pre-approved binaries can execute | -| 2 | **Binary resolution** | Binaries are resolved to absolute paths via `exec.LookPath` at startup | -| 3 | **Argument validation** | Rejects arguments containing `$(`, backticks, or newlines | -| 4 | **Timeout** | Configurable per-command timeout (default: 120s) | -| 5 | **No shell** | Uses `exec.CommandContext` directly — no shell expansion | -| 6 | **Environment isolation** | Only `PATH`, `HOME`, `LANG`, explicit passthrough vars, proxy vars, and `OPENAI_ORG_ID` (when set) | -| 7 | **Output limits** | Configurable max output size (default: 1MB) to prevent memory exhaustion | +| 1 | **Shell denylist** | Shell interpreters (`bash`, `sh`, `zsh`, `dash`, `ksh`, `csh`, `tcsh`, `fish`) are unconditionally blocked — they defeat the no-shell design | +| 2 | **Binary allowlist** | Only pre-approved binaries can execute | +| 3 | **Binary resolution** | Binaries are resolved to absolute paths via `exec.LookPath` at startup | +| 4 | **Argument validation** | Rejects arguments containing `$(`, backticks, or newlines | +| 5 | **Path confinement** | Path arguments inside `$HOME` but outside `workDir` are blocked (see [Path 
Containment](security/guardrails.md#path-containment)) | +| 6 | **Timeout** | Configurable per-command timeout (default: 120s) | +| 7 | **No shell** | Uses `exec.CommandContext` directly — no shell expansion | +| 8 | **Working directory** | `cmd.Dir` set to `workDir` so relative paths resolve within the agent directory | +| 9 | **Environment isolation** | Only `PATH`, `HOME`, `LANG`, explicit passthrough vars, proxy vars, and `OPENAI_ORG_ID` (when set). `HOME` is overridden to `workDir` to prevent `~` expansion from reaching the real home directory | +| 10 | **Output limits** | Configurable max output size (default: 1MB) to prevent memory exhaustion | ## File Create diff --git a/forge-cli/build/policy_stage.go b/forge-cli/build/policy_stage.go index 0e6a882..46dad3c 100644 --- a/forge-cli/build/policy_stage.go +++ b/forge-cli/build/policy_stage.go @@ -24,6 +24,9 @@ func (s *PolicyStage) Execute(ctx context.Context, bc *pipeline.BuildContext) er Type: "content_filter", Config: map[string]any{"enabled": true}, }, + {Type: "no_pii"}, + {Type: "jailbreak_protection"}, + {Type: "no_secrets"}, }, } } diff --git a/forge-cli/cmd/run.go b/forge-cli/cmd/run.go index 89bac9b..cb49bc4 100644 --- a/forge-cli/cmd/run.go +++ b/forge-cli/cmd/run.go @@ -23,6 +23,7 @@ var ( runShutdownTimeout time.Duration runMockTools bool runEnforceGuardrails bool + runNoGuardrails bool runModel string runProvider string runEnvFile string @@ -42,7 +43,8 @@ func init() { runCmd.Flags().StringVar(&runHost, "host", "", "bind address (e.g. 0.0.0.0 for containers)") runCmd.Flags().DurationVar(&runShutdownTimeout, "shutdown-timeout", 0, "graceful shutdown timeout (e.g. 
30s)") runCmd.Flags().BoolVar(&runMockTools, "mock-tools", false, "use mock runtime instead of subprocess") - runCmd.Flags().BoolVar(&runEnforceGuardrails, "enforce-guardrails", false, "enforce guardrail violations as errors") + runCmd.Flags().BoolVar(&runEnforceGuardrails, "enforce-guardrails", true, "enforce guardrail violations as errors") + runCmd.Flags().BoolVar(&runNoGuardrails, "no-guardrails", false, "disable all guardrail enforcement") runCmd.Flags().StringVar(&runModel, "model", "", "override model name (sets MODEL_NAME env var)") runCmd.Flags().StringVar(&runProvider, "provider", "", "LLM provider (openai, anthropic, ollama)") runCmd.Flags().StringVar(&runEnvFile, "env", ".env", "path to .env file") @@ -59,6 +61,11 @@ func runRun(cmd *cobra.Command, args []string) error { activeChannels := parseChannels(runWithChannels) + enforceGuardrails := runEnforceGuardrails + if runNoGuardrails { + enforceGuardrails = false + } + runner, err := runtime.NewRunner(runtime.RunnerConfig{ Config: cfg, WorkDir: workDir, @@ -66,7 +73,7 @@ func runRun(cmd *cobra.Command, args []string) error { Host: runHost, ShutdownTimeout: runShutdownTimeout, MockTools: runMockTools, - EnforceGuardrails: runEnforceGuardrails, + EnforceGuardrails: enforceGuardrails, ModelOverride: runModel, ProviderOverride: runProvider, EnvFilePath: resolveEnvPath(workDir, runEnvFile), diff --git a/forge-cli/cmd/run_test.go b/forge-cli/cmd/run_test.go index 0f6294e..1944489 100644 --- a/forge-cli/cmd/run_test.go +++ b/forge-cli/cmd/run_test.go @@ -19,8 +19,11 @@ func TestRunCmd_FlagDefaults(t *testing.T) { if runMockTools { t.Error("mock-tools should default to false") } - if runEnforceGuardrails { - t.Error("enforce-guardrails should default to false") + if !runEnforceGuardrails { + t.Error("enforce-guardrails should default to true") + } + if runNoGuardrails { + t.Error("no-guardrails should default to false") } if runModel != "" { t.Errorf("model should default to empty, got %q", runModel) diff --git 
a/forge-cli/cmd/serve.go b/forge-cli/cmd/serve.go index 8281ef8..17114f6 100644 --- a/forge-cli/cmd/serve.go +++ b/forge-cli/cmd/serve.go @@ -29,6 +29,7 @@ var ( serveHost string serveShutdownTimeout time.Duration serveEnforceGuardrails bool + serveNoGuardrails bool serveModel string serveProvider string serveEnvFile string @@ -87,7 +88,8 @@ func registerServeFlags(cmd *cobra.Command) { cmd.Flags().IntVarP(&servePort, "port", "p", 8080, "HTTP server port") cmd.Flags().StringVar(&serveHost, "host", "127.0.0.1", "bind address (use 0.0.0.0 for containers)") cmd.Flags().DurationVar(&serveShutdownTimeout, "shutdown-timeout", 30*time.Second, "graceful shutdown timeout") - cmd.Flags().BoolVar(&serveEnforceGuardrails, "enforce-guardrails", false, "enforce guardrail violations as errors") + cmd.Flags().BoolVar(&serveEnforceGuardrails, "enforce-guardrails", true, "enforce guardrail violations as errors") + cmd.Flags().BoolVar(&serveNoGuardrails, "no-guardrails", false, "disable all guardrail enforcement") cmd.Flags().StringVar(&serveModel, "model", "", "override model name (sets MODEL_NAME env var)") cmd.Flags().StringVar(&serveProvider, "provider", "", "LLM provider (openai, anthropic, ollama)") cmd.Flags().StringVar(&serveEnvFile, "env", ".env", "path to .env file") @@ -166,7 +168,9 @@ func serveStartRun(cmd *cobra.Command, args []string) error { "--host", serveHost, "--shutdown-timeout", serveShutdownTimeout.String(), } - if serveEnforceGuardrails { + if serveNoGuardrails { + runArgs = append(runArgs, "--no-guardrails") + } else if serveEnforceGuardrails { runArgs = append(runArgs, "--enforce-guardrails") } if serveModel != "" { diff --git a/forge-cli/runtime/guardrails_loader.go b/forge-cli/runtime/guardrails_loader.go index caab748..cadd4cb 100644 --- a/forge-cli/runtime/guardrails_loader.go +++ b/forge-cli/runtime/guardrails_loader.go @@ -26,3 +26,19 @@ func LoadPolicyScaffold(workDir string) (*agentspec.PolicyScaffold, error) { } return &ps, nil } + +// 
DefaultPolicyScaffold returns a scaffold with all built-in guardrails enabled. +// Used when no policy-scaffold.json exists (e.g. running without forge build). +func DefaultPolicyScaffold() *agentspec.PolicyScaffold { + return &agentspec.PolicyScaffold{ + Guardrails: []agentspec.Guardrail{ + { + Type: "content_filter", + Config: map[string]any{"enabled": true}, + }, + {Type: "no_pii"}, + {Type: "jailbreak_protection"}, + {Type: "no_secrets"}, + }, + } +} diff --git a/forge-cli/runtime/runner.go b/forge-cli/runtime/runner.go index 71acde5..2fd1438 100644 --- a/forge-cli/runtime/runner.go +++ b/forge-cli/runtime/runner.go @@ -144,11 +144,14 @@ func (r *Runner) Run(ctx context.Context) error { return err } - // 2. Load policy scaffold + // 2. Load policy scaffold (fall back to built-in defaults) scaffold, err := LoadPolicyScaffold(r.cfg.WorkDir) if err != nil { r.logger.Warn("failed to load policy scaffold", map[string]any{"error": err.Error()}) } + if scaffold == nil || len(scaffold.Guardrails) == 0 { + scaffold = DefaultPolicyScaffold() + } guardrails := coreruntime.NewGuardrailEngine(scaffold, r.cfg.EnforceGuardrails, r.logger) // 3. 
Build agent card @@ -266,6 +269,7 @@ func (r *Runner) Run(ctx context.Context) error { if toolRef.Name == "cli_execute" && toolRef.Config != nil { hasExplicitCLI = true cliCfg := clitools.ParseCLIExecuteConfig(toolRef.Config) + cliCfg.WorkDir = r.cfg.WorkDir // Apply timeout hint from skill requirements if larger than explicit config if r.derivedCLIConfig != nil && r.derivedCLIConfig.TimeoutHint > cliCfg.TimeoutSeconds { cliCfg.TimeoutSeconds = r.derivedCLIConfig.TimeoutHint @@ -290,6 +294,7 @@ func (r *Runner) Run(ctx context.Context) error { AllowedBinaries: r.derivedCLIConfig.AllowedBinaries, EnvPassthrough: r.derivedCLIConfig.EnvPassthrough, TimeoutSeconds: r.derivedCLIConfig.TimeoutHint, + WorkDir: r.cfg.WorkDir, } r.cliExecTool = clitools.NewCLIExecuteTool(cliCfg) if regErr := reg.Register(r.cliExecTool); regErr != nil { @@ -372,6 +377,7 @@ func (r *Runner) Run(ctx context.Context) error { r.registerLoggingHooks(hooks) r.registerAuditHooks(hooks, auditLogger) r.registerProgressHooks(hooks) + r.registerGuardrailHooks(hooks, guardrails) // Compute model-aware character budget. charBudget := r.cfg.Config.Memory.CharBudget @@ -426,6 +432,19 @@ func (r *Runner) Run(ctx context.Context) error { execCfg.Store = memStore execCfg.Compactor = compactor + + // Session max age: stale sessions are discarded to prevent + // poisoned error context from blocking tool retries. 
+ if v := os.Getenv("FORGE_SESSION_MAX_AGE"); v != "" { + if d, err := time.ParseDuration(v); err == nil { + execCfg.SessionMaxAge = d + } + } else if r.cfg.Config.Memory.SessionMaxAge != "" { + if d, err := time.ParseDuration(r.cfg.Config.Memory.SessionMaxAge); err == nil { + execCfg.SessionMaxAge = d + } + } + r.logger.Info("memory persistence enabled", map[string]any{ "sessions_dir": sessDir, }) @@ -1353,6 +1372,22 @@ func (r *Runner) registerProgressHooks(hooks *coreruntime.HookRegistry) { }) } +// registerGuardrailHooks registers an AfterToolExec hook that scans tool output +// for secrets and PII, redacting or blocking based on guardrail mode. +func (r *Runner) registerGuardrailHooks(hooks *coreruntime.HookRegistry, guardrails *coreruntime.GuardrailEngine) { + hooks.Register(coreruntime.AfterToolExec, func(_ context.Context, hctx *coreruntime.HookContext) error { + if hctx.ToolOutput == "" { + return nil + } + redacted, err := guardrails.CheckToolOutput(hctx.ToolOutput) + if err != nil { + return err + } + hctx.ToolOutput = redacted + return nil + }) +} + // buildLLMClient creates the LLM client from the resolved model config. // If fallback providers are configured, wraps them in a FallbackChain. 
func (r *Runner) buildLLMClient(mc *coreruntime.ModelConfig) (llm.Client, error) { diff --git a/forge-cli/tools/cli_execute.go b/forge-cli/tools/cli_execute.go index d4f8084..b1ab588 100644 --- a/forge-cli/tools/cli_execute.go +++ b/forge-cli/tools/cli_execute.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "os/exec" + "path/filepath" "strings" "time" @@ -17,8 +18,9 @@ import ( type CLIExecuteConfig struct { AllowedBinaries []string EnvPassthrough []string - TimeoutSeconds int // default 120 - MaxOutputBytes int // default 1MB + TimeoutSeconds int // default 120 + MaxOutputBytes int // default 1MB + WorkDir string // confine path arguments to this directory } // CLIExecuteTool is a Category-A builtin tool that executes only pre-approved @@ -31,6 +33,8 @@ type CLIExecuteTool struct { available []string missing []string proxyURL string // egress proxy URL (e.g., "http://127.0.0.1:54321") + workDir string // resolved absolute workDir for path confinement + homeDir string // resolved $HOME for path confinement } // cliExecuteArgs is the JSON input schema for Execute. @@ -58,10 +62,21 @@ func NewCLIExecuteTool(config CLIExecuteConfig) *CLIExecuteTool { config.MaxOutputBytes = 1048576 // 1MB } + // Resolve workDir and homeDir for path confinement. 
+ workDir := config.WorkDir + if workDir != "" { + if abs, err := filepath.Abs(workDir); err == nil { + workDir = abs + } + } + homeDir := os.Getenv("HOME") + t := &CLIExecuteTool{ config: config, allowedSet: make(map[string]bool, len(config.AllowedBinaries)), binaryPaths: make(map[string]string, len(config.AllowedBinaries)), + workDir: workDir, + homeDir: homeDir, } for _, bin := range config.AllowedBinaries { @@ -133,7 +148,13 @@ func (t *CLIExecuteTool) Execute(ctx context.Context, args json.RawMessage) (str return "", fmt.Errorf("cli_execute: invalid arguments: %w", err) } - // Security check 1: Binary allowlist + // Security check 1a: Block shell interpreters — these defeat the no-shell + // exec.Command design and bypass all path argument validation. + if deniedShells[input.Binary] { + return "", fmt.Errorf("cli_execute: binary %q is a shell interpreter and cannot be used", input.Binary) + } + + // Security check 1b: Binary allowlist if !t.allowedSet[input.Binary] { return "", fmt.Errorf("cli_execute: binary %q is not in the allowed list", input.Binary) } @@ -151,6 +172,16 @@ func (t *CLIExecuteTool) Execute(ctx context.Context, args json.RawMessage) (str } } + // Security check 3b: Path confinement — block path args that escape workDir + // into $HOME (e.g., ~/Library/Keychains/, ../../../.ssh/id_rsa) + if t.workDir != "" { + for i, arg := range input.Args { + if err := t.validatePathArg(arg); err != nil { + return "", fmt.Errorf("cli_execute: argument %d: %w", i, err) + } + } + } + // Security check 4: Timeout timeout := time.Duration(t.config.TimeoutSeconds) * time.Second cmdCtx, cancel := context.WithTimeout(ctx, timeout) @@ -159,6 +190,11 @@ func (t *CLIExecuteTool) Execute(ctx context.Context, args json.RawMessage) (str // Security check 5: No shell — exec.CommandContext directly cmd := exec.CommandContext(cmdCtx, absPath, input.Args...) 
+ // Defense-in-depth: set working directory so relative paths resolve within workDir + if t.workDir != "" { + cmd.Dir = t.workDir + } + // Security check 6: Env isolation cmd.Env = t.buildEnv() @@ -213,11 +249,16 @@ func (t *CLIExecuteTool) Availability() (available, missing []string) { func (t *CLIExecuteTool) SetProxyURL(url string) { t.proxyURL = url } // buildEnv constructs an isolated environment with only PATH, HOME, LANG -// and explicitly configured passthrough variables. +// and explicitly configured passthrough variables. When workDir is set, +// HOME is overridden to workDir so subprocess ~ expansion stays confined. func (t *CLIExecuteTool) buildEnv() []string { + homeVal := os.Getenv("HOME") + if t.workDir != "" { + homeVal = t.workDir + } env := []string{ "PATH=" + os.Getenv("PATH"), - "HOME=" + os.Getenv("HOME"), + "HOME=" + homeVal, "LANG=" + os.Getenv("LANG"), } for _, key := range t.config.EnvPassthrough { @@ -236,6 +277,15 @@ func (t *CLIExecuteTool) buildEnv() []string { return env } +// deniedShells is a hardcoded set of shell interpreters that are never allowed +// regardless of the allowlist. Shells defeat the security model by +// reintroducing shell interpretation, bypassing path validation and the +// no-shell exec.Command design. +var deniedShells = map[string]bool{ + "bash": true, "sh": true, "zsh": true, "dash": true, + "ksh": true, "csh": true, "tcsh": true, "fish": true, +} + // validateArg rejects arguments containing shell injection patterns. // Since we use exec.Command (no shell), these are defense-in-depth checks // against confused upstream processing. @@ -252,6 +302,48 @@ func validateArg(arg string) error { return nil } +// validatePathArg checks whether an argument looks like a filesystem path and, +// if so, ensures it doesn't resolve to a location inside $HOME but outside +// workDir. System paths (outside $HOME) and non-path arguments pass through. 
+func (t *CLIExecuteTool) validatePathArg(arg string) error { + if !looksLikePath(arg) { + return nil + } + resolved := resolveArgPath(arg, t.workDir, t.homeDir) + + // If the resolved path is inside $HOME (or is $HOME itself) but outside workDir → blocked. + inHome := resolved == t.homeDir || strings.HasPrefix(resolved, t.homeDir+"/") + inWorkDir := resolved == t.workDir || strings.HasPrefix(resolved, t.workDir+"/") + if t.homeDir != "" && inHome && !inWorkDir { + return fmt.Errorf("path %q resolves outside the agent working directory", arg) + } + return nil +} + +// looksLikePath returns true for arguments that look like filesystem paths. +// Only bare path prefixes are matched; flag arguments (--foo=/bar) are not +// detected so that flags like --kubeconfig=~/.kube/config pass through. +func looksLikePath(arg string) bool { + return strings.HasPrefix(arg, "/") || + strings.HasPrefix(arg, "~/") || + strings.HasPrefix(arg, "./") || + strings.HasPrefix(arg, "../") || + arg == "~" || arg == "." || arg == ".." +} + +// resolveArgPath expands ~ and resolves relative paths against workDir, +// then cleans the result to eliminate .. components. +func resolveArgPath(arg, workDir, homeDir string) string { + if strings.HasPrefix(arg, "~/") { + arg = filepath.Join(homeDir, arg[2:]) + } else if arg == "~" { + arg = homeDir + } else if !filepath.IsAbs(arg) { + arg = filepath.Join(workDir, arg) + } + return filepath.Clean(arg) +} + // ParseCLIExecuteConfig extracts typed config from the map[string]any that // YAML produces. Handles both int and float64 for numeric fields. 
func ParseCLIExecuteConfig(raw map[string]any) CLIExecuteConfig { diff --git a/forge-cli/tools/cli_execute_test.go b/forge-cli/tools/cli_execute_test.go index 84282a6..5e002af 100644 --- a/forge-cli/tools/cli_execute_test.go +++ b/forge-cli/tools/cli_execute_test.go @@ -3,6 +3,8 @@ package tools import ( "context" "encoding/json" + "os" + "path/filepath" "runtime" "strings" "testing" @@ -354,3 +356,176 @@ func TestCLIExecute_ParseConfig(t *testing.T) { t.Errorf("MaxOutputBytes (float64) = %d, want 2097152", cfgFloat.MaxOutputBytes) } } + +func TestLooksLikePath(t *testing.T) { + tests := []struct { + arg string + want bool + }{ + {"/etc/passwd", true}, + {"~/Library/Keychains/", true}, + {"./data.txt", true}, + {"../../../.ssh/id_rsa", true}, + {"~", true}, + {".", true}, + {"..", true}, + // Non-path arguments + {"get", false}, + {"pods", false}, + {"--namespace=default", false}, + {"--kubeconfig=~/.kube/config", false}, + {"-o", false}, + {"json", false}, + {"", false}, + } + + for _, tt := range tests { + t.Run(tt.arg, func(t *testing.T) { + if got := looksLikePath(tt.arg); got != tt.want { + t.Errorf("looksLikePath(%q) = %v, want %v", tt.arg, got, tt.want) + } + }) + } +} + +func TestValidatePathArg_BlocksHomeTraversal(t *testing.T) { + home := os.Getenv("HOME") + if home == "" { + t.Skip("HOME not set") + } + + workDir := filepath.Join(home, "projects", "myagent") + + tool := &CLIExecuteTool{ + workDir: workDir, + homeDir: home, + } + + tests := []struct { + name string + arg string + wantErr bool + }{ + // Allowed: within workDir + {"workdir_file", "./data.txt", false}, + {"workdir_subdir", "./subdir/file.yaml", false}, + // Allowed: system paths (outside $HOME) + {"system_tmp", "/tmp/data.json", false}, + {"system_etc", "/etc/hosts", false}, + // Allowed: non-path arguments + {"plain_arg", "get", false}, + {"flag_arg", "--namespace=default", false}, + {"flag_with_path", "--kubeconfig=~/.kube/config", false}, + // Blocked: home traversal + {"home_library", 
"~/Library/Keychains/", true}, + {"home_downloads", "~/Downloads/", true}, + {"home_ssh", "~/.ssh/id_rsa", true}, + {"home_root", "~/", true}, + // Blocked: relative escape (stays inside $HOME but outside workDir) + {"dotdot_escape", "../../.ssh/id_rsa", true}, + {"dotdot_one_level", "../other_project/secret.key", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tool.validatePathArg(tt.arg) + if tt.wantErr && err == nil { + t.Errorf("validatePathArg(%q) = nil, want error", tt.arg) + } + if !tt.wantErr && err != nil { + t.Errorf("validatePathArg(%q) = %v, want nil", tt.arg, err) + } + if tt.wantErr && err != nil && !strings.Contains(err.Error(), "outside the agent working directory") { + t.Errorf("error = %q, want it to contain 'outside the agent working directory'", err.Error()) + } + }) + } +} + +func TestCLIExecute_PathTraversalBlocked(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("test uses Unix paths") + } + + home := os.Getenv("HOME") + if home == "" { + t.Skip("HOME not set") + } + + workDir := filepath.Join(home, "projects", "myagent") + + tool := NewCLIExecuteTool(CLIExecuteConfig{ + AllowedBinaries: []string{"ls"}, + WorkDir: workDir, + }) + + args, _ := json.Marshal(cliExecuteArgs{ + Binary: "ls", + Args: []string{home}, + }) + + _, err := tool.Execute(context.Background(), args) + if err == nil { + t.Fatal("Execute() expected error for home directory traversal, got nil") + } + if !strings.Contains(err.Error(), "outside the agent working directory") { + t.Errorf("error = %q, want it to mention 'outside the agent working directory'", err.Error()) + } +} + +func TestCLIExecute_ShellInterpreterBlocked(t *testing.T) { + shells := []string{"bash", "sh", "zsh", "dash", "ksh", "csh", "tcsh", "fish"} + for _, shell := range shells { + t.Run(shell, func(t *testing.T) { + tool := NewCLIExecuteTool(CLIExecuteConfig{ + AllowedBinaries: []string{shell}, + }) + + args, _ := json.Marshal(cliExecuteArgs{ + Binary: 
shell, + Args: []string{"-c", "echo hello"}, + }) + + _, err := tool.Execute(context.Background(), args) + if err == nil { + t.Fatalf("Execute(%s) expected error, got nil", shell) + } + if !strings.Contains(err.Error(), "shell interpreter") { + t.Errorf("error = %q, want it to mention 'shell interpreter'", err.Error()) + } + }) + } +} + +func TestCLIExecute_HomeOverriddenToWorkDir(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("env command differs on Windows") + } + + tmpDir := t.TempDir() + + tool := NewCLIExecuteTool(CLIExecuteConfig{ + AllowedBinaries: []string{"env"}, + WorkDir: tmpDir, + }) + + args, _ := json.Marshal(cliExecuteArgs{ + Binary: "env", + }) + + result, err := tool.Execute(context.Background(), args) + if err != nil { + t.Fatalf("Execute() error = %v", err) + } + + var res cliExecuteResult + if err := json.Unmarshal([]byte(result), &res); err != nil { + t.Fatalf("failed to unmarshal result: %v", err) + } + + // HOME should be overridden to workDir, not the real home + expected := "HOME=" + tmpDir + if !strings.Contains(res.Stdout, expected) { + t.Errorf("expected %q in env output, got:\n%s", expected, res.Stdout) + } +} diff --git a/forge-core/runtime/guardrails.go b/forge-core/runtime/guardrails.go index 0c31f33..4ac4ae5 100644 --- a/forge-core/runtime/guardrails.go +++ b/forge-core/runtime/guardrails.go @@ -48,9 +48,17 @@ func (g *GuardrailEngine) check(msg *a2a.Message, direction string) error { case "content_filter": err = g.checkContentFilter(text, gr) case "no_pii": - err = g.checkNoPII(text) + if direction == "outbound" { + err = g.checkNoPII(text) + } case "jailbreak_protection": - err = g.checkJailbreak(text) + if direction == "inbound" { + err = g.checkJailbreak(text) + } + case "no_secrets": + if direction == "outbound" { + err = g.checkNoSecrets(text) + } default: continue } @@ -135,3 +143,65 @@ func (g *GuardrailEngine) checkJailbreak(text string) error { } return nil } + +var secretPatterns = []*regexp.Regexp{ + 
regexp.MustCompile(`sk-ant-[A-Za-z0-9\-]{20,}`), // Anthropic API keys + regexp.MustCompile(`sk-[A-Za-z0-9]{20,}`), // OpenAI API keys + regexp.MustCompile(`ghp_[A-Za-z0-9]{36}`), // GitHub PATs + regexp.MustCompile(`gho_[A-Za-z0-9]{36}`), // GitHub OAuth tokens + regexp.MustCompile(`ghs_[A-Za-z0-9]{36}`), // GitHub server tokens + regexp.MustCompile(`github_pat_[A-Za-z0-9_]{22,}`), // GitHub fine-grained PATs + regexp.MustCompile(`AKIA[0-9A-Z]{16}`), // AWS access key IDs + regexp.MustCompile(`xoxb-[0-9]{10,}-[A-Za-z0-9-]+`), // Slack bot tokens + regexp.MustCompile(`xoxp-[0-9]{10,}-[A-Za-z0-9-]+`), // Slack user tokens + regexp.MustCompile(`-----BEGIN (RSA|EC|OPENSSH|PRIVATE) .*KEY-----`), // Private keys + regexp.MustCompile(`[0-9]{8,10}:[A-Za-z0-9_-]{35,}`), // Telegram bot tokens +} + +func (g *GuardrailEngine) checkNoSecrets(text string) error { + for _, re := range secretPatterns { + if re.MatchString(text) { + return fmt.Errorf("potential secret or credential detected in output") + } + } + return nil +} + +// CheckToolOutput scans tool output text against configured guardrails +// (no_secrets and no_pii). In enforce mode, returns an error on first match +// without echoing the match. In warn mode, replaces matches with [REDACTED], +// logs a warning, and returns the redacted text. 
+func (g *GuardrailEngine) CheckToolOutput(text string) (string, error) { + if text == "" { + return text, nil + } + + for _, gr := range g.scaffold.Guardrails { + var patterns []*regexp.Regexp + switch gr.Type { + case "no_secrets": + patterns = secretPatterns + case "no_pii": + patterns = piiPatterns + default: + continue + } + + for _, re := range patterns { + if !re.MatchString(text) { + continue + } + if g.enforce { + return "", fmt.Errorf("tool output blocked by content policy") + } + // Warn mode: redact matches + text = re.ReplaceAllString(text, "[REDACTED]") + g.logger.Warn("guardrail redaction", map[string]any{ + "guardrail": gr.Type, + "direction": "tool_output", + "detail": fmt.Sprintf("pattern %s matched, content redacted", re.String()), + }) + } + } + return text, nil +} diff --git a/forge-core/runtime/loop.go b/forge-core/runtime/loop.go index 427f93b..4598bcc 100644 --- a/forge-core/runtime/loop.go +++ b/forge-core/runtime/loop.go @@ -6,6 +6,7 @@ import ( "fmt" "strconv" "strings" + "time" "github.com/initializ/forge/forge-core/a2a" "github.com/initializ/forge/forge-core/llm" @@ -28,10 +29,11 @@ type LLMExecutor struct { compactor *Compactor store *MemoryStore logger Logger - modelName string // resolved model name for context budget - charBudget int // resolved character budget - maxToolResultChars int // computed from char budget - filesDir string // directory for file_create output + modelName string // resolved model name for context budget + charBudget int // resolved character budget + maxToolResultChars int // computed from char budget + filesDir string // directory for file_create output + sessionMaxAge time.Duration // max age for session recovery (0 = no limit) } // LLMExecutorConfig configures the LLM executor. 
@@ -44,9 +46,10 @@ type LLMExecutorConfig struct { Compactor *Compactor Store *MemoryStore Logger Logger - ModelName string // model name for context-aware budgeting - CharBudget int // explicit char budget override (0 = auto from model) - FilesDir string // directory for file_create output (default: $TMPDIR/forge-files) + ModelName string // model name for context-aware budgeting + CharBudget int // explicit char budget override (0 = auto from model) + FilesDir string // directory for file_create output (default: $TMPDIR/forge-files) + SessionMaxAge time.Duration // max idle time before session recovery is skipped (0 = 30m default) } // NewLLMExecutor creates a new LLMExecutor with the given configuration. @@ -83,6 +86,11 @@ func NewLLMExecutor(cfg LLMExecutorConfig) *LLMExecutor { toolLimit = 400_000 } + sessionMaxAge := cfg.SessionMaxAge + if sessionMaxAge == 0 { + sessionMaxAge = 30 * time.Minute + } + return &LLMExecutor{ client: cfg.Client, tools: cfg.Tools, @@ -96,6 +104,7 @@ func NewLLMExecutor(cfg LLMExecutorConfig) *LLMExecutor { charBudget: budget, maxToolResultChars: toolLimit, filesDir: cfg.FilesDir, + sessionMaxAge: sessionMaxAge, } } @@ -109,6 +118,9 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess // Try to recover session from disk. If found, the disk snapshot // supersedes task.History to avoid duplicating messages. + // Sessions older than sessionMaxAge are discarded to prevent stale + // error context from poisoning the LLM (e.g., repeated tool failures + // causing the LLM to stop retrying tools altogether). 
recovered := false if e.store != nil { saved, err := e.store.Load(task.ID) @@ -117,12 +129,21 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess "task_id": task.ID, "error": err.Error(), }) } else if saved != nil { - mem.LoadFromStore(saved) - recovered = true - e.logger.Info("session recovered from disk", map[string]any{ - "task_id": task.ID, - "messages": len(saved.Messages), - }) + if !saved.UpdatedAt.IsZero() && time.Since(saved.UpdatedAt) > e.sessionMaxAge { + e.logger.Info("discarding stale session", map[string]any{ + "task_id": task.ID, + "updated_at": saved.UpdatedAt.Format(time.RFC3339), + "max_age": e.sessionMaxAge.String(), + }) + _ = e.store.Delete(task.ID) + } else { + mem.LoadFromStore(saved) + recovered = true + e.logger.Info("session recovered from disk", map[string]any{ + "task_id": task.ID, + "messages": len(saved.Messages), + }) + } } } @@ -234,17 +255,19 @@ func (e *LLMExecutor) Execute(ctx context.Context, task *a2a.Task, msg *a2a.Mess result = result[:e.maxToolResultChars] + "\n\n[OUTPUT TRUNCATED — original length: " + strconv.Itoa(len(result)) + " chars]" } - // Fire AfterToolExec hook - if err := e.hooks.Fire(ctx, AfterToolExec, &HookContext{ + // Fire AfterToolExec hook — hooks may redact ToolOutput. + afterHctx := &HookContext{ ToolName: tc.Function.Name, ToolInput: tc.Function.Arguments, ToolOutput: result, Error: execErr, TaskID: TaskIDFromContext(ctx), CorrelationID: CorrelationIDFromContext(ctx), - }); err != nil { + } + if err := e.hooks.Fire(ctx, AfterToolExec, afterHctx); err != nil { return nil, fmt.Errorf("after tool exec hook: %w", err) } + result = afterHctx.ToolOutput // allow hooks to redact output // Handle file_create tool: always create a file part. // For other tools with large output, detect content type. 
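The read-back above means any registered `AfterToolExec` hook can rewrite tool output before it re-enters the LLM context, and the warn-mode pass in `CheckToolOutput` reduces to a regex-replace loop over the secret patterns. A minimal standalone sketch of that pass, using only the standard library — the `redact` helper and its pattern subset are illustrative, not the actual `GuardrailEngine` API:

```go
package main

import (
	"fmt"
	"regexp"
)

// Illustrative subset of the secret patterns from guardrails.go.
var patterns = []*regexp.Regexp{
	regexp.MustCompile(`sk-ant-[A-Za-z0-9\-]{20,}`), // Anthropic API keys
	regexp.MustCompile(`ghp_[A-Za-z0-9]{36}`),       // GitHub PATs
	regexp.MustCompile(`AKIA[0-9A-Z]{16}`),          // AWS access key IDs
}

// redact mirrors the warn-mode behavior: each match is replaced with
// [REDACTED], and the remaining patterns are still applied afterwards.
func redact(text string) string {
	for _, re := range patterns {
		text = re.ReplaceAllString(text, "[REDACTED]")
	}
	return text
}

func main() {
	fmt.Println(redact("aws_key=AKIAIOSFODNN7EXAMPLE")) // prints "aws_key=[REDACTED]"
}
```

In enforce mode the loop would instead return an error on the first `MatchString` hit, never echoing the matched text.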
diff --git a/forge-core/types/config.go b/forge-core/types/config.go index a5b5695..3ff5a6e 100644 --- a/forge-core/types/config.go +++ b/forge-core/types/config.go @@ -43,10 +43,11 @@ type SecretsConfig struct { // MemoryConfig configures agent memory persistence and compaction. type MemoryConfig struct { - Persistence *bool `yaml:"persistence,omitempty"` // default: true - SessionsDir string `yaml:"sessions_dir,omitempty"` - TriggerRatio float64 `yaml:"trigger_ratio,omitempty"` - CharBudget int `yaml:"char_budget,omitempty"` + Persistence *bool `yaml:"persistence,omitempty"` // default: true + SessionsDir string `yaml:"sessions_dir,omitempty"` + SessionMaxAge string `yaml:"session_max_age,omitempty"` // e.g. "30m", "1h" (default: 30m) + TriggerRatio float64 `yaml:"trigger_ratio,omitempty"` + CharBudget int `yaml:"char_budget,omitempty"` // Long-term memory (persistent cross-session knowledge). LongTerm *bool `yaml:"long_term,omitempty"` // default: false diff --git a/forge-skills/local/embedded/k8s-cost-visibility/SKILL.md b/forge-skills/local/embedded/k8s-cost-visibility/SKILL.md new file mode 100644 index 0000000..d7ee5c3 --- /dev/null +++ b/forge-skills/local/embedded/k8s-cost-visibility/SKILL.md @@ -0,0 +1,256 @@ +--- +name: k8s-cost-visibility +icon: "\U0001F4B0" +category: sre +tags: + - kubernetes + - cost-optimization + - finops + - resource-management + - capacity-planning + - kubectl +description: Estimate Kubernetes infrastructure costs by querying cluster node, pod, PVC/PV, and LoadBalancer data, applying cloud pricing models, and producing cost attribution reports with storage and LoadBalancer cost tracking, grouped by namespace, workload, node, label, or annotation. 
+metadata: + forge: + requires: + bins: + - kubectl + - jq + - awk + - bc + env: + required: [] + one_of: [] + optional: + - KUBECONFIG + - K8S_API_DOMAIN + - DEFAULT_NAMESPACE + - AWS_REGION + - AZURE_SUBSCRIPTION_ID + - GCP_PROJECT + egress_domains: + - "$K8S_API_DOMAIN" + - api.pricing.us-east-1.amazonaws.com + - dc.services.visualstudio.com + - login.microsoftonline.com + - management.azure.com + denied_tools: + - http_request + - web_search + timeout_hint: 120 + trust_hints: + network: true + filesystem: read + shell: true +--- + +# Kubernetes Cost Visibility + +Estimates Kubernetes infrastructure costs by querying cluster node, pod, PVC/PV, and LoadBalancer resource data via `kubectl`, applying pricing models (cloud CLI auto-detection, static pricing map, or manual override), and producing cost attribution reports including storage and LoadBalancer costs. + +This skill is **read-only** — it never mutates cluster state. + +Supports grouping costs by: + +- **namespace** — total cost per namespace (compute + storage + LoadBalancer) +- **workload** — cost per deployment/statefulset/daemonset +- **node** — cost per node with utilization +- **label** — cost grouped by any label key (e.g., `team`, `env`) +- **annotation** — cost grouped by any annotation key + +Additional cost tracking: + +- **storage costs** — PVC/PV storage cost attribution per namespace +- **LoadBalancer costs** — LoadBalancer service cost tracking per namespace +- **waste detection** — unbound Persistent Volumes flagged as waste + +--- + +## Tool Usage + +This skill uses `cli_execute` with `kubectl` commands exclusively. +NEVER use http_request or web_search to interact with Kubernetes. +All cluster operations MUST go through kubectl or the cost-visibility script via cli_execute. + +--- + +## Tool: k8s_cost_visibility + +Estimate Kubernetes infrastructure costs and produce cost attribution reports. 
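
The per-pod attribution this tool performs (Step 4 below) is plain arithmetic: the pod's CPU-request fraction of its node, times the node's hourly cost, extrapolated to 730 hours. A sketch with assumed figures — a 500m request on a node with 4000m allocatable and an illustrative $0.16/hr node cost:

```bash
# Monthly cost share of a pod requesting 500m CPU on a 4000m-allocatable
# node whose hourly cost came out to $0.16 in the pricing step.
# All three inputs are illustrative; 730 hours/month as in Step 4.
awk 'BEGIN {
  pod_cpu_milli  = 500     # pod CPU request
  node_cpu_milli = 4000    # node allocatable CPU
  node_hourly    = 0.16    # node hourly cost from the pricing step
  share = pod_cpu_milli / node_cpu_milli
  printf "%.2f\n", share * node_hourly * 730
}'
# prints 14.60
```

The real script performs the same computation per pod via `awk`, then sums the shares under the selected `group_by` dimension.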
+ +**Input:** pricing_mode (string), group_by (string), namespace (string), label_selector (string), top (integer), output_format (string), cache_ttl (integer) + +**Output format:** Markdown tables for cost reports. JSON for machine-readable output. + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `pricing_mode` | string | `auto` | Pricing source: `auto` (detect cloud CLI), `aws`, `gcp`, `azure`, `static` (built-in map), or `custom:file.json` | +| `group_by` | string | `namespace` | Grouping dimension: `namespace`, `workload`, `node`, `label:`, `annotation:`. Use `namespace` to see storage and LoadBalancer cost columns. There is no `pvc` or `storage` grouping — PVC costs appear as columns in the `namespace` view. | +| `namespace` | string | _(empty)_ | Filter to a single namespace. When set, only pods, PVCs, and services in this namespace are included. Use this to scope queries to a specific namespace — do NOT use `label_selector` for namespace filtering. | +| `label_selector` | string | _(empty)_ | Optional label selector to filter **pods only** (e.g., `app=web,env=prod`). Does NOT filter PVCs or services. Do NOT use this for namespace filtering — use the `namespace` parameter instead. 
| +| `top` | integer | `0` | Limit output to top N entries by cost (0 = show all) | +| `output_format` | string | `markdown` | Output format: `markdown` or `json` | +| `cache_ttl` | integer | `300` | Cache TTL in seconds for node pricing data (0 = no cache) | + +### Pricing Modes + +| Mode | Source | Description | +|------|--------|-------------| +| `auto` | Cloud CLI detection | Tries `aws`, `gcp`, `azure` CLIs in order; falls back to `static` | +| `aws` | AWS EC2 pricing API | Uses `aws pricing get-products` for on-demand rates | +| `gcp` | GCP billing catalog | Uses `gcloud compute machine-types describe` | +| `azure` | Azure retail prices | Uses `az vm list-sizes` with pricing | +| `static` | Built-in price map | Uses embedded per-vCPU and per-GiB-memory hourly rates | +| `custom:` | User-provided JSON | Reads pricing from a local JSON file | + +### Custom Pricing File Format + +```json +{ + "cpu_hourly": 0.031611, + "memory_gib_hourly": 0.004237, + "storage_gib_monthly": 0.10, + "lb_monthly": 18.25, + "currency": "USD" +} +``` + +--- + +## Input Modes + +### 1) Human Mode (Natural Language) + +Examples: + +- `show me cluster costs` → `{"pricing_mode": "auto", "group_by": "namespace"}` +- `cost breakdown by team label` → `{"group_by": "label:team"}` +- `top 5 most expensive namespaces` → `{"group_by": "namespace", "top": 5}` +- `costs for app=checkout pods` → `{"label_selector": "app=checkout", "group_by": "workload"}` +- `node cost utilization report` → `{"group_by": "node"}` +- `show costs using AWS pricing` → `{"pricing_mode": "aws", "group_by": "namespace"}` +- `show storage waste` → `{"group_by": "namespace"}` +- `how many load balancers are running` → `{"group_by": "namespace"}` +- `show me PVC costs` → `{"group_by": "namespace"}` +- `PVC costs in envoy-gateway-system` → `{"namespace": "envoy-gateway-system", "group_by": "namespace"}` +- `top 5 namespaces by storage cost` → `{"group_by": "namespace", "top": 5}` +- `costs for the monitoring namespace` → 
`{"namespace": "monitoring", "group_by": "namespace"}` + +### 2) Automation Mode (Structured JSON) + +```json +{ + "pricing_mode": "auto", + "group_by": "namespace", + "namespace": "", + "label_selector": "", + "top": 0, + "output_format": "markdown", + "cache_ttl": 300 +} +``` + +--- + +## Execution Workflow + +### Step 0 — Preflight + +Verify cluster access: + +```bash +kubectl cluster-info --request-timeout=5s +``` + +If RBAC denies access, report the error and stop. + +### Step 1 — Collect Node Data + +Fetch all node specs (CPU, memory, instance type, region, labels): + +```bash +kubectl get nodes -o json +``` + +Extract allocatable CPU/memory and instance type labels for pricing. + +### Step 2 — Determine Pricing + +Based on `pricing_mode`: + +1. **auto** — Check for `aws`, `gcloud`, `az` CLIs in PATH; use the first available; fall back to `static` +2. **Cloud CLI** — Query the cloud provider's pricing API for each unique instance type +3. **static** — Use built-in rates ($0.031611/vCPU-hour, $0.004237/GiB-hour based on m5.xlarge on-demand) +4. **custom** — Load rates from the specified JSON file + +Results are cached locally for `cache_ttl` seconds to avoid repeated API calls. + +### Step 3 — Collect Pod Data + +Fetch all running pods with resource requests: + +```bash +kubectl get pods --all-namespaces -o json +``` + +Filter by `label_selector` if provided. + +### Step 3.5 — Collect Storage & LoadBalancer Data + +Fetch PVC, PV, and LoadBalancer service data (best-effort, non-fatal if RBAC denies access): + +```bash +kubectl get pvc --all-namespaces -o json +kubectl get pv -o json +kubectl get svc --all-namespaces -o json +``` + +Extract PVC capacities and storage classes, identify unbound PVs (waste detection), and enumerate LoadBalancer services. Storage costs are computed at `$0.10/GiB/month` (default) and LoadBalancers at `$18.25/month` each. + +### Step 4 — Compute Cost Attribution + +For each pod: + +1. 
Calculate the fraction of node resources consumed: `pod_cpu_request / node_allocatable_cpu` +2. Multiply by the node's hourly cost to get the pod's hourly cost share +3. Extrapolate to monthly cost (730 hours) + +Aggregate costs by the selected `group_by` dimension. + +### Step 5 — Generate Report + +Format results as markdown tables or JSON, sorted by cost descending. + +--- + +## Safety Constraints + +This skill MUST: + +- Be completely read-only — never mutate cluster state +- Only use `kubectl get` commands (`nodes`, `pods`, `pvc`, `pv`, `svc`) — never `apply`, `delete`, `patch`, `exec`, or `scale` +- Never modify RBAC, NetworkPolicy, or Secret resources +- Never access pod filesystems or execute commands in containers +- Cache pricing data locally, never write to cluster +- Handle missing data gracefully (unknown instance types fall back to static pricing) +- Skip nodes with no allocatable resources +- Report errors as JSON to stderr + +--- + +## Autonomous Compatibility + +This skill is designed to be invoked by: + +- Humans via natural language CLI +- Automation pipelines via structured JSON +- Scheduled cost reporting sweeps +- FinOps dashboards via JSON output + +It must: + +- Be idempotent (repeated runs produce consistent results for the same cluster state) +- Produce deterministic results (no LLM-based guessing) +- Generate machine-parseable output for downstream processing diff --git a/forge-skills/local/embedded/k8s-cost-visibility/scripts/k8s-cost-visibility.sh b/forge-skills/local/embedded/k8s-cost-visibility/scripts/k8s-cost-visibility.sh new file mode 100755 index 0000000..c084cae --- /dev/null +++ b/forge-skills/local/embedded/k8s-cost-visibility/scripts/k8s-cost-visibility.sh @@ -0,0 +1,1201 @@ +#!/usr/bin/env bash +# k8s-cost-visibility.sh — Estimate Kubernetes infrastructure costs by querying +# cluster node/pod data via kubectl, applying pricing models, and producing +# cost attribution reports. 
+# +# Usage: ./k8s-cost-visibility.sh '{"pricing_mode":"auto","group_by":"namespace"}' +# +# Requires: kubectl, jq, awk, bc, bash. +set -euo pipefail + +# Flag to prevent duplicate error output (error_json sets this before exit) +__error_handled=0 + +# Catch unexpected exits and emit a JSON error so failures are never silent +trap '__exit_code=$?; if [ $__exit_code -ne 0 ] && [ "$__error_handled" -eq 0 ]; then __msg="{\"error\":\"script exited unexpectedly (code $__exit_code) at line ${LINENO:-unknown}\"}"; echo "$__msg" >&2; echo "$__msg"; fi' EXIT + +############################################################################### +# Constants & Defaults +############################################################################### + +# Default static pricing (based on AWS m5.xlarge on-demand US-East-1) +DEFAULT_CPU_HOURLY="0.031611" +DEFAULT_MEMORY_GIB_HOURLY="0.004237" +DEFAULT_CURRENCY="USD" +DEFAULT_STORAGE_GIB_MONTHLY="0.10" # ~$0.10/GiB/month (Azure Standard SSD / AWS gp3 / GCP pd-balanced) +DEFAULT_LB_MONTHLY="18.25" # ~$0.025/hr ≈ $18.25/month (AWS ALB / Azure Standard LB / GCP forwarding rule) + +# Cache directory +CACHE_DIR="${TMPDIR:-/tmp}/k8s-cost-cache" + +############################################################################### +# Helpers +############################################################################### + +error_json() { + local msg="$1" + __error_handled=1 + echo "{\"error\":\"$msg\"}" >&2 + echo "{\"error\":\"$msg\"}" + exit 1 +} + +json_safe() { + # Escape a string for safe JSON embedding + local s="$1" + echo -n "$s" | jq -Rs '.' +} + +############################################################################### +# Input Parsing & Validation +############################################################################### + +INPUT="${1:-}" +if [ -z "$INPUT" ]; then + error_json "usage: k8s-cost-visibility.sh {\\\"pricing_mode\\\":\\\"auto\\\",\\\"group_by\\\":\\\"namespace\\\"}" +fi + +if ! 
echo "$INPUT" | jq empty 2>/dev/null; then + error_json "invalid JSON input" +fi + +PRICING_MODE=$(echo "$INPUT" | jq -r '.pricing_mode // "auto"') +GROUP_BY=$(echo "$INPUT" | jq -r '.group_by // "namespace"') +LABEL_SELECTOR=$(echo "$INPUT" | jq -r '.label_selector // empty') +TOP_N=$(echo "$INPUT" | jq -r '.top // 0') +OUTPUT_FORMAT=$(echo "$INPUT" | jq -r '.output_format // "markdown"') +CACHE_TTL=$(echo "$INPUT" | jq -r '.cache_ttl // 300') +NAMESPACE=$(echo "$INPUT" | jq -r '.namespace // empty') + +# Validate pricing_mode +# Normalize pricing_mode synonyms +case "$PRICING_MODE" in + auto|default) PRICING_MODE="auto" ;; + aws|amazon) PRICING_MODE="aws" ;; + gcp|google) PRICING_MODE="gcp" ;; + azure|az) PRICING_MODE="azure" ;; + static|on_demand|on-demand|ondemand) PRICING_MODE="static" ;; + custom:*) + CUSTOM_PRICING_FILE="${PRICING_MODE#custom:}" + if [ ! -f "$CUSTOM_PRICING_FILE" ]; then + error_json "custom pricing file not found: $CUSTOM_PRICING_FILE" + fi + if ! jq empty "$CUSTOM_PRICING_FILE" 2>/dev/null; then + error_json "invalid JSON in custom pricing file: $CUSTOM_PRICING_FILE" + fi + PRICING_MODE="custom" + ;; + *) + # Unrecognized mode — fall back to auto-detection rather than failing + PRICING_MODE="auto" + ;; +esac + +# Validate group_by +case "$GROUP_BY" in + namespace|workload|node) ;; + label:*|annotation:*) + GROUP_KEY="${GROUP_BY#*:}" + if [ -z "$GROUP_KEY" ]; then + error_json "group_by '$GROUP_BY' requires a key (e.g., label:team)" + fi + ;; + *) + error_json "invalid group_by '$GROUP_BY': must be namespace, workload, node, label:, or annotation:" + ;; +esac + +# Validate output_format +case "$OUTPUT_FORMAT" in + markdown|json) ;; + *) + error_json "invalid output_format '$OUTPUT_FORMAT': must be markdown or json" + ;; +esac + +# Validate top (must be non-negative integer) +if ! 
echo "$TOP_N" | grep -qE '^[0-9]+$'; then + error_json "invalid top value '$TOP_N': must be a non-negative integer" +fi + +# Validate cache_ttl (must be non-negative integer) +if ! echo "$CACHE_TTL" | grep -qE '^[0-9]+$'; then + error_json "invalid cache_ttl value '$CACHE_TTL': must be a non-negative integer" +fi + +############################################################################### +# Preflight +############################################################################### + +preflight() { + local kc="${KUBECONFIG:-${HOME}/.kube/config}" + if [ ! -f "$kc" ] && [ -z "${KUBECONFIG:-}" ]; then + error_json "no kubeconfig found at ${kc} — set KUBECONFIG or configure kubectl" + fi + + local cluster_err + if ! cluster_err=$(kubectl cluster-info --request-timeout=10s 2>&1); then + error_json "cannot connect to Kubernetes cluster: $(echo "$cluster_err" | head -1 | tr '"' "'")" + fi +} + +############################################################################### +# Cache Functions +############################################################################### + +cache_key() { + local key="$1" + echo "${CACHE_DIR}/${key}" +} + +cache_get() { + local key="$1" + local file + file=$(cache_key "$key") + + if [ "$CACHE_TTL" -eq 0 ]; then + return 1 + fi + + if [ ! 
-f "$file" ]; then + return 1 + fi + + # Check age — use stat with macOS/Linux fallback + local file_age now file_mtime + now=$(date +%s) + file_mtime=$(stat -c %Y "$file" 2>/dev/null || stat -f %m "$file" 2>/dev/null || echo "0") + file_age=$((now - file_mtime)) + + if [ "$file_age" -gt "$CACHE_TTL" ]; then + rm -f "$file" + return 1 + fi + + cat "$file" +} + +cache_set() { + local key="$1" + local value="$2" + local file + file=$(cache_key "$key") + mkdir -p "$CACHE_DIR" + echo "$value" > "$file" +} + +############################################################################### +# Node Data Collection +############################################################################### + +get_node_data() { + local node_json + node_json=$(kubectl get nodes -o json 2>/dev/null) || error_json "Failed to fetch nodes" + + echo "$node_json" | jq '[ + .items[] | + { + name: .metadata.name, + labels: (.metadata.labels // {}), + annotations: (.metadata.annotations // {}), + instance_type: ( + (.metadata.labels // {})["node.kubernetes.io/instance-type"] // + (.metadata.labels // {})["beta.kubernetes.io/instance-type"] // + "unknown" + ), + region: ( + (.metadata.labels // {})["topology.kubernetes.io/region"] // + (.metadata.labels // {})["failure-domain.beta.kubernetes.io/region"] // + "unknown" + ), + allocatable_cpu_milli: ( + .status.allocatable.cpu | + if . == null then 0 + else tostring | + if test("m$") then rtrimstr("m") | tonumber + else tonumber * 1000 + end + end + ), + allocatable_memory_bytes: ( + .status.allocatable.memory | + if . 
== null then 0 + else tostring | + if test("Ki$") then rtrimstr("Ki") | tonumber * 1024 + elif test("Mi$") then rtrimstr("Mi") | tonumber * 1048576 + elif test("Gi$") then rtrimstr("Gi") | tonumber * 1073741824 + else tonumber + end + end + ) + } | + select(.allocatable_cpu_milli > 0) + ]' || error_json "failed to parse node data — cluster may have unexpected node format" +} + +############################################################################### +# Pricing Functions +############################################################################### + +detect_cloud_provider() { + # First, detect from node labels (most reliable — matches actual cluster provider) + local provider_hint + provider_hint=$(kubectl get nodes -o json 2>/dev/null | jq -r ' + .items[0].metadata.labels // {} | + if has("kubernetes.azure.com/os-sku") or has("kubernetes.azure.com/cluster") then "azure" + elif has("eks.amazonaws.com/nodegroup") or has("alpha.eksctl.io/cluster-name") then "aws" + elif has("cloud.google.com/gke-nodepool") or has("cloud.google.com/machine-family") then "gcp" + else "unknown" + end + ' 2>/dev/null || echo "unknown") + + case "$provider_hint" in + aws) + if command -v aws &>/dev/null; then echo "aws"; else echo "static"; fi + ;; + gcp) + if command -v gcloud &>/dev/null; then echo "gcp"; else echo "static"; fi + ;; + azure) + if command -v az &>/dev/null; then echo "azure"; else echo "static"; fi + ;; + *) + # Fallback: check CLIs in order + if command -v aws &>/dev/null; then echo "aws" + elif command -v gcloud &>/dev/null; then echo "gcp" + elif command -v az &>/dev/null; then echo "azure" + else echo "static" + fi + ;; + esac +} + +get_static_pricing() { + jq -n \ + --arg cpu "$DEFAULT_CPU_HOURLY" \ + --arg mem "$DEFAULT_MEMORY_GIB_HOURLY" \ + --arg currency "$DEFAULT_CURRENCY" '{ + cpu_hourly: ($cpu | tonumber), + memory_gib_hourly: ($mem | tonumber), + currency: $currency, + source: "static" + }' +} + +get_custom_pricing() { + jq '. 
+ {source: "custom"}' "$CUSTOM_PRICING_FILE" +} + +get_aws_pricing() { + local instance_type="$1" + local region="${AWS_REGION:-us-east-1}" + local cache_result + + if cache_result=$(cache_get "aws-${region}-${instance_type}" 2>/dev/null); then + echo "$cache_result" + return + fi + + local price_json + if price_json=$(aws pricing get-products \ + --service-code AmazonEC2 \ + --region us-east-1 \ + --filters \ + "Type=TERM_MATCH,Field=instanceType,Value=${instance_type}" \ + "Type=TERM_MATCH,Field=location,Value=$(aws_region_to_location "$region")" \ + "Type=TERM_MATCH,Field=operatingSystem,Value=Linux" \ + "Type=TERM_MATCH,Field=tenancy,Value=Shared" \ + "Type=TERM_MATCH,Field=preInstalledSw,Value=NA" \ + "Type=TERM_MATCH,Field=capacitystatus,Value=Used" \ + --max-results 1 2>/dev/null); then + + local hourly_price + hourly_price=$(echo "$price_json" | jq -r ' + .PriceList[0] // empty | + fromjson | + .terms.OnDemand | to_entries[0].value | + .priceDimensions | to_entries[0].value | + .pricePerUnit.USD // "0" + ' 2>/dev/null || echo "0") + + if [ "$hourly_price" != "0" ] && [ -n "$hourly_price" ]; then + local result + result=$(jq -n --arg price "$hourly_price" --arg itype "$instance_type" '{ + instance_hourly: ($price | tonumber), + instance_type: $itype, + source: "aws" + }') + cache_set "aws-${region}-${instance_type}" "$result" + echo "$result" + return + fi + fi + + # Fallback to static + get_static_pricing +} + +aws_region_to_location() { + local region="$1" + case "$region" in + us-east-1) echo "US East (N. Virginia)" ;; + us-east-2) echo "US East (Ohio)" ;; + us-west-1) echo "US West (N. California)" ;; + us-west-2) echo "US West (Oregon)" ;; + eu-west-1) echo "EU (Ireland)" ;; + eu-west-2) echo "EU (London)" ;; + eu-central-1) echo "EU (Frankfurt)" ;; + ap-southeast-1) echo "Asia Pacific (Singapore)" ;; + ap-northeast-1) echo "Asia Pacific (Tokyo)" ;; + *) echo "US East (N. 
Virginia)" ;;
+    esac
+}
+
+get_gcp_pricing() {
+    local instance_type="$1"
+    local project="${GCP_PROJECT:-}"
+    local cache_result
+
+    if cache_result=$(cache_get "gcp-${instance_type}" 2>/dev/null); then
+        echo "$cache_result"
+        return
+    fi
+
+    if [ -n "$project" ]; then
+        local zone machine_info
+        zone=$(gcloud config get-value compute/zone 2>/dev/null || echo "us-central1-a")
+
+        if machine_info=$(gcloud compute machine-types describe "$instance_type" \
+            --zone="$zone" --project="$project" --format=json 2>/dev/null); then
+
+            local vcpus mem_mb
+            vcpus=$(echo "$machine_info" | jq -r '.guestCpus // 0')
+            mem_mb=$(echo "$machine_info" | jq -r '.memoryMb // 0')
+
+            if [ "$vcpus" -gt 0 ]; then
+                # Use static per-unit pricing with actual vCPU/memory counts
+                local result
+                result=$(jq -n \
+                    --argjson vcpus "$vcpus" \
+                    --argjson mem_mb "$mem_mb" \
+                    --argjson cpu_rate "$DEFAULT_CPU_HOURLY" \
+                    --argjson mem_rate "$DEFAULT_MEMORY_GIB_HOURLY" \
+                    --arg itype "$instance_type" '{
+                        instance_hourly: ($vcpus * $cpu_rate + ($mem_mb / 1024) * $mem_rate),
+                        instance_type: $itype,
+                        source: "gcp"
+                    }')
+                cache_set "gcp-${instance_type}" "$result"
+                echo "$result"
+                return
+            fi
+        fi
+    fi
+
+    get_static_pricing
+}
+
+get_azure_pricing() {
+    local instance_type="$1"
+    local node_region="${2:-eastus}"
+    local cache_result
+
+    if cache_result=$(cache_get "azure-${node_region}-${instance_type}" 2>/dev/null); then
+        echo "$cache_result"
+        return
+    fi
+
+    # Auto-detect subscription if not set
+    local sub="${AZURE_SUBSCRIPTION_ID:-}"
+    if [ -z "$sub" ]; then
+        sub=$(az account show --query 'id' -o tsv 2>/dev/null || echo "")
+    fi
+
+    if [ -n "$sub" ]; then
+        local vm_info
+        local size_name="$instance_type"
+
+        if vm_info=$(az vm list-sizes --location "$node_region" --subscription "$sub" \
+            --query "[?name=='$size_name']" -o json 2>/dev/null); then
+
+            local vcpus mem_mb
+            vcpus=$(echo "$vm_info" | jq -r '.[0].numberOfCores // 0')
+            mem_mb=$(echo "$vm_info" | jq -r '.[0].memoryInMb // 0')
+
+            if [ "$vcpus" -gt 0 ] 2>/dev/null; then
+                local result
+                result=$(jq -n \
+                    --argjson vcpus "$vcpus" \
+                    --argjson mem_mb "$mem_mb" \
+                    --argjson cpu_rate "$DEFAULT_CPU_HOURLY" \
+                    --argjson mem_rate "$DEFAULT_MEMORY_GIB_HOURLY" \
+                    --arg itype "$instance_type" '{
+                        instance_hourly: ($vcpus * $cpu_rate + ($mem_mb / 1024) * $mem_rate),
+                        instance_type: $itype,
+                        source: "azure"
+                    }')
+                cache_set "azure-${node_region}-${instance_type}" "$result"
+                echo "$result"
+                return
+            fi
+        fi
+    fi
+
+    get_static_pricing
+}
+
+get_node_hourly_cost() {
+    local node_json="$1"
+    local mode="$2"
+
+    local instance_type alloc_cpu_milli alloc_mem_bytes node_region
+    instance_type=$(echo "$node_json" | jq -r '.instance_type')
+    alloc_cpu_milli=$(echo "$node_json" | jq -r '.allocatable_cpu_milli')
+    alloc_mem_bytes=$(echo "$node_json" | jq -r '.allocatable_memory_bytes')
+    node_region=$(echo "$node_json" | jq -r '.region')
+
+    case "$mode" in
+        static)
+            # Cost = vCPU_count * cpu_rate + GiB_count * mem_rate
+            echo "$alloc_cpu_milli $alloc_mem_bytes" | awk \
+                -v cpu_rate="$DEFAULT_CPU_HOURLY" \
+                -v mem_rate="$DEFAULT_MEMORY_GIB_HOURLY" '{
+                    vcpus = $1 / 1000
+                    gib = $2 / 1073741824
+                    printf "%.6f\n", vcpus * cpu_rate + gib * mem_rate
+                }'
+            ;;
+        custom)
+            local cpu_hourly mem_hourly
+            cpu_hourly=$(jq -r '.cpu_hourly // 0' "$CUSTOM_PRICING_FILE")
+            mem_hourly=$(jq -r '.memory_gib_hourly // 0' "$CUSTOM_PRICING_FILE")
+            echo "$alloc_cpu_milli $alloc_mem_bytes" | awk \
+                -v cpu_rate="$cpu_hourly" \
+                -v mem_rate="$mem_hourly" '{
+                    vcpus = $1 / 1000
+                    gib = $2 / 1073741824
+                    printf "%.6f\n", vcpus * cpu_rate + gib * mem_rate
+                }'
+            ;;
+        aws)
+            local pricing
+            pricing=$(get_aws_pricing "$instance_type")
+            local instance_hourly
+            instance_hourly=$(echo "$pricing" | jq -r '.instance_hourly // 0')
+            if [ "$instance_hourly" != "0" ] && [ -n "$instance_hourly" ]; then
+                echo "$instance_hourly"
+            else
+                # Fallback to static
+                echo "$alloc_cpu_milli $alloc_mem_bytes" | awk \
+                    -v cpu_rate="$DEFAULT_CPU_HOURLY" \
+                    -v mem_rate="$DEFAULT_MEMORY_GIB_HOURLY" '{
+                        vcpus = $1 / 1000
+                        gib = $2 / 1073741824
+                        printf "%.6f\n", vcpus * cpu_rate + gib * mem_rate
+                    }'
+            fi
+            ;;
+        gcp)
+            local pricing
+            pricing=$(get_gcp_pricing "$instance_type")
+            local instance_hourly
+            instance_hourly=$(echo "$pricing" | jq -r '.instance_hourly // 0')
+            if [ "$instance_hourly" != "0" ] && [ -n "$instance_hourly" ]; then
+                echo "$instance_hourly"
+            else
+                echo "$alloc_cpu_milli $alloc_mem_bytes" | awk \
+                    -v cpu_rate="$DEFAULT_CPU_HOURLY" \
+                    -v mem_rate="$DEFAULT_MEMORY_GIB_HOURLY" '{
+                        vcpus = $1 / 1000
+                        gib = $2 / 1073741824
+                        printf "%.6f\n", vcpus * cpu_rate + gib * mem_rate
+                    }'
+            fi
+            ;;
+        azure)
+            local pricing
+            pricing=$(get_azure_pricing "$instance_type" "$node_region")
+            local instance_hourly
+            instance_hourly=$(echo "$pricing" | jq -r '.instance_hourly // 0')
+            if [ "$instance_hourly" != "0" ] && [ -n "$instance_hourly" ]; then
+                echo "$instance_hourly"
+            else
+                echo "$alloc_cpu_milli $alloc_mem_bytes" | awk \
+                    -v cpu_rate="$DEFAULT_CPU_HOURLY" \
+                    -v mem_rate="$DEFAULT_MEMORY_GIB_HOURLY" '{
+                        vcpus = $1 / 1000
+                        gib = $2 / 1073741824
+                        printf "%.6f\n", vcpus * cpu_rate + gib * mem_rate
+                    }'
+            fi
+            ;;
+    esac
+}
+
+###############################################################################
+# Pod Data Collection
+###############################################################################
+
+get_pod_data() {
+    local POD_DATA
+    local ns_flag="--all-namespaces"
+    [ -n "$NAMESPACE" ] && ns_flag="-n $NAMESPACE"
+    if [[ -n "$LABEL_SELECTOR" ]]; then
+        POD_DATA=$(kubectl get pods $ns_flag -l "$LABEL_SELECTOR" -o json 2>/dev/null) || error_json "Failed to fetch pods"
+    else
+        POD_DATA=$(kubectl get pods $ns_flag -o json 2>/dev/null) || error_json "Failed to fetch pods"
+    fi
+
+    echo "$POD_DATA" | jq '[
+        .items[] |
+        select(.status.phase == "Running") |
+        {
+            name: .metadata.name,
+            namespace: .metadata.namespace,
+            node_name: (.spec.nodeName // "unscheduled"),
+            labels: (.metadata.labels // {}),
+            annotations: (.metadata.annotations // {}),
+            owner_kind: ((.metadata.ownerReferences // [])[0].kind // "standalone"),
+            owner_name: ((.metadata.ownerReferences // [])[0].name // .metadata.name),
+            cpu_request_milli: (
+                [.spec.containers[].resources.requests.cpu // "0" |
+                    tostring |
+                    if test("m$") then rtrimstr("m") | tonumber
+                    elif . == "0" then 0
+                    else tonumber * 1000
+                    end
+                ] | add // 0
+            ),
+            memory_request_bytes: (
+                [.spec.containers[].resources.requests.memory // "0" |
+                    tostring |
+                    if test("Ki$") then rtrimstr("Ki") | tonumber * 1024
+                    elif test("Mi$") then rtrimstr("Mi") | tonumber * 1048576
+                    elif test("Gi$") then rtrimstr("Gi") | tonumber * 1073741824
+                    elif . == "0" then 0
+                    else tonumber
+                    end
+                ] | add // 0
+            )
+        }
+    ]' || error_json "failed to parse pod data — cluster may have unexpected pod format"
+}
+
+###############################################################################
+# Storage & LoadBalancer Data Collection
+###############################################################################
+
+get_pvc_data() {
+    local pvc_json
+    local ns_flag="--all-namespaces"
+    [ -n "$NAMESPACE" ] && ns_flag="-n $NAMESPACE"
+    pvc_json=$(kubectl get pvc $ns_flag -o json 2>/dev/null) || { echo "[]"; return; }
+
+    echo "$pvc_json" | jq '[
+        .items[] |
+        {
+            namespace: .metadata.namespace,
+            name: .metadata.name,
+            storage_class: (.spec.storageClassName // "default"),
+            volume_name: (.spec.volumeName // ""),
+            capacity_bytes: (
+                (.status.capacity.storage // .spec.resources.requests.storage // "0") |
+                tostring |
+                if test("Ti$") then rtrimstr("Ti") | tonumber * 1099511627776
+                elif test("Gi$") then rtrimstr("Gi") | tonumber * 1073741824
+                elif test("Mi$") then rtrimstr("Mi") | tonumber * 1048576
+                elif test("Ki$") then rtrimstr("Ki") | tonumber * 1024
+                elif . == "0" then 0
+                else tonumber
+                end
+            )
+        }
+    ]'
+}
+
+get_unbound_pvs() {
+    local pv_json
+    pv_json=$(kubectl get pv -o json 2>/dev/null) || { echo "[]"; return; }
+
+    echo "$pv_json" | jq '[
+        .items[] |
+        select(.status.phase != "Bound") |
+        {
+            name: .metadata.name,
+            storage_class: (.spec.storageClassName // "default"),
+            reclaim_policy: (.spec.persistentVolumeReclaimPolicy // "Delete"),
+            phase: .status.phase,
+            capacity_bytes: (
+                (.spec.capacity.storage // "0") |
+                tostring |
+                if test("Ti$") then rtrimstr("Ti") | tonumber * 1099511627776
+                elif test("Gi$") then rtrimstr("Gi") | tonumber * 1073741824
+                elif test("Mi$") then rtrimstr("Mi") | tonumber * 1048576
+                elif test("Ki$") then rtrimstr("Ki") | tonumber * 1024
+                elif . == "0" then 0
+                else tonumber
+                end
+            )
+        }
+    ]'
+}
+
+compute_storage_costs() {
+    local pvc_data="$1"
+    local storage_rate="$DEFAULT_STORAGE_GIB_MONTHLY"
+
+    if [ "$PRICING_MODE" = "custom" ] && [ -n "${CUSTOM_PRICING_FILE:-}" ]; then
+        local custom_rate
+        custom_rate=$(jq -r '.storage_gib_monthly // empty' "$CUSTOM_PRICING_FILE" 2>/dev/null || true)
+        if [ -n "$custom_rate" ]; then
+            storage_rate="$custom_rate"
+        fi
+    fi
+
+    echo "$pvc_data" | jq --arg rate "$storage_rate" '[
+        .[] |
+        {
+            namespace: .namespace,
+            pvc_name: .name,
+            storage_class: .storage_class,
+            capacity_gib: (.capacity_bytes / 1073741824),
+            monthly_cost: ((.capacity_bytes / 1073741824) * ($rate | tonumber))
+        }
+    ]'
+}
+
+get_lb_services() {
+    local svc_json
+    local ns_flag="--all-namespaces"
+    [ -n "$NAMESPACE" ] && ns_flag="-n $NAMESPACE"
+    svc_json=$(kubectl get svc $ns_flag -o json 2>/dev/null) || { echo "[]"; return; }
+
+    echo "$svc_json" | jq '[
+        .items[] |
+        select(.spec.type == "LoadBalancer") |
+        {
+            namespace: .metadata.namespace,
+            name: .metadata.name,
+            external_ip: (
+                (.status.loadBalancer.ingress // [])[0] |
+                if . == null then "pending"
+                elif .ip then .ip
+                elif .hostname then .hostname
+                else "pending"
+                end
+            ),
+            port_count: (.spec.ports | length),
+            created: .metadata.creationTimestamp
+        }
+    ]'
+}
+
+compute_lb_costs() {
+    local lb_data="$1"
+    local lb_rate="$DEFAULT_LB_MONTHLY"
+
+    if [ "$PRICING_MODE" = "custom" ] && [ -n "${CUSTOM_PRICING_FILE:-}" ]; then
+        local custom_rate
+        custom_rate=$(jq -r '.lb_monthly // empty' "$CUSTOM_PRICING_FILE" 2>/dev/null || true)
+        if [ -n "$custom_rate" ]; then
+            lb_rate="$custom_rate"
+        fi
+    fi
+
+    echo "$lb_data" | jq --arg rate "$lb_rate" '[
+        .[] |
+        {
+            namespace: .namespace,
+            service_name: .name,
+            external_ip: .external_ip,
+            port_count: .port_count,
+            monthly_cost: ($rate | tonumber)
+        }
+    ]'
+}
+
+###############################################################################
+# Cost Computation
+###############################################################################
+
+compute_costs() {
+    local node_data="$1"
+    local pod_data="$2"
+    local pricing_mode="$3"
+
+    # Build node cost map using jq (safe JSON construction)
+    local node_costs="{}"
+    local node_count
+    node_count=$(echo "$node_data" | jq 'length')
+
+    local i=0
+    while [ "$i" -lt "$node_count" ]; do
+        local node_info node_name hourly_cost
+        node_info=$(echo "$node_data" | jq ".[$i]")
+        node_name=$(echo "$node_info" | jq -r '.name')
+        hourly_cost=$(get_node_hourly_cost "$node_info" "$pricing_mode" 2>/dev/null || echo "")
+
+        # Guard against empty or non-numeric hourly_cost (strict decimal check so
+        # values like "." or "1.2.3" cannot reach jq --argjson below)
+        if [ -z "$hourly_cost" ] || ! echo "$hourly_cost" | grep -qE '^[0-9]+(\.[0-9]+)?$'; then
+            hourly_cost="0"
+        fi
+
+        local alloc_cpu alloc_mem
+        alloc_cpu=$(echo "$node_info" | jq '.allocatable_cpu_milli // 0')
+        alloc_mem=$(echo "$node_info" | jq '.allocatable_memory_bytes // 0')
+
+        node_costs=$(echo "$node_costs" | jq \
+            --arg name "$node_name" \
+            --argjson cost "$hourly_cost" \
+            --argjson cpu "$alloc_cpu" \
+            --argjson mem "$alloc_mem" \
+            '. + {($name): {hourly_cost: $cost, alloc_cpu_milli: $cpu, alloc_mem_bytes: $mem}}')
+
+        i=$((i + 1))
+    done
+
+    # Compute per-pod costs
+    echo "$pod_data" | jq --argjson nodes "$node_costs" '[
+        .[] |
+        . as $pod |
+        ($nodes[$pod.node_name] // null) as $node |
+        if $node == null then
+            . + {hourly_cost: 0, monthly_cost: 0, cost_source: "unscheduled"}
+        else
+            (
+                if $node.alloc_cpu_milli > 0 then
+                    ($pod.cpu_request_milli / $node.alloc_cpu_milli)
+                else 0 end
+            ) as $cpu_fraction |
+            (
+                if $node.alloc_mem_bytes > 0 then
+                    ($pod.memory_request_bytes / $node.alloc_mem_bytes)
+                else 0 end
+            ) as $mem_fraction |
+            (($cpu_fraction + $mem_fraction) / 2 * $node.hourly_cost) as $hourly |
+            . + {
+                hourly_cost: $hourly,
+                monthly_cost: ($hourly * 730),
+                cost_source: "computed",
+                cpu_fraction: $cpu_fraction,
+                mem_fraction: $mem_fraction,
+                node_hourly_cost: $node.hourly_cost
+            }
+        end
+    ]'
+}
+
+###############################################################################
+# Grouping & Aggregation
+###############################################################################
+
+group_costs() {
+    local pod_costs="$1"
+    local group_by="$2"
+    local storage_costs="${3:-[]}"
+    local lb_costs="${4:-[]}"
+
+    case "$group_by" in
+        namespace)
+            echo "$pod_costs" | jq --argjson sc "$storage_costs" --argjson lc "$lb_costs" '
+                # Build storage and LB lookup maps by namespace
+                (if ($sc | length) > 0 then ($sc | group_by(.namespace) | map({key: .[0].namespace, value: {storage_gib: ([.[].capacity_gib] | add // 0), storage_monthly_cost: ([.[].monthly_cost] | add // 0)}}) | from_entries) else {} end) as $storage_map |
+                (if ($lc | length) > 0 then ($lc | group_by(.namespace) | map({key: .[0].namespace, value: {lb_count: length, lb_monthly_cost: ([.[].monthly_cost] | add // 0)}}) | from_entries) else {} end) as $lb_map |
+                # Union all namespaces from pods, storage, and LB sources
+                (([.[] | .namespace] + [$sc[] | .namespace] + [$lc[] | .namespace]) | unique) as $all_ns |
+                # Build pod data lookup map by namespace
+                (if length > 0 then (group_by(.namespace) | map({key: .[0].namespace, value: .}) | from_entries) else {} end) as $pod_map |
+                [
+                    $all_ns[] | . as $ns |
+                    ($pod_map[$ns] // []) as $pods |
+                    {
+                        group_key: $ns,
+                        pod_count: ($pods | length),
+                        total_cpu_milli: ([$pods[].cpu_request_milli] | add // 0),
+                        total_memory_bytes: ([$pods[].memory_request_bytes] | add // 0),
+                        hourly_cost: ([$pods[].hourly_cost] | add // 0),
+                        monthly_cost: ([$pods[].monthly_cost] | add // 0),
+                        storage_gib: (($storage_map[$ns].storage_gib) // 0),
+                        storage_monthly_cost: (($storage_map[$ns].storage_monthly_cost) // 0),
+                        lb_count: (($lb_map[$ns].lb_count) // 0),
+                        lb_monthly_cost: (($lb_map[$ns].lb_monthly_cost) // 0)
+                    }
+                ] | sort_by(-(.monthly_cost + .storage_monthly_cost + .lb_monthly_cost))'
+            ;;
+        workload)
+            echo "$pod_costs" | jq '[
+                group_by(.namespace + "/" + .owner_kind + "/" + .owner_name)[] |
+                {
+                    group_key: (.[0].namespace + "/" + .[0].owner_kind + "/" + .[0].owner_name),
+                    namespace: .[0].namespace,
+                    kind: .[0].owner_kind,
+                    workload_name: .[0].owner_name,
+                    pod_count: length,
+                    total_cpu_milli: ([.[].cpu_request_milli] | add // 0),
+                    total_memory_bytes: ([.[].memory_request_bytes] | add // 0),
+                    hourly_cost: ([.[].hourly_cost] | add // 0),
+                    monthly_cost: ([.[].monthly_cost] | add // 0)
+                }
+            ] | sort_by(-.monthly_cost)'
+            ;;
+        node)
+            echo "$pod_costs" | jq '[
+                group_by(.node_name)[] |
+                {
+                    group_key: .[0].node_name,
+                    pod_count: length,
+                    total_cpu_milli: ([.[].cpu_request_milli] | add // 0),
+                    total_memory_bytes: ([.[].memory_request_bytes] | add // 0),
+                    hourly_cost: ([.[].hourly_cost] | add // 0),
+                    monthly_cost: ([.[].monthly_cost] | add // 0),
+                    node_hourly_cost: (.[0].node_hourly_cost // 0)
+                }
+            ] | sort_by(-.monthly_cost)'
+            ;;
+        label:*)
+            local key="${group_by#label:}"
+            echo "$pod_costs" | jq --arg key "$key" '[
+                group_by(.labels[$key] // "unset")[] |
+                {
+                    group_key: (.[0].labels[$key] // "unset"),
+                    label_key: $key,
+                    pod_count: length,
+                    total_cpu_milli: ([.[].cpu_request_milli] | add // 0),
+                    total_memory_bytes: ([.[].memory_request_bytes] | add // 0),
+                    hourly_cost: ([.[].hourly_cost] | add // 0),
+                    monthly_cost: ([.[].monthly_cost] | add // 0)
+                }
+            ] | sort_by(-.monthly_cost)'
+            ;;
+        annotation:*)
+            local key="${group_by#annotation:}"
+            echo "$pod_costs" | jq --arg key "$key" '[
+                group_by(.annotations[$key] // "unset")[] |
+                {
+                    group_key: (.[0].annotations[$key] // "unset"),
+                    annotation_key: $key,
+                    pod_count: length,
+                    total_cpu_milli: ([.[].cpu_request_milli] | add // 0),
+                    total_memory_bytes: ([.[].memory_request_bytes] | add // 0),
+                    hourly_cost: ([.[].hourly_cost] | add // 0),
+                    monthly_cost: ([.[].monthly_cost] | add // 0)
+                }
+            ] | sort_by(-.monthly_cost)'
+            ;;
+    esac
+}
+
+apply_top_n() {
+    local data="$1"
+    local top_n="$2"
+
+    if [ "$top_n" -gt 0 ]; then
+        echo "$data" | jq --argjson n "$top_n" '.[:$n]'
+    else
+        echo "$data"
+    fi
+}
+
+###############################################################################
+# Report Generation
+###############################################################################
+
+format_cost() {
+    # Format a decimal cost value to 2 decimal places
+    local val="$1"
+    printf "%.2f" "$val"
+}
+
+format_cpu_display() {
+    local milli="$1"
+    if [ "$milli" -ge 1000 ] 2>/dev/null; then
+        echo "$(echo "$milli" | awk '{printf "%.1f", $1/1000}') vCPU"
+    else
+        echo "${milli}m"
+    fi
+}
+
+format_memory_display() {
+    local bytes="$1"
+    local gib
+    gib=$(echo "$bytes" | awk '{printf "%.1f", $1/1073741824}')
+    if echo "$gib" | awk '{exit ($1 >= 1.0) ? 0 : 1}'; then
+        echo "${gib} GiB"
+    else
+        local mib
+        mib=$(echo "$bytes" | awk '{printf "%.0f", $1/1048576}')
+        echo "${mib} MiB"
+    fi
+}
+
+generate_markdown() {
+    local grouped_data="$1"
+    local group_by="$2"
+    local pricing_source="$3"
+    local storage_costs="${4:-[]}"
+    local lb_costs="${5:-[]}"
+    local unbound_pvs="${6:-[]}"
+    local entry_count
+
+    entry_count=$(echo "$grouped_data" | jq 'length')
+    local total_compute_monthly total_storage_monthly total_lb_monthly
+    total_compute_monthly=$(echo "$grouped_data" | jq '[.[].monthly_cost] | add // 0')
+    total_storage_monthly=$(echo "$storage_costs" | jq '[.[].monthly_cost] | add // 0')
+    total_lb_monthly=$(echo "$lb_costs" | jq '[.[].monthly_cost] | add // 0')
+    local total_monthly total_hourly
+    total_monthly=$(echo "$total_compute_monthly $total_storage_monthly $total_lb_monthly" | awk '{printf "%.6f", $1 + $2 + $3}')
+    total_hourly=$(echo "$total_monthly" | awk '{printf "%.6f", $1 / 730}')
+
+    echo "# Kubernetes Cost Report"
+    echo ""
+    echo "**Grouped by:** ${group_by}"
+    echo "**Pricing source:** ${pricing_source}"
+    echo "**Currency:** ${DEFAULT_CURRENCY}"
+    if [ -n "$NAMESPACE" ]; then
+        echo "**Namespace:** ${NAMESPACE}"
+    fi
+    if [ -n "$LABEL_SELECTOR" ]; then
+        echo "**Label filter:** ${LABEL_SELECTOR}"
+    fi
+    echo "**Total hourly:** \$$(format_cost "$total_hourly")"
+    echo "**Total monthly (730h):** \$$(format_cost "$total_monthly")"
+    echo ""
+
+    case "$group_by" in
+        namespace)
+            echo "| Namespace | Pods | CPU Req | Mem Req | Compute \$/mo | Storage \$/mo | LB \$/mo | Total \$/mo | % of Total |"
+            echo "|-----------|------|---------|---------|-------------|-------------|---------|-----------|------------|"
+            echo "$grouped_data" | jq -r --argjson total "$total_monthly" '.[] |
+                (.monthly_cost + .storage_monthly_cost + .lb_monthly_cost) as $row_total |
+                "\(.group_key)\t\(.pod_count)\t\(.total_cpu_milli)\t\(.total_memory_bytes)\t\(.monthly_cost)\t\(.storage_monthly_cost)\t\(.lb_monthly_cost)\t\($row_total)\t\(if $total > 0 then ($row_total / $total * 100) else 0 end)"
+            ' | while IFS=$'\t' read -r gk pods cpu mem compute storage lb total pct; do
+                echo "| ${gk} | ${pods} | $(format_cpu_display "$cpu") | $(format_memory_display "$mem") | \$$(format_cost "$compute") | \$$(format_cost "$storage") | \$$(format_cost "$lb") | \$$(format_cost "$total") | $(printf "%.1f" "$pct")% |"
+            done
+            ;;
+        workload)
+            echo "| Workload | Namespace | Pods | CPU Requests | Memory Requests | Monthly Cost | % of Total |"
+            echo "|----------|-----------|------|-------------|-----------------|-------------|------------|"
+            echo "$grouped_data" | jq -r --argjson total "$total_monthly" '.[] |
+                "\(.kind)/\(.workload_name)\t\(.namespace)\t\(.pod_count)\t\(.total_cpu_milli)\t\(.total_memory_bytes)\t\(.monthly_cost)\t\(if $total > 0 then (.monthly_cost / $total * 100) else 0 end)"
+            ' | while IFS=$'\t' read -r wl ns pods cpu mem monthly pct; do
+                echo "| ${wl} | ${ns} | ${pods} | $(format_cpu_display "$cpu") | $(format_memory_display "$mem") | \$$(format_cost "$monthly") | $(printf "%.1f" "$pct")% |"
+            done
+            ;;
+        node)
+            echo "| Node | Pods | CPU Requests | Memory Requests | Node Cost/hr | Pod Cost/mo | Utilization |"
+            echo "|------|------|-------------|-----------------|-------------|-------------|-------------|"
+            echo "$grouped_data" | jq -r '.[] |
+                "\(.group_key)\t\(.pod_count)\t\(.total_cpu_milli)\t\(.total_memory_bytes)\t\(.node_hourly_cost)\t\(.monthly_cost)\t\(if .node_hourly_cost > 0 then (.hourly_cost / .node_hourly_cost * 100) else 0 end)"
+            ' | while IFS=$'\t' read -r node pods cpu mem node_hr monthly util; do
+                echo "| ${node} | ${pods} | $(format_cpu_display "$cpu") | $(format_memory_display "$mem") | \$$(format_cost "$node_hr") | \$$(format_cost "$monthly") | $(printf "%.1f" "$util")% |"
+            done
+            ;;
+        label:*|annotation:*)
+            local dim_label
+            dim_label=$(echo "$group_by" | cut -d: -f1)
+            local dim_key
+            dim_key=$(echo "$group_by" | cut -d: -f2)
+            echo "| ${dim_label}:${dim_key} | Pods | CPU Requests | Memory Requests | Monthly Cost | % of Total |"
+            echo "|$(printf '%0.s-' {1..20})|------|-------------|-----------------|-------------|------------|"
+            echo "$grouped_data" | jq -r --argjson total "$total_monthly" '.[] |
+                "\(.group_key)\t\(.pod_count)\t\(.total_cpu_milli)\t\(.total_memory_bytes)\t\(.monthly_cost)\t\(if $total > 0 then (.monthly_cost / $total * 100) else 0 end)"
+            ' | while IFS=$'\t' read -r gk pods cpu mem monthly pct; do
+                echo "| ${gk} | ${pods} | $(format_cpu_display "$cpu") | $(format_memory_display "$mem") | \$$(format_cost "$monthly") | $(printf "%.1f" "$pct")% |"
+            done
+            ;;
+    esac
+
+    if [ "$TOP_N" -gt 0 ] && [ "$entry_count" -eq "$TOP_N" ]; then
+        echo ""
+        echo "_Showing top ${TOP_N} entries by cost._"
+    fi
+
+    # LoadBalancer Services section
+    local lb_count
+    lb_count=$(echo "$lb_costs" | jq 'length')
+    if [ "$lb_count" -gt 0 ]; then
+        echo ""
+        echo "## LoadBalancer Services"
+        echo ""
+        echo "| Namespace | Service | External IP | Monthly Cost |"
+        echo "|-----------|---------|-------------|-------------|"
+        echo "$lb_costs" | jq -r '.[] |
+            "\(.namespace)\t\(.service_name)\t\(.external_ip)\t\(.monthly_cost)"
+        ' | while IFS=$'\t' read -r ns svc ip cost; do
+            echo "| ${ns} | ${svc} | ${ip} | \$$(format_cost "$cost") |"
+        done
+        echo ""
+        echo "**Total LoadBalancer cost:** \$$(format_cost "$total_lb_monthly")/month"
+    fi
+
+    # Unbound Persistent Volumes (Waste) section
+    local unbound_count
+    unbound_count=$(echo "$unbound_pvs" | jq 'length')
+    if [ "$unbound_count" -gt 0 ]; then
+        local total_pv_waste
+        total_pv_waste=$(echo "$unbound_pvs" | jq '[.[].monthly_waste] | add // 0')
+        echo ""
+        echo "## Unbound Persistent Volumes (Waste)"
+        echo ""
+        echo "| PV Name | Capacity | Storage Class | Reclaim Policy | Phase | Est. Monthly Waste |"
+        echo "|---------|----------|---------------|----------------|-------|--------------------|"
+        echo "$unbound_pvs" | jq -r '.[] |
+            "\(.name)\t\(.capacity_gib)\t\(.storage_class)\t\(.reclaim_policy)\t\(.phase)\t\(.monthly_waste)"
+        ' | while IFS=$'\t' read -r name cap sc rp phase waste; do
+            echo "| ${name} | $(printf "%.1f" "$cap") GiB | ${sc} | ${rp} | ${phase} | \$$(format_cost "$waste") |"
+        done
+        echo ""
+        echo "**Total estimated waste:** \$$(format_cost "$total_pv_waste")/month"
+    fi
+}
+
+generate_json_output() {
+    local grouped_data="$1"
+    local group_by="$2"
+    local pricing_source="$3"
+    local storage_costs="${4:-[]}"
+    local lb_costs="${5:-[]}"
+    local unbound_pvs="${6:-[]}"
+
+    local total_compute_monthly total_storage_monthly total_lb_monthly
+    total_compute_monthly=$(echo "$grouped_data" | jq '[.[].monthly_cost] | add // 0')
+    total_storage_monthly=$(echo "$storage_costs" | jq '[.[].monthly_cost] | add // 0')
+    total_lb_monthly=$(echo "$lb_costs" | jq '[.[].monthly_cost] | add // 0')
+    local total_monthly total_hourly
+    total_monthly=$(echo "$total_compute_monthly $total_storage_monthly $total_lb_monthly" | awk '{printf "%.6f", $1 + $2 + $3}')
+    total_hourly=$(echo "$total_monthly" | awk '{printf "%.6f", $1 / 730}')
+
+    jq -n \
+        --arg group_by "$group_by" \
+        --arg pricing_source "$pricing_source" \
+        --arg currency "$DEFAULT_CURRENCY" \
+        --arg namespace "${NAMESPACE:-}" \
+        --arg label_selector "$LABEL_SELECTOR" \
+        --argjson total_hourly "$total_hourly" \
+        --argjson total_monthly "$total_monthly" \
+        --argjson total_compute_monthly "$total_compute_monthly" \
+        --argjson total_storage_monthly "$total_storage_monthly" \
+        --argjson total_lb_monthly "$total_lb_monthly" \
+        --argjson top_n "$TOP_N" \
+        --argjson entries "$grouped_data" \
+        --argjson storage_costs "$storage_costs" \
+        --argjson lb_services "$lb_costs" \
+        --argjson unbound_pvs "$unbound_pvs" '{
+            group_by: $group_by,
+            pricing_source: $pricing_source,
+            currency: $currency,
+            namespace: (if $namespace == "" then null else $namespace end),
+            label_selector: (if $label_selector == "" then null else $label_selector end),
+            top_n: (if $top_n == 0 then null else $top_n end),
+            total_hourly_cost: $total_hourly,
+            total_monthly_cost: $total_monthly,
+            total_compute_monthly: $total_compute_monthly,
+            total_storage_monthly: $total_storage_monthly,
+            total_lb_monthly: $total_lb_monthly,
+            entries: $entries,
+            storage_costs: $storage_costs,
+            lb_services: $lb_services,
+            unbound_pvs: $unbound_pvs
+        }'
+}
+
+###############################################################################
+# Main Orchestration
+###############################################################################
+
+main() {
+    # Step 0: Preflight
+    preflight
+
+    # Step 1: Collect node data
+    local node_data
+    node_data=$(get_node_data)
+
+    local node_count
+    node_count=$(echo "$node_data" | jq 'length')
+    if [ "$node_count" -eq 0 ]; then
+        error_json "no nodes found with allocatable resources"
+    fi
+
+    # Step 2: Determine pricing mode
+    local effective_mode="$PRICING_MODE"
+    if [ "$effective_mode" = "auto" ]; then
+        effective_mode=$(detect_cloud_provider)
+    fi
+
+    local pricing_source="$effective_mode"
+
+    # Step 3: Collect pod data
+    local pod_data
+    pod_data=$(get_pod_data)
+
+    local pod_count
+    pod_count=$(echo "$pod_data" | jq 'length')
+
+    # Step 3.5: Collect storage and LoadBalancer data (best-effort)
+    local pvc_data="[]" unbound_pvs="[]" storage_costs="[]"
+    local lb_services="[]" lb_costs="[]"
+
+    if pvc_data=$(get_pvc_data 2>/dev/null); then
+        storage_costs=$(compute_storage_costs "$pvc_data" 2>/dev/null) || storage_costs="[]"
+    else
+        pvc_data="[]"; storage_costs="[]"
+    fi
+
+    if ! unbound_pvs=$(get_unbound_pvs 2>/dev/null); then
+        unbound_pvs="[]"
+    fi
+
+    # Annotate unbound PVs with waste cost
+    local storage_rate="$DEFAULT_STORAGE_GIB_MONTHLY"
+    if [ "$PRICING_MODE" = "custom" ] && [ -n "${CUSTOM_PRICING_FILE:-}" ]; then
+        local cr
+        cr=$(jq -r '.storage_gib_monthly // empty' "$CUSTOM_PRICING_FILE" 2>/dev/null || true)
+        [ -n "$cr" ] && storage_rate="$cr"
+    fi
+    unbound_pvs=$(echo "$unbound_pvs" | jq --arg rate "$storage_rate" '[
+        .[] | . + {
+            capacity_gib: (.capacity_bytes / 1073741824),
+            monthly_waste: ((.capacity_bytes / 1073741824) * ($rate | tonumber))
+        }
+    ]' 2>/dev/null) || unbound_pvs="[]"
+
+    if lb_services=$(get_lb_services 2>/dev/null); then
+        lb_costs=$(compute_lb_costs "$lb_services" 2>/dev/null) || lb_costs="[]"
+    else
+        lb_services="[]"; lb_costs="[]"
+    fi
+
+    # Verify we have at least some data to report
+    local storage_count lb_svc_count unbound_pv_count
+    storage_count=$(echo "$storage_costs" | jq 'length')
+    lb_svc_count=$(echo "$lb_costs" | jq 'length')
+    unbound_pv_count=$(echo "$unbound_pvs" | jq 'length')
+    if [ "$pod_count" -eq 0 ] && [ "$storage_count" -eq 0 ] && [ "$lb_svc_count" -eq 0 ] && [ "$unbound_pv_count" -eq 0 ]; then
+        error_json "no running pods, PVCs, or LoadBalancer services found"
+    fi
+
+    # Step 4: Compute costs (skip if no pods to avoid unnecessary pricing API calls)
+    local pod_costs="[]"
+    if [ "$pod_count" -gt 0 ]; then
+        pod_costs=$(compute_costs "$node_data" "$pod_data" "$effective_mode") || error_json "failed to compute pod costs"
+    fi
+
+    # Step 5: Group and aggregate
+    local grouped
+    grouped=$(group_costs "$pod_costs" "$GROUP_BY" "$storage_costs" "$lb_costs") || error_json "failed to group costs"
+
+    # Apply top N filter
+    grouped=$(apply_top_n "$grouped" "$TOP_N")
+
+    # Step 6: Generate output
+    case "$OUTPUT_FORMAT" in
+        markdown)
+            generate_markdown "$grouped" "$GROUP_BY" "$pricing_source" "$storage_costs" "$lb_costs" "$unbound_pvs"
+            ;;
+        json)
+            generate_json_output "$grouped" "$GROUP_BY" "$pricing_source" "$storage_costs" "$lb_costs" "$unbound_pvs"
+            ;;
+    esac
+}
+
+main
diff --git a/forge-skills/local/registry_embedded_test.go b/forge-skills/local/registry_embedded_test.go
index d0bda32..7247e00 100644
--- a/forge-skills/local/registry_embedded_test.go
+++ b/forge-skills/local/registry_embedded_test.go
@@ -16,12 +16,12 @@ func TestEmbeddedRegistry_DiscoverAll(t *testing.T) {
 		t.Fatalf("List error: %v", err)
 	}
 
-	if len(skills) != 11 {
+	if len(skills) != 12 {
 		names := make([]string, len(skills))
 		for i, s := range skills {
 			names[i] = s.Name
 		}
-		t.Fatalf("expected 11 skills, got %d: %v", len(skills), names)
+		t.Fatalf("expected 12 skills, got %d: %v", len(skills), names)
 	}
 
 	// Verify all expected skills are present
@@ -42,6 +42,7 @@ func TestEmbeddedRegistry_DiscoverAll(t *testing.T) {
 		"codegen-react": {displayName: "Codegen React", hasEnv: false, hasBins: true, hasEgress: true},
 		"codegen-html": {displayName: "Codegen Html", hasEnv: false, hasBins: true, hasEgress: true},
 		"k8s-pod-rightsizer": {displayName: "K8s Pod Rightsizer", hasEnv: false, hasBins: true, hasEgress: false},
+		"k8s-cost-visibility": {displayName: "K8s Cost Visibility", hasEnv: false, hasBins: true, hasEgress: true},
 	}
 
 	for _, s := range skills {