Merge branch 'staging' into feat/credential-manager

Committed by bryan on 2026-01-21 14:33:28 -08:00
44 changed files with 6451 additions and 360 deletions
+6 -1
View File
@@ -57,10 +57,15 @@ __pycache__/
.eggs/
*.egg
# Generated runtime data
core/data/
# Misc
*.local
.cache/
tmp/
temp/
exports/*
core/.agent-builder-sessions/*
+2 -2
View File
@@ -3,9 +3,9 @@
"agent-builder": {
"command": "python",
"args": ["-m", "framework.mcp.agent_builder_server"],
"cwd": "/Users/acho-admin/acho/local-oss/hive/core",
"cwd": "core",
"env": {
"PYTHONPATH": "/Users/acho-admin/acho/local-oss/hive/aden-tools/src"
"PYTHONPATH": "../aden-tools/src"
}
}
}
+16 -2
View File
@@ -29,6 +29,18 @@ import argparse
import os
import sys
# Suppress FastMCP banner in STDIO mode
if "--stdio" in sys.argv:
# Monkey-patch rich Console to redirect to stderr
import rich.console
_original_console_init = rich.console.Console.__init__
def _patched_console_init(self, *args, **kwargs):
kwargs['file'] = sys.stderr # Force all rich output to stderr
_original_console_init(self, *args, **kwargs)
rich.console.Console.__init__ = _patched_console_init
from fastmcp import FastMCP
from starlette.requests import Request
from starlette.responses import PlainTextResponse
@@ -51,7 +63,9 @@ mcp = FastMCP("aden-tools")
# Register all tools with the MCP server, passing credential manager
tools = register_all_tools(mcp, credentials=credentials)
print(f"[MCP] Registered {len(tools)} tools: {tools}")
# Only print to stdout in HTTP mode (STDIO mode requires clean stdout for JSON-RPC)
if "--stdio" not in sys.argv:
    print(f"[MCP] Registered {len(tools)} tools: {tools}")
@mcp.custom_route("/health", methods=["GET"])
@@ -88,7 +102,7 @@ def main() -> None:
args = parser.parse_args()
if args.stdio:
    print("[MCP] Starting with STDIO transport")
    # STDIO mode: only JSON-RPC messages go to stdout
    mcp.run(transport="stdio")
else:
    print(f"[MCP] Starting HTTP server on {args.host}:{args.port}")
@@ -9,9 +9,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def apply_diff(path: str, diff_text: str, workspace_id: str, agent_id: str, session_id: str) -> dict:
"""
Apply a diff to a file within the session sandbox.
Purpose
Apply a structured diff to update a file while preserving context.
Use this when you need to apply structured diff patches to modify file content.
When to use
Larger but still controlled updates
Refactoring structured memory (tables, sections)
Automated compaction or cleanup passes
Rules & Constraints
Diff must be context-aware
Rejected if it touches restricted sections
Prefer apply_patch for small changes
Args:
path: The path to the file (relative to session root)
@@ -9,10 +9,21 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def apply_patch(path: str, patch_text: str, workspace_id: str, agent_id: str, session_id: str) -> dict:
"""
Apply a patch to a file within the session sandbox.
Purpose
Apply a scoped, line-level modification to an existing file.
Use this when you need to apply patch-formatted changes to a file.
This is an alias for apply_diff with the same functionality.
When to use
Update curated canonical memory
Fix or refine existing summaries or facts
Remove duplication or stale information
Rules & Constraints
Patch must be small and targeted
Must preserve unrelated content
Only allowed on approved files and sections
Best practice
Always read the file first. Never patch blindly.
Args:
path: The path to the file (relative to session root)
@@ -10,10 +10,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def execute_command_tool(command: str, workspace_id: str, agent_id: str, session_id: str, cwd: Optional[str] = None) -> dict:
"""
Execute a shell command within the session sandbox.
Purpose
Execute a shell command within the session sandbox.
Use this when you need to run shell commands safely within the sandboxed environment.
Commands are executed with a 60-second timeout.
When to use
Run validators or linters
Generate derived artifacts (indexes, summaries)
Perform controlled maintenance tasks
Rules & Constraints
No network access unless explicitly allowed
No destructive commands (rm -rf, system modification)
Output must be treated as data, not truth
Args:
command: The shell command to execute
@@ -9,10 +9,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def grep_search(path: str, pattern: str, workspace_id: str, agent_id: str, session_id: str, recursive: bool = False) -> dict:
"""
Search for a pattern in a file or directory within the session sandbox.
Purpose
Search for a regex pattern in files within the session sandbox.
Use this when you need to find specific content or patterns in files using regex.
Set recursive=True to search through all subdirectories.
When to use
Find specific content or patterns across files
Locate references to variables, functions, or terms
Search through logs or data files for matching entries
Rules & Constraints
Pattern must be a valid regex expression
Set recursive=True to search through subdirectories
Binary files and permission-denied files are skipped
Args:
path: The path to search in (file or directory, relative to session root)
@@ -8,10 +8,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def list_dir(path: str, workspace_id: str, agent_id: str, session_id: str) -> dict:
"""
List the contents of a directory within the session sandbox.
Purpose
List the contents of a directory within the session sandbox.
Use this when you need to explore directory contents and see what files
and subdirectories exist.
When to use
Explore directory structure and contents
Discover available files and subdirectories
Verify file existence before reading or writing
Rules & Constraints
Path must point to an existing directory
Returns file names, types, and sizes
Does not recurse into subdirectories
Args:
path: The directory path (relative to session root)
@@ -8,10 +8,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def replace_file_content(path: str, target: str, replacement: str, workspace_id: str, agent_id: str, session_id: str) -> dict:
"""
Replace content in a file within the session sandbox.
Purpose
Replace all occurrences of a target string with replacement text in a file.
Use this when you need to perform find-and-replace operations on file content.
All occurrences of the target string will be replaced.
When to use
Fixing repeated errors or typos
Updating deprecated terms or placeholders
Refactoring simple patterns across a file
Rules & Constraints
Target must exist in file
Replacement must be intentional
No regex or complex logic - pure string replacement
Args:
path: The path to the file (relative to session root)
@@ -8,9 +8,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def view_file(path: str, workspace_id: str, agent_id: str, session_id: str) -> dict:
"""
Read the content of a file within the session sandbox.
Purpose
Read the content of a file within the session sandbox.
Use this when you need to view the contents of an existing file.
When to use
Inspect file contents before making changes
Retrieve stored data or configuration
Review logs or artifacts
Rules & Constraints
File must exist at the specified path
Returns full content with size and line count
Always read before patching or modifying
Args:
path: The path to the file (relative to session root)
@@ -8,10 +8,21 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def write_to_file(path: str, content: str, workspace_id: str, agent_id: str, session_id: str, append: bool = False) -> dict:
"""
Write content to a file within the session sandbox.
Purpose
Create a new file or append content to an existing file.
Use this when you need to create a new file or overwrite an existing file.
Set append=True to add content to the end of an existing file.
When to use
Append new events to append-only logs
Create new artifacts or summaries
Initialize new canonical memory files
Rules & Constraints
Must not overwrite canonical memory unless explicitly allowed
Should include structured data (JSON, Markdown with headers)
Every write must be intentional and minimal
Anti-pattern
Do NOT dump raw conversation transcripts without structure or reason.
Args:
path: The path to the file (relative to session root)
+227 -10
View File
@@ -10,9 +10,11 @@ Build goal-driven agents that use LLM reasoning to accomplish tasks.
## Quick Start
1. Define the goal (what success looks like)
2. Add nodes (units of work)
3. Connect with edges (flow between nodes)
4. Validate and test
2. Generate constraint tests from goal → Approve tests
3. Add nodes (units of work) - validate against constraint tests
4. Connect with edges (flow between nodes)
5. Validate and test graph
6. Handoff to testing-agent skill for final evaluation
## Core Concepts
@@ -117,10 +119,15 @@ For each component (goal, node, edge):
```
Agent Build Progress:
GOAL STAGE:
- [ ] Define goal with success criteria → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Define goal constraints → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add entry node → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add each processing node → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Generate constraint tests → ASK APPROVAL (clickable: Approve/Reject/Skip) for each test ✓ (NEW)
AGENT STAGE:
- [ ] Add entry node → TEST NODE → VALIDATE AGAINST CONSTRAINTS → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add each processing node → TEST NODE → VALIDATE AGAINST CONSTRAINTS → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add pause nodes (if HITL needed) → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add resume entry points (for pause nodes) → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add terminal node(s) → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
@@ -129,6 +136,11 @@ Agent Build Progress:
- [ ] Validate full graph → TEST GRAPH → SHOW RESULTS
- [ ] Final approval → ASK APPROVAL (clickable: Approve & Export/Reject/Pause) ✓
- [ ] Export to exports/{agent-name}/
EVAL STAGE (handoff to testing-agent skill):
- [ ] Generate success criteria tests → ASK APPROVAL (clickable: Approve/Reject/Skip) for each test ✓
- [ ] Run all tests (constraint + success criteria)
- [ ] Debug failures and iterate
```
### Testing During Approval
@@ -147,6 +159,31 @@ Show the human:
- What tools are available
- What outputs will be written
**Validate against constraint tests** (if available):
After approving constraint tests, reference them during node development:
```python
# When presenting a node for approval, show constraint alignment:
"""
**NODE: search_node**
Test Results: [test_node output]
Constraint Test Alignment:
✓ test_constraint_api_limits_respected
→ Node uses rate-limited tool wrapper ✓
✓ test_constraint_content_safety_filter
→ Output includes safety_score field ✓
Validation: ✅ PASS
"""
```
**IMPORTANT**: Constraint tests may not fully execute until the agent is complete,
but their test definitions guide node design. Review the test code to ensure
your nodes handle the constraint scenarios.
**Before final approval**, use `test_graph` to simulate full execution:
```
test_graph(
@@ -425,6 +462,7 @@ Goal(
description="What the agent must NOT do",
constraint_type="hard", # hard = must not violate
category="safety",
check="llm_judge", # Optional: how to validate ("llm_judge", expression, or function)
),
],
)
@@ -433,6 +471,98 @@ Goal(
**Good goals**: Specific, measurable, constrained
**Bad goals**: Vague, unmeasurable, no boundaries
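For example (illustrative): "Find 3-5 relevant YouTube videos on a topic without exceeding API rate limits" is specific and measurable; "help users find good videos" is neither.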
## Constraint Test Generation
**CRITICAL**: After approving the goal, generate constraint tests BEFORE building nodes.
Constraint tests verify that the agent will respect its defined constraints (safety, rate limits, etc.).
These tests are **agent-agnostic** - they test boundaries, not implementation. This means they can be
generated before any nodes exist.
### Why Generate Tests Before Building?
1. **Early Validation**: Catch constraint violations during node development, not after
2. **Design Guidance**: Tests make constraints concrete and testable
3. **Incremental Feedback**: Review constraint tests while designing each node
### Generation Workflow
```python
# 1. After goal is approved, generate constraint tests
result = generate_constraint_tests(
goal_id=goal_data["id"],
goal_json=json.dumps(goal_data)
)
# 2. Tests are returned with PENDING status
# The MCP tool returns approval_required=True
# 3. Display each test to the human for approval, e.g.:
#    [1/3] test_constraint_api_limits_respected
#    Constraint: api_limits
#    Confidence: 88%
#    def test_constraint_api_limits_respected(agent):
#        ...
#    [a]pprove  [r]eject  [e]dit  [s]kip
# 4. Use AskUserQuestion with approval options
AskUserQuestion(
    questions=[{
        "question": "Do you approve this constraint test?",
        "header": "Test Approval",
        "options": [
            {"label": "✓ Approve (Recommended)", "description": "Test looks good"},
            {"label": "✗ Reject", "description": "Test is invalid"},
            {"label": "✎ Edit", "description": "Modify before accepting"},
            {"label": "⏭ Skip", "description": "Decide later"}
        ],
        "multiSelect": False
    }]
)
# 5. Call approve_tests with the decisions
approve_tests(
goal_id=goal_data["id"],
approvals='[{"test_id": "...", "action": "approve"}, ...]'
)
# 6. Verify no pending tests before proceeding to nodes
pending = get_pending_tests(goal_id=goal_data["id"])
if json.loads(pending)["pending_count"] > 0:
    # Prompt user to handle remaining tests
    print("⚠️ Pending tests must be resolved before building nodes")
```
### Approval Rules
- **All tests must be reviewed** - no auto-approval
- **Approved/Modified tests are stored** for use during node validation
- **Rejected tests are not stored** (with reason tracked)
- **Skipped tests remain pending** - must be resolved before export
### Using Constraint Tests During Node Building
Once constraint tests are approved, reference them when designing nodes:
```python
# Before adding a node that makes API calls, review constraint tests:
"""
Creating node: search_node (llm_tool_use)
Tools: youtube_search, video_details
Constraint Test Review:
✓ test_constraint_api_limits_respected - checks rate limits
→ Ensure search_node handles rate limit responses gracefully
✓ test_constraint_content_safety_filter - checks safe content
→ Ensure output_keys include safety flags for filtering
"""
```
## Integrating External Tools (MCP Servers)
Before adding nodes, you can register MCP servers to make their tools available to your agent.
@@ -772,11 +902,29 @@ analyze → needs_clarification? → YES → request-clarification (PAUSE)
| `export_graph` | Export the completed agent |
| `get_session_status` | View current build progress |
### Testing Tools (for HITL approval)
| Tool | Purpose |
|------|---------|
| `test_node` | Run a single node with sample inputs to show behavior |
| `test_graph` | Simulate full graph execution to show the complete flow |
### Testing Tools by Stage
#### Goal Stage (this skill) - Generate constraint tests
| Tool | Purpose | When to Use |
|------|---------|-------------|
| `generate_constraint_tests` | Generate tests from constraints | Immediately after goal approval |
| `approve_tests` | Approve/reject/modify tests | After generation, before building nodes |
| `get_pending_tests` | List tests awaiting approval | Before proceeding to node building |
#### Agent Stage (this skill) - Build and validate nodes
| Tool | Purpose | When to Use |
|------|---------|-------------|
| `test_node` | Run a single node with sample inputs | Before each node approval |
| `test_graph` | Simulate full graph execution | Before final approval |
#### Eval Stage (testing-agent skill) - Final evaluation
| Tool | Purpose | When to Use |
|------|---------|-------------|
| `generate_success_tests` | Generate tests from success criteria | After agent export |
| `run_tests` | Run all tests in parallel | After test approval |
| `debug_test` | Debug failed tests | After test failures |
See the [testing-agent skill](../testing-agent/SKILL.md) for the full Eval stage workflow.
## Using the Exported Agent
@@ -917,3 +1065,72 @@ result = await runner.run(context)
```
For complete API details, see [reference/api.md](reference/api.md).
## Handoff to Testing-Agent Skill
After exporting the agent, switch to the **testing-agent** skill for final evaluation (Eval Stage).
### What Transfers
1. **Goal definition** (with constraints and success criteria)
2. **Approved constraint tests** (generated in Goal Stage)
3. **Exported agent** at `exports/{agent-name}/`
### What Happens in Testing-Agent
1. Generate **success criteria tests** (these need agent details, so generated after build)
2. Run **all tests** (constraint + success criteria) in parallel
3. Debug failures and categorize errors
4. Iterate based on error type
### Triggering the Handoff
After `export_graph` completes successfully, display:
```
✅ Agent exported to exports/{agent-name}/
Next Steps (Eval Stage):
1. Switch to testing-agent skill
2. Generate success criteria tests
3. Run full evaluation
4. Debug any failures
Command: "Run /testing-agent for exports/{agent-name}"
```
### Error Category Routing
If tests fail in the Eval stage, the error category determines where to go:
| Error Category | Meaning | Action |
|---------------|---------|--------|
| `LOGIC_ERROR` | Goal definition is wrong | Return to Goal Stage - update goal, regenerate constraint tests |
| `IMPLEMENTATION_ERROR` | Code bug in nodes/edges | Return to Agent Stage - fix nodes/edges, re-export |
| `EDGE_CASE` | New scenario discovered | Stay in Eval Stage - add edge case test, continue |
### Flow Diagram
```
┌──────────────────────────────────────────────────────────────┐
│ GOAL STAGE (building-agents skill) │
│ 1. Define success_criteria and constraints → APPROVE │
│ 2. Generate CONSTRAINT TESTS from constraints │
│ 3. APPROVE each constraint test │
└──────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────┐
│ AGENT STAGE (building-agents skill) │
│ 1. Add nodes - review constraint tests for design guidance │
│ 2. Test each node - validate against constraint expectations│
│ 3. Connect edges → Validate graph → Export │
└──────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────┐
│ EVAL STAGE (testing-agent skill) │
│ 1. Generate SUCCESS_CRITERIA TESTS → APPROVE │
│ 2. Run ALL tests (constraint + success criteria) │
│ 3. Debug failures → Categorize errors │
│ 4. Route back based on error category (if needed) │
└──────────────────────────────────────────────────────────────┘
```
+625
View File
@@ -0,0 +1,625 @@
---
name: testing-agent
description: Run goal-based evaluation tests for agents. Use when you need to verify an agent meets its goals, debug failing tests, or iterate on agent improvements based on test results.
---
# Testing Agents
Run goal-based evaluation tests for agents built with the building-agents skill.
## Quick Start
1. **Check existing state first** - See if tests already exist
2. Generate tests from goal (only if needed)
3. Approve tests (mandatory human approval)
4. Run tests against agent
5. Debug failures and iterate
## Check Existing State First
**CRITICAL**: Before generating any tests, ALWAYS check if tests already exist for the goal.
```python
# Check what tests exist for this goal
result = list_tests(goal_id="youtube-research")
# Returns:
{
"goal_id": "youtube-research",
"total": 42,
"by_status": {
"pending": 10,
"approved": 30,
"modified": 2,
"rejected": 0
},
"by_type": {
"constraint": 15,
"success_criteria": 25,
"edge_case": 2
},
"tests": [...] # List of test summaries
}
```
### Decision Tree
Based on existing state, choose the right action:
```
list_tests(goal_id) → Check existing tests
┌───────┴────────────────────────────────────────┐
│ │
No tests exist Tests exist
│ │
↓ ┌─────────┴─────────┐
Generate tests │ │
(constraint first, Has pending All approved
then success_criteria) tests │
│ ↓
↓ Run tests
Approve pending directly
tests first
```
### Resuming a Testing Session
When the user asks to test an agent that may have been tested before:
1. **Always check first**: `list_tests(goal_id="...")`
2. **Show the user what exists**:
- "Found 42 existing tests: 30 approved, 10 pending, 2 modified"
- "Last run: 28/30 passed (93.3%)"
3. **Ask what they want to do**:
```python
AskUserQuestion(
    questions=[{
        "question": "Tests already exist for this agent. What would you like to do?",
        "header": "Existing Tests",
        "options": [
            {
                "label": "Run existing tests (Recommended)",
                "description": "Run the 32 approved tests against the agent"
            },
            {
                "label": "Approve pending tests",
                "description": "Review and approve the 10 pending tests first"
            },
            {
                "label": "Regenerate all tests",
                "description": "Delete existing and generate fresh tests (loses approvals)"
            },
            {
                "label": "Show test details",
                "description": "List all tests with their status and last results"
            }
        ],
        "multiSelect": False
    }]
)
```
### Why This Matters
- **Saves time**: Approved tests don't need re-approval
- **Preserves work**: User's previous approvals/modifications are kept
- **Clear state**: User knows exactly what exists before taking action
- **Prevents duplicates**: Won't generate tests that already exist
## Core Concepts
**Test Types**: Three types of tests, generated at different stages:
- `constraint` - Generated during Goal stage (agent-agnostic boundaries)
- `success_criteria` - Generated during Eval stage (after agent exists)
- `edge_case` - Generated when new scenarios discovered during debugging
**Approval**: All LLM-generated tests require explicit user approval before running.
**Error Categories**: Failed tests are categorized to guide iteration:
- `LOGIC_ERROR` - Goal definition is wrong → Update goal, restart full flow
- `IMPLEMENTATION_ERROR` - Code bug → Fix agent, re-run Eval
- `EDGE_CASE` - New scenario discovered → Add test, continue Eval
**Iteration**: Each error category has a specific fix path (see Error Categorization section).
## Workflow (HITL Required)
**CRITICAL**: Each step requires human approval before proceeding.
**CRITICAL**: Use structured questions (AskUserQuestion) with fallback to text mode.
### Approval Strategy
**Always try structured questions first**, with graceful fallback:
1. **Attempt**: Call AskUserQuestion with clickable options
2. **Catch**: If tool fails/rejected, fall back to text prompt
3. **Parse**: Accept text input like "approve", "reject", "skip"
This ensures the workflow works in all environments (VSCode extension, CLI, web).
### Test Loop
```
For each test generated:
1. DISPLAY → Show the test details to the human
2. VALIDATE → Check test syntax and structure
3. ASK APPROVAL → Use AskUserQuestion with clickable options
4. Only run tests after approval
```
### Checklist (ask approval at each check)
```
Agent Testing Progress:
- [ ] Load goal and agent → VERIFY PATHS
- [ ] CHECK EXISTING TESTS → list_tests, show stats, ask what to do
- [ ] If no tests OR user wants fresh: Generate tests → ASK APPROVAL
- [ ] If pending tests exist: Approve pending tests first
- [ ] Run all approved tests → SHOW RESULTS
- [ ] Debug failed tests → SHOW CATEGORIZATION
- [ ] Iterate based on category → ASK APPROVAL for changes
```
## The Three-Stage Flow
```
┌─────────────────────────────────────────────────────────────────────────┐
│ GOAL STAGE │
│ 1. Define success_criteria and constraints (building-agents skill) │
│ 2. Generate CONSTRAINT TESTS → USER APPROVAL → tests stored │
└─────────────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────┐
│ AGENT STAGE │
│ Build nodes + edges (building-agents skill) │
│ Constraint tests can run during development for early feedback │
└─────────────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────┐
│ EVAL STAGE (this skill) │
│ 1. Generate SUCCESS_CRITERIA TESTS → USER APPROVAL → tests stored │
│ 2. Run all tests in parallel → pass/fail summary │
│ 3. On failure → Debug tool with categorization │
│ 4. Iterate based on error category │
└─────────────────────────────────────────────────────────────────────────┘
```
## Test Generation
### When to Generate Each Type
| Test Type | When Generated | Why |
|-----------|----------------|-----|
| **Constraint Tests** | During Goal stage (before agent exists) | Constraints are agent-agnostic boundaries |
| **Success Criteria Tests** | During Eval stage (after agent exists) | May depend on agent flow/nodes |
| **Edge Case Tests** | During debugging (when new scenario found) | Discovered through test failures |
### Generating Tests
```python
import json
# 1. Generate constraint tests (Goal stage)
result = generate_constraint_tests(
goal_id="youtube-research",
goal_json=json.dumps({
"id": "youtube-research",
"name": "YouTube Research Agent",
"description": "Find relevant YouTube videos on a topic",
"success_criteria": [
{
"id": "find_videos",
"description": "Find 3-5 relevant videos",
"metric": "video_count",
"target": "3-5",
"weight": 1.0
}
],
"constraints": [
{
"id": "api_limits",
"description": "Must respect YouTube API rate limits",
"constraint_type": "hard",
"category": "reliability",
"check": "llm_judge" # Optional: how to validate
}
]
})
)
# 2. Generate success criteria tests (Eval stage, after agent built)
result = generate_success_tests(
goal_id="youtube-research",
goal_json='...', # Same structure as above
node_names="search_node,filter_node,format_node",
tool_names="youtube_search,video_details"
)
```
**After generation**, tests are stored as PENDING. They must be approved before running.
## Approval Patterns
### Interactive Approval Flow
```
┌─────────────────────────────────────────────────────────────────┐
│ Generated Tests for: youtube-research (3 tests) │
├─────────────────────────────────────────────────────────────────┤
│ [1/3] test_find_videos_happy_path │
│ Type: SUCCESS_CRITERIA │
│ Confidence: 92% │
│ Input: {"topic": "machine learning tutorials"} │
│ Expected: 3-5 videos with titles and IDs │
│ │
│ def test_find_videos_happy_path(agent): │
│ result = agent.run({"topic": "machine learning"}) │
│ assert 3 <= len(result.videos) <= 5 │
│ assert all(v.title for v in result.videos) │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
└─────────────────────────────────────────────────────────────────┘
```
### Approval Actions
| Action | Description | Result |
|--------|-------------|--------|
| **approve** | Accept test as-is | Status → APPROVED, test will run |
| **reject** | Decline with reason | Status → REJECTED, test won't run |
| **edit** | Modify code before accepting | Status → MODIFIED, original preserved |
| **skip** | Leave for later | Status → PENDING, decide later |
### Approval Code Pattern
```python
# After generating tests, approve them
result = approve_tests(
goal_id="youtube-research",
approvals='''[
    {"test_id": "test_001", "action": "approve"},
    {"test_id": "test_002", "action": "modify", "modified_code": "def test_..."},
    {"test_id": "test_003", "action": "reject", "reason": "Not a valid scenario"},
    {"test_id": "test_004", "action": "skip"}
]'''
)
```
### Structured Approval Questions
```python
# Try structured approval first
try:
    response = AskUserQuestion(
        questions=[{
            "question": "Do you approve this test?",
            "header": "Test Approval",
            "options": [
                {
                    "label": "Approve (Recommended)",
                    "description": "Test looks good, include in test suite"
                },
                {
                    "label": "Reject",
                    "description": "Test is invalid or unnecessary"
                },
                {
                    "label": "Edit",
                    "description": "Modify the test code before accepting"
                },
                {
                    "label": "Skip",
                    "description": "Decide later, leave as pending"
                }
            ],
            "multiSelect": False
        }]
    )
except Exception:
    # Fallback to text mode
    print("Do you approve this test? Type: approve | reject | edit | skip")
```
## Test Execution
### Parallel Configuration
```python
# Tests run in parallel with these defaults
ParallelConfig(
num_workers=cpu_count(), # Use all CPU cores
timeout_per_test=60.0, # 60 seconds per test
fail_fast=False, # Run all tests, don't stop on first failure
mode="loadfile", # Group tests by parent_criteria_id
)
```
### Running Tests
```python
# Run all approved tests
result = run_tests(
goal_id="youtube-research",
agent_path="exports/youtube-agent",
test_types='["all"]', # or ["constraint", "success_criteria", "edge_case"]
parallel=4, # Number of workers
fail_fast=False # Run all tests
)
# Result structure
{
"goal_id": "youtube-research",
"overall_passed": false,
"summary": {
"total": 15,
"passed": 12,
"failed": 3,
"pass_rate": "80.0%"
},
"duration_ms": 5432,
"results": [
{"test_id": "test_001", "passed": true, "duration_ms": 234},
{"test_id": "test_002", "passed": false, "duration_ms": 567, "error_category": "IMPLEMENTATION_ERROR"},
...
]
}
```
### Execution Flow
1. Load only APPROVED and MODIFIED tests (skip PENDING and REJECTED)
2. Group tests by `parent_criteria_id` for shared fixture setup
3. Run groups in parallel with process isolation
4. Aggregate results with timing information
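A minimal sketch of this flow, assuming tests are plain dicts with a picklable callable under `"fn"`; the framework's `ParallelTestRunner` is the real implementation and may differ:
```python
import time
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor

def _run_group(group):
    # Each group runs in its own worker process for isolation.
    results = []
    for test in group:
        start = time.monotonic()
        try:
            test["fn"]()  # the approved test callable
            passed = True
        except Exception:
            passed = False
        results.append({
            "test_id": test["id"],
            "passed": passed,
            "duration_ms": int((time.monotonic() - start) * 1000),
        })
    return results

def run_approved(tests, num_workers=4):
    # 1. Load only APPROVED and MODIFIED tests.
    runnable = [t for t in tests if t["status"] in ("approved", "modified")]
    # 2. Group by parent_criteria_id so related tests share fixture setup.
    groups = defaultdict(list)
    for t in runnable:
        groups[t["parent_criteria_id"]].append(t)
    # 3. Run groups in parallel; 4. aggregate results with timing info.
    with ProcessPoolExecutor(max_workers=num_workers) as pool:
        return [r for rs in pool.map(_run_group, groups.values()) for r in rs]
```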
## Error Categorization & Iteration
### Decision Tree
```
Test Fails → Categorize Error
┌───────────┴─────────────────┬────────────────────┐
│ │ │
LOGIC ERROR IMPLEMENTATION ERROR EDGE CASE
(criteria wrong) (code bug) (new scenario)
│ │ │
↓ ↓ ↓
Update goal Fix nodes/edges Generate new
success_criteria in Agent stage edge case test
↓ ↓ │
FULL 3-STEP Re-run Eval Continue in
FLOW RESTART (skip Goal stage) Eval stage
```
### Pattern-Based Heuristics
The categorizer uses these patterns to classify errors:
**LOGIC_ERROR** (goal definition is wrong):
- "goal not achieved"
- "constraint violated: core"
- "fundamental assumption"
- "success criteria mismatch"
- "expected behavior incorrect"
**IMPLEMENTATION_ERROR** (code bug in agent):
- TypeError, AttributeError, KeyError, ValueError
- "tool call failed"
- "node execution error"
- "assertion failed"
- "null pointer", "undefined"
**EDGE_CASE** (new scenario discovered):
- "boundary condition"
- "timeout", "rate limit"
- "empty result", "no results"
- "unexpected format"
- "rare input", "unusual"
### Iteration Guidance
```python
# After categorization, you get guidance
{
"error_category": "IMPLEMENTATION_ERROR",
"iteration_guidance": {
"stage": "Agent",
"action": "Fix the code in nodes/edges",
"restart_required": false,
"description": "The goal is correct, but the implementation has a bug. Fix the agent code and re-run Eval."
}
}
```
| Category | Go To Stage | Restart Required | Action |
|----------|-------------|------------------|--------|
| LOGIC_ERROR | Goal | Yes | Update success_criteria/constraints, rebuild agent |
| IMPLEMENTATION_ERROR | Agent | No | Fix nodes/edges, re-run Eval only |
| EDGE_CASE | Eval | No | Generate edge case test, continue in Eval |
## Debugging Failed Tests
### Debug Tool
```python
# Get detailed debug info for a failed test
result = debug_test(
goal_id="youtube-research",
test_id="test_find_videos_no_results"
)
# Returns comprehensive debug info
{
"test_id": "test_find_videos_no_results",
"test_name": "test_find_videos_no_results",
"input": {"topic": "xyzabc123nonsense"},
"expected": {"videos": [], "message": "No results found"},
"actual": {"error": "NullPointerException at node_3"},
"passed": false,
"error_message": "TypeError: 'NoneType' has no attribute 'get'",
"error_category": "IMPLEMENTATION_ERROR",
"stack_trace": "Traceback (most recent call last):\n ...",
"logs": [
{"timestamp": "...", "node": "search_node", "level": "INFO", "msg": "..."},
{"timestamp": "...", "node": "filter_node", "level": "ERROR", "msg": "..."}
],
"runtime_data": {
"execution_path": ["start", "search_node", "filter_node"],
"node_outputs": {...}
},
"suggested_fix": "Check null handling in filter_node when no results returned",
"iteration_guidance": {
"stage": "Agent",
"action": "Fix the code in nodes/edges",
"restart_required": false
}
}
```
### Debug Workflow
1. **Run all tests** → Get pass/fail summary
2. **Select failed test** → Get detailed DebugInfo
3. **Review categorization** → Understand error type
4. **Check suggested fix** → Get actionable guidance
5. **Follow iteration guidance** → Go to correct stage
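A sketch wiring these steps together, assuming `run_tests` and `debug_test` return JSON strings shaped like the examples above:
```python
import json

# Steps 1-2: run the suite, then pull debug info for each failure.
suite = json.loads(run_tests(
    goal_id="youtube-research",
    agent_path="exports/youtube-agent",
    test_types='["all"]',
))
for r in suite["results"]:
    if not r["passed"]:
        info = json.loads(debug_test(goal_id="youtube-research", test_id=r["test_id"]))
        # Steps 3-5: review category and suggested fix, then follow iteration_guidance.
        print(r["test_id"], info["error_category"], "->", info["suggested_fix"])
```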
## Example: Testing YouTube Agent
See [examples/testing-youtube-agent.md](examples/testing-youtube-agent.md) for a complete walkthrough.
## Common Patterns
### Happy Path Tests
Test normal successful execution with valid inputs:
```python
def test_find_videos_happy_path(agent):
    result = agent.run({"topic": "python tutorials"})
    assert result.success
    assert len(result.videos) >= 3
    assert all(v.title for v in result.videos)
```
### Boundary Condition Tests
Test exactly at target thresholds:
```python
def test_find_videos_minimum_count(agent):
    result = agent.run({"topic": "very specific niche topic"})
    assert len(result.videos) >= 1  # At least one result
```
### Error Handling Tests
Test graceful handling of failures:
```python
def test_find_videos_invalid_input(agent):
    result = agent.run({"topic": ""})  # Empty input
    assert not result.success or result.message == "Invalid input"
```
### Constraint Violation Tests
Test that constraints are respected:
```python
def test_api_rate_limit_respected(agent):
    # Run multiple times quickly
    for _ in range(5):
        result = agent.run({"topic": "test"})
        # Should not hit rate limit errors
        assert "rate limit" not in str(result).lower()
```
## Anti-Patterns
| Don't | Do Instead |
|-------|------------|
| Auto-approve tests | Always require explicit user approval |
| Run PENDING/REJECTED tests | Only run APPROVED/MODIFIED tests |
| Generate success tests during Goal stage | Wait until agent exists |
| Treat all failures the same | Categorize and iterate appropriately |
| Restart full flow for IMPLEMENTATION_ERROR | Fix agent, re-run Eval only |
| Add test for LOGIC_ERROR | Fix the goal definition instead |
| Ignore confidence scores | Review low-confidence categorizations manually |
| Skip the approval step | Tests must be reviewed before running |
## Tools Reference
### Testing Tools
| Tool | Purpose | When to Use |
|------|---------|-------------|
| `generate_constraint_tests` | Generate tests from goal constraints | Goal stage |
| `generate_success_tests` | Generate tests from success criteria | Eval stage (after agent built) |
| `approve_tests` | Approve/reject/modify generated tests | After generation |
| `run_tests` | Execute tests in parallel | After approval |
| `debug_test` | Analyze failed test with categorization | After test fails |
| `list_tests` | List tests for a goal by status | Anytime |
| `get_pending_tests` | Get tests awaiting approval | Before approval |
### Building Tools (for iteration)
When iteration requires modifying the agent, use these from the building-agents skill:
| Tool | Purpose | When to Use |
|------|---------|-------------|
| `set_goal` | Update goal definition | LOGIC_ERROR iteration |
| `add_node` | Add or modify nodes | IMPLEMENTATION_ERROR iteration |
| `add_edge` | Add or modify edges | IMPLEMENTATION_ERROR iteration |
| `validate_graph` | Validate changes | After any modification |
| `export_graph` | Re-export agent | After fixes complete |
## CLI Commands
```bash
# Generate tests from goal
python -m core test-generate goal.json --type all
# Interactive approval of pending tests
python -m core test-approve <goal_id>
# Run tests for an agent
python -m core test-run <agent_path> --goal <goal_id> --parallel 4
# Debug a failed test
python -m core test-debug <goal_id> <test_id>
# List tests by status
python -m core test-list <goal_id> --status approved
# Show test statistics
python -m core test-stats <goal_id>
```
## Integration with building-agents
### Handoff Points
| Scenario | From | To | Action |
|----------|------|-----|--------|
| Agent built, ready to test | building-agents | testing-agent | Generate success tests |
| LOGIC_ERROR found | testing-agent | building-agents | Update goal, rebuild |
| IMPLEMENTATION_ERROR found | testing-agent | building-agents | Fix nodes/edges |
| EDGE_CASE found | testing-agent | testing-agent | Generate edge case test |
| All tests pass | testing-agent | Done | Agent is validated |
### When to Switch Skills
**Use building-agents when:**
- Defining goals and constraints
- Building agent nodes and edges
- Fixing LOGIC_ERROR or IMPLEMENTATION_ERROR
**Use testing-agent when:**
- Generating tests from goals
- Approving and running tests
- Debugging failures
- Categorizing errors
### Shared Patterns
Both skills use:
- AskUserQuestion with structured options
- HITL at every critical step
- Fallback to text mode when widgets unavailable
- Session state management for continuity
@@ -0,0 +1,348 @@
# Example: Testing a YouTube Research Agent
This example walks through testing a YouTube research agent that finds relevant videos based on a topic.
## Prerequisites
- Agent built with building-agents skill at `exports/youtube-research/`
- Goal defined with success criteria and constraints
## Step 1: Load the Goal
First, load the goal that was defined during the Goal stage:
```json
{
"id": "youtube-research",
"name": "YouTube Research Agent",
"description": "Find relevant YouTube videos on a given topic",
"success_criteria": [
{
"id": "find_videos",
"description": "Find 3-5 relevant videos",
"metric": "video_count",
"target": "3-5",
"weight": 1.0
},
{
"id": "relevance",
"description": "Videos must be relevant to the topic",
"metric": "relevance_score",
"target": ">0.8",
"weight": 0.8
}
],
"constraints": [
{
"id": "api_limits",
"description": "Must not exceed YouTube API rate limits",
"constraint_type": "hard",
"category": "technical"
},
{
"id": "content_safety",
"description": "Must filter out inappropriate content",
"constraint_type": "hard",
"category": "safety"
}
]
}
```
## Step 2: Generate Constraint Tests
During the Goal stage (or early Eval), generate tests for constraints:
```python
result = generate_constraint_tests(
goal_id="youtube-research",
goal_json='<goal JSON above>'
)
```
**Generated tests (awaiting approval):**
```
┌─────────────────────────────────────────────────────────────────┐
│ Generated Constraint Tests (2 tests) │
├─────────────────────────────────────────────────────────────────┤
│ [1/2] test_constraint_api_limits_respected │
│ Constraint: api_limits │
│ Confidence: 88% │
│ │
│ def test_constraint_api_limits_respected(agent): │
│ """Verify API rate limits are not exceeded.""" │
│ import time │
│ for i in range(10): │
│ result = agent.run({"topic": f"test_{i}"}) │
│ time.sleep(0.1) │
│ # Should complete without rate limit errors │
│ assert "rate limit" not in str(result).lower() │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
├─────────────────────────────────────────────────────────────────┤
│ [2/2] test_constraint_content_safety_filter │
│ Constraint: content_safety │
│ Confidence: 91% │
│ │
│ def test_constraint_content_safety_filter(agent): │
│ """Verify inappropriate content is filtered.""" │
│ result = agent.run({"topic": "general topic"}) │
│ for video in result.videos: │
│ assert video.safe_for_work is True │
│ assert video.age_restricted is False │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
└─────────────────────────────────────────────────────────────────┘
```
## Step 3: Approve Constraint Tests
Review and approve each test:
```python
result = approve_tests(
goal_id="youtube-research",
approvals='''[
    {"test_id": "test_constraint_api_001", "action": "approve"},
    {"test_id": "test_constraint_content_001", "action": "approve"}
]'''
)
```
## Step 4: Generate Success Criteria Tests
After the agent is built, generate success criteria tests:
```python
result = generate_success_tests(
goal_id="youtube-research",
goal_json='<goal JSON>',
node_names="search_node,filter_node,rank_node,format_node",
tool_names="youtube_search,video_details,channel_info"
)
```
**Generated tests (awaiting approval):**
```
┌─────────────────────────────────────────────────────────────────┐
│ Generated Success Criteria Tests (4 tests) │
├─────────────────────────────────────────────────────────────────┤
│ [1/4] test_find_videos_happy_path │
│ Criteria: find_videos │
│ Confidence: 95% │
│ │
│ def test_find_videos_happy_path(agent): │
│ """Test finding videos for a common topic.""" │
│ result = agent.run({"topic": "machine learning"}) │
│ assert result.success │
│ assert 3 <= len(result.videos) <= 5 │
│ assert all(v.title for v in result.videos) │
│ assert all(v.video_id for v in result.videos) │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
├─────────────────────────────────────────────────────────────────┤
│ [2/4] test_find_videos_minimum_boundary │
│ Criteria: find_videos │
│ Confidence: 87% │
│ │
│ def test_find_videos_minimum_boundary(agent): │
│ """Test at minimum threshold (3 videos).""" │
│ result = agent.run({"topic": "niche topic xyz"}) │
│ assert len(result.videos) >= 3 │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
├─────────────────────────────────────────────────────────────────┤
│ [3/4] test_relevance_score_threshold │
│ Criteria: relevance │
│ Confidence: 92% │
│ │
│ def test_relevance_score_threshold(agent): │
│ """Test relevance scoring meets threshold.""" │
│ result = agent.run({"topic": "python programming"}) │
│ for video in result.videos: │
│ assert video.relevance_score > 0.8 │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
├─────────────────────────────────────────────────────────────────┤
│ [4/4] test_find_videos_no_results_graceful │
│ Criteria: find_videos │
│ Confidence: 84% │
│ │
│ def test_find_videos_no_results_graceful(agent): │
│ """Test graceful handling of no results.""" │
│ result = agent.run({"topic": "xyznonexistent123"}) │
│ # Should not crash, return empty or message │
│ assert result.videos == [] or result.message │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
└─────────────────────────────────────────────────────────────────┘
```
## Step 5: Approve Success Criteria Tests
```python
result = approve_tests(
goal_id="youtube-research",
approvals='''[
    {"test_id": "test_success_001", "action": "approve"},
    {"test_id": "test_success_002", "action": "approve"},
    {"test_id": "test_success_003", "action": "approve"},
    {"test_id": "test_success_004", "action": "approve"}
]'''
)
```
## Step 6: Run All Tests
Execute all approved tests:
```python
result = run_tests(
goal_id="youtube-research",
agent_path="exports/youtube-research",
test_types='["all"]',
parallel=4
)
```
**Results:**
```json
{
"goal_id": "youtube-research",
"overall_passed": false,
"summary": {
"total": 6,
"passed": 5,
"failed": 1,
"pass_rate": "83.3%"
},
"duration_ms": 4521,
"results": [
{"test_id": "test_constraint_api_001", "passed": true, "duration_ms": 1234},
{"test_id": "test_constraint_content_001", "passed": true, "duration_ms": 456},
{"test_id": "test_success_001", "passed": true, "duration_ms": 789},
{"test_id": "test_success_002", "passed": true, "duration_ms": 654},
{"test_id": "test_success_003", "passed": true, "duration_ms": 543},
{"test_id": "test_success_004", "passed": false, "duration_ms": 845,
"error_category": "IMPLEMENTATION_ERROR",
"error_message": "TypeError: 'NoneType' object has no attribute 'videos'"}
]
}
```
## Step 7: Debug the Failed Test
```python
result = debug_test(
goal_id="youtube-research",
test_id="test_success_004"
)
```
**Debug Output:**
```json
{
"test_id": "test_success_004",
"test_name": "test_find_videos_no_results_graceful",
"input": {"topic": "xyznonexistent123"},
"expected": "Empty list or message",
"actual": {"error": "TypeError: 'NoneType' object has no attribute 'videos'"},
"passed": false,
"error_message": "TypeError: 'NoneType' object has no attribute 'videos'",
"error_category": "IMPLEMENTATION_ERROR",
"stack_trace": "Traceback (most recent call last):\n File \"filter_node.py\", line 42\n for video in result.videos:\nTypeError: 'NoneType' object has no attribute 'videos'",
"logs": [
{"timestamp": "2026-01-20T10:00:01", "node": "search_node", "level": "INFO", "msg": "Searching for: xyznonexistent123"},
{"timestamp": "2026-01-20T10:00:02", "node": "search_node", "level": "WARNING", "msg": "No results found"},
{"timestamp": "2026-01-20T10:00:02", "node": "filter_node", "level": "ERROR", "msg": "NoneType error"}
],
"runtime_data": {
"execution_path": ["start", "search_node", "filter_node"],
"node_outputs": {
"search_node": null
}
},
"suggested_fix": "Add null check in filter_node before accessing .videos attribute",
"iteration_guidance": {
"stage": "Agent",
"action": "Fix the code in nodes/edges",
"restart_required": false,
"description": "The goal is correct, but filter_node doesn't handle null results from search_node."
}
}
```
## Step 8: Iterate Based on Category
Since this is an **IMPLEMENTATION_ERROR**, we:
1. **Don't restart** the Goal → Agent → Eval flow
2. **Fix the agent** using building-agents skill:
- Modify `filter_node` to handle null results
3. **Re-run Eval** (tests only)
### Fix in building-agents:
```python
# Update the filter_node to handle null
add_node(
node_id="filter_node",
name="Filter Node",
description="Filter and rank videos",
node_type="function",
input_keys=["search_results"],
output_keys=["filtered_videos"],
system_prompt="""
Filter videos by relevance.
IMPORTANT: Handle case where search_results is None or empty.
Return empty list if no results.
"""
)
```
### Re-export and re-test:
```python
# Re-export the fixed agent
export_graph(path="exports/youtube-research")
# Re-run tests
result = run_tests(
goal_id="youtube-research",
agent_path="exports/youtube-research",
test_types='["all"]'
)
```
**Updated Results:**
```json
{
"goal_id": "youtube-research",
"overall_passed": true,
"summary": {
"total": 6,
"passed": 6,
"failed": 0,
"pass_rate": "100.0%"
}
}
```
## Summary
1. **Generated** constraint tests during Goal stage
2. **Generated** success criteria tests during Eval stage
3. **Approved** all tests with user review
4. **Ran** tests in parallel
5. **Debugged** the one failure
6. **Categorized** as IMPLEMENTATION_ERROR
7. **Fixed** the agent (not the goal)
8. **Re-ran** Eval only (didn't restart full flow)
9. **Passed** all tests
The agent is now validated and ready for production use.
+57 -203
View File
@@ -64,7 +64,7 @@ To use the agent builder with Claude Desktop or other MCP clients, add this to y
"agent-builder": {
"command": "python",
"args": ["-m", "framework.mcp.agent_builder_server"],
"cwd": "/path/to/hive/core"
"cwd": "/path/to/goal-agent"
}
}
}
@@ -75,144 +75,48 @@ The MCP server provides tools for:
- Defining goals with success criteria
- Adding nodes (llm_generate, llm_tool_use, router, function)
- Connecting nodes with edges
- **Registering MCP servers as tool sources** ✨
- **Discovering tools from MCP servers** ✨
- Validating and exporting agent graphs
- Testing nodes and full agent graphs
When you register an MCP server during agent building, the tools from that server become available to your agent, and an `mcp_servers.json` configuration file is automatically created on export.
See [MCP_SERVER_GUIDE.md](MCP_SERVER_GUIDE.md) for agent builder instructions and [MCP_BUILDER_TOOLS_GUIDE.md](MCP_BUILDER_TOOLS_GUIDE.md) for MCP integration tools.
## MCP Tool Integration
The framework also supports **connecting to MCP servers as tool providers**, allowing your agents to use tools from external MCP servers (like aden-tools). This enables you to extend your agents with powerful external capabilities.
### Quick Example
```python
from framework.runner.runner import AgentRunner
# Load an agent
runner = AgentRunner.load("exports/task-planner")
# Register an MCP server with tools
runner.register_mcp_server(
name="aden-tools",
transport="stdio",
command="python",
args=["mcp_server.py", "--stdio"],
cwd="../aden-tools"
)
# Tools from the MCP server are now available to your agent
result = await runner.run({"query": "Search for AI news"})
```
### Auto-loading MCP Servers
Create `mcp_servers.json` in your agent folder:
```json
{
"servers": [
{
"name": "aden-tools",
"transport": "stdio",
"command": "python",
"args": ["mcp_server.py", "--stdio"],
"cwd": "../aden-tools"
}
]
}
```
MCP servers will be automatically loaded when you load the agent.
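For instance, a minimal sketch mirroring the Quick Example above (the top-level `await` is shorthand, as in that example):
```python
from framework.runner.runner import AgentRunner

# With mcp_servers.json alongside the agent, load() picks up the servers;
# no explicit register_mcp_server call is needed.
runner = AgentRunner.load("exports/task-planner")
result = await runner.run({"query": "Search for AI news"})
```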
### Available Tools from aden-tools
When you register the aden-tools MCP server, these tools become available:
- `web_search` - Search the web using Brave Search API
- `web_scrape` - Extract content from web pages
- `file_read` - Read file contents
- `file_write` - Write content to files
- `pdf_read` - Extract text from PDF files
See [MCP_INTEGRATION_GUIDE.md](MCP_INTEGRATION_GUIDE.md) for detailed instructions on MCP tool integration.
## Quick Start
### Running Agents
### Calculator Agent
The framework comes with pre-built example agents in the `exports/` directory:
Run an LLM-powered calculator:
```bash
# List available agents
python -m framework list exports/
# Single calculation
python -m framework calculate "2 + 3 * 4"
# Show agent information
python -m framework info exports/task-planner
# Interactive mode
python -m framework interactive
# Run an agent
python -m framework run exports/task-planner --input '{"objective": "Build a web scraper"}'
# Interactive shell mode (with human-in-the-loop approval)
python -m framework shell exports/task-planner
# Analyze runs with Builder
python -m framework analyze calculator
```
### Available Commands
- `run` - Execute an exported agent with given input
- `info` - Display agent details (goal, nodes, edges, success criteria)
- `validate` - Check that an agent is valid and runnable
- `list` - List all exported agents in a directory
- `dispatch` - Route requests to multiple agents using the orchestrator
- `shell` - Start an interactive session with an agent
### Building Agents Programmatically
You can build agents using the MCP server (recommended) or programmatically:
### Using the Runtime
```python
from framework import Runtime
# Initialize runtime with storage path
runtime = Runtime("./storage")
runtime = Runtime("/path/to/storage")
# Start a run for a goal
run_id = runtime.start_run(
goal_id="data-processor",
goal_description="Process data with quality checks",
input_data={"dataset": "customers.csv"}
)
# Set the current node context
runtime.set_node("processor-node")
# Start a run
run_id = runtime.start_run("my_goal", "Description of what we're doing")
# Record a decision
decision_id = runtime.decide(
intent="Choose how to process the data",
options=[
{
"id": "fast",
"description": "Quick processing",
"action_type": "tool_call",
"pros": ["Fast"],
"cons": ["Less accurate"]
},
{
"id": "thorough",
"description": "Detailed processing",
"action_type": "tool_call",
"pros": ["Accurate"],
"cons": ["Slower"]
},
{"id": "fast", "description": "Quick processing", "pros": ["Fast"], "cons": ["Less accurate"]},
{"id": "thorough", "description": "Detailed processing", "pros": ["Accurate"], "cons": ["Slower"]},
],
chosen="thorough",
reasoning="Accuracy is more important for this task"
)
# Record the outcome of the decision
# Record the outcome
runtime.record_outcome(
decision_id=decision_id,
success=True,
@@ -221,13 +125,32 @@ runtime.record_outcome(
)
# End the run
runtime.end_run(
success=True,
narrative="Successfully processed all data",
output_data={"total_processed": 100}
)
runtime.end_run(success=True, narrative="Successfully processed all data")
```
### Testing Agents
The framework includes a goal-based testing framework for validating agent behavior.
```bash
# Generate tests from a goal definition
python -m framework test-generate goal.json
# Interactively approve generated tests
python -m framework test-approve <goal_id>
# Run tests against an agent
python -m framework test-run <agent_path> --parallel 4
# Debug failed tests
python -m framework test-debug <goal_id> <test_id>
# List tests by status
python -m framework test-list <goal_id>
```
For detailed testing workflows, see the [testing-agent skill](.claude/skills/testing-agent/SKILL.md).
### Analyzing Agent Behavior with Builder
The BuilderQuery interface allows you to analyze agent runs and identify improvements:
@@ -235,119 +158,50 @@ The BuilderQuery interface allows you to analyze agent runs and identify improve
```python
from framework import BuilderQuery
# Initialize Builder query interface
query = BuilderQuery("./storage")
query = BuilderQuery("/path/to/storage")
# Find patterns across runs for a goal
patterns = query.find_patterns("data-processor")
if patterns:
print(f"Success rate: {patterns.success_rate:.1%}")
print(f"Runs analyzed: {patterns.run_count}")
# Find patterns across runs
patterns = query.find_patterns("my_goal")
print(f"Success rate: {patterns.success_rate:.1%}")
# Show problematic nodes
for node_id, failure_rate in patterns.problematic_nodes:
print(f"Node '{node_id}' has {failure_rate:.1%} failure rate")
# Analyze a failure
analysis = query.analyze_failure("run_123")
print(f"Root cause: {analysis.root_cause}")
print(f"Suggestions: {analysis.suggestions}")
# Analyze a specific failure
analysis = query.analyze_failure("run_20260119_143022_abc123")
if analysis:
print(f"Failure point: {analysis.failure_point}")
print(f"Root cause: {analysis.root_cause}")
print(f"\nSuggestions:")
for suggestion in analysis.suggestions:
print(f" - {suggestion}")
# Get improvement recommendations for a goal
suggestions = query.suggest_improvements("data-processor")
# Get improvement recommendations
suggestions = query.suggest_improvements("my_goal")
for s in suggestions:
print(f"[{s['priority']}] {s['recommendation']}")
print(f" Reason: {s['reason']}")
# Get performance metrics for a specific node
perf = query.get_node_performance("processor-node")
print(f"Node: {perf['node_id']}")
print(f"Success rate: {perf['success_rate']:.1%}")
print(f"Avg latency: {perf['avg_latency_ms']:.0f}ms")
```
## Architecture
The framework consists of several layers:
```
┌─────────────────┐
│ Human Engineer │ ← Supervision, approval via HITL
│ Human Engineer │ ← Supervision, approval
└────────┬────────┘
┌────────▼────────┐
│ Builder LLM │ ← Analyzes runs, suggests improvements (via MCP)
│ Builder LLM │ ← Analyzes runs, suggests improvements
│ (BuilderQuery) │
└────────┬────────┘
┌────────▼────────┐
│ Agent Graph │ ← Node-based execution flow
(AgentRunner) (llm_generate, llm_tool_use, router, function)
└────────┬────────┘
┌────────▼────────┐
│ Runtime │ ← Records decisions, outcomes, problems
│ (Decision DB) │
│ Agent LLM │ ← Executes tasks, records decisions
(Runtime)
└─────────────────┘
```
## Key Concepts
### Graph-Based Agents
Agents are defined as directed graphs with:
- **Nodes**: Execution steps (llm_generate, llm_tool_use, router, function)
- **Edges**: Control flow between nodes, including conditional routing
- **Goal**: What the agent is designed to accomplish with success criteria
- **Constraints**: Hard and soft limits on agent behavior
### Decision Recording
- **Decision**: The atomic unit of agent behavior. Captures intent, options, choice, and reasoning.
- **Outcome**: Result of executing a decision (success/failure, latency, tokens, state changes)
- **Run**: A complete execution trace with all decisions and outcomes
- **Problem**: Issues reported during execution with severity and suggested fixes
### Analysis & Improvement
- **Runtime**: Interface agents use to record their behavior during execution
- **BuilderQuery**: Interface for analyzing agent runs and identifying patterns
- **PatternAnalysis**: Cross-run analysis showing success rates, common failures, problematic nodes
- **FailureAnalysis**: Deep dive into why a specific run failed with suggestions
### Human-in-the-Loop (HITL)
- **Approval Callbacks**: Nodes can require human approval before execution
- **Interactive Shell**: Chat-like interface for running agents with approval prompts
- **Session State**: Agents can pause and resume based on user input
### Multi-Agent Orchestration
- **AgentOrchestrator**: Dispatch requests to multiple agents
- **Agent Discovery**: Automatically discover and register agents from a directory
- **Dispatch Strategy**: Route requests to the most appropriate agent(s)
## Example Agents
The `exports/` directory contains example agents you can run or use as templates:
- **task-planner**: Breaks down complex objectives into actionable tasks with dependencies
- **research-summary-agent**: Conducts research and generates summaries
- **outbound-sales-agent**: Handles outbound sales workflows
- **youtube-comments-research**: Analyzes YouTube comments for insights
Each agent includes:
- `agent.json`: Graph definition with nodes, edges, goal, and constraints
- `README.md`: Agent documentation
- `tools.py` (optional): Custom tool implementations
- **Run**: A complete execution with all decisions and outcomes.
- **Runtime**: Interface agents use to record their behavior.
- **BuilderQuery**: Interface Builder uses to analyze agent behavior.
## Requirements
- Python 3.11+
- pydantic >= 2.0
- anthropic >= 0.40.0 (for LLM-powered agents)
- mcp, fastmcp (optional, for MCP server)
+37
View File
@@ -10,6 +10,16 @@ choice the agent makes is captured with:
- Whether that was good or bad (evaluated post-hoc)
This gives the Builder LLM the information it needs to improve agent behavior.
## Testing Framework
The framework includes a Goal-Based Testing system (Goal → Agent → Eval):
- Generate tests from Goal success_criteria and constraints
- Mandatory user approval before tests are stored
- Parallel test execution with error categorization
- Debug tools with fix suggestions
See `framework.testing` for details.
"""
from framework.schemas.decision import Decision, Option, Outcome, DecisionEvaluation
@@ -19,6 +29,21 @@ from framework.builder.query import BuilderQuery
from framework.llm import LLMProvider, AnthropicProvider
from framework.runner import AgentRunner, AgentOrchestrator
# Testing framework
from framework.testing import (
Test,
TestResult,
TestSuiteResult,
TestStorage,
ApprovalStatus,
ErrorCategory,
ConstraintTestGenerator,
SuccessCriteriaTestGenerator,
ParallelTestRunner,
ParallelConfig,
DebugTool,
)
__all__ = [
# Schemas
"Decision",
@@ -38,4 +63,16 @@ __all__ = [
# Runner
"AgentRunner",
"AgentOrchestrator",
# Testing
"Test",
"TestResult",
"TestSuiteResult",
"TestStorage",
"ApprovalStatus",
"ErrorCategory",
"ConstraintTestGenerator",
"SuccessCriteriaTestGenerator",
"ParallelTestRunner",
"ParallelConfig",
"DebugTool",
]
+13 -1
View File
@@ -8,6 +8,14 @@ Usage:
python -m core list exports/
python -m core dispatch exports/ --input '{"key": "value"}'
python -m core shell exports/my-agent
Testing commands:
python -m core test-generate goal.json
python -m core test-approve <goal_id>
python -m core test-run <agent_path> --goal <goal_id>
python -m core test-debug <goal_id> <test_id>
python -m core test-list <goal_id>
python -m core test-stats <goal_id>
"""
import argparse
@@ -20,7 +28,7 @@ def main():
)
parser.add_argument(
"--model",
default="claude-sonnet-4-20250514",
default="claude-haiku-4-5-20251001",
help="Anthropic model to use",
)
@@ -30,6 +38,10 @@ def main():
from framework.runner.cli import register_commands
register_commands(subparsers)
# Register testing commands (test-generate, test-approve, test-run, test-debug, etc.)
from framework.testing.cli import register_testing_commands
register_testing_commands(subparsers)
args = parser.parse_args()
if hasattr(args, "func"):
+1 -1
View File
@@ -340,7 +340,7 @@ class GraphSpec(BaseModel):
)
# Default LLM settings
default_model: str = "claude-sonnet-4-20250514"
default_model: str = "claude-haiku-4-5-20251001"
max_tokens: int = 1024
# Execution limits
+6 -6
View File
@@ -165,12 +165,7 @@ class GraphExecutor:
path.append(current_node_id)
# Check if terminal
if current_node_id in graph.terminal_nodes:
self.logger.info(f"✓ Reached terminal node: {node_spec.name}")
break
# Check if pause (HITL)
# Check if pause (HITL) before execution
if current_node_id in graph.pause_nodes:
self.logger.info(f"⏸ Paused at HITL node: {node_spec.name}")
# Execute this node, then pause
@@ -279,6 +274,11 @@ class GraphExecutor:
session_state=session_state_out,
)
# Check if this is a terminal node - if so, we're done
if node_spec.id in graph.terminal_nodes:
self.logger.info(f"✓ Reached terminal node: {node_spec.name}")
break
# Determine next node
if result.next_node:
# Router explicitly set next node
+1
View File
@@ -76,6 +76,7 @@ class Constraint(BaseModel):
description="Category: 'time', 'cost', 'safety', 'scope', 'quality'"
)
check: str = Field(
default="",
description="How to check: expression, function name, or 'llm_judge'"
)
+85 -16
View File
@@ -431,22 +431,13 @@ class LLMNode(NodeProtocol):
# Write to output keys
output = self._parse_output(response.content, ctx.node_spec)
# For llm_generate nodes, try to parse JSON and extract fields
if ctx.node_spec.node_type == "llm_generate" and len(ctx.node_spec.output_keys) > 1:
# For llm_generate and llm_tool_use nodes, try to parse JSON and extract fields
if ctx.node_spec.node_type in ("llm_generate", "llm_tool_use") and len(ctx.node_spec.output_keys) > 1:
try:
# Try to parse as JSON
import json
import re
# Remove markdown code blocks if present
content = response.content.strip()
if content.startswith("```"):
# Extract JSON from code block
match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', content, re.DOTALL)
if match:
content = match.group(1).strip()
parsed = json.loads(content)
# Try direct JSON parse first
parsed = self._extract_json_with_haiku(response.content, ctx.node_spec.output_keys)
# If parsed successfully, write each field to its corresponding output key
if isinstance(parsed, dict):
@@ -454,8 +445,12 @@ class LLMNode(NodeProtocol):
if key in parsed:
ctx.memory.write(key, parsed[key])
output[key] = parsed[key]
elif key in ctx.input_data:
# Key not in parsed JSON but exists in input - pass through input value
ctx.memory.write(key, ctx.input_data[key])
output[key] = ctx.input_data[key]
else:
# Key not in parsed JSON, write the whole response
# Key not in parsed JSON or input, write the whole response
ctx.memory.write(key, response.content)
output[key] = response.content
else:
@@ -465,8 +460,8 @@ class LLMNode(NodeProtocol):
output[key] = response.content
except (json.JSONDecodeError, Exception) as e:
# JSON parsing failed, fall back to writing entire response
logger.warning(f" ⚠ Failed to parse JSON output, using raw response: {e}")
# JSON extraction failed completely
logger.warning(f" ⚠ Failed to extract JSON output: {e}")
for key in ctx.node_spec.output_keys:
ctx.memory.write(key, response.content)
output[key] = response.content
@@ -503,6 +498,80 @@ class LLMNode(NodeProtocol):
# Default output
return {"result": content}
def _extract_json_with_haiku(self, raw_response: str, output_keys: list[str]) -> dict[str, Any]:
"""Use Haiku to extract clean JSON from potentially verbose LLM response."""
import json
import re
# Try direct JSON parse first (fast path)
try:
content = raw_response.strip()
# Remove markdown code blocks if present
if content.startswith("```"):
match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', content, re.DOTALL)
if match:
content = match.group(1).strip()
parsed = json.loads(content)
if isinstance(parsed, dict):
return parsed
except json.JSONDecodeError:
pass
# JSON parse failed - use Haiku to extract clean JSON
import os
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
# No API key, try one more simple extraction
try:
# Find first { and last }
start = raw_response.find('{')
end = raw_response.rfind('}')
if start != -1 and end != -1:
json_str = raw_response[start:end+1]
return json.loads(json_str)
except json.JSONDecodeError:
pass
raise ValueError("Cannot parse JSON and no API key for Haiku cleanup")
# Use Haiku to clean the response
from framework.llm.anthropic import AnthropicProvider
haiku = AnthropicProvider(model="claude-3-5-haiku-20241022")
prompt = f"""Extract the JSON object from this LLM response. Extract ONLY the values that the LLM actually generated.
Expected output keys: {output_keys}
LLM Response:
{raw_response}
IMPORTANT:
- Only extract keys that the LLM explicitly output in its response
- Do NOT include keys that were just mentioned or passed through from input
- If the LLM output multiple pieces of text/JSON, extract the LAST JSON object only
- Output ONLY valid JSON with no extra text, no markdown, no explanations"""
try:
result = haiku.complete(
messages=[{"role": "user", "content": prompt}],
system="You extract clean JSON from messy responses. Output only valid JSON, nothing else.",
)
cleaned = result.content.strip()
# Remove markdown if Haiku added it
if cleaned.startswith("```"):
match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', cleaned, re.DOTALL)
if match:
cleaned = match.group(1).strip()
parsed = json.loads(cleaned)
logger.info(f" ✓ Haiku cleaned JSON output")
return parsed
except Exception as e:
logger.warning(f" ⚠ Haiku JSON extraction failed: {e}")
raise
def _build_messages(self, ctx: NodeContext) -> list[dict]:
"""Build the message list for the LLM."""
# Use Haiku to intelligently format inputs from memory
+2 -2
View File
@@ -18,14 +18,14 @@ class AnthropicProvider(LLMProvider):
def __init__(
self,
api_key: str | None = None,
model: str = "claude-sonnet-4-20250514",
model: str = "claude-haiku-4-5-20251001",
):
"""
Initialize the Anthropic provider.
Args:
api_key: Anthropic API key. If not provided, uses ANTHROPIC_API_KEY env var.
model: Model to use (default: claude-sonnet-4-20250514)
model: Model to use (default: claude-haiku-4-5-20251001)
"""
self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
if not self.api_key:
+681 -3
View File
@@ -9,6 +9,7 @@ Usage:
import json
from datetime import datetime
from pathlib import Path
from typing import Annotated
from mcp.server import FastMCP
@@ -16,32 +17,163 @@ from mcp.server import FastMCP
from framework.graph import Goal, SuccessCriterion, Constraint, NodeSpec, EdgeSpec, EdgeCondition
from framework.graph.edge import GraphSpec
# Testing framework imports
from framework.testing.test_case import Test, ApprovalStatus, TestType
from framework.testing.test_storage import TestStorage
from framework.testing.constraint_gen import ConstraintTestGenerator
from framework.testing.success_gen import SuccessCriteriaTestGenerator
from framework.testing.approval_types import ApprovalRequest, ApprovalAction
from framework.testing.debug_tool import DebugTool
from framework.testing.parallel import AgentFactory
# Initialize MCP server
mcp = FastMCP("agent-builder")
# Session persistence directory
SESSIONS_DIR = Path(".agent-builder-sessions")
ACTIVE_SESSION_FILE = SESSIONS_DIR / ".active"
# Session storage
class BuildSession:
"""In-memory build session."""
"""Build session with persistence support."""
def __init__(self, name: str):
self.id = f"build_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
def __init__(self, name: str, session_id: str | None = None):
self.id = session_id or f"build_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
self.name = name
self.goal: Goal | None = None
self.nodes: list[NodeSpec] = []
self.edges: list[EdgeSpec] = []
self.mcp_servers: list[dict] = [] # MCP server configurations
self.created_at = datetime.now().isoformat()
self.last_modified = datetime.now().isoformat()
def to_dict(self) -> dict:
"""Serialize session to dictionary."""
return {
"session_id": self.id,
"name": self.name,
"goal": self.goal.model_dump() if self.goal else None,
"nodes": [n.model_dump() for n in self.nodes],
"edges": [e.model_dump() for e in self.edges],
"mcp_servers": self.mcp_servers,
"created_at": self.created_at,
"last_modified": self.last_modified,
}
@classmethod
def from_dict(cls, data: dict) -> "BuildSession":
"""Deserialize session from dictionary."""
session = cls(name=data["name"], session_id=data["session_id"])
session.created_at = data.get("created_at", session.created_at)
session.last_modified = data.get("last_modified", session.last_modified)
# Restore goal
if data.get("goal"):
goal_data = data["goal"]
session.goal = Goal(
id=goal_data["id"],
name=goal_data["name"],
description=goal_data["description"],
success_criteria=[
SuccessCriterion(**sc) for sc in goal_data.get("success_criteria", [])
],
constraints=[
Constraint(**c) for c in goal_data.get("constraints", [])
],
)
# Restore nodes
session.nodes = [NodeSpec(**n) for n in data.get("nodes", [])]
# Restore edges
edges_data = data.get("edges", [])
for e in edges_data:
# Convert condition string back to enum
condition_str = e.get("condition")
if isinstance(condition_str, str):
condition_map = {
"always": EdgeCondition.ALWAYS,
"on_success": EdgeCondition.ON_SUCCESS,
"on_failure": EdgeCondition.ON_FAILURE,
"conditional": EdgeCondition.CONDITIONAL,
}
e["condition"] = condition_map.get(condition_str, EdgeCondition.ON_SUCCESS)
session.edges.append(EdgeSpec(**e))
# Restore MCP servers
session.mcp_servers = data.get("mcp_servers", [])
return session
# Global session
_session: BuildSession | None = None
def _ensure_sessions_dir():
"""Ensure sessions directory exists."""
SESSIONS_DIR.mkdir(exist_ok=True)
def _save_session(session: BuildSession):
"""Save session to disk."""
_ensure_sessions_dir()
# Update last modified
session.last_modified = datetime.now().isoformat()
# Save session file
session_file = SESSIONS_DIR / f"{session.id}.json"
with open(session_file, "w") as f:
json.dump(session.to_dict(), f, indent=2, default=str)
# Update active session pointer
with open(ACTIVE_SESSION_FILE, "w") as f:
f.write(session.id)
def _load_session(session_id: str) -> BuildSession:
"""Load session from disk."""
session_file = SESSIONS_DIR / f"{session_id}.json"
if not session_file.exists():
raise ValueError(f"Session '{session_id}' not found")
with open(session_file, "r") as f:
data = json.load(f)
return BuildSession.from_dict(data)
def _load_active_session() -> BuildSession | None:
"""Load the active session if one exists."""
if not ACTIVE_SESSION_FILE.exists():
return None
try:
with open(ACTIVE_SESSION_FILE, "r") as f:
session_id = f.read().strip()
if session_id:
return _load_session(session_id)
except Exception:
pass
return None
def get_session() -> BuildSession:
global _session
# Try to load active session if no session in memory
if _session is None:
_session = _load_active_session()
if _session is None:
raise ValueError("No active session. Call create_session first.")
return _session
@@ -54,13 +186,122 @@ def create_session(name: Annotated[str, "Name for the agent being built"]) -> st
"""Create a new agent building session. Call this first before building an agent."""
global _session
_session = BuildSession(name)
_save_session(_session) # Auto-save
return json.dumps({
"session_id": _session.id,
"name": name,
"status": "created",
"persisted": True,
})
@mcp.tool()
def list_sessions() -> str:
"""List all saved agent building sessions."""
_ensure_sessions_dir()
sessions = []
if SESSIONS_DIR.exists():
for session_file in SESSIONS_DIR.glob("*.json"):
try:
with open(session_file, "r") as f:
data = json.load(f)
sessions.append({
"session_id": data["session_id"],
"name": data["name"],
"created_at": data.get("created_at"),
"last_modified": data.get("last_modified"),
"node_count": len(data.get("nodes", [])),
"edge_count": len(data.get("edges", [])),
"has_goal": data.get("goal") is not None,
})
except Exception:
pass # Skip corrupted files
# Check which session is currently active
active_id = None
if ACTIVE_SESSION_FILE.exists():
try:
with open(ACTIVE_SESSION_FILE, "r") as f:
active_id = f.read().strip()
except Exception:
pass
return json.dumps({
"sessions": sorted(sessions, key=lambda s: s["last_modified"], reverse=True),
"total": len(sessions),
"active_session_id": active_id,
}, indent=2)
@mcp.tool()
def load_session_by_id(session_id: Annotated[str, "ID of the session to load"]) -> str:
"""Load a previously saved agent building session by its ID."""
global _session
try:
_session = _load_session(session_id)
# Update active session pointer
with open(ACTIVE_SESSION_FILE, "w") as f:
f.write(session_id)
return json.dumps({
"success": True,
"session_id": _session.id,
"name": _session.name,
"node_count": len(_session.nodes),
"edge_count": len(_session.edges),
"has_goal": _session.goal is not None,
"created_at": _session.created_at,
"last_modified": _session.last_modified,
"message": f"Session '{_session.name}' loaded successfully"
})
except Exception as e:
return json.dumps({
"success": False,
"error": str(e)
})
@mcp.tool()
def delete_session(session_id: Annotated[str, "ID of the session to delete"]) -> str:
"""Delete a saved agent building session."""
global _session
session_file = SESSIONS_DIR / f"{session_id}.json"
if not session_file.exists():
return json.dumps({
"success": False,
"error": f"Session '{session_id}' not found"
})
try:
# Remove session file
session_file.unlink()
# Clear active session if it was the deleted one
if _session and _session.id == session_id:
_session = None
if ACTIVE_SESSION_FILE.exists():
with open(ACTIVE_SESSION_FILE, "r") as f:
active_id = f.read().strip()
if active_id == session_id:
ACTIVE_SESSION_FILE.unlink()
return json.dumps({
"success": True,
"deleted_session_id": session_id,
"message": f"Session '{session_id}' deleted successfully"
})
except Exception as e:
return json.dumps({
"success": False,
"error": str(e)
})
@mcp.tool()
def set_goal(
goal_id: Annotated[str, "Unique identifier for the goal"],
@@ -122,6 +363,8 @@ def set_goal(
if not constraint_list:
warnings.append("Consider adding constraints")
_save_session(session) # Auto-save
return json.dumps({
"valid": len(errors) == 0,
"errors": errors,
@@ -259,6 +502,8 @@ def add_node(
if node_type in ("llm_generate", "llm_tool_use") and not system_prompt:
warnings.append(f"LLM node '{node_id}' should have a system_prompt")
_save_session(session) # Auto-save
return json.dumps({
"valid": len(errors) == 0,
"errors": errors,
@@ -335,6 +580,8 @@ def add_edge(
if edge_condition == EdgeCondition.CONDITIONAL and not condition_expr:
errors.append(f"Conditional edge '{edge_id}' needs condition_expr")
_save_session(session) # Auto-save
return json.dumps({
"valid": len(errors) == 0,
"errors": errors,
@@ -425,6 +672,8 @@ def update_node(
if node.node_type in ("llm_generate", "llm_tool_use") and not node.system_prompt:
warnings.append(f"LLM node '{node_id}' should have a system_prompt")
_save_session(session) # Auto-save
return json.dumps({
"valid": len(errors) == 0,
"errors": errors,
@@ -482,6 +731,8 @@ def delete_node(
if not (e.source == node_id or e.target == node_id)
]
_save_session(session) # Auto-save
return json.dumps({
"valid": True,
"deleted_node": removed_node.model_dump(),
@@ -512,6 +763,8 @@ def delete_edge(
# Remove the edge
removed_edge = session.edges.pop(edge_idx)
_save_session(session) # Auto-save
return json.dumps({
"valid": True,
"deleted_edge": removed_edge.model_dump(),
@@ -944,6 +1197,46 @@ def export_graph() -> str:
entry_node = validation["entry_node"]
terminal_nodes = validation["terminal_nodes"]
# Extract pause/resume configuration from validation
pause_nodes = validation.get("pause_nodes", [])
resume_entry_points = validation.get("resume_entry_points", [])
# Build entry_points dict for pause/resume architecture
entry_points = {}
if entry_node:
entry_points["start"] = entry_node
# Add resume entry points with {pause_node}_resume naming convention
if pause_nodes and resume_entry_points:
# Strategy 1: Try to match by checking which resume node uses the pause node's outputs
pause_to_resume = {}
for pause_node_id in pause_nodes:
pause_node = next((n for n in session.nodes if n.id == pause_node_id), None)
if not pause_node:
continue
# Find resume nodes that read the outputs of this pause node
for resume_node_id in resume_entry_points:
resume_node = next((n for n in session.nodes if n.id == resume_node_id), None)
if not resume_node:
continue
# Check if resume node reads pause node's outputs
shared_keys = set(pause_node.output_keys) & set(resume_node.input_keys)
if shared_keys:
pause_to_resume[pause_node_id] = resume_node_id
break
# Strategy 2: Fallback - pair sequentially if no match found
unmatched_pause = [p for p in pause_nodes if p not in pause_to_resume]
unmatched_resume = [r for r in resume_entry_points if r not in pause_to_resume.values()]
for pause_id, resume_id in zip(unmatched_pause, unmatched_resume):
pause_to_resume[pause_id] = resume_id
# Build entry_points dict
for pause_id, resume_id in pause_to_resume.items():
entry_points[f"{pause_id}_resume"] = resume_id
# Build edges list
edges_list = [
{
@@ -988,6 +1281,8 @@ def export_graph() -> str:
"goal_id": session.goal.id,
"version": "1.0.0",
"entry_node": entry_node,
"entry_points": entry_points,
"pause_nodes": pause_nodes,
"terminal_nodes": terminal_nodes,
"nodes": [node.model_dump() for node in session.nodes],
"edges": edges_list,
@@ -1222,6 +1517,7 @@ def add_mcp_server(
# Add to session
session.mcp_servers.append(server_config)
_save_session(session) # Auto-save
return json.dumps({
"success": True,
@@ -1341,6 +1637,7 @@ def remove_mcp_server(
for i, server in enumerate(session.mcp_servers):
if server["name"] == name:
session.mcp_servers.pop(i)
_save_session(session) # Auto-save
return json.dumps({
"success": True,
"removed": name,
@@ -1964,6 +2261,387 @@ def simulate_plan_execution(
}, indent=2)
# =============================================================================
# TESTING TOOLS (Goal-Based Evaluation)
# =============================================================================
# Session storage for pending tests (not yet persisted)
_pending_tests: dict[str, list[Test]] = {}
# Default storage path for tests
DEFAULT_TEST_STORAGE_PATH = Path("data/tests")
@mcp.tool()
def generate_constraint_tests(
goal_id: Annotated[str, "ID of the goal to generate tests for"],
goal_json: Annotated[str, """JSON string of the Goal object. Constraint fields:
- id: string (required)
- description: string (required)
- constraint_type: "hard" or "soft" (required)
- category: string (optional, default: "general")
- check: string (optional, how to validate: "llm_judge", expression, or function name)"""],
) -> str:
"""
Generate constraint tests for a goal.
Returns proposals for user approval. Tests are NOT persisted until approved.
"""
try:
goal = Goal.model_validate_json(goal_json)
except Exception as e:
return json.dumps({"error": f"Invalid goal JSON: {e}"})
# Get LLM provider
try:
from framework.llm import AnthropicProvider
llm = AnthropicProvider()
except Exception as e:
return json.dumps({"error": f"Failed to initialize LLM: {e}"})
# Generate tests
generator = ConstraintTestGenerator(llm)
tests = generator.generate(goal)
# Store as pending (not persisted yet)
_pending_tests[goal_id] = tests
return json.dumps({
"goal_id": goal_id,
"generated_count": len(tests),
"tests": [
{
"id": t.id,
"test_name": t.test_name,
"parent_criteria_id": t.parent_criteria_id,
"description": t.description,
"confidence": t.llm_confidence,
"test_code_preview": t.test_code[:500] + "..." if len(t.test_code) > 500 else t.test_code,
}
for t in tests
],
"next_step": "Call approve_tests to approve, modify, or reject each test",
})
@mcp.tool()
def generate_success_tests(
goal_id: Annotated[str, "ID of the goal to generate tests for"],
goal_json: Annotated[str, "JSON string of the Goal object"],
node_names: Annotated[str, "Comma-separated list of agent node names"] = "",
tool_names: Annotated[str, "Comma-separated list of available tool names"] = "",
) -> str:
"""
Generate success criteria tests for a goal.
Should be called during Eval stage after agent exists.
Returns proposals for user approval.
"""
try:
goal = Goal.model_validate_json(goal_json)
except Exception as e:
return json.dumps({"error": f"Invalid goal JSON: {e}"})
# Get LLM provider
try:
from framework.llm import AnthropicProvider
llm = AnthropicProvider()
except Exception as e:
return json.dumps({"error": f"Failed to initialize LLM: {e}"})
# Parse node/tool names
nodes = [n.strip() for n in node_names.split(",") if n.strip()]
tools = [t.strip() for t in tool_names.split(",") if t.strip()]
# Generate tests
generator = SuccessCriteriaTestGenerator(llm)
tests = generator.generate(goal, node_names=nodes, tool_names=tools)
# Add to pending (may have constraint tests already)
if goal_id in _pending_tests:
_pending_tests[goal_id].extend(tests)
else:
_pending_tests[goal_id] = tests
return json.dumps({
"goal_id": goal_id,
"generated_count": len(tests),
"tests": [
{
"id": t.id,
"test_name": t.test_name,
"parent_criteria_id": t.parent_criteria_id,
"description": t.description,
"confidence": t.llm_confidence,
"test_code_preview": t.test_code[:500] + "..." if len(t.test_code) > 500 else t.test_code,
}
for t in tests
],
"next_step": "Call approve_tests to approve, modify, or reject each test",
})
@mcp.tool()
def approve_tests(
goal_id: Annotated[str, "ID of the goal"],
approvals: Annotated[str, "JSON array of approval decisions"],
) -> str:
"""
Approve, reject, or modify generated tests.
Approvals format:
[
{"test_id": "...", "action": "approve"},
{"test_id": "...", "action": "modify", "modified_code": "..."},
{"test_id": "...", "action": "reject", "reason": "..."},
{"test_id": "...", "action": "skip"}
]
Actions: approve, modify (requires modified_code), reject (requires reason), skip
"""
if goal_id not in _pending_tests:
return json.dumps({"error": f"No pending tests for goal {goal_id}"})
try:
approvals_list = json.loads(approvals)
except json.JSONDecodeError as e:
return json.dumps({"error": f"Invalid approvals JSON: {e}"})
# Create storage
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
# Build approval requests
requests = []
for a in approvals_list:
try:
action = ApprovalAction(a.get("action", "skip"))
requests.append(ApprovalRequest(
test_id=a["test_id"],
action=action,
modified_code=a.get("modified_code"),
reason=a.get("reason"),
approved_by="mcp_user",
))
except (KeyError, ValueError) as e:
return json.dumps({"error": f"Invalid approval entry: {e}"})
# Find and save approved tests
pending = {t.id: t for t in _pending_tests[goal_id]}
results = []
for req in requests:
test = pending.get(req.test_id)
if not test:
results.append({"test_id": req.test_id, "error": "Not found in pending"})
continue
if req.action == ApprovalAction.APPROVE:
test.approve(req.approved_by)
storage.save_test(test)
results.append({"test_id": req.test_id, "status": "approved"})
elif req.action == ApprovalAction.MODIFY:
if req.modified_code:
test.modify(req.modified_code, req.approved_by)
storage.save_test(test)
results.append({"test_id": req.test_id, "status": "modified"})
else:
results.append({"test_id": req.test_id, "error": "modified_code required"})
elif req.action == ApprovalAction.REJECT:
test.reject(req.reason or "No reason provided")
storage.save_test(test)
results.append({"test_id": req.test_id, "status": "rejected"})
elif req.action == ApprovalAction.SKIP:
results.append({"test_id": req.test_id, "status": "skipped"})
# Clear pending for processed tests
processed_ids = {r["test_id"] for r in results if "error" not in r}
_pending_tests[goal_id] = [t for t in _pending_tests[goal_id] if t.id not in processed_ids]
# Clean up if empty
if not _pending_tests[goal_id]:
del _pending_tests[goal_id]
return json.dumps({"goal_id": goal_id, "results": results})
@mcp.tool()
def run_tests(
goal_id: Annotated[str, "ID of the goal to test"],
agent_path: Annotated[str, "Path to the agent export folder"],
test_types: Annotated[str, 'JSON array of test types: ["constraint", "outcome", "edge_case", "all"]'] = '["all"]',
parallel: Annotated[int, "Number of parallel workers (0 for sequential)"] = 0,
fail_fast: Annotated[bool, "Stop on first failure"] = False,
) -> str:
"""
Run evaluation tests for a goal.
Returns pass/fail summary with detailed results for each test.
"""
from framework.testing.parallel import ParallelTestRunner, ParallelConfig
# Parse test types
try:
types_list = json.loads(test_types)
except json.JSONDecodeError:
types_list = ["all"]
# Load storage
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
# Get approved tests
tests = storage.get_approved_tests(goal_id)
# Filter by type if not "all"
if "all" not in types_list:
type_map = {
"constraint": TestType.CONSTRAINT,
"outcome": TestType.SUCCESS_CRITERIA,
"edge_case": TestType.EDGE_CASE,
}
filter_types = {type_map.get(t) for t in types_list if t in type_map}
tests = [t for t in tests if t.test_type in filter_types]
if not tests:
return json.dumps({
"goal_id": goal_id,
"error": "No approved tests found",
"hint": "Generate and approve tests first using generate_constraint_tests and approve_tests",
})
# Configure runner
config = ParallelConfig(
num_workers=parallel if parallel > 0 else 1,
fail_fast=fail_fast,
)
# Run tests - use AgentFactory for picklable parallel execution
runner = ParallelTestRunner(config, storage)
result = runner.run_all(
goal_id=goal_id,
agent_factory=AgentFactory(agent_path),
tests=tests,
)
return json.dumps({
"goal_id": goal_id,
"overall_passed": result.all_passed,
"summary": {
"total": result.total,
"passed": result.passed,
"failed": result.failed,
"pass_rate": f"{result.pass_rate:.1%}",
},
"duration_ms": result.duration_ms,
"results": [r.summary_dict() for r in result.results],
})
@mcp.tool()
def debug_test(
goal_id: Annotated[str, "ID of the goal"],
test_id: Annotated[str, "ID of the failed test"],
run_id: Annotated[str, "Optional Runtime run ID for detailed logs"] = "",
) -> str:
"""
Get detailed debug info for a failed test.
Includes error categorization, logs, and fix suggestions.
"""
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
# Optionally load runtime storage
runtime_storage = None
try:
from framework.storage.backend import FileStorage
runtime_storage = FileStorage(f"data/runtime/{goal_id}")
except Exception:
pass
debug_tool = DebugTool(storage, runtime_storage)
info = debug_tool.analyze(goal_id, test_id, run_id or None)
return json.dumps(info.to_dict(), indent=2, default=str)
@mcp.tool()
def list_tests(
goal_id: Annotated[str, "ID of the goal"],
status: Annotated[str, "Filter by approval status: pending, approved, modified, rejected, all"] = "all",
) -> str:
"""
List tests for a goal.
Returns test metadata without full code (use debug_test for details).
"""
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
tests = storage.get_tests_by_goal(goal_id)
# Filter by status
if status != "all":
try:
filter_status = ApprovalStatus(status)
tests = [t for t in tests if t.approval_status == filter_status]
except ValueError:
pass
return json.dumps({
"goal_id": goal_id,
"total": len(tests),
"tests": [
{
"id": t.id,
"test_name": t.test_name,
"test_type": t.test_type.value,
"parent_criteria_id": t.parent_criteria_id,
"approval_status": t.approval_status.value,
"last_result": t.last_result,
"confidence": t.llm_confidence,
}
for t in tests
],
})
@mcp.tool()
def get_pending_tests(
goal_id: Annotated[str, "ID of the goal"],
) -> str:
"""
Get pending tests awaiting approval.
Returns tests that have been generated but not yet approved.
"""
if goal_id not in _pending_tests:
return json.dumps({
"goal_id": goal_id,
"pending_count": 0,
"tests": [],
})
tests = _pending_tests[goal_id]
return json.dumps({
"goal_id": goal_id,
"pending_count": len(tests),
"tests": [
{
"id": t.id,
"test_name": t.test_name,
"test_type": t.test_type.value,
"parent_criteria_id": t.parent_criteria_id,
"description": t.description,
"confidence": t.llm_confidence,
"test_code": t.test_code,
"input": t.input,
"expected_output": t.expected_output,
}
for t in tests
],
})
# =============================================================================
# PLAN LOADING AND EXECUTION
# =============================================================================
+90 -32
View File
@@ -6,8 +6,6 @@ import json
import sys
from pathlib import Path
from framework.graph import ExecutionStatus
def register_commands(subparsers: argparse._SubParsersAction) -> None:
"""Register runner commands with the main CLI."""
@@ -48,6 +46,11 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
action="store_true",
help="Only output the final result JSON",
)
run_parser.add_argument(
"--verbose", "-v",
action="store_true",
help="Show detailed execution logs (steps, LLM calls, etc.)",
)
run_parser.set_defaults(func=cmd_run)
# info command
@@ -166,8 +169,17 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
def cmd_run(args: argparse.Namespace) -> int:
"""Run an exported agent."""
import logging
from framework.runner import AgentRunner
# Set logging level (quiet by default for cleaner output)
if args.quiet:
logging.basicConfig(level=logging.ERROR, format='%(message)s')
elif getattr(args, 'verbose', False):
logging.basicConfig(level=logging.INFO, format='%(message)s')
else:
logging.basicConfig(level=logging.WARNING, format='%(message)s')
# Load input context
context = {}
if args.input:
@@ -189,12 +201,18 @@ def cmd_run(args: argparse.Namespace) -> int:
runner = AgentRunner.load(
args.agent_path,
mock_mode=args.mock,
model=getattr(args, "model", "claude-sonnet-4-20250514"),
model=getattr(args, "model", "claude-haiku-4-5-20251001"),
)
except FileNotFoundError as e:
print(f"Error: {e}", file=sys.stderr)
return 1
# Auto-inject user_id if the agent expects it but it's not provided
entry_input_keys = runner.graph.nodes[0].input_keys if runner.graph.nodes else []
if "user_id" in entry_input_keys and context.get("user_id") is None:
import os
context["user_id"] = os.environ.get("USER", "default_user")
if not args.quiet:
info = runner.info()
print(f"Agent: {info.name}")
@@ -212,12 +230,14 @@ def cmd_run(args: argparse.Namespace) -> int:
# Format output
output = {
"status": result.status.value if hasattr(result.status, "value") else str(result.status),
"completed_steps": result.completed_steps,
"results": result.results,
"success": result.success,
"steps_executed": result.steps_executed,
"output": result.output,
}
if result.feedback:
output["feedback"] = result.feedback
if result.error:
output["error"] = result.error
if result.paused_at:
output["paused_at"] = result.paused_at
# Output results
if args.output:
@@ -231,27 +251,51 @@ def cmd_run(args: argparse.Namespace) -> int:
else:
print()
print("=" * 60)
status_str = result.status.value if hasattr(result.status, "value") else str(result.status)
status_str = "SUCCESS" if result.success else "FAILED"
print(f"Status: {status_str}")
print(f"Completed steps: {len(result.completed_steps)}")
print(f"Steps executed: {result.steps_executed}")
print(f"Path: {''.join(result.path)}")
print("=" * 60)
if result.status == ExecutionStatus.COMPLETED:
if result.success:
print("\n--- Results ---")
for key, value in result.results.items():
if isinstance(value, (dict, list)):
print(f"\n{key}:")
value_str = json.dumps(value, indent=2, default=str)
if len(value_str) > 500:
value_str = value_str[:500] + "..."
print(value_str)
else:
print(f"{key}: {str(value)[:200]}")
elif result.feedback:
print(f"\nFeedback: {result.feedback}")
# Show only meaningful output keys (skip internal/intermediate values)
meaningful_keys = ["final_response", "response", "result", "answer", "output"]
# Try to find the most relevant output
shown = False
for key in meaningful_keys:
if key in result.output:
value = result.output[key]
if isinstance(value, str) and len(value) > 10:
print(value)
shown = True
break
elif isinstance(value, (dict, list)):
print(json.dumps(value, indent=2, default=str))
shown = True
break
# If no meaningful key found, show all non-internal keys
if not shown:
for key, value in result.output.items():
if not key.startswith("_") and key not in ["user_id", "request", "memory_loaded", "user_profile", "recent_context"]:
if isinstance(value, (dict, list)):
print(f"\n{key}:")
value_str = json.dumps(value, indent=2, default=str)
if len(value_str) > 300:
value_str = value_str[:300] + "..."
print(value_str)
else:
val_str = str(value)
if len(val_str) > 200:
val_str = val_str[:200] + "..."
print(f"{key}: {val_str}")
elif result.error:
print(f"\nError: {result.error}")
runner.cleanup()
return 0 if result.status == ExecutionStatus.COMPLETED else 1
return 0 if result.success else 1
def cmd_info(args: argparse.Namespace) -> int:
@@ -760,6 +804,11 @@ def cmd_shell(args: argparse.Namespace) -> int:
# STARTING FRESH: Merge new input with accumulated session memory
run_context = {**session_memory, **context}
# Auto-inject user_id if missing (for personal assistant agents)
if "user_id" in entry_input_keys and run_context.get("user_id") is None:
import os
run_context["user_id"] = os.environ.get("USER", "default_user")
# Add conversation history to context if agent expects it
if conversation_history:
run_context["_conversation_history"] = conversation_history.copy()
@@ -778,16 +827,25 @@ def cmd_shell(args: argparse.Namespace) -> int:
print(f"Steps executed: {result.steps_executed}")
print(f"Path: {''.join(result.path)}")
# Show clean output - prioritize meaningful keys
if result.output:
print("\nOutput:")
for key, value in result.output.items():
if isinstance(value, (dict, list)):
value_str = json.dumps(value, indent=2, default=str)
if len(value_str) > 300:
value_str = value_str[:300] + "..."
print(f" {key}: {value_str}")
else:
print(f" {key}: {str(value)[:200]}")
meaningful_keys = ["final_response", "response", "result", "answer", "output"]
shown = False
for key in meaningful_keys:
if key in result.output:
value = result.output[key]
if isinstance(value, str) and len(value) > 10:
print(f"\n{value}\n")
shown = True
break
if not shown:
print("\nOutput:")
for key, value in result.output.items():
if not key.startswith("_"):
val_str = str(value)[:200]
print(f" {key}: {val_str}")
if result.error:
print(f"\nError: {result.error}")
+118 -45
View File
@@ -65,10 +65,15 @@ class MCPClient:
self._session = None
self._read_stream = None
self._write_stream = None
self._stdio_context = None # Context manager for stdio_client
self._http_client: httpx.Client | None = None
self._tools: dict[str, MCPTool] = {}
self._connected = False
# Background event loop for persistent STDIO connection
self._loop = None
self._loop_thread = None
def _run_async(self, coro):
"""
Run an async coroutine, handling both sync and async contexts.
@@ -79,6 +84,13 @@ class MCPClient:
Returns:
Result of the coroutine
"""
# If we have a persistent loop (for STDIO), use it
if self._loop is not None:
import concurrent.futures
future = asyncio.run_coroutine_threadsafe(coro, self._loop)
return future.result()
# Otherwise, use the standard approach
try:
# Try to get the current event loop
asyncio.get_running_loop()
@@ -129,12 +141,12 @@ class MCPClient:
self._connected = True
def _connect_stdio(self) -> None:
"""Connect to MCP server via STDIO transport using MCP SDK."""
"""Connect to MCP server via STDIO transport using MCP SDK with persistent connection."""
if not self.config.command:
raise ValueError("command is required for STDIO transport")
try:
# Import MCP SDK
import threading
from mcp import StdioServerParameters
# Create server parameters
@@ -145,10 +157,62 @@ class MCPClient:
cwd=self.config.cwd,
)
# Store for later use in async context
# Store for later use
self._server_params = server_params
logger.info(f"Connected to MCP server '{self.config.name}' via STDIO")
# Start background event loop for persistent connection
loop_started = threading.Event()
connection_ready = threading.Event()
connection_error = []
def run_event_loop():
"""Run event loop in background thread."""
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
loop_started.set()
# Initialize persistent connection
async def init_connection():
try:
from mcp import ClientSession
from mcp.client.stdio import stdio_client
# Create persistent stdio client context
self._stdio_context = stdio_client(server_params)
self._read_stream, self._write_stream = await self._stdio_context.__aenter__()
# Create persistent session
self._session = ClientSession(self._read_stream, self._write_stream)
await self._session.__aenter__()
# Initialize session
await self._session.initialize()
connection_ready.set()
except Exception as e:
connection_error.append(e)
connection_ready.set()
# Schedule connection initialization
self._loop.create_task(init_connection())
# Run loop forever
self._loop.run_forever()
self._loop_thread = threading.Thread(target=run_event_loop, daemon=True)
self._loop_thread.start()
# Wait for loop to start
loop_started.wait(timeout=5)
if not loop_started.is_set():
raise RuntimeError("Event loop failed to start")
# Wait for connection to be ready
connection_ready.wait(timeout=10)
if connection_error:
raise connection_error[0]
logger.info(f"Connected to MCP server '{self.config.name}' via STDIO (persistent)")
except Exception as e:
raise RuntimeError(f"Failed to connect to MCP server: {e}")
@@ -196,28 +260,23 @@ class MCPClient:
raise
async def _list_tools_stdio_async(self) -> list[dict]:
"""List tools via STDIO protocol using MCP SDK."""
from mcp import ClientSession
from mcp.client.stdio import stdio_client
"""List tools via STDIO protocol using persistent session."""
if not self._session:
raise RuntimeError("STDIO session not initialized")
async with stdio_client(self._server_params) as (read, write):
async with ClientSession(read, write) as session:
# Initialize the session
await session.initialize()
# List tools using persistent session
response = await self._session.list_tools()
# List tools
response = await session.list_tools()
# Convert tools to dict format
tools_list = []
for tool in response.tools:
tools_list.append({
"name": tool.name,
"description": tool.description,
"inputSchema": tool.inputSchema,
})
# Convert tools to dict format
tools_list = []
for tool in response.tools:
tools_list.append({
"name": tool.name,
"description": tool.description,
"inputSchema": tool.inputSchema,
})
return tools_list
return tools_list
def _list_tools_http(self) -> list[dict]:
"""List tools via HTTP protocol."""
@@ -280,31 +339,26 @@ class MCPClient:
return self._call_tool_http(tool_name, arguments)
async def _call_tool_stdio_async(self, tool_name: str, arguments: dict[str, Any]) -> Any:
"""Call tool via STDIO protocol using MCP SDK."""
from mcp import ClientSession
from mcp.client.stdio import stdio_client
"""Call tool via STDIO protocol using persistent session."""
if not self._session:
raise RuntimeError("STDIO session not initialized")
async with stdio_client(self._server_params) as (read, write):
async with ClientSession(read, write) as session:
# Initialize the session
await session.initialize()
# Call tool using persistent session
result = await self._session.call_tool(tool_name, arguments=arguments)
# Call tool
result = await session.call_tool(tool_name, arguments=arguments)
# Extract content
if result.content:
# MCP returns content as a list of content items
if len(result.content) > 0:
content_item = result.content[0]
# Check if it's a text content item
if hasattr(content_item, 'text'):
return content_item.text
elif hasattr(content_item, 'data'):
return content_item.data
return result.content
# Extract content
if result.content:
# MCP returns content as a list of content items
if len(result.content) > 0:
content_item = result.content[0]
# Check if it's a text content item
if hasattr(content_item, 'text'):
return content_item.text
elif hasattr(content_item, 'data'):
return content_item.data
return result.content
return None
return None
def _call_tool_http(self, tool_name: str, arguments: dict[str, Any]) -> Any:
"""Call tool via HTTP protocol."""
@@ -336,6 +390,25 @@ class MCPClient:
def disconnect(self) -> None:
"""Disconnect from the MCP server."""
# Clean up persistent STDIO connection
if self._loop is not None:
# Stop event loop - this will cause context managers to clean up naturally
if self._loop and self._loop.is_running():
self._loop.call_soon_threadsafe(self._loop.stop)
# Wait for thread to finish
if self._loop_thread and self._loop_thread.is_alive():
self._loop_thread.join(timeout=2)
# Clear references
self._session = None
self._stdio_context = None
self._read_stream = None
self._write_stream = None
self._loop = None
self._loop_thread = None
# Clean up HTTP client
if self._http_client:
self._http_client.close()
self._http_client = None
+1 -1
View File
@@ -57,7 +57,7 @@ class AgentOrchestrator:
def __init__(
self,
llm: LLMProvider | None = None,
model: str = "claude-sonnet-4-20250514",
model: str = "claude-haiku-4-5-20251001",
):
"""
Initialize the orchestrator.
+20 -4
View File
@@ -172,7 +172,7 @@ class AgentRunner:
goal: Goal,
mock_mode: bool = False,
storage_path: Path | None = None,
model: str = "claude-sonnet-4-20250514",
model: str = "claude-haiku-4-5-20251001",
):
"""
Initialize the runner (use AgentRunner.load() instead).
@@ -196,8 +196,12 @@ class AgentRunner:
self._storage_path = storage_path
self._temp_dir = None
else:
self._temp_dir = tempfile.TemporaryDirectory()
self._storage_path = Path(self._temp_dir.name) / "runtime"
# Use persistent storage in ~/.hive by default
home = Path.home()
default_storage = home / ".hive" / "storage" / agent_path.name
default_storage.mkdir(parents=True, exist_ok=True)
self._storage_path = default_storage
self._temp_dir = None
# Initialize components
self._tool_registry = ToolRegistry()
@@ -222,7 +226,7 @@ class AgentRunner:
agent_path: str | Path,
mock_mode: bool = False,
storage_path: Path | None = None,
model: str = "claude-sonnet-4-20250514",
model: str = "claude-haiku-4-5-20251001",
) -> "AgentRunner":
"""
Load an agent from an export folder.
@@ -367,6 +371,18 @@ class AgentRunner:
# Create runtime
self._runtime = Runtime(storage_path=self._storage_path)
# Set up session context for tools (workspace_id, agent_id, session_id)
workspace_id = "default" # Could be derived from storage path
agent_id = self.graph.id or "unknown"
# Use "current" as a stable session_id for persistent memory
session_id = "current"
self._tool_registry.set_session_context(
workspace_id=workspace_id,
agent_id=agent_id,
session_id=session_id,
)
# Create LLM provider (if not mock mode and API key available)
if not self.mock_mode and os.environ.get("ANTHROPIC_API_KEY"):
from framework.llm.anthropic import AnthropicProvider
+15 -3
View File
@@ -35,6 +35,7 @@ class ToolRegistry:
def __init__(self):
self._tools: dict[str, RegisteredTool] = {}
self._mcp_clients: list[Any] = [] # List of MCPClient instances
self._session_context: dict[str, Any] = {} # Auto-injected context for tools
def register(
self,
@@ -227,6 +228,15 @@ class ToolRegistry:
"""Check if a tool is registered."""
return name in self._tools
def set_session_context(self, **context) -> None:
"""
Set session context to auto-inject into tool calls.
Args:
**context: Key-value pairs to inject (e.g., workspace_id, agent_id, session_id)
"""
self._session_context.update(context)
def register_mcp_server(
self,
server_config: dict[str, Any],
@@ -279,10 +289,12 @@ class ToolRegistry:
tool = self._convert_mcp_tool_to_framework_tool(mcp_tool)
# Create executor that calls the MCP server
def make_mcp_executor(client_ref: MCPClient, tool_name: str):
def make_mcp_executor(client_ref: MCPClient, tool_name: str, registry_ref):
def executor(inputs: dict) -> Any:
try:
result = client_ref.call_tool(tool_name, inputs)
# Inject session context for tools that need it
merged_inputs = {**registry_ref._session_context, **inputs}
result = client_ref.call_tool(tool_name, merged_inputs)
# MCP tools return content array, extract the result
if isinstance(result, list) and len(result) > 0:
if isinstance(result[0], dict) and "text" in result[0]:
@@ -298,7 +310,7 @@ class ToolRegistry:
self.register(
mcp_tool.name,
tool,
make_mcp_executor(client, mcp_tool.name),
make_mcp_executor(client, mcp_tool.name, self),
)
count += 1
+20 -6
View File
@@ -9,12 +9,15 @@ handles all the structured logging.
from datetime import datetime
from typing import Any
from pathlib import Path
import logging
import uuid
from framework.schemas.decision import Decision, Option, Outcome, DecisionType
from framework.schemas.run import Run, RunStatus
from framework.storage.backend import FileStorage
logger = logging.getLogger(__name__)
class Runtime:
"""
@@ -100,7 +103,10 @@ class Runtime:
output_data: Final output of the run
"""
if self._current_run is None:
raise RuntimeError("No run in progress")
# Gracefully handle case where run was already ended or never started
# This can happen during exception handling cascades
logger.warning("end_run called but no run in progress (already ended or never started)")
return
status = RunStatus.COMPLETED if success else RunStatus.FAILED
self._current_run.output_data = output_data or {}
@@ -158,10 +164,12 @@ class Runtime:
context: Additional context available when deciding
Returns:
The decision ID (use this to record outcome later)
The decision ID (use this to record outcome later), or empty string if no run in progress
"""
if self._current_run is None:
raise RuntimeError("No run in progress. Call start_run() first.")
# Gracefully handle case where run ended during exception handling
logger.warning(f"decide called but no run in progress: {intent}")
return ""
# Build Option objects
option_objects = []
@@ -220,7 +228,10 @@ class Runtime:
latency_ms: Time taken in milliseconds
"""
if self._current_run is None:
raise RuntimeError("No run in progress")
# Gracefully handle case where run ended during exception handling
# This can happen in cascading error scenarios
logger.warning(f"record_outcome called but no run in progress (decision_id={decision_id})")
return
outcome = Outcome(
success=success,
@@ -258,10 +269,13 @@ class Runtime:
suggested_fix: What might fix it (if known)
Returns:
The problem ID
The problem ID, or empty string if no run in progress
"""
if self._current_run is None:
raise RuntimeError("No run in progress")
# Gracefully handle case where run ended during exception handling
# Log the problem since we can't store it, then return empty ID
logger.warning(f"report_problem called but no run in progress: [{severity}] {description}")
return ""
return self._current_run.add_problem(
severity=severity,
+144
View File
@@ -0,0 +1,144 @@
"""
Goal-Based Testing Framework
A three-stage framework (Goal → Agent → Eval) where tests are LLM-generated
from success_criteria and constraints, with mandatory user approval.
## Core Flow
1. **Goal Stage**: Define success_criteria and constraints, generate constraint tests
2. **Agent Stage**: Build nodes + edges, run constraint tests during development
3. **Eval Stage**: Generate success_criteria tests, run all tests, debug failures
## Key Components
- **Schemas**: Test, TestResult, TestSuiteResult, ApprovalStatus, ErrorCategory
- **Storage**: TestStorage for persisting tests and results
- **Generation**: LLM-based test generation from Goal criteria
- **Approval**: Mandatory user approval workflow (CLI and programmatic)
- **Runner**: Parallel test execution with pytest-xdist inspired design
- **Debug**: Error categorization and fix suggestions
## MCP Tools
Testing tools are integrated into the main agent_builder_server.py (not a separate server).
This ensures the building_agent skill has access to all testing functionality:
- generate_constraint_tests, generate_success_tests
- approve_tests, run_tests, debug_test
- list_tests, get_pending_tests
## Usage
```python
from framework.testing import (
Test, TestResult, TestStorage,
ConstraintTestGenerator, SuccessCriteriaTestGenerator,
ParallelTestRunner, DebugTool,
)
# Generate tests
generator = ConstraintTestGenerator(llm)
tests = generator.generate(goal)
# Approve tests (required)
for test in tests:
test.approve("user")
storage.save_test(test)
# Run tests
runner = ParallelTestRunner()
result = runner.run_all(goal_id, agent_factory, tests)
# Debug failures
debug = DebugTool(storage)
info = debug.analyze(goal_id, test_id)
```
## CLI Commands
```bash
python -m framework test-generate goal.json
python -m framework test-approve <goal_id>
python -m framework test-run <agent_path> --goal <goal_id>
python -m framework test-debug <goal_id> <test_id>
```
"""
# Schemas
from framework.testing.test_case import (
ApprovalStatus,
TestType,
Test,
)
from framework.testing.test_result import (
ErrorCategory,
TestResult,
TestSuiteResult,
)
# Storage
from framework.testing.test_storage import TestStorage
# Generation
from framework.testing.constraint_gen import ConstraintTestGenerator
from framework.testing.success_gen import SuccessCriteriaTestGenerator
from framework.testing.prompts import (
CONSTRAINT_TEST_PROMPT,
SUCCESS_CRITERIA_TEST_PROMPT,
)
# Approval
from framework.testing.approval_types import (
ApprovalAction,
ApprovalRequest,
ApprovalResult,
BatchApprovalRequest,
BatchApprovalResult,
)
from framework.testing.approval_cli import interactive_approval, batch_approval
# Runner
from framework.testing.executor import TestExecutor
from framework.testing.parallel import ParallelTestRunner, ParallelConfig
from framework.testing.categorizer import ErrorCategorizer
# Debug
from framework.testing.debug_tool import DebugTool, DebugInfo
# CLI
from framework.testing.cli import register_testing_commands
__all__ = [
# Schemas
"ApprovalStatus",
"TestType",
"Test",
"ErrorCategory",
"TestResult",
"TestSuiteResult",
# Storage
"TestStorage",
# Generation
"ConstraintTestGenerator",
"SuccessCriteriaTestGenerator",
"CONSTRAINT_TEST_PROMPT",
"SUCCESS_CRITERIA_TEST_PROMPT",
# Approval
"ApprovalAction",
"ApprovalRequest",
"ApprovalResult",
"BatchApprovalRequest",
"BatchApprovalResult",
"interactive_approval",
"batch_approval",
# Runner
"TestExecutor",
"ParallelTestRunner",
"ParallelConfig",
"ErrorCategorizer",
# Debug
"DebugTool",
"DebugInfo",
# CLI
"register_testing_commands",
]
+295
View File
@@ -0,0 +1,295 @@
"""
Interactive CLI for reviewing and approving generated tests.
LLM-generated tests are NEVER created without user approval.
This CLI provides the interactive approval workflow.
"""
import json
import tempfile
import subprocess
import os
from typing import Callable
from framework.testing.test_case import Test, ApprovalStatus
from framework.testing.test_storage import TestStorage
from framework.testing.approval_types import (
ApprovalAction,
ApprovalRequest,
ApprovalResult,
BatchApprovalResult,
)
def interactive_approval(
tests: list[Test],
storage: TestStorage,
on_progress: Callable[[int, int], None] | None = None,
) -> list[ApprovalResult]:
"""
Interactive CLI flow for reviewing generated tests.
Displays each test and allows user to:
- [a]pprove: Accept as-is
- [r]eject: Decline with reason
- [e]dit: Modify before accepting
- [s]kip: Leave pending (decide later)
Args:
tests: List of pending tests to review
storage: TestStorage for saving decisions
on_progress: Optional callback(current, total) for progress tracking
Returns:
List of ApprovalResult for each processed test
"""
results = []
total = len(tests)
for i, test in enumerate(tests, 1):
if on_progress:
on_progress(i, total)
# Display test
_display_test(test, i, total)
# Get user action
action = _get_user_action()
# Process action
result = _process_action(test, action, storage)
results.append(result)
print() # Blank line between tests
return results
def batch_approval(
goal_id: str,
requests: list[ApprovalRequest],
storage: TestStorage,
) -> BatchApprovalResult:
"""
Process multiple approval requests at once.
Used by MCP interface for programmatic approval.
Args:
goal_id: Goal ID for the tests
requests: List of approval requests
storage: TestStorage for saving decisions
Returns:
BatchApprovalResult with counts and individual results
"""
results = []
counts = {
"approved": 0,
"modified": 0,
"rejected": 0,
"skipped": 0,
"errors": 0,
}
for req in requests:
# Validate request
valid, error = req.validate_action()
if not valid:
results.append(ApprovalResult.error_result(
req.test_id, req.action, error or "Invalid request"
))
counts["errors"] += 1
continue
# Load test
test = storage.load_test(goal_id, req.test_id)
if not test:
results.append(ApprovalResult.error_result(
req.test_id, req.action, f"Test {req.test_id} not found"
))
counts["errors"] += 1
continue
# Apply action
try:
if req.action == ApprovalAction.APPROVE:
test.approve(req.approved_by)
counts["approved"] += 1
elif req.action == ApprovalAction.MODIFY:
test.modify(req.modified_code or test.test_code, req.approved_by)
counts["modified"] += 1
elif req.action == ApprovalAction.REJECT:
test.reject(req.reason or "No reason provided")
counts["rejected"] += 1
elif req.action == ApprovalAction.SKIP:
counts["skipped"] += 1
# Save if not skipped
if req.action != ApprovalAction.SKIP:
storage.update_test(test)
results.append(ApprovalResult.success_result(
req.test_id, req.action, f"Test {req.action.value}d successfully"
))
except Exception as e:
results.append(ApprovalResult.error_result(
req.test_id, req.action, str(e)
))
counts["errors"] += 1
return BatchApprovalResult(
goal_id=goal_id,
total=len(requests),
approved=counts["approved"],
modified=counts["modified"],
rejected=counts["rejected"],
skipped=counts["skipped"],
errors=counts["errors"],
results=results,
)
def _display_test(test: Test, index: int, total: int) -> None:
"""Display a test for review."""
separator = "=" * 60
print(f"\n{separator}")
print(f"[{index}/{total}] {test.test_name}")
print(f"Type: {test.test_type.value}")
print(f"Criteria: {test.parent_criteria_id}")
print(f"Confidence: {test.llm_confidence * 100:.0f}%")
print(separator)
print(f"\nDescription: {test.description}")
if test.input:
print(f"\nInput:")
print(json.dumps(test.input, indent=2))
if test.expected_output:
print(f"\nExpected Output:")
print(json.dumps(test.expected_output, indent=2))
print(f"\nTest Code:")
print("-" * 40)
print(test.test_code)
print("-" * 40)
print("\n[a]pprove [r]eject [e]dit [s]kip")
def _get_user_action() -> ApprovalAction:
"""Get user's choice for action."""
while True:
choice = input("Your choice: ").strip().lower()
if choice == "a":
return ApprovalAction.APPROVE
elif choice == "r":
return ApprovalAction.REJECT
elif choice == "e":
return ApprovalAction.MODIFY
elif choice == "s":
return ApprovalAction.SKIP
else:
print("Invalid choice. Please enter a, r, e, or s.")
def _process_action(
test: Test,
action: ApprovalAction,
storage: TestStorage,
) -> ApprovalResult:
"""Process user's action on a test."""
try:
if action == ApprovalAction.APPROVE:
test.approve()
storage.update_test(test)
print("✓ Approved")
return ApprovalResult.success_result(test.id, action, "Approved")
elif action == ApprovalAction.REJECT:
reason = input("Rejection reason: ").strip()
if not reason:
reason = "No reason provided"
test.reject(reason)
storage.update_test(test)
print(f"✗ Rejected: {reason}")
return ApprovalResult.success_result(test.id, action, f"Rejected: {reason}")
elif action == ApprovalAction.MODIFY:
edited_code = _edit_test_code(test.test_code)
if edited_code != test.test_code:
test.modify(edited_code)
storage.update_test(test)
print("✓ Modified and approved")
return ApprovalResult.success_result(test.id, action, "Modified and approved")
else:
# No changes made, treat as approve
test.approve()
storage.update_test(test)
print("✓ Approved (no modifications)")
return ApprovalResult.success_result(test.id, ApprovalAction.APPROVE, "No modifications made")
elif action == ApprovalAction.SKIP:
print("⏭ Skipped (remains pending)")
return ApprovalResult.success_result(test.id, action, "Skipped")
else:
return ApprovalResult.error_result(test.id, action, f"Unknown action: {action}")
except Exception as e:
return ApprovalResult.error_result(test.id, action, str(e))
def _edit_test_code(code: str) -> str:
"""
Open test code in user's editor for modification.
Uses $EDITOR environment variable, falls back to vim/nano.
"""
editor = os.environ.get("EDITOR", "vim")
# Try to find an available editor
if not _command_exists(editor):
for fallback in ["nano", "vi", "notepad"]:
if _command_exists(fallback):
editor = fallback
break
# Create temp file with code
with tempfile.NamedTemporaryFile(
mode="w",
suffix=".py",
delete=False
) as f:
f.write(code)
temp_path = f.name
try:
# Open editor
subprocess.run([editor, temp_path], check=True)
# Read edited code
with open(temp_path) as f:
return f.read()
except subprocess.CalledProcessError:
print("Editor failed, keeping original code")
return code
except FileNotFoundError:
print(f"Editor '{editor}' not found, keeping original code")
return code
finally:
# Clean up temp file
try:
os.unlink(temp_path)
except OSError:
pass
def _command_exists(cmd: str) -> bool:
"""Check if a command exists in PATH."""
from shutil import which
return which(cmd) is not None
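# Minimal usage sketch (illustrative only): driving an interactive review from
# a script. The storage path and goal id below are hypothetical placeholders;
# TestStorage is assumed importable as in the CLI commands module.
if __name__ == "__main__":
    from pathlib import Path
    from framework.testing.test_storage import TestStorage
    storage = TestStorage(Path("data/tests/goal_123"))
    pending = storage.get_pending_tests("goal_123")
    if not pending:
        print("No pending tests to review")
    else:
        results = interactive_approval(pending, storage)
        print(f"Processed {len(results)} tests")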
+130
View File
@@ -0,0 +1,130 @@
"""
Types for the approval workflow.
These types are used for both interactive CLI approval and
programmatic/MCP-based approval.
"""
from enum import Enum
from datetime import datetime
from typing import Any
from pydantic import BaseModel, Field
class ApprovalAction(str, Enum):
"""Actions a user can take on a generated test."""
APPROVE = "approve" # Accept as-is
MODIFY = "modify" # Accept with modifications
REJECT = "reject" # Decline
SKIP = "skip" # Leave pending (decide later)
class ApprovalRequest(BaseModel):
"""
Request to approve/modify/reject a generated test.
Used by both CLI and MCP interfaces.
"""
test_id: str
action: ApprovalAction
modified_code: str | None = Field(
default=None,
description="New code if action is MODIFY"
)
reason: str | None = Field(
default=None,
description="Rejection reason if action is REJECT"
)
approved_by: str = "user"
def validate_action(self) -> tuple[bool, str | None]:
"""
Validate that the request has required fields for its action.
Returns:
Tuple of (is_valid, error_message)
"""
if self.action == ApprovalAction.MODIFY and not self.modified_code:
return False, "modified_code is required for MODIFY action"
if self.action == ApprovalAction.REJECT and not self.reason:
return False, "reason is required for REJECT action"
return True, None
class ApprovalResult(BaseModel):
"""
Result of processing an approval request.
"""
test_id: str
action: ApprovalAction
success: bool
message: str | None = None
error: str | None = None
timestamp: datetime = Field(default_factory=datetime.now)
@classmethod
def success_result(
cls, test_id: str, action: ApprovalAction, message: str | None = None
) -> "ApprovalResult":
"""Create a successful result."""
return cls(
test_id=test_id,
action=action,
success=True,
message=message,
)
@classmethod
def error_result(
cls, test_id: str, action: ApprovalAction, error: str
) -> "ApprovalResult":
"""Create an error result."""
return cls(
test_id=test_id,
action=action,
success=False,
error=error,
)
class BatchApprovalRequest(BaseModel):
"""
Request to approve multiple tests at once.
Useful for the MCP interface, where the user reviews all tests and submits decisions in one call.
"""
goal_id: str
approvals: list[ApprovalRequest]
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
return {
"goal_id": self.goal_id,
"approvals": [a.model_dump() for a in self.approvals],
}
class BatchApprovalResult(BaseModel):
"""
Result of processing a batch approval request.
"""
goal_id: str
total: int
approved: int
modified: int
rejected: int
skipped: int
errors: int
results: list[ApprovalResult]
def summary(self) -> str:
"""Return a summary string."""
return (
f"Processed {self.total} tests: "
f"{self.approved} approved, "
f"{self.modified} modified, "
f"{self.rejected} rejected, "
f"{self.skipped} skipped, "
f"{self.errors} errors"
)
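# Illustrative sketch: constructing and validating approval requests the way
# an MCP client might before calling batch_approval. All IDs are hypothetical.
if __name__ == "__main__":
    bad = ApprovalRequest(test_id="test_ef56ab78", action=ApprovalAction.REJECT)
    valid, error = bad.validate_action()
    print(valid, error)  # False, "reason is required for REJECT action"
    batch = BatchApprovalRequest(
        goal_id="goal_123",
        approvals=[
            ApprovalRequest(test_id="test_ab12cd34", action=ApprovalAction.APPROVE),
            ApprovalRequest(
                test_id="test_ef56ab78",
                action=ApprovalAction.REJECT,
                reason="Duplicates an existing constraint test",
            ),
        ],
    )
    print(len(batch.to_dict()["approvals"]))  # 2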
+260
View File
@@ -0,0 +1,260 @@
"""
Error categorization for test failures.
Categorizes errors to guide iteration strategy:
- LOGIC_ERROR: Goal definition is wrong → update success_criteria/constraints
- IMPLEMENTATION_ERROR: Code bug → fix nodes/edges in Agent stage
- EDGE_CASE: New scenario discovered → add new test only
"""
import re
from typing import Any
from framework.testing.test_result import ErrorCategory, TestResult
class ErrorCategorizer:
"""
Categorize test failures for guiding iteration.
Uses pattern matching heuristics to classify errors.
Each category has different implications for how to fix.
"""
# Patterns indicating goal/criteria definition is wrong
LOGIC_ERROR_PATTERNS = [
r"goal not achieved",
r"constraint violated:?\s*core",
r"fundamental assumption",
r"success criteria mismatch",
r"criteria not met",
r"expected behavior incorrect",
r"specification error",
r"requirement mismatch",
]
# Patterns indicating code/implementation bug
IMPLEMENTATION_ERROR_PATTERNS = [
r"TypeError",
r"AttributeError",
r"KeyError",
r"IndexError",
r"ValueError",
r"NameError",
r"ImportError",
r"ModuleNotFoundError",
r"RuntimeError",
r"NullPointerException",
r"NoneType.*has no attribute",
r"tool call failed",
r"node execution error",
r"agent execution failed",
r"assertion.*failed",
r"AssertionError",
r"expected.*but got",
r"unexpected.*type",
r"missing required",
r"invalid.*argument",
]
# Patterns indicating edge case / new scenario
EDGE_CASE_PATTERNS = [
r"boundary condition",
r"timeout",
r"connection.*timeout",
r"request.*timeout",
r"unexpected format",
r"unexpected response",
r"rare input",
r"empty.*result",
r"null.*value",
r"empty.*response",
r"no.*results",
r"rate.*limit",
r"quota.*exceeded",
r"retry.*exhausted",
r"unicode.*error",
r"encoding.*error",
r"special.*character",
]
def __init__(self):
"""Initialize categorizer with compiled patterns."""
self._logic_patterns = [
re.compile(p, re.IGNORECASE) for p in self.LOGIC_ERROR_PATTERNS
]
self._impl_patterns = [
re.compile(p, re.IGNORECASE) for p in self.IMPLEMENTATION_ERROR_PATTERNS
]
self._edge_patterns = [
re.compile(p, re.IGNORECASE) for p in self.EDGE_CASE_PATTERNS
]
def categorize(self, result: TestResult) -> ErrorCategory | None:
"""
Categorize a test failure.
Args:
result: TestResult to categorize
Returns:
ErrorCategory if test failed, None if passed
"""
if result.passed:
return None
# Combine error sources for analysis
error_text = self._get_error_text(result)
# Check patterns in priority order
# Logic errors take precedence (wrong goal definition)
for pattern in self._logic_patterns:
if pattern.search(error_text):
return ErrorCategory.LOGIC_ERROR
# Then implementation errors (code bugs)
for pattern in self._impl_patterns:
if pattern.search(error_text):
return ErrorCategory.IMPLEMENTATION_ERROR
# Then edge cases (new scenarios)
for pattern in self._edge_patterns:
if pattern.search(error_text):
return ErrorCategory.EDGE_CASE
# Default to implementation error (most common)
return ErrorCategory.IMPLEMENTATION_ERROR
def categorize_with_confidence(
self, result: TestResult
) -> tuple[ErrorCategory | None, float]:
"""
Categorize with a confidence score.
Args:
result: TestResult to categorize
Returns:
Tuple of (category, confidence 0-1)
"""
if result.passed:
return None, 1.0
error_text = self._get_error_text(result)
# Count pattern matches for each category
logic_matches = sum(
1 for p in self._logic_patterns if p.search(error_text)
)
impl_matches = sum(
1 for p in self._impl_patterns if p.search(error_text)
)
edge_matches = sum(
1 for p in self._edge_patterns if p.search(error_text)
)
total_matches = logic_matches + impl_matches + edge_matches
if total_matches == 0:
# No pattern matches, default to implementation with low confidence
return ErrorCategory.IMPLEMENTATION_ERROR, 0.3
# Calculate confidence based on match dominance
if logic_matches >= impl_matches and logic_matches >= edge_matches:
confidence = logic_matches / total_matches if total_matches > 0 else 0.5
return ErrorCategory.LOGIC_ERROR, min(0.9, 0.5 + confidence * 0.4)
if impl_matches >= logic_matches and impl_matches >= edge_matches:
confidence = impl_matches / total_matches if total_matches > 0 else 0.5
return ErrorCategory.IMPLEMENTATION_ERROR, min(0.9, 0.5 + confidence * 0.4)
confidence = edge_matches / total_matches if total_matches > 0 else 0.5
return ErrorCategory.EDGE_CASE, min(0.9, 0.5 + confidence * 0.4)
def _get_error_text(self, result: TestResult) -> str:
"""Extract all error text from a result for analysis."""
parts = []
if result.error_message:
parts.append(result.error_message)
if result.stack_trace:
parts.append(result.stack_trace)
# Include log messages
for log in result.runtime_logs:
if log.get("level") in ("ERROR", "CRITICAL", "WARNING"):
parts.append(str(log.get("msg", "")))
return " ".join(parts)
def get_fix_suggestion(self, category: ErrorCategory) -> str:
"""
Get a fix suggestion based on error category.
Args:
category: ErrorCategory from categorization
Returns:
Human-readable fix suggestion
"""
suggestions = {
ErrorCategory.LOGIC_ERROR: (
"Review and update success_criteria or constraints in the Goal definition. "
"The goal specification may not accurately describe the desired behavior."
),
ErrorCategory.IMPLEMENTATION_ERROR: (
"Fix the code in agent nodes/edges. "
"There's a bug in the implementation that needs to be corrected."
),
ErrorCategory.EDGE_CASE: (
"Add a new test for this edge case scenario. "
"This is a valid scenario that wasn't covered by existing tests."
),
}
return suggestions.get(category, "Review the test and agent implementation.")
def get_iteration_guidance(self, category: ErrorCategory) -> dict[str, Any]:
"""
Get detailed iteration guidance based on error category.
Returns a dict with:
- stage: Which stage to return to (Goal, Agent, Eval)
- action: What action to take
- restart_required: Whether full 3-step flow restart is needed
"""
guidance = {
ErrorCategory.LOGIC_ERROR: {
"stage": "Goal",
"action": "Update success_criteria or constraints",
"restart_required": True,
"description": (
"The goal definition is incorrect. Update the success criteria "
"or constraints, then restart the full Goal → Agent → Eval flow."
),
},
ErrorCategory.IMPLEMENTATION_ERROR: {
"stage": "Agent",
"action": "Fix nodes/edges implementation",
"restart_required": False,
"description": (
"There's a code bug. Fix the agent implementation, "
"then re-run Eval (skip Goal stage)."
),
},
ErrorCategory.EDGE_CASE: {
"stage": "Eval",
"action": "Add new test only",
"restart_required": False,
"description": (
"This is a new scenario. Add a test for it and continue "
"in the Eval stage."
),
},
}
return guidance.get(category, {
"stage": "Unknown",
"action": "Review manually",
"restart_required": False,
"description": "Unable to determine category. Manual review required.",
})
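# Illustrative sketch: categorizing a synthetic failure. Only the fields read
# by _get_error_text are set; the remaining TestResult fields keep their
# defaults, as in the runner's own error-result construction.
if __name__ == "__main__":
    failing = TestResult(
        test_id="test_ab12cd34",
        passed=False,
        duration_ms=42,
        error_message="KeyError: 'videos' - tool call failed",
    )
    categorizer = ErrorCategorizer()
    category, confidence = categorizer.categorize_with_confidence(failing)
    print(category, f"{confidence:.2f}")  # implementation-error patterns dominate
    print(categorizer.get_fix_suggestion(category))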
+413
View File
@@ -0,0 +1,413 @@
"""
CLI commands for goal-based testing.
Provides commands:
- test-generate: Generate tests from a goal
- test-approve: Review and approve pending tests
- test-run: Run tests for an agent
- test-debug: Debug a failed test
"""
import argparse
import json
import sys
from pathlib import Path
from framework.graph.goal import Goal
from framework.testing.test_case import TestType
from framework.testing.test_storage import TestStorage
from framework.testing.constraint_gen import ConstraintTestGenerator
from framework.testing.success_gen import SuccessCriteriaTestGenerator
from framework.testing.approval_cli import interactive_approval
from framework.testing.parallel import ParallelTestRunner, ParallelConfig, AgentFactory
from framework.testing.debug_tool import DebugTool
DEFAULT_STORAGE_PATH = Path("data/tests")
def register_testing_commands(subparsers: argparse._SubParsersAction) -> None:
"""Register testing CLI commands."""
# test-generate
gen_parser = subparsers.add_parser(
"test-generate",
help="Generate tests from goal criteria",
)
gen_parser.add_argument(
"goal_file",
help="Path to goal JSON file",
)
gen_parser.add_argument(
"--type",
choices=["constraint", "success", "all"],
default="all",
help="Type of tests to generate",
)
gen_parser.add_argument(
"--auto-approve",
action="store_true",
help="Skip interactive approval (use with caution)",
)
gen_parser.add_argument(
"--output",
"-o",
help="Output directory for tests (default: data/tests/<goal_id>)",
)
gen_parser.set_defaults(func=cmd_test_generate)
# test-approve
approve_parser = subparsers.add_parser(
"test-approve",
help="Review and approve pending tests",
)
approve_parser.add_argument(
"goal_id",
help="Goal ID to review tests for",
)
approve_parser.add_argument(
"--storage",
help="Storage directory (default: data/tests/<goal_id>)",
)
approve_parser.set_defaults(func=cmd_test_approve)
# test-run
run_parser = subparsers.add_parser(
"test-run",
help="Run tests for an agent",
)
run_parser.add_argument(
"agent_path",
help="Path to agent export folder",
)
run_parser.add_argument(
"--goal",
"-g",
required=True,
help="Goal ID to run tests for",
)
run_parser.add_argument(
"--parallel",
"-p",
type=int,
default=0,
help="Number of parallel workers (0 for sequential)",
)
run_parser.add_argument(
"--fail-fast",
action="store_true",
help="Stop on first failure",
)
run_parser.add_argument(
"--type",
choices=["constraint", "success", "edge_case", "all"],
default="all",
help="Type of tests to run",
)
run_parser.set_defaults(func=cmd_test_run)
# test-debug
debug_parser = subparsers.add_parser(
"test-debug",
help="Debug a failed test",
)
debug_parser.add_argument(
"goal_id",
help="Goal ID",
)
debug_parser.add_argument(
"test_id",
help="Test ID to debug",
)
debug_parser.add_argument(
"--run-id",
help="Runtime run ID for detailed logs",
)
debug_parser.set_defaults(func=cmd_test_debug)
# test-list
list_parser = subparsers.add_parser(
"test-list",
help="List tests for a goal",
)
list_parser.add_argument(
"goal_id",
help="Goal ID",
)
list_parser.add_argument(
"--status",
choices=["pending", "approved", "modified", "rejected", "all"],
default="all",
help="Filter by approval status",
)
list_parser.set_defaults(func=cmd_test_list)
# test-stats
stats_parser = subparsers.add_parser(
"test-stats",
help="Show test statistics for a goal",
)
stats_parser.add_argument(
"goal_id",
help="Goal ID",
)
stats_parser.set_defaults(func=cmd_test_stats)
def cmd_test_generate(args: argparse.Namespace) -> int:
"""Generate tests from a goal file."""
# Load goal
goal_path = Path(args.goal_file)
if not goal_path.exists():
print(f"Error: Goal file not found: {goal_path}")
return 1
with open(goal_path) as f:
goal = Goal.model_validate_json(f.read())
print(f"Loaded goal: {goal.name} ({goal.id})")
# Determine output directory
output_dir = Path(args.output) if args.output else DEFAULT_STORAGE_PATH / goal.id
storage = TestStorage(output_dir)
# Get LLM provider
try:
from framework.llm import AnthropicProvider
llm = AnthropicProvider()
except Exception as e:
print(f"Error: Failed to initialize LLM provider: {e}")
return 1
all_tests = []
# Generate constraint tests
if args.type in ("constraint", "all"):
print(f"\nGenerating constraint tests for {len(goal.constraints)} constraints...")
generator = ConstraintTestGenerator(llm)
constraint_tests = generator.generate(goal)
all_tests.extend(constraint_tests)
print(f"Generated {len(constraint_tests)} constraint tests")
# Generate success criteria tests
if args.type in ("success", "all"):
print(f"\nGenerating success criteria tests for {len(goal.success_criteria)} criteria...")
generator = SuccessCriteriaTestGenerator(llm)
success_tests = generator.generate(goal)
all_tests.extend(success_tests)
print(f"Generated {len(success_tests)} success criteria tests")
if not all_tests:
print("\nNo tests generated.")
return 0
print(f"\nTotal tests generated: {len(all_tests)}")
# Approval
if args.auto_approve:
print("\nAuto-approving all tests...")
for test in all_tests:
test.approve("cli-auto")
storage.save_test(test)
print(f"Saved {len(all_tests)} tests to {output_dir}")
else:
print("\nStarting interactive approval...")
# Save pending tests first
for test in all_tests:
storage.save_test(test)
results = interactive_approval(all_tests, storage)
approved = sum(1 for r in results if r.action.value in ("approve", "modify"))
print(f"\nApproved: {approved}/{len(all_tests)} tests")
return 0
def cmd_test_approve(args: argparse.Namespace) -> int:
"""Review and approve pending tests."""
storage_path = Path(args.storage) if args.storage else DEFAULT_STORAGE_PATH / args.goal_id
storage = TestStorage(storage_path)
pending = storage.get_pending_tests(args.goal_id)
if not pending:
print(f"No pending tests for goal {args.goal_id}")
return 0
print(f"Found {len(pending)} pending tests\n")
results = interactive_approval(pending, storage)
approved = sum(1 for r in results if r.action.value in ("approve", "modify"))
print(f"\nApproved: {approved}/{len(pending)} tests")
return 0
def cmd_test_run(args: argparse.Namespace) -> int:
"""Run tests for an agent."""
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal)
# Get approved tests
tests = storage.get_approved_tests(args.goal)
# Filter by type
if args.type != "all":
type_map = {
"constraint": TestType.CONSTRAINT,
"success": TestType.SUCCESS_CRITERIA,
"edge_case": TestType.EDGE_CASE,
}
filter_type = type_map.get(args.type)
if filter_type:
tests = [t for t in tests if t.test_type == filter_type]
if not tests:
print(f"No approved tests found for goal {args.goal}")
return 1
print(f"Running {len(tests)} tests...\n")
# Configure runner
config = ParallelConfig(
num_workers=args.parallel if args.parallel > 0 else 1,
fail_fast=args.fail_fast,
)
# Run with progress - use AgentFactory for picklable parallel execution
runner = ParallelTestRunner(config, storage)
def on_result(result):
status = "" if result.passed else ""
print(f" {status} {result.test_id} ({result.duration_ms}ms)")
result = runner.run_all(
goal_id=args.goal,
agent_factory=AgentFactory(args.agent_path),
tests=tests,
on_result=on_result,
)
# Print summary
print(f"\n{'=' * 40}")
print(f"Results: {result.passed}/{result.total} passed ({result.pass_rate:.1%})")
print(f"Duration: {result.duration_ms}ms")
if not result.all_passed:
print(f"\nFailed tests:")
for r in result.get_failed_results():
print(f" - {r.test_id}: {r.error_message}")
if r.error_category:
print(f" Category: {r.error_category.value}")
return 0 if result.all_passed else 1
def cmd_test_debug(args: argparse.Namespace) -> int:
"""Debug a failed test."""
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id)
# Try to load runtime storage
runtime_storage = None
try:
from framework.storage.backend import FileStorage
runtime_storage = FileStorage(f"data/runtime/{args.goal_id}")
except Exception:
pass
debug_tool = DebugTool(storage, runtime_storage)
info = debug_tool.analyze(args.goal_id, args.test_id, args.run_id)
# Print debug info
print(f"Debug Info for: {info.test_name}")
print("=" * 50)
print(f"\nTest ID: {info.test_id}")
print(f"Passed: {info.passed}")
if info.error_category:
print(f"\nError Category: {info.error_category}")
print(f"Suggested Fix: {info.suggested_fix}")
if info.error_message:
print(f"\nError Message:\n{info.error_message}")
if info.stack_trace:
print(f"\nStack Trace:\n{info.stack_trace}")
if info.iteration_guidance:
print(f"\nIteration Guidance:")
print(f" Stage: {info.iteration_guidance.get('stage')}")
print(f" Action: {info.iteration_guidance.get('action')}")
print(f" Restart Required: {info.iteration_guidance.get('restart_required')}")
print(f"\nInput:\n{json.dumps(info.input, indent=2)}")
print(f"\nExpected:\n{json.dumps(info.expected, indent=2)}")
print(f"\nActual:\n{json.dumps(info.actual, indent=2, default=str)}")
return 0
def cmd_test_list(args: argparse.Namespace) -> int:
"""List tests for a goal."""
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id)
tests = storage.get_tests_by_goal(args.goal_id)
# Filter by status
if args.status != "all":
from framework.testing.test_case import ApprovalStatus
try:
filter_status = ApprovalStatus(args.status)
tests = [t for t in tests if t.approval_status == filter_status]
except ValueError:
pass
if not tests:
print(f"No tests found for goal {args.goal_id}")
return 0
print(f"Tests for goal {args.goal_id}:\n")
for t in tests:
status_icon = {
    "pending": "○",
    "approved": "✓",
    "modified": "✓*",
    "rejected": "✗",
}.get(t.approval_status.value, "?")
result_icon = ""
if t.last_result:
result_icon = " [PASS]" if t.last_result == "passed" else " [FAIL]"
print(f" {status_icon} {t.test_name} ({t.test_type.value}){result_icon}")
print(f" ID: {t.id}")
print(f" Criteria: {t.parent_criteria_id}")
if t.llm_confidence:
print(f" Confidence: {t.llm_confidence:.0%}")
print()
return 0
def cmd_test_stats(args: argparse.Namespace) -> int:
"""Show test statistics."""
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id)
stats = storage.get_stats()
print(f"Statistics for goal {args.goal_id}:\n")
print(f" Total tests: {stats['total_tests']}")
print(f"\n By approval status:")
for status, count in stats["by_approval"].items():
print(f" {status}: {count}")
# Get pass/fail stats
tests = storage.get_approved_tests(args.goal_id)
passed = sum(1 for t in tests if t.last_result == "passed")
failed = sum(1 for t in tests if t.last_result == "failed")
not_run = sum(1 for t in tests if t.last_result is None)
print(f"\n Execution results:")
print(f" Passed: {passed}")
print(f" Failed: {failed}")
print(f" Not run: {not_run}")
return 0
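# Illustrative wiring sketch: mounting these subcommands on a standalone
# parser. The program name is hypothetical; the real CLI may register them
# alongside other command groups.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog="hive-test", description="Goal-based testing")
    subparsers = parser.add_subparsers(dest="command", required=True)
    register_testing_commands(subparsers)
    args = parser.parse_args()
    sys.exit(args.func(args))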
+201
View File
@@ -0,0 +1,201 @@
"""
Constraint test generator.
Generates tests for Goal constraints using LLM.
Tests are returned with PENDING approval status.
"""
import uuid
from typing import TYPE_CHECKING
from framework.graph.goal import Goal, Constraint
from framework.testing.test_case import Test, TestType, ApprovalStatus
from framework.testing.prompts import CONSTRAINT_TEST_PROMPT
from framework.llm.provider import Tool, ToolUse, ToolResult
if TYPE_CHECKING:
from framework.llm.provider import LLMProvider
# Tool for collecting generated tests - Claude handles JSON escaping automatically
SUBMIT_TEST_TOOL = Tool(
name="submit_test",
description="Submit a generated constraint test. Call once per test.",
parameters={
"properties": {
"constraint_id": {
"type": "string",
"description": "ID of the constraint being tested",
},
"test_name": {
"type": "string",
"description": "pytest function name, e.g., test_constraint_api_limits_respected",
},
"test_code": {
"type": "string",
"description": "Complete Python test function code",
},
"description": {
"type": "string",
"description": "What the test validates",
},
"input": {
"type": "object",
"description": "Test input data",
},
"expected_output": {
"type": "object",
"description": "Expected output",
},
"confidence": {
"type": "number",
"description": "Confidence score 0-1",
},
},
"required": ["constraint_id", "test_name", "test_code", "description", "confidence"],
},
)
class ConstraintTestGenerator:
"""
Generate constraint tests from Goal constraints.
Generated tests require user approval before being added to the test suite.
"""
def __init__(self, llm: "LLMProvider"):
"""
Initialize generator with LLM provider.
Args:
llm: LLM provider for test generation (e.g., AnthropicProvider)
"""
self.llm = llm
def generate(self, goal: Goal) -> list[Test]:
"""
Generate tests for all constraints in a goal.
Args:
goal: Goal with constraints to test
Returns:
List of Test objects with approval_status=PENDING.
These MUST be approved before being added to the test suite.
"""
if not goal.constraints:
return []
# Format prompt
prompt = CONSTRAINT_TEST_PROMPT.format(
goal_name=goal.name,
goal_description=goal.description,
constraints_formatted=self._format_constraints(goal.constraints),
)
# Collect tests via tool calls - Claude handles JSON escaping automatically
collected_tests: list[dict] = []
def tool_executor(tool_use: ToolUse) -> ToolResult:
if tool_use.name == "submit_test":
collected_tests.append(tool_use.input)
return ToolResult(
tool_use_id=tool_use.id, content="Test recorded successfully"
)
return ToolResult(
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
)
self.llm.complete_with_tools(
messages=[{"role": "user", "content": prompt}],
system="You are a test generation expert. For each constraint, call the submit_test tool with the test details.",
tools=[SUBMIT_TEST_TOOL],
tool_executor=tool_executor,
max_iterations=20,
)
return self._create_tests_from_collected(collected_tests, goal.id)
def generate_for_constraint(
self, goal: Goal, constraint: Constraint
) -> list[Test]:
"""
Generate tests for a single constraint.
Args:
goal: Goal containing the constraint
constraint: Specific constraint to test
Returns:
List of Test objects for the constraint
"""
# Format prompt with just this constraint
prompt = CONSTRAINT_TEST_PROMPT.format(
goal_name=goal.name,
goal_description=goal.description,
constraints_formatted=self._format_constraint(constraint),
)
# Collect tests via tool calls
collected_tests: list[dict] = []
def tool_executor(tool_use: ToolUse) -> ToolResult:
if tool_use.name == "submit_test":
collected_tests.append(tool_use.input)
return ToolResult(
tool_use_id=tool_use.id, content="Test recorded successfully"
)
return ToolResult(
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
)
self.llm.complete_with_tools(
messages=[{"role": "user", "content": prompt}],
system="You are a test generation expert. Call the submit_test tool with the test details.",
tools=[SUBMIT_TEST_TOOL],
tool_executor=tool_executor,
max_iterations=10,
)
return self._create_tests_from_collected(collected_tests, goal.id)
def _format_constraints(self, constraints: list[Constraint]) -> str:
"""Format constraints for prompt."""
lines = []
for c in constraints:
lines.append(self._format_constraint(c))
lines.append("")
return "\n".join(lines)
def _format_constraint(self, constraint: Constraint) -> str:
"""Format a single constraint for prompt."""
severity = "HARD" if constraint.constraint_type == "hard" else "SOFT"
return f"""### Constraint: {constraint.id}
- Type: {severity} ({constraint.constraint_type})
- Category: {constraint.category}
- Description: {constraint.description}
- Check: {constraint.check}"""
def _create_tests_from_collected(
self, collected: list[dict], goal_id: str
) -> list[Test]:
"""Create Test objects from tool call data."""
tests = []
for td in collected:
test = Test(
id=f"test_{uuid.uuid4().hex[:8]}",
goal_id=goal_id,
parent_criteria_id=td.get("constraint_id", "unknown"),
test_type=TestType.CONSTRAINT,
test_name=td.get("test_name", "unnamed_test"),
test_code=td.get("test_code", ""),
description=td.get("description", ""),
input=td.get("input", {}),
expected_output=td.get("expected_output", {}),
generated_by="llm",
llm_confidence=float(td.get("confidence", 0.5)),
approval_status=ApprovalStatus.PENDING,
)
tests.append(test)
return tests
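# Illustrative sketch: generating pending constraint tests for a goal loaded
# from disk. The goal file path is hypothetical; AnthropicProvider is assumed
# to satisfy the LLMProvider interface used above.
if __name__ == "__main__":
    from framework.llm import AnthropicProvider
    with open("data/goals/goal_123.json") as f:
        goal = Goal.model_validate_json(f.read())
    tests = ConstraintTestGenerator(AnthropicProvider()).generate(goal)
    for t in tests:
        print(f"{t.test_name} [{t.approval_status.value}] confidence={t.llm_confidence:.0%}")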
+286
View File
@@ -0,0 +1,286 @@
"""
Debug tool for analyzing failed tests.
Provides detailed information for debugging:
- Test input and expected output
- Actual output and error details
- Error categorization
- Runtime logs and execution path
- Fix suggestions
"""
from typing import Any
from pydantic import BaseModel, Field
from framework.testing.test_case import Test
from framework.testing.test_result import TestResult, ErrorCategory
from framework.testing.test_storage import TestStorage
from framework.testing.categorizer import ErrorCategorizer
class DebugInfo(BaseModel):
"""
Comprehensive debug information for a failed test.
"""
test_id: str
test_name: str
# Test definition
input: dict[str, Any] = Field(default_factory=dict)
expected: dict[str, Any] = Field(default_factory=dict)
# Actual result
actual: Any = None
passed: bool = False
# Error details
error_message: str | None = None
error_category: str | None = None
stack_trace: str | None = None
# Runtime data
logs: list[dict[str, Any]] = Field(default_factory=list)
runtime_data: dict[str, Any] = Field(default_factory=dict)
# Fix guidance
suggested_fix: str | None = None
iteration_guidance: dict[str, Any] = Field(default_factory=dict)
def to_dict(self) -> dict[str, Any]:
"""Convert to dict for JSON serialization."""
return self.model_dump()
class DebugTool:
"""
Debug tool for analyzing failed tests.
Integrates with:
- TestStorage for test and result data
- Runtime storage (optional) for decision logs
- ErrorCategorizer for classification
"""
def __init__(
self,
test_storage: TestStorage,
runtime_storage: Any | None = None,
):
"""
Initialize debug tool.
Args:
test_storage: Storage for test and result data
runtime_storage: Optional FileStorage for Runtime data
"""
self.test_storage = test_storage
self.runtime_storage = runtime_storage
self.categorizer = ErrorCategorizer()
def analyze(
self,
goal_id: str,
test_id: str,
run_id: str | None = None,
) -> DebugInfo:
"""
Get detailed debug info for a failed test.
Args:
goal_id: Goal ID containing the test
test_id: ID of the test to analyze
run_id: Optional Runtime run ID for detailed logs
Returns:
DebugInfo with comprehensive debug data
"""
# Load test
test = self.test_storage.load_test(goal_id, test_id)
if not test:
return DebugInfo(
test_id=test_id,
test_name="unknown",
error_message=f"Test {test_id} not found in goal {goal_id}",
)
# Load latest result
result = self.test_storage.get_latest_result(test_id)
# Build debug info
debug_info = DebugInfo(
test_id=test_id,
test_name=test.test_name,
input=test.input,
expected=test.expected_output,
)
if result:
debug_info.actual = result.actual_output
debug_info.passed = result.passed
debug_info.error_message = result.error_message
debug_info.stack_trace = result.stack_trace
debug_info.logs = result.runtime_logs
# Set category
if result.error_category:
debug_info.error_category = result.error_category.value
elif not result.passed:
# Categorize if not already done
category = self.categorizer.categorize(result)
if category:
debug_info.error_category = category.value
# Get runtime data if available
if run_id and self.runtime_storage:
debug_info.runtime_data = self._get_runtime_data(run_id)
# Generate fix suggestions
if debug_info.error_category:
category = ErrorCategory(debug_info.error_category)
debug_info.suggested_fix = self.categorizer.get_fix_suggestion(category)
debug_info.iteration_guidance = self.categorizer.get_iteration_guidance(category)
return debug_info
def analyze_result(
self,
test: Test,
result: TestResult,
run_id: str | None = None,
) -> DebugInfo:
"""
Analyze a test result directly (without loading from storage).
Args:
test: The Test that was run
result: The TestResult to analyze
run_id: Optional Runtime run ID
Returns:
DebugInfo with debug data
"""
debug_info = DebugInfo(
test_id=test.id,
test_name=test.test_name,
input=test.input,
expected=test.expected_output,
actual=result.actual_output,
passed=result.passed,
error_message=result.error_message,
stack_trace=result.stack_trace,
logs=result.runtime_logs,
)
# Categorize
if result.error_category:
debug_info.error_category = result.error_category.value
elif not result.passed:
category = self.categorizer.categorize(result)
if category:
debug_info.error_category = category.value
# Runtime data
if run_id and self.runtime_storage:
debug_info.runtime_data = self._get_runtime_data(run_id)
# Fix suggestions
if debug_info.error_category:
category = ErrorCategory(debug_info.error_category)
debug_info.suggested_fix = self.categorizer.get_fix_suggestion(category)
debug_info.iteration_guidance = self.categorizer.get_iteration_guidance(category)
return debug_info
def get_failure_summary(
self,
goal_id: str,
) -> dict[str, Any]:
"""
Get summary of all failures for a goal.
Returns:
Dict with failure counts by category and test IDs
"""
tests = self.test_storage.get_tests_by_goal(goal_id)
failures_by_category: dict[str, list[str]] = {
"logic_error": [],
"implementation_error": [],
"edge_case": [],
"uncategorized": [],
}
for test in tests:
if test.last_result == "failed":
result = self.test_storage.get_latest_result(test.id)
if result and result.error_category:
failures_by_category[result.error_category.value].append(test.id)
else:
failures_by_category["uncategorized"].append(test.id)
return {
"goal_id": goal_id,
"total_failures": sum(len(ids) for ids in failures_by_category.values()),
"by_category": failures_by_category,
"iteration_suggestions": self._get_iteration_suggestions(failures_by_category),
}
def _get_runtime_data(self, run_id: str) -> dict[str, Any]:
"""Extract runtime data from Runtime storage."""
if not self.runtime_storage:
return {}
try:
run = self.runtime_storage.load_run(run_id)
if not run:
return {"error": f"Run {run_id} not found"}
return {
"execution_path": run.metrics.nodes_executed if hasattr(run, "metrics") else [],
"decisions": [
d.model_dump() if hasattr(d, "model_dump") else str(d)
for d in getattr(run, "decisions", [])
],
"problems": [
p.model_dump() if hasattr(p, "model_dump") else str(p)
for p in getattr(run, "problems", [])
],
"status": run.status.value if hasattr(run, "status") else "unknown",
}
except Exception as e:
return {"error": f"Failed to load runtime data: {e}"}
def _get_iteration_suggestions(
self,
failures_by_category: dict[str, list[str]],
) -> list[str]:
"""Generate iteration suggestions based on failure categories."""
suggestions = []
if failures_by_category["logic_error"]:
suggestions.append(
f"Found {len(failures_by_category['logic_error'])} logic errors. "
"Review and update Goal success_criteria/constraints, then restart "
"the full Goal → Agent → Eval flow."
)
if failures_by_category["implementation_error"]:
suggestions.append(
f"Found {len(failures_by_category['implementation_error'])} implementation errors. "
"Fix agent node/edge code and re-run Eval."
)
if failures_by_category["edge_case"]:
suggestions.append(
f"Found {len(failures_by_category['edge_case'])} edge cases. "
"These are new scenarios - add tests for them."
)
if failures_by_category["uncategorized"]:
suggestions.append(
f"Found {len(failures_by_category['uncategorized'])} uncategorized failures. "
"Manual review required."
)
return suggestions
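# Illustrative sketch: summarizing failures for a goal without runtime storage
# (runtime_data stays empty). The storage path and goal id are hypothetical.
if __name__ == "__main__":
    from pathlib import Path
    tool = DebugTool(TestStorage(Path("data/tests/goal_123")))
    summary = tool.get_failure_summary("goal_123")
    print(f"{summary['total_failures']} failures")
    for suggestion in summary["iteration_suggestions"]:
        print("-", suggestion)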
+407
View File
@@ -0,0 +1,407 @@
"""
Single test executor.
Executes a single test against an agent and returns a TestResult.
"""
import asyncio
import inspect
import os
import time
import traceback
from typing import Any, Protocol, runtime_checkable
from framework.testing.test_case import Test
from framework.testing.test_result import TestResult, ErrorCategory
from framework.testing.categorizer import ErrorCategorizer
class LLMJudge:
"""
LLM-based judge for semantic evaluation of test results.
Used by tests that need to evaluate semantic properties like
"no hallucination" or "preserves meaning" that can't be checked
with simple assertions.
"""
def __init__(self):
"""Initialize the LLM judge."""
self._client = None
def _get_client(self):
"""Lazy-load the Anthropic client."""
if self._client is None:
try:
import anthropic
self._client = anthropic.Anthropic()
except ImportError:
raise RuntimeError("anthropic package required for LLM judge")
return self._client
def evaluate(
self,
constraint: str,
source_document: str,
summary: str,
criteria: str,
) -> dict[str, Any]:
"""
Evaluate whether a summary meets a constraint.
Args:
constraint: The constraint being tested (e.g., "no-hallucination")
source_document: The original document
summary: The generated summary to evaluate
criteria: Human-readable criteria for evaluation
Returns:
Dict with 'passes' (bool) and 'explanation' (str)
"""
client = self._get_client()
prompt = f"""You are evaluating whether a summary meets a specific constraint.
CONSTRAINT: {constraint}
CRITERIA: {criteria}
SOURCE DOCUMENT:
{source_document}
SUMMARY TO EVALUATE:
{summary}
Evaluate whether the summary meets the constraint. Be strict but fair.
Respond with JSON in this exact format:
{{"passes": true/false, "explanation": "brief explanation of your judgment"}}
Only output the JSON, nothing else."""
try:
response = client.messages.create(
model="claude-haiku-4-5-20251001",
max_tokens=500,
messages=[{"role": "user", "content": prompt}]
)
# Parse the response
import json
text = response.content[0].text.strip()
# Handle potential markdown code blocks
if text.startswith("```"):
text = text.split("```")[1]
if text.startswith("json"):
text = text[4:]
text = text.strip()
result = json.loads(text)
return {
"passes": bool(result.get("passes", False)),
"explanation": result.get("explanation", "No explanation provided")
}
except Exception as e:
# On error, fail the test with explanation
return {
"passes": False,
"explanation": f"LLM judge error: {e}"
}
@runtime_checkable
class AgentProtocol(Protocol):
"""Protocol for agent that can be tested."""
def run(self, input: dict[str, Any]) -> Any:
"""Run the agent with input and return result."""
...
class SyncAgentWrapper:
"""
Wrapper that makes async agent.run() callable synchronously.
This allows tests to call agent.run() without async/await syntax,
which simplifies test code generation and execution.
"""
def __init__(self, agent: Any):
self._agent = agent
self._loop: asyncio.AbstractEventLoop | None = None
def run(self, input_data: dict[str, Any]) -> Any:
"""
Run agent synchronously by wrapping async call.
Args:
input_data: Input data for the agent
Returns:
Output dict from the agent's ExecutionResult
"""
coro = self._agent.run(input_data)
# Guard against being called from inside a running event loop, where
# run_until_complete would deadlock. try/except/else is needed so the
# guard's own RuntimeError is not swallowed by the "no running loop"
# handler.
try:
    asyncio.get_running_loop()
except RuntimeError:
    pass  # No running loop; safe to drive the coroutine ourselves
else:
    # This shouldn't happen in normal test execution
    raise RuntimeError("Cannot run sync wrapper from async context")
# Get or create a reusable event loop; deliberately left open so
# subsequent calls on this wrapper can reuse it
if self._loop is None or self._loop.is_closed():
    self._loop = asyncio.new_event_loop()
    asyncio.set_event_loop(self._loop)
return self._loop.run_until_complete(coro).output
def __getattr__(self, name: str) -> Any:
"""Forward other attribute access to wrapped agent."""
return getattr(self._agent, name)
class TestExecutor:
"""
Execute a single test against an agent.
Handles:
- Test code compilation and execution
- Timing measurement
- Error capture and categorization
- Result creation
"""
def __init__(
self,
categorizer: ErrorCategorizer | None = None,
timeout: float = 60.0,
):
"""
Initialize executor.
Args:
categorizer: ErrorCategorizer for classifying failures
timeout: Maximum test execution time in seconds
"""
self.categorizer = categorizer or ErrorCategorizer()
self.timeout = timeout
def execute(
self,
test: Test,
agent: AgentProtocol,
capture_logs: bool = True,
) -> TestResult:
"""
Execute a test against an agent.
Args:
test: Test to execute
agent: Agent instance to test
capture_logs: Whether to capture runtime logs
Returns:
TestResult with execution details
"""
start_time = time.perf_counter()
try:
# Build test environment
test_globals = self._build_test_globals(agent, test)
# Compile test code
try:
compiled = compile(test.test_code, f"<test:{test.test_name}>", "exec")
except SyntaxError as e:
return self._create_error_result(
test=test,
start_time=start_time,
error_message=f"Test code syntax error: {e}",
stack_trace=traceback.format_exc(),
)
# Execute test
try:
exec(compiled, test_globals)
# Look for test function and call it
test_func = test_globals.get(test.test_name)
if test_func is None:
# Try to find any function starting with test_
for name, obj in test_globals.items():
if name.startswith("test_") and callable(obj):
test_func = obj
break
if test_func is None:
return self._create_error_result(
test=test,
start_time=start_time,
error_message=f"Test function '{test.test_name}' not found in test code",
)
# Call the test function with appropriate arguments
# Inspect the function signature to determine what to pass
sig = inspect.signature(test_func)
params = list(sig.parameters.keys())
# Build arguments based on what the function expects
call_args = []
for param in params:
if param == "agent":
call_args.append(test_globals["agent"])
elif param == "llm_judge":
call_args.append(test_globals["llm_judge"])
elif param in test_globals:
call_args.append(test_globals[param])
else:
# Unknown parameter - this will likely cause an error
# but we let it happen naturally
break
test_func(*call_args)
# Test passed
duration_ms = int((time.perf_counter() - start_time) * 1000)
return TestResult(
test_id=test.id,
passed=True,
duration_ms=duration_ms,
expected_output=test.expected_output,
actual_output={"status": "passed"},
)
except AssertionError as e:
return self._create_failure_result(
test=test,
start_time=start_time,
error_message=str(e) or "Assertion failed",
stack_trace=traceback.format_exc(),
)
except Exception as e:
return self._create_failure_result(
test=test,
start_time=start_time,
error_message=f"{type(e).__name__}: {e}",
stack_trace=traceback.format_exc(),
)
except Exception as e:
return self._create_error_result(
test=test,
start_time=start_time,
error_message=f"Test execution error: {e}",
stack_trace=traceback.format_exc(),
)
def _build_test_globals(
self,
agent: AgentProtocol,
test: Test,
) -> dict[str, Any]:
"""Build the globals dict for test execution."""
# Wrap async agents in a sync wrapper so test code can call agent.run()
# without async/await syntax
wrapped_agent = self._wrap_agent_if_async(agent)
return {
"__builtins__": __builtins__,
"agent": wrapped_agent,
"llm_judge": LLMJudge(), # For semantic evaluation tests
"test_input": test.input,
"expected_output": test.expected_output,
# Common test utilities
"assert": assert_, # Built-in
"isinstance": isinstance,
"len": len,
"str": str,
"int": int,
"float": float,
"list": list,
"dict": dict,
"set": set,
"tuple": tuple,
"any": any,
"all": all,
"print": print, # For debugging
}
def _wrap_agent_if_async(self, agent: AgentProtocol) -> Any:
"""
Wrap agent if its run() method is async.
Args:
agent: Agent to potentially wrap
Returns:
SyncAgentWrapper if agent.run() is async, otherwise the original agent
"""
run_method = getattr(agent, "run", None)
if run_method is None:
return agent
# Check if run() is a coroutine function
if inspect.iscoroutinefunction(run_method):
return SyncAgentWrapper(agent)
return agent
def _create_failure_result(
self,
test: Test,
start_time: float,
error_message: str,
stack_trace: str | None = None,
) -> TestResult:
"""Create a result for a test that failed assertions."""
duration_ms = int((time.perf_counter() - start_time) * 1000)
result = TestResult(
test_id=test.id,
passed=False,
duration_ms=duration_ms,
expected_output=test.expected_output,
error_message=error_message,
stack_trace=stack_trace,
)
# Categorize the error
result.error_category = self.categorizer.categorize(result)
return result
def _create_error_result(
self,
test: Test,
start_time: float,
error_message: str,
stack_trace: str | None = None,
) -> TestResult:
"""Create a result for a test that couldn't run."""
duration_ms = int((time.perf_counter() - start_time) * 1000)
result = TestResult(
test_id=test.id,
passed=False,
duration_ms=duration_ms,
error_message=error_message,
stack_trace=stack_trace,
)
# Implementation error for test setup failures
result.error_category = ErrorCategory.IMPLEMENTATION_ERROR
return result
def assert_(condition: bool, message: str = "") -> None:
"""Assert helper with message."""
if not condition:
raise AssertionError(message)
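# Illustrative sketch: executing a hand-written test against a stub agent.
# The stub agent, IDs, and test code are hypothetical; the Test fields mirror
# exactly those the generators pass, and TestType/ApprovalStatus are imported
# here because this module itself only imports Test.
if __name__ == "__main__":
    from framework.testing.test_case import TestType, ApprovalStatus

    class EchoAgent:
        def run(self, input: dict[str, Any]) -> dict[str, Any]:
            return {"echo": input}

    test = Test(
        id="test_demo0001",
        goal_id="goal_123",
        parent_criteria_id="c1",
        test_type=TestType.CONSTRAINT,
        test_name="test_echo_roundtrip",
        test_code=(
            "def test_echo_roundtrip(agent):\n"
            "    out = agent.run({'q': 'hello'})\n"
            "    assert_(out == {'echo': {'q': 'hello'}}, 'echo mismatch')\n"
        ),
        description="Stub agent echoes its input",
        input={"q": "hello"},
        expected_output={"echo": {"q": "hello"}},
        generated_by="manual",
        llm_confidence=1.0,
        approval_status=ApprovalStatus.PENDING,
    )
    result = TestExecutor().execute(test, EchoAgent())
    print(result.passed, f"{result.duration_ms}ms")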
+344
View File
@@ -0,0 +1,344 @@
"""
Parallel test runner inspired by pytest-xdist.
Features:
- Per-test parallelism: Each test runs independently with load balancing
- Worker initialization: Agent created once per worker thread (not per test)
- Thread-based parallelism: Uses ThreadPoolExecutor for I/O-bound LLM calls
- Fail-fast option: Stop on first failure
"""
import threading
# TimeoutError alias: distinct from the builtin before Python 3.11
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FuturesTimeoutError
from dataclasses import dataclass, field
from multiprocessing import cpu_count
from typing import Any, Callable, Protocol, runtime_checkable
from framework.testing.test_case import Test
from framework.testing.test_result import TestResult, TestSuiteResult
from framework.testing.test_storage import TestStorage
from framework.testing.executor import TestExecutor, AgentProtocol
from framework.testing.categorizer import ErrorCategorizer
# Thread-local storage for worker agents
# Each worker thread gets its own agent instance to avoid race conditions
_thread_local = threading.local()
def _init_worker(agent_factory: Any) -> None:
"""
Initialize worker thread with its own agent instance.
Called once per worker thread when the ThreadPoolExecutor starts.
The agent is stored in thread-local storage and reused for all tests
executed by this worker.
"""
if hasattr(agent_factory, "create"):
_thread_local.agent = agent_factory.create()
else:
_thread_local.agent = agent_factory()
def _run_single_test(test: Test, timeout: float) -> TestResult:
"""
Run a single test using the worker's pre-initialized agent.
Args:
test: Test to execute
timeout: Timeout per test in seconds
Returns:
TestResult with execution details
"""
executor = TestExecutor(
categorizer=ErrorCategorizer(),
timeout=timeout,
)
return executor.execute(test, _thread_local.agent)
@dataclass
class ParallelConfig:
"""Configuration for parallel test execution."""
num_workers: int = field(default_factory=cpu_count)
timeout_per_test: float = 60.0 # seconds
fail_fast: bool = False
mock_external_apis: bool = True
@runtime_checkable
class AgentFactoryProtocol(Protocol):
"""Protocol for creating agent instances."""
def create(self) -> AgentProtocol:
"""Create a new agent instance."""
...
class AgentFactory:
"""Picklable factory that creates AgentRunner instances from a path.
This class is used instead of a lambda for parallel test execution,
since lambdas capturing local variables cannot be pickled by ProcessPoolExecutor.
"""
def __init__(self, agent_path: str):
self.agent_path = agent_path
def create(self):
from framework.runner import AgentRunner
return AgentRunner.load(self.agent_path)
class ParallelTestRunner:
"""
Parallel test execution using ThreadPoolExecutor.
Key features:
- Per-test distribution: Tests distributed individually for load balancing
- Worker initialization: Each worker thread creates one agent at startup
- Thread-based parallelism: Uses threads (not processes) for I/O-bound LLM calls
- Thread-local storage: Each worker has isolated agent state via threading.local()
"""
def __init__(
self,
config: ParallelConfig | None = None,
storage: TestStorage | None = None,
):
"""
Initialize parallel runner.
Args:
config: Parallel execution configuration
storage: TestStorage for saving results
"""
self.config = config or ParallelConfig()
self.storage = storage
self.categorizer = ErrorCategorizer()
def run_all(
self,
goal_id: str,
agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol],
tests: list[Test] | None = None,
on_result: Callable[[TestResult], None] | None = None,
) -> TestSuiteResult:
"""
Run all approved tests for a goal.
Args:
goal_id: Goal ID to run tests for
agent_factory: Factory for creating agent instances
tests: Optional list of tests (loads from storage if not provided)
on_result: Optional callback for each test result
Returns:
TestSuiteResult with summary and individual results
"""
# Load tests if not provided
if tests is None:
if self.storage is None:
raise ValueError("Either tests or storage must be provided")
tests = self.storage.get_approved_tests(goal_id)
if not tests:
return TestSuiteResult(
goal_id=goal_id,
total=0,
passed=0,
failed=0,
)
# Execute tests
results: list[TestResult] = []
if self.config.num_workers <= 1:
# Sequential execution - create single agent and run all tests
results = self._run_sequential(tests, agent_factory, on_result)
else:
# Parallel execution with per-test distribution
results = self._run_parallel(tests, agent_factory, on_result)
# Save results if storage available
if self.storage:
# Create test_id -> test mapping for lookup
test_map = {t.id: t for t in tests}
for result in results:
# Update the Test object with execution result
if result.test_id in test_map:
test = test_map[result.test_id]
test.record_result(result.passed)
self.storage.update_test(test)
# Save the TestResult
self.storage.save_result(result.test_id, result)
# Create suite result
return self._create_suite_result(goal_id, results)
def run_tests(
self,
tests: list[Test],
agent: AgentProtocol,
on_result: Callable[[TestResult], None] | None = None,
) -> list[TestResult]:
"""
Run a list of tests against an agent instance.
Args:
tests: Tests to run
agent: Agent instance to test
on_result: Optional callback for each result
Returns:
List of TestResult
"""
executor = TestExecutor(
categorizer=self.categorizer,
timeout=self.config.timeout_per_test,
)
results = []
for test in tests:
result = executor.execute(test, agent)
results.append(result)
if on_result:
on_result(result)
# Fail-fast check
if self.config.fail_fast and not result.passed:
break
return results
def _run_sequential(
self,
tests: list[Test],
agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol],
on_result: Callable[[TestResult], None] | None = None,
) -> list[TestResult]:
"""Run tests sequentially with a single agent instance."""
results = []
executor = TestExecutor(
categorizer=self.categorizer,
timeout=self.config.timeout_per_test,
)
# Create single agent for all tests
if isinstance(agent_factory, AgentFactoryProtocol):
agent = agent_factory.create()
else:
agent = agent_factory()
# Run all tests
for test in tests:
result = executor.execute(test, agent)
results.append(result)
if on_result:
on_result(result)
# Fail-fast
if self.config.fail_fast and not result.passed:
return results
return results
def _run_parallel(
self,
tests: list[Test],
agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol],
on_result: Callable[[TestResult], None] | None = None,
) -> list[TestResult]:
"""
Run tests in parallel using ThreadPoolExecutor with worker initialization.
Each worker thread creates ONE agent instance at startup and reuses it
for all tests assigned to that worker. Tests are distributed individually
for true load-balanced parallelism.
Uses threads instead of processes because LLM API calls are I/O-bound,
and threads have lower overhead (no pickling, shared memory).
"""
results = []
failed = False
with ThreadPoolExecutor(
max_workers=self.config.num_workers,
initializer=_init_worker,
initargs=(agent_factory,),
) as executor:
# Submit each test individually for true parallelism
futures = {
executor.submit(_run_single_test, test, self.config.timeout_per_test): test
for test in tests
}
for future in as_completed(futures):
test = futures[future]
try:
result = future.result(timeout=self.config.timeout_per_test + 30)
results.append(result)
if on_result:
on_result(result)
if not result.passed:
failed = True
except FuturesTimeoutError:
result = TestResult(
test_id=test.id,
passed=False,
duration_ms=int(self.config.timeout_per_test * 1000),
error_message="Test timed out",
)
results.append(result)
if on_result:
on_result(result)
failed = True
except Exception as e:
result = TestResult(
test_id=test.id,
passed=False,
duration_ms=0,
error_message=f"Execution error: {e}",
)
results.append(result)
if on_result:
on_result(result)
failed = True
# Fail-fast
if self.config.fail_fast and failed:
executor.shutdown(wait=False, cancel_futures=True)
break
return results
def _create_suite_result(
self,
goal_id: str,
results: list[TestResult],
) -> TestSuiteResult:
"""Create TestSuiteResult from individual results."""
passed = sum(1 for r in results if r.passed)
failed = len(results) - passed
total_duration = sum(r.duration_ms for r in results)
return TestSuiteResult(
goal_id=goal_id,
total=len(results),
passed=passed,
failed=failed,
results=results,
duration_ms=total_duration,
)
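# Illustrative sketch: running an approved suite with four worker threads and
# a per-result progress callback. The agent export path, storage path, and
# goal id are hypothetical.
if __name__ == "__main__":
    from pathlib import Path
    runner = ParallelTestRunner(
        config=ParallelConfig(num_workers=4, fail_fast=True),
        storage=TestStorage(Path("data/tests/goal_123")),
    )
    suite = runner.run_all(
        goal_id="goal_123",
        agent_factory=AgentFactory("exports/my_agent"),
        on_result=lambda r: print(("PASS" if r.passed else "FAIL"), r.test_id),
    )
    print(f"{suite.passed}/{suite.total} passed in {suite.duration_ms}ms")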
+112
View File
@@ -0,0 +1,112 @@
"""
LLM prompt templates for test generation.
These prompts instruct the LLM to generate pytest-compatible tests
from Goal success_criteria and constraints using tool calling.
"""
CONSTRAINT_TEST_PROMPT = """You are generating test cases for an AI agent's constraints.
## Goal
Name: {goal_name}
Description: {goal_description}
## Constraints to Test
{constraints_formatted}
## Instructions
For each constraint, generate pytest-compatible tests that verify the constraint is satisfied.
For EACH test, call the `submit_test` tool with:
- constraint_id: The ID of the constraint being tested
- test_name: A descriptive pytest function name (test_constraint_<constraint_id>_<scenario>)
- test_code: Complete Python test function code
- description: What the test validates
- input: Test input data as an object
- expected_output: Expected output as an object
- confidence: 0-1 score based on how testable/well-defined the constraint is
Consider for each constraint:
- Happy path: Normal execution that should satisfy the constraint
- Boundary conditions: Inputs at the edge of constraint boundaries
- Violation scenarios: Inputs that should trigger constraint violation
The test code should:
- Be valid Python using pytest conventions
- Use `agent.run(input)` to execute the agent
- Include descriptive assertion messages
- Handle potential exceptions appropriately
Generate tests now by calling submit_test for each test."""
SUCCESS_CRITERIA_TEST_PROMPT = """You are generating success criteria tests for an AI agent.
## Goal
Name: {goal_name}
Description: {goal_description}
## Success Criteria
{success_criteria_formatted}
## Agent Flow (for context)
Nodes: {node_names}
Tools: {tool_names}
## Instructions
For each success criterion, generate tests that verify the agent achieves its goals.
For EACH test, call the `submit_test` tool with:
- criteria_id: The ID of the success criterion being tested
- test_name: A descriptive pytest function name (test_<criteria_id>_<scenario>)
- test_code: Complete Python test function code
- description: What the test validates
- input: Test input data as an object
- expected_output: Expected output as an object
- confidence: 0-1 score based on how measurable/specific the criterion is
Consider for each criterion:
- Happy path: Normal successful execution
- Boundary conditions: Exactly at target thresholds (if applicable)
- Graceful handling: Near-misses and edge cases
The test code should:
- Be valid Python using pytest conventions
- Use `agent.run(input)` to execute the agent
- Validate the metric defined in the success criterion
- Include descriptive assertion messages
Generate tests now by calling submit_test for each test."""
EDGE_CASE_TEST_PROMPT = """You are generating edge case tests for an AI agent.
## Goal
Name: {goal_name}
Description: {goal_description}
## Existing Tests
{existing_tests_summary}
## Recent Failures (if any)
{failures_summary}
## Instructions
Generate additional edge case tests that cover scenarios not addressed by existing tests.
Focus on:
1. Unusual input formats or values
2. Empty or null inputs
3. Extremely large or small values
4. Unicode and special characters
5. Concurrent or timing-related scenarios
6. Network/API failure simulations (if applicable)
For EACH test, call the `submit_test` tool with:
- criteria_id: An identifier for the edge case category being tested
- test_name: A descriptive pytest function name (test_edge_case_<scenario>)
- test_code: Complete Python test function code
- description: What the test validates
- input: Test input data as an object
- expected_output: Expected output as an object
- confidence: 0-1 score
Generate edge case tests now by calling submit_test for each test."""
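# Illustrative sketch: rendering the constraint prompt the way the generators
# do. The goal fields and constraint block below are hypothetical placeholders.
if __name__ == "__main__":
    rendered = CONSTRAINT_TEST_PROMPT.format(
        goal_name="Summarize support tickets",
        goal_description="Produce faithful one-paragraph ticket summaries.",
        constraints_formatted=(
            "### Constraint: no-hallucination\n"
            "- Type: HARD (hard)\n"
            "- Category: quality\n"
            "- Description: Summaries must not invent facts\n"
            "- Check: llm_judge"
        ),
    )
    print(rendered.splitlines()[0])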
+219
View File
@@ -0,0 +1,219 @@
"""
Success criteria test generator.
Generates tests for Goal success_criteria using LLM.
Tests are returned with PENDING approval status.
"""
import uuid
from typing import TYPE_CHECKING
from framework.graph.goal import Goal, SuccessCriterion
from framework.testing.test_case import Test, TestType, ApprovalStatus
from framework.testing.prompts import SUCCESS_CRITERIA_TEST_PROMPT
from framework.llm.provider import Tool, ToolUse, ToolResult
if TYPE_CHECKING:
from framework.llm.provider import LLMProvider
# Tool for collecting generated tests - Claude handles JSON escaping automatically
SUBMIT_TEST_TOOL = Tool(
name="submit_test",
description="Submit a generated success criteria test. Call once per test.",
parameters={
"properties": {
"criteria_id": {
"type": "string",
"description": "ID of the success criterion being tested",
},
"test_name": {
"type": "string",
"description": "pytest function name, e.g., test_find_videos_happy_path",
},
"test_code": {
"type": "string",
"description": "Complete Python test function code",
},
"description": {
"type": "string",
"description": "What the test validates",
},
"input": {
"type": "object",
"description": "Test input data",
},
"expected_output": {
"type": "object",
"description": "Expected output",
},
"confidence": {
"type": "number",
"description": "Confidence score 0-1",
},
},
"required": ["criteria_id", "test_name", "test_code", "description", "confidence"],
},
)
class SuccessCriteriaTestGenerator:
"""
Generate success criteria tests from Goal success_criteria.
Generated tests require user approval before being added to the test suite.
Unlike constraint tests, success criteria tests are generated during the
Eval stage (after the agent exists) and may reference agent nodes/tools.
"""
def __init__(self, llm: "LLMProvider"):
"""
Initialize generator with LLM provider.
Args:
llm: LLM provider for test generation (e.g., AnthropicProvider)
"""
self.llm = llm
def generate(
self,
goal: Goal,
node_names: list[str] | None = None,
tool_names: list[str] | None = None,
) -> list[Test]:
"""
Generate tests for all success criteria in a goal.
Args:
goal: Goal with success_criteria to test
node_names: Names of agent nodes (for context)
tool_names: Names of tools available to agent (for context)
Returns:
List of Test objects with approval_status=PENDING.
These MUST be approved before being added to the test suite.
"""
if not goal.success_criteria:
return []
# Format prompt
prompt = SUCCESS_CRITERIA_TEST_PROMPT.format(
goal_name=goal.name,
goal_description=goal.description,
success_criteria_formatted=self._format_criteria(goal.success_criteria),
node_names=", ".join(node_names or ["(not specified)"]),
tool_names=", ".join(tool_names or ["(not specified)"]),
)
# Collect tests via tool calls - Claude handles JSON escaping automatically
collected_tests: list[dict] = []
def tool_executor(tool_use: ToolUse) -> ToolResult:
if tool_use.name == "submit_test":
collected_tests.append(tool_use.input)
return ToolResult(
tool_use_id=tool_use.id, content="Test recorded successfully"
)
return ToolResult(
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
)
self.llm.complete_with_tools(
messages=[{"role": "user", "content": prompt}],
system="You are a test generation expert. For each success criterion, call the submit_test tool with the test details.",
tools=[SUBMIT_TEST_TOOL],
tool_executor=tool_executor,
max_iterations=20,
)
return self._create_tests_from_collected(collected_tests, goal.id)
def generate_for_criterion(
self,
goal: Goal,
criterion: SuccessCriterion,
node_names: list[str] | None = None,
tool_names: list[str] | None = None,
) -> list[Test]:
"""
Generate tests for a single success criterion.
Args:
goal: Goal containing the criterion
criterion: Specific criterion to test
node_names: Names of agent nodes
tool_names: Names of tools available
Returns:
List of Test objects for the criterion
"""
prompt = SUCCESS_CRITERIA_TEST_PROMPT.format(
goal_name=goal.name,
goal_description=goal.description,
success_criteria_formatted=self._format_criterion(criterion),
node_names=", ".join(node_names or ["(not specified)"]),
tool_names=", ".join(tool_names or ["(not specified)"]),
)
# Collect tests via tool calls
collected_tests: list[dict] = []
def tool_executor(tool_use: ToolUse) -> ToolResult:
if tool_use.name == "submit_test":
collected_tests.append(tool_use.input)
return ToolResult(
tool_use_id=tool_use.id, content="Test recorded successfully"
)
return ToolResult(
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
)
self.llm.complete_with_tools(
messages=[{"role": "user", "content": prompt}],
system="You are a test generation expert. Call the submit_test tool with the test details.",
tools=[SUBMIT_TEST_TOOL],
tool_executor=tool_executor,
max_iterations=10,
)
return self._create_tests_from_collected(collected_tests, goal.id)
def _format_criteria(self, criteria: list[SuccessCriterion]) -> str:
"""Format success criteria for prompt."""
lines = []
for c in criteria:
lines.append(self._format_criterion(c))
lines.append("")
return "\n".join(lines)
def _format_criterion(self, criterion: SuccessCriterion) -> str:
"""Format a single criterion for prompt."""
return f"""### Success Criterion: {criterion.id}
- Description: {criterion.description}
- Metric: {criterion.metric}
- Target: {criterion.target}
- Weight: {criterion.weight}
- Currently met: {criterion.met}"""
def _create_tests_from_collected(
self, collected: list[dict], goal_id: str
) -> list[Test]:
"""Create Test objects from tool call data."""
tests = []
for td in collected:
test = Test(
id=f"test_{uuid.uuid4().hex[:8]}",
goal_id=goal_id,
parent_criteria_id=td.get("criteria_id", "unknown"),
test_type=TestType.SUCCESS_CRITERIA,
test_name=td.get("test_name", "unnamed_test"),
test_code=td.get("test_code", ""),
description=td.get("description", ""),
input=td.get("input", {}),
expected_output=td.get("expected_output", {}),
generated_by="llm",
llm_confidence=float(td.get("confidence", 0.5)),
approval_status=ApprovalStatus.PENDING,
)
tests.append(test)
return tests
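A hedged usage sketch for the generator above. The Goal and SuccessCriterion constructors are inferred from the fields this file reads, and the AnthropicProvider construction is an assumption, not confirmed by this diff.
from framework.llm.provider import AnthropicProvider  # named in the docstring above
from framework.graph.goal import Goal, SuccessCriterion
from framework.testing.test_case import ApprovalStatus
goal = Goal(  # field names inferred from usage in this file; other required fields may exist
    id="goal_demo",
    name="find_videos",
    description="Locate relevant videos for a topic",
    success_criteria=[
        SuccessCriterion(
            id="sc_1",
            description="Returns at least 5 relevant videos",
            metric="video_count",
            target=">= 5",
            weight=1.0,
            met=False,
        )
    ],
)
generator = SuccessCriteriaTestGenerator(AnthropicProvider())  # constructor args assumed
tests = generator.generate(goal, node_names=["search", "rank"], tool_names=["youtube_api"])
# Every generated test starts PENDING and must be approved before use.
assert all(t.approval_status == ApprovalStatus.PENDING for t in tests)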
+150
View File
@@ -0,0 +1,150 @@
"""
Test case schema with approval tracking.
Tests are generated by LLM from Goal success_criteria and constraints,
but require mandatory user approval before being stored.
"""
from datetime import datetime
from enum import Enum
from typing import Any
from pydantic import BaseModel, Field
class ApprovalStatus(str, Enum):
"""Status of user approval for a generated test."""
PENDING = "pending" # Awaiting user review
APPROVED = "approved" # User accepted as-is
MODIFIED = "modified" # User edited before accepting
REJECTED = "rejected" # User declined (with reason)
class TestType(str, Enum):
"""Type of test based on what it validates."""
CONSTRAINT = "constraint" # Validates constraint boundaries
SUCCESS_CRITERIA = "outcome" # Validates success criteria achievement
EDGE_CASE = "edge_case" # Validates edge case handling
class Test(BaseModel):
"""
A test case generated from Goal success_criteria or constraints.
Tests are either:
- Generated by LLM during Goal stage (constraints) or Eval stage (success criteria)
- Created manually by human engineers
All tests require approval before being added to the test suite.
"""
id: str
goal_id: str
parent_criteria_id: str = Field(
description="Links to success_criteria.id or constraint.id"
)
test_type: TestType
# Test definition
test_name: str = Field(
description="Descriptive function name, e.g., test_constraint_api_limits_respected"
)
test_code: str = Field(
description="Python test function code (pytest compatible)"
)
description: str = Field(
description="Human-readable description of what the test validates"
)
input: dict[str, Any] = Field(
default_factory=dict,
description="Test input data"
)
expected_output: dict[str, Any] = Field(
default_factory=dict,
description="Expected output or assertions"
)
# LLM generation metadata
generated_by: str = Field(
default="llm",
description="Who created the test: 'llm' or 'human'"
)
llm_confidence: float = Field(
default=0.0,
ge=0.0,
le=1.0,
description="LLM's confidence in the test quality (0-1)"
)
# Approval tracking (CRITICAL - tests are never used without approval)
approval_status: ApprovalStatus = ApprovalStatus.PENDING
approved_by: str | None = None
approved_at: datetime | None = None
rejection_reason: str | None = Field(
default=None,
description="Reason for rejection if status is REJECTED"
)
original_code: str | None = Field(
default=None,
description="Original LLM-generated code if user modified it"
)
# Execution tracking
last_run: datetime | None = None
last_result: str | None = Field(
default=None,
description="Result of last run: 'passed', 'failed', 'error'"
)
run_count: int = 0
pass_count: int = 0
fail_count: int = 0
# Timestamps
created_at: datetime = Field(default_factory=datetime.now)
updated_at: datetime = Field(default_factory=datetime.now)
model_config = {"extra": "allow"}
def approve(self, approved_by: str = "user") -> None:
"""Mark test as approved."""
self.approval_status = ApprovalStatus.APPROVED
self.approved_by = approved_by
self.approved_at = datetime.now()
self.updated_at = datetime.now()
def modify(self, new_code: str, approved_by: str = "user") -> None:
"""Approve test with modifications."""
self.original_code = self.test_code
self.test_code = new_code
self.approval_status = ApprovalStatus.MODIFIED
self.approved_by = approved_by
self.approved_at = datetime.now()
self.updated_at = datetime.now()
def reject(self, reason: str) -> None:
"""Reject the test with a reason."""
self.approval_status = ApprovalStatus.REJECTED
self.rejection_reason = reason
self.updated_at = datetime.now()
def record_result(self, passed: bool) -> None:
"""Record a test run result."""
self.last_run = datetime.now()
self.last_result = "passed" if passed else "failed"
self.run_count += 1
if passed:
self.pass_count += 1
else:
self.fail_count += 1
self.updated_at = datetime.now()
@property
def is_approved(self) -> bool:
"""Check if test has been approved (approved or modified)."""
return self.approval_status in (ApprovalStatus.APPROVED, ApprovalStatus.MODIFIED)
@property
def pass_rate(self) -> float | None:
"""Calculate pass rate if test has been run."""
if self.run_count == 0:
return None
return self.pass_count / self.run_count
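A short lifecycle sketch using only the fields and methods defined in this file:
test = Test(
    id="test_demo",
    goal_id="goal_demo",
    parent_criteria_id="criterion_1",
    test_type=TestType.SUCCESS_CRITERIA,
    test_name="test_demo_happy_path",
    test_code="def test_demo_happy_path(agent): assert agent.run({})",
    description="Demonstrates the approval lifecycle",
)
assert not test.is_approved                 # new tests start PENDING
test.modify("def test_demo_happy_path(agent): assert True", "reviewer")
assert test.is_approved                     # MODIFIED counts as approved
assert test.original_code is not None       # the original LLM code is preserved
test.record_result(passed=True)
test.record_result(passed=False)
assert test.pass_rate == 0.5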
+153
View File
@@ -0,0 +1,153 @@
"""
Test result schemas for tracking test execution outcomes.
Results include detailed error information for debugging and
categorization for guiding iteration strategy.
"""
from datetime import datetime
from enum import Enum
from typing import Any
from pydantic import BaseModel, Field
class ErrorCategory(str, Enum):
"""
Category of test failure for guiding iteration.
Each category has different implications for how to fix:
- LOGIC_ERROR: Goal definition is wrong -> update success_criteria/constraints
- IMPLEMENTATION_ERROR: Code bug -> fix nodes/edges in Agent stage
- EDGE_CASE: New scenario discovered -> add new test only
"""
LOGIC_ERROR = "logic_error"
IMPLEMENTATION_ERROR = "implementation_error"
EDGE_CASE = "edge_case"
class TestResult(BaseModel):
"""
Result of a single test execution.
Captures:
- Pass/fail status with timing
- Actual vs expected output
- Error details for debugging
- Runtime logs and execution path
"""
test_id: str
passed: bool
duration_ms: int = Field(
ge=0,
description="Test execution time in milliseconds"
)
# Output comparison
actual_output: Any = None
expected_output: Any = None
# Error details (populated on failure)
error_message: str | None = None
error_category: ErrorCategory | None = None
stack_trace: str | None = None
# Runtime data for debugging
runtime_logs: list[dict[str, Any]] = Field(
default_factory=list,
description="Log entries from test execution"
)
node_outputs: dict[str, Any] = Field(
default_factory=dict,
description="Output from each node executed during test"
)
execution_path: list[str] = Field(
default_factory=list,
description="Sequence of nodes executed"
)
# Associated run ID (links to Runtime data)
run_id: str | None = Field(
default=None,
description="Runtime run ID for detailed analysis"
)
timestamp: datetime = Field(default_factory=datetime.now)
model_config = {"extra": "allow"}
def summary_dict(self) -> dict[str, Any]:
"""Return a summary dict for quick overview."""
return {
"test_id": self.test_id,
"passed": self.passed,
"duration_ms": self.duration_ms,
"error_category": self.error_category.value if self.error_category else None,
"error_message": self.error_message[:100] if self.error_message else None,
}
class TestSuiteResult(BaseModel):
"""
Aggregate result from running a test suite.
Provides summary statistics and individual results.
"""
goal_id: str
total: int
passed: int
failed: int
errors: int = 0 # Tests that couldn't run (e.g., exceptions in setup)
skipped: int = 0
results: list[TestResult] = Field(default_factory=list)
duration_ms: int = Field(
default=0,
description="Total execution time in milliseconds"
)
timestamp: datetime = Field(default_factory=datetime.now)
model_config = {"extra": "allow"}
@property
def all_passed(self) -> bool:
"""Check if all tests passed."""
return self.failed == 0 and self.errors == 0
@property
def pass_rate(self) -> float:
"""Calculate pass rate."""
if self.total == 0:
return 0.0
return self.passed / self.total
def summary_dict(self) -> dict[str, Any]:
"""Return summary for reporting."""
return {
"goal_id": self.goal_id,
"overall_passed": self.all_passed,
"summary": {
"total": self.total,
"passed": self.passed,
"failed": self.failed,
"errors": self.errors,
"skipped": self.skipped,
},
"pass_rate": f"{self.pass_rate:.1%}",
"duration_ms": self.duration_ms,
}
def get_failed_results(self) -> list[TestResult]:
"""Get all failed test results for debugging."""
return [r for r in self.results if not r.passed]
def get_results_by_category(
self, category: ErrorCategory
) -> list[TestResult]:
"""Get failed results by error category."""
return [
r for r in self.results
if not r.passed and r.error_category == category
]
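A small sketch of the result schemas in use; the failure text is illustrative:
results = [
    TestResult(test_id="t1", passed=True, duration_ms=120),
    TestResult(
        test_id="t2",
        passed=False,
        duration_ms=80,
        error_message="KeyError: 'videos'",             # illustrative failure
        error_category=ErrorCategory.IMPLEMENTATION_ERROR,
    ),
]
suite = TestSuiteResult(
    goal_id="goal_demo", total=2, passed=1, failed=1,
    results=results, duration_ms=200,
)
assert not suite.all_passed
assert suite.pass_rate == 0.5
assert len(suite.get_results_by_category(ErrorCategory.IMPLEMENTATION_ERROR)) == 1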
+260
View File
@@ -0,0 +1,260 @@
"""
File-based storage backend for test data.
Follows the same pattern as framework/storage/backend.py (FileStorage),
storing tests as JSON files with indexes for efficient querying.
"""
import json
from pathlib import Path
from datetime import datetime
from framework.testing.test_case import Test, ApprovalStatus, TestType
from framework.testing.test_result import TestResult
class TestStorage:
"""
File-based storage for tests and results.
Directory structure:
{base_path}/
tests/
{goal_id}/
{test_id}.json # Full test data
indexes/
by_goal/{goal_id}.json # List of test IDs for this goal
by_approval/{status}.json # Tests by approval status
by_type/{test_type}.json # Tests by type
by_criteria/{criteria_id}.json # Tests by parent criteria
results/
{test_id}/
{timestamp}.json # Test run results
latest.json # Most recent result
suites/
{goal_id}_suite.json # Test suite metadata
"""
def __init__(self, base_path: str | Path):
self.base_path = Path(base_path)
self._ensure_dirs()
def _ensure_dirs(self) -> None:
"""Create directory structure if it doesn't exist."""
dirs = [
self.base_path / "tests",
self.base_path / "indexes" / "by_goal",
self.base_path / "indexes" / "by_approval",
self.base_path / "indexes" / "by_type",
self.base_path / "indexes" / "by_criteria",
self.base_path / "results",
self.base_path / "suites",
]
for d in dirs:
d.mkdir(parents=True, exist_ok=True)
# === TEST OPERATIONS ===
def save_test(self, test: Test) -> None:
"""Save a test to storage."""
# Ensure goal directory exists
goal_dir = self.base_path / "tests" / test.goal_id
goal_dir.mkdir(parents=True, exist_ok=True)
# Save full test
test_path = goal_dir / f"{test.id}.json"
with open(test_path, "w") as f:
f.write(test.model_dump_json(indent=2))
# Update indexes
self._add_to_index("by_goal", test.goal_id, test.id)
self._add_to_index("by_approval", test.approval_status.value, test.id)
self._add_to_index("by_type", test.test_type.value, test.id)
self._add_to_index("by_criteria", test.parent_criteria_id, test.id)
def load_test(self, goal_id: str, test_id: str) -> Test | None:
"""Load a test from storage."""
test_path = self.base_path / "tests" / goal_id / f"{test_id}.json"
if not test_path.exists():
return None
with open(test_path) as f:
return Test.model_validate_json(f.read())
def delete_test(self, goal_id: str, test_id: str) -> bool:
"""Delete a test from storage."""
test_path = self.base_path / "tests" / goal_id / f"{test_id}.json"
if not test_path.exists():
return False
# Load test to get index keys
test = self.load_test(goal_id, test_id)
if test:
self._remove_from_index("by_goal", test.goal_id, test_id)
self._remove_from_index("by_approval", test.approval_status.value, test_id)
self._remove_from_index("by_type", test.test_type.value, test_id)
self._remove_from_index("by_criteria", test.parent_criteria_id, test_id)
test_path.unlink()
# Also delete results
results_dir = self.base_path / "results" / test_id
if results_dir.exists():
for f in results_dir.iterdir():
f.unlink()
results_dir.rmdir()
return True
def update_test(self, test: Test) -> None:
"""
Update an existing test.
Handles index updates if approval_status changed.
"""
# Load old test to check for index changes
old_test = self.load_test(test.goal_id, test.id)
if old_test and old_test.approval_status != test.approval_status:
self._remove_from_index("by_approval", old_test.approval_status.value, test.id)
self._add_to_index("by_approval", test.approval_status.value, test.id)
# Update timestamp
test.updated_at = datetime.now()
# Save
self.save_test(test)
# === QUERY OPERATIONS ===
def get_tests_by_goal(self, goal_id: str) -> list[Test]:
"""Get all tests for a goal."""
test_ids = self._get_index("by_goal", goal_id)
tests = []
for test_id in test_ids:
test = self.load_test(goal_id, test_id)
if test:
tests.append(test)
return tests
def get_tests_by_approval_status(self, status: ApprovalStatus) -> list[str]:
"""Get test IDs by approval status."""
return self._get_index("by_approval", status.value)
def get_tests_by_type(self, test_type: TestType) -> list[str]:
"""Get test IDs by test type."""
return self._get_index("by_type", test_type.value)
def get_tests_by_criteria(self, criteria_id: str) -> list[str]:
"""Get test IDs for a specific criteria."""
return self._get_index("by_criteria", criteria_id)
def get_pending_tests(self, goal_id: str) -> list[Test]:
"""Get all pending tests for a goal."""
tests = self.get_tests_by_goal(goal_id)
return [t for t in tests if t.approval_status == ApprovalStatus.PENDING]
def get_approved_tests(self, goal_id: str) -> list[Test]:
"""Get all approved tests for a goal (approved or modified)."""
tests = self.get_tests_by_goal(goal_id)
return [t for t in tests if t.is_approved]
def list_all_goals(self) -> list[str]:
"""List all goal IDs that have tests."""
goals_dir = self.base_path / "indexes" / "by_goal"
return [f.stem for f in goals_dir.glob("*.json")]
# === RESULT OPERATIONS ===
def save_result(self, test_id: str, result: TestResult) -> None:
"""Save a test result."""
results_dir = self.base_path / "results" / test_id
results_dir.mkdir(parents=True, exist_ok=True)
# Save with timestamp
timestamp = result.timestamp.strftime("%Y%m%d_%H%M%S")
result_path = results_dir / f"{timestamp}.json"
with open(result_path, "w") as f:
f.write(result.model_dump_json(indent=2))
# Update latest
latest_path = results_dir / "latest.json"
with open(latest_path, "w") as f:
f.write(result.model_dump_json(indent=2))
def get_latest_result(self, test_id: str) -> TestResult | None:
"""Get the most recent result for a test."""
latest_path = self.base_path / "results" / test_id / "latest.json"
if not latest_path.exists():
return None
with open(latest_path) as f:
return TestResult.model_validate_json(f.read())
def get_result_history(self, test_id: str, limit: int = 10) -> list[TestResult]:
"""Get result history for a test, most recent first."""
results_dir = self.base_path / "results" / test_id
if not results_dir.exists():
return []
# Get all result files except latest.json
result_files = sorted(
[f for f in results_dir.glob("*.json") if f.name != "latest.json"],
reverse=True
)[:limit]
results = []
for f in result_files:
with open(f) as file:
results.append(TestResult.model_validate_json(file.read()))
return results
# === INDEX OPERATIONS ===
def _get_index(self, index_type: str, key: str) -> list[str]:
"""Get values from an index."""
index_path = self.base_path / "indexes" / index_type / f"{key}.json"
if not index_path.exists():
return []
with open(index_path) as f:
return json.load(f)
def _add_to_index(self, index_type: str, key: str, value: str) -> None:
"""Add a value to an index."""
index_path = self.base_path / "indexes" / index_type / f"{key}.json"
values = self._get_index(index_type, key)
if value not in values:
values.append(value)
with open(index_path, "w") as f:
json.dump(values, f)
def _remove_from_index(self, index_type: str, key: str, value: str) -> None:
"""Remove a value from an index."""
index_path = self.base_path / "indexes" / index_type / f"{key}.json"
values = self._get_index(index_type, key)
if value in values:
values.remove(value)
with open(index_path, "w") as f:
json.dump(values, f)
# === UTILITY ===
def get_stats(self) -> dict:
"""Get storage statistics."""
goals = self.list_all_goals()
total_tests = sum(len(self._get_index("by_goal", g)) for g in goals)
pending = len(self._get_index("by_approval", "pending"))
approved = len(self._get_index("by_approval", "approved"))
modified = len(self._get_index("by_approval", "modified"))
rejected = len(self._get_index("by_approval", "rejected"))
return {
"total_goals": len(goals),
"total_tests": total_tests,
"by_approval": {
"pending": pending,
"approved": approved,
"modified": modified,
"rejected": rejected,
},
"storage_path": str(self.base_path),
}
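A usage sketch for the storage backend. The base path is an arbitrary writable directory, and `test` / `result` stand for any Test and TestResult instances such as those sketched above:
storage = TestStorage("core/data/tests")      # illustrative base path
storage.save_test(test)                       # writes JSON and updates all four indexes
for pending in storage.get_pending_tests("goal_demo"):
    pending.approve("reviewer")
    storage.update_test(pending)              # moves it within the by_approval index
approved = storage.get_approved_tests("goal_demo")
storage.save_result(approved[0].id, result)   # timestamped file plus latest.json
print(storage.get_stats())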
+612
View File
@@ -0,0 +1,612 @@
"""
Unit tests for the goal-based testing framework.
Tests cover:
- Schema validation
- Storage CRUD operations
- Error categorization heuristics
- Parallel runner grouping logic
"""
import pytest
from framework.testing.test_case import (
Test,
TestType,
ApprovalStatus,
)
from framework.testing.test_result import (
TestResult,
TestSuiteResult,
ErrorCategory,
)
from framework.testing.test_storage import TestStorage
from framework.testing.categorizer import ErrorCategorizer
from framework.testing.parallel import ParallelTestRunner, ParallelConfig
from framework.testing.debug_tool import DebugTool
# ============================================================================
# Test Schema Tests
# ============================================================================
class TestTestCaseSchema:
"""Tests for Test schema."""
def test_create_test(self):
"""Test creating a basic test."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_api_limits",
test_type=TestType.CONSTRAINT,
test_name="test_constraint_api_limits",
test_code="def test_constraint_api_limits(agent): pass",
description="Tests API rate limits",
input={"topic": "test"},
expected_output={"count": 5},
)
assert test.id == "test_001"
assert test.goal_id == "goal_001"
assert test.test_type == TestType.CONSTRAINT
assert test.approval_status == ApprovalStatus.PENDING
assert not test.is_approved
def test_approve_test(self):
"""Test approving a test."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="pass",
description="test",
)
test.approve("test_user")
assert test.approval_status == ApprovalStatus.APPROVED
assert test.approved_by == "test_user"
assert test.approved_at is not None
assert test.is_approved
def test_modify_test(self):
"""Test modifying a test before approval."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="original code",
description="test",
)
test.modify("modified code", "test_user")
assert test.approval_status == ApprovalStatus.MODIFIED
assert test.original_code == "original code"
assert test.test_code == "modified code"
assert test.is_approved
def test_reject_test(self):
"""Test rejecting a test."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="pass",
description="test",
)
test.reject("Not a valid test case")
assert test.approval_status == ApprovalStatus.REJECTED
assert test.rejection_reason == "Not a valid test case"
assert not test.is_approved
def test_record_result(self):
"""Test recording test results."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="pass",
description="test",
)
test.record_result(passed=True)
assert test.last_result == "passed"
assert test.run_count == 1
assert test.pass_count == 1
assert test.pass_rate == 1.0
test.record_result(passed=False)
assert test.last_result == "failed"
assert test.run_count == 2
assert test.pass_count == 1
assert test.fail_count == 1
assert test.pass_rate == 0.5
class TestTestResultSchema:
"""Tests for TestResult schema."""
def test_create_passed_result(self):
"""Test creating a passed result."""
result = TestResult(
test_id="test_001",
passed=True,
duration_ms=100,
actual_output={"status": "ok"},
expected_output={"status": "ok"},
)
assert result.passed
assert result.duration_ms == 100
assert result.error_category is None
def test_create_failed_result(self):
"""Test creating a failed result."""
result = TestResult(
test_id="test_001",
passed=False,
duration_ms=50,
error_message="Assertion failed",
error_category=ErrorCategory.IMPLEMENTATION_ERROR,
stack_trace="Traceback...",
)
assert not result.passed
assert result.error_category == ErrorCategory.IMPLEMENTATION_ERROR
def test_summary_dict(self):
"""Test summary dict generation."""
result = TestResult(
test_id="test_001",
passed=False,
duration_ms=50,
error_message="Very long error " * 20,
error_category=ErrorCategory.LOGIC_ERROR,
)
summary = result.summary_dict()
assert summary["test_id"] == "test_001"
assert summary["passed"] is False
assert summary["error_category"] == "logic_error"
assert len(summary["error_message"]) == 100 # Truncated
class TestTestSuiteResult:
"""Tests for TestSuiteResult schema."""
def test_suite_result_properties(self):
"""Test suite result calculation properties."""
results = [
TestResult(test_id="t1", passed=True, duration_ms=100),
TestResult(test_id="t2", passed=True, duration_ms=50),
TestResult(test_id="t3", passed=False, duration_ms=75,
error_category=ErrorCategory.IMPLEMENTATION_ERROR),
]
suite = TestSuiteResult(
goal_id="goal_001",
total=3,
passed=2,
failed=1,
results=results,
duration_ms=225,
)
assert not suite.all_passed
assert suite.pass_rate == pytest.approx(2/3)
assert len(suite.get_failed_results()) == 1
def test_get_results_by_category(self):
"""Test filtering results by error category."""
results = [
TestResult(test_id="t1", passed=False, duration_ms=100,
error_category=ErrorCategory.LOGIC_ERROR),
TestResult(test_id="t2", passed=False, duration_ms=50,
error_category=ErrorCategory.IMPLEMENTATION_ERROR),
TestResult(test_id="t3", passed=False, duration_ms=75,
error_category=ErrorCategory.IMPLEMENTATION_ERROR),
]
suite = TestSuiteResult(
goal_id="goal_001",
total=3,
passed=0,
failed=3,
results=results,
)
impl_errors = suite.get_results_by_category(ErrorCategory.IMPLEMENTATION_ERROR)
assert len(impl_errors) == 2
# ============================================================================
# Storage Tests
# ============================================================================
class TestTestStorage:
"""Tests for TestStorage."""
@pytest.fixture
def storage(self, tmp_path):
"""Create a temporary storage instance."""
return TestStorage(tmp_path)
def test_save_and_load_test(self, storage):
"""Test saving and loading a test."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="def test_something(agent): pass",
description="A test",
)
storage.save_test(test)
loaded = storage.load_test("goal_001", "test_001")
assert loaded is not None
assert loaded.id == "test_001"
assert loaded.test_name == "test_something"
def test_delete_test(self, storage):
"""Test deleting a test."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="pass",
description="test",
)
storage.save_test(test)
assert storage.load_test("goal_001", "test_001") is not None
storage.delete_test("goal_001", "test_001")
assert storage.load_test("goal_001", "test_001") is None
def test_get_tests_by_goal(self, storage):
"""Test querying tests by goal."""
for i in range(3):
test = Test(
id=f"test_{i}",
goal_id="goal_001",
parent_criteria_id=f"constraint_{i}",
test_type=TestType.CONSTRAINT,
test_name=f"test_{i}",
test_code="pass",
description="test",
)
storage.save_test(test)
tests = storage.get_tests_by_goal("goal_001")
assert len(tests) == 3
def test_get_approved_tests(self, storage):
"""Test querying approved tests."""
# Create tests with different approval statuses
test1 = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="c1",
test_type=TestType.CONSTRAINT,
test_name="test_1",
test_code="pass",
description="test",
)
test1.approve()
storage.save_test(test1)
test2 = Test(
id="test_002",
goal_id="goal_001",
parent_criteria_id="c2",
test_type=TestType.CONSTRAINT,
test_name="test_2",
test_code="pass",
description="test",
)
# Leave pending
storage.save_test(test2)
test3 = Test(
id="test_003",
goal_id="goal_001",
parent_criteria_id="c3",
test_type=TestType.CONSTRAINT,
test_name="test_3",
test_code="pass",
description="test",
)
test3.modify("modified", "user")
storage.save_test(test3)
approved = storage.get_approved_tests("goal_001")
assert len(approved) == 2 # approved and modified
def test_save_and_load_result(self, storage):
"""Test saving and loading test results."""
result = TestResult(
test_id="test_001",
passed=True,
duration_ms=100,
)
storage.save_result("test_001", result)
loaded = storage.get_latest_result("test_001")
assert loaded is not None
assert loaded.passed is True
assert loaded.duration_ms == 100
def test_result_history(self, storage):
"""Test getting result history."""
# Save multiple results
for i in range(5):
result = TestResult(
test_id="test_001",
passed=(i % 2 == 0),
duration_ms=100 + i,
)
storage.save_result("test_001", result)
history = storage.get_result_history("test_001", limit=3)
assert len(history) <= 3
def test_get_stats(self, storage):
"""Test getting storage statistics."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="c1",
test_type=TestType.CONSTRAINT,
test_name="test_1",
test_code="pass",
description="test",
)
test.approve()
storage.save_test(test)
stats = storage.get_stats()
assert stats["total_tests"] == 1
assert stats["by_approval"]["approved"] == 1
# ============================================================================
# Error Categorizer Tests
# ============================================================================
class TestErrorCategorizer:
"""Tests for ErrorCategorizer."""
@pytest.fixture
def categorizer(self):
return ErrorCategorizer()
def test_categorize_passed(self, categorizer):
"""Test that passed results return None."""
result = TestResult(test_id="t1", passed=True, duration_ms=100)
assert categorizer.categorize(result) is None
def test_categorize_logic_error(self, categorizer):
"""Test categorization of logic errors."""
result = TestResult(
test_id="t1",
passed=False,
duration_ms=100,
error_message="goal not achieved: expected success criteria was not met",
)
assert categorizer.categorize(result) == ErrorCategory.LOGIC_ERROR
def test_categorize_implementation_error(self, categorizer):
"""Test categorization of implementation errors."""
result = TestResult(
test_id="t1",
passed=False,
duration_ms=100,
error_message="TypeError: 'NoneType' object has no attribute 'get'",
)
assert categorizer.categorize(result) == ErrorCategory.IMPLEMENTATION_ERROR
def test_categorize_edge_case(self, categorizer):
"""Test categorization of edge cases."""
result = TestResult(
test_id="t1",
passed=False,
duration_ms=100,
error_message="timeout: request took longer than expected",
)
assert categorizer.categorize(result) == ErrorCategory.EDGE_CASE
def test_categorize_from_stack_trace(self, categorizer):
"""Test categorization from stack trace."""
result = TestResult(
test_id="t1",
passed=False,
duration_ms=100,
error_message="Error occurred",
stack_trace="KeyError: 'missing_key'\n at line 42",
)
assert categorizer.categorize(result) == ErrorCategory.IMPLEMENTATION_ERROR
def test_get_fix_suggestion(self, categorizer):
"""Test fix suggestions for each category."""
assert "Goal" in categorizer.get_fix_suggestion(ErrorCategory.LOGIC_ERROR)
assert "code" in categorizer.get_fix_suggestion(ErrorCategory.IMPLEMENTATION_ERROR).lower()
assert "test" in categorizer.get_fix_suggestion(ErrorCategory.EDGE_CASE).lower()
def test_get_iteration_guidance(self, categorizer):
"""Test iteration guidance."""
guidance = categorizer.get_iteration_guidance(ErrorCategory.LOGIC_ERROR)
assert guidance["stage"] == "Goal"
assert guidance["restart_required"] is True
guidance = categorizer.get_iteration_guidance(ErrorCategory.IMPLEMENTATION_ERROR)
assert guidance["stage"] == "Agent"
assert guidance["restart_required"] is False
# ============================================================================
# Parallel Runner Tests
# ============================================================================
class TestParallelRunner:
"""Tests for ParallelTestRunner."""
@pytest.fixture
def runner(self, tmp_path):
"""Create a test runner with temporary storage."""
storage = TestStorage(tmp_path)
config = ParallelConfig(num_workers=1) # Sequential for testing
return ParallelTestRunner(config, storage)
def test_create_suite_result(self, runner):
"""Test creating suite result from individual results."""
results = [
TestResult(test_id="t1", passed=True, duration_ms=100),
TestResult(test_id="t2", passed=False, duration_ms=50),
]
suite = runner._create_suite_result("goal_001", results)
assert suite.goal_id == "goal_001"
assert suite.total == 2
assert suite.passed == 1
assert suite.failed == 1
assert suite.duration_ms == 150
# ============================================================================
# Debug Tool Tests
# ============================================================================
class TestDebugTool:
"""Tests for DebugTool."""
@pytest.fixture
def debug_tool(self, tmp_path):
"""Create a debug tool with temporary storage."""
storage = TestStorage(tmp_path)
return DebugTool(storage)
def test_analyze_missing_test(self, debug_tool):
"""Test analyzing a non-existent test."""
info = debug_tool.analyze("goal_001", "nonexistent")
assert info.test_id == "nonexistent"
assert "not found" in info.error_message.lower()
def test_analyze_with_result(self, debug_tool, tmp_path):
"""Test analyzing a test with result."""
storage = TestStorage(tmp_path)
# Create and save test
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="c1",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="pass",
description="A test",
input={"key": "value"},
expected_output={"result": "expected"},
)
storage.save_test(test)
# Create and save result
result = TestResult(
test_id="test_001",
passed=False,
duration_ms=100,
error_message="TypeError: something went wrong",
error_category=ErrorCategory.IMPLEMENTATION_ERROR,
)
storage.save_result("test_001", result)
# Create new debug tool with same storage
debug_tool = DebugTool(storage)
info = debug_tool.analyze("goal_001", "test_001")
assert info.test_id == "test_001"
assert info.test_name == "test_something"
assert not info.passed
assert info.error_category == "implementation_error"
assert info.suggested_fix is not None
# ============================================================================
# Integration Tests
# ============================================================================
class TestIntegration:
"""Integration tests for the testing framework."""
def test_full_workflow(self, tmp_path):
"""Test a simplified full workflow."""
storage = TestStorage(tmp_path)
# 1. Create tests (simulating generation)
tests = []
for i in range(3):
test = Test(
id=f"test_{i}",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name=f"test_constraint_{i}",
test_code=f"def test_constraint_{i}(agent): assert True",
description=f"Test {i}",
)
tests.append(test)
# 2. Approve tests
for test in tests:
test.approve("user")
storage.save_test(test)
# 3. Verify storage
approved = storage.get_approved_tests("goal_001")
assert len(approved) == 3
# 4. Simulate running tests
config = ParallelConfig(num_workers=1)
runner = ParallelTestRunner(config, storage)
class MockAgent:
def run(self, input):
return {"success": True}
results = runner.run_tests(approved, MockAgent())
assert len(results) == 3
# 5. Save results
for result in results:
storage.save_result(result.test_id, result)
# 6. Check stats
stats = storage.get_stats()
assert stats["total_tests"] == 3
if __name__ == "__main__":
pytest.main([__file__, "-v"])
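Finally, a hedged end-to-end sketch tying the pieces together, using only APIs exercised by the tests above; the stand-in agent and storage path are illustrative:
class EchoAgent:
    # Stand-in agent: anything exposing run(input) works here.
    def run(self, input):
        return {"success": True}

storage = TestStorage("/tmp/hive_tests")                      # illustrative path
runner = ParallelTestRunner(ParallelConfig(num_workers=1), storage)
categorizer = ErrorCategorizer()
results = runner.run_tests(storage.get_approved_tests("goal_001"), EchoAgent())
for result in results:
    storage.save_result(result.test_id, result)
    if not result.passed:
        category = result.error_category or categorizer.categorize(result)
        guidance = categorizer.get_iteration_guidance(category)
        print(result.test_id, "->", guidance["stage"])        # Goal vs Agent stage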