From e2945b6c99646e42548b56661411e209e9d740b5 Mon Sep 17 00:00:00 2001 From: bryan Date: Tue, 20 Jan 2026 16:28:21 -0800 Subject: [PATCH] initial test phase --- .gitignore | 3 + core/.claude/skills/building-agents/SKILL.md | 237 ++++++- core/.claude/skills/testing-agent/SKILL.md | 625 ++++++++++++++++++ .../examples/testing-youtube-agent.md | 348 ++++++++++ core/README.md | 23 + core/framework/__init__.py | 37 ++ core/framework/cli.py | 14 +- core/framework/graph/edge.py | 2 +- core/framework/graph/goal.py | 1 + core/framework/llm/anthropic.py | 4 +- core/framework/mcp/agent_builder_server.py | 391 +++++++++++ core/framework/runner/cli.py | 2 +- core/framework/runner/orchestrator.py | 2 +- core/framework/runner/runner.py | 4 +- core/framework/runtime/core.py | 26 +- core/framework/testing/__init__.py | 144 ++++ core/framework/testing/approval_cli.py | 295 +++++++++ core/framework/testing/approval_types.py | 130 ++++ core/framework/testing/categorizer.py | 260 ++++++++ core/framework/testing/cli.py | 413 ++++++++++++ core/framework/testing/constraint_gen.py | 201 ++++++ core/framework/testing/debug_tool.py | 286 ++++++++ core/framework/testing/executor.py | 407 ++++++++++++ core/framework/testing/parallel.py | 344 ++++++++++ core/framework/testing/prompts.py | 112 ++++ core/framework/testing/success_gen.py | 219 ++++++ core/framework/testing/test_case.py | 150 +++++ core/framework/testing/test_result.py | 153 +++++ core/framework/testing/test_storage.py | 260 ++++++++ core/tests/test_testing_framework.py | 612 +++++++++++++++++ 30 files changed, 5681 insertions(+), 24 deletions(-) create mode 100644 core/.claude/skills/testing-agent/SKILL.md create mode 100644 core/.claude/skills/testing-agent/examples/testing-youtube-agent.md create mode 100644 core/framework/testing/__init__.py create mode 100644 core/framework/testing/approval_cli.py create mode 100644 core/framework/testing/approval_types.py create mode 100644 core/framework/testing/categorizer.py create mode 100644 core/framework/testing/cli.py create mode 100644 core/framework/testing/constraint_gen.py create mode 100644 core/framework/testing/debug_tool.py create mode 100644 core/framework/testing/executor.py create mode 100644 core/framework/testing/parallel.py create mode 100644 core/framework/testing/prompts.py create mode 100644 core/framework/testing/success_gen.py create mode 100644 core/framework/testing/test_case.py create mode 100644 core/framework/testing/test_result.py create mode 100644 core/framework/testing/test_storage.py create mode 100644 core/tests/test_testing_framework.py diff --git a/.gitignore b/.gitignore index f980803a..8e377d5a 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,9 @@ __pycache__/ .eggs/ *.egg +# Generated runtime data +core/data/ + # Misc *.local .cache/ diff --git a/core/.claude/skills/building-agents/SKILL.md b/core/.claude/skills/building-agents/SKILL.md index e23776b7..c26263f0 100644 --- a/core/.claude/skills/building-agents/SKILL.md +++ b/core/.claude/skills/building-agents/SKILL.md @@ -10,9 +10,11 @@ Build goal-driven agents that use LLM reasoning to accomplish tasks. ## Quick Start 1. Define the goal (what success looks like) -2. Add nodes (units of work) -3. Connect with edges (flow between nodes) -4. Validate and test +2. Generate constraint tests from goal → Approve tests +3. Add nodes (units of work) - validate against constraint tests +4. Connect with edges (flow between nodes) +5. Validate and test graph +6. 
Handoff to testing-agent skill for final evaluation ## Core Concepts @@ -117,10 +119,15 @@ For each component (goal, node, edge): ``` Agent Build Progress: + +GOAL STAGE: - [ ] Define goal with success criteria → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓ - [ ] Define goal constraints → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓ -- [ ] Add entry node → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓ -- [ ] Add each processing node → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓ +- [ ] Generate constraint tests → ASK APPROVAL (clickable: Approve/Reject/Skip) for each test ✓ (NEW) + +AGENT STAGE: +- [ ] Add entry node → TEST NODE → VALIDATE AGAINST CONSTRAINTS → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓ +- [ ] Add each processing node → TEST NODE → VALIDATE AGAINST CONSTRAINTS → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓ - [ ] Add pause nodes (if HITL needed) → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓ - [ ] Add resume entry points (for pause nodes) → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓ - [ ] Add terminal node(s) → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓ @@ -129,6 +136,11 @@ Agent Build Progress: - [ ] Validate full graph → TEST GRAPH → SHOW RESULTS - [ ] Final approval → ASK APPROVAL (clickable: Approve & Export/Reject/Pause) ✓ - [ ] Export to exports/{agent-name}/ + +EVAL STAGE (handoff to testing-agent skill): +- [ ] Generate success criteria tests → ASK APPROVAL (clickable: Approve/Reject/Skip) for each test ✓ +- [ ] Run all tests (constraint + success criteria) +- [ ] Debug failures and iterate ``` ### Testing During Approval @@ -147,6 +159,31 @@ Show the human: - What tools are available - What outputs will be written +**Validate against constraint tests** (if available): + +After approving constraint tests, reference them during node development: + +```python +# When presenting a node for approval, show constraint alignment: +""" +**NODE: search_node** + +Test Results: [test_node output] + +Constraint Test Alignment: +✓ test_constraint_api_limits_respected + → Node uses rate-limited tool wrapper ✓ +✓ test_constraint_content_safety_filter + → Output includes safety_score field ✓ + +Validation: ✅ PASS +""" +``` + +**IMPORTANT**: Constraint tests may not fully execute until the agent is complete, +but their test definitions guide node design. Review the test code to ensure +your nodes handle the constraint scenarios. + **Before final approval**, use `test_graph` to simulate full execution: ``` test_graph( @@ -425,6 +462,7 @@ Goal( description="What the agent must NOT do", constraint_type="hard", # hard = must not violate category="safety", + check="llm_judge", # Optional: how to validate ("llm_judge", expression, or function) ), ], ) @@ -433,6 +471,98 @@ Goal( **Good goals**: Specific, measurable, constrained **Bad goals**: Vague, unmeasurable, no boundaries +## Constraint Test Generation + +**CRITICAL**: After approving the goal, generate constraint tests BEFORE building nodes. + +Constraint tests verify that the agent will respect its defined constraints (safety, rate limits, etc.). +These tests are **agent-agnostic** - they test boundaries, not implementation. This means they can be +generated before any nodes exist. + +### Why Generate Tests Before Building? + +1. **Early Validation**: Catch constraint violations during node development, not after +2. **Design Guidance**: Tests make constraints concrete and testable +3. 
**Incremental Feedback**: Review constraint tests while designing each node + +### Generation Workflow + +```python +# 1. After goal is approved, generate constraint tests +result = generate_constraint_tests( + goal_id=goal_data["id"], + goal_json=json.dumps(goal_data) +) + +# 2. Tests are returned with PENDING status +# The MCP tool returns approval_required=True + +# 3. Display each test to the human for approval +┌─────────────────────────────────────────────────────────────────┐ +│ [1/3] test_constraint_api_limits_respected │ +│ Constraint: api_limits │ +│ Confidence: 88% │ +│ │ +│ def test_constraint_api_limits_respected(agent): │ +│ ... │ +│ │ +│ [a]pprove [r]eject [e]dit [s]kip │ +└─────────────────────────────────────────────────────────────────┘ + +# 4. Use AskUserQuestion with approval options +AskUserQuestion( + questions=[{ + "question": "Do you approve this constraint test?", + "header": "Test Approval", + "options": [ + {"label": "✓ Approve (Recommended)", "description": "Test looks good"}, + {"label": "✗ Reject", "description": "Test is invalid"}, + {"label": "✎ Edit", "description": "Modify before accepting"}, + {"label": "⏭ Skip", "description": "Decide later"} + ], + "multiSelect": false + }] +) + +# 5. Call approve_tests with the decisions +approve_tests( + goal_id=goal_data["id"], + approvals='[{"test_id": "...", "action": "approve"}, ...]' +) + +# 6. Verify no pending tests before proceeding to nodes +pending = get_pending_tests(goal_id=goal_data["id"]) +if json.loads(pending)["pending_count"] > 0: + # Prompt user to handle remaining tests + print("⚠️ Pending tests must be resolved before building nodes") +``` + +### Approval Rules + +- **All tests must be reviewed** - no auto-approval +- **Approved/Modified tests are stored** for use during node validation +- **Rejected tests are not stored** (with reason tracked) +- **Skipped tests remain pending** - must be resolved before export + +### Using Constraint Tests During Node Building + +Once constraint tests are approved, reference them when designing nodes: + +```python +# Before adding a node that makes API calls, review constraint tests: +""" +Creating node: search_node (llm_tool_use) +Tools: youtube_search, video_details + +Constraint Test Review: +✓ test_constraint_api_limits_respected - checks rate limits + → Ensure search_node handles rate limit responses gracefully + +✓ test_constraint_content_safety_filter - checks safe content + → Ensure output_keys include safety flags for filtering +""" +``` + ## Adding Nodes Each node does one thing: @@ -617,11 +747,29 @@ analyze → needs_clarification? 
→ YES → request-clarification (PAUSE) | `export_graph` | Export the completed agent | | `get_session_status` | View current build progress | -### Testing Tools (for HITL approval) -| Tool | Purpose | -|------|---------| -| `test_node` | Run a single node with sample inputs to show behavior | -| `test_graph` | Simulate full graph execution to show the complete flow | +### Testing Tools by Stage + +#### Goal Stage (this skill) - Generate constraint tests +| Tool | Purpose | When to Use | +|------|---------|-------------| +| `generate_constraint_tests` | Generate tests from constraints | Immediately after goal approval | +| `approve_tests` | Approve/reject/modify tests | After generation, before building nodes | +| `get_pending_tests` | List tests awaiting approval | Before proceeding to node building | + +#### Agent Stage (this skill) - Build and validate nodes +| Tool | Purpose | When to Use | +|------|---------|-------------| +| `test_node` | Run a single node with sample inputs | Before each node approval | +| `test_graph` | Simulate full graph execution | Before final approval | + +#### Eval Stage (testing-agent skill) - Final evaluation +| Tool | Purpose | When to Use | +|------|---------|-------------| +| `generate_success_tests` | Generate tests from success criteria | After agent export | +| `run_tests` | Run all tests in parallel | After test approval | +| `debug_test` | Debug failed tests | After test failures | + +See the [testing-agent skill](../testing-agent/SKILL.md) for the full Eval stage workflow. ## Using the Exported Agent @@ -762,3 +910,72 @@ result = await runner.run(context) ``` For complete API details, see [reference/api.md](reference/api.md). + +## Handoff to Testing-Agent Skill + +After exporting the agent, switch to the **testing-agent** skill for final evaluation (Eval Stage). + +### What Transfers + +1. **Goal definition** (with constraints and success criteria) +2. **Approved constraint tests** (generated in Goal Stage) +3. **Exported agent** at `exports/{agent-name}/` + +### What Happens in Testing-Agent + +1. Generate **success criteria tests** (these need agent details, so generated after build) +2. Run **all tests** (constraint + success criteria) in parallel +3. Debug failures and categorize errors +4. Iterate based on error type + +### Triggering the Handoff + +After `export_graph` completes successfully, display: + +``` +✅ Agent exported to exports/{agent-name}/ + +Next Steps (Eval Stage): +1. Switch to testing-agent skill +2. Generate success criteria tests +3. Run full evaluation +4. Debug any failures + +Command: "Run /testing-agent for exports/{agent-name}" +``` + +### Error Category Routing + +If tests fail in the Eval stage, the error category determines where to go: + +| Error Category | Meaning | Action | +|---------------|---------|--------| +| `LOGIC_ERROR` | Goal definition is wrong | Return to Goal Stage - update goal, regenerate constraint tests | +| `IMPLEMENTATION_ERROR` | Code bug in nodes/edges | Return to Agent Stage - fix nodes/edges, re-export | +| `EDGE_CASE` | New scenario discovered | Stay in Eval Stage - add edge case test, continue | + +### Flow Diagram + +``` +┌──────────────────────────────────────────────────────────────┐ +│ GOAL STAGE (building-agents skill) │ +│ 1. Define success_criteria and constraints → APPROVE │ +│ 2. Generate CONSTRAINT TESTS from constraints │ +│ 3. 
APPROVE each constraint test │ +└──────────────────────────────────────────────────────────────┘ + ↓ +┌──────────────────────────────────────────────────────────────┐ +│ AGENT STAGE (building-agents skill) │ +│ 1. Add nodes - review constraint tests for design guidance │ +│ 2. Test each node - validate against constraint expectations│ +│ 3. Connect edges → Validate graph → Export │ +└──────────────────────────────────────────────────────────────┘ + ↓ +┌──────────────────────────────────────────────────────────────┐ +│ EVAL STAGE (testing-agent skill) │ +│ 1. Generate SUCCESS_CRITERIA TESTS → APPROVE │ +│ 2. Run ALL tests (constraint + success criteria) │ +│ 3. Debug failures → Categorize errors │ +│ 4. Route back based on error category (if needed) │ +└──────────────────────────────────────────────────────────────┘ +``` diff --git a/core/.claude/skills/testing-agent/SKILL.md b/core/.claude/skills/testing-agent/SKILL.md new file mode 100644 index 00000000..2ed22f3f --- /dev/null +++ b/core/.claude/skills/testing-agent/SKILL.md @@ -0,0 +1,625 @@ +--- +name: testing-agent +description: Run goal-based evaluation tests for agents. Use when you need to verify an agent meets its goals, debug failing tests, or iterate on agent improvements based on test results. +--- + +# Testing Agents + +Run goal-based evaluation tests for agents built with the building-agents skill. + +## Quick Start + +1. **Check existing state first** - See if tests already exist +2. Generate tests from goal (only if needed) +3. Approve tests (mandatory human approval) +4. Run tests against agent +5. Debug failures and iterate + +## Check Existing State First + +**CRITICAL**: Before generating any tests, ALWAYS check if tests already exist for the goal. + +```python +# Check what tests exist for this goal +result = list_tests(goal_id="youtube-research") + +# Returns: +{ + "goal_id": "youtube-research", + "total": 42, + "by_status": { + "pending": 10, + "approved": 30, + "modified": 2, + "rejected": 0 + }, + "by_type": { + "constraint": 15, + "success_criteria": 25, + "edge_case": 2 + }, + "tests": [...] # List of test summaries +} +``` + +### Decision Tree + +Based on existing state, choose the right action: + +``` +list_tests(goal_id) → Check existing tests + ↓ +┌───────┴────────────────────────────────────────┐ +│ │ +No tests exist Tests exist +│ │ +↓ ┌─────────┴─────────┐ +Generate tests │ │ +(constraint first, Has pending All approved +then success_criteria) tests │ + │ ↓ + ↓ Run tests + Approve pending directly + tests first +``` + +### Resuming a Testing Session + +When the user asks to test an agent that may have been tested before: + +1. **Always check first**: `list_tests(goal_id="...")` +2. **Show the user what exists**: + - "Found 42 existing tests: 30 approved, 10 pending, 2 modified" + - "Last run: 28/30 passed (93.3%)" +3. **Ask what they want to do**: + +```python +AskUserQuestion( + questions=[{ + "question": "Tests already exist for this agent. 
What would you like to do?", + "header": "Existing Tests", + "options": [ + { + "label": "Run existing tests (Recommended)", + "description": "Run the 32 approved tests against the agent" + }, + { + "label": "Approve pending tests", + "description": "Review and approve the 10 pending tests first" + }, + { + "label": "Regenerate all tests", + "description": "Delete existing and generate fresh tests (loses approvals)" + }, + { + "label": "Show test details", + "description": "List all tests with their status and last results" + } + ], + "multiSelect": false + }] +) +``` + +### Why This Matters + +- **Saves time**: Approved tests don't need re-approval +- **Preserves work**: User's previous approvals/modifications are kept +- **Clear state**: User knows exactly what exists before taking action +- **Prevents duplicates**: Won't generate tests that already exist + +## Core Concepts + +**Test Types**: Three types of tests, generated at different stages: +- `constraint` - Generated during Goal stage (agent-agnostic boundaries) +- `success_criteria` - Generated during Eval stage (after agent exists) +- `edge_case` - Generated when new scenarios discovered during debugging + +**Approval**: All LLM-generated tests require explicit user approval before running. + +**Error Categories**: Failed tests are categorized to guide iteration: +- `LOGIC_ERROR` - Goal definition is wrong → Update goal, restart full flow +- `IMPLEMENTATION_ERROR` - Code bug → Fix agent, re-run Eval +- `EDGE_CASE` - New scenario discovered → Add test, continue Eval + +**Iteration**: Each error category has a specific fix path (see Error Categorization section). + +## Workflow (HITL Required) + +**CRITICAL**: Each step requires human approval before proceeding. +**CRITICAL**: Use structured questions (AskUserQuestion) with fallback to text mode. + +### Approval Strategy + +**Always try structured questions first**, with graceful fallback: + +1. **Attempt**: Call AskUserQuestion with clickable options +2. **Catch**: If tool fails/rejected, fall back to text prompt +3. **Parse**: Accept text input like "approve", "reject", "skip" + +This ensures the workflow works in all environments (VSCode extension, CLI, web). + +### Test Loop + +``` +For each test generated: +1. DISPLAY → Show the test details to the human +2. VALIDATE → Check test syntax and structure +3. ASK APPROVAL → Use AskUserQuestion with clickable options +4. Only run tests after approval +``` + +### Checklist (ask approval at each check) + +``` +Agent Testing Progress: +- [ ] Load goal and agent → VERIFY PATHS +- [ ] CHECK EXISTING TESTS → list_tests, show stats, ask what to do +- [ ] If no tests OR user wants fresh: Generate tests → ASK APPROVAL +- [ ] If pending tests exist: Approve pending tests first +- [ ] Run all approved tests → SHOW RESULTS +- [ ] Debug failed tests → SHOW CATEGORIZATION +- [ ] Iterate based on category → ASK APPROVAL for changes +``` + +## The Three-Stage Flow + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ GOAL STAGE │ +│ 1. Define success_criteria and constraints (building-agents skill) │ +│ 2. 
Generate CONSTRAINT TESTS → USER APPROVAL → tests stored │ +└─────────────────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────────────────┐ +│ AGENT STAGE │ +│ Build nodes + edges (building-agents skill) │ +│ Constraint tests can run during development for early feedback │ +└─────────────────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────────────────┐ +│ EVAL STAGE (this skill) │ +│ 1. Generate SUCCESS_CRITERIA TESTS → USER APPROVAL → tests stored │ +│ 2. Run all tests in parallel → pass/fail summary │ +│ 3. On failure → Debug tool with categorization │ +│ 4. Iterate based on error category │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +## Test Generation + +### When to Generate Each Type + +| Test Type | When Generated | Why | +|-----------|----------------|-----| +| **Constraint Tests** | During Goal stage (before agent exists) | Constraints are agent-agnostic boundaries | +| **Success Criteria Tests** | During Eval stage (after agent exists) | May depend on agent flow/nodes | +| **Edge Case Tests** | During debugging (when new scenario found) | Discovered through test failures | + +### Generating Tests + +```python +import json + +# 1. Generate constraint tests (Goal stage) +result = generate_constraint_tests( + goal_id="youtube-research", + goal_json=json.dumps({ + "id": "youtube-research", + "name": "YouTube Research Agent", + "description": "Find relevant YouTube videos on a topic", + "success_criteria": [ + { + "id": "find_videos", + "description": "Find 3-5 relevant videos", + "metric": "video_count", + "target": "3-5", + "weight": 1.0 + } + ], + "constraints": [ + { + "id": "api_limits", + "description": "Must respect YouTube API rate limits", + "constraint_type": "hard", + "category": "reliability", + "check": "llm_judge" # Optional: how to validate + } + ] + }) +) + +# 2. Generate success criteria tests (Eval stage, after agent built) +result = generate_success_tests( + goal_id="youtube-research", + goal_json='...', # Same structure as above + node_names="search_node,filter_node,format_node", + tool_names="youtube_search,video_details" +) +``` + +**After generation**, tests are stored as PENDING. They must be approved before running. 
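+
+Before moving on to approval, it can help to surface low-confidence generations
+for closer review. A minimal sketch, assuming the MCP tools are callable from
+Python (the 0.85 threshold is an arbitrary choice for illustration, not a
+framework default):
+
+```python
+import json
+
+# Fetch tests that were generated but not yet approved
+pending = json.loads(get_pending_tests(goal_id="youtube-research"))
+
+print(f"{pending['pending_count']} tests awaiting review")
+for t in pending["tests"]:
+    # Flag low-confidence generations for extra scrutiny during approval
+    marker = "⚠ review carefully" if t["confidence"] < 0.85 else "✓ looks routine"
+    print(f"{marker}  {t['test_name']} (confidence: {t['confidence']:.0%})")
+```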
+ +## Approval Patterns + +### Interactive Approval Flow + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Generated Tests for: youtube-research (3 tests) │ +├─────────────────────────────────────────────────────────────────┤ +│ [1/3] test_find_videos_happy_path │ +│ Type: SUCCESS_CRITERIA │ +│ Confidence: 92% │ +│ Input: {"topic": "machine learning tutorials"} │ +│ Expected: 3-5 videos with titles and IDs │ +│ │ +│ def test_find_videos_happy_path(agent): │ +│ result = agent.run({"topic": "machine learning"}) │ +│ assert 3 <= len(result.videos) <= 5 │ +│ assert all(v.title for v in result.videos) │ +│ │ +│ [a]pprove [r]eject [e]dit [s]kip │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Approval Actions + +| Action | Description | Result | +|--------|-------------|--------| +| **approve** | Accept test as-is | Status → APPROVED, test will run | +| **reject** | Decline with reason | Status → REJECTED, test won't run | +| **edit** | Modify code before accepting | Status → MODIFIED, original preserved | +| **skip** | Leave for later | Status → PENDING, decide later | + +### Approval Code Pattern + +```python +# After generating tests, approve them +result = approve_tests( + goal_id="youtube-research", + approvals='[ + {"test_id": "test_001", "action": "approve"}, + {"test_id": "test_002", "action": "modify", "modified_code": "def test_..."}, + {"test_id": "test_003", "action": "reject", "reason": "Not a valid scenario"}, + {"test_id": "test_004", "action": "skip"} + ]' +) +``` + +### Structured Approval Questions + +```python +# Try structured approval first +try: + response = AskUserQuestion( + questions=[{ + "question": "Do you approve this test?", + "header": "Test Approval", + "options": [ + { + "label": "Approve (Recommended)", + "description": "Test looks good, include in test suite" + }, + { + "label": "Reject", + "description": "Test is invalid or unnecessary" + }, + { + "label": "Edit", + "description": "Modify the test code before accepting" + }, + { + "label": "Skip", + "description": "Decide later, leave as pending" + } + ], + "multiSelect": false + }] + ) +except: + # Fallback to text mode + print("Do you approve this test? Type: approve | reject | edit | skip") +``` + +## Test Execution + +### Parallel Configuration + +```python +# Tests run in parallel with these defaults +ParallelConfig( + num_workers=cpu_count(), # Use all CPU cores + timeout_per_test=60.0, # 60 seconds per test + fail_fast=False, # Run all tests, don't stop on first failure + mode="loadfile", # Group tests by parent_criteria_id +) +``` + +### Running Tests + +```python +# Run all approved tests +result = run_tests( + goal_id="youtube-research", + agent_path="exports/youtube-agent", + test_types='["all"]', # or ["constraint", "success_criteria", "edge_case"] + parallel=4, # Number of workers + fail_fast=False # Run all tests +) + +# Result structure +{ + "goal_id": "youtube-research", + "overall_passed": false, + "summary": { + "total": 15, + "passed": 12, + "failed": 3, + "pass_rate": "80.0%" + }, + "duration_ms": 5432, + "results": [ + {"test_id": "test_001", "passed": true, "duration_ms": 234}, + {"test_id": "test_002", "passed": false, "duration_ms": 567, "error_category": "IMPLEMENTATION_ERROR"}, + ... + ] +} +``` + +### Execution Flow + +1. Load only APPROVED and MODIFIED tests (skip PENDING and REJECTED) +2. Group tests by `parent_criteria_id` for shared fixture setup +3. Run groups in parallel with process isolation +4. 
Aggregate results with timing information + +## Error Categorization & Iteration + +### Decision Tree + +``` +Test Fails → Categorize Error + ↓ + ┌───────────┴─────────────────┬────────────────────┐ + │ │ │ +LOGIC ERROR IMPLEMENTATION ERROR EDGE CASE +(criteria wrong) (code bug) (new scenario) + │ │ │ + ↓ ↓ ↓ +Update goal Fix nodes/edges Generate new +success_criteria in Agent stage edge case test + ↓ ↓ │ +FULL 3-STEP Re-run Eval Continue in +FLOW RESTART (skip Goal stage) Eval stage +``` + +### Pattern-Based Heuristics + +The categorizer uses these patterns to classify errors: + +**LOGIC_ERROR** (goal definition is wrong): +- "goal not achieved" +- "constraint violated: core" +- "fundamental assumption" +- "success criteria mismatch" +- "expected behavior incorrect" + +**IMPLEMENTATION_ERROR** (code bug in agent): +- TypeError, AttributeError, KeyError, ValueError +- "tool call failed" +- "node execution error" +- "assertion failed" +- "null pointer", "undefined" + +**EDGE_CASE** (new scenario discovered): +- "boundary condition" +- "timeout", "rate limit" +- "empty result", "no results" +- "unexpected format" +- "rare input", "unusual" + +### Iteration Guidance + +```python +# After categorization, you get guidance +{ + "error_category": "IMPLEMENTATION_ERROR", + "iteration_guidance": { + "stage": "Agent", + "action": "Fix the code in nodes/edges", + "restart_required": false, + "description": "The goal is correct, but the implementation has a bug. Fix the agent code and re-run Eval." + } +} +``` + +| Category | Go To Stage | Restart Required | Action | +|----------|-------------|------------------|--------| +| LOGIC_ERROR | Goal | Yes | Update success_criteria/constraints, rebuild agent | +| IMPLEMENTATION_ERROR | Agent | No | Fix nodes/edges, re-run Eval only | +| EDGE_CASE | Eval | No | Generate edge case test, continue in Eval | + +## Debugging Failed Tests + +### Debug Tool + +```python +# Get detailed debug info for a failed test +result = debug_test( + goal_id="youtube-research", + test_id="test_find_videos_no_results" +) + +# Returns comprehensive debug info +{ + "test_id": "test_find_videos_no_results", + "test_name": "test_find_videos_no_results", + "input": {"topic": "xyzabc123nonsense"}, + "expected": {"videos": [], "message": "No results found"}, + "actual": {"error": "NullPointerException at node_3"}, + "passed": false, + "error_message": "TypeError: 'NoneType' has no attribute 'get'", + "error_category": "IMPLEMENTATION_ERROR", + "stack_trace": "Traceback (most recent call last):\n ...", + "logs": [ + {"timestamp": "...", "node": "search_node", "level": "INFO", "msg": "..."}, + {"timestamp": "...", "node": "filter_node", "level": "ERROR", "msg": "..."} + ], + "runtime_data": { + "execution_path": ["start", "search_node", "filter_node"], + "node_outputs": {...} + }, + "suggested_fix": "Check null handling in filter_node when no results returned", + "iteration_guidance": { + "stage": "Agent", + "action": "Fix the code in nodes/edges", + "restart_required": false + } +} +``` + +### Debug Workflow + +1. **Run all tests** → Get pass/fail summary +2. **Select failed test** → Get detailed DebugInfo +3. **Review categorization** → Understand error type +4. **Check suggested fix** → Get actionable guidance +5. **Follow iteration guidance** → Go to correct stage + +## Example: Testing YouTube Agent + +See [examples/testing-youtube-agent.md](examples/testing-youtube-agent.md) for a complete walkthrough. 
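+
+For a scripted version of this workflow, the run → debug → route loop can be
+expressed directly. A minimal sketch, assuming the MCP tools are callable from
+Python and using the goal id and agent path from the example above:
+
+```python
+import json
+
+# Steps 1-2: run every approved test, then pull debug info for each failure
+run = json.loads(run_tests(
+    goal_id="youtube-research",
+    agent_path="exports/youtube-research",
+))
+
+for r in run["results"]:
+    if r["passed"]:
+        continue
+    debug = json.loads(debug_test(goal_id="youtube-research", test_id=r["test_id"]))
+    # Steps 3-5: review the categorization and follow the iteration guidance
+    category = debug["error_category"]
+    action = debug["iteration_guidance"]["action"]
+    if category == "LOGIC_ERROR":
+        print(f"{r['test_id']}: goal is wrong → {action} (restart full flow)")
+    elif category == "IMPLEMENTATION_ERROR":
+        print(f"{r['test_id']}: code bug → {action} (re-run Eval only)")
+    else:  # EDGE_CASE
+        print(f"{r['test_id']}: new scenario → {action} (stay in Eval)")
+```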
+ +## Common Patterns + +### Happy Path Tests +Test normal successful execution with valid inputs: +```python +def test_find_videos_happy_path(agent): + result = agent.run({"topic": "python tutorials"}) + assert result.success + assert len(result.videos) >= 3 + assert all(v.title for v in result.videos) +``` + +### Boundary Condition Tests +Test exactly at target thresholds: +```python +def test_find_videos_minimum_count(agent): + result = agent.run({"topic": "very specific niche topic"}) + assert len(result.videos) >= 1 # At least one result +``` + +### Error Handling Tests +Test graceful handling of failures: +```python +def test_find_videos_invalid_input(agent): + result = agent.run({"topic": ""}) # Empty input + assert not result.success or result.message == "Invalid input" +``` + +### Constraint Violation Tests +Test that constraints are respected: +```python +def test_api_rate_limit_respected(agent): + # Run multiple times quickly + for _ in range(5): + result = agent.run({"topic": "test"}) + # Should not hit rate limit errors + assert "rate limit" not in str(result).lower() +``` + +## Anti-Patterns + +| Don't | Do Instead | +|-------|------------| +| Auto-approve tests | Always require explicit user approval | +| Run PENDING/REJECTED tests | Only run APPROVED/MODIFIED tests | +| Generate success tests during Goal stage | Wait until agent exists | +| Treat all failures the same | Categorize and iterate appropriately | +| Restart full flow for IMPLEMENTATION_ERROR | Fix agent, re-run Eval only | +| Add test for LOGIC_ERROR | Fix the goal definition instead | +| Ignore confidence scores | Review low-confidence categorizations manually | +| Skip the approval step | Tests must be reviewed before running | + +## Tools Reference + +### Testing Tools + +| Tool | Purpose | When to Use | +|------|---------|-------------| +| `generate_constraint_tests` | Generate tests from goal constraints | Goal stage | +| `generate_success_tests` | Generate tests from success criteria | Eval stage (after agent built) | +| `approve_tests` | Approve/reject/modify generated tests | After generation | +| `run_tests` | Execute tests in parallel | After approval | +| `debug_test` | Analyze failed test with categorization | After test fails | +| `list_tests` | List tests for a goal by status | Anytime | +| `get_pending_tests` | Get tests awaiting approval | Before approval | + +### Building Tools (for iteration) + +When iteration requires modifying the agent, use these from the building-agents skill: + +| Tool | Purpose | When to Use | +|------|---------|-------------| +| `set_goal` | Update goal definition | LOGIC_ERROR iteration | +| `add_node` | Add or modify nodes | IMPLEMENTATION_ERROR iteration | +| `add_edge` | Add or modify edges | IMPLEMENTATION_ERROR iteration | +| `validate_graph` | Validate changes | After any modification | +| `export_graph` | Re-export agent | After fixes complete | + +## CLI Commands + +```bash +# Generate tests from goal +python -m core test-generate goal.json --type all + +# Interactive approval of pending tests +python -m core test-approve + +# Run tests for an agent +python -m core test-run --goal --parallel 4 + +# Debug a failed test +python -m core test-debug + +# List tests by status +python -m core test-list --status approved + +# Show test statistics +python -m core test-stats +``` + +## Integration with building-agents + +### Handoff Points + +| Scenario | From | To | Action | +|----------|------|-----|--------| +| Agent built, ready to test | building-agents | 
testing-agent | Generate success tests | +| LOGIC_ERROR found | testing-agent | building-agents | Update goal, rebuild | +| IMPLEMENTATION_ERROR found | testing-agent | building-agents | Fix nodes/edges | +| EDGE_CASE found | testing-agent | testing-agent | Generate edge case test | +| All tests pass | testing-agent | Done | Agent is validated | + +### When to Switch Skills + +**Use building-agents when:** +- Defining goals and constraints +- Building agent nodes and edges +- Fixing LOGIC_ERROR or IMPLEMENTATION_ERROR + +**Use testing-agent when:** +- Generating tests from goals +- Approving and running tests +- Debugging failures +- Categorizing errors + +### Shared Patterns + +Both skills use: +- AskUserQuestion with structured options +- HITL at every critical step +- Fallback to text mode when widgets unavailable +- Session state management for continuity diff --git a/core/.claude/skills/testing-agent/examples/testing-youtube-agent.md b/core/.claude/skills/testing-agent/examples/testing-youtube-agent.md new file mode 100644 index 00000000..42fd6b91 --- /dev/null +++ b/core/.claude/skills/testing-agent/examples/testing-youtube-agent.md @@ -0,0 +1,348 @@ +# Example: Testing a YouTube Research Agent + +This example walks through testing a YouTube research agent that finds relevant videos based on a topic. + +## Prerequisites + +- Agent built with building-agents skill at `exports/youtube-research/` +- Goal defined with success criteria and constraints + +## Step 1: Load the Goal + +First, load the goal that was defined during the Goal stage: + +```json +{ + "id": "youtube-research", + "name": "YouTube Research Agent", + "description": "Find relevant YouTube videos on a given topic", + "success_criteria": [ + { + "id": "find_videos", + "description": "Find 3-5 relevant videos", + "metric": "video_count", + "target": "3-5", + "weight": 1.0 + }, + { + "id": "relevance", + "description": "Videos must be relevant to the topic", + "metric": "relevance_score", + "target": ">0.8", + "weight": 0.8 + } + ], + "constraints": [ + { + "id": "api_limits", + "description": "Must not exceed YouTube API rate limits", + "constraint_type": "hard", + "category": "technical" + }, + { + "id": "content_safety", + "description": "Must filter out inappropriate content", + "constraint_type": "hard", + "category": "safety" + } + ] +} +``` + +## Step 2: Generate Constraint Tests + +During the Goal stage (or early Eval), generate tests for constraints: + +```python +result = generate_constraint_tests( + goal_id="youtube-research", + goal_json='' +) +``` + +**Generated tests (awaiting approval):** + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Generated Constraint Tests (2 tests) │ +├─────────────────────────────────────────────────────────────────┤ +│ [1/2] test_constraint_api_limits_respected │ +│ Constraint: api_limits │ +│ Confidence: 88% │ +│ │ +│ def test_constraint_api_limits_respected(agent): │ +│ """Verify API rate limits are not exceeded.""" │ +│ import time │ +│ for i in range(10): │ +│ result = agent.run({"topic": f"test_{i}"}) │ +│ time.sleep(0.1) │ +│ # Should complete without rate limit errors │ +│ assert "rate limit" not in str(result).lower() │ +│ │ +│ [a]pprove [r]eject [e]dit [s]kip │ +├─────────────────────────────────────────────────────────────────┤ +│ [2/2] test_constraint_content_safety_filter │ +│ Constraint: content_safety │ +│ Confidence: 91% │ +│ │ +│ def test_constraint_content_safety_filter(agent): │ +│ """Verify inappropriate content is filtered.""" │ +│ 
result = agent.run({"topic": "general topic"}) │ +│ for video in result.videos: │ +│ assert video.safe_for_work is True │ +│ assert video.age_restricted is False │ +│ │ +│ [a]pprove [r]eject [e]dit [s]kip │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Step 3: Approve Constraint Tests + +Review and approve each test: + +```python +result = approve_tests( + goal_id="youtube-research", + approvals='[ + {"test_id": "test_constraint_api_001", "action": "approve"}, + {"test_id": "test_constraint_content_001", "action": "approve"} + ]' +) +``` + +## Step 4: Generate Success Criteria Tests + +After the agent is built, generate success criteria tests: + +```python +result = generate_success_tests( + goal_id="youtube-research", + goal_json='', + node_names="search_node,filter_node,rank_node,format_node", + tool_names="youtube_search,video_details,channel_info" +) +``` + +**Generated tests (awaiting approval):** + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Generated Success Criteria Tests (4 tests) │ +├─────────────────────────────────────────────────────────────────┤ +│ [1/4] test_find_videos_happy_path │ +│ Criteria: find_videos │ +│ Confidence: 95% │ +│ │ +│ def test_find_videos_happy_path(agent): │ +│ """Test finding videos for a common topic.""" │ +│ result = agent.run({"topic": "machine learning"}) │ +│ assert result.success │ +│ assert 3 <= len(result.videos) <= 5 │ +│ assert all(v.title for v in result.videos) │ +│ assert all(v.video_id for v in result.videos) │ +│ │ +│ [a]pprove [r]eject [e]dit [s]kip │ +├─────────────────────────────────────────────────────────────────┤ +│ [2/4] test_find_videos_minimum_boundary │ +│ Criteria: find_videos │ +│ Confidence: 87% │ +│ │ +│ def test_find_videos_minimum_boundary(agent): │ +│ """Test at minimum threshold (3 videos).""" │ +│ result = agent.run({"topic": "niche topic xyz"}) │ +│ assert len(result.videos) >= 3 │ +│ │ +│ [a]pprove [r]eject [e]dit [s]kip │ +├─────────────────────────────────────────────────────────────────┤ +│ [3/4] test_relevance_score_threshold │ +│ Criteria: relevance │ +│ Confidence: 92% │ +│ │ +│ def test_relevance_score_threshold(agent): │ +│ """Test relevance scoring meets threshold.""" │ +│ result = agent.run({"topic": "python programming"}) │ +│ for video in result.videos: │ +│ assert video.relevance_score > 0.8 │ +│ │ +│ [a]pprove [r]eject [e]dit [s]kip │ +├─────────────────────────────────────────────────────────────────┤ +│ [4/4] test_find_videos_no_results_graceful │ +│ Criteria: find_videos │ +│ Confidence: 84% │ +│ │ +│ def test_find_videos_no_results_graceful(agent): │ +│ """Test graceful handling of no results.""" │ +│ result = agent.run({"topic": "xyznonexistent123"}) │ +│ # Should not crash, return empty or message │ +│ assert result.videos == [] or result.message │ +│ │ +│ [a]pprove [r]eject [e]dit [s]kip │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Step 5: Approve Success Criteria Tests + +```python +result = approve_tests( + goal_id="youtube-research", + approvals='[ + {"test_id": "test_success_001", "action": "approve"}, + {"test_id": "test_success_002", "action": "approve"}, + {"test_id": "test_success_003", "action": "approve"}, + {"test_id": "test_success_004", "action": "approve"} + ]' +) +``` + +## Step 6: Run All Tests + +Execute all approved tests: + +```python +result = run_tests( + goal_id="youtube-research", + agent_path="exports/youtube-research", + test_types='["all"]', + parallel=4 +) +``` + 
+**Results:** + +```json +{ + "goal_id": "youtube-research", + "overall_passed": false, + "summary": { + "total": 6, + "passed": 5, + "failed": 1, + "pass_rate": "83.3%" + }, + "duration_ms": 4521, + "results": [ + {"test_id": "test_constraint_api_001", "passed": true, "duration_ms": 1234}, + {"test_id": "test_constraint_content_001", "passed": true, "duration_ms": 456}, + {"test_id": "test_success_001", "passed": true, "duration_ms": 789}, + {"test_id": "test_success_002", "passed": true, "duration_ms": 654}, + {"test_id": "test_success_003", "passed": true, "duration_ms": 543}, + {"test_id": "test_success_004", "passed": false, "duration_ms": 845, + "error_category": "IMPLEMENTATION_ERROR", + "error_message": "TypeError: 'NoneType' object has no attribute 'videos'"} + ] +} +``` + +## Step 7: Debug the Failed Test + +```python +result = debug_test( + goal_id="youtube-research", + test_id="test_success_004" +) +``` + +**Debug Output:** + +```json +{ + "test_id": "test_success_004", + "test_name": "test_find_videos_no_results_graceful", + "input": {"topic": "xyznonexistent123"}, + "expected": "Empty list or message", + "actual": {"error": "TypeError: 'NoneType' object has no attribute 'videos'"}, + "passed": false, + "error_message": "TypeError: 'NoneType' object has no attribute 'videos'", + "error_category": "IMPLEMENTATION_ERROR", + "stack_trace": "Traceback (most recent call last):\n File \"filter_node.py\", line 42\n for video in result.videos:\nTypeError: 'NoneType' object has no attribute 'videos'", + "logs": [ + {"timestamp": "2026-01-20T10:00:01", "node": "search_node", "level": "INFO", "msg": "Searching for: xyznonexistent123"}, + {"timestamp": "2026-01-20T10:00:02", "node": "search_node", "level": "WARNING", "msg": "No results found"}, + {"timestamp": "2026-01-20T10:00:02", "node": "filter_node", "level": "ERROR", "msg": "NoneType error"} + ], + "runtime_data": { + "execution_path": ["start", "search_node", "filter_node"], + "node_outputs": { + "search_node": null + } + }, + "suggested_fix": "Add null check in filter_node before accessing .videos attribute", + "iteration_guidance": { + "stage": "Agent", + "action": "Fix the code in nodes/edges", + "restart_required": false, + "description": "The goal is correct, but filter_node doesn't handle null results from search_node." + } +} +``` + +## Step 8: Iterate Based on Category + +Since this is an **IMPLEMENTATION_ERROR**, we: + +1. **Don't restart** the Goal → Agent → Eval flow +2. **Fix the agent** using building-agents skill: + - Modify `filter_node` to handle null results +3. **Re-run Eval** (tests only) + +### Fix in building-agents: + +```python +# Update the filter_node to handle null +add_node( + node_id="filter_node", + name="Filter Node", + description="Filter and rank videos", + node_type="function", + input_keys=["search_results"], + output_keys=["filtered_videos"], + system_prompt=""" + Filter videos by relevance. + IMPORTANT: Handle case where search_results is None or empty. + Return empty list if no results. + """ +) +``` + +### Re-export and re-test: + +```python +# Re-export the fixed agent +export_graph(path="exports/youtube-research") + +# Re-run tests +result = run_tests( + goal_id="youtube-research", + agent_path="exports/youtube-research", + test_types='["all"]' +) +``` + +**Updated Results:** + +```json +{ + "goal_id": "youtube-research", + "overall_passed": true, + "summary": { + "total": 6, + "passed": 6, + "failed": 0, + "pass_rate": "100.0%" + } +} +``` + +## Summary + +1. 
**Generated** constraint tests during Goal stage +2. **Generated** success criteria tests during Eval stage +3. **Approved** all tests with user review +4. **Ran** tests in parallel +5. **Debugged** the one failure +6. **Categorized** as IMPLEMENTATION_ERROR +7. **Fixed** the agent (not the goal) +8. **Re-ran** Eval only (didn't restart full flow) +9. **Passed** all tests + +The agent is now validated and ready for production use. diff --git a/core/README.md b/core/README.md index c4b6e986..3a91c8da 100644 --- a/core/README.md +++ b/core/README.md @@ -128,6 +128,29 @@ runtime.record_outcome( runtime.end_run(success=True, narrative="Successfully processed all data") ``` +### Testing Agents + +The framework includes a goal-based testing framework for validating agent behavior. + +```bash +# Generate tests from a goal definition +python -m framework test-generate goal.json + +# Interactively approve generated tests +python -m framework test-approve + +# Run tests against an agent +python -m framework test-run --parallel 4 + +# Debug failed tests +python -m framework test-debug + +# List tests by status +python -m framework test-list +``` + +For detailed testing workflows, see the [testing-agent skill](.claude/skills/testing-agent/SKILL.md). + ### Analyzing with Builder ```python diff --git a/core/framework/__init__.py b/core/framework/__init__.py index 85a425b7..1091f55e 100644 --- a/core/framework/__init__.py +++ b/core/framework/__init__.py @@ -10,6 +10,16 @@ choice the agent makes is captured with: - Whether that was good or bad (evaluated post-hoc) This gives the Builder LLM the information it needs to improve agent behavior. + +## Testing Framework + +The framework includes a Goal-Based Testing system (Goal → Agent → Eval): +- Generate tests from Goal success_criteria and constraints +- Mandatory user approval before tests are stored +- Parallel test execution with error categorization +- Debug tools with fix suggestions + +See `framework.testing` for details. 
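+
+Minimal programmatic sketch (paths and goal id are illustrative):
+
+    from framework.testing import TestStorage, ParallelTestRunner, ParallelConfig
+    from framework.testing.parallel import AgentFactory
+
+    storage = TestStorage("data/tests/my-goal")
+    tests = storage.get_approved_tests("my-goal")        # approved + modified only
+    runner = ParallelTestRunner(ParallelConfig(num_workers=4), storage)
+    result = runner.run_all(
+        goal_id="my-goal",
+        agent_factory=AgentFactory("exports/my-agent"),  # picklable for workers
+        tests=tests,
+    )
+    print(f"{result.passed}/{result.total} passed")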
""" from framework.schemas.decision import Decision, Option, Outcome, DecisionEvaluation @@ -19,6 +29,21 @@ from framework.builder.query import BuilderQuery from framework.llm import LLMProvider, AnthropicProvider from framework.runner import AgentRunner, AgentOrchestrator +# Testing framework +from framework.testing import ( + Test, + TestResult, + TestSuiteResult, + TestStorage, + ApprovalStatus, + ErrorCategory, + ConstraintTestGenerator, + SuccessCriteriaTestGenerator, + ParallelTestRunner, + ParallelConfig, + DebugTool, +) + __all__ = [ # Schemas "Decision", @@ -38,4 +63,16 @@ __all__ = [ # Runner "AgentRunner", "AgentOrchestrator", + # Testing + "Test", + "TestResult", + "TestSuiteResult", + "TestStorage", + "ApprovalStatus", + "ErrorCategory", + "ConstraintTestGenerator", + "SuccessCriteriaTestGenerator", + "ParallelTestRunner", + "ParallelConfig", + "DebugTool", ] diff --git a/core/framework/cli.py b/core/framework/cli.py index 951a5649..834a8a68 100644 --- a/core/framework/cli.py +++ b/core/framework/cli.py @@ -8,6 +8,14 @@ Usage: python -m core list exports/ python -m core dispatch exports/ --input '{"key": "value"}' python -m core shell exports/my-agent + +Testing commands: + python -m core test-generate goal.json + python -m core test-approve + python -m core test-run --goal + python -m core test-debug + python -m core test-list + python -m core test-stats """ import argparse @@ -20,7 +28,7 @@ def main(): ) parser.add_argument( "--model", - default="claude-sonnet-4-20250514", + default="claude-haiku-4-5-20251001", help="Anthropic model to use", ) @@ -30,6 +38,10 @@ def main(): from framework.runner.cli import register_commands register_commands(subparsers) + # Register testing commands (test-generate, test-approve, test-run, test-debug, etc.) + from framework.testing.cli import register_testing_commands + register_testing_commands(subparsers) + args = parser.parse_args() if hasattr(args, "func"): diff --git a/core/framework/graph/edge.py b/core/framework/graph/edge.py index b05cb887..bded676b 100644 --- a/core/framework/graph/edge.py +++ b/core/framework/graph/edge.py @@ -340,7 +340,7 @@ class GraphSpec(BaseModel): ) # Default LLM settings - default_model: str = "claude-sonnet-4-20250514" + default_model: str = "claude-haiku-4-5-20251001" max_tokens: int = 1024 # Execution limits diff --git a/core/framework/graph/goal.py b/core/framework/graph/goal.py index b781946b..bddf7ff7 100644 --- a/core/framework/graph/goal.py +++ b/core/framework/graph/goal.py @@ -76,6 +76,7 @@ class Constraint(BaseModel): description="Category: 'time', 'cost', 'safety', 'scope', 'quality'" ) check: str = Field( + default="", description="How to check: expression, function name, or 'llm_judge'" ) diff --git a/core/framework/llm/anthropic.py b/core/framework/llm/anthropic.py index aa86d5d5..8a69ebc4 100644 --- a/core/framework/llm/anthropic.py +++ b/core/framework/llm/anthropic.py @@ -18,14 +18,14 @@ class AnthropicProvider(LLMProvider): def __init__( self, api_key: str | None = None, - model: str = "claude-sonnet-4-20250514", + model: str = "claude-haiku-4-5-20251001", ): """ Initialize the Anthropic provider. Args: api_key: Anthropic API key. If not provided, uses ANTHROPIC_API_KEY env var. 
- model: Model to use (default: claude-sonnet-4-20250514) + model: Model to use (default: claude-haiku-4-5-20251001) """ self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY") if not self.api_key: diff --git a/core/framework/mcp/agent_builder_server.py b/core/framework/mcp/agent_builder_server.py index b31ab1b7..41dd7957 100644 --- a/core/framework/mcp/agent_builder_server.py +++ b/core/framework/mcp/agent_builder_server.py @@ -9,6 +9,7 @@ Usage: import json from datetime import datetime +from pathlib import Path from typing import Annotated from mcp.server import FastMCP @@ -16,6 +17,15 @@ from mcp.server import FastMCP from framework.graph import Goal, SuccessCriterion, Constraint, NodeSpec, EdgeSpec, EdgeCondition from framework.graph.edge import GraphSpec +# Testing framework imports +from framework.testing.test_case import Test, ApprovalStatus, TestType +from framework.testing.test_storage import TestStorage +from framework.testing.constraint_gen import ConstraintTestGenerator +from framework.testing.success_gen import SuccessCriteriaTestGenerator +from framework.testing.approval_types import ApprovalRequest, ApprovalAction +from framework.testing.debug_tool import DebugTool +from framework.testing.parallel import AgentFactory + # Initialize MCP server mcp = FastMCP("agent-builder") @@ -1408,6 +1418,387 @@ def simulate_plan_execution( }, indent=2) +# ============================================================================= +# TESTING TOOLS (Goal-Based Evaluation) +# ============================================================================= + +# Session storage for pending tests (not yet persisted) +_pending_tests: dict[str, list[Test]] = {} + +# Default storage path for tests +DEFAULT_TEST_STORAGE_PATH = Path("data/tests") + + +@mcp.tool() +def generate_constraint_tests( + goal_id: Annotated[str, "ID of the goal to generate tests for"], + goal_json: Annotated[str, """JSON string of the Goal object. Constraint fields: +- id: string (required) +- description: string (required) +- constraint_type: "hard" or "soft" (required) +- category: string (optional, default: "general") +- check: string (optional, how to validate: "llm_judge", expression, or function name)"""], +) -> str: + """ + Generate constraint tests for a goal. + + Returns proposals for user approval. Tests are NOT persisted until approved. + """ + try: + goal = Goal.model_validate_json(goal_json) + except Exception as e: + return json.dumps({"error": f"Invalid goal JSON: {e}"}) + + # Get LLM provider + try: + from framework.llm import AnthropicProvider + llm = AnthropicProvider() + except Exception as e: + return json.dumps({"error": f"Failed to initialize LLM: {e}"}) + + # Generate tests + generator = ConstraintTestGenerator(llm) + tests = generator.generate(goal) + + # Store as pending (not persisted yet) + _pending_tests[goal_id] = tests + + return json.dumps({ + "goal_id": goal_id, + "generated_count": len(tests), + "tests": [ + { + "id": t.id, + "test_name": t.test_name, + "parent_criteria_id": t.parent_criteria_id, + "description": t.description, + "confidence": t.llm_confidence, + "test_code_preview": t.test_code[:500] + "..." 
if len(t.test_code) > 500 else t.test_code, + } + for t in tests + ], + "next_step": "Call approve_tests to approve, modify, or reject each test", + }) + + +@mcp.tool() +def generate_success_tests( + goal_id: Annotated[str, "ID of the goal to generate tests for"], + goal_json: Annotated[str, "JSON string of the Goal object"], + node_names: Annotated[str, "Comma-separated list of agent node names"] = "", + tool_names: Annotated[str, "Comma-separated list of available tool names"] = "", +) -> str: + """ + Generate success criteria tests for a goal. + + Should be called during Eval stage after agent exists. + Returns proposals for user approval. + """ + try: + goal = Goal.model_validate_json(goal_json) + except Exception as e: + return json.dumps({"error": f"Invalid goal JSON: {e}"}) + + # Get LLM provider + try: + from framework.llm import AnthropicProvider + llm = AnthropicProvider() + except Exception as e: + return json.dumps({"error": f"Failed to initialize LLM: {e}"}) + + # Parse node/tool names + nodes = [n.strip() for n in node_names.split(",") if n.strip()] + tools = [t.strip() for t in tool_names.split(",") if t.strip()] + + # Generate tests + generator = SuccessCriteriaTestGenerator(llm) + tests = generator.generate(goal, node_names=nodes, tool_names=tools) + + # Add to pending (may have constraint tests already) + if goal_id in _pending_tests: + _pending_tests[goal_id].extend(tests) + else: + _pending_tests[goal_id] = tests + + return json.dumps({ + "goal_id": goal_id, + "generated_count": len(tests), + "tests": [ + { + "id": t.id, + "test_name": t.test_name, + "parent_criteria_id": t.parent_criteria_id, + "description": t.description, + "confidence": t.llm_confidence, + "test_code_preview": t.test_code[:500] + "..." if len(t.test_code) > 500 else t.test_code, + } + for t in tests + ], + "next_step": "Call approve_tests to approve, modify, or reject each test", + }) + + +@mcp.tool() +def approve_tests( + goal_id: Annotated[str, "ID of the goal"], + approvals: Annotated[str, "JSON array of approval decisions"], +) -> str: + """ + Approve, reject, or modify generated tests. 
+
+    Approvals format:
+    [
+        {"test_id": "...", "action": "approve"},
+        {"test_id": "...", "action": "modify", "modified_code": "..."},
+        {"test_id": "...", "action": "reject", "reason": "..."},
+        {"test_id": "...", "action": "skip"}
+    ]
+
+    Actions: approve, modify (requires modified_code), reject (requires reason), skip
+    """
+    if goal_id not in _pending_tests:
+        return json.dumps({"error": f"No pending tests for goal {goal_id}"})
+
+    try:
+        approvals_list = json.loads(approvals)
+    except json.JSONDecodeError as e:
+        return json.dumps({"error": f"Invalid approvals JSON: {e}"})
+
+    # Create storage
+    storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
+
+    # Build approval requests
+    requests = []
+    for a in approvals_list:
+        try:
+            action = ApprovalAction(a.get("action", "skip"))
+            requests.append(ApprovalRequest(
+                test_id=a["test_id"],
+                action=action,
+                modified_code=a.get("modified_code"),
+                reason=a.get("reason"),
+                approved_by="mcp_user",
+            ))
+        except (KeyError, ValueError) as e:
+            return json.dumps({"error": f"Invalid approval entry: {e}"})
+
+    # Find and save approved tests
+    pending = {t.id: t for t in _pending_tests[goal_id]}
+
+    results = []
+    for req in requests:
+        test = pending.get(req.test_id)
+        if not test:
+            results.append({"test_id": req.test_id, "error": "Not found in pending"})
+            continue
+
+        if req.action == ApprovalAction.APPROVE:
+            test.approve(req.approved_by)
+            storage.save_test(test)
+            results.append({"test_id": req.test_id, "status": "approved"})
+
+        elif req.action == ApprovalAction.MODIFY:
+            if req.modified_code:
+                test.modify(req.modified_code, req.approved_by)
+                storage.save_test(test)
+                results.append({"test_id": req.test_id, "status": "modified"})
+            else:
+                results.append({"test_id": req.test_id, "error": "modified_code required"})
+
+        elif req.action == ApprovalAction.REJECT:
+            test.reject(req.reason or "No reason provided")
+            storage.save_test(test)
+            results.append({"test_id": req.test_id, "status": "rejected"})
+
+        elif req.action == ApprovalAction.SKIP:
+            results.append({"test_id": req.test_id, "status": "skipped"})
+
+    # Clear pending only for tests that were actually approved, modified, or
+    # rejected; skipped tests stay in the pending queue, per approval semantics
+    processed_ids = {
+        r["test_id"] for r in results
+        if r.get("status") in ("approved", "modified", "rejected")
+    }
+    _pending_tests[goal_id] = [t for t in _pending_tests[goal_id] if t.id not in processed_ids]
+
+    # Clean up if empty
+    if not _pending_tests[goal_id]:
+        del _pending_tests[goal_id]
+
+    return json.dumps({"goal_id": goal_id, "results": results})
+
+
+@mcp.tool()
+def run_tests(
+    goal_id: Annotated[str, "ID of the goal to test"],
+    agent_path: Annotated[str, "Path to the agent export folder"],
+    test_types: Annotated[str, 'JSON array of test types: ["constraint", "outcome", "edge_case", "all"]'] = '["all"]',
+    parallel: Annotated[int, "Number of parallel workers (0 for sequential)"] = 0,
+    fail_fast: Annotated[bool, "Stop on first failure"] = False,
+) -> str:
+    """
+    Run evaluation tests for a goal.
+
+    Returns pass/fail summary with detailed results for each test.
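+
+    Example response shape (values illustrative):
+        {
+          "goal_id": "...",
+          "overall_passed": false,
+          "summary": {"total": 6, "passed": 5, "failed": 1, "pass_rate": "83.3%"},
+          "duration_ms": 4521,
+          "results": [{"test_id": "...", "passed": true, ...}]
+        }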
+ """ + from framework.testing.parallel import ParallelTestRunner, ParallelConfig + + # Parse test types + try: + types_list = json.loads(test_types) + except json.JSONDecodeError: + types_list = ["all"] + + # Load storage + storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id) + + # Get approved tests + tests = storage.get_approved_tests(goal_id) + + # Filter by type if not "all" + if "all" not in types_list: + type_map = { + "constraint": TestType.CONSTRAINT, + "outcome": TestType.SUCCESS_CRITERIA, + "edge_case": TestType.EDGE_CASE, + } + filter_types = {type_map.get(t) for t in types_list if t in type_map} + tests = [t for t in tests if t.test_type in filter_types] + + if not tests: + return json.dumps({ + "goal_id": goal_id, + "error": "No approved tests found", + "hint": "Generate and approve tests first using generate_constraint_tests and approve_tests", + }) + + # Configure runner + config = ParallelConfig( + num_workers=parallel if parallel > 0 else 1, + fail_fast=fail_fast, + ) + + # Run tests - use AgentFactory for picklable parallel execution + runner = ParallelTestRunner(config, storage) + result = runner.run_all( + goal_id=goal_id, + agent_factory=AgentFactory(agent_path), + tests=tests, + ) + + return json.dumps({ + "goal_id": goal_id, + "overall_passed": result.all_passed, + "summary": { + "total": result.total, + "passed": result.passed, + "failed": result.failed, + "pass_rate": f"{result.pass_rate:.1%}", + }, + "duration_ms": result.duration_ms, + "results": [r.summary_dict() for r in result.results], + }) + + +@mcp.tool() +def debug_test( + goal_id: Annotated[str, "ID of the goal"], + test_id: Annotated[str, "ID of the failed test"], + run_id: Annotated[str, "Optional Runtime run ID for detailed logs"] = "", +) -> str: + """ + Get detailed debug info for a failed test. + + Includes error categorization, logs, and fix suggestions. + """ + storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id) + + # Optionally load runtime storage + runtime_storage = None + try: + from framework.storage.backend import FileStorage + runtime_storage = FileStorage(f"data/runtime/{goal_id}") + except Exception: + pass + + debug_tool = DebugTool(storage, runtime_storage) + info = debug_tool.analyze(goal_id, test_id, run_id or None) + + return json.dumps(info.to_dict(), indent=2, default=str) + + +@mcp.tool() +def list_tests( + goal_id: Annotated[str, "ID of the goal"], + status: Annotated[str, "Filter by approval status: pending, approved, modified, rejected, all"] = "all", +) -> str: + """ + List tests for a goal. + + Returns test metadata without full code (use debug_test for details). + """ + storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id) + tests = storage.get_tests_by_goal(goal_id) + + # Filter by status + if status != "all": + try: + filter_status = ApprovalStatus(status) + tests = [t for t in tests if t.approval_status == filter_status] + except ValueError: + pass + + return json.dumps({ + "goal_id": goal_id, + "total": len(tests), + "tests": [ + { + "id": t.id, + "test_name": t.test_name, + "test_type": t.test_type.value, + "parent_criteria_id": t.parent_criteria_id, + "approval_status": t.approval_status.value, + "last_result": t.last_result, + "confidence": t.llm_confidence, + } + for t in tests + ], + }) + + +@mcp.tool() +def get_pending_tests( + goal_id: Annotated[str, "ID of the goal"], +) -> str: + """ + Get pending tests awaiting approval. + + Returns tests that have been generated but not yet approved. 
+ """ + if goal_id not in _pending_tests: + return json.dumps({ + "goal_id": goal_id, + "pending_count": 0, + "tests": [], + }) + + tests = _pending_tests[goal_id] + return json.dumps({ + "goal_id": goal_id, + "pending_count": len(tests), + "tests": [ + { + "id": t.id, + "test_name": t.test_name, + "test_type": t.test_type.value, + "parent_criteria_id": t.parent_criteria_id, + "description": t.description, + "confidence": t.llm_confidence, + "test_code": t.test_code, + "input": t.input, + "expected_output": t.expected_output, + } + for t in tests + ], + }) + + # ============================================================================= # PLAN LOADING AND EXECUTION # ============================================================================= diff --git a/core/framework/runner/cli.py b/core/framework/runner/cli.py index e11a7a07..4ae6ed5f 100644 --- a/core/framework/runner/cli.py +++ b/core/framework/runner/cli.py @@ -189,7 +189,7 @@ def cmd_run(args: argparse.Namespace) -> int: runner = AgentRunner.load( args.agent_path, mock_mode=args.mock, - model=getattr(args, "model", "claude-sonnet-4-20250514"), + model=getattr(args, "model", "claude-haiku-4-5-20251001"), ) except FileNotFoundError as e: print(f"Error: {e}", file=sys.stderr) diff --git a/core/framework/runner/orchestrator.py b/core/framework/runner/orchestrator.py index 2f322aae..4b2c0142 100644 --- a/core/framework/runner/orchestrator.py +++ b/core/framework/runner/orchestrator.py @@ -57,7 +57,7 @@ class AgentOrchestrator: def __init__( self, llm: LLMProvider | None = None, - model: str = "claude-sonnet-4-20250514", + model: str = "claude-haiku-4-5-20251001", ): """ Initialize the orchestrator. diff --git a/core/framework/runner/runner.py b/core/framework/runner/runner.py index 60228ab4..87f1ea9a 100644 --- a/core/framework/runner/runner.py +++ b/core/framework/runner/runner.py @@ -171,7 +171,7 @@ class AgentRunner: goal: Goal, mock_mode: bool = False, storage_path: Path | None = None, - model: str = "claude-sonnet-4-20250514", + model: str = "claude-haiku-4-5-20251001", ): """ Initialize the runner (use AgentRunner.load() instead). @@ -216,7 +216,7 @@ class AgentRunner: agent_path: str | Path, mock_mode: bool = False, storage_path: Path | None = None, - model: str = "claude-sonnet-4-20250514", + model: str = "claude-haiku-4-5-20251001", ) -> "AgentRunner": """ Load an agent from an export folder. diff --git a/core/framework/runtime/core.py b/core/framework/runtime/core.py index 5ed3d287..70acdde1 100644 --- a/core/framework/runtime/core.py +++ b/core/framework/runtime/core.py @@ -9,12 +9,15 @@ handles all the structured logging. 
from datetime import datetime from typing import Any from pathlib import Path +import logging import uuid from framework.schemas.decision import Decision, Option, Outcome, DecisionType from framework.schemas.run import Run, RunStatus from framework.storage.backend import FileStorage +logger = logging.getLogger(__name__) + class Runtime: """ @@ -100,7 +103,10 @@ class Runtime: output_data: Final output of the run """ if self._current_run is None: - raise RuntimeError("No run in progress") + # Gracefully handle case where run was already ended or never started + # This can happen during exception handling cascades + logger.warning("end_run called but no run in progress (already ended or never started)") + return status = RunStatus.COMPLETED if success else RunStatus.FAILED self._current_run.output_data = output_data or {} @@ -158,10 +164,12 @@ class Runtime: context: Additional context available when deciding Returns: - The decision ID (use this to record outcome later) + The decision ID (use this to record outcome later), or empty string if no run in progress """ if self._current_run is None: - raise RuntimeError("No run in progress. Call start_run() first.") + # Gracefully handle case where run ended during exception handling + logger.warning(f"decide called but no run in progress: {intent}") + return "" # Build Option objects option_objects = [] @@ -220,7 +228,10 @@ class Runtime: latency_ms: Time taken in milliseconds """ if self._current_run is None: - raise RuntimeError("No run in progress") + # Gracefully handle case where run ended during exception handling + # This can happen in cascading error scenarios + logger.warning(f"record_outcome called but no run in progress (decision_id={decision_id})") + return outcome = Outcome( success=success, @@ -258,10 +269,13 @@ class Runtime: suggested_fix: What might fix it (if known) Returns: - The problem ID + The problem ID, or empty string if no run in progress """ if self._current_run is None: - raise RuntimeError("No run in progress") + # Gracefully handle case where run ended during exception handling + # Log the problem since we can't store it, then return empty ID + logger.warning(f"report_problem called but no run in progress: [{severity}] {description}") + return "" return self._current_run.add_problem( severity=severity, diff --git a/core/framework/testing/__init__.py b/core/framework/testing/__init__.py new file mode 100644 index 00000000..c7ec606a --- /dev/null +++ b/core/framework/testing/__init__.py @@ -0,0 +1,144 @@ +""" +Goal-Based Testing Framework + +A three-stage framework (Goal → Agent → Eval) where tests are LLM-generated +from success_criteria and constraints, with mandatory user approval. + +## Core Flow + +1. **Goal Stage**: Define success_criteria and constraints, generate constraint tests +2. **Agent Stage**: Build nodes + edges, run constraint tests during development +3. **Eval Stage**: Generate success_criteria tests, run all tests, debug failures + +## Key Components + +- **Schemas**: Test, TestResult, TestSuiteResult, ApprovalStatus, ErrorCategory +- **Storage**: TestStorage for persisting tests and results +- **Generation**: LLM-based test generation from Goal criteria +- **Approval**: Mandatory user approval workflow (CLI and programmatic) +- **Runner**: Parallel test execution with pytest-xdist inspired design +- **Debug**: Error categorization and fix suggestions + +## MCP Tools + +Testing tools are integrated into the main agent_builder_server.py (not a separate server). 
+This ensures the building_agent skill has access to all testing functionality: +- generate_constraint_tests, generate_success_tests +- approve_tests, run_tests, debug_test +- list_tests, get_pending_tests + +## Usage + +```python +from framework.testing import ( + Test, TestResult, TestStorage, + ConstraintTestGenerator, SuccessCriteriaTestGenerator, + ParallelTestRunner, DebugTool, +) + +# Generate tests +generator = ConstraintTestGenerator(llm) +tests = generator.generate(goal) + +# Approve tests (required) +for test in tests: + test.approve("user") + storage.save_test(test) + +# Run tests +runner = ParallelTestRunner() +result = runner.run_all(goal_id, agent_factory, tests) + +# Debug failures +debug = DebugTool(storage) +info = debug.analyze(goal_id, test_id) +``` + +## CLI Commands + +```bash +python -m framework test-generate goal.json +python -m framework test-approve +python -m framework test-run --goal +python -m framework test-debug +``` +""" + +# Schemas +from framework.testing.test_case import ( + ApprovalStatus, + TestType, + Test, +) +from framework.testing.test_result import ( + ErrorCategory, + TestResult, + TestSuiteResult, +) + +# Storage +from framework.testing.test_storage import TestStorage + +# Generation +from framework.testing.constraint_gen import ConstraintTestGenerator +from framework.testing.success_gen import SuccessCriteriaTestGenerator +from framework.testing.prompts import ( + CONSTRAINT_TEST_PROMPT, + SUCCESS_CRITERIA_TEST_PROMPT, +) + +# Approval +from framework.testing.approval_types import ( + ApprovalAction, + ApprovalRequest, + ApprovalResult, + BatchApprovalRequest, + BatchApprovalResult, +) +from framework.testing.approval_cli import interactive_approval, batch_approval + +# Runner +from framework.testing.executor import TestExecutor +from framework.testing.parallel import ParallelTestRunner, ParallelConfig +from framework.testing.categorizer import ErrorCategorizer + +# Debug +from framework.testing.debug_tool import DebugTool, DebugInfo + +# CLI +from framework.testing.cli import register_testing_commands + +__all__ = [ + # Schemas + "ApprovalStatus", + "TestType", + "Test", + "ErrorCategory", + "TestResult", + "TestSuiteResult", + # Storage + "TestStorage", + # Generation + "ConstraintTestGenerator", + "SuccessCriteriaTestGenerator", + "CONSTRAINT_TEST_PROMPT", + "SUCCESS_CRITERIA_TEST_PROMPT", + # Approval + "ApprovalAction", + "ApprovalRequest", + "ApprovalResult", + "BatchApprovalRequest", + "BatchApprovalResult", + "interactive_approval", + "batch_approval", + # Runner + "TestExecutor", + "ParallelTestRunner", + "ParallelConfig", + "ErrorCategorizer", + # Debug + "DebugTool", + "DebugInfo", + # CLI + "register_testing_commands", +] diff --git a/core/framework/testing/approval_cli.py b/core/framework/testing/approval_cli.py new file mode 100644 index 00000000..515222a0 --- /dev/null +++ b/core/framework/testing/approval_cli.py @@ -0,0 +1,295 @@ +""" +Interactive CLI for reviewing and approving generated tests. + +LLM-generated tests are NEVER created without user approval. +This CLI provides the interactive approval workflow. 
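+
+Typical usage (a minimal sketch; the storage path and goal_id are
+illustrative):
+
+    storage = TestStorage(Path("data/tests") / goal_id)
+    pending = storage.get_pending_tests(goal_id)
+    results = interactive_approval(pending, storage)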
+""" + +import json +import tempfile +import subprocess +import os +from typing import Callable + +from framework.testing.test_case import Test, ApprovalStatus +from framework.testing.test_storage import TestStorage +from framework.testing.approval_types import ( + ApprovalAction, + ApprovalRequest, + ApprovalResult, + BatchApprovalResult, +) + + +def interactive_approval( + tests: list[Test], + storage: TestStorage, + on_progress: Callable[[int, int], None] | None = None, +) -> list[ApprovalResult]: + """ + Interactive CLI flow for reviewing generated tests. + + Displays each test and allows user to: + - [a]pprove: Accept as-is + - [r]eject: Decline with reason + - [e]dit: Modify before accepting + - [s]kip: Leave pending (decide later) + + Args: + tests: List of pending tests to review + storage: TestStorage for saving decisions + on_progress: Optional callback(current, total) for progress tracking + + Returns: + List of ApprovalResult for each processed test + """ + results = [] + total = len(tests) + + for i, test in enumerate(tests, 1): + if on_progress: + on_progress(i, total) + + # Display test + _display_test(test, i, total) + + # Get user action + action = _get_user_action() + + # Process action + result = _process_action(test, action, storage) + results.append(result) + + print() # Blank line between tests + + return results + + +def batch_approval( + goal_id: str, + requests: list[ApprovalRequest], + storage: TestStorage, +) -> BatchApprovalResult: + """ + Process multiple approval requests at once. + + Used by MCP interface for programmatic approval. + + Args: + goal_id: Goal ID for the tests + requests: List of approval requests + storage: TestStorage for saving decisions + + Returns: + BatchApprovalResult with counts and individual results + """ + results = [] + counts = { + "approved": 0, + "modified": 0, + "rejected": 0, + "skipped": 0, + "errors": 0, + } + + for req in requests: + # Validate request + valid, error = req.validate_action() + if not valid: + results.append(ApprovalResult.error_result( + req.test_id, req.action, error or "Invalid request" + )) + counts["errors"] += 1 + continue + + # Load test + test = storage.load_test(goal_id, req.test_id) + if not test: + results.append(ApprovalResult.error_result( + req.test_id, req.action, f"Test {req.test_id} not found" + )) + counts["errors"] += 1 + continue + + # Apply action + try: + if req.action == ApprovalAction.APPROVE: + test.approve(req.approved_by) + counts["approved"] += 1 + elif req.action == ApprovalAction.MODIFY: + test.modify(req.modified_code or test.test_code, req.approved_by) + counts["modified"] += 1 + elif req.action == ApprovalAction.REJECT: + test.reject(req.reason or "No reason provided") + counts["rejected"] += 1 + elif req.action == ApprovalAction.SKIP: + counts["skipped"] += 1 + + # Save if not skipped + if req.action != ApprovalAction.SKIP: + storage.update_test(test) + + results.append(ApprovalResult.success_result( + req.test_id, req.action, f"Test {req.action.value}d successfully" + )) + + except Exception as e: + results.append(ApprovalResult.error_result( + req.test_id, req.action, str(e) + )) + counts["errors"] += 1 + + return BatchApprovalResult( + goal_id=goal_id, + total=len(requests), + approved=counts["approved"], + modified=counts["modified"], + rejected=counts["rejected"], + skipped=counts["skipped"], + errors=counts["errors"], + results=results, + ) + + +def _display_test(test: Test, index: int, total: int) -> None: + """Display a test for review.""" + separator = "=" * 60 + + 
print(f"\n{separator}") + print(f"[{index}/{total}] {test.test_name}") + print(f"Type: {test.test_type.value}") + print(f"Criteria: {test.parent_criteria_id}") + print(f"Confidence: {test.llm_confidence * 100:.0f}%") + print(separator) + + print(f"\nDescription: {test.description}") + + if test.input: + print(f"\nInput:") + print(json.dumps(test.input, indent=2)) + + if test.expected_output: + print(f"\nExpected Output:") + print(json.dumps(test.expected_output, indent=2)) + + print(f"\nTest Code:") + print("-" * 40) + print(test.test_code) + print("-" * 40) + + print("\n[a]pprove [r]eject [e]dit [s]kip") + + +def _get_user_action() -> ApprovalAction: + """Get user's choice for action.""" + while True: + choice = input("Your choice: ").strip().lower() + + if choice == "a": + return ApprovalAction.APPROVE + elif choice == "r": + return ApprovalAction.REJECT + elif choice == "e": + return ApprovalAction.MODIFY + elif choice == "s": + return ApprovalAction.SKIP + else: + print("Invalid choice. Please enter a, r, e, or s.") + + +def _process_action( + test: Test, + action: ApprovalAction, + storage: TestStorage, +) -> ApprovalResult: + """Process user's action on a test.""" + try: + if action == ApprovalAction.APPROVE: + test.approve() + storage.update_test(test) + print("✓ Approved") + return ApprovalResult.success_result(test.id, action, "Approved") + + elif action == ApprovalAction.REJECT: + reason = input("Rejection reason: ").strip() + if not reason: + reason = "No reason provided" + test.reject(reason) + storage.update_test(test) + print(f"✗ Rejected: {reason}") + return ApprovalResult.success_result(test.id, action, f"Rejected: {reason}") + + elif action == ApprovalAction.MODIFY: + edited_code = _edit_test_code(test.test_code) + if edited_code != test.test_code: + test.modify(edited_code) + storage.update_test(test) + print("✓ Modified and approved") + return ApprovalResult.success_result(test.id, action, "Modified and approved") + else: + # No changes made, treat as approve + test.approve() + storage.update_test(test) + print("✓ Approved (no modifications)") + return ApprovalResult.success_result(test.id, ApprovalAction.APPROVE, "No modifications made") + + elif action == ApprovalAction.SKIP: + print("⏭ Skipped (remains pending)") + return ApprovalResult.success_result(test.id, action, "Skipped") + + else: + return ApprovalResult.error_result(test.id, action, f"Unknown action: {action}") + + except Exception as e: + return ApprovalResult.error_result(test.id, action, str(e)) + + +def _edit_test_code(code: str) -> str: + """ + Open test code in user's editor for modification. + + Uses $EDITOR environment variable, falls back to vim/nano. 
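+
+    Returns the edited code; if the editor cannot be launched or exits
+    with an error, the original code is returned unchanged.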
+ """ + editor = os.environ.get("EDITOR", "vim") + + # Try to find an available editor + if not _command_exists(editor): + for fallback in ["nano", "vi", "notepad"]: + if _command_exists(fallback): + editor = fallback + break + + # Create temp file with code + with tempfile.NamedTemporaryFile( + mode="w", + suffix=".py", + delete=False + ) as f: + f.write(code) + temp_path = f.name + + try: + # Open editor + subprocess.run([editor, temp_path], check=True) + + # Read edited code + with open(temp_path) as f: + return f.read() + except subprocess.CalledProcessError: + print("Editor failed, keeping original code") + return code + except FileNotFoundError: + print(f"Editor '{editor}' not found, keeping original code") + return code + finally: + # Clean up temp file + try: + os.unlink(temp_path) + except OSError: + pass + + +def _command_exists(cmd: str) -> bool: + """Check if a command exists in PATH.""" + from shutil import which + return which(cmd) is not None diff --git a/core/framework/testing/approval_types.py b/core/framework/testing/approval_types.py new file mode 100644 index 00000000..f1f2ea54 --- /dev/null +++ b/core/framework/testing/approval_types.py @@ -0,0 +1,130 @@ +""" +Types for the approval workflow. + +These types are used for both interactive CLI approval and +programmatic/MCP-based approval. +""" + +from enum import Enum +from datetime import datetime +from typing import Any + +from pydantic import BaseModel, Field + + +class ApprovalAction(str, Enum): + """Actions a user can take on a generated test.""" + APPROVE = "approve" # Accept as-is + MODIFY = "modify" # Accept with modifications + REJECT = "reject" # Decline + SKIP = "skip" # Leave pending (decide later) + + +class ApprovalRequest(BaseModel): + """ + Request to approve/modify/reject a generated test. + + Used by both CLI and MCP interfaces. + """ + test_id: str + action: ApprovalAction + modified_code: str | None = Field( + default=None, + description="New code if action is MODIFY" + ) + reason: str | None = Field( + default=None, + description="Rejection reason if action is REJECT" + ) + approved_by: str = "user" + + def validate_action(self) -> tuple[bool, str | None]: + """ + Validate that the request has required fields for its action. + + Returns: + Tuple of (is_valid, error_message) + """ + if self.action == ApprovalAction.MODIFY and not self.modified_code: + return False, "modified_code is required for MODIFY action" + if self.action == ApprovalAction.REJECT and not self.reason: + return False, "reason is required for REJECT action" + return True, None + + +class ApprovalResult(BaseModel): + """ + Result of processing an approval request. + """ + test_id: str + action: ApprovalAction + success: bool + message: str | None = None + error: str | None = None + timestamp: datetime = Field(default_factory=datetime.now) + + @classmethod + def success_result( + cls, test_id: str, action: ApprovalAction, message: str | None = None + ) -> "ApprovalResult": + """Create a successful result.""" + return cls( + test_id=test_id, + action=action, + success=True, + message=message, + ) + + @classmethod + def error_result( + cls, test_id: str, action: ApprovalAction, error: str + ) -> "ApprovalResult": + """Create an error result.""" + return cls( + test_id=test_id, + action=action, + success=False, + error=error, + ) + + +class BatchApprovalRequest(BaseModel): + """ + Request to approve multiple tests at once. + + Useful for MCP interface where user reviews all tests and submits decisions. 
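+
+    Example payload (a sketch; test IDs and the reason are illustrative):
+
+        {
+          "goal_id": "goal_123",
+          "approvals": [
+            {"test_id": "test_ab12cd34", "action": "approve"},
+            {"test_id": "test_ef56ab78", "action": "reject", "reason": "Duplicate coverage"}
+          ]
+        }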
+ """ + goal_id: str + approvals: list[ApprovalRequest] + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + return { + "goal_id": self.goal_id, + "approvals": [a.model_dump() for a in self.approvals], + } + + +class BatchApprovalResult(BaseModel): + """ + Result of processing a batch approval request. + """ + goal_id: str + total: int + approved: int + modified: int + rejected: int + skipped: int + errors: int + results: list[ApprovalResult] + + def summary(self) -> str: + """Return a summary string.""" + return ( + f"Processed {self.total} tests: " + f"{self.approved} approved, " + f"{self.modified} modified, " + f"{self.rejected} rejected, " + f"{self.skipped} skipped, " + f"{self.errors} errors" + ) diff --git a/core/framework/testing/categorizer.py b/core/framework/testing/categorizer.py new file mode 100644 index 00000000..eb3fbf23 --- /dev/null +++ b/core/framework/testing/categorizer.py @@ -0,0 +1,260 @@ +""" +Error categorization for test failures. + +Categorizes errors to guide iteration strategy: +- LOGIC_ERROR: Goal definition is wrong → update success_criteria/constraints +- IMPLEMENTATION_ERROR: Code bug → fix nodes/edges in Agent stage +- EDGE_CASE: New scenario discovered → add new test only +""" + +import re +from typing import Any + +from framework.testing.test_result import ErrorCategory, TestResult + + +class ErrorCategorizer: + """ + Categorize test failures for guiding iteration. + + Uses pattern matching heuristics to classify errors. + Each category has different implications for how to fix. + """ + + # Patterns indicating goal/criteria definition is wrong + LOGIC_ERROR_PATTERNS = [ + r"goal not achieved", + r"constraint violated:?\s*core", + r"fundamental assumption", + r"success criteria mismatch", + r"criteria not met", + r"expected behavior incorrect", + r"specification error", + r"requirement mismatch", + ] + + # Patterns indicating code/implementation bug + IMPLEMENTATION_ERROR_PATTERNS = [ + r"TypeError", + r"AttributeError", + r"KeyError", + r"IndexError", + r"ValueError", + r"NameError", + r"ImportError", + r"ModuleNotFoundError", + r"RuntimeError", + r"NullPointerException", + r"NoneType.*has no attribute", + r"tool call failed", + r"node execution error", + r"agent execution failed", + r"assertion.*failed", + r"AssertionError", + r"expected.*but got", + r"unexpected.*type", + r"missing required", + r"invalid.*argument", + ] + + # Patterns indicating edge case / new scenario + EDGE_CASE_PATTERNS = [ + r"boundary condition", + r"timeout", + r"connection.*timeout", + r"request.*timeout", + r"unexpected format", + r"unexpected response", + r"rare input", + r"empty.*result", + r"null.*value", + r"empty.*response", + r"no.*results", + r"rate.*limit", + r"quota.*exceeded", + r"retry.*exhausted", + r"unicode.*error", + r"encoding.*error", + r"special.*character", + ] + + def __init__(self): + """Initialize categorizer with compiled patterns.""" + self._logic_patterns = [ + re.compile(p, re.IGNORECASE) for p in self.LOGIC_ERROR_PATTERNS + ] + self._impl_patterns = [ + re.compile(p, re.IGNORECASE) for p in self.IMPLEMENTATION_ERROR_PATTERNS + ] + self._edge_patterns = [ + re.compile(p, re.IGNORECASE) for p in self.EDGE_CASE_PATTERNS + ] + + def categorize(self, result: TestResult) -> ErrorCategory | None: + """ + Categorize a test failure. 
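+
+        Patterns are checked in priority order: logic errors first, then
+        implementation errors, then edge cases; a failure that matches no
+        pattern defaults to IMPLEMENTATION_ERROR.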
+ + Args: + result: TestResult to categorize + + Returns: + ErrorCategory if test failed, None if passed + """ + if result.passed: + return None + + # Combine error sources for analysis + error_text = self._get_error_text(result) + + # Check patterns in priority order + # Logic errors take precedence (wrong goal definition) + for pattern in self._logic_patterns: + if pattern.search(error_text): + return ErrorCategory.LOGIC_ERROR + + # Then implementation errors (code bugs) + for pattern in self._impl_patterns: + if pattern.search(error_text): + return ErrorCategory.IMPLEMENTATION_ERROR + + # Then edge cases (new scenarios) + for pattern in self._edge_patterns: + if pattern.search(error_text): + return ErrorCategory.EDGE_CASE + + # Default to implementation error (most common) + return ErrorCategory.IMPLEMENTATION_ERROR + + def categorize_with_confidence( + self, result: TestResult + ) -> tuple[ErrorCategory | None, float]: + """ + Categorize with a confidence score. + + Args: + result: TestResult to categorize + + Returns: + Tuple of (category, confidence 0-1) + """ + if result.passed: + return None, 1.0 + + error_text = self._get_error_text(result) + + # Count pattern matches for each category + logic_matches = sum( + 1 for p in self._logic_patterns if p.search(error_text) + ) + impl_matches = sum( + 1 for p in self._impl_patterns if p.search(error_text) + ) + edge_matches = sum( + 1 for p in self._edge_patterns if p.search(error_text) + ) + + total_matches = logic_matches + impl_matches + edge_matches + + if total_matches == 0: + # No pattern matches, default to implementation with low confidence + return ErrorCategory.IMPLEMENTATION_ERROR, 0.3 + + # Calculate confidence based on match dominance + if logic_matches >= impl_matches and logic_matches >= edge_matches: + confidence = logic_matches / total_matches if total_matches > 0 else 0.5 + return ErrorCategory.LOGIC_ERROR, min(0.9, 0.5 + confidence * 0.4) + + if impl_matches >= logic_matches and impl_matches >= edge_matches: + confidence = impl_matches / total_matches if total_matches > 0 else 0.5 + return ErrorCategory.IMPLEMENTATION_ERROR, min(0.9, 0.5 + confidence * 0.4) + + confidence = edge_matches / total_matches if total_matches > 0 else 0.5 + return ErrorCategory.EDGE_CASE, min(0.9, 0.5 + confidence * 0.4) + + def _get_error_text(self, result: TestResult) -> str: + """Extract all error text from a result for analysis.""" + parts = [] + + if result.error_message: + parts.append(result.error_message) + + if result.stack_trace: + parts.append(result.stack_trace) + + # Include log messages + for log in result.runtime_logs: + if log.get("level") in ("ERROR", "CRITICAL", "WARNING"): + parts.append(str(log.get("msg", ""))) + + return " ".join(parts) + + def get_fix_suggestion(self, category: ErrorCategory) -> str: + """ + Get a fix suggestion based on error category. + + Args: + category: ErrorCategory from categorization + + Returns: + Human-readable fix suggestion + """ + suggestions = { + ErrorCategory.LOGIC_ERROR: ( + "Review and update success_criteria or constraints in the Goal definition. " + "The goal specification may not accurately describe the desired behavior." + ), + ErrorCategory.IMPLEMENTATION_ERROR: ( + "Fix the code in agent nodes/edges. " + "There's a bug in the implementation that needs to be corrected." + ), + ErrorCategory.EDGE_CASE: ( + "Add a new test for this edge case scenario. " + "This is a valid scenario that wasn't covered by existing tests." 
+ ), + } + return suggestions.get(category, "Review the test and agent implementation.") + + def get_iteration_guidance(self, category: ErrorCategory) -> dict[str, Any]: + """ + Get detailed iteration guidance based on error category. + + Returns a dict with: + - stage: Which stage to return to (Goal, Agent, Eval) + - action: What action to take + - restart_required: Whether full 3-step flow restart is needed + """ + guidance = { + ErrorCategory.LOGIC_ERROR: { + "stage": "Goal", + "action": "Update success_criteria or constraints", + "restart_required": True, + "description": ( + "The goal definition is incorrect. Update the success criteria " + "or constraints, then restart the full Goal → Agent → Eval flow." + ), + }, + ErrorCategory.IMPLEMENTATION_ERROR: { + "stage": "Agent", + "action": "Fix nodes/edges implementation", + "restart_required": False, + "description": ( + "There's a code bug. Fix the agent implementation, " + "then re-run Eval (skip Goal stage)." + ), + }, + ErrorCategory.EDGE_CASE: { + "stage": "Eval", + "action": "Add new test only", + "restart_required": False, + "description": ( + "This is a new scenario. Add a test for it and continue " + "in the Eval stage." + ), + }, + } + return guidance.get(category, { + "stage": "Unknown", + "action": "Review manually", + "restart_required": False, + "description": "Unable to determine category. Manual review required.", + }) diff --git a/core/framework/testing/cli.py b/core/framework/testing/cli.py new file mode 100644 index 00000000..671c4b79 --- /dev/null +++ b/core/framework/testing/cli.py @@ -0,0 +1,413 @@ +""" +CLI commands for goal-based testing. + +Provides commands: +- test-generate: Generate tests from a goal +- test-approve: Review and approve pending tests +- test-run: Run tests for an agent +- test-debug: Debug a failed test +""" + +import argparse +import json +import sys +from pathlib import Path + +from framework.graph.goal import Goal +from framework.testing.test_case import TestType +from framework.testing.test_storage import TestStorage +from framework.testing.constraint_gen import ConstraintTestGenerator +from framework.testing.success_gen import SuccessCriteriaTestGenerator +from framework.testing.approval_cli import interactive_approval +from framework.testing.parallel import ParallelTestRunner, ParallelConfig, AgentFactory +from framework.testing.debug_tool import DebugTool + + +DEFAULT_STORAGE_PATH = Path("data/tests") + + +def register_testing_commands(subparsers: argparse._SubParsersAction) -> None: + """Register testing CLI commands.""" + + # test-generate + gen_parser = subparsers.add_parser( + "test-generate", + help="Generate tests from goal criteria", + ) + gen_parser.add_argument( + "goal_file", + help="Path to goal JSON file", + ) + gen_parser.add_argument( + "--type", + choices=["constraint", "success", "all"], + default="all", + help="Type of tests to generate", + ) + gen_parser.add_argument( + "--auto-approve", + action="store_true", + help="Skip interactive approval (use with caution)", + ) + gen_parser.add_argument( + "--output", + "-o", + help="Output directory for tests (default: data/tests/)", + ) + gen_parser.set_defaults(func=cmd_test_generate) + + # test-approve + approve_parser = subparsers.add_parser( + "test-approve", + help="Review and approve pending tests", + ) + approve_parser.add_argument( + "goal_id", + help="Goal ID to review tests for", + ) + approve_parser.add_argument( + "--storage", + help="Storage directory (default: data/tests/)", + ) + 
approve_parser.set_defaults(func=cmd_test_approve) + + # test-run + run_parser = subparsers.add_parser( + "test-run", + help="Run tests for an agent", + ) + run_parser.add_argument( + "agent_path", + help="Path to agent export folder", + ) + run_parser.add_argument( + "--goal", + "-g", + required=True, + help="Goal ID to run tests for", + ) + run_parser.add_argument( + "--parallel", + "-p", + type=int, + default=0, + help="Number of parallel workers (0 for sequential)", + ) + run_parser.add_argument( + "--fail-fast", + action="store_true", + help="Stop on first failure", + ) + run_parser.add_argument( + "--type", + choices=["constraint", "success", "edge_case", "all"], + default="all", + help="Type of tests to run", + ) + run_parser.set_defaults(func=cmd_test_run) + + # test-debug + debug_parser = subparsers.add_parser( + "test-debug", + help="Debug a failed test", + ) + debug_parser.add_argument( + "goal_id", + help="Goal ID", + ) + debug_parser.add_argument( + "test_id", + help="Test ID to debug", + ) + debug_parser.add_argument( + "--run-id", + help="Runtime run ID for detailed logs", + ) + debug_parser.set_defaults(func=cmd_test_debug) + + # test-list + list_parser = subparsers.add_parser( + "test-list", + help="List tests for a goal", + ) + list_parser.add_argument( + "goal_id", + help="Goal ID", + ) + list_parser.add_argument( + "--status", + choices=["pending", "approved", "modified", "rejected", "all"], + default="all", + help="Filter by approval status", + ) + list_parser.set_defaults(func=cmd_test_list) + + # test-stats + stats_parser = subparsers.add_parser( + "test-stats", + help="Show test statistics for a goal", + ) + stats_parser.add_argument( + "goal_id", + help="Goal ID", + ) + stats_parser.set_defaults(func=cmd_test_stats) + + +def cmd_test_generate(args: argparse.Namespace) -> int: + """Generate tests from a goal file.""" + # Load goal + goal_path = Path(args.goal_file) + if not goal_path.exists(): + print(f"Error: Goal file not found: {goal_path}") + return 1 + + with open(goal_path) as f: + goal = Goal.model_validate_json(f.read()) + + print(f"Loaded goal: {goal.name} ({goal.id})") + + # Determine output directory + output_dir = Path(args.output) if args.output else DEFAULT_STORAGE_PATH / goal.id + storage = TestStorage(output_dir) + + # Get LLM provider + try: + from framework.llm import AnthropicProvider + llm = AnthropicProvider() + except Exception as e: + print(f"Error: Failed to initialize LLM provider: {e}") + return 1 + + all_tests = [] + + # Generate constraint tests + if args.type in ("constraint", "all"): + print(f"\nGenerating constraint tests for {len(goal.constraints)} constraints...") + generator = ConstraintTestGenerator(llm) + constraint_tests = generator.generate(goal) + all_tests.extend(constraint_tests) + print(f"Generated {len(constraint_tests)} constraint tests") + + # Generate success criteria tests + if args.type in ("success", "all"): + print(f"\nGenerating success criteria tests for {len(goal.success_criteria)} criteria...") + generator = SuccessCriteriaTestGenerator(llm) + success_tests = generator.generate(goal) + all_tests.extend(success_tests) + print(f"Generated {len(success_tests)} success criteria tests") + + if not all_tests: + print("\nNo tests generated.") + return 0 + + print(f"\nTotal tests generated: {len(all_tests)}") + + # Approval + if args.auto_approve: + print("\nAuto-approving all tests...") + for test in all_tests: + test.approve("cli-auto") + storage.save_test(test) + print(f"Saved {len(all_tests)} tests to {output_dir}") 
+ else: + print("\nStarting interactive approval...") + # Save pending tests first + for test in all_tests: + storage.save_test(test) + + results = interactive_approval(all_tests, storage) + approved = sum(1 for r in results if r.action.value in ("approve", "modify")) + print(f"\nApproved: {approved}/{len(all_tests)} tests") + + return 0 + + +def cmd_test_approve(args: argparse.Namespace) -> int: + """Review and approve pending tests.""" + storage_path = Path(args.storage) if args.storage else DEFAULT_STORAGE_PATH / args.goal_id + storage = TestStorage(storage_path) + + pending = storage.get_pending_tests(args.goal_id) + + if not pending: + print(f"No pending tests for goal {args.goal_id}") + return 0 + + print(f"Found {len(pending)} pending tests\n") + + results = interactive_approval(pending, storage) + approved = sum(1 for r in results if r.action.value in ("approve", "modify")) + print(f"\nApproved: {approved}/{len(pending)} tests") + + return 0 + + +def cmd_test_run(args: argparse.Namespace) -> int: + """Run tests for an agent.""" + storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal) + + # Get approved tests + tests = storage.get_approved_tests(args.goal) + + # Filter by type + if args.type != "all": + type_map = { + "constraint": TestType.CONSTRAINT, + "success": TestType.SUCCESS_CRITERIA, + "edge_case": TestType.EDGE_CASE, + } + filter_type = type_map.get(args.type) + if filter_type: + tests = [t for t in tests if t.test_type == filter_type] + + if not tests: + print(f"No approved tests found for goal {args.goal}") + return 1 + + print(f"Running {len(tests)} tests...\n") + + # Configure runner + config = ParallelConfig( + num_workers=args.parallel if args.parallel > 0 else 1, + fail_fast=args.fail_fast, + ) + + # Run with progress - use AgentFactory for picklable parallel execution + runner = ParallelTestRunner(config, storage) + + def on_result(result): + status = "✓" if result.passed else "✗" + print(f" {status} {result.test_id} ({result.duration_ms}ms)") + + result = runner.run_all( + goal_id=args.goal, + agent_factory=AgentFactory(args.agent_path), + tests=tests, + on_result=on_result, + ) + + # Print summary + print(f"\n{'=' * 40}") + print(f"Results: {result.passed}/{result.total} passed ({result.pass_rate:.1%})") + print(f"Duration: {result.duration_ms}ms") + + if not result.all_passed: + print(f"\nFailed tests:") + for r in result.get_failed_results(): + print(f" - {r.test_id}: {r.error_message}") + if r.error_category: + print(f" Category: {r.error_category.value}") + + return 0 if result.all_passed else 1 + + +def cmd_test_debug(args: argparse.Namespace) -> int: + """Debug a failed test.""" + storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id) + + # Try to load runtime storage + runtime_storage = None + try: + from framework.storage.backend import FileStorage + runtime_storage = FileStorage(f"data/runtime/{args.goal_id}") + except Exception: + pass + + debug_tool = DebugTool(storage, runtime_storage) + info = debug_tool.analyze(args.goal_id, args.test_id, args.run_id) + + # Print debug info + print(f"Debug Info for: {info.test_name}") + print("=" * 50) + + print(f"\nTest ID: {info.test_id}") + print(f"Passed: {info.passed}") + + if info.error_category: + print(f"\nError Category: {info.error_category}") + print(f"Suggested Fix: {info.suggested_fix}") + + if info.error_message: + print(f"\nError Message:\n{info.error_message}") + + if info.stack_trace: + print(f"\nStack Trace:\n{info.stack_trace}") + + if info.iteration_guidance: + print(f"\nIteration Guidance:") 
+ print(f" Stage: {info.iteration_guidance.get('stage')}") + print(f" Action: {info.iteration_guidance.get('action')}") + print(f" Restart Required: {info.iteration_guidance.get('restart_required')}") + + print(f"\nInput:\n{json.dumps(info.input, indent=2)}") + print(f"\nExpected:\n{json.dumps(info.expected, indent=2)}") + print(f"\nActual:\n{json.dumps(info.actual, indent=2, default=str)}") + + return 0 + + +def cmd_test_list(args: argparse.Namespace) -> int: + """List tests for a goal.""" + storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id) + tests = storage.get_tests_by_goal(args.goal_id) + + # Filter by status + if args.status != "all": + from framework.testing.test_case import ApprovalStatus + try: + filter_status = ApprovalStatus(args.status) + tests = [t for t in tests if t.approval_status == filter_status] + except ValueError: + pass + + if not tests: + print(f"No tests found for goal {args.goal_id}") + return 0 + + print(f"Tests for goal {args.goal_id}:\n") + for t in tests: + status_icon = { + "pending": "⏳", + "approved": "✓", + "modified": "✓*", + "rejected": "✗", + }.get(t.approval_status.value, "?") + + result_icon = "" + if t.last_result: + result_icon = " [PASS]" if t.last_result == "passed" else " [FAIL]" + + print(f" {status_icon} {t.test_name} ({t.test_type.value}){result_icon}") + print(f" ID: {t.id}") + print(f" Criteria: {t.parent_criteria_id}") + if t.llm_confidence: + print(f" Confidence: {t.llm_confidence:.0%}") + print() + + return 0 + + +def cmd_test_stats(args: argparse.Namespace) -> int: + """Show test statistics.""" + storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id) + stats = storage.get_stats() + + print(f"Statistics for goal {args.goal_id}:\n") + print(f" Total tests: {stats['total_tests']}") + print(f"\n By approval status:") + for status, count in stats["by_approval"].items(): + print(f" {status}: {count}") + + # Get pass/fail stats + tests = storage.get_approved_tests(args.goal_id) + passed = sum(1 for t in tests if t.last_result == "passed") + failed = sum(1 for t in tests if t.last_result == "failed") + not_run = sum(1 for t in tests if t.last_result is None) + + print(f"\n Execution results:") + print(f" Passed: {passed}") + print(f" Failed: {failed}") + print(f" Not run: {not_run}") + + return 0 diff --git a/core/framework/testing/constraint_gen.py b/core/framework/testing/constraint_gen.py new file mode 100644 index 00000000..11e7e8c5 --- /dev/null +++ b/core/framework/testing/constraint_gen.py @@ -0,0 +1,201 @@ +""" +Constraint test generator. + +Generates tests for Goal constraints using LLM. +Tests are returned with PENDING approval status. +""" + +import uuid +from typing import TYPE_CHECKING + +from framework.graph.goal import Goal, Constraint +from framework.testing.test_case import Test, TestType, ApprovalStatus +from framework.testing.prompts import CONSTRAINT_TEST_PROMPT +from framework.llm.provider import Tool, ToolUse, ToolResult + +if TYPE_CHECKING: + from framework.llm.provider import LLMProvider + + +# Tool for collecting generated tests - Claude handles JSON escaping automatically +SUBMIT_TEST_TOOL = Tool( + name="submit_test", + description="Submit a generated constraint test. 
Call once per test.", + parameters={ + "properties": { + "constraint_id": { + "type": "string", + "description": "ID of the constraint being tested", + }, + "test_name": { + "type": "string", + "description": "pytest function name, e.g., test_constraint_api_limits_respected", + }, + "test_code": { + "type": "string", + "description": "Complete Python test function code", + }, + "description": { + "type": "string", + "description": "What the test validates", + }, + "input": { + "type": "object", + "description": "Test input data", + }, + "expected_output": { + "type": "object", + "description": "Expected output", + }, + "confidence": { + "type": "number", + "description": "Confidence score 0-1", + }, + }, + "required": ["constraint_id", "test_name", "test_code", "description", "confidence"], + }, +) + + +class ConstraintTestGenerator: + """ + Generate constraint tests from Goal constraints. + + Generated tests require user approval before being added to the test suite. + """ + + def __init__(self, llm: "LLMProvider"): + """ + Initialize generator with LLM provider. + + Args: + llm: LLM provider for test generation (e.g., AnthropicProvider) + """ + self.llm = llm + + def generate(self, goal: Goal) -> list[Test]: + """ + Generate tests for all constraints in a goal. + + Args: + goal: Goal with constraints to test + + Returns: + List of Test objects with approval_status=PENDING. + These MUST be approved before being added to the test suite. + """ + if not goal.constraints: + return [] + + # Format prompt + prompt = CONSTRAINT_TEST_PROMPT.format( + goal_name=goal.name, + goal_description=goal.description, + constraints_formatted=self._format_constraints(goal.constraints), + ) + + # Collect tests via tool calls - Claude handles JSON escaping automatically + collected_tests: list[dict] = [] + + def tool_executor(tool_use: ToolUse) -> ToolResult: + if tool_use.name == "submit_test": + collected_tests.append(tool_use.input) + return ToolResult( + tool_use_id=tool_use.id, content="Test recorded successfully" + ) + return ToolResult( + tool_use_id=tool_use.id, content="Unknown tool", is_error=True + ) + + self.llm.complete_with_tools( + messages=[{"role": "user", "content": prompt}], + system="You are a test generation expert. For each constraint, call the submit_test tool with the test details.", + tools=[SUBMIT_TEST_TOOL], + tool_executor=tool_executor, + max_iterations=20, + ) + + return self._create_tests_from_collected(collected_tests, goal.id) + + def generate_for_constraint( + self, goal: Goal, constraint: Constraint + ) -> list[Test]: + """ + Generate tests for a single constraint. + + Args: + goal: Goal containing the constraint + constraint: Specific constraint to test + + Returns: + List of Test objects for the constraint + """ + # Format prompt with just this constraint + prompt = CONSTRAINT_TEST_PROMPT.format( + goal_name=goal.name, + goal_description=goal.description, + constraints_formatted=self._format_constraint(constraint), + ) + + # Collect tests via tool calls + collected_tests: list[dict] = [] + + def tool_executor(tool_use: ToolUse) -> ToolResult: + if tool_use.name == "submit_test": + collected_tests.append(tool_use.input) + return ToolResult( + tool_use_id=tool_use.id, content="Test recorded successfully" + ) + return ToolResult( + tool_use_id=tool_use.id, content="Unknown tool", is_error=True + ) + + self.llm.complete_with_tools( + messages=[{"role": "user", "content": prompt}], + system="You are a test generation expert. 
Call the submit_test tool with the test details.", + tools=[SUBMIT_TEST_TOOL], + tool_executor=tool_executor, + max_iterations=10, + ) + + return self._create_tests_from_collected(collected_tests, goal.id) + + def _format_constraints(self, constraints: list[Constraint]) -> str: + """Format constraints for prompt.""" + lines = [] + for c in constraints: + lines.append(self._format_constraint(c)) + lines.append("") + return "\n".join(lines) + + def _format_constraint(self, constraint: Constraint) -> str: + """Format a single constraint for prompt.""" + severity = "HARD" if constraint.constraint_type == "hard" else "SOFT" + return f"""### Constraint: {constraint.id} +- Type: {severity} ({constraint.constraint_type}) +- Category: {constraint.category} +- Description: {constraint.description} +- Check: {constraint.check}""" + + def _create_tests_from_collected( + self, collected: list[dict], goal_id: str + ) -> list[Test]: + """Create Test objects from tool call data.""" + tests = [] + for td in collected: + test = Test( + id=f"test_{uuid.uuid4().hex[:8]}", + goal_id=goal_id, + parent_criteria_id=td.get("constraint_id", "unknown"), + test_type=TestType.CONSTRAINT, + test_name=td.get("test_name", "unnamed_test"), + test_code=td.get("test_code", ""), + description=td.get("description", ""), + input=td.get("input", {}), + expected_output=td.get("expected_output", {}), + generated_by="llm", + llm_confidence=float(td.get("confidence", 0.5)), + approval_status=ApprovalStatus.PENDING, + ) + tests.append(test) + return tests diff --git a/core/framework/testing/debug_tool.py b/core/framework/testing/debug_tool.py new file mode 100644 index 00000000..404a6830 --- /dev/null +++ b/core/framework/testing/debug_tool.py @@ -0,0 +1,286 @@ +""" +Debug tool for analyzing failed tests. + +Provides detailed information for debugging: +- Test input and expected output +- Actual output and error details +- Error categorization +- Runtime logs and execution path +- Fix suggestions +""" + +from typing import Any + +from pydantic import BaseModel, Field + +from framework.testing.test_case import Test +from framework.testing.test_result import TestResult, ErrorCategory +from framework.testing.test_storage import TestStorage +from framework.testing.categorizer import ErrorCategorizer + + +class DebugInfo(BaseModel): + """ + Comprehensive debug information for a failed test. + """ + test_id: str + test_name: str + + # Test definition + input: dict[str, Any] = Field(default_factory=dict) + expected: dict[str, Any] = Field(default_factory=dict) + + # Actual result + actual: Any = None + passed: bool = False + + # Error details + error_message: str | None = None + error_category: str | None = None + stack_trace: str | None = None + + # Runtime data + logs: list[dict[str, Any]] = Field(default_factory=list) + runtime_data: dict[str, Any] = Field(default_factory=dict) + + # Fix guidance + suggested_fix: str | None = None + iteration_guidance: dict[str, Any] = Field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + """Convert to dict for JSON serialization.""" + return self.model_dump() + + +class DebugTool: + """ + Debug tool for analyzing failed tests. + + Integrates with: + - TestStorage for test and result data + - Runtime storage (optional) for decision logs + - ErrorCategorizer for classification + """ + + def __init__( + self, + test_storage: TestStorage, + runtime_storage: Any | None = None, + ): + """ + Initialize debug tool. 
+ + Args: + test_storage: Storage for test and result data + runtime_storage: Optional FileStorage for Runtime data + """ + self.test_storage = test_storage + self.runtime_storage = runtime_storage + self.categorizer = ErrorCategorizer() + + def analyze( + self, + goal_id: str, + test_id: str, + run_id: str | None = None, + ) -> DebugInfo: + """ + Get detailed debug info for a failed test. + + Args: + goal_id: Goal ID containing the test + test_id: ID of the test to analyze + run_id: Optional Runtime run ID for detailed logs + + Returns: + DebugInfo with comprehensive debug data + """ + # Load test + test = self.test_storage.load_test(goal_id, test_id) + if not test: + return DebugInfo( + test_id=test_id, + test_name="unknown", + error_message=f"Test {test_id} not found in goal {goal_id}", + ) + + # Load latest result + result = self.test_storage.get_latest_result(test_id) + + # Build debug info + debug_info = DebugInfo( + test_id=test_id, + test_name=test.test_name, + input=test.input, + expected=test.expected_output, + ) + + if result: + debug_info.actual = result.actual_output + debug_info.passed = result.passed + debug_info.error_message = result.error_message + debug_info.stack_trace = result.stack_trace + debug_info.logs = result.runtime_logs + + # Set category + if result.error_category: + debug_info.error_category = result.error_category.value + elif not result.passed: + # Categorize if not already done + category = self.categorizer.categorize(result) + if category: + debug_info.error_category = category.value + + # Get runtime data if available + if run_id and self.runtime_storage: + debug_info.runtime_data = self._get_runtime_data(run_id) + + # Generate fix suggestions + if debug_info.error_category: + category = ErrorCategory(debug_info.error_category) + debug_info.suggested_fix = self.categorizer.get_fix_suggestion(category) + debug_info.iteration_guidance = self.categorizer.get_iteration_guidance(category) + + return debug_info + + def analyze_result( + self, + test: Test, + result: TestResult, + run_id: str | None = None, + ) -> DebugInfo: + """ + Analyze a test result directly (without loading from storage). + + Args: + test: The Test that was run + result: The TestResult to analyze + run_id: Optional Runtime run ID + + Returns: + DebugInfo with debug data + """ + debug_info = DebugInfo( + test_id=test.id, + test_name=test.test_name, + input=test.input, + expected=test.expected_output, + actual=result.actual_output, + passed=result.passed, + error_message=result.error_message, + stack_trace=result.stack_trace, + logs=result.runtime_logs, + ) + + # Categorize + if result.error_category: + debug_info.error_category = result.error_category.value + elif not result.passed: + category = self.categorizer.categorize(result) + if category: + debug_info.error_category = category.value + + # Runtime data + if run_id and self.runtime_storage: + debug_info.runtime_data = self._get_runtime_data(run_id) + + # Fix suggestions + if debug_info.error_category: + category = ErrorCategory(debug_info.error_category) + debug_info.suggested_fix = self.categorizer.get_fix_suggestion(category) + debug_info.iteration_guidance = self.categorizer.get_iteration_guidance(category) + + return debug_info + + def get_failure_summary( + self, + goal_id: str, + ) -> dict[str, Any]: + """ + Get summary of all failures for a goal. 
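+
+        Example return shape (a sketch; IDs and counts are illustrative):
+
+            {
+              "goal_id": "goal_123",
+              "total_failures": 2,
+              "by_category": {"implementation_error": ["test_ab12cd34"], ...},
+              "iteration_suggestions": ["..."]
+            }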
+ + Returns: + Dict with failure counts by category and test IDs + """ + tests = self.test_storage.get_tests_by_goal(goal_id) + + failures_by_category: dict[str, list[str]] = { + "logic_error": [], + "implementation_error": [], + "edge_case": [], + "uncategorized": [], + } + + for test in tests: + if test.last_result == "failed": + result = self.test_storage.get_latest_result(test.id) + if result and result.error_category: + failures_by_category[result.error_category.value].append(test.id) + else: + failures_by_category["uncategorized"].append(test.id) + + return { + "goal_id": goal_id, + "total_failures": sum(len(ids) for ids in failures_by_category.values()), + "by_category": failures_by_category, + "iteration_suggestions": self._get_iteration_suggestions(failures_by_category), + } + + def _get_runtime_data(self, run_id: str) -> dict[str, Any]: + """Extract runtime data from Runtime storage.""" + if not self.runtime_storage: + return {} + + try: + run = self.runtime_storage.load_run(run_id) + if not run: + return {"error": f"Run {run_id} not found"} + + return { + "execution_path": run.metrics.nodes_executed if hasattr(run, "metrics") else [], + "decisions": [ + d.model_dump() if hasattr(d, "model_dump") else str(d) + for d in getattr(run, "decisions", []) + ], + "problems": [ + p.model_dump() if hasattr(p, "model_dump") else str(p) + for p in getattr(run, "problems", []) + ], + "status": run.status.value if hasattr(run, "status") else "unknown", + } + except Exception as e: + return {"error": f"Failed to load runtime data: {e}"} + + def _get_iteration_suggestions( + self, + failures_by_category: dict[str, list[str]], + ) -> list[str]: + """Generate iteration suggestions based on failure categories.""" + suggestions = [] + + if failures_by_category["logic_error"]: + suggestions.append( + f"Found {len(failures_by_category['logic_error'])} logic errors. " + "Review and update Goal success_criteria/constraints, then restart " + "the full Goal → Agent → Eval flow." + ) + + if failures_by_category["implementation_error"]: + suggestions.append( + f"Found {len(failures_by_category['implementation_error'])} implementation errors. " + "Fix agent node/edge code and re-run Eval." + ) + + if failures_by_category["edge_case"]: + suggestions.append( + f"Found {len(failures_by_category['edge_case'])} edge cases. " + "These are new scenarios - add tests for them." + ) + + if failures_by_category["uncategorized"]: + suggestions.append( + f"Found {len(failures_by_category['uncategorized'])} uncategorized failures. " + "Manual review required." + ) + + return suggestions diff --git a/core/framework/testing/executor.py b/core/framework/testing/executor.py new file mode 100644 index 00000000..9f3b23ff --- /dev/null +++ b/core/framework/testing/executor.py @@ -0,0 +1,407 @@ +""" +Single test executor. + +Executes a single test against an agent and returns a TestResult. +""" + +import asyncio +import inspect +import os +import time +import traceback +from typing import Any, Protocol, runtime_checkable + +from framework.testing.test_case import Test +from framework.testing.test_result import TestResult, ErrorCategory +from framework.testing.categorizer import ErrorCategorizer + + +class LLMJudge: + """ + LLM-based judge for semantic evaluation of test results. + + Used by tests that need to evaluate semantic properties like + "no hallucination" or "preserves meaning" that can't be checked + with simple assertions. 
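+
+    Typical use inside generated test code (a sketch; names other than
+    the injected llm_judge global are illustrative):
+
+        verdict = llm_judge.evaluate(
+            constraint="no-hallucination",
+            source_document=doc,
+            summary=output["summary"],
+            criteria="Every claim in the summary appears in the source",
+        )
+        assert verdict["passes"], verdict["explanation"]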
+ """ + + def __init__(self): + """Initialize the LLM judge.""" + self._client = None + + def _get_client(self): + """Lazy-load the Anthropic client.""" + if self._client is None: + try: + import anthropic + self._client = anthropic.Anthropic() + except ImportError: + raise RuntimeError("anthropic package required for LLM judge") + return self._client + + def evaluate( + self, + constraint: str, + source_document: str, + summary: str, + criteria: str, + ) -> dict[str, Any]: + """ + Evaluate whether a summary meets a constraint. + + Args: + constraint: The constraint being tested (e.g., "no-hallucination") + source_document: The original document + summary: The generated summary to evaluate + criteria: Human-readable criteria for evaluation + + Returns: + Dict with 'passes' (bool) and 'explanation' (str) + """ + client = self._get_client() + + prompt = f"""You are evaluating whether a summary meets a specific constraint. + +CONSTRAINT: {constraint} +CRITERIA: {criteria} + +SOURCE DOCUMENT: +{source_document} + +SUMMARY TO EVALUATE: +{summary} + +Evaluate whether the summary meets the constraint. Be strict but fair. + +Respond with JSON in this exact format: +{{"passes": true/false, "explanation": "brief explanation of your judgment"}} + +Only output the JSON, nothing else.""" + + try: + response = client.messages.create( + model="claude-haiku-4-5-20251001", + max_tokens=500, + messages=[{"role": "user", "content": prompt}] + ) + + # Parse the response + import json + text = response.content[0].text.strip() + # Handle potential markdown code blocks + if text.startswith("```"): + text = text.split("```")[1] + if text.startswith("json"): + text = text[4:] + text = text.strip() + + result = json.loads(text) + return { + "passes": bool(result.get("passes", False)), + "explanation": result.get("explanation", "No explanation provided") + } + except Exception as e: + # On error, fail the test with explanation + return { + "passes": False, + "explanation": f"LLM judge error: {e}" + } + + +@runtime_checkable +class AgentProtocol(Protocol): + """Protocol for agent that can be tested.""" + + def run(self, input: dict[str, Any]) -> Any: + """Run the agent with input and return result.""" + ... + + +class SyncAgentWrapper: + """ + Wrapper that makes async agent.run() callable synchronously. + + This allows tests to call agent.run() without async/await syntax, + which simplifies test code generation and execution. + """ + + def __init__(self, agent: Any): + self._agent = agent + self._loop: asyncio.AbstractEventLoop | None = None + + def run(self, input_data: dict[str, Any]) -> Any: + """ + Run agent synchronously by wrapping async call. 
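+
+        Raises RuntimeError if called while an event loop is already
+        running in this thread, since run_until_complete() cannot be
+        nested inside a running loop.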
+
+        Args:
+            input_data: Input data for the agent
+
+        Returns:
+            Output dict from the agent's ExecutionResult
+        """
+        coro = self._agent.run(input_data)
+
+        # Guard against being called from an async context.
+        # asyncio.get_running_loop() raises RuntimeError when no loop is
+        # running, so the guard must raise in the else branch - raising
+        # inside the try block would be swallowed by the except clause.
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            # No running loop - safe to drive the coroutine ourselves
+            pass
+        else:
+            # We're in an async context, can't use run_until_complete.
+            # This shouldn't happen in normal test execution.
+            raise RuntimeError("Cannot run sync wrapper from async context")
+
+        # Get or create event loop; don't close it here - we may need it
+        # for subsequent calls
+        if self._loop is None or self._loop.is_closed():
+            self._loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._loop)
+        return self._loop.run_until_complete(coro).output
+
+    def __getattr__(self, name: str) -> Any:
+        """Forward other attribute access to wrapped agent."""
+        return getattr(self._agent, name)
+
+
+class TestExecutor:
+    """
+    Execute a single test against an agent.
+
+    Handles:
+    - Test code compilation and execution
+    - Timing measurement
+    - Error capture and categorization
+    - Result creation
+    """
+
+    def __init__(
+        self,
+        categorizer: ErrorCategorizer | None = None,
+        timeout: float = 60.0,
+    ):
+        """
+        Initialize executor.
+
+        Args:
+            categorizer: ErrorCategorizer for classifying failures
+            timeout: Maximum test execution time in seconds
+        """
+        self.categorizer = categorizer or ErrorCategorizer()
+        self.timeout = timeout
+
+    def execute(
+        self,
+        test: Test,
+        agent: AgentProtocol,
+        capture_logs: bool = True,
+    ) -> TestResult:
+        """
+        Execute a test against an agent.
+
+        Args:
+            test: Test to execute
+            agent: Agent instance to test
+            capture_logs: Whether to capture runtime logs
+
+        Returns:
+            TestResult with execution details
+        """
+        start_time = time.perf_counter()
+
+        try:
+            # Build test environment
+            test_globals = self._build_test_globals(agent, test)
+
+            # Compile test code
+            try:
+                compiled = compile(test.test_code, "<generated test>", "exec")  # synthetic filename for tracebacks
+            except SyntaxError as e:
+                return self._create_error_result(
+                    test=test,
+                    start_time=start_time,
+                    error_message=f"Test code syntax error: {e}",
+                    stack_trace=traceback.format_exc(),
+                )
+
+            # Execute test
+            try:
+                exec(compiled, test_globals)
+
+                # Look for test function and call it
+                test_func = test_globals.get(test.test_name)
+                if test_func is None:
+                    # Try to find any function starting with test_
+                    for name, obj in test_globals.items():
+                        if name.startswith("test_") and callable(obj):
+                            test_func = obj
+                            break
+
+                if test_func is None:
+                    return self._create_error_result(
+                        test=test,
+                        start_time=start_time,
+                        error_message=f"Test function '{test.test_name}' not found in test code",
+                    )
+
+                # Call the test function with appropriate arguments
+                # Inspect the function signature to determine what to pass
+                sig = inspect.signature(test_func)
+                params = list(sig.parameters.keys())
+
+                # Build arguments based on what the function expects
+                call_args = []
+                for param in params:
+                    if param == "agent":
+                        call_args.append(test_globals["agent"])
+                    elif param == "llm_judge":
+                        call_args.append(test_globals["llm_judge"])
+                    elif param in test_globals:
+                        call_args.append(test_globals[param])
+                    else:
+                        # Unknown parameter - this will likely cause an error
+                        # but we let it happen naturally
+                        break
+
+                test_func(*call_args)
+
+                # Test passed
+                duration_ms = int((time.perf_counter() - start_time) * 1000)
+                return TestResult(
+                    test_id=test.id,
+                    passed=True,
+                    duration_ms=duration_ms,
+                    expected_output=test.expected_output,
+
+                    actual_output={"status": "passed"},
+                )
+
+            except AssertionError as e:
+                return self._create_failure_result(
+                    test=test,
+                    start_time=start_time,
+                    error_message=str(e) or "Assertion failed",
+                    stack_trace=traceback.format_exc(),
+                )
+
+            except Exception as e:
+                return self._create_failure_result(
+                    test=test,
+                    start_time=start_time,
+                    error_message=f"{type(e).__name__}: {e}",
+                    stack_trace=traceback.format_exc(),
+                )
+
+        except Exception as e:
+            return self._create_error_result(
+                test=test,
+                start_time=start_time,
+                error_message=f"Test execution error: {e}",
+                stack_trace=traceback.format_exc(),
+            )
+
+    def _build_test_globals(
+        self,
+        agent: AgentProtocol,
+        test: Test,
+    ) -> dict[str, Any]:
+        """Build the globals dict for test execution."""
+        # Wrap async agents in a sync wrapper so test code can call agent.run()
+        # without async/await syntax
+        wrapped_agent = self._wrap_agent_if_async(agent)
+
+        return {
+            "__builtins__": __builtins__,
+            "agent": wrapped_agent,
+            "llm_judge": LLMJudge(),  # For semantic evaluation tests
+            "test_input": test.input,
+            "expected_output": test.expected_output,
+            # Common test utilities
+            "assert_": assert_,  # Message-aware helper defined at module bottom
+            "isinstance": isinstance,
+            "len": len,
+            "str": str,
+            "int": int,
+            "float": float,
+            "list": list,
+            "dict": dict,
+            "set": set,
+            "tuple": tuple,
+            "any": any,
+            "all": all,
+            "print": print,  # For debugging
+        }
+
+    def _wrap_agent_if_async(self, agent: AgentProtocol) -> Any:
+        """
+        Wrap agent if its run() method is async.
+
+        Args:
+            agent: Agent to potentially wrap
+
+        Returns:
+            SyncAgentWrapper if agent.run() is async, otherwise the original agent
+        """
+        run_method = getattr(agent, "run", None)
+        if run_method is None:
+            return agent
+
+        # Check if run() is a coroutine function
+        if inspect.iscoroutinefunction(run_method):
+            return SyncAgentWrapper(agent)
+
+        return agent
+
+    def _create_failure_result(
+        self,
+        test: Test,
+        start_time: float,
+        error_message: str,
+        stack_trace: str | None = None,
+    ) -> TestResult:
+        """Create a result for a test that failed assertions."""
+        duration_ms = int((time.perf_counter() - start_time) * 1000)
+
+        result = TestResult(
+            test_id=test.id,
+            passed=False,
+            duration_ms=duration_ms,
+            expected_output=test.expected_output,
+            error_message=error_message,
+            stack_trace=stack_trace,
+        )
+
+        # Categorize the error
+        result.error_category = self.categorizer.categorize(result)
+
+        return result
+
+    def _create_error_result(
+        self,
+        test: Test,
+        start_time: float,
+        error_message: str,
+        stack_trace: str | None = None,
+    ) -> TestResult:
+        """Create a result for a test that couldn't run."""
+        duration_ms = int((time.perf_counter() - start_time) * 1000)
+
+        result = TestResult(
+            test_id=test.id,
+            passed=False,
+            duration_ms=duration_ms,
+            error_message=error_message,
+            stack_trace=stack_trace,
+        )
+
+        # Implementation error for test setup failures
+        result.error_category = ErrorCategory.IMPLEMENTATION_ERROR
+
+        return result
+
+
+def assert_(condition: bool, message: str = "") -> None:
+    """Assert helper with message."""
+    if not condition:
+        raise AssertionError(message)
diff --git a/core/framework/testing/parallel.py b/core/framework/testing/parallel.py
new file mode 100644
index 00000000..4af91de9
--- /dev/null
+++ b/core/framework/testing/parallel.py
@@ -0,0 +1,344 @@
+"""
+Parallel test runner inspired by pytest-xdist.
+
+Features:
+- Per-test parallelism: Each test runs independently with load balancing
+- Worker initialization: Agent created once per worker thread (not per test)
+- Thread-based parallelism: Uses ThreadPoolExecutor for I/O-bound LLM calls
+- Fail-fast option: Stop on first failure
+"""
+
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import TimeoutError as FuturesTimeoutError
+from dataclasses import dataclass, field
+from multiprocessing import cpu_count
+from typing import Any, Callable, Protocol, runtime_checkable
+
+from framework.testing.test_case import Test
+from framework.testing.test_result import TestResult, TestSuiteResult
+from framework.testing.test_storage import TestStorage
+from framework.testing.executor import TestExecutor, AgentProtocol
+from framework.testing.categorizer import ErrorCategorizer
+
+
+# Thread-local storage for worker agents
+# Each worker thread gets its own agent instance to avoid race conditions
+_thread_local = threading.local()
+
+
+def _init_worker(agent_factory: Any) -> None:
+    """
+    Initialize worker thread with its own agent instance.
+
+    Called once per worker thread when the ThreadPoolExecutor starts.
+    The agent is stored in thread-local storage and reused for all tests
+    executed by this worker.
+    """
+    if hasattr(agent_factory, "create"):
+        _thread_local.agent = agent_factory.create()
+    else:
+        _thread_local.agent = agent_factory()
+
+
+def _run_single_test(test: Test, timeout: float) -> TestResult:
+    """
+    Run a single test using the worker's pre-initialized agent.
+
+    Args:
+        test: Test to execute
+        timeout: Timeout per test in seconds
+
+    Returns:
+        TestResult with execution details
+    """
+    executor = TestExecutor(
+        categorizer=ErrorCategorizer(),
+        timeout=timeout,
+    )
+    return executor.execute(test, _thread_local.agent)
+
+
+@dataclass
+class ParallelConfig:
+    """Configuration for parallel test execution."""
+
+    num_workers: int = field(default_factory=cpu_count)
+    timeout_per_test: float = 60.0  # seconds
+    fail_fast: bool = False
+    mock_external_apis: bool = True
+
+
+@runtime_checkable
+class AgentFactoryProtocol(Protocol):
+    """Protocol for creating agent instances."""
+
+    def create(self) -> AgentProtocol:
+        """Create a new agent instance."""
+        ...
+
+
+class AgentFactory:
+    """Picklable factory that creates AgentRunner instances from a path.
+
+    A plain class is used instead of a lambda so the factory stays picklable
+    should tests ever run under a process-based executor (lambdas capturing
+    local variables cannot be pickled); with the current ThreadPoolExecutor
+    it simply provides a stable, importable entry point.
+    """
+
+    def __init__(self, agent_path: str):
+        self.agent_path = agent_path
+
+    def create(self):
+        from framework.runner import AgentRunner
+        return AgentRunner.load(self.agent_path)
+
+
+class ParallelTestRunner:
+    """
+    Parallel test execution using ThreadPoolExecutor.
+
+    Key features:
+    - Per-test distribution: Tests distributed individually for load balancing
+    - Worker initialization: Each worker thread creates one agent at startup
+    - Thread-based parallelism: Uses threads (not processes) for I/O-bound LLM calls
+    - Thread-local storage: Each worker has isolated agent state via threading.local()
+    """
+
+    def __init__(
+        self,
+        config: ParallelConfig | None = None,
+        storage: TestStorage | None = None,
+    ):
+        """
+        Initialize parallel runner.
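+
+        Example (illustrative; the storage path is hypothetical):
+
+            runner = ParallelTestRunner(
+                config=ParallelConfig(num_workers=4, fail_fast=True),
+                storage=TestStorage("core/data/tests"),
+            )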
+ + Args: + config: Parallel execution configuration + storage: TestStorage for saving results + """ + self.config = config or ParallelConfig() + self.storage = storage + self.categorizer = ErrorCategorizer() + + def run_all( + self, + goal_id: str, + agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol], + tests: list[Test] | None = None, + on_result: Callable[[TestResult], None] | None = None, + ) -> TestSuiteResult: + """ + Run all approved tests for a goal. + + Args: + goal_id: Goal ID to run tests for + agent_factory: Factory for creating agent instances + tests: Optional list of tests (loads from storage if not provided) + on_result: Optional callback for each test result + + Returns: + TestSuiteResult with summary and individual results + """ + # Load tests if not provided + if tests is None: + if self.storage is None: + raise ValueError("Either tests or storage must be provided") + tests = self.storage.get_approved_tests(goal_id) + + if not tests: + return TestSuiteResult( + goal_id=goal_id, + total=0, + passed=0, + failed=0, + ) + + # Execute tests + results: list[TestResult] = [] + + if self.config.num_workers <= 1: + # Sequential execution - create single agent and run all tests + results = self._run_sequential(tests, agent_factory, on_result) + else: + # Parallel execution with per-test distribution + results = self._run_parallel(tests, agent_factory, on_result) + + # Save results if storage available + if self.storage: + # Create test_id -> test mapping for lookup + test_map = {t.id: t for t in tests} + + for result in results: + # Update the Test object with execution result + if result.test_id in test_map: + test = test_map[result.test_id] + test.record_result(result.passed) + self.storage.update_test(test) + + # Save the TestResult + self.storage.save_result(result.test_id, result) + + # Create suite result + return self._create_suite_result(goal_id, results) + + def run_tests( + self, + tests: list[Test], + agent: AgentProtocol, + on_result: Callable[[TestResult], None] | None = None, + ) -> list[TestResult]: + """ + Run a list of tests against an agent instance. 
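+
+        Unlike run_all(), this method takes an already-constructed agent and
+        always executes sequentially in the calling thread.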
+ + Args: + tests: Tests to run + agent: Agent instance to test + on_result: Optional callback for each result + + Returns: + List of TestResult + """ + executor = TestExecutor( + categorizer=self.categorizer, + timeout=self.config.timeout_per_test, + ) + + results = [] + for test in tests: + result = executor.execute(test, agent) + results.append(result) + + if on_result: + on_result(result) + + # Fail-fast check + if self.config.fail_fast and not result.passed: + break + + return results + + def _run_sequential( + self, + tests: list[Test], + agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol], + on_result: Callable[[TestResult], None] | None = None, + ) -> list[TestResult]: + """Run tests sequentially with a single agent instance.""" + results = [] + executor = TestExecutor( + categorizer=self.categorizer, + timeout=self.config.timeout_per_test, + ) + + # Create single agent for all tests + if isinstance(agent_factory, AgentFactoryProtocol): + agent = agent_factory.create() + else: + agent = agent_factory() + + # Run all tests + for test in tests: + result = executor.execute(test, agent) + results.append(result) + + if on_result: + on_result(result) + + # Fail-fast + if self.config.fail_fast and not result.passed: + return results + + return results + + def _run_parallel( + self, + tests: list[Test], + agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol], + on_result: Callable[[TestResult], None] | None = None, + ) -> list[TestResult]: + """ + Run tests in parallel using ThreadPoolExecutor with worker initialization. + + Each worker thread creates ONE agent instance at startup and reuses it + for all tests assigned to that worker. Tests are distributed individually + for true load-balanced parallelism. + + Uses threads instead of processes because LLM API calls are I/O-bound, + and threads have lower overhead (no pickling, shared memory). 
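+
+        The GIL is not a practical limit here: worker threads spend most of
+        their time blocked on network I/O, during which the GIL is released.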
+        """
+        results = []
+        failed = False
+
+        with ThreadPoolExecutor(
+            max_workers=self.config.num_workers,
+            initializer=_init_worker,
+            initargs=(agent_factory,),
+        ) as executor:
+            # Submit each test individually for true parallelism
+            futures = {
+                executor.submit(_run_single_test, test, self.config.timeout_per_test): test
+                for test in tests
+            }
+
+            for future in as_completed(futures):
+                test = futures[future]
+                try:
+                    result = future.result(timeout=self.config.timeout_per_test + 30)
+                    results.append(result)
+
+                    if on_result:
+                        on_result(result)
+
+                    if not result.passed:
+                        failed = True
+
+                except FuturesTimeoutError:
+                    # concurrent.futures raises its own TimeoutError (only an
+                    # alias of the builtin on Python 3.11+), so catch that one
+                    result = TestResult(
+                        test_id=test.id,
+                        passed=False,
+                        duration_ms=int(self.config.timeout_per_test * 1000),
+                        error_message="Test timed out",
+                    )
+                    results.append(result)
+                    if on_result:
+                        on_result(result)
+                    failed = True
+
+                except Exception as e:
+                    result = TestResult(
+                        test_id=test.id,
+                        passed=False,
+                        duration_ms=0,
+                        error_message=f"Execution error: {e}",
+                    )
+                    results.append(result)
+                    if on_result:
+                        on_result(result)
+                    failed = True
+
+                # Fail-fast
+                if self.config.fail_fast and failed:
+                    executor.shutdown(wait=False, cancel_futures=True)
+                    break
+
+        return results
+
+    def _create_suite_result(
+        self,
+        goal_id: str,
+        results: list[TestResult],
+    ) -> TestSuiteResult:
+        """Create TestSuiteResult from individual results."""
+        passed = sum(1 for r in results if r.passed)
+        failed = len(results) - passed
+        total_duration = sum(r.duration_ms for r in results)
+
+        return TestSuiteResult(
+            goal_id=goal_id,
+            total=len(results),
+            passed=passed,
+            failed=failed,
+            results=results,
+            duration_ms=total_duration,
+        )
diff --git a/core/framework/testing/prompts.py b/core/framework/testing/prompts.py
new file mode 100644
index 00000000..f4bb5689
--- /dev/null
+++ b/core/framework/testing/prompts.py
@@ -0,0 +1,112 @@
+"""
+LLM prompt templates for test generation.
+
+These prompts instruct the LLM to generate pytest-compatible tests
+from Goal success_criteria and constraints using tool calling.
+"""
+
+CONSTRAINT_TEST_PROMPT = """You are generating test cases for an AI agent's constraints.
+
+## Goal
+Name: {goal_name}
+Description: {goal_description}
+
+## Constraints to Test
+{constraints_formatted}
+
+## Instructions
+For each constraint, generate pytest-compatible tests that verify the constraint is satisfied.
+
+For EACH test, call the `submit_test` tool with:
+- constraint_id: The ID of the constraint being tested
+- test_name: A descriptive pytest function name (test_constraint_<constraint_id>_<scenario>)
+- test_code: Complete Python test function code
+- description: What the test validates
+- input: Test input data as an object
+- expected_output: Expected output as an object
+- confidence: 0-1 score based on how testable/well-defined the constraint is
+
+Consider for each constraint:
+- Happy path: Normal execution that should satisfy the constraint
+- Boundary conditions: Inputs at the edge of constraint boundaries
+- Violation scenarios: Inputs that should trigger constraint violation
+
+The test code should:
+- Be valid Python using pytest conventions
+- Use `agent.run(input)` to execute the agent
+- Include descriptive assertion messages
+- Handle potential exceptions appropriately
+
+Generate tests now by calling submit_test for each test."""
+
+SUCCESS_CRITERIA_TEST_PROMPT = """You are generating success criteria tests for an AI agent.
+
+## Goal
+Name: {goal_name}
+Description: {goal_description}
+
+## Success Criteria
+{success_criteria_formatted}
+
+## Agent Flow (for context)
+Nodes: {node_names}
+Tools: {tool_names}
+
+## Instructions
+For each success criterion, generate tests that verify the agent achieves its goals.
+
+For EACH test, call the `submit_test` tool with:
+- criteria_id: The ID of the success criterion being tested
+- test_name: A descriptive pytest function name (test_<criteria_id>_<scenario>)
+- test_code: Complete Python test function code
+- description: What the test validates
+- input: Test input data as an object
+- expected_output: Expected output as an object
+- confidence: 0-1 score based on how measurable/specific the criterion is
+
+Consider for each criterion:
+- Happy path: Normal successful execution
+- Boundary conditions: Exactly at target thresholds (if applicable)
+- Graceful handling: Near-misses and edge cases
+
+The test code should:
+- Be valid Python using pytest conventions
+- Use `agent.run(input)` to execute the agent
+- Validate the metric defined in the success criterion
+- Include descriptive assertion messages
+
+Generate tests now by calling submit_test for each test."""
+
+EDGE_CASE_TEST_PROMPT = """You are generating edge case tests for an AI agent.
+
+## Goal
+Name: {goal_name}
+Description: {goal_description}
+
+## Existing Tests
+{existing_tests_summary}
+
+## Recent Failures (if any)
+{failures_summary}
+
+## Instructions
+Generate additional edge case tests that cover scenarios not addressed by existing tests.
+
+Focus on:
+1. Unusual input formats or values
+2. Empty or null inputs
+3. Extremely large or small values
+4. Unicode and special characters
+5. Concurrent or timing-related scenarios
+6. Network/API failure simulations (if applicable)
+
+For EACH test, call the `submit_test` tool with:
+- criteria_id: An identifier for the edge case category being tested
+- test_name: A descriptive pytest function name (test_edge_case_<scenario>)
+- test_code: Complete Python test function code
+- description: What the test validates
+- input: Test input data as an object
+- expected_output: Expected output as an object
+- confidence: 0-1 score
+
+Generate edge case tests now by calling submit_test for each test."""
diff --git a/core/framework/testing/success_gen.py b/core/framework/testing/success_gen.py
new file mode 100644
index 00000000..c5ff4136
--- /dev/null
+++ b/core/framework/testing/success_gen.py
@@ -0,0 +1,219 @@
+"""
+Success criteria test generator.
+
+Generates tests for Goal success_criteria using LLM.
+Tests are returned with PENDING approval status.
+"""
+
+import uuid
+from typing import TYPE_CHECKING
+
+from framework.graph.goal import Goal, SuccessCriterion
+from framework.testing.test_case import Test, TestType, ApprovalStatus
+from framework.testing.prompts import SUCCESS_CRITERIA_TEST_PROMPT
+from framework.llm.provider import Tool, ToolUse, ToolResult
+
+if TYPE_CHECKING:
+    from framework.llm.provider import LLMProvider
+
+
+# Tool for collecting generated tests - Claude handles JSON escaping automatically
+SUBMIT_TEST_TOOL = Tool(
+    name="submit_test",
+    description="Submit a generated success criteria test. 
Call once per test.", + parameters={ + "properties": { + "criteria_id": { + "type": "string", + "description": "ID of the success criterion being tested", + }, + "test_name": { + "type": "string", + "description": "pytest function name, e.g., test_find_videos_happy_path", + }, + "test_code": { + "type": "string", + "description": "Complete Python test function code", + }, + "description": { + "type": "string", + "description": "What the test validates", + }, + "input": { + "type": "object", + "description": "Test input data", + }, + "expected_output": { + "type": "object", + "description": "Expected output", + }, + "confidence": { + "type": "number", + "description": "Confidence score 0-1", + }, + }, + "required": ["criteria_id", "test_name", "test_code", "description", "confidence"], + }, +) + + +class SuccessCriteriaTestGenerator: + """ + Generate success criteria tests from Goal success_criteria. + + Generated tests require user approval before being added to the test suite. + Unlike constraint tests, success criteria tests are generated during the + Eval stage (after the agent exists) and may reference agent nodes/tools. + """ + + def __init__(self, llm: "LLMProvider"): + """ + Initialize generator with LLM provider. + + Args: + llm: LLM provider for test generation (e.g., AnthropicProvider) + """ + self.llm = llm + + def generate( + self, + goal: Goal, + node_names: list[str] | None = None, + tool_names: list[str] | None = None, + ) -> list[Test]: + """ + Generate tests for all success criteria in a goal. + + Args: + goal: Goal with success_criteria to test + node_names: Names of agent nodes (for context) + tool_names: Names of tools available to agent (for context) + + Returns: + List of Test objects with approval_status=PENDING. + These MUST be approved before being added to the test suite. + """ + if not goal.success_criteria: + return [] + + # Format prompt + prompt = SUCCESS_CRITERIA_TEST_PROMPT.format( + goal_name=goal.name, + goal_description=goal.description, + success_criteria_formatted=self._format_criteria(goal.success_criteria), + node_names=", ".join(node_names or ["(not specified)"]), + tool_names=", ".join(tool_names or ["(not specified)"]), + ) + + # Collect tests via tool calls - Claude handles JSON escaping automatically + collected_tests: list[dict] = [] + + def tool_executor(tool_use: ToolUse) -> ToolResult: + if tool_use.name == "submit_test": + collected_tests.append(tool_use.input) + return ToolResult( + tool_use_id=tool_use.id, content="Test recorded successfully" + ) + return ToolResult( + tool_use_id=tool_use.id, content="Unknown tool", is_error=True + ) + + self.llm.complete_with_tools( + messages=[{"role": "user", "content": prompt}], + system="You are a test generation expert. For each success criterion, call the submit_test tool with the test details.", + tools=[SUBMIT_TEST_TOOL], + tool_executor=tool_executor, + max_iterations=20, + ) + + return self._create_tests_from_collected(collected_tests, goal.id) + + def generate_for_criterion( + self, + goal: Goal, + criterion: SuccessCriterion, + node_names: list[str] | None = None, + tool_names: list[str] | None = None, + ) -> list[Test]: + """ + Generate tests for a single success criterion. 
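+
+        Useful when a single criterion changes and its tests need to be
+        regenerated without re-running generation for the whole goal.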
+ + Args: + goal: Goal containing the criterion + criterion: Specific criterion to test + node_names: Names of agent nodes + tool_names: Names of tools available + + Returns: + List of Test objects for the criterion + """ + prompt = SUCCESS_CRITERIA_TEST_PROMPT.format( + goal_name=goal.name, + goal_description=goal.description, + success_criteria_formatted=self._format_criterion(criterion), + node_names=", ".join(node_names or ["(not specified)"]), + tool_names=", ".join(tool_names or ["(not specified)"]), + ) + + # Collect tests via tool calls + collected_tests: list[dict] = [] + + def tool_executor(tool_use: ToolUse) -> ToolResult: + if tool_use.name == "submit_test": + collected_tests.append(tool_use.input) + return ToolResult( + tool_use_id=tool_use.id, content="Test recorded successfully" + ) + return ToolResult( + tool_use_id=tool_use.id, content="Unknown tool", is_error=True + ) + + self.llm.complete_with_tools( + messages=[{"role": "user", "content": prompt}], + system="You are a test generation expert. Call the submit_test tool with the test details.", + tools=[SUBMIT_TEST_TOOL], + tool_executor=tool_executor, + max_iterations=10, + ) + + return self._create_tests_from_collected(collected_tests, goal.id) + + def _format_criteria(self, criteria: list[SuccessCriterion]) -> str: + """Format success criteria for prompt.""" + lines = [] + for c in criteria: + lines.append(self._format_criterion(c)) + lines.append("") + return "\n".join(lines) + + def _format_criterion(self, criterion: SuccessCriterion) -> str: + """Format a single criterion for prompt.""" + return f"""### Success Criterion: {criterion.id} +- Description: {criterion.description} +- Metric: {criterion.metric} +- Target: {criterion.target} +- Weight: {criterion.weight} +- Currently met: {criterion.met}""" + + def _create_tests_from_collected( + self, collected: list[dict], goal_id: str + ) -> list[Test]: + """Create Test objects from tool call data.""" + tests = [] + for td in collected: + test = Test( + id=f"test_{uuid.uuid4().hex[:8]}", + goal_id=goal_id, + parent_criteria_id=td.get("criteria_id", "unknown"), + test_type=TestType.SUCCESS_CRITERIA, + test_name=td.get("test_name", "unnamed_test"), + test_code=td.get("test_code", ""), + description=td.get("description", ""), + input=td.get("input", {}), + expected_output=td.get("expected_output", {}), + generated_by="llm", + llm_confidence=float(td.get("confidence", 0.5)), + approval_status=ApprovalStatus.PENDING, + ) + tests.append(test) + return tests diff --git a/core/framework/testing/test_case.py b/core/framework/testing/test_case.py new file mode 100644 index 00000000..0c11698f --- /dev/null +++ b/core/framework/testing/test_case.py @@ -0,0 +1,150 @@ +""" +Test case schema with approval tracking. + +Tests are generated by LLM from Goal success_criteria and constraints, +but require mandatory user approval before being stored. 
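+
+Lifecycle: a test starts PENDING, then becomes APPROVED, MODIFIED (approved
+with user edits), or REJECTED; only approved/modified tests join the suite.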
+""" + +from datetime import datetime +from enum import Enum +from typing import Any + +from pydantic import BaseModel, Field + + +class ApprovalStatus(str, Enum): + """Status of user approval for a generated test.""" + PENDING = "pending" # Awaiting user review + APPROVED = "approved" # User accepted as-is + MODIFIED = "modified" # User edited before accepting + REJECTED = "rejected" # User declined (with reason) + + +class TestType(str, Enum): + """Type of test based on what it validates.""" + CONSTRAINT = "constraint" # Validates constraint boundaries + SUCCESS_CRITERIA = "outcome" # Validates success criteria achievement + EDGE_CASE = "edge_case" # Validates edge case handling + + +class Test(BaseModel): + """ + A test case generated from Goal success_criteria or constraints. + + Tests are either: + - Generated by LLM during Goal stage (constraints) or Eval stage (success criteria) + - Created manually by human engineers + + All tests require approval before being added to the test suite. + """ + id: str + goal_id: str + parent_criteria_id: str = Field( + description="Links to success_criteria.id or constraint.id" + ) + test_type: TestType + + # Test definition + test_name: str = Field( + description="Descriptive function name, e.g., test_constraint_api_limits_respected" + ) + test_code: str = Field( + description="Python test function code (pytest compatible)" + ) + description: str = Field( + description="Human-readable description of what the test validates" + ) + input: dict[str, Any] = Field( + default_factory=dict, + description="Test input data" + ) + expected_output: dict[str, Any] = Field( + default_factory=dict, + description="Expected output or assertions" + ) + + # LLM generation metadata + generated_by: str = Field( + default="llm", + description="Who created the test: 'llm' or 'human'" + ) + llm_confidence: float = Field( + default=0.0, + ge=0.0, + le=1.0, + description="LLM's confidence in the test quality (0-1)" + ) + + # Approval tracking (CRITICAL - tests are never used without approval) + approval_status: ApprovalStatus = ApprovalStatus.PENDING + approved_by: str | None = None + approved_at: datetime | None = None + rejection_reason: str | None = Field( + default=None, + description="Reason for rejection if status is REJECTED" + ) + original_code: str | None = Field( + default=None, + description="Original LLM-generated code if user modified it" + ) + + # Execution tracking + last_run: datetime | None = None + last_result: str | None = Field( + default=None, + description="Result of last run: 'passed', 'failed', 'error'" + ) + run_count: int = 0 + pass_count: int = 0 + fail_count: int = 0 + + # Timestamps + created_at: datetime = Field(default_factory=datetime.now) + updated_at: datetime = Field(default_factory=datetime.now) + + model_config = {"extra": "allow"} + + def approve(self, approved_by: str = "user") -> None: + """Mark test as approved.""" + self.approval_status = ApprovalStatus.APPROVED + self.approved_by = approved_by + self.approved_at = datetime.now() + self.updated_at = datetime.now() + + def modify(self, new_code: str, approved_by: str = "user") -> None: + """Approve test with modifications.""" + self.original_code = self.test_code + self.test_code = new_code + self.approval_status = ApprovalStatus.MODIFIED + self.approved_by = approved_by + self.approved_at = datetime.now() + self.updated_at = datetime.now() + + def reject(self, reason: str) -> None: + """Reject the test with a reason.""" + self.approval_status = ApprovalStatus.REJECTED + 
self.rejection_reason = reason + self.updated_at = datetime.now() + + def record_result(self, passed: bool) -> None: + """Record a test run result.""" + self.last_run = datetime.now() + self.last_result = "passed" if passed else "failed" + self.run_count += 1 + if passed: + self.pass_count += 1 + else: + self.fail_count += 1 + self.updated_at = datetime.now() + + @property + def is_approved(self) -> bool: + """Check if test has been approved (approved or modified).""" + return self.approval_status in (ApprovalStatus.APPROVED, ApprovalStatus.MODIFIED) + + @property + def pass_rate(self) -> float | None: + """Calculate pass rate if test has been run.""" + if self.run_count == 0: + return None + return self.pass_count / self.run_count diff --git a/core/framework/testing/test_result.py b/core/framework/testing/test_result.py new file mode 100644 index 00000000..41b54665 --- /dev/null +++ b/core/framework/testing/test_result.py @@ -0,0 +1,153 @@ +""" +Test result schemas for tracking test execution outcomes. + +Results include detailed error information for debugging and +categorization for guiding iteration strategy. +""" + +from datetime import datetime +from enum import Enum +from typing import Any + +from pydantic import BaseModel, Field + + +class ErrorCategory(str, Enum): + """ + Category of test failure for guiding iteration. + + Each category has different implications for how to fix: + - LOGIC_ERROR: Goal definition is wrong → update success_criteria/constraints + - IMPLEMENTATION_ERROR: Code bug → fix nodes/edges in Agent stage + - EDGE_CASE: New scenario discovered → add new test only + """ + LOGIC_ERROR = "logic_error" + IMPLEMENTATION_ERROR = "implementation_error" + EDGE_CASE = "edge_case" + + +class TestResult(BaseModel): + """ + Result of a single test execution. + + Captures: + - Pass/fail status with timing + - Actual vs expected output + - Error details for debugging + - Runtime logs and execution path + """ + test_id: str + passed: bool + duration_ms: int = Field( + ge=0, + description="Test execution time in milliseconds" + ) + + # Output comparison + actual_output: Any = None + expected_output: Any = None + + # Error details (populated on failure) + error_message: str | None = None + error_category: ErrorCategory | None = None + stack_trace: str | None = None + + # Runtime data for debugging + runtime_logs: list[dict[str, Any]] = Field( + default_factory=list, + description="Log entries from test execution" + ) + node_outputs: dict[str, Any] = Field( + default_factory=dict, + description="Output from each node executed during test" + ) + execution_path: list[str] = Field( + default_factory=list, + description="Sequence of nodes executed" + ) + + # Associated run ID (links to Runtime data) + run_id: str | None = Field( + default=None, + description="Runtime run ID for detailed analysis" + ) + + timestamp: datetime = Field(default_factory=datetime.now) + + model_config = {"extra": "allow"} + + def summary_dict(self) -> dict[str, Any]: + """Return a summary dict for quick overview.""" + return { + "test_id": self.test_id, + "passed": self.passed, + "duration_ms": self.duration_ms, + "error_category": self.error_category.value if self.error_category else None, + "error_message": self.error_message[:100] if self.error_message else None, + } + + +class TestSuiteResult(BaseModel): + """ + Aggregate result from running a test suite. + + Provides summary statistics and individual results. 
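+
+    A suite counts as green only when there are no failures and no errors;
+    skipped tests do not count against all_passed.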
+ """ + goal_id: str + total: int + passed: int + failed: int + errors: int = 0 # Tests that couldn't run (e.g., exceptions in setup) + skipped: int = 0 + + results: list[TestResult] = Field(default_factory=list) + + duration_ms: int = Field( + default=0, + description="Total execution time in milliseconds" + ) + + timestamp: datetime = Field(default_factory=datetime.now) + + model_config = {"extra": "allow"} + + @property + def all_passed(self) -> bool: + """Check if all tests passed.""" + return self.failed == 0 and self.errors == 0 + + @property + def pass_rate(self) -> float: + """Calculate pass rate.""" + if self.total == 0: + return 0.0 + return self.passed / self.total + + def summary_dict(self) -> dict[str, Any]: + """Return summary for reporting.""" + return { + "goal_id": self.goal_id, + "overall_passed": self.all_passed, + "summary": { + "total": self.total, + "passed": self.passed, + "failed": self.failed, + "errors": self.errors, + "skipped": self.skipped, + }, + "pass_rate": f"{self.pass_rate:.1%}", + "duration_ms": self.duration_ms, + } + + def get_failed_results(self) -> list[TestResult]: + """Get all failed test results for debugging.""" + return [r for r in self.results if not r.passed] + + def get_results_by_category( + self, category: ErrorCategory + ) -> list[TestResult]: + """Get failed results by error category.""" + return [ + r for r in self.results + if not r.passed and r.error_category == category + ] diff --git a/core/framework/testing/test_storage.py b/core/framework/testing/test_storage.py new file mode 100644 index 00000000..c3eeb3e0 --- /dev/null +++ b/core/framework/testing/test_storage.py @@ -0,0 +1,260 @@ +""" +File-based storage backend for test data. + +Follows the same pattern as framework/storage/backend.py (FileStorage), +storing tests as JSON files with indexes for efficient querying. +""" + +import json +from pathlib import Path +from datetime import datetime + +from framework.testing.test_case import Test, ApprovalStatus, TestType +from framework.testing.test_result import TestResult + + +class TestStorage: + """ + File-based storage for tests and results. 
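+
+    Example (illustrative; the base path is hypothetical):
+
+        storage = TestStorage("core/data/tests")
+        storage.save_test(test)
+        pending = storage.get_pending_tests(test.goal_id)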
+ + Directory structure: + {base_path}/ + tests/ + {goal_id}/ + {test_id}.json # Full test data + indexes/ + by_goal/{goal_id}.json # List of test IDs for this goal + by_approval/{status}.json # Tests by approval status + by_type/{test_type}.json # Tests by type + by_criteria/{criteria_id}.json # Tests by parent criteria + results/ + {test_id}/ + {timestamp}.json # Test run results + latest.json # Most recent result + suites/ + {goal_id}_suite.json # Test suite metadata + """ + + def __init__(self, base_path: str | Path): + self.base_path = Path(base_path) + self._ensure_dirs() + + def _ensure_dirs(self) -> None: + """Create directory structure if it doesn't exist.""" + dirs = [ + self.base_path / "tests", + self.base_path / "indexes" / "by_goal", + self.base_path / "indexes" / "by_approval", + self.base_path / "indexes" / "by_type", + self.base_path / "indexes" / "by_criteria", + self.base_path / "results", + self.base_path / "suites", + ] + for d in dirs: + d.mkdir(parents=True, exist_ok=True) + + # === TEST OPERATIONS === + + def save_test(self, test: Test) -> None: + """Save a test to storage.""" + # Ensure goal directory exists + goal_dir = self.base_path / "tests" / test.goal_id + goal_dir.mkdir(parents=True, exist_ok=True) + + # Save full test + test_path = goal_dir / f"{test.id}.json" + with open(test_path, "w") as f: + f.write(test.model_dump_json(indent=2)) + + # Update indexes + self._add_to_index("by_goal", test.goal_id, test.id) + self._add_to_index("by_approval", test.approval_status.value, test.id) + self._add_to_index("by_type", test.test_type.value, test.id) + self._add_to_index("by_criteria", test.parent_criteria_id, test.id) + + def load_test(self, goal_id: str, test_id: str) -> Test | None: + """Load a test from storage.""" + test_path = self.base_path / "tests" / goal_id / f"{test_id}.json" + if not test_path.exists(): + return None + with open(test_path) as f: + return Test.model_validate_json(f.read()) + + def delete_test(self, goal_id: str, test_id: str) -> bool: + """Delete a test from storage.""" + test_path = self.base_path / "tests" / goal_id / f"{test_id}.json" + + if not test_path.exists(): + return False + + # Load test to get index keys + test = self.load_test(goal_id, test_id) + if test: + self._remove_from_index("by_goal", test.goal_id, test_id) + self._remove_from_index("by_approval", test.approval_status.value, test_id) + self._remove_from_index("by_type", test.test_type.value, test_id) + self._remove_from_index("by_criteria", test.parent_criteria_id, test_id) + + test_path.unlink() + + # Also delete results + results_dir = self.base_path / "results" / test_id + if results_dir.exists(): + for f in results_dir.iterdir(): + f.unlink() + results_dir.rmdir() + + return True + + def update_test(self, test: Test) -> None: + """ + Update an existing test. + + Handles index updates if approval_status changed. 
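+
+        Prefer this over save_test() when mutating an existing test so the
+        by_approval index stays consistent with the new status.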
+ """ + # Load old test to check for index changes + old_test = self.load_test(test.goal_id, test.id) + if old_test and old_test.approval_status != test.approval_status: + self._remove_from_index("by_approval", old_test.approval_status.value, test.id) + self._add_to_index("by_approval", test.approval_status.value, test.id) + + # Update timestamp + test.updated_at = datetime.now() + + # Save + self.save_test(test) + + # === QUERY OPERATIONS === + + def get_tests_by_goal(self, goal_id: str) -> list[Test]: + """Get all tests for a goal.""" + test_ids = self._get_index("by_goal", goal_id) + tests = [] + for test_id in test_ids: + test = self.load_test(goal_id, test_id) + if test: + tests.append(test) + return tests + + def get_tests_by_approval_status(self, status: ApprovalStatus) -> list[str]: + """Get test IDs by approval status.""" + return self._get_index("by_approval", status.value) + + def get_tests_by_type(self, test_type: TestType) -> list[str]: + """Get test IDs by test type.""" + return self._get_index("by_type", test_type.value) + + def get_tests_by_criteria(self, criteria_id: str) -> list[str]: + """Get test IDs for a specific criteria.""" + return self._get_index("by_criteria", criteria_id) + + def get_pending_tests(self, goal_id: str) -> list[Test]: + """Get all pending tests for a goal.""" + tests = self.get_tests_by_goal(goal_id) + return [t for t in tests if t.approval_status == ApprovalStatus.PENDING] + + def get_approved_tests(self, goal_id: str) -> list[Test]: + """Get all approved tests for a goal (approved or modified).""" + tests = self.get_tests_by_goal(goal_id) + return [t for t in tests if t.is_approved] + + def list_all_goals(self) -> list[str]: + """List all goal IDs that have tests.""" + goals_dir = self.base_path / "indexes" / "by_goal" + return [f.stem for f in goals_dir.glob("*.json")] + + # === RESULT OPERATIONS === + + def save_result(self, test_id: str, result: TestResult) -> None: + """Save a test result.""" + results_dir = self.base_path / "results" / test_id + results_dir.mkdir(parents=True, exist_ok=True) + + # Save with timestamp + timestamp = result.timestamp.strftime("%Y%m%d_%H%M%S") + result_path = results_dir / f"{timestamp}.json" + with open(result_path, "w") as f: + f.write(result.model_dump_json(indent=2)) + + # Update latest + latest_path = results_dir / "latest.json" + with open(latest_path, "w") as f: + f.write(result.model_dump_json(indent=2)) + + def get_latest_result(self, test_id: str) -> TestResult | None: + """Get the most recent result for a test.""" + latest_path = self.base_path / "results" / test_id / "latest.json" + if not latest_path.exists(): + return None + with open(latest_path) as f: + return TestResult.model_validate_json(f.read()) + + def get_result_history(self, test_id: str, limit: int = 10) -> list[TestResult]: + """Get result history for a test, most recent first.""" + results_dir = self.base_path / "results" / test_id + if not results_dir.exists(): + return [] + + # Get all result files except latest.json + result_files = sorted( + [f for f in results_dir.glob("*.json") if f.name != "latest.json"], + reverse=True + )[:limit] + + results = [] + for f in result_files: + with open(f) as file: + results.append(TestResult.model_validate_json(file.read())) + + return results + + # === INDEX OPERATIONS === + + def _get_index(self, index_type: str, key: str) -> list[str]: + """Get values from an index.""" + index_path = self.base_path / "indexes" / index_type / f"{key}.json" + if not index_path.exists(): + return [] + with 
open(index_path) as f: + return json.load(f) + + def _add_to_index(self, index_type: str, key: str, value: str) -> None: + """Add a value to an index.""" + index_path = self.base_path / "indexes" / index_type / f"{key}.json" + values = self._get_index(index_type, key) + if value not in values: + values.append(value) + with open(index_path, "w") as f: + json.dump(values, f) + + def _remove_from_index(self, index_type: str, key: str, value: str) -> None: + """Remove a value from an index.""" + index_path = self.base_path / "indexes" / index_type / f"{key}.json" + values = self._get_index(index_type, key) + if value in values: + values.remove(value) + with open(index_path, "w") as f: + json.dump(values, f) + + # === UTILITY === + + def get_stats(self) -> dict: + """Get storage statistics.""" + goals = self.list_all_goals() + total_tests = sum(len(self._get_index("by_goal", g)) for g in goals) + pending = len(self._get_index("by_approval", "pending")) + approved = len(self._get_index("by_approval", "approved")) + modified = len(self._get_index("by_approval", "modified")) + rejected = len(self._get_index("by_approval", "rejected")) + + return { + "total_goals": len(goals), + "total_tests": total_tests, + "by_approval": { + "pending": pending, + "approved": approved, + "modified": modified, + "rejected": rejected, + }, + "storage_path": str(self.base_path), + } diff --git a/core/tests/test_testing_framework.py b/core/tests/test_testing_framework.py new file mode 100644 index 00000000..477d0e51 --- /dev/null +++ b/core/tests/test_testing_framework.py @@ -0,0 +1,612 @@ +""" +Unit tests for the goal-based testing framework. + +Tests cover: +- Schema validation +- Storage CRUD operations +- Error categorization heuristics +- Parallel runner grouping logic +""" + +import pytest +import tempfile +from pathlib import Path +from datetime import datetime + +from framework.testing.test_case import ( + Test, + TestType, + ApprovalStatus, +) +from framework.testing.test_result import ( + TestResult, + TestSuiteResult, + ErrorCategory, +) +from framework.testing.test_storage import TestStorage +from framework.testing.categorizer import ErrorCategorizer +from framework.testing.parallel import ParallelTestRunner, ParallelConfig +from framework.testing.debug_tool import DebugTool + + +# ============================================================================ +# Test Schema Tests +# ============================================================================ + +class TestTestCaseSchema: + """Tests for Test schema.""" + + def test_create_test(self): + """Test creating a basic test.""" + test = Test( + id="test_001", + goal_id="goal_001", + parent_criteria_id="constraint_api_limits", + test_type=TestType.CONSTRAINT, + test_name="test_constraint_api_limits", + test_code="def test_constraint_api_limits(agent): pass", + description="Tests API rate limits", + input={"topic": "test"}, + expected_output={"count": 5}, + ) + + assert test.id == "test_001" + assert test.goal_id == "goal_001" + assert test.test_type == TestType.CONSTRAINT + assert test.approval_status == ApprovalStatus.PENDING + assert not test.is_approved + + def test_approve_test(self): + """Test approving a test.""" + test = Test( + id="test_001", + goal_id="goal_001", + parent_criteria_id="constraint_001", + test_type=TestType.CONSTRAINT, + test_name="test_something", + test_code="pass", + description="test", + ) + + test.approve("test_user") + + assert test.approval_status == ApprovalStatus.APPROVED + assert test.approved_by == "test_user" + assert 
test.approved_at is not None + assert test.is_approved + + def test_modify_test(self): + """Test modifying a test before approval.""" + test = Test( + id="test_001", + goal_id="goal_001", + parent_criteria_id="constraint_001", + test_type=TestType.CONSTRAINT, + test_name="test_something", + test_code="original code", + description="test", + ) + + test.modify("modified code", "test_user") + + assert test.approval_status == ApprovalStatus.MODIFIED + assert test.original_code == "original code" + assert test.test_code == "modified code" + assert test.is_approved + + def test_reject_test(self): + """Test rejecting a test.""" + test = Test( + id="test_001", + goal_id="goal_001", + parent_criteria_id="constraint_001", + test_type=TestType.CONSTRAINT, + test_name="test_something", + test_code="pass", + description="test", + ) + + test.reject("Not a valid test case") + + assert test.approval_status == ApprovalStatus.REJECTED + assert test.rejection_reason == "Not a valid test case" + assert not test.is_approved + + def test_record_result(self): + """Test recording test results.""" + test = Test( + id="test_001", + goal_id="goal_001", + parent_criteria_id="constraint_001", + test_type=TestType.CONSTRAINT, + test_name="test_something", + test_code="pass", + description="test", + ) + + test.record_result(passed=True) + assert test.last_result == "passed" + assert test.run_count == 1 + assert test.pass_count == 1 + assert test.pass_rate == 1.0 + + test.record_result(passed=False) + assert test.last_result == "failed" + assert test.run_count == 2 + assert test.pass_count == 1 + assert test.fail_count == 1 + assert test.pass_rate == 0.5 + + +class TestTestResultSchema: + """Tests for TestResult schema.""" + + def test_create_passed_result(self): + """Test creating a passed result.""" + result = TestResult( + test_id="test_001", + passed=True, + duration_ms=100, + actual_output={"status": "ok"}, + expected_output={"status": "ok"}, + ) + + assert result.passed + assert result.duration_ms == 100 + assert result.error_category is None + + def test_create_failed_result(self): + """Test creating a failed result.""" + result = TestResult( + test_id="test_001", + passed=False, + duration_ms=50, + error_message="Assertion failed", + error_category=ErrorCategory.IMPLEMENTATION_ERROR, + stack_trace="Traceback...", + ) + + assert not result.passed + assert result.error_category == ErrorCategory.IMPLEMENTATION_ERROR + + def test_summary_dict(self): + """Test summary dict generation.""" + result = TestResult( + test_id="test_001", + passed=False, + duration_ms=50, + error_message="Very long error " * 20, + error_category=ErrorCategory.LOGIC_ERROR, + ) + + summary = result.summary_dict() + assert summary["test_id"] == "test_001" + assert summary["passed"] is False + assert summary["error_category"] == "logic_error" + assert len(summary["error_message"]) == 100 # Truncated + + +class TestTestSuiteResult: + """Tests for TestSuiteResult schema.""" + + def test_suite_result_properties(self): + """Test suite result calculation properties.""" + results = [ + TestResult(test_id="t1", passed=True, duration_ms=100), + TestResult(test_id="t2", passed=True, duration_ms=50), + TestResult(test_id="t3", passed=False, duration_ms=75, + error_category=ErrorCategory.IMPLEMENTATION_ERROR), + ] + + suite = TestSuiteResult( + goal_id="goal_001", + total=3, + passed=2, + failed=1, + results=results, + duration_ms=225, + ) + + assert not suite.all_passed + assert suite.pass_rate == pytest.approx(2/3) + assert 
len(suite.get_failed_results()) == 1 + + def test_get_results_by_category(self): + """Test filtering results by error category.""" + results = [ + TestResult(test_id="t1", passed=False, duration_ms=100, + error_category=ErrorCategory.LOGIC_ERROR), + TestResult(test_id="t2", passed=False, duration_ms=50, + error_category=ErrorCategory.IMPLEMENTATION_ERROR), + TestResult(test_id="t3", passed=False, duration_ms=75, + error_category=ErrorCategory.IMPLEMENTATION_ERROR), + ] + + suite = TestSuiteResult( + goal_id="goal_001", + total=3, + passed=0, + failed=3, + results=results, + ) + + impl_errors = suite.get_results_by_category(ErrorCategory.IMPLEMENTATION_ERROR) + assert len(impl_errors) == 2 + + +# ============================================================================ +# Storage Tests +# ============================================================================ + +class TestTestStorage: + """Tests for TestStorage.""" + + @pytest.fixture + def storage(self, tmp_path): + """Create a temporary storage instance.""" + return TestStorage(tmp_path) + + def test_save_and_load_test(self, storage): + """Test saving and loading a test.""" + test = Test( + id="test_001", + goal_id="goal_001", + parent_criteria_id="constraint_001", + test_type=TestType.CONSTRAINT, + test_name="test_something", + test_code="def test_something(agent): pass", + description="A test", + ) + + storage.save_test(test) + + loaded = storage.load_test("goal_001", "test_001") + assert loaded is not None + assert loaded.id == "test_001" + assert loaded.test_name == "test_something" + + def test_delete_test(self, storage): + """Test deleting a test.""" + test = Test( + id="test_001", + goal_id="goal_001", + parent_criteria_id="constraint_001", + test_type=TestType.CONSTRAINT, + test_name="test_something", + test_code="pass", + description="test", + ) + + storage.save_test(test) + assert storage.load_test("goal_001", "test_001") is not None + + storage.delete_test("goal_001", "test_001") + assert storage.load_test("goal_001", "test_001") is None + + def test_get_tests_by_goal(self, storage): + """Test querying tests by goal.""" + for i in range(3): + test = Test( + id=f"test_{i}", + goal_id="goal_001", + parent_criteria_id=f"constraint_{i}", + test_type=TestType.CONSTRAINT, + test_name=f"test_{i}", + test_code="pass", + description="test", + ) + storage.save_test(test) + + tests = storage.get_tests_by_goal("goal_001") + assert len(tests) == 3 + + def test_get_approved_tests(self, storage): + """Test querying approved tests.""" + # Create tests with different approval statuses + test1 = Test( + id="test_001", + goal_id="goal_001", + parent_criteria_id="c1", + test_type=TestType.CONSTRAINT, + test_name="test_1", + test_code="pass", + description="test", + ) + test1.approve() + storage.save_test(test1) + + test2 = Test( + id="test_002", + goal_id="goal_001", + parent_criteria_id="c2", + test_type=TestType.CONSTRAINT, + test_name="test_2", + test_code="pass", + description="test", + ) + # Leave pending + storage.save_test(test2) + + test3 = Test( + id="test_003", + goal_id="goal_001", + parent_criteria_id="c3", + test_type=TestType.CONSTRAINT, + test_name="test_3", + test_code="pass", + description="test", + ) + test3.modify("modified", "user") + storage.save_test(test3) + + approved = storage.get_approved_tests("goal_001") + assert len(approved) == 2 # approved and modified + + def test_save_and_load_result(self, storage): + """Test saving and loading test results.""" + result = TestResult( + test_id="test_001", + passed=True, + 
duration_ms=100, + ) + + storage.save_result("test_001", result) + + loaded = storage.get_latest_result("test_001") + assert loaded is not None + assert loaded.passed is True + assert loaded.duration_ms == 100 + + def test_result_history(self, storage): + """Test getting result history.""" + # Save multiple results + for i in range(5): + result = TestResult( + test_id="test_001", + passed=(i % 2 == 0), + duration_ms=100 + i, + ) + storage.save_result("test_001", result) + + history = storage.get_result_history("test_001", limit=3) + assert len(history) <= 3 + + def test_get_stats(self, storage): + """Test getting storage statistics.""" + test = Test( + id="test_001", + goal_id="goal_001", + parent_criteria_id="c1", + test_type=TestType.CONSTRAINT, + test_name="test_1", + test_code="pass", + description="test", + ) + test.approve() + storage.save_test(test) + + stats = storage.get_stats() + assert stats["total_tests"] == 1 + assert stats["by_approval"]["approved"] == 1 + + +# ============================================================================ +# Error Categorizer Tests +# ============================================================================ + +class TestErrorCategorizer: + """Tests for ErrorCategorizer.""" + + @pytest.fixture + def categorizer(self): + return ErrorCategorizer() + + def test_categorize_passed(self, categorizer): + """Test that passed results return None.""" + result = TestResult(test_id="t1", passed=True, duration_ms=100) + assert categorizer.categorize(result) is None + + def test_categorize_logic_error(self, categorizer): + """Test categorization of logic errors.""" + result = TestResult( + test_id="t1", + passed=False, + duration_ms=100, + error_message="goal not achieved: expected success criteria was not met", + ) + assert categorizer.categorize(result) == ErrorCategory.LOGIC_ERROR + + def test_categorize_implementation_error(self, categorizer): + """Test categorization of implementation errors.""" + result = TestResult( + test_id="t1", + passed=False, + duration_ms=100, + error_message="TypeError: 'NoneType' object has no attribute 'get'", + ) + assert categorizer.categorize(result) == ErrorCategory.IMPLEMENTATION_ERROR + + def test_categorize_edge_case(self, categorizer): + """Test categorization of edge cases.""" + result = TestResult( + test_id="t1", + passed=False, + duration_ms=100, + error_message="timeout: request took longer than expected", + ) + assert categorizer.categorize(result) == ErrorCategory.EDGE_CASE + + def test_categorize_from_stack_trace(self, categorizer): + """Test categorization from stack trace.""" + result = TestResult( + test_id="t1", + passed=False, + duration_ms=100, + error_message="Error occurred", + stack_trace="KeyError: 'missing_key'\n at line 42", + ) + assert categorizer.categorize(result) == ErrorCategory.IMPLEMENTATION_ERROR + + def test_get_fix_suggestion(self, categorizer): + """Test fix suggestions for each category.""" + assert "Goal" in categorizer.get_fix_suggestion(ErrorCategory.LOGIC_ERROR) + assert "code" in categorizer.get_fix_suggestion(ErrorCategory.IMPLEMENTATION_ERROR).lower() + assert "test" in categorizer.get_fix_suggestion(ErrorCategory.EDGE_CASE).lower() + + def test_get_iteration_guidance(self, categorizer): + """Test iteration guidance.""" + guidance = categorizer.get_iteration_guidance(ErrorCategory.LOGIC_ERROR) + assert guidance["stage"] == "Goal" + assert guidance["restart_required"] is True + + guidance = categorizer.get_iteration_guidance(ErrorCategory.IMPLEMENTATION_ERROR) + assert 
guidance["stage"] == "Agent" + assert guidance["restart_required"] is False + + +# ============================================================================ +# Parallel Runner Tests +# ============================================================================ + +class TestParallelRunner: + """Tests for ParallelTestRunner.""" + + @pytest.fixture + def runner(self, tmp_path): + """Create a test runner with temporary storage.""" + storage = TestStorage(tmp_path) + config = ParallelConfig(num_workers=1) # Sequential for testing + return ParallelTestRunner(config, storage) + + def test_create_suite_result(self, runner): + """Test creating suite result from individual results.""" + results = [ + TestResult(test_id="t1", passed=True, duration_ms=100), + TestResult(test_id="t2", passed=False, duration_ms=50), + ] + + suite = runner._create_suite_result("goal_001", results) + + assert suite.goal_id == "goal_001" + assert suite.total == 2 + assert suite.passed == 1 + assert suite.failed == 1 + assert suite.duration_ms == 150 + + +# ============================================================================ +# Debug Tool Tests +# ============================================================================ + +class TestDebugTool: + """Tests for DebugTool.""" + + @pytest.fixture + def debug_tool(self, tmp_path): + """Create a debug tool with temporary storage.""" + storage = TestStorage(tmp_path) + return DebugTool(storage) + + def test_analyze_missing_test(self, debug_tool): + """Test analyzing a non-existent test.""" + info = debug_tool.analyze("goal_001", "nonexistent") + + assert info.test_id == "nonexistent" + assert "not found" in info.error_message.lower() + + def test_analyze_with_result(self, debug_tool, tmp_path): + """Test analyzing a test with result.""" + storage = TestStorage(tmp_path) + + # Create and save test + test = Test( + id="test_001", + goal_id="goal_001", + parent_criteria_id="c1", + test_type=TestType.CONSTRAINT, + test_name="test_something", + test_code="pass", + description="A test", + input={"key": "value"}, + expected_output={"result": "expected"}, + ) + storage.save_test(test) + + # Create and save result + result = TestResult( + test_id="test_001", + passed=False, + duration_ms=100, + error_message="TypeError: something went wrong", + error_category=ErrorCategory.IMPLEMENTATION_ERROR, + ) + storage.save_result("test_001", result) + + # Create new debug tool with same storage + debug_tool = DebugTool(storage) + + info = debug_tool.analyze("goal_001", "test_001") + + assert info.test_id == "test_001" + assert info.test_name == "test_something" + assert not info.passed + assert info.error_category == "implementation_error" + assert info.suggested_fix is not None + + +# ============================================================================ +# Integration Tests +# ============================================================================ + +class TestIntegration: + """Integration tests for the testing framework.""" + + def test_full_workflow(self, tmp_path): + """Test a simplified full workflow.""" + storage = TestStorage(tmp_path) + + # 1. Create tests (simulating generation) + tests = [] + for i in range(3): + test = Test( + id=f"test_{i}", + goal_id="goal_001", + parent_criteria_id="constraint_001", + test_type=TestType.CONSTRAINT, + test_name=f"test_constraint_{i}", + test_code=f"def test_constraint_{i}(agent): assert True", + description=f"Test {i}", + ) + tests.append(test) + + # 2. 
Approve tests + for test in tests: + test.approve("user") + storage.save_test(test) + + # 3. Verify storage + approved = storage.get_approved_tests("goal_001") + assert len(approved) == 3 + + # 4. Simulate running tests + config = ParallelConfig(num_workers=1) + runner = ParallelTestRunner(config, storage) + + class MockAgent: + def run(self, input): + return {"success": True} + + results = runner.run_tests(approved, MockAgent()) + assert len(results) == 3 + + # 5. Save results + for result in results: + storage.save_result(result.test_id, result) + + # 6. Check stats + stats = storage.get_stats() + assert stats["total_tests"] == 3 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])