Merge branch 'staging' into feat/credential-manager

Committed by bryan on 2026-01-21 14:33:28 -08:00
44 changed files with 6451 additions and 360 deletions
+6 -1
View File
@@ -57,10 +57,15 @@ __pycache__/
.eggs/
*.egg
# Generated runtime data
core/data/
# Misc
*.local
.cache/
tmp/
temp/
exports/*
core/.agent-builder-sessions/*
+2 -2
View File
@@ -3,9 +3,9 @@
"agent-builder": {
"command": "python",
"args": ["-m", "framework.mcp.agent_builder_server"],
"cwd": "/Users/acho-admin/acho/local-oss/hive/core",
"cwd": "core",
"env": {
"PYTHONPATH": "/Users/acho-admin/acho/local-oss/hive/aden-tools/src"
"PYTHONPATH": "../aden-tools/src"
}
}
}
+16 -2
View File
@@ -29,6 +29,18 @@ import argparse
import os
import sys
# Suppress FastMCP banner in STDIO mode
if "--stdio" in sys.argv:
# Monkey-patch rich Console to redirect to stderr
import rich.console
_original_console_init = rich.console.Console.__init__
def _patched_console_init(self, *args, **kwargs):
kwargs['file'] = sys.stderr # Force all rich output to stderr
_original_console_init(self, *args, **kwargs)
rich.console.Console.__init__ = _patched_console_init
from fastmcp import FastMCP
from starlette.requests import Request
from starlette.responses import PlainTextResponse
@@ -51,7 +63,9 @@ mcp = FastMCP("aden-tools")
# Register all tools with the MCP server, passing credential manager
tools = register_all_tools(mcp, credentials=credentials)
print(f"[MCP] Registered {len(tools)} tools: {tools}")
# Only print to stdout in HTTP mode (STDIO mode requires clean stdout for JSON-RPC)
if "--stdio" not in sys.argv:
    print(f"[MCP] Registered {len(tools)} tools: {tools}")
@mcp.custom_route("/health", methods=["GET"])
@@ -88,7 +102,7 @@ def main() -> None:
args = parser.parse_args()
if args.stdio:
    print("[MCP] Starting with STDIO transport")
    # STDIO mode: only JSON-RPC messages go to stdout
    mcp.run(transport="stdio")
else:
    print(f"[MCP] Starting HTTP server on {args.host}:{args.port}")
@@ -9,9 +9,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def apply_diff(path: str, diff_text: str, workspace_id: str, agent_id: str, session_id: str) -> dict:
"""
Apply a diff to a file within the session sandbox.
Purpose
Apply a structured diff to update a file while preserving context.
Use this when you need to apply structured diff patches to modify file content.
When to use
Larger but still controlled updates
Refactoring structured memory (tables, sections)
Automated compaction or cleanup passes
Rules & Constraints
Diff must be context-aware
Rejected if it touches restricted sections
Prefer apply_patch for small changes
Args:
path: The path to the file (relative to session root)
@@ -9,10 +9,21 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def apply_patch(path: str, patch_text: str, workspace_id: str, agent_id: str, session_id: str) -> dict:
"""
Apply a patch to a file within the session sandbox.
Purpose
Apply a scoped, line-level modification to an existing file.
Use this when you need to apply patch-formatted changes to a file.
This is an alias for apply_diff with the same functionality.
When to use
Update curated canonical memory
Fix or refine existing summaries or facts
Remove duplication or stale information
Rules & Constraints
Patch must be small and targeted
Must preserve unrelated content
Only allowed on approved files and sections
Best practice
Always read the file first. Never patch blindly.
Args:
path: The path to the file (relative to session root)
@@ -10,10 +10,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def execute_command_tool(command: str, workspace_id: str, agent_id: str, session_id: str, cwd: Optional[str] = None) -> dict:
"""
Execute a shell command within the session sandbox.
Purpose
Execute a shell command within the session sandbox.
Use this when you need to run shell commands safely within the sandboxed environment.
Commands are executed with a 60-second timeout.
When to use
Run validators or linters
Generate derived artifacts (indexes, summaries)
Perform controlled maintenance tasks
Rules & Constraints
No network access unless explicitly allowed
No destructive commands (rm -rf, system modification)
Output must be treated as data, not truth
Args:
command: The shell command to execute
@@ -9,10 +9,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def grep_search(path: str, pattern: str, workspace_id: str, agent_id: str, session_id: str, recursive: bool = False) -> dict:
"""
Search for a pattern in a file or directory within the session sandbox.
Purpose
Search for a regex pattern in files within the session sandbox.
Use this when you need to find specific content or patterns in files using regex.
Set recursive=True to search through all subdirectories.
When to use
Find specific content or patterns across files
Locate references to variables, functions, or terms
Search through logs or data files for matching entries
Rules & Constraints
Pattern must be a valid regex expression
Set recursive=True to search through subdirectories
Binary files and permission-denied files are skipped
Args:
path: The path to search in (file or directory, relative to session root)
@@ -8,10 +8,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def list_dir(path: str, workspace_id: str, agent_id: str, session_id: str) -> dict:
"""
List the contents of a directory within the session sandbox.
Purpose
List the contents of a directory within the session sandbox.
Use this when you need to explore directory contents and see what files
and subdirectories exist.
When to use
Explore directory structure and contents
Discover available files and subdirectories
Verify file existence before reading or writing
Rules & Constraints
Path must point to an existing directory
Returns file names, types, and sizes
Does not recurse into subdirectories
Args:
path: The directory path (relative to session root)
@@ -8,10 +8,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def replace_file_content(path: str, target: str, replacement: str, workspace_id: str, agent_id: str, session_id: str) -> dict:
"""
Replace content in a file within the session sandbox.
Purpose
Replace all occurrences of a target string with replacement text in a file.
Use this when you need to perform find-and-replace operations on file content.
All occurrences of the target string will be replaced.
When to use
Fixing repeated errors or typos
Updating deprecated terms or placeholders
Refactoring simple patterns across a file
Rules & Constraints
Target must exist in file
Replacement must be intentional
No regex or complex logic - pure string replacement
Args:
path: The path to the file (relative to session root)
@@ -8,9 +8,18 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def view_file(path: str, workspace_id: str, agent_id: str, session_id: str) -> dict:
"""
Read the content of a file within the session sandbox.
Purpose
Read the content of a file within the session sandbox.
Use this when you need to view the contents of an existing file.
When to use
Inspect file contents before making changes
Retrieve stored data or configuration
Review logs or artifacts
Rules & Constraints
File must exist at the specified path
Returns full content with size and line count
Always read before patching or modifying
Args:
path: The path to the file (relative to session root)
@@ -8,10 +8,21 @@ def register_tools(mcp: FastMCP) -> None:
@mcp.tool()
def write_to_file(path: str, content: str, workspace_id: str, agent_id: str, session_id: str, append: bool = False) -> dict:
"""
Write content to a file within the session sandbox.
Purpose
Create a new file or append content to an existing file.
Use this when you need to create a new file or overwrite an existing file.
Set append=True to add content to the end of an existing file.
When to use
Append new events to append-only logs
Create new artifacts or summaries
Initialize new canonical memory files
Rules & Constraints
Must not overwrite canonical memory unless explicitly allowed
Should include structured data (JSON, Markdown with headers)
Every write must be intentional and minimal
Anti-pattern
Do NOT dump raw conversation transcripts without structure or reason.
Args:
path: The path to the file (relative to session root)
+227 -10
View File
@@ -10,9 +10,11 @@ Build goal-driven agents that use LLM reasoning to accomplish tasks.
## Quick Start
1. Define the goal (what success looks like)
2. Add nodes (units of work)
3. Connect with edges (flow between nodes)
4. Validate and test
2. Generate constraint tests from goal → Approve tests
3. Add nodes (units of work) - validate against constraint tests
4. Connect with edges (flow between nodes)
5. Validate and test graph
6. Handoff to testing-agent skill for final evaluation
## Core Concepts
@@ -117,10 +119,15 @@ For each component (goal, node, edge):
```
Agent Build Progress:
GOAL STAGE:
- [ ] Define goal with success criteria → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Define goal constraints → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add entry node → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add each processing node → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Generate constraint tests → ASK APPROVAL (clickable: Approve/Reject/Skip) for each test ✓ (NEW)
AGENT STAGE:
- [ ] Add entry node → TEST NODE → VALIDATE AGAINST CONSTRAINTS → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add each processing node → TEST NODE → VALIDATE AGAINST CONSTRAINTS → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add pause nodes (if HITL needed) → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add resume entry points (for pause nodes) → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
- [ ] Add terminal node(s) → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
@@ -129,6 +136,11 @@ Agent Build Progress:
- [ ] Validate full graph → TEST GRAPH → SHOW RESULTS
- [ ] Final approval → ASK APPROVAL (clickable: Approve & Export/Reject/Pause) ✓
- [ ] Export to exports/{agent-name}/
EVAL STAGE (handoff to testing-agent skill):
- [ ] Generate success criteria tests → ASK APPROVAL (clickable: Approve/Reject/Skip) for each test ✓
- [ ] Run all tests (constraint + success criteria)
- [ ] Debug failures and iterate
```
### Testing During Approval
@@ -147,6 +159,31 @@ Show the human:
- What tools are available
- What outputs will be written
**Validate against constraint tests** (if available):
After approving constraint tests, reference them during node development:
```python
# When presenting a node for approval, show constraint alignment:
"""
**NODE: search_node**
Test Results: [test_node output]
Constraint Test Alignment:
✓ test_constraint_api_limits_respected
→ Node uses rate-limited tool wrapper ✓
✓ test_constraint_content_safety_filter
→ Output includes safety_score field ✓
Validation: ✅ PASS
"""
```
**IMPORTANT**: Constraint tests may not fully execute until the agent is complete,
but their test definitions guide node design. Review the test code to ensure
your nodes handle the constraint scenarios.
**Before final approval**, use `test_graph` to simulate full execution:
```
test_graph(
@@ -425,6 +462,7 @@ Goal(
description="What the agent must NOT do",
constraint_type="hard", # hard = must not violate
category="safety",
check="llm_judge", # Optional: how to validate ("llm_judge", expression, or function)
),
],
)
@@ -433,6 +471,98 @@ Goal(
**Good goals**: Specific, measurable, constrained
**Bad goals**: Vague, unmeasurable, no boundaries
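For example (illustrative): "Find 3-5 relevant YouTube videos on a topic without exceeding API rate limits" is specific and measurable; "help users find good videos" is neither.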
## Constraint Test Generation
**CRITICAL**: After approving the goal, generate constraint tests BEFORE building nodes.
Constraint tests verify that the agent will respect its defined constraints (safety, rate limits, etc.).
These tests are **agent-agnostic** - they test boundaries, not implementation. This means they can be
generated before any nodes exist.
### Why Generate Tests Before Building?
1. **Early Validation**: Catch constraint violations during node development, not after
2. **Design Guidance**: Tests make constraints concrete and testable
3. **Incremental Feedback**: Review constraint tests while designing each node
### Generation Workflow
```python
# 1. After goal is approved, generate constraint tests
result = generate_constraint_tests(
goal_id=goal_data["id"],
goal_json=json.dumps(goal_data)
)
# 2. Tests are returned with PENDING status
# The MCP tool returns approval_required=True
# 3. Display each test to the human for approval, e.g.:
#    [1/3] test_constraint_api_limits_respected
#    Constraint: api_limits
#    Confidence: 88%
#    def test_constraint_api_limits_respected(agent):
#        ...
#    [a]pprove  [r]eject  [e]dit  [s]kip
# 4. Use AskUserQuestion with approval options
AskUserQuestion(
    questions=[{
        "question": "Do you approve this constraint test?",
        "header": "Test Approval",
        "options": [
            {"label": "✓ Approve (Recommended)", "description": "Test looks good"},
            {"label": "✗ Reject", "description": "Test is invalid"},
            {"label": "✎ Edit", "description": "Modify before accepting"},
            {"label": "⏭ Skip", "description": "Decide later"}
        ],
        "multiSelect": False
    }]
)
# 5. Call approve_tests with the decisions
approve_tests(
goal_id=goal_data["id"],
approvals='[{"test_id": "...", "action": "approve"}, ...]'
)
# 6. Verify no pending tests before proceeding to nodes
pending = get_pending_tests(goal_id=goal_data["id"])
if json.loads(pending)["pending_count"] > 0:
    # Prompt user to handle remaining tests
    print("⚠️ Pending tests must be resolved before building nodes")
```
### Approval Rules
- **All tests must be reviewed** - no auto-approval
- **Approved/Modified tests are stored** for use during node validation
- **Rejected tests are not stored** (with reason tracked)
- **Skipped tests remain pending** - must be resolved before export
### Using Constraint Tests During Node Building
Once constraint tests are approved, reference them when designing nodes:
```python
# Before adding a node that makes API calls, review constraint tests:
"""
Creating node: search_node (llm_tool_use)
Tools: youtube_search, video_details
Constraint Test Review:
✓ test_constraint_api_limits_respected - checks rate limits
→ Ensure search_node handles rate limit responses gracefully
✓ test_constraint_content_safety_filter - checks safe content
→ Ensure output_keys include safety flags for filtering
"""
```
## Integrating External Tools (MCP Servers)
Before adding nodes, you can register MCP servers to make their tools available to your agent.
@@ -772,11 +902,29 @@ analyze → needs_clarification? → YES → request-clarification (PAUSE)
| `export_graph` | Export the completed agent |
| `get_session_status` | View current build progress |
### Testing Tools (for HITL approval)
| Tool | Purpose |
|------|---------|
| `test_node` | Run a single node with sample inputs to show behavior |
| `test_graph` | Simulate full graph execution to show the complete flow |
### Testing Tools by Stage
#### Goal Stage (this skill) - Generate constraint tests
| Tool | Purpose | When to Use |
|------|---------|-------------|
| `generate_constraint_tests` | Generate tests from constraints | Immediately after goal approval |
| `approve_tests` | Approve/reject/modify tests | After generation, before building nodes |
| `get_pending_tests` | List tests awaiting approval | Before proceeding to node building |
#### Agent Stage (this skill) - Build and validate nodes
| Tool | Purpose | When to Use |
|------|---------|-------------|
| `test_node` | Run a single node with sample inputs | Before each node approval |
| `test_graph` | Simulate full graph execution | Before final approval |
#### Eval Stage (testing-agent skill) - Final evaluation
| Tool | Purpose | When to Use |
|------|---------|-------------|
| `generate_success_tests` | Generate tests from success criteria | After agent export |
| `run_tests` | Run all tests in parallel | After test approval |
| `debug_test` | Debug failed tests | After test failures |
See the [testing-agent skill](../testing-agent/SKILL.md) for the full Eval stage workflow.
## Using the Exported Agent
@@ -917,3 +1065,72 @@ result = await runner.run(context)
```
For complete API details, see [reference/api.md](reference/api.md).
## Handoff to Testing-Agent Skill
After exporting the agent, switch to the **testing-agent** skill for final evaluation (Eval Stage).
### What Transfers
1. **Goal definition** (with constraints and success criteria)
2. **Approved constraint tests** (generated in Goal Stage)
3. **Exported agent** at `exports/{agent-name}/`
### What Happens in Testing-Agent
1. Generate **success criteria tests** (these need agent details, so generated after build)
2. Run **all tests** (constraint + success criteria) in parallel
3. Debug failures and categorize errors
4. Iterate based on error type
### Triggering the Handoff
After `export_graph` completes successfully, display:
```
✅ Agent exported to exports/{agent-name}/
Next Steps (Eval Stage):
1. Switch to testing-agent skill
2. Generate success criteria tests
3. Run full evaluation
4. Debug any failures
Command: "Run /testing-agent for exports/{agent-name}"
```
### Error Category Routing
If tests fail in the Eval stage, the error category determines where to go:
| Error Category | Meaning | Action |
|---------------|---------|--------|
| `LOGIC_ERROR` | Goal definition is wrong | Return to Goal Stage - update goal, regenerate constraint tests |
| `IMPLEMENTATION_ERROR` | Code bug in nodes/edges | Return to Agent Stage - fix nodes/edges, re-export |
| `EDGE_CASE` | New scenario discovered | Stay in Eval Stage - add edge case test, continue |
### Flow Diagram
```
┌──────────────────────────────────────────────────────────────┐
│ GOAL STAGE (building-agents skill) │
│ 1. Define success_criteria and constraints → APPROVE │
│ 2. Generate CONSTRAINT TESTS from constraints │
│ 3. APPROVE each constraint test │
└──────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────┐
│ AGENT STAGE (building-agents skill) │
│ 1. Add nodes - review constraint tests for design guidance │
│ 2. Test each node - validate against constraint expectations│
│ 3. Connect edges → Validate graph → Export │
└──────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────┐
│ EVAL STAGE (testing-agent skill) │
│ 1. Generate SUCCESS_CRITERIA TESTS → APPROVE │
│ 2. Run ALL tests (constraint + success criteria) │
│ 3. Debug failures → Categorize errors │
│ 4. Route back based on error category (if needed) │
└──────────────────────────────────────────────────────────────┘
```
+625
View File
@@ -0,0 +1,625 @@
---
name: testing-agent
description: Run goal-based evaluation tests for agents. Use when you need to verify an agent meets its goals, debug failing tests, or iterate on agent improvements based on test results.
---
# Testing Agents
Run goal-based evaluation tests for agents built with the building-agents skill.
## Quick Start
1. **Check existing state first** - See if tests already exist
2. Generate tests from goal (only if needed)
3. Approve tests (mandatory human approval)
4. Run tests against agent
5. Debug failures and iterate
## Check Existing State First
**CRITICAL**: Before generating any tests, ALWAYS check if tests already exist for the goal.
```python
# Check what tests exist for this goal
result = list_tests(goal_id="youtube-research")
# Returns:
{
"goal_id": "youtube-research",
"total": 42,
"by_status": {
"pending": 10,
"approved": 30,
"modified": 2,
"rejected": 0
},
"by_type": {
"constraint": 15,
"success_criteria": 25,
"edge_case": 2
},
"tests": [...] # List of test summaries
}
```
### Decision Tree
Based on existing state, choose the right action:
```
list_tests(goal_id) → Check existing tests
┌───────┴────────────────────────────────────────┐
│ │
No tests exist Tests exist
│ │
↓ ┌─────────┴─────────┐
Generate tests │ │
(constraint first, Has pending All approved
then success_criteria) tests │
│ ↓
↓ Run tests
Approve pending directly
tests first
```
### Resuming a Testing Session
When the user asks to test an agent that may have been tested before:
1. **Always check first**: `list_tests(goal_id="...")`
2. **Show the user what exists**:
- "Found 42 existing tests: 30 approved, 10 pending, 2 modified"
- "Last run: 28/30 passed (93.3%)"
3. **Ask what they want to do**:
```python
AskUserQuestion(
    questions=[{
        "question": "Tests already exist for this agent. What would you like to do?",
        "header": "Existing Tests",
        "options": [
            {
                "label": "Run existing tests (Recommended)",
                "description": "Run the 32 approved tests against the agent"
            },
            {
                "label": "Approve pending tests",
                "description": "Review and approve the 10 pending tests first"
            },
            {
                "label": "Regenerate all tests",
                "description": "Delete existing and generate fresh tests (loses approvals)"
            },
            {
                "label": "Show test details",
                "description": "List all tests with their status and last results"
            }
        ],
        "multiSelect": False
    }]
)
```
### Why This Matters
- **Saves time**: Approved tests don't need re-approval
- **Preserves work**: User's previous approvals/modifications are kept
- **Clear state**: User knows exactly what exists before taking action
- **Prevents duplicates**: Won't generate tests that already exist
## Core Concepts
**Test Types**: Three types of tests, generated at different stages:
- `constraint` - Generated during Goal stage (agent-agnostic boundaries)
- `success_criteria` - Generated during Eval stage (after agent exists)
- `edge_case` - Generated when new scenarios discovered during debugging
**Approval**: All LLM-generated tests require explicit user approval before running.
**Error Categories**: Failed tests are categorized to guide iteration:
- `LOGIC_ERROR` - Goal definition is wrong → Update goal, restart full flow
- `IMPLEMENTATION_ERROR` - Code bug → Fix agent, re-run Eval
- `EDGE_CASE` - New scenario discovered → Add test, continue Eval
**Iteration**: Each error category has a specific fix path (see Error Categorization section).
## Workflow (HITL Required)
**CRITICAL**: Each step requires human approval before proceeding.
**CRITICAL**: Use structured questions (AskUserQuestion) with fallback to text mode.
### Approval Strategy
**Always try structured questions first**, with graceful fallback:
1. **Attempt**: Call AskUserQuestion with clickable options
2. **Catch**: If tool fails/rejected, fall back to text prompt
3. **Parse**: Accept text input like "approve", "reject", "skip"
This ensures the workflow works in all environments (VSCode extension, CLI, web).
### Test Loop
```
For each test generated:
1. DISPLAY → Show the test details to the human
2. VALIDATE → Check test syntax and structure
3. ASK APPROVAL → Use AskUserQuestion with clickable options
4. Only run tests after approval
```
### Checklist (ask approval at each check)
```
Agent Testing Progress:
- [ ] Load goal and agent → VERIFY PATHS
- [ ] CHECK EXISTING TESTS → list_tests, show stats, ask what to do
- [ ] If no tests OR user wants fresh: Generate tests → ASK APPROVAL
- [ ] If pending tests exist: Approve pending tests first
- [ ] Run all approved tests → SHOW RESULTS
- [ ] Debug failed tests → SHOW CATEGORIZATION
- [ ] Iterate based on category → ASK APPROVAL for changes
```
## The Three-Stage Flow
```
┌─────────────────────────────────────────────────────────────────────────┐
│ GOAL STAGE │
│ 1. Define success_criteria and constraints (building-agents skill) │
│ 2. Generate CONSTRAINT TESTS → USER APPROVAL → tests stored │
└─────────────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────┐
│ AGENT STAGE │
│ Build nodes + edges (building-agents skill) │
│ Constraint tests can run during development for early feedback │
└─────────────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────┐
│ EVAL STAGE (this skill) │
│ 1. Generate SUCCESS_CRITERIA TESTS → USER APPROVAL → tests stored │
│ 2. Run all tests in parallel → pass/fail summary │
│ 3. On failure → Debug tool with categorization │
│ 4. Iterate based on error category │
└─────────────────────────────────────────────────────────────────────────┘
```
## Test Generation
### When to Generate Each Type
| Test Type | When Generated | Why |
|-----------|----------------|-----|
| **Constraint Tests** | During Goal stage (before agent exists) | Constraints are agent-agnostic boundaries |
| **Success Criteria Tests** | During Eval stage (after agent exists) | May depend on agent flow/nodes |
| **Edge Case Tests** | During debugging (when new scenario found) | Discovered through test failures |
### Generating Tests
```python
import json
# 1. Generate constraint tests (Goal stage)
result = generate_constraint_tests(
goal_id="youtube-research",
goal_json=json.dumps({
"id": "youtube-research",
"name": "YouTube Research Agent",
"description": "Find relevant YouTube videos on a topic",
"success_criteria": [
{
"id": "find_videos",
"description": "Find 3-5 relevant videos",
"metric": "video_count",
"target": "3-5",
"weight": 1.0
}
],
"constraints": [
{
"id": "api_limits",
"description": "Must respect YouTube API rate limits",
"constraint_type": "hard",
"category": "reliability",
"check": "llm_judge" # Optional: how to validate
}
]
})
)
# 2. Generate success criteria tests (Eval stage, after agent built)
result = generate_success_tests(
goal_id="youtube-research",
goal_json='...', # Same structure as above
node_names="search_node,filter_node,format_node",
tool_names="youtube_search,video_details"
)
```
**After generation**, tests are stored as PENDING. They must be approved before running.
## Approval Patterns
### Interactive Approval Flow
```
┌─────────────────────────────────────────────────────────────────┐
│ Generated Tests for: youtube-research (3 tests) │
├─────────────────────────────────────────────────────────────────┤
│ [1/3] test_find_videos_happy_path │
│ Type: SUCCESS_CRITERIA │
│ Confidence: 92% │
│ Input: {"topic": "machine learning tutorials"} │
│ Expected: 3-5 videos with titles and IDs │
│ │
│ def test_find_videos_happy_path(agent): │
│ result = agent.run({"topic": "machine learning"}) │
│ assert 3 <= len(result.videos) <= 5 │
│ assert all(v.title for v in result.videos) │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
└─────────────────────────────────────────────────────────────────┘
```
### Approval Actions
| Action | Description | Result |
|--------|-------------|--------|
| **approve** | Accept test as-is | Status → APPROVED, test will run |
| **reject** | Decline with reason | Status → REJECTED, test won't run |
| **edit** | Modify code before accepting | Status → MODIFIED, original preserved |
| **skip** | Leave for later | Status → PENDING, decide later |
### Approval Code Pattern
```python
# After generating tests, approve them
result = approve_tests(
goal_id="youtube-research",
approvals='''[
    {"test_id": "test_001", "action": "approve"},
    {"test_id": "test_002", "action": "modify", "modified_code": "def test_..."},
    {"test_id": "test_003", "action": "reject", "reason": "Not a valid scenario"},
    {"test_id": "test_004", "action": "skip"}
]'''
)
```
### Structured Approval Questions
```python
# Try structured approval first
try:
    response = AskUserQuestion(
        questions=[{
            "question": "Do you approve this test?",
            "header": "Test Approval",
            "options": [
                {
                    "label": "Approve (Recommended)",
                    "description": "Test looks good, include in test suite"
                },
                {
                    "label": "Reject",
                    "description": "Test is invalid or unnecessary"
                },
                {
                    "label": "Edit",
                    "description": "Modify the test code before accepting"
                },
                {
                    "label": "Skip",
                    "description": "Decide later, leave as pending"
                }
            ],
            "multiSelect": False
        }]
    )
except Exception:
    # Fallback to text mode
    print("Do you approve this test? Type: approve | reject | edit | skip")
```
## Test Execution
### Parallel Configuration
```python
# Tests run in parallel with these defaults
ParallelConfig(
num_workers=cpu_count(), # Use all CPU cores
timeout_per_test=60.0, # 60 seconds per test
fail_fast=False, # Run all tests, don't stop on first failure
mode="loadfile", # Group tests by parent_criteria_id
)
```
### Running Tests
```python
# Run all approved tests
result = run_tests(
goal_id="youtube-research",
agent_path="exports/youtube-agent",
test_types='["all"]', # or ["constraint", "success_criteria", "edge_case"]
parallel=4, # Number of workers
fail_fast=False # Run all tests
)
# Result structure
{
"goal_id": "youtube-research",
"overall_passed": false,
"summary": {
"total": 15,
"passed": 12,
"failed": 3,
"pass_rate": "80.0%"
},
"duration_ms": 5432,
"results": [
{"test_id": "test_001", "passed": true, "duration_ms": 234},
{"test_id": "test_002", "passed": false, "duration_ms": 567, "error_category": "IMPLEMENTATION_ERROR"},
...
]
}
```
### Execution Flow
1. Load only APPROVED and MODIFIED tests (skip PENDING and REJECTED)
2. Group tests by `parent_criteria_id` for shared fixture setup
3. Run groups in parallel with process isolation
4. Aggregate results with timing information
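A minimal sketch of this flow, assuming tests are plain dicts with a picklable callable under `"fn"`; the framework's `ParallelTestRunner` is the real implementation and may differ:
```python
import time
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor

def _run_group(group):
    # Each group runs in its own worker process for isolation.
    results = []
    for test in group:
        start = time.monotonic()
        try:
            test["fn"]()  # the approved test callable
            passed = True
        except Exception:
            passed = False
        results.append({
            "test_id": test["id"],
            "passed": passed,
            "duration_ms": int((time.monotonic() - start) * 1000),
        })
    return results

def run_approved(tests, num_workers=4):
    # 1. Load only APPROVED and MODIFIED tests.
    runnable = [t for t in tests if t["status"] in ("approved", "modified")]
    # 2. Group by parent_criteria_id so related tests share fixture setup.
    groups = defaultdict(list)
    for t in runnable:
        groups[t["parent_criteria_id"]].append(t)
    # 3. Run groups in parallel; 4. aggregate results with timing info.
    with ProcessPoolExecutor(max_workers=num_workers) as pool:
        return [r for rs in pool.map(_run_group, groups.values()) for r in rs]
```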
## Error Categorization & Iteration
### Decision Tree
```
Test Fails → Categorize Error
┌───────────┴─────────────────┬────────────────────┐
│ │ │
LOGIC ERROR IMPLEMENTATION ERROR EDGE CASE
(criteria wrong) (code bug) (new scenario)
│ │ │
↓ ↓ ↓
Update goal Fix nodes/edges Generate new
success_criteria in Agent stage edge case test
↓ ↓ │
FULL 3-STEP Re-run Eval Continue in
FLOW RESTART (skip Goal stage) Eval stage
```
### Pattern-Based Heuristics
The categorizer uses these patterns to classify errors:
**LOGIC_ERROR** (goal definition is wrong):
- "goal not achieved"
- "constraint violated: core"
- "fundamental assumption"
- "success criteria mismatch"
- "expected behavior incorrect"
**IMPLEMENTATION_ERROR** (code bug in agent):
- TypeError, AttributeError, KeyError, ValueError
- "tool call failed"
- "node execution error"
- "assertion failed"
- "null pointer", "undefined"
**EDGE_CASE** (new scenario discovered):
- "boundary condition"
- "timeout", "rate limit"
- "empty result", "no results"
- "unexpected format"
- "rare input", "unusual"
### Iteration Guidance
```python
# After categorization, you get guidance
{
"error_category": "IMPLEMENTATION_ERROR",
"iteration_guidance": {
"stage": "Agent",
"action": "Fix the code in nodes/edges",
"restart_required": false,
"description": "The goal is correct, but the implementation has a bug. Fix the agent code and re-run Eval."
}
}
```
| Category | Go To Stage | Restart Required | Action |
|----------|-------------|------------------|--------|
| LOGIC_ERROR | Goal | Yes | Update success_criteria/constraints, rebuild agent |
| IMPLEMENTATION_ERROR | Agent | No | Fix nodes/edges, re-run Eval only |
| EDGE_CASE | Eval | No | Generate edge case test, continue in Eval |
## Debugging Failed Tests
### Debug Tool
```python
# Get detailed debug info for a failed test
result = debug_test(
goal_id="youtube-research",
test_id="test_find_videos_no_results"
)
# Returns comprehensive debug info
{
"test_id": "test_find_videos_no_results",
"test_name": "test_find_videos_no_results",
"input": {"topic": "xyzabc123nonsense"},
"expected": {"videos": [], "message": "No results found"},
"actual": {"error": "NullPointerException at node_3"},
"passed": false,
"error_message": "TypeError: 'NoneType' has no attribute 'get'",
"error_category": "IMPLEMENTATION_ERROR",
"stack_trace": "Traceback (most recent call last):\n ...",
"logs": [
{"timestamp": "...", "node": "search_node", "level": "INFO", "msg": "..."},
{"timestamp": "...", "node": "filter_node", "level": "ERROR", "msg": "..."}
],
"runtime_data": {
"execution_path": ["start", "search_node", "filter_node"],
"node_outputs": {...}
},
"suggested_fix": "Check null handling in filter_node when no results returned",
"iteration_guidance": {
"stage": "Agent",
"action": "Fix the code in nodes/edges",
"restart_required": false
}
}
```
### Debug Workflow
1. **Run all tests** → Get pass/fail summary
2. **Select failed test** → Get detailed DebugInfo
3. **Review categorization** → Understand error type
4. **Check suggested fix** → Get actionable guidance
5. **Follow iteration guidance** → Go to correct stage
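A sketch wiring these steps together, assuming `run_tests` and `debug_test` return JSON strings shaped like the examples above:
```python
import json

# Steps 1-2: run the suite, then pull debug info for each failure.
suite = json.loads(run_tests(
    goal_id="youtube-research",
    agent_path="exports/youtube-agent",
    test_types='["all"]',
))
for r in suite["results"]:
    if not r["passed"]:
        info = json.loads(debug_test(goal_id="youtube-research", test_id=r["test_id"]))
        # Steps 3-5: review category and suggested fix, then follow iteration_guidance.
        print(r["test_id"], info["error_category"], "->", info["suggested_fix"])
```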
## Example: Testing YouTube Agent
See [examples/testing-youtube-agent.md](examples/testing-youtube-agent.md) for a complete walkthrough.
## Common Patterns
### Happy Path Tests
Test normal successful execution with valid inputs:
```python
def test_find_videos_happy_path(agent):
    result = agent.run({"topic": "python tutorials"})
    assert result.success
    assert len(result.videos) >= 3
    assert all(v.title for v in result.videos)
```
### Boundary Condition Tests
Test exactly at target thresholds:
```python
def test_find_videos_minimum_count(agent):
    result = agent.run({"topic": "very specific niche topic"})
    assert len(result.videos) >= 1  # At least one result
```
### Error Handling Tests
Test graceful handling of failures:
```python
def test_find_videos_invalid_input(agent):
    result = agent.run({"topic": ""})  # Empty input
    assert not result.success or result.message == "Invalid input"
```
### Constraint Violation Tests
Test that constraints are respected:
```python
def test_api_rate_limit_respected(agent):
    # Run multiple times quickly
    for _ in range(5):
        result = agent.run({"topic": "test"})
        # Should not hit rate limit errors
        assert "rate limit" not in str(result).lower()
```
## Anti-Patterns
| Don't | Do Instead |
|-------|------------|
| Auto-approve tests | Always require explicit user approval |
| Run PENDING/REJECTED tests | Only run APPROVED/MODIFIED tests |
| Generate success tests during Goal stage | Wait until agent exists |
| Treat all failures the same | Categorize and iterate appropriately |
| Restart full flow for IMPLEMENTATION_ERROR | Fix agent, re-run Eval only |
| Add test for LOGIC_ERROR | Fix the goal definition instead |
| Ignore confidence scores | Review low-confidence categorizations manually |
| Skip the approval step | Tests must be reviewed before running |
## Tools Reference
### Testing Tools
| Tool | Purpose | When to Use |
|------|---------|-------------|
| `generate_constraint_tests` | Generate tests from goal constraints | Goal stage |
| `generate_success_tests` | Generate tests from success criteria | Eval stage (after agent built) |
| `approve_tests` | Approve/reject/modify generated tests | After generation |
| `run_tests` | Execute tests in parallel | After approval |
| `debug_test` | Analyze failed test with categorization | After test fails |
| `list_tests` | List tests for a goal by status | Anytime |
| `get_pending_tests` | Get tests awaiting approval | Before approval |
### Building Tools (for iteration)
When iteration requires modifying the agent, use these from the building-agents skill:
| Tool | Purpose | When to Use |
|------|---------|-------------|
| `set_goal` | Update goal definition | LOGIC_ERROR iteration |
| `add_node` | Add or modify nodes | IMPLEMENTATION_ERROR iteration |
| `add_edge` | Add or modify edges | IMPLEMENTATION_ERROR iteration |
| `validate_graph` | Validate changes | After any modification |
| `export_graph` | Re-export agent | After fixes complete |
## CLI Commands
```bash
# Generate tests from goal
python -m core test-generate goal.json --type all
# Interactive approval of pending tests
python -m core test-approve <goal_id>
# Run tests for an agent
python -m core test-run <agent_path> --goal <goal_id> --parallel 4
# Debug a failed test
python -m core test-debug <goal_id> <test_id>
# List tests by status
python -m core test-list <goal_id> --status approved
# Show test statistics
python -m core test-stats <goal_id>
```
## Integration with building-agents
### Handoff Points
| Scenario | From | To | Action |
|----------|------|-----|--------|
| Agent built, ready to test | building-agents | testing-agent | Generate success tests |
| LOGIC_ERROR found | testing-agent | building-agents | Update goal, rebuild |
| IMPLEMENTATION_ERROR found | testing-agent | building-agents | Fix nodes/edges |
| EDGE_CASE found | testing-agent | testing-agent | Generate edge case test |
| All tests pass | testing-agent | Done | Agent is validated |
### When to Switch Skills
**Use building-agents when:**
- Defining goals and constraints
- Building agent nodes and edges
- Fixing LOGIC_ERROR or IMPLEMENTATION_ERROR
**Use testing-agent when:**
- Generating tests from goals
- Approving and running tests
- Debugging failures
- Categorizing errors
### Shared Patterns
Both skills use:
- AskUserQuestion with structured options
- HITL at every critical step
- Fallback to text mode when widgets unavailable
- Session state management for continuity
@@ -0,0 +1,348 @@
# Example: Testing a YouTube Research Agent
This example walks through testing a YouTube research agent that finds relevant videos based on a topic.
## Prerequisites
- Agent built with building-agents skill at `exports/youtube-research/`
- Goal defined with success criteria and constraints
## Step 1: Load the Goal
First, load the goal that was defined during the Goal stage:
```json
{
"id": "youtube-research",
"name": "YouTube Research Agent",
"description": "Find relevant YouTube videos on a given topic",
"success_criteria": [
{
"id": "find_videos",
"description": "Find 3-5 relevant videos",
"metric": "video_count",
"target": "3-5",
"weight": 1.0
},
{
"id": "relevance",
"description": "Videos must be relevant to the topic",
"metric": "relevance_score",
"target": ">0.8",
"weight": 0.8
}
],
"constraints": [
{
"id": "api_limits",
"description": "Must not exceed YouTube API rate limits",
"constraint_type": "hard",
"category": "technical"
},
{
"id": "content_safety",
"description": "Must filter out inappropriate content",
"constraint_type": "hard",
"category": "safety"
}
]
}
```
## Step 2: Generate Constraint Tests
During the Goal stage (or early Eval), generate tests for constraints:
```python
result = generate_constraint_tests(
goal_id="youtube-research",
goal_json='<goal JSON above>'
)
```
**Generated tests (awaiting approval):**
```
┌─────────────────────────────────────────────────────────────────┐
│ Generated Constraint Tests (2 tests) │
├─────────────────────────────────────────────────────────────────┤
│ [1/2] test_constraint_api_limits_respected │
│ Constraint: api_limits │
│ Confidence: 88% │
│ │
│ def test_constraint_api_limits_respected(agent): │
│ """Verify API rate limits are not exceeded.""" │
│ import time │
│ for i in range(10): │
│ result = agent.run({"topic": f"test_{i}"}) │
│ time.sleep(0.1) │
│ # Should complete without rate limit errors │
│ assert "rate limit" not in str(result).lower() │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
├─────────────────────────────────────────────────────────────────┤
│ [2/2] test_constraint_content_safety_filter │
│ Constraint: content_safety │
│ Confidence: 91% │
│ │
│ def test_constraint_content_safety_filter(agent): │
│ """Verify inappropriate content is filtered.""" │
│ result = agent.run({"topic": "general topic"}) │
│ for video in result.videos: │
│ assert video.safe_for_work is True │
│ assert video.age_restricted is False │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
└─────────────────────────────────────────────────────────────────┘
```
## Step 3: Approve Constraint Tests
Review and approve each test:
```python
result = approve_tests(
goal_id="youtube-research",
approvals='''[
    {"test_id": "test_constraint_api_001", "action": "approve"},
    {"test_id": "test_constraint_content_001", "action": "approve"}
]'''
)
```
## Step 4: Generate Success Criteria Tests
After the agent is built, generate success criteria tests:
```python
result = generate_success_tests(
goal_id="youtube-research",
goal_json='<goal JSON>',
node_names="search_node,filter_node,rank_node,format_node",
tool_names="youtube_search,video_details,channel_info"
)
```
**Generated tests (awaiting approval):**
```
┌─────────────────────────────────────────────────────────────────┐
│ Generated Success Criteria Tests (4 tests) │
├─────────────────────────────────────────────────────────────────┤
│ [1/4] test_find_videos_happy_path │
│ Criteria: find_videos │
│ Confidence: 95% │
│ │
│ def test_find_videos_happy_path(agent): │
│ """Test finding videos for a common topic.""" │
│ result = agent.run({"topic": "machine learning"}) │
│ assert result.success │
│ assert 3 <= len(result.videos) <= 5 │
│ assert all(v.title for v in result.videos) │
│ assert all(v.video_id for v in result.videos) │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
├─────────────────────────────────────────────────────────────────┤
│ [2/4] test_find_videos_minimum_boundary │
│ Criteria: find_videos │
│ Confidence: 87% │
│ │
│ def test_find_videos_minimum_boundary(agent): │
│ """Test at minimum threshold (3 videos).""" │
│ result = agent.run({"topic": "niche topic xyz"}) │
│ assert len(result.videos) >= 3 │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
├─────────────────────────────────────────────────────────────────┤
│ [3/4] test_relevance_score_threshold │
│ Criteria: relevance │
│ Confidence: 92% │
│ │
│ def test_relevance_score_threshold(agent): │
│ """Test relevance scoring meets threshold.""" │
│ result = agent.run({"topic": "python programming"}) │
│ for video in result.videos: │
│ assert video.relevance_score > 0.8 │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
├─────────────────────────────────────────────────────────────────┤
│ [4/4] test_find_videos_no_results_graceful │
│ Criteria: find_videos │
│ Confidence: 84% │
│ │
│ def test_find_videos_no_results_graceful(agent): │
│ """Test graceful handling of no results.""" │
│ result = agent.run({"topic": "xyznonexistent123"}) │
│ # Should not crash, return empty or message │
│ assert result.videos == [] or result.message │
│ │
│ [a]pprove [r]eject [e]dit [s]kip │
└─────────────────────────────────────────────────────────────────┘
```
## Step 5: Approve Success Criteria Tests
```python
result = approve_tests(
goal_id="youtube-research",
approvals='''[
    {"test_id": "test_success_001", "action": "approve"},
    {"test_id": "test_success_002", "action": "approve"},
    {"test_id": "test_success_003", "action": "approve"},
    {"test_id": "test_success_004", "action": "approve"}
]'''
)
```
## Step 6: Run All Tests
Execute all approved tests:
```python
result = run_tests(
goal_id="youtube-research",
agent_path="exports/youtube-research",
test_types='["all"]',
parallel=4
)
```
**Results:**
```json
{
"goal_id": "youtube-research",
"overall_passed": false,
"summary": {
"total": 6,
"passed": 5,
"failed": 1,
"pass_rate": "83.3%"
},
"duration_ms": 4521,
"results": [
{"test_id": "test_constraint_api_001", "passed": true, "duration_ms": 1234},
{"test_id": "test_constraint_content_001", "passed": true, "duration_ms": 456},
{"test_id": "test_success_001", "passed": true, "duration_ms": 789},
{"test_id": "test_success_002", "passed": true, "duration_ms": 654},
{"test_id": "test_success_003", "passed": true, "duration_ms": 543},
{"test_id": "test_success_004", "passed": false, "duration_ms": 845,
"error_category": "IMPLEMENTATION_ERROR",
"error_message": "TypeError: 'NoneType' object has no attribute 'videos'"}
]
}
```
## Step 7: Debug the Failed Test
```python
result = debug_test(
goal_id="youtube-research",
test_id="test_success_004"
)
```
**Debug Output:**
```json
{
"test_id": "test_success_004",
"test_name": "test_find_videos_no_results_graceful",
"input": {"topic": "xyznonexistent123"},
"expected": "Empty list or message",
"actual": {"error": "TypeError: 'NoneType' object has no attribute 'videos'"},
"passed": false,
"error_message": "TypeError: 'NoneType' object has no attribute 'videos'",
"error_category": "IMPLEMENTATION_ERROR",
"stack_trace": "Traceback (most recent call last):\n File \"filter_node.py\", line 42\n for video in result.videos:\nTypeError: 'NoneType' object has no attribute 'videos'",
"logs": [
{"timestamp": "2026-01-20T10:00:01", "node": "search_node", "level": "INFO", "msg": "Searching for: xyznonexistent123"},
{"timestamp": "2026-01-20T10:00:02", "node": "search_node", "level": "WARNING", "msg": "No results found"},
{"timestamp": "2026-01-20T10:00:02", "node": "filter_node", "level": "ERROR", "msg": "NoneType error"}
],
"runtime_data": {
"execution_path": ["start", "search_node", "filter_node"],
"node_outputs": {
"search_node": null
}
},
"suggested_fix": "Add null check in filter_node before accessing .videos attribute",
"iteration_guidance": {
"stage": "Agent",
"action": "Fix the code in nodes/edges",
"restart_required": false,
"description": "The goal is correct, but filter_node doesn't handle null results from search_node."
}
}
```
## Step 8: Iterate Based on Category
Since this is an **IMPLEMENTATION_ERROR**, we:
1. **Don't restart** the Goal → Agent → Eval flow
2. **Fix the agent** using building-agents skill:
- Modify `filter_node` to handle null results
3. **Re-run Eval** (tests only)
### Fix in building-agents:
```python
# Update the filter_node to handle null
add_node(
node_id="filter_node",
name="Filter Node",
description="Filter and rank videos",
node_type="function",
input_keys=["search_results"],
output_keys=["filtered_videos"],
system_prompt="""
Filter videos by relevance.
IMPORTANT: Handle case where search_results is None or empty.
Return empty list if no results.
"""
)
```
### Re-export and re-test:
```python
# Re-export the fixed agent
export_graph(path="exports/youtube-research")
# Re-run tests
result = run_tests(
goal_id="youtube-research",
agent_path="exports/youtube-research",
test_types='["all"]'
)
```
**Updated Results:**
```json
{
"goal_id": "youtube-research",
"overall_passed": true,
"summary": {
"total": 6,
"passed": 6,
"failed": 0,
"pass_rate": "100.0%"
}
}
```
## Summary
1. **Generated** constraint tests during Goal stage
2. **Generated** success criteria tests during Eval stage
3. **Approved** all tests with user review
4. **Ran** tests in parallel
5. **Debugged** the one failure
6. **Categorized** as IMPLEMENTATION_ERROR
7. **Fixed** the agent (not the goal)
8. **Re-ran** Eval only (didn't restart full flow)
9. **Passed** all tests
The agent is now validated and ready for production use.
+57 -203
View File
@@ -64,7 +64,7 @@ To use the agent builder with Claude Desktop or other MCP clients, add this to y
"agent-builder": {
"command": "python",
"args": ["-m", "framework.mcp.agent_builder_server"],
"cwd": "/path/to/hive/core"
"cwd": "/path/to/goal-agent"
}
}
}
@@ -75,144 +75,48 @@ The MCP server provides tools for:
- Defining goals with success criteria
- Adding nodes (llm_generate, llm_tool_use, router, function)
- Connecting nodes with edges
- **Registering MCP servers as tool sources** ✨
- **Discovering tools from MCP servers** ✨
- Validating and exporting agent graphs
- Testing nodes and full agent graphs
When you register an MCP server during agent building, the tools from that server become available to your agent, and an `mcp_servers.json` configuration file is automatically created on export.
See [MCP_SERVER_GUIDE.md](MCP_SERVER_GUIDE.md) for agent builder instructions and [MCP_BUILDER_TOOLS_GUIDE.md](MCP_BUILDER_TOOLS_GUIDE.md) for MCP integration tools.
## MCP Tool Integration
The framework also supports **connecting to MCP servers as tool providers**, allowing your agents to use tools from external MCP servers (like aden-tools). This enables you to extend your agents with powerful external capabilities.
### Quick Example
```python
from framework.runner.runner import AgentRunner
# Load an agent
runner = AgentRunner.load("exports/task-planner")
# Register an MCP server with tools
runner.register_mcp_server(
name="aden-tools",
transport="stdio",
command="python",
args=["mcp_server.py", "--stdio"],
cwd="../aden-tools"
)
# Tools from the MCP server are now available to your agent
result = await runner.run({"query": "Search for AI news"})
```
### Auto-loading MCP Servers
Create `mcp_servers.json` in your agent folder:
```json
{
"servers": [
{
"name": "aden-tools",
"transport": "stdio",
"command": "python",
"args": ["mcp_server.py", "--stdio"],
"cwd": "../aden-tools"
}
]
}
```
MCP servers will be automatically loaded when you load the agent.
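For instance, a minimal sketch mirroring the Quick Example above (the top-level `await` is shorthand, as in that example):
```python
from framework.runner.runner import AgentRunner

# With mcp_servers.json alongside the agent, load() picks up the servers;
# no explicit register_mcp_server call is needed.
runner = AgentRunner.load("exports/task-planner")
result = await runner.run({"query": "Search for AI news"})
```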
### Available Tools from aden-tools
When you register the aden-tools MCP server, these tools become available:
- `web_search` - Search the web using Brave Search API
- `web_scrape` - Extract content from web pages
- `file_read` - Read file contents
- `file_write` - Write content to files
- `pdf_read` - Extract text from PDF files
See [MCP_INTEGRATION_GUIDE.md](MCP_INTEGRATION_GUIDE.md) for detailed instructions on MCP tool integration.
## Quick Start
### Running Agents
### Calculator Agent
The framework comes with pre-built example agents in the `exports/` directory:
Run an LLM-powered calculator:
```bash
# List available agents
python -m framework list exports/
# Single calculation
python -m framework calculate "2 + 3 * 4"
# Show agent information
python -m framework info exports/task-planner
# Interactive mode
python -m framework interactive
# Run an agent
python -m framework run exports/task-planner --input '{"objective": "Build a web scraper"}'
# Interactive shell mode (with human-in-the-loop approval)
python -m framework shell exports/task-planner
# Analyze runs with Builder
python -m framework analyze calculator
```
### Available Commands
- `run` - Execute an exported agent with given input
- `info` - Display agent details (goal, nodes, edges, success criteria)
- `validate` - Check that an agent is valid and runnable
- `list` - List all exported agents in a directory
- `dispatch` - Route requests to multiple agents using the orchestrator
- `shell` - Start an interactive session with an agent
### Building Agents Programmatically
You can build agents using the MCP server (recommended) or programmatically:
### Using the Runtime
```python
from framework import Runtime
# Initialize runtime with storage path
runtime = Runtime("./storage")
runtime = Runtime("/path/to/storage")
# Start a run for a goal
run_id = runtime.start_run(
goal_id="data-processor",
goal_description="Process data with quality checks",
input_data={"dataset": "customers.csv"}
)
# Set the current node context
runtime.set_node("processor-node")
# Start a run
run_id = runtime.start_run("my_goal", "Description of what we're doing")
# Record a decision
decision_id = runtime.decide(
intent="Choose how to process the data",
options=[
{
"id": "fast",
"description": "Quick processing",
"action_type": "tool_call",
"pros": ["Fast"],
"cons": ["Less accurate"]
},
{
"id": "thorough",
"description": "Detailed processing",
"action_type": "tool_call",
"pros": ["Accurate"],
"cons": ["Slower"]
},
{"id": "fast", "description": "Quick processing", "pros": ["Fast"], "cons": ["Less accurate"]},
{"id": "thorough", "description": "Detailed processing", "pros": ["Accurate"], "cons": ["Slower"]},
],
chosen="thorough",
reasoning="Accuracy is more important for this task"
)
# Record the outcome of the decision
# Record the outcome
runtime.record_outcome(
decision_id=decision_id,
success=True,
@@ -221,13 +125,32 @@ runtime.record_outcome(
)
# End the run
runtime.end_run(
success=True,
narrative="Successfully processed all data",
output_data={"total_processed": 100}
)
runtime.end_run(success=True, narrative="Successfully processed all data")
```
### Testing Agents
The framework includes a goal-based testing framework for validating agent behavior.
```bash
# Generate tests from a goal definition
python -m framework test-generate goal.json
# Interactively approve generated tests
python -m framework test-approve <goal_id>
# Run tests against an agent
python -m framework test-run <agent_path> --parallel 4
# Debug failed tests
python -m framework test-debug <goal_id> <test_id>
# List tests by status
python -m framework test-list <goal_id>
```
For detailed testing workflows, see the [testing-agent skill](.claude/skills/testing-agent/SKILL.md).
### Analyzing Agent Behavior with Builder
The BuilderQuery interface allows you to analyze agent runs and identify improvements:
@@ -235,119 +158,50 @@ The BuilderQuery interface allows you to analyze agent runs and identify improve
```python
from framework import BuilderQuery
# Initialize Builder query interface
query = BuilderQuery("./storage")
query = BuilderQuery("/path/to/storage")
# Find patterns across runs for a goal
patterns = query.find_patterns("data-processor")
if patterns:
print(f"Success rate: {patterns.success_rate:.1%}")
print(f"Runs analyzed: {patterns.run_count}")
# Find patterns across runs
patterns = query.find_patterns("my_goal")
print(f"Success rate: {patterns.success_rate:.1%}")
# Show problematic nodes
for node_id, failure_rate in patterns.problematic_nodes:
print(f"Node '{node_id}' has {failure_rate:.1%} failure rate")
# Analyze a failure
analysis = query.analyze_failure("run_123")
print(f"Root cause: {analysis.root_cause}")
print(f"Suggestions: {analysis.suggestions}")
# Analyze a specific failure
analysis = query.analyze_failure("run_20260119_143022_abc123")
if analysis:
print(f"Failure point: {analysis.failure_point}")
print(f"Root cause: {analysis.root_cause}")
print(f"\nSuggestions:")
for suggestion in analysis.suggestions:
print(f" - {suggestion}")
# Get improvement recommendations for a goal
suggestions = query.suggest_improvements("data-processor")
# Get improvement recommendations
suggestions = query.suggest_improvements("my_goal")
for s in suggestions:
print(f"[{s['priority']}] {s['recommendation']}")
print(f" Reason: {s['reason']}")
# Get performance metrics for a specific node
perf = query.get_node_performance("processor-node")
print(f"Node: {perf['node_id']}")
print(f"Success rate: {perf['success_rate']:.1%}")
print(f"Avg latency: {perf['avg_latency_ms']:.0f}ms")
```
## Architecture
The framework consists of several layers:
```
┌─────────────────┐
│ Human Engineer │ ← Supervision, approval via HITL
│ Human Engineer │ ← Supervision, approval
└────────┬────────┘
┌────────▼────────┐
│ Builder LLM │ ← Analyzes runs, suggests improvements (via MCP)
│ Builder LLM │ ← Analyzes runs, suggests improvements
│ (BuilderQuery) │
└────────┬────────┘
┌────────▼────────┐
│ Agent Graph │ ← Node-based execution flow
(AgentRunner) (llm_generate, llm_tool_use, router, function)
└────────┬────────┘
┌────────▼────────┐
│ Runtime │ ← Records decisions, outcomes, problems
│ (Decision DB) │
│ Agent LLM │ ← Executes tasks, records decisions
(Runtime)
└─────────────────┘
```
## Key Concepts
### Graph-Based Agents
Agents are defined as directed graphs with:
- **Nodes**: Execution steps (llm_generate, llm_tool_use, router, function)
- **Edges**: Control flow between nodes, including conditional routing
- **Goal**: What the agent is designed to accomplish with success criteria
- **Constraints**: Hard and soft limits on agent behavior
### Decision Recording
- **Decision**: The atomic unit of agent behavior. Captures intent, options, choice, and reasoning.
- **Outcome**: Result of executing a decision (success/failure, latency, tokens, state changes)
- **Run**: A complete execution trace with all decisions and outcomes
- **Problem**: Issues reported during execution with severity and suggested fixes
### Analysis & Improvement
- **Runtime**: Interface agents use to record their behavior during execution
- **BuilderQuery**: Interface for analyzing agent runs and identifying patterns
- **PatternAnalysis**: Cross-run analysis showing success rates, common failures, problematic nodes
- **FailureAnalysis**: Deep dive into why a specific run failed with suggestions
### Human-in-the-Loop (HITL)
- **Approval Callbacks**: Nodes can require human approval before execution
- **Interactive Shell**: Chat-like interface for running agents with approval prompts
- **Session State**: Agents can pause and resume based on user input
### Multi-Agent Orchestration
- **AgentOrchestrator**: Dispatch requests to multiple agents
- **Agent Discovery**: Automatically discover and register agents from a directory
- **Dispatch Strategy**: Route requests to the most appropriate agent(s)
## Example Agents
The `exports/` directory contains example agents you can run or use as templates:
- **task-planner**: Breaks down complex objectives into actionable tasks with dependencies
- **research-summary-agent**: Conducts research and generates summaries
- **outbound-sales-agent**: Handles outbound sales workflows
- **youtube-comments-research**: Analyzes YouTube comments for insights
Each agent includes:
- `agent.json`: Graph definition with nodes, edges, goal, and constraints
- `README.md`: Agent documentation
- `tools.py` (optional): Custom tool implementations
- **Run**: A complete execution with all decisions and outcomes.
- **Runtime**: Interface agents use to record their behavior.
- **BuilderQuery**: Interface Builder uses to analyze agent behavior.
## Requirements
- Python 3.11+
- pydantic >= 2.0
- anthropic >= 0.40.0 (for LLM-powered agents)
- mcp, fastmcp (optional, for MCP server)
+37
View File
@@ -10,6 +10,16 @@ choice the agent makes is captured with:
- Whether that was good or bad (evaluated post-hoc)
This gives the Builder LLM the information it needs to improve agent behavior.
## Testing Framework
The framework includes a Goal-Based Testing system (Goal → Agent → Eval):
- Generate tests from Goal success_criteria and constraints
- Mandatory user approval before tests are stored
- Parallel test execution with error categorization
- Debug tools with fix suggestions
See `framework.testing` for details.
"""
from framework.schemas.decision import Decision, Option, Outcome, DecisionEvaluation
@@ -19,6 +29,21 @@ from framework.builder.query import BuilderQuery
from framework.llm import LLMProvider, AnthropicProvider
from framework.runner import AgentRunner, AgentOrchestrator
# Testing framework
from framework.testing import (
Test,
TestResult,
TestSuiteResult,
TestStorage,
ApprovalStatus,
ErrorCategory,
ConstraintTestGenerator,
SuccessCriteriaTestGenerator,
ParallelTestRunner,
ParallelConfig,
DebugTool,
)
__all__ = [
# Schemas
"Decision",
@@ -38,4 +63,16 @@ __all__ = [
# Runner
"AgentRunner",
"AgentOrchestrator",
# Testing
"Test",
"TestResult",
"TestSuiteResult",
"TestStorage",
"ApprovalStatus",
"ErrorCategory",
"ConstraintTestGenerator",
"SuccessCriteriaTestGenerator",
"ParallelTestRunner",
"ParallelConfig",
"DebugTool",
]
+13 -1
View File
@@ -8,6 +8,14 @@ Usage:
python -m core list exports/
python -m core dispatch exports/ --input '{"key": "value"}'
python -m core shell exports/my-agent
Testing commands:
python -m core test-generate goal.json
python -m core test-approve <goal_id>
python -m core test-run <agent_path> --goal <goal_id>
python -m core test-debug <goal_id> <test_id>
python -m core test-list <goal_id>
python -m core test-stats <goal_id>
"""
import argparse
@@ -20,7 +28,7 @@ def main():
)
parser.add_argument(
"--model",
default="claude-sonnet-4-20250514",
default="claude-haiku-4-5-20251001",
help="Anthropic model to use",
)
@@ -30,6 +38,10 @@ def main():
from framework.runner.cli import register_commands
register_commands(subparsers)
# Register testing commands (test-generate, test-approve, test-run, test-debug, etc.)
from framework.testing.cli import register_testing_commands
register_testing_commands(subparsers)
args = parser.parse_args()
if hasattr(args, "func"):
+1 -1
View File
@@ -340,7 +340,7 @@ class GraphSpec(BaseModel):
)
# Default LLM settings
default_model: str = "claude-sonnet-4-20250514"
default_model: str = "claude-haiku-4-5-20251001"
max_tokens: int = 1024
# Execution limits
+6 -6
View File
@@ -165,12 +165,7 @@ class GraphExecutor:
path.append(current_node_id)
# Check if terminal
if current_node_id in graph.terminal_nodes:
self.logger.info(f"✓ Reached terminal node: {node_spec.name}")
break
# Check if pause (HITL)
# Check if pause (HITL) before execution
if current_node_id in graph.pause_nodes:
self.logger.info(f"⏸ Paused at HITL node: {node_spec.name}")
# Execute this node, then pause
@@ -279,6 +274,11 @@ class GraphExecutor:
session_state=session_state_out,
)
# Check if this is a terminal node - if so, we're done
if node_spec.id in graph.terminal_nodes:
self.logger.info(f"✓ Reached terminal node: {node_spec.name}")
break
# Determine next node
if result.next_node:
# Router explicitly set next node
+1
View File
@@ -76,6 +76,7 @@ class Constraint(BaseModel):
description="Category: 'time', 'cost', 'safety', 'scope', 'quality'"
)
check: str = Field(
default="",
description="How to check: expression, function name, or 'llm_judge'"
)
+85 -16
View File
@@ -431,22 +431,13 @@ class LLMNode(NodeProtocol):
# Write to output keys
output = self._parse_output(response.content, ctx.node_spec)
# For llm_generate nodes, try to parse JSON and extract fields
if ctx.node_spec.node_type == "llm_generate" and len(ctx.node_spec.output_keys) > 1:
# For llm_generate and llm_tool_use nodes, try to parse JSON and extract fields
if ctx.node_spec.node_type in ("llm_generate", "llm_tool_use") and len(ctx.node_spec.output_keys) > 1:
try:
# Try to parse as JSON
import json
import re
# Remove markdown code blocks if present
content = response.content.strip()
if content.startswith("```"):
# Extract JSON from code block
match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', content, re.DOTALL)
if match:
content = match.group(1).strip()
parsed = json.loads(content)
# Try direct JSON parse first
parsed = self._extract_json_with_haiku(response.content, ctx.node_spec.output_keys)
# If parsed successfully, write each field to its corresponding output key
if isinstance(parsed, dict):
@@ -454,8 +445,12 @@ class LLMNode(NodeProtocol):
if key in parsed:
ctx.memory.write(key, parsed[key])
output[key] = parsed[key]
elif key in ctx.input_data:
# Key not in parsed JSON but exists in input - pass through input value
ctx.memory.write(key, ctx.input_data[key])
output[key] = ctx.input_data[key]
else:
# Key not in parsed JSON, write the whole response
# Key not in parsed JSON or input, write the whole response
ctx.memory.write(key, response.content)
output[key] = response.content
else:
@@ -465,8 +460,8 @@ class LLMNode(NodeProtocol):
output[key] = response.content
except (json.JSONDecodeError, Exception) as e:
# JSON parsing failed, fall back to writing entire response
logger.warning(f" ⚠ Failed to parse JSON output, using raw response: {e}")
# JSON extraction failed completely
logger.warning(f" ⚠ Failed to extract JSON output: {e}")
for key in ctx.node_spec.output_keys:
ctx.memory.write(key, response.content)
output[key] = response.content
@@ -503,6 +498,80 @@ class LLMNode(NodeProtocol):
# Default output
return {"result": content}
def _extract_json_with_haiku(self, raw_response: str, output_keys: list[str]) -> dict[str, Any]:
"""Use Haiku to extract clean JSON from potentially verbose LLM response."""
import json
import re
# Try direct JSON parse first (fast path)
try:
content = raw_response.strip()
# Remove markdown code blocks if present
if content.startswith("```"):
match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', content, re.DOTALL)
if match:
content = match.group(1).strip()
parsed = json.loads(content)
if isinstance(parsed, dict):
return parsed
except json.JSONDecodeError:
pass
# JSON parse failed - use Haiku to extract clean JSON
import os
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
# No API key, try one more simple extraction
try:
# Find first { and last }
start = raw_response.find('{')
end = raw_response.rfind('}')
if start != -1 and end != -1:
json_str = raw_response[start:end+1]
return json.loads(json_str)
except json.JSONDecodeError:
pass
raise ValueError("Cannot parse JSON and no API key for Haiku cleanup")
# Use Haiku to clean the response
from framework.llm.anthropic import AnthropicProvider
haiku = AnthropicProvider(model="claude-3-5-haiku-20241022")
prompt = f"""Extract the JSON object from this LLM response. Extract ONLY the values that the LLM actually generated.
Expected output keys: {output_keys}
LLM Response:
{raw_response}
IMPORTANT:
- Only extract keys that the LLM explicitly output in its response
- Do NOT include keys that were just mentioned or passed through from input
- If the LLM output multiple pieces of text/JSON, extract the LAST JSON object only
- Output ONLY valid JSON with no extra text, no markdown, no explanations"""
try:
result = haiku.complete(
messages=[{"role": "user", "content": prompt}],
system="You extract clean JSON from messy responses. Output only valid JSON, nothing else.",
)
cleaned = result.content.strip()
# Remove markdown if Haiku added it
if cleaned.startswith("```"):
match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', cleaned, re.DOTALL)
if match:
cleaned = match.group(1).strip()
parsed = json.loads(cleaned)
logger.info(f" ✓ Haiku cleaned JSON output")
return parsed
except Exception as e:
logger.warning(f" ⚠ Haiku JSON extraction failed: {e}")
raise
def _build_messages(self, ctx: NodeContext) -> list[dict]:
"""Build the message list for the LLM."""
# Use Haiku to intelligently format inputs from memory
+2 -2
View File
@@ -18,14 +18,14 @@ class AnthropicProvider(LLMProvider):
def __init__(
self,
api_key: str | None = None,
model: str = "claude-sonnet-4-20250514",
model: str = "claude-haiku-4-5-20251001",
):
"""
Initialize the Anthropic provider.
Args:
api_key: Anthropic API key. If not provided, uses ANTHROPIC_API_KEY env var.
model: Model to use (default: claude-sonnet-4-20250514)
model: Model to use (default: claude-haiku-4-5-20251001)
"""
self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
if not self.api_key:
+681 -3
View File
@@ -9,6 +9,7 @@ Usage:
import json
from datetime import datetime
from pathlib import Path
from typing import Annotated
from mcp.server import FastMCP
@@ -16,32 +17,163 @@ from mcp.server import FastMCP
from framework.graph import Goal, SuccessCriterion, Constraint, NodeSpec, EdgeSpec, EdgeCondition
from framework.graph.edge import GraphSpec
# Testing framework imports
from framework.testing.test_case import Test, ApprovalStatus, TestType
from framework.testing.test_storage import TestStorage
from framework.testing.constraint_gen import ConstraintTestGenerator
from framework.testing.success_gen import SuccessCriteriaTestGenerator
from framework.testing.approval_types import ApprovalRequest, ApprovalAction
from framework.testing.debug_tool import DebugTool
from framework.testing.parallel import AgentFactory
# Initialize MCP server
mcp = FastMCP("agent-builder")
# Session persistence directory
SESSIONS_DIR = Path(".agent-builder-sessions")
ACTIVE_SESSION_FILE = SESSIONS_DIR / ".active"
# Session storage
class BuildSession:
"""In-memory build session."""
"""Build session with persistence support."""
def __init__(self, name: str):
self.id = f"build_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
def __init__(self, name: str, session_id: str | None = None):
self.id = session_id or f"build_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
self.name = name
self.goal: Goal | None = None
self.nodes: list[NodeSpec] = []
self.edges: list[EdgeSpec] = []
self.mcp_servers: list[dict] = [] # MCP server configurations
self.created_at = datetime.now().isoformat()
self.last_modified = datetime.now().isoformat()
def to_dict(self) -> dict:
"""Serialize session to dictionary."""
return {
"session_id": self.id,
"name": self.name,
"goal": self.goal.model_dump() if self.goal else None,
"nodes": [n.model_dump() for n in self.nodes],
"edges": [e.model_dump() for e in self.edges],
"mcp_servers": self.mcp_servers,
"created_at": self.created_at,
"last_modified": self.last_modified,
}
@classmethod
def from_dict(cls, data: dict) -> "BuildSession":
"""Deserialize session from dictionary."""
session = cls(name=data["name"], session_id=data["session_id"])
session.created_at = data.get("created_at", session.created_at)
session.last_modified = data.get("last_modified", session.last_modified)
# Restore goal
if data.get("goal"):
goal_data = data["goal"]
session.goal = Goal(
id=goal_data["id"],
name=goal_data["name"],
description=goal_data["description"],
success_criteria=[
SuccessCriterion(**sc) for sc in goal_data.get("success_criteria", [])
],
constraints=[
Constraint(**c) for c in goal_data.get("constraints", [])
],
)
# Restore nodes
session.nodes = [NodeSpec(**n) for n in data.get("nodes", [])]
# Restore edges
edges_data = data.get("edges", [])
for e in edges_data:
# Convert condition string back to enum
condition_str = e.get("condition")
if isinstance(condition_str, str):
condition_map = {
"always": EdgeCondition.ALWAYS,
"on_success": EdgeCondition.ON_SUCCESS,
"on_failure": EdgeCondition.ON_FAILURE,
"conditional": EdgeCondition.CONDITIONAL,
}
e["condition"] = condition_map.get(condition_str, EdgeCondition.ON_SUCCESS)
session.edges.append(EdgeSpec(**e))
# Restore MCP servers
session.mcp_servers = data.get("mcp_servers", [])
return session
# Global session
_session: BuildSession | None = None
def _ensure_sessions_dir():
"""Ensure sessions directory exists."""
SESSIONS_DIR.mkdir(exist_ok=True)
def _save_session(session: BuildSession):
"""Save session to disk."""
_ensure_sessions_dir()
# Update last modified
session.last_modified = datetime.now().isoformat()
# Save session file
session_file = SESSIONS_DIR / f"{session.id}.json"
with open(session_file, "w") as f:
json.dump(session.to_dict(), f, indent=2, default=str)
# Update active session pointer
with open(ACTIVE_SESSION_FILE, "w") as f:
f.write(session.id)
def _load_session(session_id: str) -> BuildSession:
"""Load session from disk."""
session_file = SESSIONS_DIR / f"{session_id}.json"
if not session_file.exists():
raise ValueError(f"Session '{session_id}' not found")
with open(session_file, "r") as f:
data = json.load(f)
return BuildSession.from_dict(data)
def _load_active_session() -> BuildSession | None:
"""Load the active session if one exists."""
if not ACTIVE_SESSION_FILE.exists():
return None
try:
with open(ACTIVE_SESSION_FILE, "r") as f:
session_id = f.read().strip()
if session_id:
return _load_session(session_id)
except Exception:
pass
return None
def get_session() -> BuildSession:
global _session
# Try to load active session if no session in memory
if _session is None:
_session = _load_active_session()
if _session is None:
raise ValueError("No active session. Call create_session first.")
return _session
@@ -54,13 +186,122 @@ def create_session(name: Annotated[str, "Name for the agent being built"]) -> st
"""Create a new agent building session. Call this first before building an agent."""
global _session
_session = BuildSession(name)
_save_session(_session) # Auto-save
return json.dumps({
"session_id": _session.id,
"name": name,
"status": "created",
"persisted": True,
})
@mcp.tool()
def list_sessions() -> str:
"""List all saved agent building sessions."""
_ensure_sessions_dir()
sessions = []
if SESSIONS_DIR.exists():
for session_file in SESSIONS_DIR.glob("*.json"):
try:
with open(session_file, "r") as f:
data = json.load(f)
sessions.append({
"session_id": data["session_id"],
"name": data["name"],
"created_at": data.get("created_at"),
"last_modified": data.get("last_modified"),
"node_count": len(data.get("nodes", [])),
"edge_count": len(data.get("edges", [])),
"has_goal": data.get("goal") is not None,
})
except Exception:
pass # Skip corrupted files
# Check which session is currently active
active_id = None
if ACTIVE_SESSION_FILE.exists():
try:
with open(ACTIVE_SESSION_FILE, "r") as f:
active_id = f.read().strip()
except Exception:
pass
return json.dumps({
"sessions": sorted(sessions, key=lambda s: s["last_modified"], reverse=True),
"total": len(sessions),
"active_session_id": active_id,
}, indent=2)
@mcp.tool()
def load_session_by_id(session_id: Annotated[str, "ID of the session to load"]) -> str:
"""Load a previously saved agent building session by its ID."""
global _session
try:
_session = _load_session(session_id)
# Update active session pointer
with open(ACTIVE_SESSION_FILE, "w") as f:
f.write(session_id)
return json.dumps({
"success": True,
"session_id": _session.id,
"name": _session.name,
"node_count": len(_session.nodes),
"edge_count": len(_session.edges),
"has_goal": _session.goal is not None,
"created_at": _session.created_at,
"last_modified": _session.last_modified,
"message": f"Session '{_session.name}' loaded successfully"
})
except Exception as e:
return json.dumps({
"success": False,
"error": str(e)
})
@mcp.tool()
def delete_session(session_id: Annotated[str, "ID of the session to delete"]) -> str:
"""Delete a saved agent building session."""
global _session
session_file = SESSIONS_DIR / f"{session_id}.json"
if not session_file.exists():
return json.dumps({
"success": False,
"error": f"Session '{session_id}' not found"
})
try:
# Remove session file
session_file.unlink()
# Clear active session if it was the deleted one
if _session and _session.id == session_id:
_session = None
if ACTIVE_SESSION_FILE.exists():
with open(ACTIVE_SESSION_FILE, "r") as f:
active_id = f.read().strip()
if active_id == session_id:
ACTIVE_SESSION_FILE.unlink()
return json.dumps({
"success": True,
"deleted_session_id": session_id,
"message": f"Session '{session_id}' deleted successfully"
})
except Exception as e:
return json.dumps({
"success": False,
"error": str(e)
})
@mcp.tool()
def set_goal(
goal_id: Annotated[str, "Unique identifier for the goal"],
@@ -122,6 +363,8 @@ def set_goal(
if not constraint_list:
warnings.append("Consider adding constraints")
_save_session(session) # Auto-save
return json.dumps({
"valid": len(errors) == 0,
"errors": errors,
@@ -259,6 +502,8 @@ def add_node(
if node_type in ("llm_generate", "llm_tool_use") and not system_prompt:
warnings.append(f"LLM node '{node_id}' should have a system_prompt")
_save_session(session) # Auto-save
return json.dumps({
"valid": len(errors) == 0,
"errors": errors,
@@ -335,6 +580,8 @@ def add_edge(
if edge_condition == EdgeCondition.CONDITIONAL and not condition_expr:
errors.append(f"Conditional edge '{edge_id}' needs condition_expr")
_save_session(session) # Auto-save
return json.dumps({
"valid": len(errors) == 0,
"errors": errors,
@@ -425,6 +672,8 @@ def update_node(
if node.node_type in ("llm_generate", "llm_tool_use") and not node.system_prompt:
warnings.append(f"LLM node '{node_id}' should have a system_prompt")
_save_session(session) # Auto-save
return json.dumps({
"valid": len(errors) == 0,
"errors": errors,
@@ -482,6 +731,8 @@ def delete_node(
if not (e.source == node_id or e.target == node_id)
]
_save_session(session) # Auto-save
return json.dumps({
"valid": True,
"deleted_node": removed_node.model_dump(),
@@ -512,6 +763,8 @@ def delete_edge(
# Remove the edge
removed_edge = session.edges.pop(edge_idx)
_save_session(session) # Auto-save
return json.dumps({
"valid": True,
"deleted_edge": removed_edge.model_dump(),
@@ -944,6 +1197,46 @@ def export_graph() -> str:
entry_node = validation["entry_node"]
terminal_nodes = validation["terminal_nodes"]
# Extract pause/resume configuration from validation
pause_nodes = validation.get("pause_nodes", [])
resume_entry_points = validation.get("resume_entry_points", [])
# Build entry_points dict for pause/resume architecture
entry_points = {}
if entry_node:
entry_points["start"] = entry_node
# Add resume entry points with {pause_node}_resume naming convention
if pause_nodes and resume_entry_points:
# Strategy 1: Try to match by checking which resume node uses the pause node's outputs
pause_to_resume = {}
for pause_node_id in pause_nodes:
pause_node = next((n for n in session.nodes if n.id == pause_node_id), None)
if not pause_node:
continue
# Find resume nodes that read the outputs of this pause node
for resume_node_id in resume_entry_points:
resume_node = next((n for n in session.nodes if n.id == resume_node_id), None)
if not resume_node:
continue
# Check if resume node reads pause node's outputs
shared_keys = set(pause_node.output_keys) & set(resume_node.input_keys)
if shared_keys:
pause_to_resume[pause_node_id] = resume_node_id
break
# Strategy 2: Fallback - pair sequentially if no match found
unmatched_pause = [p for p in pause_nodes if p not in pause_to_resume]
unmatched_resume = [r for r in resume_entry_points if r not in pause_to_resume.values()]
for pause_id, resume_id in zip(unmatched_pause, unmatched_resume):
pause_to_resume[pause_id] = resume_id
# Build entry_points dict
for pause_id, resume_id in pause_to_resume.items():
entry_points[f"{pause_id}_resume"] = resume_id
# Build edges list
edges_list = [
{
@@ -988,6 +1281,8 @@ def export_graph() -> str:
"goal_id": session.goal.id,
"version": "1.0.0",
"entry_node": entry_node,
"entry_points": entry_points,
"pause_nodes": pause_nodes,
"terminal_nodes": terminal_nodes,
"nodes": [node.model_dump() for node in session.nodes],
"edges": edges_list,
@@ -1222,6 +1517,7 @@ def add_mcp_server(
# Add to session
session.mcp_servers.append(server_config)
_save_session(session) # Auto-save
return json.dumps({
"success": True,
@@ -1341,6 +1637,7 @@ def remove_mcp_server(
for i, server in enumerate(session.mcp_servers):
if server["name"] == name:
session.mcp_servers.pop(i)
_save_session(session) # Auto-save
return json.dumps({
"success": True,
"removed": name,
@@ -1964,6 +2261,387 @@ def simulate_plan_execution(
}, indent=2)
# =============================================================================
# TESTING TOOLS (Goal-Based Evaluation)
# =============================================================================
# Session storage for pending tests (not yet persisted)
_pending_tests: dict[str, list[Test]] = {}
# Default storage path for tests
DEFAULT_TEST_STORAGE_PATH = Path("data/tests")
@mcp.tool()
def generate_constraint_tests(
goal_id: Annotated[str, "ID of the goal to generate tests for"],
goal_json: Annotated[str, """JSON string of the Goal object. Constraint fields:
- id: string (required)
- description: string (required)
- constraint_type: "hard" or "soft" (required)
- category: string (optional, default: "general")
- check: string (optional, how to validate: "llm_judge", expression, or function name)"""],
) -> str:
"""
Generate constraint tests for a goal.
Returns proposals for user approval. Tests are NOT persisted until approved.
"""
try:
goal = Goal.model_validate_json(goal_json)
except Exception as e:
return json.dumps({"error": f"Invalid goal JSON: {e}"})
# Get LLM provider
try:
from framework.llm import AnthropicProvider
llm = AnthropicProvider()
except Exception as e:
return json.dumps({"error": f"Failed to initialize LLM: {e}"})
# Generate tests
generator = ConstraintTestGenerator(llm)
tests = generator.generate(goal)
# Store as pending (not persisted yet)
_pending_tests[goal_id] = tests
return json.dumps({
"goal_id": goal_id,
"generated_count": len(tests),
"tests": [
{
"id": t.id,
"test_name": t.test_name,
"parent_criteria_id": t.parent_criteria_id,
"description": t.description,
"confidence": t.llm_confidence,
"test_code_preview": t.test_code[:500] + "..." if len(t.test_code) > 500 else t.test_code,
}
for t in tests
],
"next_step": "Call approve_tests to approve, modify, or reject each test",
})
@mcp.tool()
def generate_success_tests(
goal_id: Annotated[str, "ID of the goal to generate tests for"],
goal_json: Annotated[str, "JSON string of the Goal object"],
node_names: Annotated[str, "Comma-separated list of agent node names"] = "",
tool_names: Annotated[str, "Comma-separated list of available tool names"] = "",
) -> str:
"""
Generate success criteria tests for a goal.
Should be called during Eval stage after agent exists.
Returns proposals for user approval.
"""
try:
goal = Goal.model_validate_json(goal_json)
except Exception as e:
return json.dumps({"error": f"Invalid goal JSON: {e}"})
# Get LLM provider
try:
from framework.llm import AnthropicProvider
llm = AnthropicProvider()
except Exception as e:
return json.dumps({"error": f"Failed to initialize LLM: {e}"})
# Parse node/tool names
nodes = [n.strip() for n in node_names.split(",") if n.strip()]
tools = [t.strip() for t in tool_names.split(",") if t.strip()]
# Generate tests
generator = SuccessCriteriaTestGenerator(llm)
tests = generator.generate(goal, node_names=nodes, tool_names=tools)
# Add to pending (may have constraint tests already)
if goal_id in _pending_tests:
_pending_tests[goal_id].extend(tests)
else:
_pending_tests[goal_id] = tests
return json.dumps({
"goal_id": goal_id,
"generated_count": len(tests),
"tests": [
{
"id": t.id,
"test_name": t.test_name,
"parent_criteria_id": t.parent_criteria_id,
"description": t.description,
"confidence": t.llm_confidence,
"test_code_preview": t.test_code[:500] + "..." if len(t.test_code) > 500 else t.test_code,
}
for t in tests
],
"next_step": "Call approve_tests to approve, modify, or reject each test",
})
@mcp.tool()
def approve_tests(
goal_id: Annotated[str, "ID of the goal"],
approvals: Annotated[str, "JSON array of approval decisions"],
) -> str:
"""
Approve, reject, or modify generated tests.
Approvals format:
[
{"test_id": "...", "action": "approve"},
{"test_id": "...", "action": "modify", "modified_code": "..."},
{"test_id": "...", "action": "reject", "reason": "..."},
{"test_id": "...", "action": "skip"}
]
Actions: approve, modify (requires modified_code), reject (requires reason), skip
"""
if goal_id not in _pending_tests:
return json.dumps({"error": f"No pending tests for goal {goal_id}"})
try:
approvals_list = json.loads(approvals)
except json.JSONDecodeError as e:
return json.dumps({"error": f"Invalid approvals JSON: {e}"})
# Create storage
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
# Build approval requests
requests = []
for a in approvals_list:
try:
action = ApprovalAction(a.get("action", "skip"))
requests.append(ApprovalRequest(
test_id=a["test_id"],
action=action,
modified_code=a.get("modified_code"),
reason=a.get("reason"),
approved_by="mcp_user",
))
except (KeyError, ValueError) as e:
return json.dumps({"error": f"Invalid approval entry: {e}"})
# Find and save approved tests
pending = {t.id: t for t in _pending_tests[goal_id]}
results = []
for req in requests:
test = pending.get(req.test_id)
if not test:
results.append({"test_id": req.test_id, "error": "Not found in pending"})
continue
if req.action == ApprovalAction.APPROVE:
test.approve(req.approved_by)
storage.save_test(test)
results.append({"test_id": req.test_id, "status": "approved"})
elif req.action == ApprovalAction.MODIFY:
if req.modified_code:
test.modify(req.modified_code, req.approved_by)
storage.save_test(test)
results.append({"test_id": req.test_id, "status": "modified"})
else:
results.append({"test_id": req.test_id, "error": "modified_code required"})
elif req.action == ApprovalAction.REJECT:
test.reject(req.reason or "No reason provided")
storage.save_test(test)
results.append({"test_id": req.test_id, "status": "rejected"})
elif req.action == ApprovalAction.SKIP:
results.append({"test_id": req.test_id, "status": "skipped"})
# Clear pending for processed tests
processed_ids = {r["test_id"] for r in results if "error" not in r}
_pending_tests[goal_id] = [t for t in _pending_tests[goal_id] if t.id not in processed_ids]
# Clean up if empty
if not _pending_tests[goal_id]:
del _pending_tests[goal_id]
return json.dumps({"goal_id": goal_id, "results": results})
@mcp.tool()
def run_tests(
goal_id: Annotated[str, "ID of the goal to test"],
agent_path: Annotated[str, "Path to the agent export folder"],
test_types: Annotated[str, 'JSON array of test types: ["constraint", "outcome", "edge_case", "all"]'] = '["all"]',
parallel: Annotated[int, "Number of parallel workers (0 for sequential)"] = 0,
fail_fast: Annotated[bool, "Stop on first failure"] = False,
) -> str:
"""
Run evaluation tests for a goal.
Returns pass/fail summary with detailed results for each test.
"""
from framework.testing.parallel import ParallelTestRunner, ParallelConfig
# Parse test types
try:
types_list = json.loads(test_types)
except json.JSONDecodeError:
types_list = ["all"]
# Load storage
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
# Get approved tests
tests = storage.get_approved_tests(goal_id)
# Filter by type if not "all"
if "all" not in types_list:
type_map = {
"constraint": TestType.CONSTRAINT,
"outcome": TestType.SUCCESS_CRITERIA,
"edge_case": TestType.EDGE_CASE,
}
filter_types = {type_map.get(t) for t in types_list if t in type_map}
tests = [t for t in tests if t.test_type in filter_types]
if not tests:
return json.dumps({
"goal_id": goal_id,
"error": "No approved tests found",
"hint": "Generate and approve tests first using generate_constraint_tests and approve_tests",
})
# Configure runner
config = ParallelConfig(
num_workers=parallel if parallel > 0 else 1,
fail_fast=fail_fast,
)
# Run tests - use AgentFactory for picklable parallel execution
runner = ParallelTestRunner(config, storage)
result = runner.run_all(
goal_id=goal_id,
agent_factory=AgentFactory(agent_path),
tests=tests,
)
return json.dumps({
"goal_id": goal_id,
"overall_passed": result.all_passed,
"summary": {
"total": result.total,
"passed": result.passed,
"failed": result.failed,
"pass_rate": f"{result.pass_rate:.1%}",
},
"duration_ms": result.duration_ms,
"results": [r.summary_dict() for r in result.results],
})
@mcp.tool()
def debug_test(
goal_id: Annotated[str, "ID of the goal"],
test_id: Annotated[str, "ID of the failed test"],
run_id: Annotated[str, "Optional Runtime run ID for detailed logs"] = "",
) -> str:
"""
Get detailed debug info for a failed test.
Includes error categorization, logs, and fix suggestions.
"""
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
# Optionally load runtime storage
runtime_storage = None
try:
from framework.storage.backend import FileStorage
runtime_storage = FileStorage(f"data/runtime/{goal_id}")
except Exception:
pass
debug_tool = DebugTool(storage, runtime_storage)
info = debug_tool.analyze(goal_id, test_id, run_id or None)
return json.dumps(info.to_dict(), indent=2, default=str)
@mcp.tool()
def list_tests(
goal_id: Annotated[str, "ID of the goal"],
status: Annotated[str, "Filter by approval status: pending, approved, modified, rejected, all"] = "all",
) -> str:
"""
List tests for a goal.
Returns test metadata without full code (use debug_test for details).
"""
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
tests = storage.get_tests_by_goal(goal_id)
# Filter by status
if status != "all":
try:
filter_status = ApprovalStatus(status)
tests = [t for t in tests if t.approval_status == filter_status]
except ValueError:
pass
return json.dumps({
"goal_id": goal_id,
"total": len(tests),
"tests": [
{
"id": t.id,
"test_name": t.test_name,
"test_type": t.test_type.value,
"parent_criteria_id": t.parent_criteria_id,
"approval_status": t.approval_status.value,
"last_result": t.last_result,
"confidence": t.llm_confidence,
}
for t in tests
],
})
@mcp.tool()
def get_pending_tests(
goal_id: Annotated[str, "ID of the goal"],
) -> str:
"""
Get pending tests awaiting approval.
Returns tests that have been generated but not yet approved.
"""
if goal_id not in _pending_tests:
return json.dumps({
"goal_id": goal_id,
"pending_count": 0,
"tests": [],
})
tests = _pending_tests[goal_id]
return json.dumps({
"goal_id": goal_id,
"pending_count": len(tests),
"tests": [
{
"id": t.id,
"test_name": t.test_name,
"test_type": t.test_type.value,
"parent_criteria_id": t.parent_criteria_id,
"description": t.description,
"confidence": t.llm_confidence,
"test_code": t.test_code,
"input": t.input,
"expected_output": t.expected_output,
}
for t in tests
],
})
# =============================================================================
# PLAN LOADING AND EXECUTION
# =============================================================================
+90 -32
View File
@@ -6,8 +6,6 @@ import json
import sys
from pathlib import Path
from framework.graph import ExecutionStatus
def register_commands(subparsers: argparse._SubParsersAction) -> None:
"""Register runner commands with the main CLI."""
@@ -48,6 +46,11 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
action="store_true",
help="Only output the final result JSON",
)
run_parser.add_argument(
"--verbose", "-v",
action="store_true",
help="Show detailed execution logs (steps, LLM calls, etc.)",
)
run_parser.set_defaults(func=cmd_run)
# info command
@@ -166,8 +169,17 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
def cmd_run(args: argparse.Namespace) -> int:
"""Run an exported agent."""
import logging
from framework.runner import AgentRunner
# Set logging level (quiet by default for cleaner output)
if args.quiet:
logging.basicConfig(level=logging.ERROR, format='%(message)s')
elif getattr(args, 'verbose', False):
logging.basicConfig(level=logging.INFO, format='%(message)s')
else:
logging.basicConfig(level=logging.WARNING, format='%(message)s')
# Load input context
context = {}
if args.input:
@@ -189,12 +201,18 @@ def cmd_run(args: argparse.Namespace) -> int:
runner = AgentRunner.load(
args.agent_path,
mock_mode=args.mock,
model=getattr(args, "model", "claude-sonnet-4-20250514"),
model=getattr(args, "model", "claude-haiku-4-5-20251001"),
)
except FileNotFoundError as e:
print(f"Error: {e}", file=sys.stderr)
return 1
# Auto-inject user_id if the agent expects it but it's not provided
entry_input_keys = runner.graph.nodes[0].input_keys if runner.graph.nodes else []
if "user_id" in entry_input_keys and context.get("user_id") is None:
import os
context["user_id"] = os.environ.get("USER", "default_user")
if not args.quiet:
info = runner.info()
print(f"Agent: {info.name}")
@@ -212,12 +230,14 @@ def cmd_run(args: argparse.Namespace) -> int:
# Format output
output = {
"status": result.status.value if hasattr(result.status, "value") else str(result.status),
"completed_steps": result.completed_steps,
"results": result.results,
"success": result.success,
"steps_executed": result.steps_executed,
"output": result.output,
}
if result.feedback:
output["feedback"] = result.feedback
if result.error:
output["error"] = result.error
if result.paused_at:
output["paused_at"] = result.paused_at
# Output results
if args.output:
@@ -231,27 +251,51 @@ def cmd_run(args: argparse.Namespace) -> int:
else:
print()
print("=" * 60)
status_str = result.status.value if hasattr(result.status, "value") else str(result.status)
status_str = "SUCCESS" if result.success else "FAILED"
print(f"Status: {status_str}")
print(f"Completed steps: {len(result.completed_steps)}")
print(f"Steps executed: {result.steps_executed}")
print(f"Path: {''.join(result.path)}")
print("=" * 60)
if result.status == ExecutionStatus.COMPLETED:
if result.success:
print("\n--- Results ---")
for key, value in result.results.items():
if isinstance(value, (dict, list)):
print(f"\n{key}:")
value_str = json.dumps(value, indent=2, default=str)
if len(value_str) > 500:
value_str = value_str[:500] + "..."
print(value_str)
else:
print(f"{key}: {str(value)[:200]}")
elif result.feedback:
print(f"\nFeedback: {result.feedback}")
# Show only meaningful output keys (skip internal/intermediate values)
meaningful_keys = ["final_response", "response", "result", "answer", "output"]
# Try to find the most relevant output
shown = False
for key in meaningful_keys:
if key in result.output:
value = result.output[key]
if isinstance(value, str) and len(value) > 10:
print(value)
shown = True
break
elif isinstance(value, (dict, list)):
print(json.dumps(value, indent=2, default=str))
shown = True
break
# If no meaningful key found, show all non-internal keys
if not shown:
for key, value in result.output.items():
if not key.startswith("_") and key not in ["user_id", "request", "memory_loaded", "user_profile", "recent_context"]:
if isinstance(value, (dict, list)):
print(f"\n{key}:")
value_str = json.dumps(value, indent=2, default=str)
if len(value_str) > 300:
value_str = value_str[:300] + "..."
print(value_str)
else:
val_str = str(value)
if len(val_str) > 200:
val_str = val_str[:200] + "..."
print(f"{key}: {val_str}")
elif result.error:
print(f"\nError: {result.error}")
runner.cleanup()
return 0 if result.status == ExecutionStatus.COMPLETED else 1
return 0 if result.success else 1
def cmd_info(args: argparse.Namespace) -> int:
@@ -760,6 +804,11 @@ def cmd_shell(args: argparse.Namespace) -> int:
# STARTING FRESH: Merge new input with accumulated session memory
run_context = {**session_memory, **context}
# Auto-inject user_id if missing (for personal assistant agents)
if "user_id" in entry_input_keys and run_context.get("user_id") is None:
import os
run_context["user_id"] = os.environ.get("USER", "default_user")
# Add conversation history to context if agent expects it
if conversation_history:
run_context["_conversation_history"] = conversation_history.copy()
@@ -778,16 +827,25 @@ def cmd_shell(args: argparse.Namespace) -> int:
print(f"Steps executed: {result.steps_executed}")
print(f"Path: {''.join(result.path)}")
# Show clean output - prioritize meaningful keys
if result.output:
print("\nOutput:")
for key, value in result.output.items():
if isinstance(value, (dict, list)):
value_str = json.dumps(value, indent=2, default=str)
if len(value_str) > 300:
value_str = value_str[:300] + "..."
print(f" {key}: {value_str}")
else:
print(f" {key}: {str(value)[:200]}")
meaningful_keys = ["final_response", "response", "result", "answer", "output"]
shown = False
for key in meaningful_keys:
if key in result.output:
value = result.output[key]
if isinstance(value, str) and len(value) > 10:
print(f"\n{value}\n")
shown = True
break
if not shown:
print("\nOutput:")
for key, value in result.output.items():
if not key.startswith("_"):
val_str = str(value)[:200]
print(f" {key}: {val_str}")
if result.error:
print(f"\nError: {result.error}")
+118 -45
View File
@@ -65,10 +65,15 @@ class MCPClient:
self._session = None
self._read_stream = None
self._write_stream = None
self._stdio_context = None # Context manager for stdio_client
self._http_client: httpx.Client | None = None
self._tools: dict[str, MCPTool] = {}
self._connected = False
# Background event loop for persistent STDIO connection
self._loop = None
self._loop_thread = None
def _run_async(self, coro):
"""
Run an async coroutine, handling both sync and async contexts.
@@ -79,6 +84,13 @@ class MCPClient:
Returns:
Result of the coroutine
"""
# If we have a persistent loop (for STDIO), use it
if self._loop is not None:
import concurrent.futures
future = asyncio.run_coroutine_threadsafe(coro, self._loop)
return future.result()
# Otherwise, use the standard approach
try:
# Try to get the current event loop
asyncio.get_running_loop()
@@ -129,12 +141,12 @@ class MCPClient:
self._connected = True
def _connect_stdio(self) -> None:
"""Connect to MCP server via STDIO transport using MCP SDK."""
"""Connect to MCP server via STDIO transport using MCP SDK with persistent connection."""
if not self.config.command:
raise ValueError("command is required for STDIO transport")
try:
# Import MCP SDK
import threading
from mcp import StdioServerParameters
# Create server parameters
@@ -145,10 +157,62 @@ class MCPClient:
cwd=self.config.cwd,
)
# Store for later use in async context
# Store for later use
self._server_params = server_params
logger.info(f"Connected to MCP server '{self.config.name}' via STDIO")
# Start background event loop for persistent connection
loop_started = threading.Event()
connection_ready = threading.Event()
connection_error = []
def run_event_loop():
"""Run event loop in background thread."""
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
loop_started.set()
# Initialize persistent connection
async def init_connection():
try:
from mcp import ClientSession
from mcp.client.stdio import stdio_client
# Create persistent stdio client context
self._stdio_context = stdio_client(server_params)
self._read_stream, self._write_stream = await self._stdio_context.__aenter__()
# Create persistent session
self._session = ClientSession(self._read_stream, self._write_stream)
await self._session.__aenter__()
# Initialize session
await self._session.initialize()
connection_ready.set()
except Exception as e:
connection_error.append(e)
connection_ready.set()
# Schedule connection initialization
self._loop.create_task(init_connection())
# Run loop forever
self._loop.run_forever()
self._loop_thread = threading.Thread(target=run_event_loop, daemon=True)
self._loop_thread.start()
# Wait for loop to start
loop_started.wait(timeout=5)
if not loop_started.is_set():
raise RuntimeError("Event loop failed to start")
# Wait for connection to be ready
connection_ready.wait(timeout=10)
if connection_error:
raise connection_error[0]
logger.info(f"Connected to MCP server '{self.config.name}' via STDIO (persistent)")
except Exception as e:
raise RuntimeError(f"Failed to connect to MCP server: {e}")
@@ -196,28 +260,23 @@ class MCPClient:
raise
async def _list_tools_stdio_async(self) -> list[dict]:
"""List tools via STDIO protocol using MCP SDK."""
from mcp import ClientSession
from mcp.client.stdio import stdio_client
"""List tools via STDIO protocol using persistent session."""
if not self._session:
raise RuntimeError("STDIO session not initialized")
async with stdio_client(self._server_params) as (read, write):
async with ClientSession(read, write) as session:
# Initialize the session
await session.initialize()
# List tools using persistent session
response = await self._session.list_tools()
# List tools
response = await session.list_tools()
# Convert tools to dict format
tools_list = []
for tool in response.tools:
tools_list.append({
"name": tool.name,
"description": tool.description,
"inputSchema": tool.inputSchema,
})
# Convert tools to dict format
tools_list = []
for tool in response.tools:
tools_list.append({
"name": tool.name,
"description": tool.description,
"inputSchema": tool.inputSchema,
})
return tools_list
return tools_list
def _list_tools_http(self) -> list[dict]:
"""List tools via HTTP protocol."""
@@ -280,31 +339,26 @@ class MCPClient:
return self._call_tool_http(tool_name, arguments)
async def _call_tool_stdio_async(self, tool_name: str, arguments: dict[str, Any]) -> Any:
"""Call tool via STDIO protocol using MCP SDK."""
from mcp import ClientSession
from mcp.client.stdio import stdio_client
"""Call tool via STDIO protocol using persistent session."""
if not self._session:
raise RuntimeError("STDIO session not initialized")
async with stdio_client(self._server_params) as (read, write):
async with ClientSession(read, write) as session:
# Initialize the session
await session.initialize()
# Call tool using persistent session
result = await self._session.call_tool(tool_name, arguments=arguments)
# Call tool
result = await session.call_tool(tool_name, arguments=arguments)
# Extract content
if result.content:
# MCP returns content as a list of content items
if len(result.content) > 0:
content_item = result.content[0]
# Check if it's a text content item
if hasattr(content_item, 'text'):
return content_item.text
elif hasattr(content_item, 'data'):
return content_item.data
return result.content
# Extract content
if result.content:
# MCP returns content as a list of content items
if len(result.content) > 0:
content_item = result.content[0]
# Check if it's a text content item
if hasattr(content_item, 'text'):
return content_item.text
elif hasattr(content_item, 'data'):
return content_item.data
return result.content
return None
return None
def _call_tool_http(self, tool_name: str, arguments: dict[str, Any]) -> Any:
"""Call tool via HTTP protocol."""
@@ -336,6 +390,25 @@ class MCPClient:
def disconnect(self) -> None:
"""Disconnect from the MCP server."""
# Clean up persistent STDIO connection
if self._loop is not None:
# Stop event loop - this will cause context managers to clean up naturally
if self._loop and self._loop.is_running():
self._loop.call_soon_threadsafe(self._loop.stop)
# Wait for thread to finish
if self._loop_thread and self._loop_thread.is_alive():
self._loop_thread.join(timeout=2)
# Clear references
self._session = None
self._stdio_context = None
self._read_stream = None
self._write_stream = None
self._loop = None
self._loop_thread = None
# Clean up HTTP client
if self._http_client:
self._http_client.close()
self._http_client = None
+1 -1
View File
@@ -57,7 +57,7 @@ class AgentOrchestrator:
def __init__(
self,
llm: LLMProvider | None = None,
model: str = "claude-sonnet-4-20250514",
model: str = "claude-haiku-4-5-20251001",
):
"""
Initialize the orchestrator.
+20 -4
View File
@@ -172,7 +172,7 @@ class AgentRunner:
goal: Goal,
mock_mode: bool = False,
storage_path: Path | None = None,
model: str = "claude-sonnet-4-20250514",
model: str = "claude-haiku-4-5-20251001",
):
"""
Initialize the runner (use AgentRunner.load() instead).
@@ -196,8 +196,12 @@ class AgentRunner:
self._storage_path = storage_path
self._temp_dir = None
else:
self._temp_dir = tempfile.TemporaryDirectory()
self._storage_path = Path(self._temp_dir.name) / "runtime"
# Use persistent storage in ~/.hive by default
home = Path.home()
default_storage = home / ".hive" / "storage" / agent_path.name
default_storage.mkdir(parents=True, exist_ok=True)
self._storage_path = default_storage
self._temp_dir = None
# Initialize components
self._tool_registry = ToolRegistry()
@@ -222,7 +226,7 @@ class AgentRunner:
agent_path: str | Path,
mock_mode: bool = False,
storage_path: Path | None = None,
model: str = "claude-sonnet-4-20250514",
model: str = "claude-haiku-4-5-20251001",
) -> "AgentRunner":
"""
Load an agent from an export folder.
@@ -367,6 +371,18 @@ class AgentRunner:
# Create runtime
self._runtime = Runtime(storage_path=self._storage_path)
# Set up session context for tools (workspace_id, agent_id, session_id)
workspace_id = "default" # Could be derived from storage path
agent_id = self.graph.id or "unknown"
# Use "current" as a stable session_id for persistent memory
session_id = "current"
self._tool_registry.set_session_context(
workspace_id=workspace_id,
agent_id=agent_id,
session_id=session_id,
)
# Create LLM provider (if not mock mode and API key available)
if not self.mock_mode and os.environ.get("ANTHROPIC_API_KEY"):
from framework.llm.anthropic import AnthropicProvider
+15 -3
View File
@@ -35,6 +35,7 @@ class ToolRegistry:
def __init__(self):
self._tools: dict[str, RegisteredTool] = {}
self._mcp_clients: list[Any] = [] # List of MCPClient instances
self._session_context: dict[str, Any] = {} # Auto-injected context for tools
def register(
self,
@@ -227,6 +228,15 @@ class ToolRegistry:
"""Check if a tool is registered."""
return name in self._tools
def set_session_context(self, **context) -> None:
"""
Set session context to auto-inject into tool calls.
Args:
**context: Key-value pairs to inject (e.g., workspace_id, agent_id, session_id)
"""
self._session_context.update(context)
def register_mcp_server(
self,
server_config: dict[str, Any],
@@ -279,10 +289,12 @@ class ToolRegistry:
tool = self._convert_mcp_tool_to_framework_tool(mcp_tool)
# Create executor that calls the MCP server
def make_mcp_executor(client_ref: MCPClient, tool_name: str):
def make_mcp_executor(client_ref: MCPClient, tool_name: str, registry_ref):
def executor(inputs: dict) -> Any:
try:
result = client_ref.call_tool(tool_name, inputs)
# Inject session context for tools that need it
merged_inputs = {**registry_ref._session_context, **inputs}
result = client_ref.call_tool(tool_name, merged_inputs)
# MCP tools return content array, extract the result
if isinstance(result, list) and len(result) > 0:
if isinstance(result[0], dict) and "text" in result[0]:
@@ -298,7 +310,7 @@ class ToolRegistry:
self.register(
mcp_tool.name,
tool,
make_mcp_executor(client, mcp_tool.name),
make_mcp_executor(client, mcp_tool.name, self),
)
count += 1
+20 -6
View File
@@ -9,12 +9,15 @@ handles all the structured logging.
from datetime import datetime
from typing import Any
from pathlib import Path
import logging
import uuid
from framework.schemas.decision import Decision, Option, Outcome, DecisionType
from framework.schemas.run import Run, RunStatus
from framework.storage.backend import FileStorage
logger = logging.getLogger(__name__)
class Runtime:
"""
@@ -100,7 +103,10 @@ class Runtime:
output_data: Final output of the run
"""
if self._current_run is None:
raise RuntimeError("No run in progress")
# Gracefully handle case where run was already ended or never started
# This can happen during exception handling cascades
logger.warning("end_run called but no run in progress (already ended or never started)")
return
status = RunStatus.COMPLETED if success else RunStatus.FAILED
self._current_run.output_data = output_data or {}
@@ -158,10 +164,12 @@ class Runtime:
context: Additional context available when deciding
Returns:
The decision ID (use this to record outcome later)
The decision ID (use this to record outcome later), or empty string if no run in progress
"""
if self._current_run is None:
raise RuntimeError("No run in progress. Call start_run() first.")
# Gracefully handle case where run ended during exception handling
logger.warning(f"decide called but no run in progress: {intent}")
return ""
# Build Option objects
option_objects = []
@@ -220,7 +228,10 @@ class Runtime:
latency_ms: Time taken in milliseconds
"""
if self._current_run is None:
raise RuntimeError("No run in progress")
# Gracefully handle case where run ended during exception handling
# This can happen in cascading error scenarios
logger.warning(f"record_outcome called but no run in progress (decision_id={decision_id})")
return
outcome = Outcome(
success=success,
@@ -258,10 +269,13 @@ class Runtime:
suggested_fix: What might fix it (if known)
Returns:
The problem ID
The problem ID, or empty string if no run in progress
"""
if self._current_run is None:
raise RuntimeError("No run in progress")
# Gracefully handle case where run ended during exception handling
# Log the problem since we can't store it, then return empty ID
logger.warning(f"report_problem called but no run in progress: [{severity}] {description}")
return ""
return self._current_run.add_problem(
severity=severity,
+144
View File
@@ -0,0 +1,144 @@
"""
Goal-Based Testing Framework
A three-stage framework (Goal → Agent → Eval) where tests are LLM-generated
from success_criteria and constraints, with mandatory user approval.
## Core Flow
1. **Goal Stage**: Define success_criteria and constraints, generate constraint tests
2. **Agent Stage**: Build nodes + edges, run constraint tests during development
3. **Eval Stage**: Generate success_criteria tests, run all tests, debug failures
## Key Components
- **Schemas**: Test, TestResult, TestSuiteResult, ApprovalStatus, ErrorCategory
- **Storage**: TestStorage for persisting tests and results
- **Generation**: LLM-based test generation from Goal criteria
- **Approval**: Mandatory user approval workflow (CLI and programmatic)
- **Runner**: Parallel test execution with pytest-xdist inspired design
- **Debug**: Error categorization and fix suggestions
## MCP Tools
Testing tools are integrated into the main agent_builder_server.py (not a separate server).
This ensures the building_agent skill has access to all testing functionality:
- generate_constraint_tests, generate_success_tests
- approve_tests, run_tests, debug_test
- list_tests, get_pending_tests
## Usage
```python
from framework.testing import (
Test, TestResult, TestStorage,
ConstraintTestGenerator, SuccessCriteriaTestGenerator,
ParallelTestRunner, DebugTool,
)
# Generate tests
generator = ConstraintTestGenerator(llm)
tests = generator.generate(goal)
# Approve tests (required)
for test in tests:
test.approve("user")
storage.save_test(test)
# Run tests
runner = ParallelTestRunner()
result = runner.run_all(goal_id, agent_factory, tests)
# Debug failures
debug = DebugTool(storage)
info = debug.analyze(goal_id, test_id)
```
## CLI Commands
```bash
python -m framework test-generate goal.json
python -m framework test-approve <goal_id>
python -m framework test-run <agent_path> --goal <goal_id>
python -m framework test-debug <goal_id> <test_id>
```
"""
# Schemas
from framework.testing.test_case import (
ApprovalStatus,
TestType,
Test,
)
from framework.testing.test_result import (
ErrorCategory,
TestResult,
TestSuiteResult,
)
# Storage
from framework.testing.test_storage import TestStorage
# Generation
from framework.testing.constraint_gen import ConstraintTestGenerator
from framework.testing.success_gen import SuccessCriteriaTestGenerator
from framework.testing.prompts import (
CONSTRAINT_TEST_PROMPT,
SUCCESS_CRITERIA_TEST_PROMPT,
)
# Approval
from framework.testing.approval_types import (
ApprovalAction,
ApprovalRequest,
ApprovalResult,
BatchApprovalRequest,
BatchApprovalResult,
)
from framework.testing.approval_cli import interactive_approval, batch_approval
# Runner
from framework.testing.executor import TestExecutor
from framework.testing.parallel import ParallelTestRunner, ParallelConfig
from framework.testing.categorizer import ErrorCategorizer
# Debug
from framework.testing.debug_tool import DebugTool, DebugInfo
# CLI
from framework.testing.cli import register_testing_commands
__all__ = [
# Schemas
"ApprovalStatus",
"TestType",
"Test",
"ErrorCategory",
"TestResult",
"TestSuiteResult",
# Storage
"TestStorage",
# Generation
"ConstraintTestGenerator",
"SuccessCriteriaTestGenerator",
"CONSTRAINT_TEST_PROMPT",
"SUCCESS_CRITERIA_TEST_PROMPT",
# Approval
"ApprovalAction",
"ApprovalRequest",
"ApprovalResult",
"BatchApprovalRequest",
"BatchApprovalResult",
"interactive_approval",
"batch_approval",
# Runner
"TestExecutor",
"ParallelTestRunner",
"ParallelConfig",
"ErrorCategorizer",
# Debug
"DebugTool",
"DebugInfo",
# CLI
"register_testing_commands",
]
+295
View File
@@ -0,0 +1,295 @@
"""
Interactive CLI for reviewing and approving generated tests.
LLM-generated tests are NEVER created without user approval.
This CLI provides the interactive approval workflow.
"""
import json
import tempfile
import subprocess
import os
from typing import Callable
from framework.testing.test_case import Test, ApprovalStatus
from framework.testing.test_storage import TestStorage
from framework.testing.approval_types import (
ApprovalAction,
ApprovalRequest,
ApprovalResult,
BatchApprovalResult,
)
def interactive_approval(
tests: list[Test],
storage: TestStorage,
on_progress: Callable[[int, int], None] | None = None,
) -> list[ApprovalResult]:
"""
Interactive CLI flow for reviewing generated tests.
Displays each test and allows user to:
- [a]pprove: Accept as-is
- [r]eject: Decline with reason
- [e]dit: Modify before accepting
- [s]kip: Leave pending (decide later)
Args:
tests: List of pending tests to review
storage: TestStorage for saving decisions
on_progress: Optional callback(current, total) for progress tracking
Returns:
List of ApprovalResult for each processed test
"""
results = []
total = len(tests)
for i, test in enumerate(tests, 1):
if on_progress:
on_progress(i, total)
# Display test
_display_test(test, i, total)
# Get user action
action = _get_user_action()
# Process action
result = _process_action(test, action, storage)
results.append(result)
print() # Blank line between tests
return results
def batch_approval(
goal_id: str,
requests: list[ApprovalRequest],
storage: TestStorage,
) -> BatchApprovalResult:
"""
Process multiple approval requests at once.
Used by MCP interface for programmatic approval.
Args:
goal_id: Goal ID for the tests
requests: List of approval requests
storage: TestStorage for saving decisions
Returns:
BatchApprovalResult with counts and individual results
"""
results = []
counts = {
"approved": 0,
"modified": 0,
"rejected": 0,
"skipped": 0,
"errors": 0,
}
for req in requests:
# Validate request
valid, error = req.validate_action()
if not valid:
results.append(ApprovalResult.error_result(
req.test_id, req.action, error or "Invalid request"
))
counts["errors"] += 1
continue
# Load test
test = storage.load_test(goal_id, req.test_id)
if not test:
results.append(ApprovalResult.error_result(
req.test_id, req.action, f"Test {req.test_id} not found"
))
counts["errors"] += 1
continue
# Apply action
try:
if req.action == ApprovalAction.APPROVE:
test.approve(req.approved_by)
counts["approved"] += 1
elif req.action == ApprovalAction.MODIFY:
test.modify(req.modified_code or test.test_code, req.approved_by)
counts["modified"] += 1
elif req.action == ApprovalAction.REJECT:
test.reject(req.reason or "No reason provided")
counts["rejected"] += 1
elif req.action == ApprovalAction.SKIP:
counts["skipped"] += 1
# Save if not skipped
if req.action != ApprovalAction.SKIP:
storage.update_test(test)
results.append(ApprovalResult.success_result(
req.test_id, req.action, f"Test {req.action.value}d successfully"
))
except Exception as e:
results.append(ApprovalResult.error_result(
req.test_id, req.action, str(e)
))
counts["errors"] += 1
return BatchApprovalResult(
goal_id=goal_id,
total=len(requests),
approved=counts["approved"],
modified=counts["modified"],
rejected=counts["rejected"],
skipped=counts["skipped"],
errors=counts["errors"],
results=results,
)
def _display_test(test: Test, index: int, total: int) -> None:
"""Display a test for review."""
separator = "=" * 60
print(f"\n{separator}")
print(f"[{index}/{total}] {test.test_name}")
print(f"Type: {test.test_type.value}")
print(f"Criteria: {test.parent_criteria_id}")
print(f"Confidence: {test.llm_confidence * 100:.0f}%")
print(separator)
print(f"\nDescription: {test.description}")
if test.input:
print(f"\nInput:")
print(json.dumps(test.input, indent=2))
if test.expected_output:
print(f"\nExpected Output:")
print(json.dumps(test.expected_output, indent=2))
print(f"\nTest Code:")
print("-" * 40)
print(test.test_code)
print("-" * 40)
print("\n[a]pprove [r]eject [e]dit [s]kip")
def _get_user_action() -> ApprovalAction:
"""Get user's choice for action."""
while True:
choice = input("Your choice: ").strip().lower()
if choice == "a":
return ApprovalAction.APPROVE
elif choice == "r":
return ApprovalAction.REJECT
elif choice == "e":
return ApprovalAction.MODIFY
elif choice == "s":
return ApprovalAction.SKIP
else:
print("Invalid choice. Please enter a, r, e, or s.")
def _process_action(
test: Test,
action: ApprovalAction,
storage: TestStorage,
) -> ApprovalResult:
"""Process user's action on a test."""
try:
if action == ApprovalAction.APPROVE:
test.approve()
storage.update_test(test)
print("✓ Approved")
return ApprovalResult.success_result(test.id, action, "Approved")
elif action == ApprovalAction.REJECT:
reason = input("Rejection reason: ").strip()
if not reason:
reason = "No reason provided"
test.reject(reason)
storage.update_test(test)
print(f"✗ Rejected: {reason}")
return ApprovalResult.success_result(test.id, action, f"Rejected: {reason}")
elif action == ApprovalAction.MODIFY:
edited_code = _edit_test_code(test.test_code)
if edited_code != test.test_code:
test.modify(edited_code)
storage.update_test(test)
print("✓ Modified and approved")
return ApprovalResult.success_result(test.id, action, "Modified and approved")
else:
# No changes made, treat as approve
test.approve()
storage.update_test(test)
print("✓ Approved (no modifications)")
return ApprovalResult.success_result(test.id, ApprovalAction.APPROVE, "No modifications made")
elif action == ApprovalAction.SKIP:
print("⏭ Skipped (remains pending)")
return ApprovalResult.success_result(test.id, action, "Skipped")
else:
return ApprovalResult.error_result(test.id, action, f"Unknown action: {action}")
except Exception as e:
return ApprovalResult.error_result(test.id, action, str(e))
def _edit_test_code(code: str) -> str:
"""
Open test code in user's editor for modification.
Uses $EDITOR environment variable, falls back to vim/nano.
"""
editor = os.environ.get("EDITOR", "vim")
# Try to find an available editor
if not _command_exists(editor):
for fallback in ["nano", "vi", "notepad"]:
if _command_exists(fallback):
editor = fallback
break
# Create temp file with code
with tempfile.NamedTemporaryFile(
mode="w",
suffix=".py",
delete=False
) as f:
f.write(code)
temp_path = f.name
try:
# Open editor
subprocess.run([editor, temp_path], check=True)
# Read edited code
with open(temp_path) as f:
return f.read()
except subprocess.CalledProcessError:
print("Editor failed, keeping original code")
return code
except FileNotFoundError:
print(f"Editor '{editor}' not found, keeping original code")
return code
finally:
# Clean up temp file
try:
os.unlink(temp_path)
except OSError:
pass
def _command_exists(cmd: str) -> bool:
"""Check if a command exists in PATH."""
from shutil import which
return which(cmd) is not None
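# Minimal usage sketch (illustrative only): driving an interactive review from
# a script. The storage path and goal id below are hypothetical placeholders;
# TestStorage is assumed importable as in the CLI commands module.
if __name__ == "__main__":
    from pathlib import Path
    from framework.testing.test_storage import TestStorage
    storage = TestStorage(Path("data/tests/goal_123"))
    pending = storage.get_pending_tests("goal_123")
    if not pending:
        print("No pending tests to review")
    else:
        results = interactive_approval(pending, storage)
        print(f"Processed {len(results)} tests")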
+130
View File
@@ -0,0 +1,130 @@
"""
Types for the approval workflow.
These types are used for both interactive CLI approval and
programmatic/MCP-based approval.
"""
from enum import Enum
from datetime import datetime
from typing import Any
from pydantic import BaseModel, Field
class ApprovalAction(str, Enum):
"""Actions a user can take on a generated test."""
APPROVE = "approve" # Accept as-is
MODIFY = "modify" # Accept with modifications
REJECT = "reject" # Decline
SKIP = "skip" # Leave pending (decide later)
class ApprovalRequest(BaseModel):
"""
Request to approve/modify/reject a generated test.
Used by both CLI and MCP interfaces.
"""
test_id: str
action: ApprovalAction
modified_code: str | None = Field(
default=None,
description="New code if action is MODIFY"
)
reason: str | None = Field(
default=None,
description="Rejection reason if action is REJECT"
)
approved_by: str = "user"
def validate_action(self) -> tuple[bool, str | None]:
"""
Validate that the request has required fields for its action.
Returns:
Tuple of (is_valid, error_message)
"""
if self.action == ApprovalAction.MODIFY and not self.modified_code:
return False, "modified_code is required for MODIFY action"
if self.action == ApprovalAction.REJECT and not self.reason:
return False, "reason is required for REJECT action"
return True, None
class ApprovalResult(BaseModel):
"""
Result of processing an approval request.
"""
test_id: str
action: ApprovalAction
success: bool
message: str | None = None
error: str | None = None
timestamp: datetime = Field(default_factory=datetime.now)
@classmethod
def success_result(
cls, test_id: str, action: ApprovalAction, message: str | None = None
) -> "ApprovalResult":
"""Create a successful result."""
return cls(
test_id=test_id,
action=action,
success=True,
message=message,
)
@classmethod
def error_result(
cls, test_id: str, action: ApprovalAction, error: str
) -> "ApprovalResult":
"""Create an error result."""
return cls(
test_id=test_id,
action=action,
success=False,
error=error,
)
class BatchApprovalRequest(BaseModel):
"""
Request to approve multiple tests at once.
Useful for the MCP interface, where the user reviews all tests and submits decisions in one call.
"""
goal_id: str
approvals: list[ApprovalRequest]
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
return {
"goal_id": self.goal_id,
"approvals": [a.model_dump() for a in self.approvals],
}
class BatchApprovalResult(BaseModel):
"""
Result of processing a batch approval request.
"""
goal_id: str
total: int
approved: int
modified: int
rejected: int
skipped: int
errors: int
results: list[ApprovalResult]
def summary(self) -> str:
"""Return a summary string."""
return (
f"Processed {self.total} tests: "
f"{self.approved} approved, "
f"{self.modified} modified, "
f"{self.rejected} rejected, "
f"{self.skipped} skipped, "
f"{self.errors} errors"
)
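# Illustrative sketch: constructing and validating approval requests the way
# an MCP client might before calling batch_approval. All IDs are hypothetical.
if __name__ == "__main__":
    bad = ApprovalRequest(test_id="test_ef56ab78", action=ApprovalAction.REJECT)
    valid, error = bad.validate_action()
    print(valid, error)  # False, "reason is required for REJECT action"
    batch = BatchApprovalRequest(
        goal_id="goal_123",
        approvals=[
            ApprovalRequest(test_id="test_ab12cd34", action=ApprovalAction.APPROVE),
            ApprovalRequest(
                test_id="test_ef56ab78",
                action=ApprovalAction.REJECT,
                reason="Duplicates an existing constraint test",
            ),
        ],
    )
    print(len(batch.to_dict()["approvals"]))  # 2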
+260
View File
@@ -0,0 +1,260 @@
"""
Error categorization for test failures.
Categorizes errors to guide iteration strategy:
- LOGIC_ERROR: Goal definition is wrong → update success_criteria/constraints
- IMPLEMENTATION_ERROR: Code bug → fix nodes/edges in Agent stage
- EDGE_CASE: New scenario discovered → add new test only
"""
import re
from typing import Any
from framework.testing.test_result import ErrorCategory, TestResult
class ErrorCategorizer:
"""
Categorize test failures for guiding iteration.
Uses pattern matching heuristics to classify errors.
Each category has different implications for how to fix.
"""
# Patterns indicating goal/criteria definition is wrong
LOGIC_ERROR_PATTERNS = [
r"goal not achieved",
r"constraint violated:?\s*core",
r"fundamental assumption",
r"success criteria mismatch",
r"criteria not met",
r"expected behavior incorrect",
r"specification error",
r"requirement mismatch",
]
# Patterns indicating code/implementation bug
IMPLEMENTATION_ERROR_PATTERNS = [
r"TypeError",
r"AttributeError",
r"KeyError",
r"IndexError",
r"ValueError",
r"NameError",
r"ImportError",
r"ModuleNotFoundError",
r"RuntimeError",
r"NullPointerException",
r"NoneType.*has no attribute",
r"tool call failed",
r"node execution error",
r"agent execution failed",
r"assertion.*failed",
r"AssertionError",
r"expected.*but got",
r"unexpected.*type",
r"missing required",
r"invalid.*argument",
]
# Patterns indicating edge case / new scenario
EDGE_CASE_PATTERNS = [
r"boundary condition",
r"timeout",
r"connection.*timeout",
r"request.*timeout",
r"unexpected format",
r"unexpected response",
r"rare input",
r"empty.*result",
r"null.*value",
r"empty.*response",
r"no.*results",
r"rate.*limit",
r"quota.*exceeded",
r"retry.*exhausted",
r"unicode.*error",
r"encoding.*error",
r"special.*character",
]
def __init__(self):
"""Initialize categorizer with compiled patterns."""
self._logic_patterns = [
re.compile(p, re.IGNORECASE) for p in self.LOGIC_ERROR_PATTERNS
]
self._impl_patterns = [
re.compile(p, re.IGNORECASE) for p in self.IMPLEMENTATION_ERROR_PATTERNS
]
self._edge_patterns = [
re.compile(p, re.IGNORECASE) for p in self.EDGE_CASE_PATTERNS
]
def categorize(self, result: TestResult) -> ErrorCategory | None:
"""
Categorize a test failure.
Args:
result: TestResult to categorize
Returns:
ErrorCategory if test failed, None if passed
"""
if result.passed:
return None
# Combine error sources for analysis
error_text = self._get_error_text(result)
# Check patterns in priority order
# Logic errors take precedence (wrong goal definition)
for pattern in self._logic_patterns:
if pattern.search(error_text):
return ErrorCategory.LOGIC_ERROR
# Then implementation errors (code bugs)
for pattern in self._impl_patterns:
if pattern.search(error_text):
return ErrorCategory.IMPLEMENTATION_ERROR
# Then edge cases (new scenarios)
for pattern in self._edge_patterns:
if pattern.search(error_text):
return ErrorCategory.EDGE_CASE
# Default to implementation error (most common)
return ErrorCategory.IMPLEMENTATION_ERROR
def categorize_with_confidence(
self, result: TestResult
) -> tuple[ErrorCategory | None, float]:
"""
Categorize with a confidence score.
Args:
result: TestResult to categorize
Returns:
Tuple of (category, confidence 0-1)
"""
if result.passed:
return None, 1.0
error_text = self._get_error_text(result)
# Count pattern matches for each category
logic_matches = sum(
1 for p in self._logic_patterns if p.search(error_text)
)
impl_matches = sum(
1 for p in self._impl_patterns if p.search(error_text)
)
edge_matches = sum(
1 for p in self._edge_patterns if p.search(error_text)
)
total_matches = logic_matches + impl_matches + edge_matches
if total_matches == 0:
# No pattern matches, default to implementation with low confidence
return ErrorCategory.IMPLEMENTATION_ERROR, 0.3
# Calculate confidence based on match dominance
if logic_matches >= impl_matches and logic_matches >= edge_matches:
confidence = logic_matches / total_matches if total_matches > 0 else 0.5
return ErrorCategory.LOGIC_ERROR, min(0.9, 0.5 + confidence * 0.4)
if impl_matches >= logic_matches and impl_matches >= edge_matches:
confidence = impl_matches / total_matches if total_matches > 0 else 0.5
return ErrorCategory.IMPLEMENTATION_ERROR, min(0.9, 0.5 + confidence * 0.4)
confidence = edge_matches / total_matches if total_matches > 0 else 0.5
return ErrorCategory.EDGE_CASE, min(0.9, 0.5 + confidence * 0.4)
def _get_error_text(self, result: TestResult) -> str:
"""Extract all error text from a result for analysis."""
parts = []
if result.error_message:
parts.append(result.error_message)
if result.stack_trace:
parts.append(result.stack_trace)
# Include log messages
for log in result.runtime_logs:
if log.get("level") in ("ERROR", "CRITICAL", "WARNING"):
parts.append(str(log.get("msg", "")))
return " ".join(parts)
def get_fix_suggestion(self, category: ErrorCategory) -> str:
"""
Get a fix suggestion based on error category.
Args:
category: ErrorCategory from categorization
Returns:
Human-readable fix suggestion
"""
suggestions = {
ErrorCategory.LOGIC_ERROR: (
"Review and update success_criteria or constraints in the Goal definition. "
"The goal specification may not accurately describe the desired behavior."
),
ErrorCategory.IMPLEMENTATION_ERROR: (
"Fix the code in agent nodes/edges. "
"There's a bug in the implementation that needs to be corrected."
),
ErrorCategory.EDGE_CASE: (
"Add a new test for this edge case scenario. "
"This is a valid scenario that wasn't covered by existing tests."
),
}
return suggestions.get(category, "Review the test and agent implementation.")
def get_iteration_guidance(self, category: ErrorCategory) -> dict[str, Any]:
"""
Get detailed iteration guidance based on error category.
Returns a dict with:
- stage: Which stage to return to (Goal, Agent, Eval)
- action: What action to take
- restart_required: Whether full 3-step flow restart is needed
"""
guidance = {
ErrorCategory.LOGIC_ERROR: {
"stage": "Goal",
"action": "Update success_criteria or constraints",
"restart_required": True,
"description": (
"The goal definition is incorrect. Update the success criteria "
"or constraints, then restart the full Goal → Agent → Eval flow."
),
},
ErrorCategory.IMPLEMENTATION_ERROR: {
"stage": "Agent",
"action": "Fix nodes/edges implementation",
"restart_required": False,
"description": (
"There's a code bug. Fix the agent implementation, "
"then re-run Eval (skip Goal stage)."
),
},
ErrorCategory.EDGE_CASE: {
"stage": "Eval",
"action": "Add new test only",
"restart_required": False,
"description": (
"This is a new scenario. Add a test for it and continue "
"in the Eval stage."
),
},
}
return guidance.get(category, {
"stage": "Unknown",
"action": "Review manually",
"restart_required": False,
"description": "Unable to determine category. Manual review required.",
})
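# Illustrative sketch: categorizing a synthetic failure. Only the fields read
# by _get_error_text are set; the remaining TestResult fields keep their
# defaults, as in the runner's own error-result construction.
if __name__ == "__main__":
    failing = TestResult(
        test_id="test_ab12cd34",
        passed=False,
        duration_ms=42,
        error_message="KeyError: 'videos' - tool call failed",
    )
    categorizer = ErrorCategorizer()
    category, confidence = categorizer.categorize_with_confidence(failing)
    print(category, f"{confidence:.2f}")  # implementation-error patterns dominate
    print(categorizer.get_fix_suggestion(category))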
+413
View File
@@ -0,0 +1,413 @@
"""
CLI commands for goal-based testing.
Provides commands:
- test-generate: Generate tests from a goal
- test-approve: Review and approve pending tests
- test-run: Run tests for an agent
- test-debug: Debug a failed test
"""
import argparse
import json
import sys
from pathlib import Path
from framework.graph.goal import Goal
from framework.testing.test_case import TestType
from framework.testing.test_storage import TestStorage
from framework.testing.constraint_gen import ConstraintTestGenerator
from framework.testing.success_gen import SuccessCriteriaTestGenerator
from framework.testing.approval_cli import interactive_approval
from framework.testing.parallel import ParallelTestRunner, ParallelConfig, AgentFactory
from framework.testing.debug_tool import DebugTool
DEFAULT_STORAGE_PATH = Path("data/tests")
def register_testing_commands(subparsers: argparse._SubParsersAction) -> None:
"""Register testing CLI commands."""
# test-generate
gen_parser = subparsers.add_parser(
"test-generate",
help="Generate tests from goal criteria",
)
gen_parser.add_argument(
"goal_file",
help="Path to goal JSON file",
)
gen_parser.add_argument(
"--type",
choices=["constraint", "success", "all"],
default="all",
help="Type of tests to generate",
)
gen_parser.add_argument(
"--auto-approve",
action="store_true",
help="Skip interactive approval (use with caution)",
)
gen_parser.add_argument(
"--output",
"-o",
help="Output directory for tests (default: data/tests/<goal_id>)",
)
gen_parser.set_defaults(func=cmd_test_generate)
# test-approve
approve_parser = subparsers.add_parser(
"test-approve",
help="Review and approve pending tests",
)
approve_parser.add_argument(
"goal_id",
help="Goal ID to review tests for",
)
approve_parser.add_argument(
"--storage",
help="Storage directory (default: data/tests/<goal_id>)",
)
approve_parser.set_defaults(func=cmd_test_approve)
# test-run
run_parser = subparsers.add_parser(
"test-run",
help="Run tests for an agent",
)
run_parser.add_argument(
"agent_path",
help="Path to agent export folder",
)
run_parser.add_argument(
"--goal",
"-g",
required=True,
help="Goal ID to run tests for",
)
run_parser.add_argument(
"--parallel",
"-p",
type=int,
default=0,
help="Number of parallel workers (0 for sequential)",
)
run_parser.add_argument(
"--fail-fast",
action="store_true",
help="Stop on first failure",
)
run_parser.add_argument(
"--type",
choices=["constraint", "success", "edge_case", "all"],
default="all",
help="Type of tests to run",
)
run_parser.set_defaults(func=cmd_test_run)
# test-debug
debug_parser = subparsers.add_parser(
"test-debug",
help="Debug a failed test",
)
debug_parser.add_argument(
"goal_id",
help="Goal ID",
)
debug_parser.add_argument(
"test_id",
help="Test ID to debug",
)
debug_parser.add_argument(
"--run-id",
help="Runtime run ID for detailed logs",
)
debug_parser.set_defaults(func=cmd_test_debug)
# test-list
list_parser = subparsers.add_parser(
"test-list",
help="List tests for a goal",
)
list_parser.add_argument(
"goal_id",
help="Goal ID",
)
list_parser.add_argument(
"--status",
choices=["pending", "approved", "modified", "rejected", "all"],
default="all",
help="Filter by approval status",
)
list_parser.set_defaults(func=cmd_test_list)
# test-stats
stats_parser = subparsers.add_parser(
"test-stats",
help="Show test statistics for a goal",
)
stats_parser.add_argument(
"goal_id",
help="Goal ID",
)
stats_parser.set_defaults(func=cmd_test_stats)
def cmd_test_generate(args: argparse.Namespace) -> int:
"""Generate tests from a goal file."""
# Load goal
goal_path = Path(args.goal_file)
if not goal_path.exists():
print(f"Error: Goal file not found: {goal_path}")
return 1
with open(goal_path) as f:
goal = Goal.model_validate_json(f.read())
print(f"Loaded goal: {goal.name} ({goal.id})")
# Determine output directory
output_dir = Path(args.output) if args.output else DEFAULT_STORAGE_PATH / goal.id
storage = TestStorage(output_dir)
# Get LLM provider
try:
from framework.llm import AnthropicProvider
llm = AnthropicProvider()
except Exception as e:
print(f"Error: Failed to initialize LLM provider: {e}")
return 1
all_tests = []
# Generate constraint tests
if args.type in ("constraint", "all"):
print(f"\nGenerating constraint tests for {len(goal.constraints)} constraints...")
generator = ConstraintTestGenerator(llm)
constraint_tests = generator.generate(goal)
all_tests.extend(constraint_tests)
print(f"Generated {len(constraint_tests)} constraint tests")
# Generate success criteria tests
if args.type in ("success", "all"):
print(f"\nGenerating success criteria tests for {len(goal.success_criteria)} criteria...")
generator = SuccessCriteriaTestGenerator(llm)
success_tests = generator.generate(goal)
all_tests.extend(success_tests)
print(f"Generated {len(success_tests)} success criteria tests")
if not all_tests:
print("\nNo tests generated.")
return 0
print(f"\nTotal tests generated: {len(all_tests)}")
# Approval
if args.auto_approve:
print("\nAuto-approving all tests...")
for test in all_tests:
test.approve("cli-auto")
storage.save_test(test)
print(f"Saved {len(all_tests)} tests to {output_dir}")
else:
print("\nStarting interactive approval...")
# Save pending tests first
for test in all_tests:
storage.save_test(test)
results = interactive_approval(all_tests, storage)
approved = sum(1 for r in results if r.action.value in ("approve", "modify"))
print(f"\nApproved: {approved}/{len(all_tests)} tests")
return 0
def cmd_test_approve(args: argparse.Namespace) -> int:
"""Review and approve pending tests."""
storage_path = Path(args.storage) if args.storage else DEFAULT_STORAGE_PATH / args.goal_id
storage = TestStorage(storage_path)
pending = storage.get_pending_tests(args.goal_id)
if not pending:
print(f"No pending tests for goal {args.goal_id}")
return 0
print(f"Found {len(pending)} pending tests\n")
results = interactive_approval(pending, storage)
approved = sum(1 for r in results if r.action.value in ("approve", "modify"))
print(f"\nApproved: {approved}/{len(pending)} tests")
return 0
def cmd_test_run(args: argparse.Namespace) -> int:
"""Run tests for an agent."""
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal)
# Get approved tests
tests = storage.get_approved_tests(args.goal)
# Filter by type
if args.type != "all":
type_map = {
"constraint": TestType.CONSTRAINT,
"success": TestType.SUCCESS_CRITERIA,
"edge_case": TestType.EDGE_CASE,
}
filter_type = type_map.get(args.type)
if filter_type:
tests = [t for t in tests if t.test_type == filter_type]
if not tests:
print(f"No approved tests found for goal {args.goal}")
return 1
print(f"Running {len(tests)} tests...\n")
# Configure runner
config = ParallelConfig(
num_workers=args.parallel if args.parallel > 0 else 1,
fail_fast=args.fail_fast,
)
# Run with progress - use AgentFactory for picklable parallel execution
runner = ParallelTestRunner(config, storage)
def on_result(result):
status = "" if result.passed else ""
print(f" {status} {result.test_id} ({result.duration_ms}ms)")
result = runner.run_all(
goal_id=args.goal,
agent_factory=AgentFactory(args.agent_path),
tests=tests,
on_result=on_result,
)
# Print summary
print(f"\n{'=' * 40}")
print(f"Results: {result.passed}/{result.total} passed ({result.pass_rate:.1%})")
print(f"Duration: {result.duration_ms}ms")
if not result.all_passed:
print(f"\nFailed tests:")
for r in result.get_failed_results():
print(f" - {r.test_id}: {r.error_message}")
if r.error_category:
print(f" Category: {r.error_category.value}")
return 0 if result.all_passed else 1
def cmd_test_debug(args: argparse.Namespace) -> int:
"""Debug a failed test."""
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id)
# Try to load runtime storage
runtime_storage = None
try:
from framework.storage.backend import FileStorage
runtime_storage = FileStorage(f"data/runtime/{args.goal_id}")
except Exception:
pass
debug_tool = DebugTool(storage, runtime_storage)
info = debug_tool.analyze(args.goal_id, args.test_id, args.run_id)
# Print debug info
print(f"Debug Info for: {info.test_name}")
print("=" * 50)
print(f"\nTest ID: {info.test_id}")
print(f"Passed: {info.passed}")
if info.error_category:
print(f"\nError Category: {info.error_category}")
print(f"Suggested Fix: {info.suggested_fix}")
if info.error_message:
print(f"\nError Message:\n{info.error_message}")
if info.stack_trace:
print(f"\nStack Trace:\n{info.stack_trace}")
if info.iteration_guidance:
print(f"\nIteration Guidance:")
print(f" Stage: {info.iteration_guidance.get('stage')}")
print(f" Action: {info.iteration_guidance.get('action')}")
print(f" Restart Required: {info.iteration_guidance.get('restart_required')}")
print(f"\nInput:\n{json.dumps(info.input, indent=2)}")
print(f"\nExpected:\n{json.dumps(info.expected, indent=2)}")
print(f"\nActual:\n{json.dumps(info.actual, indent=2, default=str)}")
return 0
def cmd_test_list(args: argparse.Namespace) -> int:
"""List tests for a goal."""
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id)
tests = storage.get_tests_by_goal(args.goal_id)
# Filter by status
if args.status != "all":
from framework.testing.test_case import ApprovalStatus
try:
filter_status = ApprovalStatus(args.status)
tests = [t for t in tests if t.approval_status == filter_status]
except ValueError:
pass
if not tests:
print(f"No tests found for goal {args.goal_id}")
return 0
print(f"Tests for goal {args.goal_id}:\n")
for t in tests:
status_icon = {
    "pending": "○",
    "approved": "✓",
    "modified": "✓*",
    "rejected": "✗",
}.get(t.approval_status.value, "?")
result_icon = ""
if t.last_result:
result_icon = " [PASS]" if t.last_result == "passed" else " [FAIL]"
print(f" {status_icon} {t.test_name} ({t.test_type.value}){result_icon}")
print(f" ID: {t.id}")
print(f" Criteria: {t.parent_criteria_id}")
if t.llm_confidence:
print(f" Confidence: {t.llm_confidence:.0%}")
print()
return 0
def cmd_test_stats(args: argparse.Namespace) -> int:
"""Show test statistics."""
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id)
stats = storage.get_stats()
print(f"Statistics for goal {args.goal_id}:\n")
print(f" Total tests: {stats['total_tests']}")
print(f"\n By approval status:")
for status, count in stats["by_approval"].items():
print(f" {status}: {count}")
# Get pass/fail stats
tests = storage.get_approved_tests(args.goal_id)
passed = sum(1 for t in tests if t.last_result == "passed")
failed = sum(1 for t in tests if t.last_result == "failed")
not_run = sum(1 for t in tests if t.last_result is None)
print(f"\n Execution results:")
print(f" Passed: {passed}")
print(f" Failed: {failed}")
print(f" Not run: {not_run}")
return 0
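# Illustrative wiring sketch: mounting these subcommands on a standalone
# parser. The program name is hypothetical; the real CLI may register them
# alongside other command groups.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog="hive-test", description="Goal-based testing")
    subparsers = parser.add_subparsers(dest="command", required=True)
    register_testing_commands(subparsers)
    args = parser.parse_args()
    sys.exit(args.func(args))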
+201
View File
@@ -0,0 +1,201 @@
"""
Constraint test generator.
Generates tests for Goal constraints using LLM.
Tests are returned with PENDING approval status.
"""
import uuid
from typing import TYPE_CHECKING
from framework.graph.goal import Goal, Constraint
from framework.testing.test_case import Test, TestType, ApprovalStatus
from framework.testing.prompts import CONSTRAINT_TEST_PROMPT
from framework.llm.provider import Tool, ToolUse, ToolResult
if TYPE_CHECKING:
from framework.llm.provider import LLMProvider
# Tool for collecting generated tests - Claude handles JSON escaping automatically
SUBMIT_TEST_TOOL = Tool(
name="submit_test",
description="Submit a generated constraint test. Call once per test.",
parameters={
"properties": {
"constraint_id": {
"type": "string",
"description": "ID of the constraint being tested",
},
"test_name": {
"type": "string",
"description": "pytest function name, e.g., test_constraint_api_limits_respected",
},
"test_code": {
"type": "string",
"description": "Complete Python test function code",
},
"description": {
"type": "string",
"description": "What the test validates",
},
"input": {
"type": "object",
"description": "Test input data",
},
"expected_output": {
"type": "object",
"description": "Expected output",
},
"confidence": {
"type": "number",
"description": "Confidence score 0-1",
},
},
"required": ["constraint_id", "test_name", "test_code", "description", "confidence"],
},
)
class ConstraintTestGenerator:
"""
Generate constraint tests from Goal constraints.
Generated tests require user approval before being added to the test suite.
"""
def __init__(self, llm: "LLMProvider"):
"""
Initialize generator with LLM provider.
Args:
llm: LLM provider for test generation (e.g., AnthropicProvider)
"""
self.llm = llm
def generate(self, goal: Goal) -> list[Test]:
"""
Generate tests for all constraints in a goal.
Args:
goal: Goal with constraints to test
Returns:
List of Test objects with approval_status=PENDING.
These MUST be approved before being added to the test suite.
"""
if not goal.constraints:
return []
# Format prompt
prompt = CONSTRAINT_TEST_PROMPT.format(
goal_name=goal.name,
goal_description=goal.description,
constraints_formatted=self._format_constraints(goal.constraints),
)
# Collect tests via tool calls - Claude handles JSON escaping automatically
collected_tests: list[dict] = []
def tool_executor(tool_use: ToolUse) -> ToolResult:
if tool_use.name == "submit_test":
collected_tests.append(tool_use.input)
return ToolResult(
tool_use_id=tool_use.id, content="Test recorded successfully"
)
return ToolResult(
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
)
self.llm.complete_with_tools(
messages=[{"role": "user", "content": prompt}],
system="You are a test generation expert. For each constraint, call the submit_test tool with the test details.",
tools=[SUBMIT_TEST_TOOL],
tool_executor=tool_executor,
max_iterations=20,
)
return self._create_tests_from_collected(collected_tests, goal.id)
def generate_for_constraint(
self, goal: Goal, constraint: Constraint
) -> list[Test]:
"""
Generate tests for a single constraint.
Args:
goal: Goal containing the constraint
constraint: Specific constraint to test
Returns:
List of Test objects for the constraint
"""
# Format prompt with just this constraint
prompt = CONSTRAINT_TEST_PROMPT.format(
goal_name=goal.name,
goal_description=goal.description,
constraints_formatted=self._format_constraint(constraint),
)
# Collect tests via tool calls
collected_tests: list[dict] = []
def tool_executor(tool_use: ToolUse) -> ToolResult:
if tool_use.name == "submit_test":
collected_tests.append(tool_use.input)
return ToolResult(
tool_use_id=tool_use.id, content="Test recorded successfully"
)
return ToolResult(
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
)
self.llm.complete_with_tools(
messages=[{"role": "user", "content": prompt}],
system="You are a test generation expert. Call the submit_test tool with the test details.",
tools=[SUBMIT_TEST_TOOL],
tool_executor=tool_executor,
max_iterations=10,
)
return self._create_tests_from_collected(collected_tests, goal.id)
def _format_constraints(self, constraints: list[Constraint]) -> str:
"""Format constraints for prompt."""
lines = []
for c in constraints:
lines.append(self._format_constraint(c))
lines.append("")
return "\n".join(lines)
def _format_constraint(self, constraint: Constraint) -> str:
"""Format a single constraint for prompt."""
severity = "HARD" if constraint.constraint_type == "hard" else "SOFT"
return f"""### Constraint: {constraint.id}
- Type: {severity} ({constraint.constraint_type})
- Category: {constraint.category}
- Description: {constraint.description}
- Check: {constraint.check}"""
def _create_tests_from_collected(
self, collected: list[dict], goal_id: str
) -> list[Test]:
"""Create Test objects from tool call data."""
tests = []
for td in collected:
test = Test(
id=f"test_{uuid.uuid4().hex[:8]}",
goal_id=goal_id,
parent_criteria_id=td.get("constraint_id", "unknown"),
test_type=TestType.CONSTRAINT,
test_name=td.get("test_name", "unnamed_test"),
test_code=td.get("test_code", ""),
description=td.get("description", ""),
input=td.get("input", {}),
expected_output=td.get("expected_output", {}),
generated_by="llm",
llm_confidence=float(td.get("confidence", 0.5)),
approval_status=ApprovalStatus.PENDING,
)
tests.append(test)
return tests
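# Illustrative sketch: generating pending constraint tests for a goal loaded
# from disk. The goal file path is hypothetical; AnthropicProvider is assumed
# to satisfy the LLMProvider interface used above.
if __name__ == "__main__":
    from framework.llm import AnthropicProvider
    with open("data/goals/goal_123.json") as f:
        goal = Goal.model_validate_json(f.read())
    tests = ConstraintTestGenerator(AnthropicProvider()).generate(goal)
    for t in tests:
        print(f"{t.test_name} [{t.approval_status.value}] confidence={t.llm_confidence:.0%}")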
+286
View File
@@ -0,0 +1,286 @@
"""
Debug tool for analyzing failed tests.
Provides detailed information for debugging:
- Test input and expected output
- Actual output and error details
- Error categorization
- Runtime logs and execution path
- Fix suggestions
"""
from typing import Any
from pydantic import BaseModel, Field
from framework.testing.test_case import Test
from framework.testing.test_result import TestResult, ErrorCategory
from framework.testing.test_storage import TestStorage
from framework.testing.categorizer import ErrorCategorizer
class DebugInfo(BaseModel):
"""
Comprehensive debug information for a failed test.
"""
test_id: str
test_name: str
# Test definition
input: dict[str, Any] = Field(default_factory=dict)
expected: dict[str, Any] = Field(default_factory=dict)
# Actual result
actual: Any = None
passed: bool = False
# Error details
error_message: str | None = None
error_category: str | None = None
stack_trace: str | None = None
# Runtime data
logs: list[dict[str, Any]] = Field(default_factory=list)
runtime_data: dict[str, Any] = Field(default_factory=dict)
# Fix guidance
suggested_fix: str | None = None
iteration_guidance: dict[str, Any] = Field(default_factory=dict)
def to_dict(self) -> dict[str, Any]:
"""Convert to dict for JSON serialization."""
return self.model_dump()
class DebugTool:
"""
Debug tool for analyzing failed tests.
Integrates with:
- TestStorage for test and result data
- Runtime storage (optional) for decision logs
- ErrorCategorizer for classification
"""
def __init__(
self,
test_storage: TestStorage,
runtime_storage: Any | None = None,
):
"""
Initialize debug tool.
Args:
test_storage: Storage for test and result data
runtime_storage: Optional FileStorage for Runtime data
"""
self.test_storage = test_storage
self.runtime_storage = runtime_storage
self.categorizer = ErrorCategorizer()
def analyze(
self,
goal_id: str,
test_id: str,
run_id: str | None = None,
) -> DebugInfo:
"""
Get detailed debug info for a failed test.
Args:
goal_id: Goal ID containing the test
test_id: ID of the test to analyze
run_id: Optional Runtime run ID for detailed logs
Returns:
DebugInfo with comprehensive debug data
"""
# Load test
test = self.test_storage.load_test(goal_id, test_id)
if not test:
return DebugInfo(
test_id=test_id,
test_name="unknown",
error_message=f"Test {test_id} not found in goal {goal_id}",
)
# Load latest result
result = self.test_storage.get_latest_result(test_id)
# Build debug info
debug_info = DebugInfo(
test_id=test_id,
test_name=test.test_name,
input=test.input,
expected=test.expected_output,
)
if result:
debug_info.actual = result.actual_output
debug_info.passed = result.passed
debug_info.error_message = result.error_message
debug_info.stack_trace = result.stack_trace
debug_info.logs = result.runtime_logs
# Set category
if result.error_category:
debug_info.error_category = result.error_category.value
elif not result.passed:
# Categorize if not already done
category = self.categorizer.categorize(result)
if category:
debug_info.error_category = category.value
# Get runtime data if available
if run_id and self.runtime_storage:
debug_info.runtime_data = self._get_runtime_data(run_id)
# Generate fix suggestions
if debug_info.error_category:
category = ErrorCategory(debug_info.error_category)
debug_info.suggested_fix = self.categorizer.get_fix_suggestion(category)
debug_info.iteration_guidance = self.categorizer.get_iteration_guidance(category)
return debug_info
def analyze_result(
self,
test: Test,
result: TestResult,
run_id: str | None = None,
) -> DebugInfo:
"""
Analyze a test result directly (without loading from storage).
Args:
test: The Test that was run
result: The TestResult to analyze
run_id: Optional Runtime run ID
Returns:
DebugInfo with debug data
"""
debug_info = DebugInfo(
test_id=test.id,
test_name=test.test_name,
input=test.input,
expected=test.expected_output,
actual=result.actual_output,
passed=result.passed,
error_message=result.error_message,
stack_trace=result.stack_trace,
logs=result.runtime_logs,
)
# Categorize
if result.error_category:
debug_info.error_category = result.error_category.value
elif not result.passed:
category = self.categorizer.categorize(result)
if category:
debug_info.error_category = category.value
# Runtime data
if run_id and self.runtime_storage:
debug_info.runtime_data = self._get_runtime_data(run_id)
# Fix suggestions
if debug_info.error_category:
category = ErrorCategory(debug_info.error_category)
debug_info.suggested_fix = self.categorizer.get_fix_suggestion(category)
debug_info.iteration_guidance = self.categorizer.get_iteration_guidance(category)
return debug_info
def get_failure_summary(
self,
goal_id: str,
) -> dict[str, Any]:
"""
Get summary of all failures for a goal.
Returns:
Dict with failure counts by category and test IDs
"""
tests = self.test_storage.get_tests_by_goal(goal_id)
failures_by_category: dict[str, list[str]] = {
"logic_error": [],
"implementation_error": [],
"edge_case": [],
"uncategorized": [],
}
for test in tests:
if test.last_result == "failed":
result = self.test_storage.get_latest_result(test.id)
if result and result.error_category:
failures_by_category[result.error_category.value].append(test.id)
else:
failures_by_category["uncategorized"].append(test.id)
return {
"goal_id": goal_id,
"total_failures": sum(len(ids) for ids in failures_by_category.values()),
"by_category": failures_by_category,
"iteration_suggestions": self._get_iteration_suggestions(failures_by_category),
}
def _get_runtime_data(self, run_id: str) -> dict[str, Any]:
"""Extract runtime data from Runtime storage."""
if not self.runtime_storage:
return {}
try:
run = self.runtime_storage.load_run(run_id)
if not run:
return {"error": f"Run {run_id} not found"}
return {
"execution_path": run.metrics.nodes_executed if hasattr(run, "metrics") else [],
"decisions": [
d.model_dump() if hasattr(d, "model_dump") else str(d)
for d in getattr(run, "decisions", [])
],
"problems": [
p.model_dump() if hasattr(p, "model_dump") else str(p)
for p in getattr(run, "problems", [])
],
"status": run.status.value if hasattr(run, "status") else "unknown",
}
except Exception as e:
return {"error": f"Failed to load runtime data: {e}"}
def _get_iteration_suggestions(
self,
failures_by_category: dict[str, list[str]],
) -> list[str]:
"""Generate iteration suggestions based on failure categories."""
suggestions = []
if failures_by_category["logic_error"]:
suggestions.append(
f"Found {len(failures_by_category['logic_error'])} logic errors. "
"Review and update Goal success_criteria/constraints, then restart "
"the full Goal → Agent → Eval flow."
)
if failures_by_category["implementation_error"]:
suggestions.append(
f"Found {len(failures_by_category['implementation_error'])} implementation errors. "
"Fix agent node/edge code and re-run Eval."
)
if failures_by_category["edge_case"]:
suggestions.append(
f"Found {len(failures_by_category['edge_case'])} edge cases. "
"These are new scenarios - add tests for them."
)
if failures_by_category["uncategorized"]:
suggestions.append(
f"Found {len(failures_by_category['uncategorized'])} uncategorized failures. "
"Manual review required."
)
return suggestions
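# Illustrative sketch: summarizing failures for a goal without runtime storage
# (runtime_data stays empty). The storage path and goal id are hypothetical.
if __name__ == "__main__":
    from pathlib import Path
    tool = DebugTool(TestStorage(Path("data/tests/goal_123")))
    summary = tool.get_failure_summary("goal_123")
    print(f"{summary['total_failures']} failures")
    for suggestion in summary["iteration_suggestions"]:
        print("-", suggestion)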
+407
View File
@@ -0,0 +1,407 @@
"""
Single test executor.
Executes a single test against an agent and returns a TestResult.
"""
import asyncio
import inspect
import os
import time
import traceback
from typing import Any, Protocol, runtime_checkable
from framework.testing.test_case import Test
from framework.testing.test_result import TestResult, ErrorCategory
from framework.testing.categorizer import ErrorCategorizer
class LLMJudge:
"""
LLM-based judge for semantic evaluation of test results.
Used by tests that need to evaluate semantic properties like
"no hallucination" or "preserves meaning" that can't be checked
with simple assertions.
"""
def __init__(self):
"""Initialize the LLM judge."""
self._client = None
def _get_client(self):
"""Lazy-load the Anthropic client."""
if self._client is None:
try:
import anthropic
self._client = anthropic.Anthropic()
except ImportError:
raise RuntimeError("anthropic package required for LLM judge")
return self._client
def evaluate(
self,
constraint: str,
source_document: str,
summary: str,
criteria: str,
) -> dict[str, Any]:
"""
Evaluate whether a summary meets a constraint.
Args:
constraint: The constraint being tested (e.g., "no-hallucination")
source_document: The original document
summary: The generated summary to evaluate
criteria: Human-readable criteria for evaluation
Returns:
Dict with 'passes' (bool) and 'explanation' (str)
"""
client = self._get_client()
prompt = f"""You are evaluating whether a summary meets a specific constraint.
CONSTRAINT: {constraint}
CRITERIA: {criteria}
SOURCE DOCUMENT:
{source_document}
SUMMARY TO EVALUATE:
{summary}
Evaluate whether the summary meets the constraint. Be strict but fair.
Respond with JSON in this exact format:
{{"passes": true/false, "explanation": "brief explanation of your judgment"}}
Only output the JSON, nothing else."""
try:
response = client.messages.create(
model="claude-haiku-4-5-20251001",
max_tokens=500,
messages=[{"role": "user", "content": prompt}]
)
# Parse the response
import json
text = response.content[0].text.strip()
# Handle potential markdown code blocks
if text.startswith("```"):
text = text.split("```")[1]
if text.startswith("json"):
text = text[4:]
text = text.strip()
result = json.loads(text)
return {
"passes": bool(result.get("passes", False)),
"explanation": result.get("explanation", "No explanation provided")
}
except Exception as e:
# On error, fail the test with explanation
return {
"passes": False,
"explanation": f"LLM judge error: {e}"
}
@runtime_checkable
class AgentProtocol(Protocol):
"""Protocol for agent that can be tested."""
def run(self, input: dict[str, Any]) -> Any:
"""Run the agent with input and return result."""
...
class SyncAgentWrapper:
"""
Wrapper that makes async agent.run() callable synchronously.
This allows tests to call agent.run() without async/await syntax,
which simplifies test code generation and execution.
"""
def __init__(self, agent: Any):
self._agent = agent
self._loop: asyncio.AbstractEventLoop | None = None
def run(self, input_data: dict[str, Any]) -> Any:
"""
Run agent synchronously by wrapping async call.
Args:
input_data: Input data for the agent
Returns:
Output dict from the agent's ExecutionResult
"""
coro = self._agent.run(input_data)
# Guard against being called from inside a running event loop, where
# run_until_complete would deadlock. try/except/else is needed so the
# guard's own RuntimeError is not swallowed by the "no running loop"
# handler.
try:
    asyncio.get_running_loop()
except RuntimeError:
    pass  # No running loop; safe to drive the coroutine ourselves
else:
    # This shouldn't happen in normal test execution
    raise RuntimeError("Cannot run sync wrapper from async context")
# Get or create a reusable event loop; deliberately left open so
# subsequent calls on this wrapper can reuse it
if self._loop is None or self._loop.is_closed():
    self._loop = asyncio.new_event_loop()
    asyncio.set_event_loop(self._loop)
return self._loop.run_until_complete(coro).output
def __getattr__(self, name: str) -> Any:
"""Forward other attribute access to wrapped agent."""
return getattr(self._agent, name)
class TestExecutor:
"""
Execute a single test against an agent.
Handles:
- Test code compilation and execution
- Timing measurement
- Error capture and categorization
- Result creation
"""
def __init__(
self,
categorizer: ErrorCategorizer | None = None,
timeout: float = 60.0,
):
"""
Initialize executor.
Args:
categorizer: ErrorCategorizer for classifying failures
timeout: Maximum test execution time in seconds
"""
self.categorizer = categorizer or ErrorCategorizer()
self.timeout = timeout
def execute(
self,
test: Test,
agent: AgentProtocol,
capture_logs: bool = True,
) -> TestResult:
"""
Execute a test against an agent.
Args:
test: Test to execute
agent: Agent instance to test
capture_logs: Whether to capture runtime logs
Returns:
TestResult with execution details
"""
start_time = time.perf_counter()
try:
# Build test environment
test_globals = self._build_test_globals(agent, test)
# Compile test code
try:
compiled = compile(test.test_code, f"<test:{test.test_name}>", "exec")
except SyntaxError as e:
return self._create_error_result(
test=test,
start_time=start_time,
error_message=f"Test code syntax error: {e}",
stack_trace=traceback.format_exc(),
)
# Execute test
try:
exec(compiled, test_globals)
# Look for test function and call it
test_func = test_globals.get(test.test_name)
if test_func is None:
# Try to find any function starting with test_
for name, obj in test_globals.items():
if name.startswith("test_") and callable(obj):
test_func = obj
break
if test_func is None:
return self._create_error_result(
test=test,
start_time=start_time,
error_message=f"Test function '{test.test_name}' not found in test code",
)
# Call the test function with appropriate arguments
# Inspect the function signature to determine what to pass
sig = inspect.signature(test_func)
params = list(sig.parameters.keys())
# Build arguments based on what the function expects
call_args = []
for param in params:
if param == "agent":
call_args.append(test_globals["agent"])
elif param == "llm_judge":
call_args.append(test_globals["llm_judge"])
elif param in test_globals:
call_args.append(test_globals[param])
else:
# Unknown parameter - this will likely cause an error
# but we let it happen naturally
break
test_func(*call_args)
# Test passed
duration_ms = int((time.perf_counter() - start_time) * 1000)
return TestResult(
test_id=test.id,
passed=True,
duration_ms=duration_ms,
expected_output=test.expected_output,
actual_output={"status": "passed"},
)
except AssertionError as e:
return self._create_failure_result(
test=test,
start_time=start_time,
error_message=str(e) or "Assertion failed",
stack_trace=traceback.format_exc(),
)
except Exception as e:
return self._create_failure_result(
test=test,
start_time=start_time,
error_message=f"{type(e).__name__}: {e}",
stack_trace=traceback.format_exc(),
)
except Exception as e:
return self._create_error_result(
test=test,
start_time=start_time,
error_message=f"Test execution error: {e}",
stack_trace=traceback.format_exc(),
)
def _build_test_globals(
self,
agent: AgentProtocol,
test: Test,
) -> dict[str, Any]:
"""Build the globals dict for test execution."""
# Wrap async agents in a sync wrapper so test code can call agent.run()
# without async/await syntax
wrapped_agent = self._wrap_agent_if_async(agent)
return {
"__builtins__": __builtins__,
"agent": wrapped_agent,
"llm_judge": LLMJudge(), # For semantic evaluation tests
"test_input": test.input,
"expected_output": test.expected_output,
# Common test utilities
"assert": assert_, # Built-in
"isinstance": isinstance,
"len": len,
"str": str,
"int": int,
"float": float,
"list": list,
"dict": dict,
"set": set,
"tuple": tuple,
"any": any,
"all": all,
"print": print, # For debugging
}
def _wrap_agent_if_async(self, agent: AgentProtocol) -> Any:
"""
Wrap agent if its run() method is async.
Args:
agent: Agent to potentially wrap
Returns:
SyncAgentWrapper if agent.run() is async, otherwise the original agent
"""
run_method = getattr(agent, "run", None)
if run_method is None:
return agent
# Check if run() is a coroutine function
if inspect.iscoroutinefunction(run_method):
return SyncAgentWrapper(agent)
return agent
def _create_failure_result(
self,
test: Test,
start_time: float,
error_message: str,
stack_trace: str | None = None,
) -> TestResult:
"""Create a result for a test that failed assertions."""
duration_ms = int((time.perf_counter() - start_time) * 1000)
result = TestResult(
test_id=test.id,
passed=False,
duration_ms=duration_ms,
expected_output=test.expected_output,
error_message=error_message,
stack_trace=stack_trace,
)
# Categorize the error
result.error_category = self.categorizer.categorize(result)
return result
def _create_error_result(
self,
test: Test,
start_time: float,
error_message: str,
stack_trace: str | None = None,
) -> TestResult:
"""Create a result for a test that couldn't run."""
duration_ms = int((time.perf_counter() - start_time) * 1000)
result = TestResult(
test_id=test.id,
passed=False,
duration_ms=duration_ms,
error_message=error_message,
stack_trace=stack_trace,
)
# Implementation error for test setup failures
result.error_category = ErrorCategory.IMPLEMENTATION_ERROR
return result
def assert_(condition: bool, message: str = "") -> None:
"""Assert helper with message."""
if not condition:
raise AssertionError(message)
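# Illustrative sketch: executing a hand-written test against a stub agent.
# The stub agent, IDs, and test code are hypothetical; the Test fields mirror
# exactly those the generators pass, and TestType/ApprovalStatus are imported
# here because this module itself only imports Test.
if __name__ == "__main__":
    from framework.testing.test_case import TestType, ApprovalStatus

    class EchoAgent:
        def run(self, input: dict[str, Any]) -> dict[str, Any]:
            return {"echo": input}

    test = Test(
        id="test_demo0001",
        goal_id="goal_123",
        parent_criteria_id="c1",
        test_type=TestType.CONSTRAINT,
        test_name="test_echo_roundtrip",
        test_code=(
            "def test_echo_roundtrip(agent):\n"
            "    out = agent.run({'q': 'hello'})\n"
            "    assert_(out == {'echo': {'q': 'hello'}}, 'echo mismatch')\n"
        ),
        description="Stub agent echoes its input",
        input={"q": "hello"},
        expected_output={"echo": {"q": "hello"}},
        generated_by="manual",
        llm_confidence=1.0,
        approval_status=ApprovalStatus.PENDING,
    )
    result = TestExecutor().execute(test, EchoAgent())
    print(result.passed, f"{result.duration_ms}ms")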
+344
View File
@@ -0,0 +1,344 @@
"""
Parallel test runner inspired by pytest-xdist.
Features:
- Per-test parallelism: Each test runs independently with load balancing
- Worker initialization: Agent created once per worker thread (not per test)
- Thread-based parallelism: Uses ThreadPoolExecutor for I/O-bound LLM calls
- Fail-fast option: Stop on first failure
"""
import threading
# TimeoutError alias: distinct from the builtin before Python 3.11
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FuturesTimeoutError
from dataclasses import dataclass, field
from multiprocessing import cpu_count
from typing import Any, Callable, Protocol, runtime_checkable
from framework.testing.test_case import Test
from framework.testing.test_result import TestResult, TestSuiteResult
from framework.testing.test_storage import TestStorage
from framework.testing.executor import TestExecutor, AgentProtocol
from framework.testing.categorizer import ErrorCategorizer
# Thread-local storage for worker agents
# Each worker thread gets its own agent instance to avoid race conditions
_thread_local = threading.local()
def _init_worker(agent_factory: Any) -> None:
"""
Initialize worker thread with its own agent instance.
Called once per worker thread when the ThreadPoolExecutor starts.
The agent is stored in thread-local storage and reused for all tests
executed by this worker.
"""
if hasattr(agent_factory, "create"):
_thread_local.agent = agent_factory.create()
else:
_thread_local.agent = agent_factory()
def _run_single_test(test: Test, timeout: float) -> TestResult:
"""
Run a single test using the worker's pre-initialized agent.
Args:
test: Test to execute
timeout: Timeout per test in seconds
Returns:
TestResult with execution details
"""
executor = TestExecutor(
categorizer=ErrorCategorizer(),
timeout=timeout,
)
return executor.execute(test, _thread_local.agent)
@dataclass
class ParallelConfig:
"""Configuration for parallel test execution."""
num_workers: int = field(default_factory=cpu_count)
timeout_per_test: float = 60.0 # seconds
fail_fast: bool = False
mock_external_apis: bool = True
@runtime_checkable
class AgentFactoryProtocol(Protocol):
"""Protocol for creating agent instances."""
def create(self) -> AgentProtocol:
"""Create a new agent instance."""
...
class AgentFactory:
"""Picklable factory that creates AgentRunner instances from a path.
This class is used instead of a lambda for parallel test execution,
since lambdas capturing local variables cannot be pickled by ProcessPoolExecutor.
"""
def __init__(self, agent_path: str):
self.agent_path = agent_path
def create(self):
from framework.runner import AgentRunner
return AgentRunner.load(self.agent_path)
class ParallelTestRunner:
"""
Parallel test execution using ThreadPoolExecutor.
Key features:
- Per-test distribution: Tests distributed individually for load balancing
- Worker initialization: Each worker thread creates one agent at startup
- Thread-based parallelism: Uses threads (not processes) for I/O-bound LLM calls
- Thread-local storage: Each worker has isolated agent state via threading.local()
"""
def __init__(
self,
config: ParallelConfig | None = None,
storage: TestStorage | None = None,
):
"""
Initialize parallel runner.
Args:
config: Parallel execution configuration
storage: TestStorage for saving results
"""
self.config = config or ParallelConfig()
self.storage = storage
self.categorizer = ErrorCategorizer()
def run_all(
self,
goal_id: str,
agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol],
tests: list[Test] | None = None,
on_result: Callable[[TestResult], None] | None = None,
) -> TestSuiteResult:
"""
Run all approved tests for a goal.
Args:
goal_id: Goal ID to run tests for
agent_factory: Factory for creating agent instances
tests: Optional list of tests (loads from storage if not provided)
on_result: Optional callback for each test result
Returns:
TestSuiteResult with summary and individual results
"""
# Load tests if not provided
if tests is None:
if self.storage is None:
raise ValueError("Either tests or storage must be provided")
tests = self.storage.get_approved_tests(goal_id)
if not tests:
return TestSuiteResult(
goal_id=goal_id,
total=0,
passed=0,
failed=0,
)
# Execute tests
results: list[TestResult] = []
if self.config.num_workers <= 1:
# Sequential execution - create single agent and run all tests
results = self._run_sequential(tests, agent_factory, on_result)
else:
# Parallel execution with per-test distribution
results = self._run_parallel(tests, agent_factory, on_result)
# Save results if storage available
if self.storage:
# Create test_id -> test mapping for lookup
test_map = {t.id: t for t in tests}
for result in results:
# Update the Test object with execution result
if result.test_id in test_map:
test = test_map[result.test_id]
test.record_result(result.passed)
self.storage.update_test(test)
# Save the TestResult
self.storage.save_result(result.test_id, result)
# Create suite result
return self._create_suite_result(goal_id, results)
def run_tests(
self,
tests: list[Test],
agent: AgentProtocol,
on_result: Callable[[TestResult], None] | None = None,
) -> list[TestResult]:
"""
Run a list of tests against an agent instance.
Args:
tests: Tests to run
agent: Agent instance to test
on_result: Optional callback for each result
Returns:
List of TestResult
"""
executor = TestExecutor(
categorizer=self.categorizer,
timeout=self.config.timeout_per_test,
)
results = []
for test in tests:
result = executor.execute(test, agent)
results.append(result)
if on_result:
on_result(result)
# Fail-fast check
if self.config.fail_fast and not result.passed:
break
return results
def _run_sequential(
self,
tests: list[Test],
agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol],
on_result: Callable[[TestResult], None] | None = None,
) -> list[TestResult]:
"""Run tests sequentially with a single agent instance."""
results = []
executor = TestExecutor(
categorizer=self.categorizer,
timeout=self.config.timeout_per_test,
)
# Create single agent for all tests
if isinstance(agent_factory, AgentFactoryProtocol):
agent = agent_factory.create()
else:
agent = agent_factory()
# Run all tests
for test in tests:
result = executor.execute(test, agent)
results.append(result)
if on_result:
on_result(result)
# Fail-fast
if self.config.fail_fast and not result.passed:
return results
return results
def _run_parallel(
self,
tests: list[Test],
agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol],
on_result: Callable[[TestResult], None] | None = None,
) -> list[TestResult]:
"""
Run tests in parallel using ThreadPoolExecutor with worker initialization.
Each worker thread creates ONE agent instance at startup and reuses it
for all tests assigned to that worker. Tests are distributed individually
for true load-balanced parallelism.
Uses threads instead of processes because LLM API calls are I/O-bound,
and threads have lower overhead (no pickling, shared memory).
"""
results = []
failed = False
with ThreadPoolExecutor(
max_workers=self.config.num_workers,
initializer=_init_worker,
initargs=(agent_factory,),
) as executor:
# Submit each test individually for true parallelism
futures = {
executor.submit(_run_single_test, test, self.config.timeout_per_test): test
for test in tests
}
for future in as_completed(futures):
test = futures[future]
try:
result = future.result(timeout=self.config.timeout_per_test + 30)
results.append(result)
if on_result:
on_result(result)
if not result.passed:
failed = True
except FuturesTimeoutError:
result = TestResult(
test_id=test.id,
passed=False,
duration_ms=int(self.config.timeout_per_test * 1000),
error_message="Test timed out",
)
results.append(result)
if on_result:
on_result(result)
failed = True
except Exception as e:
result = TestResult(
test_id=test.id,
passed=False,
duration_ms=0,
error_message=f"Execution error: {e}",
)
results.append(result)
if on_result:
on_result(result)
failed = True
# Fail-fast
if self.config.fail_fast and failed:
executor.shutdown(wait=False, cancel_futures=True)
break
return results
def _create_suite_result(
self,
goal_id: str,
results: list[TestResult],
) -> TestSuiteResult:
"""Create TestSuiteResult from individual results."""
passed = sum(1 for r in results if r.passed)
failed = len(results) - passed
total_duration = sum(r.duration_ms for r in results)
return TestSuiteResult(
goal_id=goal_id,
total=len(results),
passed=passed,
failed=failed,
results=results,
duration_ms=total_duration,
)
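# Illustrative sketch: running an approved suite with four worker threads and
# a per-result progress callback. The agent export path, storage path, and
# goal id are hypothetical.
if __name__ == "__main__":
    from pathlib import Path
    runner = ParallelTestRunner(
        config=ParallelConfig(num_workers=4, fail_fast=True),
        storage=TestStorage(Path("data/tests/goal_123")),
    )
    suite = runner.run_all(
        goal_id="goal_123",
        agent_factory=AgentFactory("exports/my_agent"),
        on_result=lambda r: print(("PASS" if r.passed else "FAIL"), r.test_id),
    )
    print(f"{suite.passed}/{suite.total} passed in {suite.duration_ms}ms")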
+112
View File
@@ -0,0 +1,112 @@
"""
LLM prompt templates for test generation.
These prompts instruct the LLM to generate pytest-compatible tests
from Goal success_criteria and constraints using tool calling.
"""
CONSTRAINT_TEST_PROMPT = """You are generating test cases for an AI agent's constraints.
## Goal
Name: {goal_name}
Description: {goal_description}
## Constraints to Test
{constraints_formatted}
## Instructions
For each constraint, generate pytest-compatible tests that verify the constraint is satisfied.
For EACH test, call the `submit_test` tool with:
- constraint_id: The ID of the constraint being tested
- test_name: A descriptive pytest function name (test_constraint_<constraint_id>_<scenario>)
- test_code: Complete Python test function code
- description: What the test validates
- input: Test input data as an object
- expected_output: Expected output as an object
- confidence: 0-1 score based on how testable/well-defined the constraint is
Consider for each constraint:
- Happy path: Normal execution that should satisfy the constraint
- Boundary conditions: Inputs at the edge of constraint boundaries
- Violation scenarios: Inputs that should trigger constraint violation
The test code should:
- Be valid Python using pytest conventions
- Use `agent.run(input)` to execute the agent
- Include descriptive assertion messages
- Handle potential exceptions appropriately
Generate tests now by calling submit_test for each test."""
SUCCESS_CRITERIA_TEST_PROMPT = """You are generating success criteria tests for an AI agent.
## Goal
Name: {goal_name}
Description: {goal_description}
## Success Criteria
{success_criteria_formatted}
## Agent Flow (for context)
Nodes: {node_names}
Tools: {tool_names}
## Instructions
For each success criterion, generate tests that verify the agent achieves its goals.
For EACH test, call the `submit_test` tool with:
- criteria_id: The ID of the success criterion being tested
- test_name: A descriptive pytest function name (test_<criteria_id>_<scenario>)
- test_code: Complete Python test function code
- description: What the test validates
- input: Test input data as an object
- expected_output: Expected output as an object
- confidence: 0-1 score based on how measurable/specific the criterion is
Consider for each criterion:
- Happy path: Normal successful execution
- Boundary conditions: Exactly at target thresholds (if applicable)
- Graceful handling: Near-misses and edge cases
The test code should:
- Be valid Python using pytest conventions
- Use `agent.run(input)` to execute the agent
- Validate the metric defined in the success criterion
- Include descriptive assertion messages
Generate tests now by calling submit_test for each test."""
EDGE_CASE_TEST_PROMPT = """You are generating edge case tests for an AI agent.
## Goal
Name: {goal_name}
Description: {goal_description}
## Existing Tests
{existing_tests_summary}
## Recent Failures (if any)
{failures_summary}
## Instructions
Generate additional edge case tests that cover scenarios not addressed by existing tests.
Focus on:
1. Unusual input formats or values
2. Empty or null inputs
3. Extremely large or small values
4. Unicode and special characters
5. Concurrent or timing-related scenarios
6. Network/API failure simulations (if applicable)
For EACH test, call the `submit_test` tool with:
- criteria_id: An identifier for the edge case category being tested
- test_name: A descriptive pytest function name (test_edge_case_<scenario>)
- test_code: Complete Python test function code
- description: What the test validates
- input: Test input data as an object
- expected_output: Expected output as an object
- confidence: 0-1 score
Generate edge case tests now by calling submit_test for each test."""
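# Illustrative sketch: rendering the constraint prompt the way the generators
# do. The goal fields and constraint block below are hypothetical placeholders.
if __name__ == "__main__":
    rendered = CONSTRAINT_TEST_PROMPT.format(
        goal_name="Summarize support tickets",
        goal_description="Produce faithful one-paragraph ticket summaries.",
        constraints_formatted=(
            "### Constraint: no-hallucination\n"
            "- Type: HARD (hard)\n"
            "- Category: quality\n"
            "- Description: Summaries must not invent facts\n"
            "- Check: llm_judge"
        ),
    )
    print(rendered.splitlines()[0])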
+219
View File
@@ -0,0 +1,219 @@
"""
Success criteria test generator.
Generates tests for Goal success_criteria using LLM.
Tests are returned with PENDING approval status.
"""
import uuid
from typing import TYPE_CHECKING
from framework.graph.goal import Goal, SuccessCriterion
from framework.testing.test_case import Test, TestType, ApprovalStatus
from framework.testing.prompts import SUCCESS_CRITERIA_TEST_PROMPT
from framework.llm.provider import Tool, ToolUse, ToolResult
if TYPE_CHECKING:
from framework.llm.provider import LLMProvider
# Tool for collecting generated tests - Claude handles JSON escaping automatically
SUBMIT_TEST_TOOL = Tool(
name="submit_test",
description="Submit a generated success criteria test. Call once per test.",
parameters={
"properties": {
"criteria_id": {
"type": "string",
"description": "ID of the success criterion being tested",
},
"test_name": {
"type": "string",
"description": "pytest function name, e.g., test_find_videos_happy_path",
},
"test_code": {
"type": "string",
"description": "Complete Python test function code",
},
"description": {
"type": "string",
"description": "What the test validates",
},
"input": {
"type": "object",
"description": "Test input data",
},
"expected_output": {
"type": "object",
"description": "Expected output",
},
"confidence": {
"type": "number",
"description": "Confidence score 0-1",
},
},
"required": ["criteria_id", "test_name", "test_code", "description", "confidence"],
},
)
class SuccessCriteriaTestGenerator:
"""
Generate success criteria tests from Goal success_criteria.
Generated tests require user approval before being added to the test suite.
Unlike constraint tests, success criteria tests are generated during the
Eval stage (after the agent exists) and may reference agent nodes/tools.
"""
def __init__(self, llm: "LLMProvider"):
"""
Initialize generator with LLM provider.
Args:
llm: LLM provider for test generation (e.g., AnthropicProvider)
"""
self.llm = llm
def generate(
self,
goal: Goal,
node_names: list[str] | None = None,
tool_names: list[str] | None = None,
) -> list[Test]:
"""
Generate tests for all success criteria in a goal.
Args:
goal: Goal with success_criteria to test
node_names: Names of agent nodes (for context)
tool_names: Names of tools available to agent (for context)
Returns:
List of Test objects with approval_status=PENDING.
These MUST be approved before being added to the test suite.
"""
if not goal.success_criteria:
return []
# Format prompt
prompt = SUCCESS_CRITERIA_TEST_PROMPT.format(
goal_name=goal.name,
goal_description=goal.description,
success_criteria_formatted=self._format_criteria(goal.success_criteria),
node_names=", ".join(node_names or ["(not specified)"]),
tool_names=", ".join(tool_names or ["(not specified)"]),
)
# Collect tests via tool calls - Claude handles JSON escaping automatically
collected_tests: list[dict] = []
def tool_executor(tool_use: ToolUse) -> ToolResult:
if tool_use.name == "submit_test":
collected_tests.append(tool_use.input)
return ToolResult(
tool_use_id=tool_use.id, content="Test recorded successfully"
)
return ToolResult(
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
)
self.llm.complete_with_tools(
messages=[{"role": "user", "content": prompt}],
system="You are a test generation expert. For each success criterion, call the submit_test tool with the test details.",
tools=[SUBMIT_TEST_TOOL],
tool_executor=tool_executor,
max_iterations=20,
)
return self._create_tests_from_collected(collected_tests, goal.id)
def generate_for_criterion(
self,
goal: Goal,
criterion: SuccessCriterion,
node_names: list[str] | None = None,
tool_names: list[str] | None = None,
) -> list[Test]:
"""
Generate tests for a single success criterion.
Args:
goal: Goal containing the criterion
criterion: Specific criterion to test
node_names: Names of agent nodes
tool_names: Names of tools available
Returns:
List of Test objects for the criterion
"""
prompt = SUCCESS_CRITERIA_TEST_PROMPT.format(
goal_name=goal.name,
goal_description=goal.description,
success_criteria_formatted=self._format_criterion(criterion),
node_names=", ".join(node_names or ["(not specified)"]),
tool_names=", ".join(tool_names or ["(not specified)"]),
)
# Collect tests via tool calls
collected_tests: list[dict] = []
def tool_executor(tool_use: ToolUse) -> ToolResult:
if tool_use.name == "submit_test":
collected_tests.append(tool_use.input)
return ToolResult(
tool_use_id=tool_use.id, content="Test recorded successfully"
)
return ToolResult(
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
)
self.llm.complete_with_tools(
messages=[{"role": "user", "content": prompt}],
system="You are a test generation expert. Call the submit_test tool with the test details.",
tools=[SUBMIT_TEST_TOOL],
tool_executor=tool_executor,
max_iterations=10,
)
return self._create_tests_from_collected(collected_tests, goal.id)
def _format_criteria(self, criteria: list[SuccessCriterion]) -> str:
"""Format success criteria for prompt."""
lines = []
for c in criteria:
lines.append(self._format_criterion(c))
lines.append("")
return "\n".join(lines)
def _format_criterion(self, criterion: SuccessCriterion) -> str:
"""Format a single criterion for prompt."""
return f"""### Success Criterion: {criterion.id}
- Description: {criterion.description}
- Metric: {criterion.metric}
- Target: {criterion.target}
- Weight: {criterion.weight}
- Currently met: {criterion.met}"""
def _create_tests_from_collected(
self, collected: list[dict], goal_id: str
) -> list[Test]:
"""Create Test objects from tool call data."""
tests = []
for td in collected:
test = Test(
id=f"test_{uuid.uuid4().hex[:8]}",
goal_id=goal_id,
parent_criteria_id=td.get("criteria_id", "unknown"),
test_type=TestType.SUCCESS_CRITERIA,
test_name=td.get("test_name", "unnamed_test"),
test_code=td.get("test_code", ""),
description=td.get("description", ""),
input=td.get("input", {}),
expected_output=td.get("expected_output", {}),
generated_by="llm",
llm_confidence=float(td.get("confidence", 0.5)),
approval_status=ApprovalStatus.PENDING,
)
tests.append(test)
return tests
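A hedged usage sketch for the generator above. The Goal and SuccessCriterion constructors are inferred from the fields this file reads, and the AnthropicProvider construction is an assumption, not confirmed by this diff.
from framework.llm.provider import AnthropicProvider  # named in the docstring above
from framework.graph.goal import Goal, SuccessCriterion
from framework.testing.test_case import ApprovalStatus
goal = Goal(  # field names inferred from usage in this file; other required fields may exist
    id="goal_demo",
    name="find_videos",
    description="Locate relevant videos for a topic",
    success_criteria=[
        SuccessCriterion(
            id="sc_1",
            description="Returns at least 5 relevant videos",
            metric="video_count",
            target=">= 5",
            weight=1.0,
            met=False,
        )
    ],
)
generator = SuccessCriteriaTestGenerator(AnthropicProvider())  # constructor args assumed
tests = generator.generate(goal, node_names=["search", "rank"], tool_names=["youtube_api"])
# Every generated test starts PENDING and must be approved before use.
assert all(t.approval_status == ApprovalStatus.PENDING for t in tests)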
+150
View File
@@ -0,0 +1,150 @@
"""
Test case schema with approval tracking.
Tests are generated by LLM from Goal success_criteria and constraints,
but require mandatory user approval before being stored.
"""
from datetime import datetime
from enum import Enum
from typing import Any
from pydantic import BaseModel, Field
class ApprovalStatus(str, Enum):
"""Status of user approval for a generated test."""
PENDING = "pending" # Awaiting user review
APPROVED = "approved" # User accepted as-is
MODIFIED = "modified" # User edited before accepting
REJECTED = "rejected" # User declined (with reason)
class TestType(str, Enum):
"""Type of test based on what it validates."""
CONSTRAINT = "constraint" # Validates constraint boundaries
SUCCESS_CRITERIA = "outcome" # Validates success criteria achievement
EDGE_CASE = "edge_case" # Validates edge case handling
class Test(BaseModel):
"""
A test case generated from Goal success_criteria or constraints.
Tests are either:
- Generated by LLM during Goal stage (constraints) or Eval stage (success criteria)
- Created manually by human engineers
All tests require approval before being added to the test suite.
"""
id: str
goal_id: str
parent_criteria_id: str = Field(
description="Links to success_criteria.id or constraint.id"
)
test_type: TestType
# Test definition
test_name: str = Field(
description="Descriptive function name, e.g., test_constraint_api_limits_respected"
)
test_code: str = Field(
description="Python test function code (pytest compatible)"
)
description: str = Field(
description="Human-readable description of what the test validates"
)
input: dict[str, Any] = Field(
default_factory=dict,
description="Test input data"
)
expected_output: dict[str, Any] = Field(
default_factory=dict,
description="Expected output or assertions"
)
# LLM generation metadata
generated_by: str = Field(
default="llm",
description="Who created the test: 'llm' or 'human'"
)
llm_confidence: float = Field(
default=0.0,
ge=0.0,
le=1.0,
description="LLM's confidence in the test quality (0-1)"
)
# Approval tracking (CRITICAL - tests are never used without approval)
approval_status: ApprovalStatus = ApprovalStatus.PENDING
approved_by: str | None = None
approved_at: datetime | None = None
rejection_reason: str | None = Field(
default=None,
description="Reason for rejection if status is REJECTED"
)
original_code: str | None = Field(
default=None,
description="Original LLM-generated code if user modified it"
)
# Execution tracking
last_run: datetime | None = None
last_result: str | None = Field(
default=None,
description="Result of last run: 'passed', 'failed', 'error'"
)
run_count: int = 0
pass_count: int = 0
fail_count: int = 0
# Timestamps
created_at: datetime = Field(default_factory=datetime.now)
updated_at: datetime = Field(default_factory=datetime.now)
model_config = {"extra": "allow"}
def approve(self, approved_by: str = "user") -> None:
"""Mark test as approved."""
self.approval_status = ApprovalStatus.APPROVED
self.approved_by = approved_by
self.approved_at = datetime.now()
self.updated_at = datetime.now()
def modify(self, new_code: str, approved_by: str = "user") -> None:
"""Approve test with modifications."""
self.original_code = self.test_code
self.test_code = new_code
self.approval_status = ApprovalStatus.MODIFIED
self.approved_by = approved_by
self.approved_at = datetime.now()
self.updated_at = datetime.now()
def reject(self, reason: str) -> None:
"""Reject the test with a reason."""
self.approval_status = ApprovalStatus.REJECTED
self.rejection_reason = reason
self.updated_at = datetime.now()
def record_result(self, passed: bool) -> None:
"""Record a test run result."""
self.last_run = datetime.now()
self.last_result = "passed" if passed else "failed"
self.run_count += 1
if passed:
self.pass_count += 1
else:
self.fail_count += 1
self.updated_at = datetime.now()
@property
def is_approved(self) -> bool:
"""Check if test has been approved (approved or modified)."""
return self.approval_status in (ApprovalStatus.APPROVED, ApprovalStatus.MODIFIED)
@property
def pass_rate(self) -> float | None:
"""Calculate pass rate if test has been run."""
if self.run_count == 0:
return None
return self.pass_count / self.run_count
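A short lifecycle sketch using only the fields and methods defined in this file:
test = Test(
    id="test_demo",
    goal_id="goal_demo",
    parent_criteria_id="criterion_1",
    test_type=TestType.SUCCESS_CRITERIA,
    test_name="test_demo_happy_path",
    test_code="def test_demo_happy_path(agent): assert agent.run({})",
    description="Demonstrates the approval lifecycle",
)
assert not test.is_approved                 # new tests start PENDING
test.modify("def test_demo_happy_path(agent): assert True", "reviewer")
assert test.is_approved                     # MODIFIED counts as approved
assert test.original_code is not None       # the original LLM code is preserved
test.record_result(passed=True)
test.record_result(passed=False)
assert test.pass_rate == 0.5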
+153
View File
@@ -0,0 +1,153 @@
"""
Test result schemas for tracking test execution outcomes.
Results include detailed error information for debugging and
categorization for guiding iteration strategy.
"""
from datetime import datetime
from enum import Enum
from typing import Any
from pydantic import BaseModel, Field
class ErrorCategory(str, Enum):
"""
Category of test failure for guiding iteration.
Each category has different implications for how to fix:
- LOGIC_ERROR: Goal definition is wrong -> update success_criteria/constraints
- IMPLEMENTATION_ERROR: Code bug -> fix nodes/edges in Agent stage
- EDGE_CASE: New scenario discovered -> add new test only
"""
LOGIC_ERROR = "logic_error"
IMPLEMENTATION_ERROR = "implementation_error"
EDGE_CASE = "edge_case"
class TestResult(BaseModel):
"""
Result of a single test execution.
Captures:
- Pass/fail status with timing
- Actual vs expected output
- Error details for debugging
- Runtime logs and execution path
"""
test_id: str
passed: bool
duration_ms: int = Field(
ge=0,
description="Test execution time in milliseconds"
)
# Output comparison
actual_output: Any = None
expected_output: Any = None
# Error details (populated on failure)
error_message: str | None = None
error_category: ErrorCategory | None = None
stack_trace: str | None = None
# Runtime data for debugging
runtime_logs: list[dict[str, Any]] = Field(
default_factory=list,
description="Log entries from test execution"
)
node_outputs: dict[str, Any] = Field(
default_factory=dict,
description="Output from each node executed during test"
)
execution_path: list[str] = Field(
default_factory=list,
description="Sequence of nodes executed"
)
# Associated run ID (links to Runtime data)
run_id: str | None = Field(
default=None,
description="Runtime run ID for detailed analysis"
)
timestamp: datetime = Field(default_factory=datetime.now)
model_config = {"extra": "allow"}
def summary_dict(self) -> dict[str, Any]:
"""Return a summary dict for quick overview."""
return {
"test_id": self.test_id,
"passed": self.passed,
"duration_ms": self.duration_ms,
"error_category": self.error_category.value if self.error_category else None,
"error_message": self.error_message[:100] if self.error_message else None,
}
class TestSuiteResult(BaseModel):
"""
Aggregate result from running a test suite.
Provides summary statistics and individual results.
"""
goal_id: str
total: int
passed: int
failed: int
errors: int = 0 # Tests that couldn't run (e.g., exceptions in setup)
skipped: int = 0
results: list[TestResult] = Field(default_factory=list)
duration_ms: int = Field(
default=0,
description="Total execution time in milliseconds"
)
timestamp: datetime = Field(default_factory=datetime.now)
model_config = {"extra": "allow"}
@property
def all_passed(self) -> bool:
"""Check if all tests passed."""
return self.failed == 0 and self.errors == 0
@property
def pass_rate(self) -> float:
"""Calculate pass rate."""
if self.total == 0:
return 0.0
return self.passed / self.total
def summary_dict(self) -> dict[str, Any]:
"""Return summary for reporting."""
return {
"goal_id": self.goal_id,
"overall_passed": self.all_passed,
"summary": {
"total": self.total,
"passed": self.passed,
"failed": self.failed,
"errors": self.errors,
"skipped": self.skipped,
},
"pass_rate": f"{self.pass_rate:.1%}",
"duration_ms": self.duration_ms,
}
def get_failed_results(self) -> list[TestResult]:
"""Get all failed test results for debugging."""
return [r for r in self.results if not r.passed]
def get_results_by_category(
self, category: ErrorCategory
) -> list[TestResult]:
"""Get failed results by error category."""
return [
r for r in self.results
if not r.passed and r.error_category == category
]
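A small sketch of the result schemas in use; the failure text is illustrative:
results = [
    TestResult(test_id="t1", passed=True, duration_ms=120),
    TestResult(
        test_id="t2",
        passed=False,
        duration_ms=80,
        error_message="KeyError: 'videos'",             # illustrative failure
        error_category=ErrorCategory.IMPLEMENTATION_ERROR,
    ),
]
suite = TestSuiteResult(
    goal_id="goal_demo", total=2, passed=1, failed=1,
    results=results, duration_ms=200,
)
assert not suite.all_passed
assert suite.pass_rate == 0.5
assert len(suite.get_results_by_category(ErrorCategory.IMPLEMENTATION_ERROR)) == 1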
+260
View File
@@ -0,0 +1,260 @@
"""
File-based storage backend for test data.
Follows the same pattern as framework/storage/backend.py (FileStorage),
storing tests as JSON files with indexes for efficient querying.
"""
import json
from pathlib import Path
from datetime import datetime
from framework.testing.test_case import Test, ApprovalStatus, TestType
from framework.testing.test_result import TestResult
class TestStorage:
"""
File-based storage for tests and results.
Directory structure:
{base_path}/
tests/
{goal_id}/
{test_id}.json # Full test data
indexes/
by_goal/{goal_id}.json # List of test IDs for this goal
by_approval/{status}.json # Tests by approval status
by_type/{test_type}.json # Tests by type
by_criteria/{criteria_id}.json # Tests by parent criteria
results/
{test_id}/
{timestamp}.json # Test run results
latest.json # Most recent result
suites/
{goal_id}_suite.json # Test suite metadata
"""
def __init__(self, base_path: str | Path):
self.base_path = Path(base_path)
self._ensure_dirs()
def _ensure_dirs(self) -> None:
"""Create directory structure if it doesn't exist."""
dirs = [
self.base_path / "tests",
self.base_path / "indexes" / "by_goal",
self.base_path / "indexes" / "by_approval",
self.base_path / "indexes" / "by_type",
self.base_path / "indexes" / "by_criteria",
self.base_path / "results",
self.base_path / "suites",
]
for d in dirs:
d.mkdir(parents=True, exist_ok=True)
# === TEST OPERATIONS ===
def save_test(self, test: Test) -> None:
"""Save a test to storage."""
# Ensure goal directory exists
goal_dir = self.base_path / "tests" / test.goal_id
goal_dir.mkdir(parents=True, exist_ok=True)
# Save full test
test_path = goal_dir / f"{test.id}.json"
with open(test_path, "w") as f:
f.write(test.model_dump_json(indent=2))
# Update indexes
self._add_to_index("by_goal", test.goal_id, test.id)
self._add_to_index("by_approval", test.approval_status.value, test.id)
self._add_to_index("by_type", test.test_type.value, test.id)
self._add_to_index("by_criteria", test.parent_criteria_id, test.id)
def load_test(self, goal_id: str, test_id: str) -> Test | None:
"""Load a test from storage."""
test_path = self.base_path / "tests" / goal_id / f"{test_id}.json"
if not test_path.exists():
return None
with open(test_path) as f:
return Test.model_validate_json(f.read())
def delete_test(self, goal_id: str, test_id: str) -> bool:
"""Delete a test from storage."""
test_path = self.base_path / "tests" / goal_id / f"{test_id}.json"
if not test_path.exists():
return False
# Load test to get index keys
test = self.load_test(goal_id, test_id)
if test:
self._remove_from_index("by_goal", test.goal_id, test_id)
self._remove_from_index("by_approval", test.approval_status.value, test_id)
self._remove_from_index("by_type", test.test_type.value, test_id)
self._remove_from_index("by_criteria", test.parent_criteria_id, test_id)
test_path.unlink()
# Also delete results
results_dir = self.base_path / "results" / test_id
if results_dir.exists():
for f in results_dir.iterdir():
f.unlink()
results_dir.rmdir()
return True
def update_test(self, test: Test) -> None:
"""
Update an existing test.
Handles index updates if approval_status changed.
"""
# Load old test to check for index changes
old_test = self.load_test(test.goal_id, test.id)
if old_test and old_test.approval_status != test.approval_status:
self._remove_from_index("by_approval", old_test.approval_status.value, test.id)
self._add_to_index("by_approval", test.approval_status.value, test.id)
# Update timestamp
test.updated_at = datetime.now()
# Save
self.save_test(test)
# === QUERY OPERATIONS ===
def get_tests_by_goal(self, goal_id: str) -> list[Test]:
"""Get all tests for a goal."""
test_ids = self._get_index("by_goal", goal_id)
tests = []
for test_id in test_ids:
test = self.load_test(goal_id, test_id)
if test:
tests.append(test)
return tests
def get_tests_by_approval_status(self, status: ApprovalStatus) -> list[str]:
"""Get test IDs by approval status."""
return self._get_index("by_approval", status.value)
def get_tests_by_type(self, test_type: TestType) -> list[str]:
"""Get test IDs by test type."""
return self._get_index("by_type", test_type.value)
def get_tests_by_criteria(self, criteria_id: str) -> list[str]:
"""Get test IDs for a specific criteria."""
return self._get_index("by_criteria", criteria_id)
def get_pending_tests(self, goal_id: str) -> list[Test]:
"""Get all pending tests for a goal."""
tests = self.get_tests_by_goal(goal_id)
return [t for t in tests if t.approval_status == ApprovalStatus.PENDING]
def get_approved_tests(self, goal_id: str) -> list[Test]:
"""Get all approved tests for a goal (approved or modified)."""
tests = self.get_tests_by_goal(goal_id)
return [t for t in tests if t.is_approved]
def list_all_goals(self) -> list[str]:
"""List all goal IDs that have tests."""
goals_dir = self.base_path / "indexes" / "by_goal"
return [f.stem for f in goals_dir.glob("*.json")]
# === RESULT OPERATIONS ===
def save_result(self, test_id: str, result: TestResult) -> None:
"""Save a test result."""
results_dir = self.base_path / "results" / test_id
results_dir.mkdir(parents=True, exist_ok=True)
# Save with timestamp
timestamp = result.timestamp.strftime("%Y%m%d_%H%M%S")
result_path = results_dir / f"{timestamp}.json"
with open(result_path, "w") as f:
f.write(result.model_dump_json(indent=2))
# Update latest
latest_path = results_dir / "latest.json"
with open(latest_path, "w") as f:
f.write(result.model_dump_json(indent=2))
def get_latest_result(self, test_id: str) -> TestResult | None:
"""Get the most recent result for a test."""
latest_path = self.base_path / "results" / test_id / "latest.json"
if not latest_path.exists():
return None
with open(latest_path) as f:
return TestResult.model_validate_json(f.read())
def get_result_history(self, test_id: str, limit: int = 10) -> list[TestResult]:
"""Get result history for a test, most recent first."""
results_dir = self.base_path / "results" / test_id
if not results_dir.exists():
return []
# Get all result files except latest.json
result_files = sorted(
[f for f in results_dir.glob("*.json") if f.name != "latest.json"],
reverse=True
)[:limit]
results = []
for f in result_files:
with open(f) as file:
results.append(TestResult.model_validate_json(file.read()))
return results
# === INDEX OPERATIONS ===
def _get_index(self, index_type: str, key: str) -> list[str]:
"""Get values from an index."""
index_path = self.base_path / "indexes" / index_type / f"{key}.json"
if not index_path.exists():
return []
with open(index_path) as f:
return json.load(f)
def _add_to_index(self, index_type: str, key: str, value: str) -> None:
"""Add a value to an index."""
index_path = self.base_path / "indexes" / index_type / f"{key}.json"
values = self._get_index(index_type, key)
if value not in values:
values.append(value)
with open(index_path, "w") as f:
json.dump(values, f)
def _remove_from_index(self, index_type: str, key: str, value: str) -> None:
"""Remove a value from an index."""
index_path = self.base_path / "indexes" / index_type / f"{key}.json"
values = self._get_index(index_type, key)
if value in values:
values.remove(value)
with open(index_path, "w") as f:
json.dump(values, f)
# === UTILITY ===
def get_stats(self) -> dict:
"""Get storage statistics."""
goals = self.list_all_goals()
total_tests = sum(len(self._get_index("by_goal", g)) for g in goals)
pending = len(self._get_index("by_approval", "pending"))
approved = len(self._get_index("by_approval", "approved"))
modified = len(self._get_index("by_approval", "modified"))
rejected = len(self._get_index("by_approval", "rejected"))
return {
"total_goals": len(goals),
"total_tests": total_tests,
"by_approval": {
"pending": pending,
"approved": approved,
"modified": modified,
"rejected": rejected,
},
"storage_path": str(self.base_path),
}
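A usage sketch for the storage backend. The base path is an arbitrary writable directory, and `test` / `result` stand for any Test and TestResult instances such as those sketched above:
storage = TestStorage("core/data/tests")      # illustrative base path
storage.save_test(test)                       # writes JSON and updates all four indexes
for pending in storage.get_pending_tests("goal_demo"):
    pending.approve("reviewer")
    storage.update_test(pending)              # moves it within the by_approval index
approved = storage.get_approved_tests("goal_demo")
storage.save_result(approved[0].id, result)   # timestamped file plus latest.json
print(storage.get_stats())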
+612
View File
@@ -0,0 +1,612 @@
"""
Unit tests for the goal-based testing framework.
Tests cover:
- Schema validation
- Storage CRUD operations
- Error categorization heuristics
- Parallel runner grouping logic
"""
import pytest
from framework.testing.test_case import (
Test,
TestType,
ApprovalStatus,
)
from framework.testing.test_result import (
TestResult,
TestSuiteResult,
ErrorCategory,
)
from framework.testing.test_storage import TestStorage
from framework.testing.categorizer import ErrorCategorizer
from framework.testing.parallel import ParallelTestRunner, ParallelConfig
from framework.testing.debug_tool import DebugTool
# ============================================================================
# Test Schema Tests
# ============================================================================
class TestTestCaseSchema:
"""Tests for Test schema."""
def test_create_test(self):
"""Test creating a basic test."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_api_limits",
test_type=TestType.CONSTRAINT,
test_name="test_constraint_api_limits",
test_code="def test_constraint_api_limits(agent): pass",
description="Tests API rate limits",
input={"topic": "test"},
expected_output={"count": 5},
)
assert test.id == "test_001"
assert test.goal_id == "goal_001"
assert test.test_type == TestType.CONSTRAINT
assert test.approval_status == ApprovalStatus.PENDING
assert not test.is_approved
def test_approve_test(self):
"""Test approving a test."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="pass",
description="test",
)
test.approve("test_user")
assert test.approval_status == ApprovalStatus.APPROVED
assert test.approved_by == "test_user"
assert test.approved_at is not None
assert test.is_approved
def test_modify_test(self):
"""Test modifying a test before approval."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="original code",
description="test",
)
test.modify("modified code", "test_user")
assert test.approval_status == ApprovalStatus.MODIFIED
assert test.original_code == "original code"
assert test.test_code == "modified code"
assert test.is_approved
def test_reject_test(self):
"""Test rejecting a test."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="pass",
description="test",
)
test.reject("Not a valid test case")
assert test.approval_status == ApprovalStatus.REJECTED
assert test.rejection_reason == "Not a valid test case"
assert not test.is_approved
def test_record_result(self):
"""Test recording test results."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="pass",
description="test",
)
test.record_result(passed=True)
assert test.last_result == "passed"
assert test.run_count == 1
assert test.pass_count == 1
assert test.pass_rate == 1.0
test.record_result(passed=False)
assert test.last_result == "failed"
assert test.run_count == 2
assert test.pass_count == 1
assert test.fail_count == 1
assert test.pass_rate == 0.5
class TestTestResultSchema:
"""Tests for TestResult schema."""
def test_create_passed_result(self):
"""Test creating a passed result."""
result = TestResult(
test_id="test_001",
passed=True,
duration_ms=100,
actual_output={"status": "ok"},
expected_output={"status": "ok"},
)
assert result.passed
assert result.duration_ms == 100
assert result.error_category is None
def test_create_failed_result(self):
"""Test creating a failed result."""
result = TestResult(
test_id="test_001",
passed=False,
duration_ms=50,
error_message="Assertion failed",
error_category=ErrorCategory.IMPLEMENTATION_ERROR,
stack_trace="Traceback...",
)
assert not result.passed
assert result.error_category == ErrorCategory.IMPLEMENTATION_ERROR
def test_summary_dict(self):
"""Test summary dict generation."""
result = TestResult(
test_id="test_001",
passed=False,
duration_ms=50,
error_message="Very long error " * 20,
error_category=ErrorCategory.LOGIC_ERROR,
)
summary = result.summary_dict()
assert summary["test_id"] == "test_001"
assert summary["passed"] is False
assert summary["error_category"] == "logic_error"
assert len(summary["error_message"]) == 100 # Truncated
class TestTestSuiteResult:
"""Tests for TestSuiteResult schema."""
def test_suite_result_properties(self):
"""Test suite result calculation properties."""
results = [
TestResult(test_id="t1", passed=True, duration_ms=100),
TestResult(test_id="t2", passed=True, duration_ms=50),
TestResult(test_id="t3", passed=False, duration_ms=75,
error_category=ErrorCategory.IMPLEMENTATION_ERROR),
]
suite = TestSuiteResult(
goal_id="goal_001",
total=3,
passed=2,
failed=1,
results=results,
duration_ms=225,
)
assert not suite.all_passed
assert suite.pass_rate == pytest.approx(2/3)
assert len(suite.get_failed_results()) == 1
def test_get_results_by_category(self):
"""Test filtering results by error category."""
results = [
TestResult(test_id="t1", passed=False, duration_ms=100,
error_category=ErrorCategory.LOGIC_ERROR),
TestResult(test_id="t2", passed=False, duration_ms=50,
error_category=ErrorCategory.IMPLEMENTATION_ERROR),
TestResult(test_id="t3", passed=False, duration_ms=75,
error_category=ErrorCategory.IMPLEMENTATION_ERROR),
]
suite = TestSuiteResult(
goal_id="goal_001",
total=3,
passed=0,
failed=3,
results=results,
)
impl_errors = suite.get_results_by_category(ErrorCategory.IMPLEMENTATION_ERROR)
assert len(impl_errors) == 2
# ============================================================================
# Storage Tests
# ============================================================================
class TestTestStorage:
"""Tests for TestStorage."""
@pytest.fixture
def storage(self, tmp_path):
"""Create a temporary storage instance."""
return TestStorage(tmp_path)
def test_save_and_load_test(self, storage):
"""Test saving and loading a test."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="def test_something(agent): pass",
description="A test",
)
storage.save_test(test)
loaded = storage.load_test("goal_001", "test_001")
assert loaded is not None
assert loaded.id == "test_001"
assert loaded.test_name == "test_something"
def test_delete_test(self, storage):
"""Test deleting a test."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="pass",
description="test",
)
storage.save_test(test)
assert storage.load_test("goal_001", "test_001") is not None
storage.delete_test("goal_001", "test_001")
assert storage.load_test("goal_001", "test_001") is None
def test_get_tests_by_goal(self, storage):
"""Test querying tests by goal."""
for i in range(3):
test = Test(
id=f"test_{i}",
goal_id="goal_001",
parent_criteria_id=f"constraint_{i}",
test_type=TestType.CONSTRAINT,
test_name=f"test_{i}",
test_code="pass",
description="test",
)
storage.save_test(test)
tests = storage.get_tests_by_goal("goal_001")
assert len(tests) == 3
def test_get_approved_tests(self, storage):
"""Test querying approved tests."""
# Create tests with different approval statuses
test1 = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="c1",
test_type=TestType.CONSTRAINT,
test_name="test_1",
test_code="pass",
description="test",
)
test1.approve()
storage.save_test(test1)
test2 = Test(
id="test_002",
goal_id="goal_001",
parent_criteria_id="c2",
test_type=TestType.CONSTRAINT,
test_name="test_2",
test_code="pass",
description="test",
)
# Leave pending
storage.save_test(test2)
test3 = Test(
id="test_003",
goal_id="goal_001",
parent_criteria_id="c3",
test_type=TestType.CONSTRAINT,
test_name="test_3",
test_code="pass",
description="test",
)
test3.modify("modified", "user")
storage.save_test(test3)
approved = storage.get_approved_tests("goal_001")
assert len(approved) == 2 # approved and modified
def test_save_and_load_result(self, storage):
"""Test saving and loading test results."""
result = TestResult(
test_id="test_001",
passed=True,
duration_ms=100,
)
storage.save_result("test_001", result)
loaded = storage.get_latest_result("test_001")
assert loaded is not None
assert loaded.passed is True
assert loaded.duration_ms == 100
def test_result_history(self, storage):
"""Test getting result history."""
# Save multiple results
for i in range(5):
result = TestResult(
test_id="test_001",
passed=(i % 2 == 0),
duration_ms=100 + i,
)
storage.save_result("test_001", result)
history = storage.get_result_history("test_001", limit=3)
assert len(history) <= 3
def test_get_stats(self, storage):
"""Test getting storage statistics."""
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="c1",
test_type=TestType.CONSTRAINT,
test_name="test_1",
test_code="pass",
description="test",
)
test.approve()
storage.save_test(test)
stats = storage.get_stats()
assert stats["total_tests"] == 1
assert stats["by_approval"]["approved"] == 1
# ============================================================================
# Error Categorizer Tests
# ============================================================================
class TestErrorCategorizer:
"""Tests for ErrorCategorizer."""
@pytest.fixture
def categorizer(self):
return ErrorCategorizer()
def test_categorize_passed(self, categorizer):
"""Test that passed results return None."""
result = TestResult(test_id="t1", passed=True, duration_ms=100)
assert categorizer.categorize(result) is None
def test_categorize_logic_error(self, categorizer):
"""Test categorization of logic errors."""
result = TestResult(
test_id="t1",
passed=False,
duration_ms=100,
error_message="goal not achieved: expected success criteria was not met",
)
assert categorizer.categorize(result) == ErrorCategory.LOGIC_ERROR
def test_categorize_implementation_error(self, categorizer):
"""Test categorization of implementation errors."""
result = TestResult(
test_id="t1",
passed=False,
duration_ms=100,
error_message="TypeError: 'NoneType' object has no attribute 'get'",
)
assert categorizer.categorize(result) == ErrorCategory.IMPLEMENTATION_ERROR
def test_categorize_edge_case(self, categorizer):
"""Test categorization of edge cases."""
result = TestResult(
test_id="t1",
passed=False,
duration_ms=100,
error_message="timeout: request took longer than expected",
)
assert categorizer.categorize(result) == ErrorCategory.EDGE_CASE
def test_categorize_from_stack_trace(self, categorizer):
"""Test categorization from stack trace."""
result = TestResult(
test_id="t1",
passed=False,
duration_ms=100,
error_message="Error occurred",
stack_trace="KeyError: 'missing_key'\n at line 42",
)
assert categorizer.categorize(result) == ErrorCategory.IMPLEMENTATION_ERROR
def test_get_fix_suggestion(self, categorizer):
"""Test fix suggestions for each category."""
assert "Goal" in categorizer.get_fix_suggestion(ErrorCategory.LOGIC_ERROR)
assert "code" in categorizer.get_fix_suggestion(ErrorCategory.IMPLEMENTATION_ERROR).lower()
assert "test" in categorizer.get_fix_suggestion(ErrorCategory.EDGE_CASE).lower()
def test_get_iteration_guidance(self, categorizer):
"""Test iteration guidance."""
guidance = categorizer.get_iteration_guidance(ErrorCategory.LOGIC_ERROR)
assert guidance["stage"] == "Goal"
assert guidance["restart_required"] is True
guidance = categorizer.get_iteration_guidance(ErrorCategory.IMPLEMENTATION_ERROR)
assert guidance["stage"] == "Agent"
assert guidance["restart_required"] is False
# ============================================================================
# Parallel Runner Tests
# ============================================================================
class TestParallelRunner:
"""Tests for ParallelTestRunner."""
@pytest.fixture
def runner(self, tmp_path):
"""Create a test runner with temporary storage."""
storage = TestStorage(tmp_path)
config = ParallelConfig(num_workers=1) # Sequential for testing
return ParallelTestRunner(config, storage)
def test_create_suite_result(self, runner):
"""Test creating suite result from individual results."""
results = [
TestResult(test_id="t1", passed=True, duration_ms=100),
TestResult(test_id="t2", passed=False, duration_ms=50),
]
suite = runner._create_suite_result("goal_001", results)
assert suite.goal_id == "goal_001"
assert suite.total == 2
assert suite.passed == 1
assert suite.failed == 1
assert suite.duration_ms == 150
# ============================================================================
# Debug Tool Tests
# ============================================================================
class TestDebugTool:
"""Tests for DebugTool."""
@pytest.fixture
def debug_tool(self, tmp_path):
"""Create a debug tool with temporary storage."""
storage = TestStorage(tmp_path)
return DebugTool(storage)
def test_analyze_missing_test(self, debug_tool):
"""Test analyzing a non-existent test."""
info = debug_tool.analyze("goal_001", "nonexistent")
assert info.test_id == "nonexistent"
assert "not found" in info.error_message.lower()
def test_analyze_with_result(self, debug_tool, tmp_path):
"""Test analyzing a test with result."""
storage = TestStorage(tmp_path)
# Create and save test
test = Test(
id="test_001",
goal_id="goal_001",
parent_criteria_id="c1",
test_type=TestType.CONSTRAINT,
test_name="test_something",
test_code="pass",
description="A test",
input={"key": "value"},
expected_output={"result": "expected"},
)
storage.save_test(test)
# Create and save result
result = TestResult(
test_id="test_001",
passed=False,
duration_ms=100,
error_message="TypeError: something went wrong",
error_category=ErrorCategory.IMPLEMENTATION_ERROR,
)
storage.save_result("test_001", result)
# Create new debug tool with same storage
debug_tool = DebugTool(storage)
info = debug_tool.analyze("goal_001", "test_001")
assert info.test_id == "test_001"
assert info.test_name == "test_something"
assert not info.passed
assert info.error_category == "implementation_error"
assert info.suggested_fix is not None
# ============================================================================
# Integration Tests
# ============================================================================
class TestIntegration:
"""Integration tests for the testing framework."""
def test_full_workflow(self, tmp_path):
"""Test a simplified full workflow."""
storage = TestStorage(tmp_path)
# 1. Create tests (simulating generation)
tests = []
for i in range(3):
test = Test(
id=f"test_{i}",
goal_id="goal_001",
parent_criteria_id="constraint_001",
test_type=TestType.CONSTRAINT,
test_name=f"test_constraint_{i}",
test_code=f"def test_constraint_{i}(agent): assert True",
description=f"Test {i}",
)
tests.append(test)
# 2. Approve tests
for test in tests:
test.approve("user")
storage.save_test(test)
# 3. Verify storage
approved = storage.get_approved_tests("goal_001")
assert len(approved) == 3
# 4. Simulate running tests
config = ParallelConfig(num_workers=1)
runner = ParallelTestRunner(config, storage)
class MockAgent:
def run(self, input):
return {"success": True}
results = runner.run_tests(approved, MockAgent())
assert len(results) == 3
# 5. Save results
for result in results:
storage.save_result(result.test_id, result)
# 6. Check stats
stats = storage.get_stats()
assert stats["total_tests"] == 3
if __name__ == "__main__":
pytest.main([__file__, "-v"])
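Finally, a hedged end-to-end sketch tying the pieces together, using only APIs exercised by the tests above; the stand-in agent and storage path are illustrative:
class EchoAgent:
    # Stand-in agent: anything exposing run(input) works here.
    def run(self, input):
        return {"success": True}

storage = TestStorage("/tmp/hive_tests")                      # illustrative path
runner = ParallelTestRunner(ParallelConfig(num_workers=1), storage)
categorizer = ErrorCategorizer()
results = runner.run_tests(storage.get_approved_tests("goal_001"), EchoAgent())
for result in results:
    storage.save_result(result.test_id, result)
    if not result.passed:
        category = result.error_category or categorizer.categorize(result)
        guidance = categorizer.get_iteration_guidance(category)
        print(result.test_id, "->", guidance["stage"])        # Goal vs Agent stage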