initial test phase
@@ -56,6 +56,9 @@ __pycache__/
|
||||
.eggs/
|
||||
*.egg
|
||||
|
||||
# Generated runtime data
|
||||
core/data/
|
||||
|
||||
# Misc
|
||||
*.local
|
||||
.cache/
|
||||
|
||||
@@ -10,9 +10,11 @@ Build goal-driven agents that use LLM reasoning to accomplish tasks.
|
||||
## Quick Start
|
||||
|
||||
1. Define the goal (what success looks like)
2. Generate constraint tests from goal → Approve tests
3. Add nodes (units of work) - validate against constraint tests
4. Connect with edges (flow between nodes)
5. Validate and test graph
6. Handoff to testing-agent skill for final evaluation
|
||||
|
||||
## Core Concepts
|
||||
|
||||
@@ -117,10 +119,15 @@ For each component (goal, node, edge):
|
||||
|
||||
```
|
||||
Agent Build Progress:
|
||||
|
||||
GOAL STAGE:
|
||||
- [ ] Define goal with success criteria → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
|
||||
- [ ] Define goal constraints → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
|
||||
- [ ] Generate constraint tests → ASK APPROVAL (clickable: Approve/Reject/Skip) for each test ✓ (NEW)
|
||||
|
||||
AGENT STAGE:
|
||||
- [ ] Add entry node → TEST NODE → VALIDATE AGAINST CONSTRAINTS → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
|
||||
- [ ] Add each processing node → TEST NODE → VALIDATE AGAINST CONSTRAINTS → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
|
||||
- [ ] Add pause nodes (if HITL needed) → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
|
||||
- [ ] Add resume entry points (for pause nodes) → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
|
||||
- [ ] Add terminal node(s) → TEST NODE → ASK APPROVAL (clickable: Approve/Reject/Pause) ✓
|
||||
@@ -129,6 +136,11 @@ Agent Build Progress:
|
||||
- [ ] Validate full graph → TEST GRAPH → SHOW RESULTS
|
||||
- [ ] Final approval → ASK APPROVAL (clickable: Approve & Export/Reject/Pause) ✓
|
||||
- [ ] Export to exports/{agent-name}/
|
||||
|
||||
EVAL STAGE (handoff to testing-agent skill):
|
||||
- [ ] Generate success criteria tests → ASK APPROVAL (clickable: Approve/Reject/Skip) for each test ✓
|
||||
- [ ] Run all tests (constraint + success criteria)
|
||||
- [ ] Debug failures and iterate
|
||||
```
|
||||
|
||||
### Testing During Approval
|
||||
@@ -147,6 +159,31 @@ Show the human:
|
||||
- What tools are available
|
||||
- What outputs will be written
|
||||
|
||||
**Validate against constraint tests** (if available):
|
||||
|
||||
After approving constraint tests, reference them during node development:
|
||||
|
||||
```python
|
||||
# When presenting a node for approval, show constraint alignment:
|
||||
"""
|
||||
**NODE: search_node**
|
||||
|
||||
Test Results: [test_node output]
|
||||
|
||||
Constraint Test Alignment:
|
||||
✓ test_constraint_api_limits_respected
|
||||
→ Node uses rate-limited tool wrapper ✓
|
||||
✓ test_constraint_content_safety_filter
|
||||
→ Output includes safety_score field ✓
|
||||
|
||||
Validation: ✅ PASS
|
||||
"""
|
||||
```
|
||||
|
||||
**IMPORTANT**: Constraint tests may not fully execute until the agent is complete,
|
||||
but their test definitions guide node design. Review the test code to ensure
|
||||
your nodes handle the constraint scenarios.
|
||||
|
||||
**Before final approval**, use `test_graph` to simulate full execution:
|
||||
```
|
||||
test_graph(
|
||||
@@ -425,6 +462,7 @@ Goal(
|
||||
description="What the agent must NOT do",
|
||||
constraint_type="hard", # hard = must not violate
|
||||
category="safety",
|
||||
check="llm_judge", # Optional: how to validate ("llm_judge", expression, or function)
|
||||
),
|
||||
],
|
||||
)
|
||||
@@ -433,6 +471,98 @@ Goal(
|
||||
**Good goals**: Specific, measurable, constrained
|
||||
**Bad goals**: Vague, unmeasurable, no boundaries
|
||||
|
||||
## Constraint Test Generation
|
||||
|
||||
**CRITICAL**: After approving the goal, generate constraint tests BEFORE building nodes.
|
||||
|
||||
Constraint tests verify that the agent will respect its defined constraints (safety, rate limits, etc.).
|
||||
These tests are **agent-agnostic** - they test boundaries, not implementation. This means they can be
|
||||
generated before any nodes exist.
|
||||
|
||||
### Why Generate Tests Before Building?
|
||||
|
||||
1. **Early Validation**: Catch constraint violations during node development, not after
|
||||
2. **Design Guidance**: Tests make constraints concrete and testable
|
||||
3. **Incremental Feedback**: Review constraint tests while designing each node
|
||||
|
||||
### Generation Workflow
|
||||
|
||||
```python
|
||||
# 1. After goal is approved, generate constraint tests
|
||||
result = generate_constraint_tests(
|
||||
goal_id=goal_data["id"],
|
||||
goal_json=json.dumps(goal_data)
|
||||
)
|
||||
|
||||
# 2. Tests are returned with PENDING status
|
||||
# The MCP tool returns approval_required=True
|
||||
|
||||
# 3. Display each test to the human for approval
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ [1/3] test_constraint_api_limits_respected │
|
||||
│ Constraint: api_limits │
|
||||
│ Confidence: 88% │
|
||||
│ │
|
||||
│ def test_constraint_api_limits_respected(agent): │
|
||||
│ ... │
|
||||
│ │
|
||||
│ [a]pprove [r]eject [e]dit [s]kip │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
|
||||
# 4. Use AskUserQuestion with approval options
|
||||
AskUserQuestion(
|
||||
questions=[{
|
||||
"question": "Do you approve this constraint test?",
|
||||
"header": "Test Approval",
|
||||
"options": [
|
||||
{"label": "✓ Approve (Recommended)", "description": "Test looks good"},
|
||||
{"label": "✗ Reject", "description": "Test is invalid"},
|
||||
{"label": "✎ Edit", "description": "Modify before accepting"},
|
||||
{"label": "⏭ Skip", "description": "Decide later"}
|
||||
],
|
||||
"multiSelect": false
|
||||
}]
|
||||
)
|
||||
|
||||
# 5. Call approve_tests with the decisions
|
||||
approve_tests(
|
||||
goal_id=goal_data["id"],
|
||||
approvals='[{"test_id": "...", "action": "approve"}, ...]'
|
||||
)
|
||||
|
||||
# 6. Verify no pending tests before proceeding to nodes
|
||||
pending = get_pending_tests(goal_id=goal_data["id"])
|
||||
if json.loads(pending)["pending_count"] > 0:
|
||||
# Prompt user to handle remaining tests
|
||||
print("⚠️ Pending tests must be resolved before building nodes")
|
||||
```
|
||||
|
||||
### Approval Rules
|
||||
|
||||
- **All tests must be reviewed** - no auto-approval
|
||||
- **Approved/Modified tests are stored** for use during node validation
|
||||
- **Rejected tests are not stored** (with reason tracked)
|
||||
- **Skipped tests remain pending** - must be resolved before export (see the check below)
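
A minimal sketch of enforcing that last rule, reusing the `get_pending_tests` call shown earlier; the return shape is assumed to match that example.

```python
import json

# Before calling export_graph, confirm nothing is still pending (skipped tests stay pending).
pending = json.loads(get_pending_tests(goal_id=goal_data["id"]))
if pending["pending_count"] > 0:
    print(f"⚠️ {pending['pending_count']} tests still pending - resolve them before export")
```
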
### Using Constraint Tests During Node Building
|
||||
|
||||
Once constraint tests are approved, reference them when designing nodes:
|
||||
|
||||
```python
|
||||
# Before adding a node that makes API calls, review constraint tests:
|
||||
"""
|
||||
Creating node: search_node (llm_tool_use)
|
||||
Tools: youtube_search, video_details
|
||||
|
||||
Constraint Test Review:
|
||||
✓ test_constraint_api_limits_respected - checks rate limits
|
||||
→ Ensure search_node handles rate limit responses gracefully
|
||||
|
||||
✓ test_constraint_content_safety_filter - checks safe content
|
||||
→ Ensure output_keys include safety flags for filtering
|
||||
"""
|
||||
```
|
||||
|
||||
## Adding Nodes
|
||||
|
||||
Each node does one thing:
|
||||
@@ -617,11 +747,29 @@ analyze → needs_clarification? → YES → request-clarification (PAUSE)
|
||||
| `export_graph` | Export the completed agent |
|
||||
| `get_session_status` | View current build progress |
|
||||
|
||||
### Testing Tools by Stage
|
||||
|
||||
#### Goal Stage (this skill) - Generate constraint tests
|
||||
| Tool | Purpose | When to Use |
|
||||
|------|---------|-------------|
|
||||
| `generate_constraint_tests` | Generate tests from constraints | Immediately after goal approval |
|
||||
| `approve_tests` | Approve/reject/modify tests | After generation, before building nodes |
|
||||
| `get_pending_tests` | List tests awaiting approval | Before proceeding to node building |
|
||||
|
||||
#### Agent Stage (this skill) - Build and validate nodes
|
||||
| Tool | Purpose | When to Use |
|
||||
|------|---------|-------------|
|
||||
| `test_node` | Run a single node with sample inputs | Before each node approval |
|
||||
| `test_graph` | Simulate full graph execution | Before final approval |
|
||||
|
||||
#### Eval Stage (testing-agent skill) - Final evaluation
|
||||
| Tool | Purpose | When to Use |
|
||||
|------|---------|-------------|
|
||||
| `generate_success_tests` | Generate tests from success criteria | After agent export |
|
||||
| `run_tests` | Run all tests in parallel | After test approval |
|
||||
| `debug_test` | Debug failed tests | After test failures |
|
||||
|
||||
See the [testing-agent skill](../testing-agent/SKILL.md) for the full Eval stage workflow.
|
||||
|
||||
## Using the Exported Agent
|
||||
|
||||
@@ -762,3 +910,72 @@ result = await runner.run(context)
|
||||
```
|
||||
|
||||
For complete API details, see [reference/api.md](reference/api.md).
|
||||
|
||||
## Handoff to Testing-Agent Skill
|
||||
|
||||
After exporting the agent, switch to the **testing-agent** skill for final evaluation (Eval Stage).
|
||||
|
||||
### What Transfers
|
||||
|
||||
1. **Goal definition** (with constraints and success criteria)
|
||||
2. **Approved constraint tests** (generated in Goal Stage)
|
||||
3. **Exported agent** at `exports/{agent-name}/`
|
||||
|
||||
### What Happens in Testing-Agent
|
||||
|
||||
1. Generate **success criteria tests** (these need agent details, so generated after build)
|
||||
2. Run **all tests** (constraint + success criteria) in parallel
|
||||
3. Debug failures and categorize errors
|
||||
4. Iterate based on error type
|
||||
|
||||
### Triggering the Handoff
|
||||
|
||||
After `export_graph` completes successfully, display:
|
||||
|
||||
```
|
||||
✅ Agent exported to exports/{agent-name}/
|
||||
|
||||
Next Steps (Eval Stage):
|
||||
1. Switch to testing-agent skill
|
||||
2. Generate success criteria tests
|
||||
3. Run full evaluation
|
||||
4. Debug any failures
|
||||
|
||||
Command: "Run /testing-agent for exports/{agent-name}"
|
||||
```
|
||||
|
||||
### Error Category Routing
|
||||
|
||||
If tests fail in the Eval stage, the error category determines where to go (a small routing sketch follows the table):
|
||||
|
||||
| Error Category | Meaning | Action |
|
||||
|---------------|---------|--------|
|
||||
| `LOGIC_ERROR` | Goal definition is wrong | Return to Goal Stage - update goal, regenerate constraint tests |
|
||||
| `IMPLEMENTATION_ERROR` | Code bug in nodes/edges | Return to Agent Stage - fix nodes/edges, re-export |
|
||||
| `EDGE_CASE` | New scenario discovered | Stay in Eval Stage - add edge case test, continue |
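
As a rough illustration of this routing (not part of the framework API), a helper could map each failed test's `error_category` to the action in the table; the result entries follow the `run_tests` output documented in the testing-agent skill.

```python
# Hypothetical helper - the category names and result fields mirror the docs above.
ERROR_ROUTES = {
    "LOGIC_ERROR": "Return to Goal Stage: update the goal, regenerate constraint tests",
    "IMPLEMENTATION_ERROR": "Return to Agent Stage: fix nodes/edges, re-export",
    "EDGE_CASE": "Stay in Eval Stage: add an edge case test and continue",
}

def route_failures(results: list[dict]) -> None:
    """Print the suggested next step for each failed test result."""
    for r in results:
        if not r.get("passed", False):
            category = r.get("error_category", "IMPLEMENTATION_ERROR")
            print(f"{r['test_id']}: {category} → {ERROR_ROUTES.get(category, 'review manually')}")
```
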
### Flow Diagram
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────────────────────────────┐
|
||||
│ GOAL STAGE (building-agents skill) │
|
||||
│ 1. Define success_criteria and constraints → APPROVE │
|
||||
│ 2. Generate CONSTRAINT TESTS from constraints │
|
||||
│ 3. APPROVE each constraint test │
|
||||
└──────────────────────────────────────────────────────────────┘
|
||||
↓
|
||||
┌──────────────────────────────────────────────────────────────┐
|
||||
│ AGENT STAGE (building-agents skill) │
|
||||
│ 1. Add nodes - review constraint tests for design guidance │
|
||||
│ 2. Test each node - validate against constraint expectations│
|
||||
│ 3. Connect edges → Validate graph → Export │
|
||||
└──────────────────────────────────────────────────────────────┘
|
||||
↓
|
||||
┌──────────────────────────────────────────────────────────────┐
|
||||
│ EVAL STAGE (testing-agent skill) │
|
||||
│ 1. Generate SUCCESS_CRITERIA TESTS → APPROVE │
|
||||
│ 2. Run ALL tests (constraint + success criteria) │
|
||||
│ 3. Debug failures → Categorize errors │
|
||||
│ 4. Route back based on error category (if needed) │
|
||||
└──────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
@@ -0,0 +1,625 @@
|
||||
---
|
||||
name: testing-agent
|
||||
description: Run goal-based evaluation tests for agents. Use when you need to verify an agent meets its goals, debug failing tests, or iterate on agent improvements based on test results.
|
||||
---
|
||||
|
||||
# Testing Agents
|
||||
|
||||
Run goal-based evaluation tests for agents built with the building-agents skill.
|
||||
|
||||
## Quick Start
|
||||
|
||||
1. **Check existing state first** - See if tests already exist
|
||||
2. Generate tests from goal (only if needed)
|
||||
3. Approve tests (mandatory human approval)
|
||||
4. Run tests against agent
|
||||
5. Debug failures and iterate
|
||||
|
||||
## Check Existing State First
|
||||
|
||||
**CRITICAL**: Before generating any tests, ALWAYS check if tests already exist for the goal.
|
||||
|
||||
```python
|
||||
# Check what tests exist for this goal
|
||||
result = list_tests(goal_id="youtube-research")
|
||||
|
||||
# Returns:
|
||||
{
|
||||
"goal_id": "youtube-research",
|
||||
"total": 42,
|
||||
"by_status": {
|
||||
"pending": 10,
|
||||
"approved": 30,
|
||||
"modified": 2,
|
||||
"rejected": 0
|
||||
},
|
||||
"by_type": {
|
||||
"constraint": 15,
|
||||
"success_criteria": 25,
|
||||
"edge_case": 2
|
||||
},
|
||||
"tests": [...] # List of test summaries
|
||||
}
|
||||
```
|
||||
|
||||
### Decision Tree
|
||||
|
||||
Based on existing state, choose the right action:
|
||||
|
||||
```
|
||||
list_tests(goal_id) → Check existing tests
|
||||
↓
|
||||
┌───────┴────────────────────────────────────────┐
|
||||
│ │
|
||||
No tests exist Tests exist
|
||||
│ │
|
||||
↓ ┌─────────┴─────────┐
|
||||
Generate tests │ │
|
||||
(constraint first, Has pending All approved
|
||||
then success_criteria) tests │
|
||||
│ ↓
|
||||
↓ Run tests
|
||||
Approve pending directly
|
||||
tests first
|
||||
```
|
||||
|
||||
### Resuming a Testing Session
|
||||
|
||||
When the user asks to test an agent that may have been tested before:
|
||||
|
||||
1. **Always check first**: `list_tests(goal_id="...")`
|
||||
2. **Show the user what exists**:
|
||||
- "Found 42 existing tests: 30 approved, 10 pending, 2 modified"
|
||||
- "Last run: 28/30 passed (93.3%)"
|
||||
3. **Ask what they want to do**:
|
||||
|
||||
```python
|
||||
AskUserQuestion(
|
||||
questions=[{
|
||||
"question": "Tests already exist for this agent. What would you like to do?",
|
||||
"header": "Existing Tests",
|
||||
"options": [
|
||||
{
|
||||
"label": "Run existing tests (Recommended)",
|
||||
"description": "Run the 32 approved tests against the agent"
|
||||
},
|
||||
{
|
||||
"label": "Approve pending tests",
|
||||
"description": "Review and approve the 10 pending tests first"
|
||||
},
|
||||
{
|
||||
"label": "Regenerate all tests",
|
||||
"description": "Delete existing and generate fresh tests (loses approvals)"
|
||||
},
|
||||
{
|
||||
"label": "Show test details",
|
||||
"description": "List all tests with their status and last results"
|
||||
}
|
||||
],
|
||||
"multiSelect": false
|
||||
}]
|
||||
)
|
||||
```
|
||||
|
||||
### Why This Matters
|
||||
|
||||
- **Saves time**: Approved tests don't need re-approval
|
||||
- **Preserves work**: User's previous approvals/modifications are kept
|
||||
- **Clear state**: User knows exactly what exists before taking action
|
||||
- **Prevents duplicates**: Won't generate tests that already exist
|
||||
|
||||
## Core Concepts
|
||||
|
||||
**Test Types**: Three types of tests, generated at different stages:
|
||||
- `constraint` - Generated during Goal stage (agent-agnostic boundaries)
|
||||
- `success_criteria` - Generated during Eval stage (after agent exists)
|
||||
- `edge_case` - Generated when new scenarios discovered during debugging
|
||||
|
||||
**Approval**: All LLM-generated tests require explicit user approval before running.
|
||||
|
||||
**Error Categories**: Failed tests are categorized to guide iteration:
|
||||
- `LOGIC_ERROR` - Goal definition is wrong → Update goal, restart full flow
|
||||
- `IMPLEMENTATION_ERROR` - Code bug → Fix agent, re-run Eval
|
||||
- `EDGE_CASE` - New scenario discovered → Add test, continue Eval
|
||||
|
||||
**Iteration**: Each error category has a specific fix path (see Error Categorization section).
|
||||
|
||||
## Workflow (HITL Required)
|
||||
|
||||
**CRITICAL**: Each step requires human approval before proceeding.
|
||||
**CRITICAL**: Use structured questions (AskUserQuestion) with fallback to text mode.
|
||||
|
||||
### Approval Strategy
|
||||
|
||||
**Always try structured questions first**, with graceful fallback:
|
||||
|
||||
1. **Attempt**: Call AskUserQuestion with clickable options
|
||||
2. **Catch**: If tool fails/rejected, fall back to text prompt
|
||||
3. **Parse**: Accept text input like "approve", "reject", "skip"
|
||||
|
||||
This ensures the workflow works in all environments (VSCode extension, CLI, web).
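
A minimal sketch of step 3 (the text fallback), with the action names taken from the approval options above; the full structured call is shown under "Structured Approval Questions" below.

```python
VALID_ACTIONS = {"approve", "reject", "edit", "skip"}
SHORTCUTS = {a[0]: a for a in VALID_ACTIONS}  # accept [a]/[r]/[e]/[s] like the UI hints

def parse_approval_reply(raw: str) -> str:
    """Normalize a free-form text reply into one of the known approval actions."""
    answer = raw.strip().lower()
    answer = SHORTCUTS.get(answer, answer)
    return answer if answer in VALID_ACTIONS else "skip"  # unknown input → leave pending
```
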
### Test Loop
|
||||
|
||||
```
|
||||
For each test generated:
|
||||
1. DISPLAY → Show the test details to the human
|
||||
2. VALIDATE → Check test syntax and structure
|
||||
3. ASK APPROVAL → Use AskUserQuestion with clickable options
|
||||
4. Only run tests after approval
|
||||
```
|
||||
|
||||
### Checklist (ask approval at each check)
|
||||
|
||||
```
|
||||
Agent Testing Progress:
|
||||
- [ ] Load goal and agent → VERIFY PATHS
|
||||
- [ ] CHECK EXISTING TESTS → list_tests, show stats, ask what to do
|
||||
- [ ] If no tests OR user wants fresh: Generate tests → ASK APPROVAL
|
||||
- [ ] If pending tests exist: Approve pending tests first
|
||||
- [ ] Run all approved tests → SHOW RESULTS
|
||||
- [ ] Debug failed tests → SHOW CATEGORIZATION
|
||||
- [ ] Iterate based on category → ASK APPROVAL for changes
|
||||
```
|
||||
|
||||
## The Three-Stage Flow
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────────┐
|
||||
│ GOAL STAGE │
|
||||
│ 1. Define success_criteria and constraints (building-agents skill) │
|
||||
│ 2. Generate CONSTRAINT TESTS → USER APPROVAL → tests stored │
|
||||
└─────────────────────────────────────────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────────────────────────────────────────┐
|
||||
│ AGENT STAGE │
|
||||
│ Build nodes + edges (building-agents skill) │
|
||||
│ Constraint tests can run during development for early feedback │
|
||||
└─────────────────────────────────────────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────────────────────────────────────────┐
|
||||
│ EVAL STAGE (this skill) │
|
||||
│ 1. Generate SUCCESS_CRITERIA TESTS → USER APPROVAL → tests stored │
|
||||
│ 2. Run all tests in parallel → pass/fail summary │
|
||||
│ 3. On failure → Debug tool with categorization │
|
||||
│ 4. Iterate based on error category │
|
||||
└─────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Test Generation
|
||||
|
||||
### When to Generate Each Type
|
||||
|
||||
| Test Type | When Generated | Why |
|
||||
|-----------|----------------|-----|
|
||||
| **Constraint Tests** | During Goal stage (before agent exists) | Constraints are agent-agnostic boundaries |
|
||||
| **Success Criteria Tests** | During Eval stage (after agent exists) | May depend on agent flow/nodes |
|
||||
| **Edge Case Tests** | During debugging (when new scenario found) | Discovered through test failures |
|
||||
|
||||
### Generating Tests
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
# 1. Generate constraint tests (Goal stage)
|
||||
result = generate_constraint_tests(
|
||||
goal_id="youtube-research",
|
||||
goal_json=json.dumps({
|
||||
"id": "youtube-research",
|
||||
"name": "YouTube Research Agent",
|
||||
"description": "Find relevant YouTube videos on a topic",
|
||||
"success_criteria": [
|
||||
{
|
||||
"id": "find_videos",
|
||||
"description": "Find 3-5 relevant videos",
|
||||
"metric": "video_count",
|
||||
"target": "3-5",
|
||||
"weight": 1.0
|
||||
}
|
||||
],
|
||||
"constraints": [
|
||||
{
|
||||
"id": "api_limits",
|
||||
"description": "Must respect YouTube API rate limits",
|
||||
"constraint_type": "hard",
|
||||
"category": "reliability",
|
||||
"check": "llm_judge" # Optional: how to validate
|
||||
}
|
||||
]
|
||||
})
|
||||
)
|
||||
|
||||
# 2. Generate success criteria tests (Eval stage, after agent built)
|
||||
result = generate_success_tests(
|
||||
goal_id="youtube-research",
|
||||
goal_json='...', # Same structure as above
|
||||
node_names="search_node,filter_node,format_node",
|
||||
tool_names="youtube_search,video_details"
|
||||
)
|
||||
```
|
||||
|
||||
**After generation**, tests are stored as PENDING. They must be approved before running.
|
||||
|
||||
## Approval Patterns
|
||||
|
||||
### Interactive Approval Flow
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Generated Tests for: youtube-research (3 tests) │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ [1/3] test_find_videos_happy_path │
|
||||
│ Type: SUCCESS_CRITERIA │
|
||||
│ Confidence: 92% │
|
||||
│ Input: {"topic": "machine learning tutorials"} │
|
||||
│ Expected: 3-5 videos with titles and IDs │
|
||||
│ │
|
||||
│ def test_find_videos_happy_path(agent): │
|
||||
│ result = agent.run({"topic": "machine learning"}) │
|
||||
│ assert 3 <= len(result.videos) <= 5 │
|
||||
│ assert all(v.title for v in result.videos) │
|
||||
│ │
|
||||
│ [a]pprove [r]eject [e]dit [s]kip │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Approval Actions
|
||||
|
||||
| Action | Description | Result |
|
||||
|--------|-------------|--------|
|
||||
| **approve** | Accept test as-is | Status → APPROVED, test will run |
|
||||
| **reject** | Decline with reason | Status → REJECTED, test won't run |
|
||||
| **edit** | Modify code before accepting | Status → MODIFIED, original preserved |
|
||||
| **skip** | Leave for later | Status → PENDING, decide later |
|
||||
|
||||
### Approval Code Pattern
|
||||
|
||||
```python
|
||||
# After generating tests, approve them
|
||||
result = approve_tests(
|
||||
goal_id="youtube-research",
|
||||
approvals='[
|
||||
{"test_id": "test_001", "action": "approve"},
|
||||
{"test_id": "test_002", "action": "modify", "modified_code": "def test_..."},
|
||||
{"test_id": "test_003", "action": "reject", "reason": "Not a valid scenario"},
|
||||
{"test_id": "test_004", "action": "skip"}
|
||||
]'
|
||||
)
|
||||
```
|
||||
|
||||
### Structured Approval Questions
|
||||
|
||||
```python
|
||||
# Try structured approval first
|
||||
try:
|
||||
response = AskUserQuestion(
|
||||
questions=[{
|
||||
"question": "Do you approve this test?",
|
||||
"header": "Test Approval",
|
||||
"options": [
|
||||
{
|
||||
"label": "Approve (Recommended)",
|
||||
"description": "Test looks good, include in test suite"
|
||||
},
|
||||
{
|
||||
"label": "Reject",
|
||||
"description": "Test is invalid or unnecessary"
|
||||
},
|
||||
{
|
||||
"label": "Edit",
|
||||
"description": "Modify the test code before accepting"
|
||||
},
|
||||
{
|
||||
"label": "Skip",
|
||||
"description": "Decide later, leave as pending"
|
||||
}
|
||||
],
|
||||
"multiSelect": false
|
||||
}]
|
||||
)
|
||||
except Exception:
|
||||
# Fallback to text mode
|
||||
print("Do you approve this test? Type: approve | reject | edit | skip")
|
||||
```
|
||||
|
||||
## Test Execution
|
||||
|
||||
### Parallel Configuration
|
||||
|
||||
```python
|
||||
# Tests run in parallel with these defaults
|
||||
ParallelConfig(
|
||||
num_workers=cpu_count(), # Use all CPU cores
|
||||
timeout_per_test=60.0, # 60 seconds per test
|
||||
fail_fast=False, # Run all tests, don't stop on first failure
|
||||
mode="loadfile", # Group tests by parent_criteria_id
|
||||
)
|
||||
```
|
||||
|
||||
### Running Tests
|
||||
|
||||
```python
|
||||
# Run all approved tests
|
||||
result = run_tests(
|
||||
goal_id="youtube-research",
|
||||
agent_path="exports/youtube-agent",
|
||||
test_types='["all"]', # or ["constraint", "success_criteria", "edge_case"]
|
||||
parallel=4, # Number of workers
|
||||
fail_fast=False # Run all tests
|
||||
)
|
||||
|
||||
# Result structure
|
||||
{
|
||||
"goal_id": "youtube-research",
|
||||
"overall_passed": false,
|
||||
"summary": {
|
||||
"total": 15,
|
||||
"passed": 12,
|
||||
"failed": 3,
|
||||
"pass_rate": "80.0%"
|
||||
},
|
||||
"duration_ms": 5432,
|
||||
"results": [
|
||||
{"test_id": "test_001", "passed": true, "duration_ms": 234},
|
||||
{"test_id": "test_002", "passed": false, "duration_ms": 567, "error_category": "IMPLEMENTATION_ERROR"},
|
||||
...
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Execution Flow
|
||||
|
||||
1. Load only APPROVED and MODIFIED tests (skip PENDING and REJECTED)
|
||||
2. Group tests by `parent_criteria_id` for shared fixture setup
|
||||
3. Run groups in parallel with process isolation
|
||||
4. Aggregate results with timing information (steps 1 and 2 are sketched below)
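
A rough sketch of steps 1 and 2, assuming each `Test` exposes an approval status and `parent_criteria_id` as described above; the attribute names here are assumptions, not the framework's exact API.

```python
from collections import defaultdict

def select_and_group(tests):
    """Keep only runnable tests, then group them so each criterion can share fixture setup."""
    runnable = [t for t in tests if t.status in ("approved", "modified")]  # skip pending/rejected
    groups = defaultdict(list)
    for t in runnable:
        groups[t.parent_criteria_id].append(t)  # one group per parent_criteria_id
    return groups
```
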
## Error Categorization & Iteration
|
||||
|
||||
### Decision Tree
|
||||
|
||||
```
|
||||
Test Fails → Categorize Error
|
||||
↓
|
||||
┌───────────┴─────────────────┬────────────────────┐
|
||||
│ │ │
|
||||
LOGIC ERROR IMPLEMENTATION ERROR EDGE CASE
|
||||
(criteria wrong) (code bug) (new scenario)
|
||||
│ │ │
|
||||
↓ ↓ ↓
|
||||
Update goal Fix nodes/edges Generate new
|
||||
success_criteria in Agent stage edge case test
|
||||
↓ ↓ │
|
||||
FULL 3-STEP Re-run Eval Continue in
|
||||
FLOW RESTART (skip Goal stage) Eval stage
|
||||
```
|
||||
|
||||
### Pattern-Based Heuristics
|
||||
|
||||
The categorizer uses these patterns to classify errors (a toy matcher is sketched after the lists):
|
||||
|
||||
**LOGIC_ERROR** (goal definition is wrong):
|
||||
- "goal not achieved"
|
||||
- "constraint violated: core"
|
||||
- "fundamental assumption"
|
||||
- "success criteria mismatch"
|
||||
- "expected behavior incorrect"
|
||||
|
||||
**IMPLEMENTATION_ERROR** (code bug in agent):
|
||||
- TypeError, AttributeError, KeyError, ValueError
|
||||
- "tool call failed"
|
||||
- "node execution error"
|
||||
- "assertion failed"
|
||||
- "null pointer", "undefined"
|
||||
|
||||
**EDGE_CASE** (new scenario discovered):
|
||||
- "boundary condition"
|
||||
- "timeout", "rate limit"
|
||||
- "empty result", "no results"
|
||||
- "unexpected format"
|
||||
- "rare input", "unusual"
### Iteration Guidance
|
||||
|
||||
```python
|
||||
# After categorization, you get guidance
|
||||
{
|
||||
"error_category": "IMPLEMENTATION_ERROR",
|
||||
"iteration_guidance": {
|
||||
"stage": "Agent",
|
||||
"action": "Fix the code in nodes/edges",
|
||||
"restart_required": false,
|
||||
"description": "The goal is correct, but the implementation has a bug. Fix the agent code and re-run Eval."
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
| Category | Go To Stage | Restart Required | Action |
|
||||
|----------|-------------|------------------|--------|
|
||||
| LOGIC_ERROR | Goal | Yes | Update success_criteria/constraints, rebuild agent |
|
||||
| IMPLEMENTATION_ERROR | Agent | No | Fix nodes/edges, re-run Eval only |
|
||||
| EDGE_CASE | Eval | No | Generate edge case test, continue in Eval |
|
||||
|
||||
## Debugging Failed Tests
|
||||
|
||||
### Debug Tool
|
||||
|
||||
```python
|
||||
# Get detailed debug info for a failed test
|
||||
result = debug_test(
|
||||
goal_id="youtube-research",
|
||||
test_id="test_find_videos_no_results"
|
||||
)
|
||||
|
||||
# Returns comprehensive debug info
|
||||
{
|
||||
"test_id": "test_find_videos_no_results",
|
||||
"test_name": "test_find_videos_no_results",
|
||||
"input": {"topic": "xyzabc123nonsense"},
|
||||
"expected": {"videos": [], "message": "No results found"},
|
||||
"actual": {"error": "NullPointerException at node_3"},
|
||||
"passed": false,
|
||||
"error_message": "TypeError: 'NoneType' has no attribute 'get'",
|
||||
"error_category": "IMPLEMENTATION_ERROR",
|
||||
"stack_trace": "Traceback (most recent call last):\n ...",
|
||||
"logs": [
|
||||
{"timestamp": "...", "node": "search_node", "level": "INFO", "msg": "..."},
|
||||
{"timestamp": "...", "node": "filter_node", "level": "ERROR", "msg": "..."}
|
||||
],
|
||||
"runtime_data": {
|
||||
"execution_path": ["start", "search_node", "filter_node"],
|
||||
"node_outputs": {...}
|
||||
},
|
||||
"suggested_fix": "Check null handling in filter_node when no results returned",
|
||||
"iteration_guidance": {
|
||||
"stage": "Agent",
|
||||
"action": "Fix the code in nodes/edges",
|
||||
"restart_required": false
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Debug Workflow
|
||||
|
||||
1. **Run all tests** → Get pass/fail summary
|
||||
2. **Select failed test** → Get detailed DebugInfo
|
||||
3. **Review categorization** → Understand error type
|
||||
4. **Check suggested fix** → Get actionable guidance
|
||||
5. **Follow iteration guidance** → Go to correct stage (see the driver sketch below)
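
A possible driver for this loop, assuming `run_tests` and `debug_test` return JSON strings shaped like the examples above (matching the MCP tools' `json.dumps` output).

```python
import json

run = json.loads(run_tests(
    goal_id="youtube-research",
    agent_path="exports/youtube-research",
    test_types='["all"]',
))

for r in run["results"]:
    if r["passed"]:
        continue
    info = json.loads(debug_test(goal_id="youtube-research", test_id=r["test_id"]))
    guidance = info["iteration_guidance"]
    print(f"{info['test_name']}: {info['error_category']}")
    print(f"  suggested fix: {info['suggested_fix']}")
    print(f"  next stage: {guidance['stage']} (restart required: {guidance['restart_required']})")
```
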
## Example: Testing YouTube Agent
|
||||
|
||||
See [examples/testing-youtube-agent.md](examples/testing-youtube-agent.md) for a complete walkthrough.
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Happy Path Tests
|
||||
Test normal successful execution with valid inputs:
|
||||
```python
|
||||
def test_find_videos_happy_path(agent):
|
||||
result = agent.run({"topic": "python tutorials"})
|
||||
assert result.success
|
||||
assert len(result.videos) >= 3
|
||||
assert all(v.title for v in result.videos)
|
||||
```
|
||||
|
||||
### Boundary Condition Tests
|
||||
Test exactly at target thresholds:
|
||||
```python
|
||||
def test_find_videos_minimum_count(agent):
|
||||
result = agent.run({"topic": "very specific niche topic"})
|
||||
assert len(result.videos) >= 1 # At least one result
|
||||
```
|
||||
|
||||
### Error Handling Tests
|
||||
Test graceful handling of failures:
|
||||
```python
|
||||
def test_find_videos_invalid_input(agent):
|
||||
result = agent.run({"topic": ""}) # Empty input
|
||||
assert not result.success or result.message == "Invalid input"
|
||||
```
|
||||
|
||||
### Constraint Violation Tests
|
||||
Test that constraints are respected:
|
||||
```python
|
||||
def test_api_rate_limit_respected(agent):
|
||||
# Run multiple times quickly
|
||||
for _ in range(5):
|
||||
result = agent.run({"topic": "test"})
|
||||
# Should not hit rate limit errors
|
||||
assert "rate limit" not in str(result).lower()
|
||||
```
|
||||
|
||||
## Anti-Patterns
|
||||
|
||||
| Don't | Do Instead |
|
||||
|-------|------------|
|
||||
| Auto-approve tests | Always require explicit user approval |
|
||||
| Run PENDING/REJECTED tests | Only run APPROVED/MODIFIED tests |
|
||||
| Generate success tests during Goal stage | Wait until agent exists |
|
||||
| Treat all failures the same | Categorize and iterate appropriately |
|
||||
| Restart full flow for IMPLEMENTATION_ERROR | Fix agent, re-run Eval only |
|
||||
| Add test for LOGIC_ERROR | Fix the goal definition instead |
|
||||
| Ignore confidence scores | Review low-confidence categorizations manually |
|
||||
| Skip the approval step | Tests must be reviewed before running |
|
||||
|
||||
## Tools Reference
|
||||
|
||||
### Testing Tools
|
||||
|
||||
| Tool | Purpose | When to Use |
|
||||
|------|---------|-------------|
|
||||
| `generate_constraint_tests` | Generate tests from goal constraints | Goal stage |
|
||||
| `generate_success_tests` | Generate tests from success criteria | Eval stage (after agent built) |
|
||||
| `approve_tests` | Approve/reject/modify generated tests | After generation |
|
||||
| `run_tests` | Execute tests in parallel | After approval |
|
||||
| `debug_test` | Analyze failed test with categorization | After test fails |
|
||||
| `list_tests` | List tests for a goal by status | Anytime |
|
||||
| `get_pending_tests` | Get tests awaiting approval | Before approval |
|
||||
|
||||
### Building Tools (for iteration)
|
||||
|
||||
When iteration requires modifying the agent, use these from the building-agents skill:
|
||||
|
||||
| Tool | Purpose | When to Use |
|
||||
|------|---------|-------------|
|
||||
| `set_goal` | Update goal definition | LOGIC_ERROR iteration |
|
||||
| `add_node` | Add or modify nodes | IMPLEMENTATION_ERROR iteration |
|
||||
| `add_edge` | Add or modify edges | IMPLEMENTATION_ERROR iteration |
|
||||
| `validate_graph` | Validate changes | After any modification |
|
||||
| `export_graph` | Re-export agent | After fixes complete |
|
||||
|
||||
## CLI Commands
|
||||
|
||||
```bash
|
||||
# Generate tests from goal
|
||||
python -m core test-generate goal.json --type all
|
||||
|
||||
# Interactive approval of pending tests
|
||||
python -m core test-approve <goal_id>
|
||||
|
||||
# Run tests for an agent
|
||||
python -m core test-run <agent_path> --goal <goal_id> --parallel 4
|
||||
|
||||
# Debug a failed test
|
||||
python -m core test-debug <goal_id> <test_id>
|
||||
|
||||
# List tests by status
|
||||
python -m core test-list <goal_id> --status approved
|
||||
|
||||
# Show test statistics
|
||||
python -m core test-stats <goal_id>
|
||||
```
|
||||
|
||||
## Integration with building-agents
|
||||
|
||||
### Handoff Points
|
||||
|
||||
| Scenario | From | To | Action |
|
||||
|----------|------|-----|--------|
|
||||
| Agent built, ready to test | building-agents | testing-agent | Generate success tests |
|
||||
| LOGIC_ERROR found | testing-agent | building-agents | Update goal, rebuild |
|
||||
| IMPLEMENTATION_ERROR found | testing-agent | building-agents | Fix nodes/edges |
|
||||
| EDGE_CASE found | testing-agent | testing-agent | Generate edge case test |
|
||||
| All tests pass | testing-agent | Done | Agent is validated |
|
||||
|
||||
### When to Switch Skills
|
||||
|
||||
**Use building-agents when:**
|
||||
- Defining goals and constraints
|
||||
- Building agent nodes and edges
|
||||
- Fixing LOGIC_ERROR or IMPLEMENTATION_ERROR
|
||||
|
||||
**Use testing-agent when:**
|
||||
- Generating tests from goals
|
||||
- Approving and running tests
|
||||
- Debugging failures
|
||||
- Categorizing errors
|
||||
|
||||
### Shared Patterns
|
||||
|
||||
Both skills use:
|
||||
- AskUserQuestion with structured options
|
||||
- HITL at every critical step
|
||||
- Fallback to text mode when widgets unavailable
|
||||
- Session state management for continuity
|
||||
@@ -0,0 +1,348 @@
|
||||
# Example: Testing a YouTube Research Agent
|
||||
|
||||
This example walks through testing a YouTube research agent that finds relevant videos based on a topic.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Agent built with building-agents skill at `exports/youtube-research/`
|
||||
- Goal defined with success criteria and constraints
|
||||
|
||||
## Step 1: Load the Goal
|
||||
|
||||
First, load the goal that was defined during the Goal stage:
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "youtube-research",
|
||||
"name": "YouTube Research Agent",
|
||||
"description": "Find relevant YouTube videos on a given topic",
|
||||
"success_criteria": [
|
||||
{
|
||||
"id": "find_videos",
|
||||
"description": "Find 3-5 relevant videos",
|
||||
"metric": "video_count",
|
||||
"target": "3-5",
|
||||
"weight": 1.0
|
||||
},
|
||||
{
|
||||
"id": "relevance",
|
||||
"description": "Videos must be relevant to the topic",
|
||||
"metric": "relevance_score",
|
||||
"target": ">0.8",
|
||||
"weight": 0.8
|
||||
}
|
||||
],
|
||||
"constraints": [
|
||||
{
|
||||
"id": "api_limits",
|
||||
"description": "Must not exceed YouTube API rate limits",
|
||||
"constraint_type": "hard",
|
||||
"category": "technical"
|
||||
},
|
||||
{
|
||||
"id": "content_safety",
|
||||
"description": "Must filter out inappropriate content",
|
||||
"constraint_type": "hard",
|
||||
"category": "safety"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Step 2: Generate Constraint Tests
|
||||
|
||||
During the Goal stage (or early Eval), generate tests for constraints:
|
||||
|
||||
```python
|
||||
result = generate_constraint_tests(
|
||||
goal_id="youtube-research",
|
||||
goal_json='<goal JSON above>'
|
||||
)
|
||||
```
|
||||
|
||||
**Generated tests (awaiting approval):**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Generated Constraint Tests (2 tests) │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ [1/2] test_constraint_api_limits_respected │
|
||||
│ Constraint: api_limits │
|
||||
│ Confidence: 88% │
|
||||
│ │
|
||||
│ def test_constraint_api_limits_respected(agent): │
|
||||
│ """Verify API rate limits are not exceeded.""" │
|
||||
│ import time │
|
||||
│ for i in range(10): │
|
||||
│ result = agent.run({"topic": f"test_{i}"}) │
|
||||
│ time.sleep(0.1) │
|
||||
│ # Should complete without rate limit errors │
|
||||
│ assert "rate limit" not in str(result).lower() │
|
||||
│ │
|
||||
│ [a]pprove [r]eject [e]dit [s]kip │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ [2/2] test_constraint_content_safety_filter │
|
||||
│ Constraint: content_safety │
|
||||
│ Confidence: 91% │
|
||||
│ │
|
||||
│ def test_constraint_content_safety_filter(agent): │
|
||||
│ """Verify inappropriate content is filtered.""" │
|
||||
│ result = agent.run({"topic": "general topic"}) │
|
||||
│ for video in result.videos: │
|
||||
│ assert video.safe_for_work is True │
|
||||
│ assert video.age_restricted is False │
|
||||
│ │
|
||||
│ [a]pprove [r]eject [e]dit [s]kip │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Step 3: Approve Constraint Tests
|
||||
|
||||
Review and approve each test:
|
||||
|
||||
```python
|
||||
result = approve_tests(
|
||||
goal_id="youtube-research",
|
||||
approvals='[
|
||||
{"test_id": "test_constraint_api_001", "action": "approve"},
|
||||
{"test_id": "test_constraint_content_001", "action": "approve"}
|
||||
]'
|
||||
)
|
||||
```
|
||||
|
||||
## Step 4: Generate Success Criteria Tests
|
||||
|
||||
After the agent is built, generate success criteria tests:
|
||||
|
||||
```python
|
||||
result = generate_success_tests(
|
||||
goal_id="youtube-research",
|
||||
goal_json='<goal JSON>',
|
||||
node_names="search_node,filter_node,rank_node,format_node",
|
||||
tool_names="youtube_search,video_details,channel_info"
|
||||
)
|
||||
```
|
||||
|
||||
**Generated tests (awaiting approval):**
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Generated Success Criteria Tests (4 tests) │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ [1/4] test_find_videos_happy_path │
|
||||
│ Criteria: find_videos │
|
||||
│ Confidence: 95% │
|
||||
│ │
|
||||
│ def test_find_videos_happy_path(agent): │
|
||||
│ """Test finding videos for a common topic.""" │
|
||||
│ result = agent.run({"topic": "machine learning"}) │
|
||||
│ assert result.success │
|
||||
│ assert 3 <= len(result.videos) <= 5 │
|
||||
│ assert all(v.title for v in result.videos) │
|
||||
│ assert all(v.video_id for v in result.videos) │
|
||||
│ │
|
||||
│ [a]pprove [r]eject [e]dit [s]kip │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ [2/4] test_find_videos_minimum_boundary │
|
||||
│ Criteria: find_videos │
|
||||
│ Confidence: 87% │
|
||||
│ │
|
||||
│ def test_find_videos_minimum_boundary(agent): │
|
||||
│ """Test at minimum threshold (3 videos).""" │
|
||||
│ result = agent.run({"topic": "niche topic xyz"}) │
|
||||
│ assert len(result.videos) >= 3 │
|
||||
│ │
|
||||
│ [a]pprove [r]eject [e]dit [s]kip │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ [3/4] test_relevance_score_threshold │
|
||||
│ Criteria: relevance │
|
||||
│ Confidence: 92% │
|
||||
│ │
|
||||
│ def test_relevance_score_threshold(agent): │
|
||||
│ """Test relevance scoring meets threshold.""" │
|
||||
│ result = agent.run({"topic": "python programming"}) │
|
||||
│ for video in result.videos: │
|
||||
│ assert video.relevance_score > 0.8 │
|
||||
│ │
|
||||
│ [a]pprove [r]eject [e]dit [s]kip │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ [4/4] test_find_videos_no_results_graceful │
|
||||
│ Criteria: find_videos │
|
||||
│ Confidence: 84% │
|
||||
│ │
|
||||
│ def test_find_videos_no_results_graceful(agent): │
|
||||
│ """Test graceful handling of no results.""" │
|
||||
│ result = agent.run({"topic": "xyznonexistent123"}) │
|
||||
│ # Should not crash, return empty or message │
|
||||
│ assert result.videos == [] or result.message │
|
||||
│ │
|
||||
│ [a]pprove [r]eject [e]dit [s]kip │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Step 5: Approve Success Criteria Tests
|
||||
|
||||
```python
|
||||
result = approve_tests(
|
||||
goal_id="youtube-research",
|
||||
approvals='[
|
||||
{"test_id": "test_success_001", "action": "approve"},
|
||||
{"test_id": "test_success_002", "action": "approve"},
|
||||
{"test_id": "test_success_003", "action": "approve"},
|
||||
{"test_id": "test_success_004", "action": "approve"}
|
||||
]'
|
||||
)
|
||||
```
|
||||
|
||||
## Step 6: Run All Tests
|
||||
|
||||
Execute all approved tests:
|
||||
|
||||
```python
|
||||
result = run_tests(
|
||||
goal_id="youtube-research",
|
||||
agent_path="exports/youtube-research",
|
||||
test_types='["all"]',
|
||||
parallel=4
|
||||
)
|
||||
```
|
||||
|
||||
**Results:**
|
||||
|
||||
```json
|
||||
{
|
||||
"goal_id": "youtube-research",
|
||||
"overall_passed": false,
|
||||
"summary": {
|
||||
"total": 6,
|
||||
"passed": 5,
|
||||
"failed": 1,
|
||||
"pass_rate": "83.3%"
|
||||
},
|
||||
"duration_ms": 4521,
|
||||
"results": [
|
||||
{"test_id": "test_constraint_api_001", "passed": true, "duration_ms": 1234},
|
||||
{"test_id": "test_constraint_content_001", "passed": true, "duration_ms": 456},
|
||||
{"test_id": "test_success_001", "passed": true, "duration_ms": 789},
|
||||
{"test_id": "test_success_002", "passed": true, "duration_ms": 654},
|
||||
{"test_id": "test_success_003", "passed": true, "duration_ms": 543},
|
||||
{"test_id": "test_success_004", "passed": false, "duration_ms": 845,
|
||||
"error_category": "IMPLEMENTATION_ERROR",
|
||||
"error_message": "TypeError: 'NoneType' object has no attribute 'videos'"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Step 7: Debug the Failed Test
|
||||
|
||||
```python
|
||||
result = debug_test(
|
||||
goal_id="youtube-research",
|
||||
test_id="test_success_004"
|
||||
)
|
||||
```
|
||||
|
||||
**Debug Output:**
|
||||
|
||||
```json
|
||||
{
|
||||
"test_id": "test_success_004",
|
||||
"test_name": "test_find_videos_no_results_graceful",
|
||||
"input": {"topic": "xyznonexistent123"},
|
||||
"expected": "Empty list or message",
|
||||
"actual": {"error": "TypeError: 'NoneType' object has no attribute 'videos'"},
|
||||
"passed": false,
|
||||
"error_message": "TypeError: 'NoneType' object has no attribute 'videos'",
|
||||
"error_category": "IMPLEMENTATION_ERROR",
|
||||
"stack_trace": "Traceback (most recent call last):\n File \"filter_node.py\", line 42\n for video in result.videos:\nTypeError: 'NoneType' object has no attribute 'videos'",
|
||||
"logs": [
|
||||
{"timestamp": "2026-01-20T10:00:01", "node": "search_node", "level": "INFO", "msg": "Searching for: xyznonexistent123"},
|
||||
{"timestamp": "2026-01-20T10:00:02", "node": "search_node", "level": "WARNING", "msg": "No results found"},
|
||||
{"timestamp": "2026-01-20T10:00:02", "node": "filter_node", "level": "ERROR", "msg": "NoneType error"}
|
||||
],
|
||||
"runtime_data": {
|
||||
"execution_path": ["start", "search_node", "filter_node"],
|
||||
"node_outputs": {
|
||||
"search_node": null
|
||||
}
|
||||
},
|
||||
"suggested_fix": "Add null check in filter_node before accessing .videos attribute",
|
||||
"iteration_guidance": {
|
||||
"stage": "Agent",
|
||||
"action": "Fix the code in nodes/edges",
|
||||
"restart_required": false,
|
||||
"description": "The goal is correct, but filter_node doesn't handle null results from search_node."
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Step 8: Iterate Based on Category
|
||||
|
||||
Since this is an **IMPLEMENTATION_ERROR**, we:
|
||||
|
||||
1. **Don't restart** the Goal → Agent → Eval flow
|
||||
2. **Fix the agent** using building-agents skill:
|
||||
- Modify `filter_node` to handle null results
|
||||
3. **Re-run Eval** (tests only)
|
||||
|
||||
### Fix in building-agents:
|
||||
|
||||
```python
|
||||
# Update the filter_node to handle null
|
||||
add_node(
|
||||
node_id="filter_node",
|
||||
name="Filter Node",
|
||||
description="Filter and rank videos",
|
||||
node_type="function",
|
||||
input_keys=["search_results"],
|
||||
output_keys=["filtered_videos"],
|
||||
system_prompt="""
|
||||
Filter videos by relevance.
|
||||
IMPORTANT: Handle case where search_results is None or empty.
|
||||
Return empty list if no results.
|
||||
"""
|
||||
)
|
||||
```
|
||||
|
||||
### Re-export and re-test:
|
||||
|
||||
```python
|
||||
# Re-export the fixed agent
|
||||
export_graph(path="exports/youtube-research")
|
||||
|
||||
# Re-run tests
|
||||
result = run_tests(
|
||||
goal_id="youtube-research",
|
||||
agent_path="exports/youtube-research",
|
||||
test_types='["all"]'
|
||||
)
|
||||
```
|
||||
|
||||
**Updated Results:**
|
||||
|
||||
```json
|
||||
{
|
||||
"goal_id": "youtube-research",
|
||||
"overall_passed": true,
|
||||
"summary": {
|
||||
"total": 6,
|
||||
"passed": 6,
|
||||
"failed": 0,
|
||||
"pass_rate": "100.0%"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Summary
|
||||
|
||||
1. **Generated** constraint tests during Goal stage
|
||||
2. **Generated** success criteria tests during Eval stage
|
||||
3. **Approved** all tests with user review
|
||||
4. **Ran** tests in parallel
|
||||
5. **Debugged** the one failure
|
||||
6. **Categorized** as IMPLEMENTATION_ERROR
|
||||
7. **Fixed** the agent (not the goal)
|
||||
8. **Re-ran** Eval only (didn't restart full flow)
|
||||
9. **Passed** all tests
|
||||
|
||||
The agent is now validated and ready for production use.
|
||||
@@ -128,6 +128,29 @@ runtime.record_outcome(
|
||||
runtime.end_run(success=True, narrative="Successfully processed all data")
|
||||
```
|
||||
|
||||
### Testing Agents
|
||||
|
||||
The framework includes a goal-based testing framework for validating agent behavior.
|
||||
|
||||
```bash
|
||||
# Generate tests from a goal definition
|
||||
python -m framework test-generate goal.json
|
||||
|
||||
# Interactively approve generated tests
|
||||
python -m framework test-approve <goal_id>
|
||||
|
||||
# Run tests against an agent
|
||||
python -m framework test-run <agent_path> --parallel 4
|
||||
|
||||
# Debug failed tests
|
||||
python -m framework test-debug <goal_id> <test_id>
|
||||
|
||||
# List tests by status
|
||||
python -m framework test-list <goal_id>
|
||||
```
|
||||
|
||||
For detailed testing workflows, see the [testing-agent skill](.claude/skills/testing-agent/SKILL.md).
|
||||
|
||||
### Analyzing with Builder
|
||||
|
||||
```python
|
||||
|
||||
@@ -10,6 +10,16 @@ choice the agent makes is captured with:
|
||||
- Whether that was good or bad (evaluated post-hoc)
|
||||
|
||||
This gives the Builder LLM the information it needs to improve agent behavior.
|
||||
|
||||
## Testing Framework
|
||||
|
||||
The framework includes a Goal-Based Testing system (Goal → Agent → Eval):
|
||||
- Generate tests from Goal success_criteria and constraints
|
||||
- Mandatory user approval before tests are stored
|
||||
- Parallel test execution with error categorization
|
||||
- Debug tools with fix suggestions
|
||||
|
||||
See `framework.testing` for details.
|
||||
"""
|
||||
|
||||
from framework.schemas.decision import Decision, Option, Outcome, DecisionEvaluation
|
||||
@@ -19,6 +29,21 @@ from framework.builder.query import BuilderQuery
|
||||
from framework.llm import LLMProvider, AnthropicProvider
|
||||
from framework.runner import AgentRunner, AgentOrchestrator
|
||||
|
||||
# Testing framework
|
||||
from framework.testing import (
|
||||
Test,
|
||||
TestResult,
|
||||
TestSuiteResult,
|
||||
TestStorage,
|
||||
ApprovalStatus,
|
||||
ErrorCategory,
|
||||
ConstraintTestGenerator,
|
||||
SuccessCriteriaTestGenerator,
|
||||
ParallelTestRunner,
|
||||
ParallelConfig,
|
||||
DebugTool,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Schemas
|
||||
"Decision",
|
||||
@@ -38,4 +63,16 @@ __all__ = [
|
||||
# Runner
|
||||
"AgentRunner",
|
||||
"AgentOrchestrator",
|
||||
# Testing
|
||||
"Test",
|
||||
"TestResult",
|
||||
"TestSuiteResult",
|
||||
"TestStorage",
|
||||
"ApprovalStatus",
|
||||
"ErrorCategory",
|
||||
"ConstraintTestGenerator",
|
||||
"SuccessCriteriaTestGenerator",
|
||||
"ParallelTestRunner",
|
||||
"ParallelConfig",
|
||||
"DebugTool",
|
||||
]
|
||||
|
||||
@@ -8,6 +8,14 @@ Usage:
|
||||
python -m core list exports/
|
||||
python -m core dispatch exports/ --input '{"key": "value"}'
|
||||
python -m core shell exports/my-agent
|
||||
|
||||
Testing commands:
|
||||
python -m core test-generate goal.json
|
||||
python -m core test-approve <goal_id>
|
||||
python -m core test-run <agent_path> --goal <goal_id>
|
||||
python -m core test-debug <goal_id> <test_id>
|
||||
python -m core test-list <goal_id>
|
||||
python -m core test-stats <goal_id>
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -20,7 +28,7 @@ def main():
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
default="claude-sonnet-4-20250514",
|
||||
default="claude-haiku-4-5-20251001",
|
||||
help="Anthropic model to use",
|
||||
)
|
||||
|
||||
@@ -30,6 +38,10 @@ def main():
|
||||
from framework.runner.cli import register_commands
|
||||
register_commands(subparsers)
|
||||
|
||||
# Register testing commands (test-generate, test-approve, test-run, test-debug, etc.)
|
||||
from framework.testing.cli import register_testing_commands
|
||||
register_testing_commands(subparsers)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if hasattr(args, "func"):
|
||||
|
||||
@@ -340,7 +340,7 @@ class GraphSpec(BaseModel):
|
||||
)
|
||||
|
||||
# Default LLM settings
|
||||
default_model: str = "claude-haiku-4-5-20251001"
|
||||
max_tokens: int = 1024
|
||||
|
||||
# Execution limits
|
||||
|
||||
@@ -76,6 +76,7 @@ class Constraint(BaseModel):
|
||||
description="Category: 'time', 'cost', 'safety', 'scope', 'quality'"
|
||||
)
|
||||
check: str = Field(
|
||||
default="",
|
||||
description="How to check: expression, function name, or 'llm_judge'"
|
||||
)
|
||||
|
||||
|
||||
@@ -18,14 +18,14 @@ class AnthropicProvider(LLMProvider):
|
||||
def __init__(
|
||||
self,
|
||||
api_key: str | None = None,
|
||||
model: str = "claude-sonnet-4-20250514",
|
||||
model: str = "claude-haiku-4-5-20251001",
|
||||
):
|
||||
"""
|
||||
Initialize the Anthropic provider.
|
||||
|
||||
Args:
|
||||
api_key: Anthropic API key. If not provided, uses ANTHROPIC_API_KEY env var.
|
||||
model: Model to use (default: claude-haiku-4-5-20251001)
|
||||
"""
|
||||
self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
|
||||
if not self.api_key:
|
||||
|
||||
@@ -9,6 +9,7 @@ Usage:
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Annotated
|
||||
|
||||
from mcp.server import FastMCP
|
||||
@@ -16,6 +17,15 @@ from mcp.server import FastMCP
|
||||
from framework.graph import Goal, SuccessCriterion, Constraint, NodeSpec, EdgeSpec, EdgeCondition
|
||||
from framework.graph.edge import GraphSpec
|
||||
|
||||
# Testing framework imports
|
||||
from framework.testing.test_case import Test, ApprovalStatus, TestType
|
||||
from framework.testing.test_storage import TestStorage
|
||||
from framework.testing.constraint_gen import ConstraintTestGenerator
|
||||
from framework.testing.success_gen import SuccessCriteriaTestGenerator
|
||||
from framework.testing.approval_types import ApprovalRequest, ApprovalAction
|
||||
from framework.testing.debug_tool import DebugTool
|
||||
from framework.testing.parallel import AgentFactory
|
||||
|
||||
|
||||
# Initialize MCP server
|
||||
mcp = FastMCP("agent-builder")
|
||||
@@ -1408,6 +1418,387 @@ def simulate_plan_execution(
|
||||
}, indent=2)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TESTING TOOLS (Goal-Based Evaluation)
|
||||
# =============================================================================
|
||||
|
||||
# Session storage for pending tests (not yet persisted)
|
||||
_pending_tests: dict[str, list[Test]] = {}
|
||||
|
||||
# Default storage path for tests
|
||||
DEFAULT_TEST_STORAGE_PATH = Path("data/tests")
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def generate_constraint_tests(
|
||||
goal_id: Annotated[str, "ID of the goal to generate tests for"],
|
||||
goal_json: Annotated[str, """JSON string of the Goal object. Constraint fields:
|
||||
- id: string (required)
|
||||
- description: string (required)
|
||||
- constraint_type: "hard" or "soft" (required)
|
||||
- category: string (optional, default: "general")
|
||||
- check: string (optional, how to validate: "llm_judge", expression, or function name)"""],
|
||||
) -> str:
|
||||
"""
|
||||
Generate constraint tests for a goal.
|
||||
|
||||
Returns proposals for user approval. Tests are NOT persisted until approved.
|
||||
"""
|
||||
try:
|
||||
goal = Goal.model_validate_json(goal_json)
|
||||
except Exception as e:
|
||||
return json.dumps({"error": f"Invalid goal JSON: {e}"})
|
||||
|
||||
# Get LLM provider
|
||||
try:
|
||||
from framework.llm import AnthropicProvider
|
||||
llm = AnthropicProvider()
|
||||
except Exception as e:
|
||||
return json.dumps({"error": f"Failed to initialize LLM: {e}"})
|
||||
|
||||
# Generate tests
|
||||
generator = ConstraintTestGenerator(llm)
|
||||
tests = generator.generate(goal)
|
||||
|
||||
# Store as pending (not persisted yet)
|
||||
_pending_tests[goal_id] = tests
|
||||
|
||||
return json.dumps({
|
||||
"goal_id": goal_id,
|
||||
"generated_count": len(tests),
|
||||
"tests": [
|
||||
{
|
||||
"id": t.id,
|
||||
"test_name": t.test_name,
|
||||
"parent_criteria_id": t.parent_criteria_id,
|
||||
"description": t.description,
|
||||
"confidence": t.llm_confidence,
|
||||
"test_code_preview": t.test_code[:500] + "..." if len(t.test_code) > 500 else t.test_code,
|
||||
}
|
||||
for t in tests
|
||||
],
|
||||
"next_step": "Call approve_tests to approve, modify, or reject each test",
|
||||
})
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def generate_success_tests(
|
||||
goal_id: Annotated[str, "ID of the goal to generate tests for"],
|
||||
goal_json: Annotated[str, "JSON string of the Goal object"],
|
||||
node_names: Annotated[str, "Comma-separated list of agent node names"] = "",
|
||||
tool_names: Annotated[str, "Comma-separated list of available tool names"] = "",
|
||||
) -> str:
|
||||
"""
|
||||
Generate success criteria tests for a goal.
|
||||
|
||||
Should be called during Eval stage after agent exists.
|
||||
Returns proposals for user approval.
|
||||
"""
|
||||
try:
|
||||
goal = Goal.model_validate_json(goal_json)
|
||||
except Exception as e:
|
||||
return json.dumps({"error": f"Invalid goal JSON: {e}"})
|
||||
|
||||
# Get LLM provider
|
||||
try:
|
||||
from framework.llm import AnthropicProvider
|
||||
llm = AnthropicProvider()
|
||||
except Exception as e:
|
||||
return json.dumps({"error": f"Failed to initialize LLM: {e}"})
|
||||
|
||||
# Parse node/tool names
|
||||
nodes = [n.strip() for n in node_names.split(",") if n.strip()]
|
||||
tools = [t.strip() for t in tool_names.split(",") if t.strip()]
|
||||
|
||||
# Generate tests
|
||||
generator = SuccessCriteriaTestGenerator(llm)
|
||||
tests = generator.generate(goal, node_names=nodes, tool_names=tools)
|
||||
|
||||
# Add to pending (may have constraint tests already)
|
||||
if goal_id in _pending_tests:
|
||||
_pending_tests[goal_id].extend(tests)
|
||||
else:
|
||||
_pending_tests[goal_id] = tests
|
||||
|
||||
return json.dumps({
|
||||
"goal_id": goal_id,
|
||||
"generated_count": len(tests),
|
||||
"tests": [
|
||||
{
|
||||
"id": t.id,
|
||||
"test_name": t.test_name,
|
||||
"parent_criteria_id": t.parent_criteria_id,
|
||||
"description": t.description,
|
||||
"confidence": t.llm_confidence,
|
||||
"test_code_preview": t.test_code[:500] + "..." if len(t.test_code) > 500 else t.test_code,
|
||||
}
|
||||
for t in tests
|
||||
],
|
||||
"next_step": "Call approve_tests to approve, modify, or reject each test",
|
||||
})
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def approve_tests(
|
||||
goal_id: Annotated[str, "ID of the goal"],
|
||||
approvals: Annotated[str, "JSON array of approval decisions"],
|
||||
) -> str:
|
||||
"""
|
||||
Approve, reject, or modify generated tests.
|
||||
|
||||
Approvals format:
|
||||
[
|
||||
{"test_id": "...", "action": "approve"},
|
||||
{"test_id": "...", "action": "modify", "modified_code": "..."},
|
||||
{"test_id": "...", "action": "reject", "reason": "..."},
|
||||
{"test_id": "...", "action": "skip"}
|
||||
]
|
||||
|
||||
Actions: approve, modify (requires modified_code), reject (requires reason), skip
|
||||
"""
|
||||
if goal_id not in _pending_tests:
|
||||
return json.dumps({"error": f"No pending tests for goal {goal_id}"})
|
||||
|
||||
try:
|
||||
approvals_list = json.loads(approvals)
|
||||
except json.JSONDecodeError as e:
|
||||
return json.dumps({"error": f"Invalid approvals JSON: {e}"})
|
||||
|
||||
# Create storage
|
||||
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
|
||||
|
||||
# Build approval requests
|
||||
requests = []
|
||||
for a in approvals_list:
|
||||
try:
|
||||
action = ApprovalAction(a.get("action", "skip"))
|
||||
requests.append(ApprovalRequest(
|
||||
test_id=a["test_id"],
|
||||
action=action,
|
||||
modified_code=a.get("modified_code"),
|
||||
reason=a.get("reason"),
|
||||
approved_by="mcp_user",
|
||||
))
|
||||
except (KeyError, ValueError) as e:
|
||||
return json.dumps({"error": f"Invalid approval entry: {e}"})
|
||||
|
||||
# Find and save approved tests
|
||||
pending = {t.id: t for t in _pending_tests[goal_id]}
|
||||
|
||||
results = []
|
||||
for req in requests:
|
||||
test = pending.get(req.test_id)
|
||||
if not test:
|
||||
results.append({"test_id": req.test_id, "error": "Not found in pending"})
|
||||
continue
|
||||
|
||||
if req.action == ApprovalAction.APPROVE:
|
||||
test.approve(req.approved_by)
|
||||
storage.save_test(test)
|
||||
results.append({"test_id": req.test_id, "status": "approved"})
|
||||
|
||||
elif req.action == ApprovalAction.MODIFY:
|
||||
if req.modified_code:
|
||||
test.modify(req.modified_code, req.approved_by)
|
||||
storage.save_test(test)
|
||||
results.append({"test_id": req.test_id, "status": "modified"})
|
||||
else:
|
||||
results.append({"test_id": req.test_id, "error": "modified_code required"})
|
||||
|
||||
elif req.action == ApprovalAction.REJECT:
|
||||
test.reject(req.reason or "No reason provided")
|
||||
storage.save_test(test)
|
||||
results.append({"test_id": req.test_id, "status": "rejected"})
|
||||
|
||||
elif req.action == ApprovalAction.SKIP:
|
||||
results.append({"test_id": req.test_id, "status": "skipped"})
|
||||
|
||||
# Clear pending for processed tests (skipped tests stay pending for later review)
processed_ids = {r["test_id"] for r in results if "error" not in r and r.get("status") != "skipped"}
|
||||
_pending_tests[goal_id] = [t for t in _pending_tests[goal_id] if t.id not in processed_ids]
|
||||
|
||||
# Clean up if empty
|
||||
if not _pending_tests[goal_id]:
|
||||
del _pending_tests[goal_id]
|
||||
|
||||
return json.dumps({"goal_id": goal_id, "results": results})
|
||||
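# Illustrative only (not part of the server code): a sketch of an approvals payload for
# approve_tests. Test IDs are invented placeholders; real IDs come from the
# generate_constraint_tests / generate_success_tests responses above.
#
#   example_approvals = json.dumps([
#       {"test_id": "test_a1b2c3d4", "action": "approve"},
#       {"test_id": "test_e5f6a7b8", "action": "modify",
#        "modified_code": "def test_constraint_api_limits_respected(agent):\n    ..."},
#       {"test_id": "test_c9d0e1f2", "action": "reject", "reason": "Duplicates an existing test"},
#       {"test_id": "test_d3e4f5a6", "action": "skip"},
#   ])
#   approve_tests(goal_id="goal_demo", approvals=example_approvals)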
|
||||
|
||||
@mcp.tool()
|
||||
def run_tests(
|
||||
goal_id: Annotated[str, "ID of the goal to test"],
|
||||
agent_path: Annotated[str, "Path to the agent export folder"],
|
||||
test_types: Annotated[str, 'JSON array of test types: ["constraint", "outcome", "edge_case", "all"]'] = '["all"]',
|
||||
parallel: Annotated[int, "Number of parallel workers (0 for sequential)"] = 0,
|
||||
fail_fast: Annotated[bool, "Stop on first failure"] = False,
|
||||
) -> str:
|
||||
"""
|
||||
Run evaluation tests for a goal.
|
||||
|
||||
Returns pass/fail summary with detailed results for each test.
|
||||
"""
|
||||
from framework.testing.parallel import ParallelTestRunner, ParallelConfig
|
||||
|
||||
# Parse test types
|
||||
try:
|
||||
types_list = json.loads(test_types)
|
||||
except json.JSONDecodeError:
|
||||
types_list = ["all"]
|
||||
|
||||
# Load storage
|
||||
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
|
||||
|
||||
# Get approved tests
|
||||
tests = storage.get_approved_tests(goal_id)
|
||||
|
||||
# Filter by type if not "all"
|
||||
if "all" not in types_list:
|
||||
type_map = {
|
||||
"constraint": TestType.CONSTRAINT,
|
||||
"outcome": TestType.SUCCESS_CRITERIA,
|
||||
"edge_case": TestType.EDGE_CASE,
|
||||
}
|
||||
filter_types = {type_map.get(t) for t in types_list if t in type_map}
|
||||
tests = [t for t in tests if t.test_type in filter_types]
|
||||
|
||||
if not tests:
|
||||
return json.dumps({
|
||||
"goal_id": goal_id,
|
||||
"error": "No approved tests found",
|
||||
"hint": "Generate and approve tests first using generate_constraint_tests and approve_tests",
|
||||
})
|
||||
|
||||
# Configure runner
|
||||
config = ParallelConfig(
|
||||
num_workers=parallel if parallel > 0 else 1,
|
||||
fail_fast=fail_fast,
|
||||
)
|
||||
|
||||
# Run tests - use AgentFactory for picklable parallel execution
|
||||
runner = ParallelTestRunner(config, storage)
|
||||
result = runner.run_all(
|
||||
goal_id=goal_id,
|
||||
agent_factory=AgentFactory(agent_path),
|
||||
tests=tests,
|
||||
)
|
||||
|
||||
return json.dumps({
|
||||
"goal_id": goal_id,
|
||||
"overall_passed": result.all_passed,
|
||||
"summary": {
|
||||
"total": result.total,
|
||||
"passed": result.passed,
|
||||
"failed": result.failed,
|
||||
"pass_rate": f"{result.pass_rate:.1%}",
|
||||
},
|
||||
"duration_ms": result.duration_ms,
|
||||
"results": [r.summary_dict() for r in result.results],
|
||||
})
|
||||
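# Illustrative only: a sketch of a run_tests call and the shape of its JSON reply, based
# on the fields assembled above. The agent path and all numbers are invented.
#
#   reply = json.loads(run_tests(
#       goal_id="goal_demo",
#       agent_path="exports/research-agent",
#       test_types='["constraint"]',
#       parallel=4,
#   ))
#   # reply["summary"] -> {"total": 8, "passed": 7, "failed": 1, "pass_rate": "87.5%"}
#   # reply["results"] -> one summary_dict() per executed test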
|
||||
|
||||
@mcp.tool()
|
||||
def debug_test(
|
||||
goal_id: Annotated[str, "ID of the goal"],
|
||||
test_id: Annotated[str, "ID of the failed test"],
|
||||
run_id: Annotated[str, "Optional Runtime run ID for detailed logs"] = "",
|
||||
) -> str:
|
||||
"""
|
||||
Get detailed debug info for a failed test.
|
||||
|
||||
Includes error categorization, logs, and fix suggestions.
|
||||
"""
|
||||
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
|
||||
|
||||
# Optionally load runtime storage
|
||||
runtime_storage = None
|
||||
try:
|
||||
from framework.storage.backend import FileStorage
|
||||
runtime_storage = FileStorage(f"data/runtime/{goal_id}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
debug_tool = DebugTool(storage, runtime_storage)
|
||||
info = debug_tool.analyze(goal_id, test_id, run_id or None)
|
||||
|
||||
return json.dumps(info.to_dict(), indent=2, default=str)
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def list_tests(
|
||||
goal_id: Annotated[str, "ID of the goal"],
|
||||
status: Annotated[str, "Filter by approval status: pending, approved, modified, rejected, all"] = "all",
|
||||
) -> str:
|
||||
"""
|
||||
List tests for a goal.
|
||||
|
||||
Returns test metadata without full code (use get_pending_tests or debug_test for full details).
|
||||
"""
|
||||
storage = TestStorage(DEFAULT_TEST_STORAGE_PATH / goal_id)
|
||||
tests = storage.get_tests_by_goal(goal_id)
|
||||
|
||||
# Filter by status
|
||||
if status != "all":
|
||||
try:
|
||||
filter_status = ApprovalStatus(status)
|
||||
tests = [t for t in tests if t.approval_status == filter_status]
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return json.dumps({
|
||||
"goal_id": goal_id,
|
||||
"total": len(tests),
|
||||
"tests": [
|
||||
{
|
||||
"id": t.id,
|
||||
"test_name": t.test_name,
|
||||
"test_type": t.test_type.value,
|
||||
"parent_criteria_id": t.parent_criteria_id,
|
||||
"approval_status": t.approval_status.value,
|
||||
"last_result": t.last_result,
|
||||
"confidence": t.llm_confidence,
|
||||
}
|
||||
for t in tests
|
||||
],
|
||||
})
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def get_pending_tests(
|
||||
goal_id: Annotated[str, "ID of the goal"],
|
||||
) -> str:
|
||||
"""
|
||||
Get pending tests awaiting approval.
|
||||
|
||||
Returns tests that have been generated but not yet approved.
|
||||
"""
|
||||
if goal_id not in _pending_tests:
|
||||
return json.dumps({
|
||||
"goal_id": goal_id,
|
||||
"pending_count": 0,
|
||||
"tests": [],
|
||||
})
|
||||
|
||||
tests = _pending_tests[goal_id]
|
||||
return json.dumps({
|
||||
"goal_id": goal_id,
|
||||
"pending_count": len(tests),
|
||||
"tests": [
|
||||
{
|
||||
"id": t.id,
|
||||
"test_name": t.test_name,
|
||||
"test_type": t.test_type.value,
|
||||
"parent_criteria_id": t.parent_criteria_id,
|
||||
"description": t.description,
|
||||
"confidence": t.llm_confidence,
|
||||
"test_code": t.test_code,
|
||||
"input": t.input,
|
||||
"expected_output": t.expected_output,
|
||||
}
|
||||
for t in tests
|
||||
],
|
||||
})
|
||||
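# Illustrative only: the intended call order for the testing tools above, matching the
# Goal → Agent → Eval flow. IDs and paths are placeholders.
#
#   generate_constraint_tests(goal_id, goal_json)      # Goal stage: propose constraint tests
#   get_pending_tests(goal_id)                         # review proposals (full test code)
#   approve_tests(goal_id, approvals_json)             # persist approved tests
#   generate_success_tests(goal_id, goal_json)         # Eval stage: once the agent exists
#   approve_tests(goal_id, approvals_json)
#   run_tests(goal_id, agent_path)                     # constraint + success criteria tests
#   debug_test(goal_id, failed_test_id)                # categorize failures and suggest fixes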
|
||||
|
||||
# =============================================================================
|
||||
# PLAN LOADING AND EXECUTION
|
||||
# =============================================================================
|
||||
|
||||
@@ -189,7 +189,7 @@ def cmd_run(args: argparse.Namespace) -> int:
|
||||
runner = AgentRunner.load(
|
||||
args.agent_path,
|
||||
mock_mode=args.mock,
|
||||
model=getattr(args, "model", "claude-sonnet-4-20250514"),
|
||||
model=getattr(args, "model", "claude-haiku-4-5-20251001"),
|
||||
)
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
|
||||
@@ -57,7 +57,7 @@ class AgentOrchestrator:
|
||||
def __init__(
|
||||
self,
|
||||
llm: LLMProvider | None = None,
|
||||
model: str = "claude-sonnet-4-20250514",
|
||||
model: str = "claude-haiku-4-5-20251001",
|
||||
):
|
||||
"""
|
||||
Initialize the orchestrator.
|
||||
|
||||
@@ -171,7 +171,7 @@ class AgentRunner:
|
||||
goal: Goal,
|
||||
mock_mode: bool = False,
|
||||
storage_path: Path | None = None,
|
||||
model: str = "claude-sonnet-4-20250514",
|
||||
model: str = "claude-haiku-4-5-20251001",
|
||||
):
|
||||
"""
|
||||
Initialize the runner (use AgentRunner.load() instead).
|
||||
@@ -216,7 +216,7 @@ class AgentRunner:
|
||||
agent_path: str | Path,
|
||||
mock_mode: bool = False,
|
||||
storage_path: Path | None = None,
|
||||
model: str = "claude-sonnet-4-20250514",
|
||||
model: str = "claude-haiku-4-5-20251001",
|
||||
) -> "AgentRunner":
|
||||
"""
|
||||
Load an agent from an export folder.
|
||||
|
||||
@@ -9,12 +9,15 @@ handles all the structured logging.
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
from framework.schemas.decision import Decision, Option, Outcome, DecisionType
|
||||
from framework.schemas.run import Run, RunStatus
|
||||
from framework.storage.backend import FileStorage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Runtime:
|
||||
"""
|
||||
@@ -100,7 +103,10 @@ class Runtime:
|
||||
output_data: Final output of the run
|
||||
"""
|
||||
if self._current_run is None:
|
||||
raise RuntimeError("No run in progress")
|
||||
# Gracefully handle case where run was already ended or never started
|
||||
# This can happen during exception handling cascades
|
||||
logger.warning("end_run called but no run in progress (already ended or never started)")
|
||||
return
|
||||
|
||||
status = RunStatus.COMPLETED if success else RunStatus.FAILED
|
||||
self._current_run.output_data = output_data or {}
|
||||
@@ -158,10 +164,12 @@ class Runtime:
|
||||
context: Additional context available when deciding
|
||||
|
||||
Returns:
|
||||
The decision ID (use this to record outcome later)
|
||||
The decision ID (use this to record outcome later), or empty string if no run in progress
|
||||
"""
|
||||
if self._current_run is None:
|
||||
raise RuntimeError("No run in progress. Call start_run() first.")
|
||||
# Gracefully handle case where run ended during exception handling
|
||||
logger.warning(f"decide called but no run in progress: {intent}")
|
||||
return ""
|
||||
|
||||
# Build Option objects
|
||||
option_objects = []
|
||||
@@ -220,7 +228,10 @@ class Runtime:
|
||||
latency_ms: Time taken in milliseconds
|
||||
"""
|
||||
if self._current_run is None:
|
||||
raise RuntimeError("No run in progress")
|
||||
# Gracefully handle case where run ended during exception handling
|
||||
# This can happen in cascading error scenarios
|
||||
logger.warning(f"record_outcome called but no run in progress (decision_id={decision_id})")
|
||||
return
|
||||
|
||||
outcome = Outcome(
|
||||
success=success,
|
||||
@@ -258,10 +269,13 @@ class Runtime:
|
||||
suggested_fix: What might fix it (if known)
|
||||
|
||||
Returns:
|
||||
The problem ID
|
||||
The problem ID, or empty string if no run in progress
|
||||
"""
|
||||
if self._current_run is None:
|
||||
raise RuntimeError("No run in progress")
|
||||
# Gracefully handle case where run ended during exception handling
|
||||
# Log the problem since we can't store it, then return empty ID
|
||||
logger.warning(f"report_problem called but no run in progress: [{severity}] {description}")
|
||||
return ""
|
||||
|
||||
return self._current_run.add_problem(
|
||||
severity=severity,
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
"""
|
||||
Goal-Based Testing Framework
|
||||
|
||||
A three-stage framework (Goal → Agent → Eval) where tests are LLM-generated
|
||||
from success_criteria and constraints, with mandatory user approval.
|
||||
|
||||
## Core Flow
|
||||
|
||||
1. **Goal Stage**: Define success_criteria and constraints, generate constraint tests
|
||||
2. **Agent Stage**: Build nodes + edges, run constraint tests during development
|
||||
3. **Eval Stage**: Generate success_criteria tests, run all tests, debug failures
|
||||
|
||||
## Key Components
|
||||
|
||||
- **Schemas**: Test, TestResult, TestSuiteResult, ApprovalStatus, ErrorCategory
|
||||
- **Storage**: TestStorage for persisting tests and results
|
||||
- **Generation**: LLM-based test generation from Goal criteria
|
||||
- **Approval**: Mandatory user approval workflow (CLI and programmatic)
|
||||
- **Runner**: Parallel test execution with pytest-xdist inspired design
|
||||
- **Debug**: Error categorization and fix suggestions
|
||||
|
||||
## MCP Tools
|
||||
|
||||
Testing tools are integrated into the main agent_builder_server.py (not a separate server).
|
||||
This ensures the building_agent skill has access to all testing functionality:
|
||||
- generate_constraint_tests, generate_success_tests
|
||||
- approve_tests, run_tests, debug_test
|
||||
- list_tests, get_pending_tests
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
from framework.testing import (
|
||||
Test, TestResult, TestStorage,
|
||||
ConstraintTestGenerator, SuccessCriteriaTestGenerator,
|
||||
ParallelTestRunner, DebugTool,
|
||||
)
|
||||
|
||||
# Generate tests
|
||||
generator = ConstraintTestGenerator(llm)
|
||||
tests = generator.generate(goal)
|
||||
|
||||
# Approve tests (required)
|
||||
for test in tests:
|
||||
test.approve("user")
|
||||
storage.save_test(test)
|
||||
|
||||
# Run tests
|
||||
runner = ParallelTestRunner()
|
||||
result = runner.run_all(goal_id, agent_factory, tests)
|
||||
|
||||
# Debug failures
|
||||
debug = DebugTool(storage)
|
||||
info = debug.analyze(goal_id, test_id)
|
||||
```
|
||||
|
||||
## CLI Commands
|
||||
|
||||
```bash
|
||||
python -m framework test-generate goal.json
|
||||
python -m framework test-approve <goal_id>
|
||||
python -m framework test-run <agent_path> --goal <goal_id>
|
||||
python -m framework test-debug <goal_id> <test_id>
|
||||
```
|
||||
"""
|
||||
|
||||
# Schemas
|
||||
from framework.testing.test_case import (
|
||||
ApprovalStatus,
|
||||
TestType,
|
||||
Test,
|
||||
)
|
||||
from framework.testing.test_result import (
|
||||
ErrorCategory,
|
||||
TestResult,
|
||||
TestSuiteResult,
|
||||
)
|
||||
|
||||
# Storage
|
||||
from framework.testing.test_storage import TestStorage
|
||||
|
||||
# Generation
|
||||
from framework.testing.constraint_gen import ConstraintTestGenerator
|
||||
from framework.testing.success_gen import SuccessCriteriaTestGenerator
|
||||
from framework.testing.prompts import (
|
||||
CONSTRAINT_TEST_PROMPT,
|
||||
SUCCESS_CRITERIA_TEST_PROMPT,
|
||||
)
|
||||
|
||||
# Approval
|
||||
from framework.testing.approval_types import (
|
||||
ApprovalAction,
|
||||
ApprovalRequest,
|
||||
ApprovalResult,
|
||||
BatchApprovalRequest,
|
||||
BatchApprovalResult,
|
||||
)
|
||||
from framework.testing.approval_cli import interactive_approval, batch_approval
|
||||
|
||||
# Runner
|
||||
from framework.testing.executor import TestExecutor
|
||||
from framework.testing.parallel import ParallelTestRunner, ParallelConfig
|
||||
from framework.testing.categorizer import ErrorCategorizer
|
||||
|
||||
# Debug
|
||||
from framework.testing.debug_tool import DebugTool, DebugInfo
|
||||
|
||||
# CLI
|
||||
from framework.testing.cli import register_testing_commands
|
||||
|
||||
__all__ = [
|
||||
# Schemas
|
||||
"ApprovalStatus",
|
||||
"TestType",
|
||||
"Test",
|
||||
"ErrorCategory",
|
||||
"TestResult",
|
||||
"TestSuiteResult",
|
||||
# Storage
|
||||
"TestStorage",
|
||||
# Generation
|
||||
"ConstraintTestGenerator",
|
||||
"SuccessCriteriaTestGenerator",
|
||||
"CONSTRAINT_TEST_PROMPT",
|
||||
"SUCCESS_CRITERIA_TEST_PROMPT",
|
||||
# Approval
|
||||
"ApprovalAction",
|
||||
"ApprovalRequest",
|
||||
"ApprovalResult",
|
||||
"BatchApprovalRequest",
|
||||
"BatchApprovalResult",
|
||||
"interactive_approval",
|
||||
"batch_approval",
|
||||
# Runner
|
||||
"TestExecutor",
|
||||
"ParallelTestRunner",
|
||||
"ParallelConfig",
|
||||
"ErrorCategorizer",
|
||||
# Debug
|
||||
"DebugTool",
|
||||
"DebugInfo",
|
||||
# CLI
|
||||
"register_testing_commands",
|
||||
]
|
||||
@@ -0,0 +1,295 @@
|
||||
"""
|
||||
Interactive CLI for reviewing and approving generated tests.
|
||||
|
||||
LLM-generated tests are NEVER created without user approval.
|
||||
This CLI provides the interactive approval workflow.
|
||||
"""
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import subprocess
|
||||
import os
|
||||
from typing import Callable
|
||||
|
||||
from framework.testing.test_case import Test, ApprovalStatus
|
||||
from framework.testing.test_storage import TestStorage
|
||||
from framework.testing.approval_types import (
|
||||
ApprovalAction,
|
||||
ApprovalRequest,
|
||||
ApprovalResult,
|
||||
BatchApprovalResult,
|
||||
)
|
||||
|
||||
|
||||
def interactive_approval(
|
||||
tests: list[Test],
|
||||
storage: TestStorage,
|
||||
on_progress: Callable[[int, int], None] | None = None,
|
||||
) -> list[ApprovalResult]:
|
||||
"""
|
||||
Interactive CLI flow for reviewing generated tests.
|
||||
|
||||
Displays each test and allows user to:
|
||||
- [a]pprove: Accept as-is
|
||||
- [r]eject: Decline with reason
|
||||
- [e]dit: Modify before accepting
|
||||
- [s]kip: Leave pending (decide later)
|
||||
|
||||
Args:
|
||||
tests: List of pending tests to review
|
||||
storage: TestStorage for saving decisions
|
||||
on_progress: Optional callback(current, total) for progress tracking
|
||||
|
||||
Returns:
|
||||
List of ApprovalResult for each processed test
|
||||
"""
|
||||
results = []
|
||||
total = len(tests)
|
||||
|
||||
for i, test in enumerate(tests, 1):
|
||||
if on_progress:
|
||||
on_progress(i, total)
|
||||
|
||||
# Display test
|
||||
_display_test(test, i, total)
|
||||
|
||||
# Get user action
|
||||
action = _get_user_action()
|
||||
|
||||
# Process action
|
||||
result = _process_action(test, action, storage)
|
||||
results.append(result)
|
||||
|
||||
print() # Blank line between tests
|
||||
|
||||
return results
|
||||
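# Illustrative usage sketch (assumes pending tests were already generated and saved;
# the storage path below is an example, not a fixed convention):
#
#   storage = TestStorage(Path("data/tests") / goal_id)
#   pending = storage.get_pending_tests(goal_id)
#   results = interactive_approval(
#       pending, storage,
#       on_progress=lambda current, total: print(f"Reviewing test {current}/{total}"),
#   )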
|
||||
|
||||
def batch_approval(
|
||||
goal_id: str,
|
||||
requests: list[ApprovalRequest],
|
||||
storage: TestStorage,
|
||||
) -> BatchApprovalResult:
|
||||
"""
|
||||
Process multiple approval requests at once.
|
||||
|
||||
Used by MCP interface for programmatic approval.
|
||||
|
||||
Args:
|
||||
goal_id: Goal ID for the tests
|
||||
requests: List of approval requests
|
||||
storage: TestStorage for saving decisions
|
||||
|
||||
Returns:
|
||||
BatchApprovalResult with counts and individual results
|
||||
"""
|
||||
results = []
|
||||
counts = {
|
||||
"approved": 0,
|
||||
"modified": 0,
|
||||
"rejected": 0,
|
||||
"skipped": 0,
|
||||
"errors": 0,
|
||||
}
|
||||
|
||||
for req in requests:
|
||||
# Validate request
|
||||
valid, error = req.validate_action()
|
||||
if not valid:
|
||||
results.append(ApprovalResult.error_result(
|
||||
req.test_id, req.action, error or "Invalid request"
|
||||
))
|
||||
counts["errors"] += 1
|
||||
continue
|
||||
|
||||
# Load test
|
||||
test = storage.load_test(goal_id, req.test_id)
|
||||
if not test:
|
||||
results.append(ApprovalResult.error_result(
|
||||
req.test_id, req.action, f"Test {req.test_id} not found"
|
||||
))
|
||||
counts["errors"] += 1
|
||||
continue
|
||||
|
||||
# Apply action
|
||||
try:
|
||||
if req.action == ApprovalAction.APPROVE:
|
||||
test.approve(req.approved_by)
|
||||
counts["approved"] += 1
|
||||
elif req.action == ApprovalAction.MODIFY:
|
||||
test.modify(req.modified_code or test.test_code, req.approved_by)
|
||||
counts["modified"] += 1
|
||||
elif req.action == ApprovalAction.REJECT:
|
||||
test.reject(req.reason or "No reason provided")
|
||||
counts["rejected"] += 1
|
||||
elif req.action == ApprovalAction.SKIP:
|
||||
counts["skipped"] += 1
|
||||
|
||||
# Save if not skipped
|
||||
if req.action != ApprovalAction.SKIP:
|
||||
storage.update_test(test)
|
||||
|
||||
results.append(ApprovalResult.success_result(
|
||||
req.test_id, req.action, f"Test {req.action.value}d successfully"
|
||||
))
|
||||
|
||||
except Exception as e:
|
||||
results.append(ApprovalResult.error_result(
|
||||
req.test_id, req.action, str(e)
|
||||
))
|
||||
counts["errors"] += 1
|
||||
|
||||
return BatchApprovalResult(
|
||||
goal_id=goal_id,
|
||||
total=len(requests),
|
||||
approved=counts["approved"],
|
||||
modified=counts["modified"],
|
||||
rejected=counts["rejected"],
|
||||
skipped=counts["skipped"],
|
||||
errors=counts["errors"],
|
||||
results=results,
|
||||
)
|
||||
|
||||
|
||||
def _display_test(test: Test, index: int, total: int) -> None:
|
||||
"""Display a test for review."""
|
||||
separator = "=" * 60
|
||||
|
||||
print(f"\n{separator}")
|
||||
print(f"[{index}/{total}] {test.test_name}")
|
||||
print(f"Type: {test.test_type.value}")
|
||||
print(f"Criteria: {test.parent_criteria_id}")
|
||||
print(f"Confidence: {test.llm_confidence * 100:.0f}%")
|
||||
print(separator)
|
||||
|
||||
print(f"\nDescription: {test.description}")
|
||||
|
||||
if test.input:
|
||||
print(f"\nInput:")
|
||||
print(json.dumps(test.input, indent=2))
|
||||
|
||||
if test.expected_output:
|
||||
print(f"\nExpected Output:")
|
||||
print(json.dumps(test.expected_output, indent=2))
|
||||
|
||||
print(f"\nTest Code:")
|
||||
print("-" * 40)
|
||||
print(test.test_code)
|
||||
print("-" * 40)
|
||||
|
||||
print("\n[a]pprove [r]eject [e]dit [s]kip")
|
||||
|
||||
|
||||
def _get_user_action() -> ApprovalAction:
|
||||
"""Get user's choice for action."""
|
||||
while True:
|
||||
choice = input("Your choice: ").strip().lower()
|
||||
|
||||
if choice == "a":
|
||||
return ApprovalAction.APPROVE
|
||||
elif choice == "r":
|
||||
return ApprovalAction.REJECT
|
||||
elif choice == "e":
|
||||
return ApprovalAction.MODIFY
|
||||
elif choice == "s":
|
||||
return ApprovalAction.SKIP
|
||||
else:
|
||||
print("Invalid choice. Please enter a, r, e, or s.")
|
||||
|
||||
|
||||
def _process_action(
|
||||
test: Test,
|
||||
action: ApprovalAction,
|
||||
storage: TestStorage,
|
||||
) -> ApprovalResult:
|
||||
"""Process user's action on a test."""
|
||||
try:
|
||||
if action == ApprovalAction.APPROVE:
|
||||
test.approve()
|
||||
storage.update_test(test)
|
||||
print("✓ Approved")
|
||||
return ApprovalResult.success_result(test.id, action, "Approved")
|
||||
|
||||
elif action == ApprovalAction.REJECT:
|
||||
reason = input("Rejection reason: ").strip()
|
||||
if not reason:
|
||||
reason = "No reason provided"
|
||||
test.reject(reason)
|
||||
storage.update_test(test)
|
||||
print(f"✗ Rejected: {reason}")
|
||||
return ApprovalResult.success_result(test.id, action, f"Rejected: {reason}")
|
||||
|
||||
elif action == ApprovalAction.MODIFY:
|
||||
edited_code = _edit_test_code(test.test_code)
|
||||
if edited_code != test.test_code:
|
||||
test.modify(edited_code)
|
||||
storage.update_test(test)
|
||||
print("✓ Modified and approved")
|
||||
return ApprovalResult.success_result(test.id, action, "Modified and approved")
|
||||
else:
|
||||
# No changes made, treat as approve
|
||||
test.approve()
|
||||
storage.update_test(test)
|
||||
print("✓ Approved (no modifications)")
|
||||
return ApprovalResult.success_result(test.id, ApprovalAction.APPROVE, "No modifications made")
|
||||
|
||||
elif action == ApprovalAction.SKIP:
|
||||
print("⏭ Skipped (remains pending)")
|
||||
return ApprovalResult.success_result(test.id, action, "Skipped")
|
||||
|
||||
else:
|
||||
return ApprovalResult.error_result(test.id, action, f"Unknown action: {action}")
|
||||
|
||||
except Exception as e:
|
||||
return ApprovalResult.error_result(test.id, action, str(e))
|
||||
|
||||
|
||||
def _edit_test_code(code: str) -> str:
|
||||
"""
|
||||
Open test code in user's editor for modification.
|
||||
|
||||
Uses $EDITOR environment variable, falls back to vim/nano.
|
||||
"""
|
||||
editor = os.environ.get("EDITOR", "vim")
|
||||
|
||||
# Try to find an available editor
|
||||
if not _command_exists(editor):
|
||||
for fallback in ["nano", "vi", "notepad"]:
|
||||
if _command_exists(fallback):
|
||||
editor = fallback
|
||||
break
|
||||
|
||||
# Create temp file with code
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w",
|
||||
suffix=".py",
|
||||
delete=False
|
||||
) as f:
|
||||
f.write(code)
|
||||
temp_path = f.name
|
||||
|
||||
try:
|
||||
# Open editor
|
||||
subprocess.run([editor, temp_path], check=True)
|
||||
|
||||
# Read edited code
|
||||
with open(temp_path) as f:
|
||||
return f.read()
|
||||
except subprocess.CalledProcessError:
|
||||
print("Editor failed, keeping original code")
|
||||
return code
|
||||
except FileNotFoundError:
|
||||
print(f"Editor '{editor}' not found, keeping original code")
|
||||
return code
|
||||
finally:
|
||||
# Clean up temp file
|
||||
try:
|
||||
os.unlink(temp_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def _command_exists(cmd: str) -> bool:
|
||||
"""Check if a command exists in PATH."""
|
||||
from shutil import which
|
||||
return which(cmd) is not None
|
||||
@@ -0,0 +1,130 @@
|
||||
"""
|
||||
Types for the approval workflow.
|
||||
|
||||
These types are used for both interactive CLI approval and
|
||||
programmatic/MCP-based approval.
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ApprovalAction(str, Enum):
|
||||
"""Actions a user can take on a generated test."""
|
||||
APPROVE = "approve" # Accept as-is
|
||||
MODIFY = "modify" # Accept with modifications
|
||||
REJECT = "reject" # Decline
|
||||
SKIP = "skip" # Leave pending (decide later)
|
||||
|
||||
|
||||
class ApprovalRequest(BaseModel):
|
||||
"""
|
||||
Request to approve/modify/reject a generated test.
|
||||
|
||||
Used by both CLI and MCP interfaces.
|
||||
"""
|
||||
test_id: str
|
||||
action: ApprovalAction
|
||||
modified_code: str | None = Field(
|
||||
default=None,
|
||||
description="New code if action is MODIFY"
|
||||
)
|
||||
reason: str | None = Field(
|
||||
default=None,
|
||||
description="Rejection reason if action is REJECT"
|
||||
)
|
||||
approved_by: str = "user"
|
||||
|
||||
def validate_action(self) -> tuple[bool, str | None]:
|
||||
"""
|
||||
Validate that the request has required fields for its action.
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, error_message)
|
||||
"""
|
||||
if self.action == ApprovalAction.MODIFY and not self.modified_code:
|
||||
return False, "modified_code is required for MODIFY action"
|
||||
if self.action == ApprovalAction.REJECT and not self.reason:
|
||||
return False, "reason is required for REJECT action"
|
||||
return True, None
|
||||
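# Illustrative only: how callers such as batch_approval use validate_action
# (the test ID is an invented placeholder).
#
#   req = ApprovalRequest(test_id="test_a1b2c3d4", action=ApprovalAction.REJECT)
#   ok, err = req.validate_action()
#   # ok is False; err == "reason is required for REJECT action"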
|
||||
|
||||
class ApprovalResult(BaseModel):
|
||||
"""
|
||||
Result of processing an approval request.
|
||||
"""
|
||||
test_id: str
|
||||
action: ApprovalAction
|
||||
success: bool
|
||||
message: str | None = None
|
||||
error: str | None = None
|
||||
timestamp: datetime = Field(default_factory=datetime.now)
|
||||
|
||||
@classmethod
|
||||
def success_result(
|
||||
cls, test_id: str, action: ApprovalAction, message: str | None = None
|
||||
) -> "ApprovalResult":
|
||||
"""Create a successful result."""
|
||||
return cls(
|
||||
test_id=test_id,
|
||||
action=action,
|
||||
success=True,
|
||||
message=message,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def error_result(
|
||||
cls, test_id: str, action: ApprovalAction, error: str
|
||||
) -> "ApprovalResult":
|
||||
"""Create an error result."""
|
||||
return cls(
|
||||
test_id=test_id,
|
||||
action=action,
|
||||
success=False,
|
||||
error=error,
|
||||
)
|
||||
|
||||
|
||||
class BatchApprovalRequest(BaseModel):
|
||||
"""
|
||||
Request to approve multiple tests at once.
|
||||
|
||||
Useful for MCP interface where user reviews all tests and submits decisions.
|
||||
"""
|
||||
goal_id: str
|
||||
approvals: list[ApprovalRequest]
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""Convert to dictionary for JSON serialization."""
|
||||
return {
|
||||
"goal_id": self.goal_id,
|
||||
"approvals": [a.model_dump() for a in self.approvals],
|
||||
}
|
||||
|
||||
|
||||
class BatchApprovalResult(BaseModel):
|
||||
"""
|
||||
Result of processing a batch approval request.
|
||||
"""
|
||||
goal_id: str
|
||||
total: int
|
||||
approved: int
|
||||
modified: int
|
||||
rejected: int
|
||||
skipped: int
|
||||
errors: int
|
||||
results: list[ApprovalResult]
|
||||
|
||||
def summary(self) -> str:
|
||||
"""Return a summary string."""
|
||||
return (
|
||||
f"Processed {self.total} tests: "
|
||||
f"{self.approved} approved, "
|
||||
f"{self.modified} modified, "
|
||||
f"{self.rejected} rejected, "
|
||||
f"{self.skipped} skipped, "
|
||||
f"{self.errors} errors"
|
||||
)
|
||||
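# Illustrative only: with total=5, approved=3, modified=1, rejected=1, skipped=0 and
# errors=0, summary() returns:
#   "Processed 5 tests: 3 approved, 1 modified, 1 rejected, 0 skipped, 0 errors"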
@@ -0,0 +1,260 @@
|
||||
"""
|
||||
Error categorization for test failures.
|
||||
|
||||
Categorizes errors to guide iteration strategy:
|
||||
- LOGIC_ERROR: Goal definition is wrong → update success_criteria/constraints
|
||||
- IMPLEMENTATION_ERROR: Code bug → fix nodes/edges in Agent stage
|
||||
- EDGE_CASE: New scenario discovered → add new test only
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from framework.testing.test_result import ErrorCategory, TestResult
|
||||
|
||||
|
||||
class ErrorCategorizer:
|
||||
"""
|
||||
Categorize test failures for guiding iteration.
|
||||
|
||||
Uses pattern matching heuristics to classify errors.
|
||||
Each category has different implications for how to fix.
|
||||
"""
|
||||
|
||||
# Patterns indicating goal/criteria definition is wrong
|
||||
LOGIC_ERROR_PATTERNS = [
|
||||
r"goal not achieved",
|
||||
r"constraint violated:?\s*core",
|
||||
r"fundamental assumption",
|
||||
r"success criteria mismatch",
|
||||
r"criteria not met",
|
||||
r"expected behavior incorrect",
|
||||
r"specification error",
|
||||
r"requirement mismatch",
|
||||
]
|
||||
|
||||
# Patterns indicating code/implementation bug
|
||||
IMPLEMENTATION_ERROR_PATTERNS = [
|
||||
r"TypeError",
|
||||
r"AttributeError",
|
||||
r"KeyError",
|
||||
r"IndexError",
|
||||
r"ValueError",
|
||||
r"NameError",
|
||||
r"ImportError",
|
||||
r"ModuleNotFoundError",
|
||||
r"RuntimeError",
|
||||
r"NullPointerException",
|
||||
r"NoneType.*has no attribute",
|
||||
r"tool call failed",
|
||||
r"node execution error",
|
||||
r"agent execution failed",
|
||||
r"assertion.*failed",
|
||||
r"AssertionError",
|
||||
r"expected.*but got",
|
||||
r"unexpected.*type",
|
||||
r"missing required",
|
||||
r"invalid.*argument",
|
||||
]
|
||||
|
||||
# Patterns indicating edge case / new scenario
|
||||
EDGE_CASE_PATTERNS = [
|
||||
r"boundary condition",
|
||||
r"timeout",
|
||||
r"connection.*timeout",
|
||||
r"request.*timeout",
|
||||
r"unexpected format",
|
||||
r"unexpected response",
|
||||
r"rare input",
|
||||
r"empty.*result",
|
||||
r"null.*value",
|
||||
r"empty.*response",
|
||||
r"no.*results",
|
||||
r"rate.*limit",
|
||||
r"quota.*exceeded",
|
||||
r"retry.*exhausted",
|
||||
r"unicode.*error",
|
||||
r"encoding.*error",
|
||||
r"special.*character",
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize categorizer with compiled patterns."""
|
||||
self._logic_patterns = [
|
||||
re.compile(p, re.IGNORECASE) for p in self.LOGIC_ERROR_PATTERNS
|
||||
]
|
||||
self._impl_patterns = [
|
||||
re.compile(p, re.IGNORECASE) for p in self.IMPLEMENTATION_ERROR_PATTERNS
|
||||
]
|
||||
self._edge_patterns = [
|
||||
re.compile(p, re.IGNORECASE) for p in self.EDGE_CASE_PATTERNS
|
||||
]
|
||||
|
||||
def categorize(self, result: TestResult) -> ErrorCategory | None:
|
||||
"""
|
||||
Categorize a test failure.
|
||||
|
||||
Args:
|
||||
result: TestResult to categorize
|
||||
|
||||
Returns:
|
||||
ErrorCategory if test failed, None if passed
|
||||
"""
|
||||
if result.passed:
|
||||
return None
|
||||
|
||||
# Combine error sources for analysis
|
||||
error_text = self._get_error_text(result)
|
||||
|
||||
# Check patterns in priority order
|
||||
# Logic errors take precedence (wrong goal definition)
|
||||
for pattern in self._logic_patterns:
|
||||
if pattern.search(error_text):
|
||||
return ErrorCategory.LOGIC_ERROR
|
||||
|
||||
# Then implementation errors (code bugs)
|
||||
for pattern in self._impl_patterns:
|
||||
if pattern.search(error_text):
|
||||
return ErrorCategory.IMPLEMENTATION_ERROR
|
||||
|
||||
# Then edge cases (new scenarios)
|
||||
for pattern in self._edge_patterns:
|
||||
if pattern.search(error_text):
|
||||
return ErrorCategory.EDGE_CASE
|
||||
|
||||
# Default to implementation error (most common)
|
||||
return ErrorCategory.IMPLEMENTATION_ERROR
|
||||
|
||||
def categorize_with_confidence(
|
||||
self, result: TestResult
|
||||
) -> tuple[ErrorCategory | None, float]:
|
||||
"""
|
||||
Categorize with a confidence score.
|
||||
|
||||
Args:
|
||||
result: TestResult to categorize
|
||||
|
||||
Returns:
|
||||
Tuple of (category, confidence 0-1)
|
||||
"""
|
||||
if result.passed:
|
||||
return None, 1.0
|
||||
|
||||
error_text = self._get_error_text(result)
|
||||
|
||||
# Count pattern matches for each category
|
||||
logic_matches = sum(
|
||||
1 for p in self._logic_patterns if p.search(error_text)
|
||||
)
|
||||
impl_matches = sum(
|
||||
1 for p in self._impl_patterns if p.search(error_text)
|
||||
)
|
||||
edge_matches = sum(
|
||||
1 for p in self._edge_patterns if p.search(error_text)
|
||||
)
|
||||
|
||||
total_matches = logic_matches + impl_matches + edge_matches
|
||||
|
||||
if total_matches == 0:
|
||||
# No pattern matches, default to implementation with low confidence
|
||||
return ErrorCategory.IMPLEMENTATION_ERROR, 0.3
|
||||
|
||||
# Calculate confidence based on match dominance
|
||||
if logic_matches >= impl_matches and logic_matches >= edge_matches:
|
||||
confidence = logic_matches / total_matches if total_matches > 0 else 0.5
|
||||
return ErrorCategory.LOGIC_ERROR, min(0.9, 0.5 + confidence * 0.4)
|
||||
|
||||
if impl_matches >= logic_matches and impl_matches >= edge_matches:
|
||||
confidence = impl_matches / total_matches if total_matches > 0 else 0.5
|
||||
return ErrorCategory.IMPLEMENTATION_ERROR, min(0.9, 0.5 + confidence * 0.4)
|
||||
|
||||
confidence = edge_matches / total_matches if total_matches > 0 else 0.5
|
||||
return ErrorCategory.EDGE_CASE, min(0.9, 0.5 + confidence * 0.4)
|
||||
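# Worked example of the confidence formula above (illustrative numbers): with 3
# implementation matches out of 4 total matches, dominance is 3/4 = 0.75, so the score
# is min(0.9, 0.5 + 0.75 * 0.4) = 0.8. A single match (1/1) yields min(0.9, 0.9) = 0.9,
# which is why results are capped at 0.9 rather than reported as certain.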
|
||||
def _get_error_text(self, result: TestResult) -> str:
|
||||
"""Extract all error text from a result for analysis."""
|
||||
parts = []
|
||||
|
||||
if result.error_message:
|
||||
parts.append(result.error_message)
|
||||
|
||||
if result.stack_trace:
|
||||
parts.append(result.stack_trace)
|
||||
|
||||
# Include log messages
|
||||
for log in result.runtime_logs:
|
||||
if log.get("level") in ("ERROR", "CRITICAL", "WARNING"):
|
||||
parts.append(str(log.get("msg", "")))
|
||||
|
||||
return " ".join(parts)
|
||||
|
||||
def get_fix_suggestion(self, category: ErrorCategory) -> str:
|
||||
"""
|
||||
Get a fix suggestion based on error category.
|
||||
|
||||
Args:
|
||||
category: ErrorCategory from categorization
|
||||
|
||||
Returns:
|
||||
Human-readable fix suggestion
|
||||
"""
|
||||
suggestions = {
|
||||
ErrorCategory.LOGIC_ERROR: (
|
||||
"Review and update success_criteria or constraints in the Goal definition. "
|
||||
"The goal specification may not accurately describe the desired behavior."
|
||||
),
|
||||
ErrorCategory.IMPLEMENTATION_ERROR: (
|
||||
"Fix the code in agent nodes/edges. "
|
||||
"There's a bug in the implementation that needs to be corrected."
|
||||
),
|
||||
ErrorCategory.EDGE_CASE: (
|
||||
"Add a new test for this edge case scenario. "
|
||||
"This is a valid scenario that wasn't covered by existing tests."
|
||||
),
|
||||
}
|
||||
return suggestions.get(category, "Review the test and agent implementation.")
|
||||
|
||||
def get_iteration_guidance(self, category: ErrorCategory) -> dict[str, Any]:
|
||||
"""
|
||||
Get detailed iteration guidance based on error category.
|
||||
|
||||
Returns a dict with:
|
||||
- stage: Which stage to return to (Goal, Agent, Eval)
|
||||
- action: What action to take
|
||||
- restart_required: Whether full 3-step flow restart is needed
|
||||
"""
|
||||
guidance = {
|
||||
ErrorCategory.LOGIC_ERROR: {
|
||||
"stage": "Goal",
|
||||
"action": "Update success_criteria or constraints",
|
||||
"restart_required": True,
|
||||
"description": (
|
||||
"The goal definition is incorrect. Update the success criteria "
|
||||
"or constraints, then restart the full Goal → Agent → Eval flow."
|
||||
),
|
||||
},
|
||||
ErrorCategory.IMPLEMENTATION_ERROR: {
|
||||
"stage": "Agent",
|
||||
"action": "Fix nodes/edges implementation",
|
||||
"restart_required": False,
|
||||
"description": (
|
||||
"There's a code bug. Fix the agent implementation, "
|
||||
"then re-run Eval (skip Goal stage)."
|
||||
),
|
||||
},
|
||||
ErrorCategory.EDGE_CASE: {
|
||||
"stage": "Eval",
|
||||
"action": "Add new test only",
|
||||
"restart_required": False,
|
||||
"description": (
|
||||
"This is a new scenario. Add a test for it and continue "
|
||||
"in the Eval stage."
|
||||
),
|
||||
},
|
||||
}
|
||||
return guidance.get(category, {
|
||||
"stage": "Unknown",
|
||||
"action": "Review manually",
|
||||
"restart_required": False,
|
||||
"description": "Unable to determine category. Manual review required.",
|
||||
})
|
||||
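# Illustrative usage sketch (assumes a failed TestResult named failed_result is at hand):
#
#   categorizer = ErrorCategorizer()
#   category, confidence = categorizer.categorize_with_confidence(failed_result)
#   print(categorizer.get_fix_suggestion(category))
#   guidance = categorizer.get_iteration_guidance(category)
#   # guidance["stage"] indicates whether to revisit the Goal, Agent, or Eval stage.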
@@ -0,0 +1,413 @@
|
||||
"""
|
||||
CLI commands for goal-based testing.
|
||||
|
||||
Provides commands:
|
||||
- test-generate: Generate tests from a goal
|
||||
- test-approve: Review and approve pending tests
|
||||
- test-run: Run tests for an agent
|
||||
- test-debug: Debug a failed test
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from framework.graph.goal import Goal
|
||||
from framework.testing.test_case import TestType
|
||||
from framework.testing.test_storage import TestStorage
|
||||
from framework.testing.constraint_gen import ConstraintTestGenerator
|
||||
from framework.testing.success_gen import SuccessCriteriaTestGenerator
|
||||
from framework.testing.approval_cli import interactive_approval
|
||||
from framework.testing.parallel import ParallelTestRunner, ParallelConfig, AgentFactory
|
||||
from framework.testing.debug_tool import DebugTool
|
||||
|
||||
|
||||
DEFAULT_STORAGE_PATH = Path("data/tests")
|
||||
|
||||
|
||||
def register_testing_commands(subparsers: argparse._SubParsersAction) -> None:
|
||||
"""Register testing CLI commands."""
|
||||
|
||||
# test-generate
|
||||
gen_parser = subparsers.add_parser(
|
||||
"test-generate",
|
||||
help="Generate tests from goal criteria",
|
||||
)
|
||||
gen_parser.add_argument(
|
||||
"goal_file",
|
||||
help="Path to goal JSON file",
|
||||
)
|
||||
gen_parser.add_argument(
|
||||
"--type",
|
||||
choices=["constraint", "success", "all"],
|
||||
default="all",
|
||||
help="Type of tests to generate",
|
||||
)
|
||||
gen_parser.add_argument(
|
||||
"--auto-approve",
|
||||
action="store_true",
|
||||
help="Skip interactive approval (use with caution)",
|
||||
)
|
||||
gen_parser.add_argument(
|
||||
"--output",
|
||||
"-o",
|
||||
help="Output directory for tests (default: data/tests/<goal_id>)",
|
||||
)
|
||||
gen_parser.set_defaults(func=cmd_test_generate)
|
||||
|
||||
# test-approve
|
||||
approve_parser = subparsers.add_parser(
|
||||
"test-approve",
|
||||
help="Review and approve pending tests",
|
||||
)
|
||||
approve_parser.add_argument(
|
||||
"goal_id",
|
||||
help="Goal ID to review tests for",
|
||||
)
|
||||
approve_parser.add_argument(
|
||||
"--storage",
|
||||
help="Storage directory (default: data/tests/<goal_id>)",
|
||||
)
|
||||
approve_parser.set_defaults(func=cmd_test_approve)
|
||||
|
||||
# test-run
|
||||
run_parser = subparsers.add_parser(
|
||||
"test-run",
|
||||
help="Run tests for an agent",
|
||||
)
|
||||
run_parser.add_argument(
|
||||
"agent_path",
|
||||
help="Path to agent export folder",
|
||||
)
|
||||
run_parser.add_argument(
|
||||
"--goal",
|
||||
"-g",
|
||||
required=True,
|
||||
help="Goal ID to run tests for",
|
||||
)
|
||||
run_parser.add_argument(
|
||||
"--parallel",
|
||||
"-p",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of parallel workers (0 for sequential)",
|
||||
)
|
||||
run_parser.add_argument(
|
||||
"--fail-fast",
|
||||
action="store_true",
|
||||
help="Stop on first failure",
|
||||
)
|
||||
run_parser.add_argument(
|
||||
"--type",
|
||||
choices=["constraint", "success", "edge_case", "all"],
|
||||
default="all",
|
||||
help="Type of tests to run",
|
||||
)
|
||||
run_parser.set_defaults(func=cmd_test_run)
|
||||
|
||||
# test-debug
|
||||
debug_parser = subparsers.add_parser(
|
||||
"test-debug",
|
||||
help="Debug a failed test",
|
||||
)
|
||||
debug_parser.add_argument(
|
||||
"goal_id",
|
||||
help="Goal ID",
|
||||
)
|
||||
debug_parser.add_argument(
|
||||
"test_id",
|
||||
help="Test ID to debug",
|
||||
)
|
||||
debug_parser.add_argument(
|
||||
"--run-id",
|
||||
help="Runtime run ID for detailed logs",
|
||||
)
|
||||
debug_parser.set_defaults(func=cmd_test_debug)
|
||||
|
||||
# test-list
|
||||
list_parser = subparsers.add_parser(
|
||||
"test-list",
|
||||
help="List tests for a goal",
|
||||
)
|
||||
list_parser.add_argument(
|
||||
"goal_id",
|
||||
help="Goal ID",
|
||||
)
|
||||
list_parser.add_argument(
|
||||
"--status",
|
||||
choices=["pending", "approved", "modified", "rejected", "all"],
|
||||
default="all",
|
||||
help="Filter by approval status",
|
||||
)
|
||||
list_parser.set_defaults(func=cmd_test_list)
|
||||
|
||||
# test-stats
|
||||
stats_parser = subparsers.add_parser(
|
||||
"test-stats",
|
||||
help="Show test statistics for a goal",
|
||||
)
|
||||
stats_parser.add_argument(
|
||||
"goal_id",
|
||||
help="Goal ID",
|
||||
)
|
||||
stats_parser.set_defaults(func=cmd_test_stats)
|
||||
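# Illustrative only: a sketch of how these commands might be wired into the framework's
# argparse entry point (the prog name and surrounding setup are assumptions).
#
#   parser = argparse.ArgumentParser(prog="framework")
#   subparsers = parser.add_subparsers(dest="command")
#   register_testing_commands(subparsers)
#   args = parser.parse_args()
#   if hasattr(args, "func"):
#       sys.exit(args.func(args))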
|
||||
|
||||
def cmd_test_generate(args: argparse.Namespace) -> int:
|
||||
"""Generate tests from a goal file."""
|
||||
# Load goal
|
||||
goal_path = Path(args.goal_file)
|
||||
if not goal_path.exists():
|
||||
print(f"Error: Goal file not found: {goal_path}")
|
||||
return 1
|
||||
|
||||
with open(goal_path) as f:
|
||||
goal = Goal.model_validate_json(f.read())
|
||||
|
||||
print(f"Loaded goal: {goal.name} ({goal.id})")
|
||||
|
||||
# Determine output directory
|
||||
output_dir = Path(args.output) if args.output else DEFAULT_STORAGE_PATH / goal.id
|
||||
storage = TestStorage(output_dir)
|
||||
|
||||
# Get LLM provider
|
||||
try:
|
||||
from framework.llm import AnthropicProvider
|
||||
llm = AnthropicProvider()
|
||||
except Exception as e:
|
||||
print(f"Error: Failed to initialize LLM provider: {e}")
|
||||
return 1
|
||||
|
||||
all_tests = []
|
||||
|
||||
# Generate constraint tests
|
||||
if args.type in ("constraint", "all"):
|
||||
print(f"\nGenerating constraint tests for {len(goal.constraints)} constraints...")
|
||||
generator = ConstraintTestGenerator(llm)
|
||||
constraint_tests = generator.generate(goal)
|
||||
all_tests.extend(constraint_tests)
|
||||
print(f"Generated {len(constraint_tests)} constraint tests")
|
||||
|
||||
# Generate success criteria tests
|
||||
if args.type in ("success", "all"):
|
||||
print(f"\nGenerating success criteria tests for {len(goal.success_criteria)} criteria...")
|
||||
generator = SuccessCriteriaTestGenerator(llm)
|
||||
success_tests = generator.generate(goal)
|
||||
all_tests.extend(success_tests)
|
||||
print(f"Generated {len(success_tests)} success criteria tests")
|
||||
|
||||
if not all_tests:
|
||||
print("\nNo tests generated.")
|
||||
return 0
|
||||
|
||||
print(f"\nTotal tests generated: {len(all_tests)}")
|
||||
|
||||
# Approval
|
||||
if args.auto_approve:
|
||||
print("\nAuto-approving all tests...")
|
||||
for test in all_tests:
|
||||
test.approve("cli-auto")
|
||||
storage.save_test(test)
|
||||
print(f"Saved {len(all_tests)} tests to {output_dir}")
|
||||
else:
|
||||
print("\nStarting interactive approval...")
|
||||
# Save pending tests first
|
||||
for test in all_tests:
|
||||
storage.save_test(test)
|
||||
|
||||
results = interactive_approval(all_tests, storage)
|
||||
approved = sum(1 for r in results if r.action.value in ("approve", "modify"))
|
||||
print(f"\nApproved: {approved}/{len(all_tests)} tests")
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def cmd_test_approve(args: argparse.Namespace) -> int:
|
||||
"""Review and approve pending tests."""
|
||||
storage_path = Path(args.storage) if args.storage else DEFAULT_STORAGE_PATH / args.goal_id
|
||||
storage = TestStorage(storage_path)
|
||||
|
||||
pending = storage.get_pending_tests(args.goal_id)
|
||||
|
||||
if not pending:
|
||||
print(f"No pending tests for goal {args.goal_id}")
|
||||
return 0
|
||||
|
||||
print(f"Found {len(pending)} pending tests\n")
|
||||
|
||||
results = interactive_approval(pending, storage)
|
||||
approved = sum(1 for r in results if r.action.value in ("approve", "modify"))
|
||||
print(f"\nApproved: {approved}/{len(pending)} tests")
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def cmd_test_run(args: argparse.Namespace) -> int:
|
||||
"""Run tests for an agent."""
|
||||
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal)
|
||||
|
||||
# Get approved tests
|
||||
tests = storage.get_approved_tests(args.goal)
|
||||
|
||||
# Filter by type
|
||||
if args.type != "all":
|
||||
type_map = {
|
||||
"constraint": TestType.CONSTRAINT,
|
||||
"success": TestType.SUCCESS_CRITERIA,
|
||||
"edge_case": TestType.EDGE_CASE,
|
||||
}
|
||||
filter_type = type_map.get(args.type)
|
||||
if filter_type:
|
||||
tests = [t for t in tests if t.test_type == filter_type]
|
||||
|
||||
if not tests:
|
||||
print(f"No approved tests found for goal {args.goal}")
|
||||
return 1
|
||||
|
||||
print(f"Running {len(tests)} tests...\n")
|
||||
|
||||
# Configure runner
|
||||
config = ParallelConfig(
|
||||
num_workers=args.parallel if args.parallel > 0 else 1,
|
||||
fail_fast=args.fail_fast,
|
||||
)
|
||||
|
||||
# Run with progress - use AgentFactory for picklable parallel execution
|
||||
runner = ParallelTestRunner(config, storage)
|
||||
|
||||
def on_result(result):
|
||||
status = "✓" if result.passed else "✗"
|
||||
print(f" {status} {result.test_id} ({result.duration_ms}ms)")
|
||||
|
||||
result = runner.run_all(
|
||||
goal_id=args.goal,
|
||||
agent_factory=AgentFactory(args.agent_path),
|
||||
tests=tests,
|
||||
on_result=on_result,
|
||||
)
|
||||
|
||||
# Print summary
|
||||
print(f"\n{'=' * 40}")
|
||||
print(f"Results: {result.passed}/{result.total} passed ({result.pass_rate:.1%})")
|
||||
print(f"Duration: {result.duration_ms}ms")
|
||||
|
||||
if not result.all_passed:
|
||||
print(f"\nFailed tests:")
|
||||
for r in result.get_failed_results():
|
||||
print(f" - {r.test_id}: {r.error_message}")
|
||||
if r.error_category:
|
||||
print(f" Category: {r.error_category.value}")
|
||||
|
||||
return 0 if result.all_passed else 1
|
||||
|
||||
|
||||
def cmd_test_debug(args: argparse.Namespace) -> int:
|
||||
"""Debug a failed test."""
|
||||
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id)
|
||||
|
||||
# Try to load runtime storage
|
||||
runtime_storage = None
|
||||
try:
|
||||
from framework.storage.backend import FileStorage
|
||||
runtime_storage = FileStorage(f"data/runtime/{args.goal_id}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
debug_tool = DebugTool(storage, runtime_storage)
|
||||
info = debug_tool.analyze(args.goal_id, args.test_id, args.run_id)
|
||||
|
||||
# Print debug info
|
||||
print(f"Debug Info for: {info.test_name}")
|
||||
print("=" * 50)
|
||||
|
||||
print(f"\nTest ID: {info.test_id}")
|
||||
print(f"Passed: {info.passed}")
|
||||
|
||||
if info.error_category:
|
||||
print(f"\nError Category: {info.error_category}")
|
||||
print(f"Suggested Fix: {info.suggested_fix}")
|
||||
|
||||
if info.error_message:
|
||||
print(f"\nError Message:\n{info.error_message}")
|
||||
|
||||
if info.stack_trace:
|
||||
print(f"\nStack Trace:\n{info.stack_trace}")
|
||||
|
||||
if info.iteration_guidance:
|
||||
print(f"\nIteration Guidance:")
|
||||
print(f" Stage: {info.iteration_guidance.get('stage')}")
|
||||
print(f" Action: {info.iteration_guidance.get('action')}")
|
||||
print(f" Restart Required: {info.iteration_guidance.get('restart_required')}")
|
||||
|
||||
print(f"\nInput:\n{json.dumps(info.input, indent=2)}")
|
||||
print(f"\nExpected:\n{json.dumps(info.expected, indent=2)}")
|
||||
print(f"\nActual:\n{json.dumps(info.actual, indent=2, default=str)}")
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def cmd_test_list(args: argparse.Namespace) -> int:
|
||||
"""List tests for a goal."""
|
||||
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id)
|
||||
tests = storage.get_tests_by_goal(args.goal_id)
|
||||
|
||||
# Filter by status
|
||||
if args.status != "all":
|
||||
from framework.testing.test_case import ApprovalStatus
|
||||
try:
|
||||
filter_status = ApprovalStatus(args.status)
|
||||
tests = [t for t in tests if t.approval_status == filter_status]
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if not tests:
|
||||
print(f"No tests found for goal {args.goal_id}")
|
||||
return 0
|
||||
|
||||
print(f"Tests for goal {args.goal_id}:\n")
|
||||
for t in tests:
|
||||
status_icon = {
|
||||
"pending": "⏳",
|
||||
"approved": "✓",
|
||||
"modified": "✓*",
|
||||
"rejected": "✗",
|
||||
}.get(t.approval_status.value, "?")
|
||||
|
||||
result_icon = ""
|
||||
if t.last_result:
|
||||
result_icon = " [PASS]" if t.last_result == "passed" else " [FAIL]"
|
||||
|
||||
print(f" {status_icon} {t.test_name} ({t.test_type.value}){result_icon}")
|
||||
print(f" ID: {t.id}")
|
||||
print(f" Criteria: {t.parent_criteria_id}")
|
||||
if t.llm_confidence:
|
||||
print(f" Confidence: {t.llm_confidence:.0%}")
|
||||
print()
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def cmd_test_stats(args: argparse.Namespace) -> int:
|
||||
"""Show test statistics."""
|
||||
storage = TestStorage(DEFAULT_STORAGE_PATH / args.goal_id)
|
||||
stats = storage.get_stats()
|
||||
|
||||
print(f"Statistics for goal {args.goal_id}:\n")
|
||||
print(f" Total tests: {stats['total_tests']}")
|
||||
print(f"\n By approval status:")
|
||||
for status, count in stats["by_approval"].items():
|
||||
print(f" {status}: {count}")
|
||||
|
||||
# Get pass/fail stats
|
||||
tests = storage.get_approved_tests(args.goal_id)
|
||||
passed = sum(1 for t in tests if t.last_result == "passed")
|
||||
failed = sum(1 for t in tests if t.last_result == "failed")
|
||||
not_run = sum(1 for t in tests if t.last_result is None)
|
||||
|
||||
print(f"\n Execution results:")
|
||||
print(f" Passed: {passed}")
|
||||
print(f" Failed: {failed}")
|
||||
print(f" Not run: {not_run}")
|
||||
|
||||
return 0
|
||||
@@ -0,0 +1,201 @@
|
||||
"""
|
||||
Constraint test generator.
|
||||
|
||||
Generates tests for Goal constraints using LLM.
|
||||
Tests are returned with PENDING approval status.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from framework.graph.goal import Goal, Constraint
|
||||
from framework.testing.test_case import Test, TestType, ApprovalStatus
|
||||
from framework.testing.prompts import CONSTRAINT_TEST_PROMPT
|
||||
from framework.llm.provider import Tool, ToolUse, ToolResult
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from framework.llm.provider import LLMProvider
|
||||
|
||||
|
||||
# Tool for collecting generated tests - Claude handles JSON escaping automatically
|
||||
SUBMIT_TEST_TOOL = Tool(
|
||||
name="submit_test",
|
||||
description="Submit a generated constraint test. Call once per test.",
|
||||
parameters={
|
||||
"properties": {
|
||||
"constraint_id": {
|
||||
"type": "string",
|
||||
"description": "ID of the constraint being tested",
|
||||
},
|
||||
"test_name": {
|
||||
"type": "string",
|
||||
"description": "pytest function name, e.g., test_constraint_api_limits_respected",
|
||||
},
|
||||
"test_code": {
|
||||
"type": "string",
|
||||
"description": "Complete Python test function code",
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"description": "What the test validates",
|
||||
},
|
||||
"input": {
|
||||
"type": "object",
|
||||
"description": "Test input data",
|
||||
},
|
||||
"expected_output": {
|
||||
"type": "object",
|
||||
"description": "Expected output",
|
||||
},
|
||||
"confidence": {
|
||||
"type": "number",
|
||||
"description": "Confidence score 0-1",
|
||||
},
|
||||
},
|
||||
"required": ["constraint_id", "test_name", "test_code", "description", "confidence"],
|
||||
},
|
||||
)
|
||||
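# Illustrative only: the kind of input the model is expected to pass to submit_test.
# Field names mirror the schema above; the values are invented for illustration.
#
#   {
#       "constraint_id": "c1",
#       "test_name": "test_constraint_api_limits_respected",
#       "test_code": "def test_constraint_api_limits_respected(agent):\n    ...",
#       "description": "Agent stays within the configured API rate limit",
#       "input": {"query": "example request"},
#       "expected_output": {"rate_limit_exceeded": False},
#       "confidence": 0.8,
#   }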
|
||||
|
||||
class ConstraintTestGenerator:
|
||||
"""
|
||||
Generate constraint tests from Goal constraints.
|
||||
|
||||
Generated tests require user approval before being added to the test suite.
|
||||
"""
|
||||
|
||||
def __init__(self, llm: "LLMProvider"):
|
||||
"""
|
||||
Initialize generator with LLM provider.
|
||||
|
||||
Args:
|
||||
llm: LLM provider for test generation (e.g., AnthropicProvider)
|
||||
"""
|
||||
self.llm = llm
|
||||
|
||||
def generate(self, goal: Goal) -> list[Test]:
|
||||
"""
|
||||
Generate tests for all constraints in a goal.
|
||||
|
||||
Args:
|
||||
goal: Goal with constraints to test
|
||||
|
||||
Returns:
|
||||
List of Test objects with approval_status=PENDING.
|
||||
These MUST be approved before being added to the test suite.
|
||||
"""
|
||||
if not goal.constraints:
|
||||
return []
|
||||
|
||||
# Format prompt
|
||||
prompt = CONSTRAINT_TEST_PROMPT.format(
|
||||
goal_name=goal.name,
|
||||
goal_description=goal.description,
|
||||
constraints_formatted=self._format_constraints(goal.constraints),
|
||||
)
|
||||
|
||||
# Collect tests via tool calls - Claude handles JSON escaping automatically
|
||||
collected_tests: list[dict] = []
|
||||
|
||||
def tool_executor(tool_use: ToolUse) -> ToolResult:
|
||||
if tool_use.name == "submit_test":
|
||||
collected_tests.append(tool_use.input)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Test recorded successfully"
|
||||
)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
|
||||
)
|
||||
|
||||
self.llm.complete_with_tools(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
system="You are a test generation expert. For each constraint, call the submit_test tool with the test details.",
|
||||
tools=[SUBMIT_TEST_TOOL],
|
||||
tool_executor=tool_executor,
|
||||
max_iterations=20,
|
||||
)
|
||||
|
||||
return self._create_tests_from_collected(collected_tests, goal.id)
|
||||
|
||||
def generate_for_constraint(
|
||||
self, goal: Goal, constraint: Constraint
|
||||
) -> list[Test]:
|
||||
"""
|
||||
Generate tests for a single constraint.
|
||||
|
||||
Args:
|
||||
goal: Goal containing the constraint
|
||||
constraint: Specific constraint to test
|
||||
|
||||
Returns:
|
||||
List of Test objects for the constraint
|
||||
"""
|
||||
# Format prompt with just this constraint
|
||||
prompt = CONSTRAINT_TEST_PROMPT.format(
|
||||
goal_name=goal.name,
|
||||
goal_description=goal.description,
|
||||
constraints_formatted=self._format_constraint(constraint),
|
||||
)
|
||||
|
||||
# Collect tests via tool calls
|
||||
collected_tests: list[dict] = []
|
||||
|
||||
def tool_executor(tool_use: ToolUse) -> ToolResult:
|
||||
if tool_use.name == "submit_test":
|
||||
collected_tests.append(tool_use.input)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Test recorded successfully"
|
||||
)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
|
||||
)
|
||||
|
||||
self.llm.complete_with_tools(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
system="You are a test generation expert. Call the submit_test tool with the test details.",
|
||||
tools=[SUBMIT_TEST_TOOL],
|
||||
tool_executor=tool_executor,
|
||||
max_iterations=10,
|
||||
)
|
||||
|
||||
return self._create_tests_from_collected(collected_tests, goal.id)
|
||||
|
||||
def _format_constraints(self, constraints: list[Constraint]) -> str:
|
||||
"""Format constraints for prompt."""
|
||||
lines = []
|
||||
for c in constraints:
|
||||
lines.append(self._format_constraint(c))
|
||||
lines.append("")
|
||||
return "\n".join(lines)
|
||||
|
||||
def _format_constraint(self, constraint: Constraint) -> str:
|
||||
"""Format a single constraint for prompt."""
|
||||
severity = "HARD" if constraint.constraint_type == "hard" else "SOFT"
|
||||
return f"""### Constraint: {constraint.id}
|
||||
- Type: {severity} ({constraint.constraint_type})
|
||||
- Category: {constraint.category}
|
||||
- Description: {constraint.description}
|
||||
- Check: {constraint.check}"""
|
||||
|
||||
def _create_tests_from_collected(
|
||||
self, collected: list[dict], goal_id: str
|
||||
) -> list[Test]:
|
||||
"""Create Test objects from tool call data."""
|
||||
tests = []
|
||||
for td in collected:
|
||||
test = Test(
|
||||
id=f"test_{uuid.uuid4().hex[:8]}",
|
||||
goal_id=goal_id,
|
||||
parent_criteria_id=td.get("constraint_id", "unknown"),
|
||||
test_type=TestType.CONSTRAINT,
|
||||
test_name=td.get("test_name", "unnamed_test"),
|
||||
test_code=td.get("test_code", ""),
|
||||
description=td.get("description", ""),
|
||||
input=td.get("input", {}),
|
||||
expected_output=td.get("expected_output", {}),
|
||||
generated_by="llm",
|
||||
llm_confidence=float(td.get("confidence", 0.5)),
|
||||
approval_status=ApprovalStatus.PENDING,
|
||||
)
|
||||
tests.append(test)
|
||||
return tests
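
# --- Usage sketch (illustrative only) -----------------------------------------------
# A minimal example of how this generator is meant to be driven during the Goal stage,
# assuming an LLMProvider instance and a Goal with constraints already exist; the
# storage path is a placeholder.

def _example_generate_and_approve(llm: "LLMProvider", goal: Goal) -> list[Test]:
    """Generate constraint tests, approve them as-is, and persist the approved ones."""
    from framework.testing.test_storage import TestStorage

    generator = ConstraintTestGenerator(llm)
    storage = TestStorage("core/data/tests")  # placeholder path

    approved: list[Test] = []
    for test in generator.generate(goal):
        # In the real flow the user reviews each PENDING test before approval.
        test.approve(approved_by="user")
        storage.save_test(test)
        approved.append(test)
    return approved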
|
||||
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
Debug tool for analyzing failed tests.
|
||||
|
||||
Provides detailed information for debugging:
|
||||
- Test input and expected output
|
||||
- Actual output and error details
|
||||
- Error categorization
|
||||
- Runtime logs and execution path
|
||||
- Fix suggestions
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from framework.testing.test_case import Test
|
||||
from framework.testing.test_result import TestResult, ErrorCategory
|
||||
from framework.testing.test_storage import TestStorage
|
||||
from framework.testing.categorizer import ErrorCategorizer
|
||||
|
||||
|
||||
class DebugInfo(BaseModel):
|
||||
"""
|
||||
Comprehensive debug information for a failed test.
|
||||
"""
|
||||
test_id: str
|
||||
test_name: str
|
||||
|
||||
# Test definition
|
||||
input: dict[str, Any] = Field(default_factory=dict)
|
||||
expected: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
# Actual result
|
||||
actual: Any = None
|
||||
passed: bool = False
|
||||
|
||||
# Error details
|
||||
error_message: str | None = None
|
||||
error_category: str | None = None
|
||||
stack_trace: str | None = None
|
||||
|
||||
# Runtime data
|
||||
logs: list[dict[str, Any]] = Field(default_factory=list)
|
||||
runtime_data: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
# Fix guidance
|
||||
suggested_fix: str | None = None
|
||||
iteration_guidance: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""Convert to dict for JSON serialization."""
|
||||
return self.model_dump()
|
||||
|
||||
|
||||
class DebugTool:
|
||||
"""
|
||||
Debug tool for analyzing failed tests.
|
||||
|
||||
Integrates with:
|
||||
- TestStorage for test and result data
|
||||
- Runtime storage (optional) for decision logs
|
||||
- ErrorCategorizer for classification
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
test_storage: TestStorage,
|
||||
runtime_storage: Any | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize debug tool.
|
||||
|
||||
Args:
|
||||
test_storage: Storage for test and result data
|
||||
runtime_storage: Optional FileStorage for Runtime data
|
||||
"""
|
||||
self.test_storage = test_storage
|
||||
self.runtime_storage = runtime_storage
|
||||
self.categorizer = ErrorCategorizer()
|
||||
|
||||
def analyze(
|
||||
self,
|
||||
goal_id: str,
|
||||
test_id: str,
|
||||
run_id: str | None = None,
|
||||
) -> DebugInfo:
|
||||
"""
|
||||
Get detailed debug info for a failed test.
|
||||
|
||||
Args:
|
||||
goal_id: Goal ID containing the test
|
||||
test_id: ID of the test to analyze
|
||||
run_id: Optional Runtime run ID for detailed logs
|
||||
|
||||
Returns:
|
||||
DebugInfo with comprehensive debug data
|
||||
"""
|
||||
# Load test
|
||||
test = self.test_storage.load_test(goal_id, test_id)
|
||||
if not test:
|
||||
return DebugInfo(
|
||||
test_id=test_id,
|
||||
test_name="unknown",
|
||||
error_message=f"Test {test_id} not found in goal {goal_id}",
|
||||
)
|
||||
|
||||
# Load latest result
|
||||
result = self.test_storage.get_latest_result(test_id)
|
||||
|
||||
# Build debug info
|
||||
debug_info = DebugInfo(
|
||||
test_id=test_id,
|
||||
test_name=test.test_name,
|
||||
input=test.input,
|
||||
expected=test.expected_output,
|
||||
)
|
||||
|
||||
if result:
|
||||
debug_info.actual = result.actual_output
|
||||
debug_info.passed = result.passed
|
||||
debug_info.error_message = result.error_message
|
||||
debug_info.stack_trace = result.stack_trace
|
||||
debug_info.logs = result.runtime_logs
|
||||
|
||||
# Set category
|
||||
if result.error_category:
|
||||
debug_info.error_category = result.error_category.value
|
||||
elif not result.passed:
|
||||
# Categorize if not already done
|
||||
category = self.categorizer.categorize(result)
|
||||
if category:
|
||||
debug_info.error_category = category.value
|
||||
|
||||
# Get runtime data if available
|
||||
if run_id and self.runtime_storage:
|
||||
debug_info.runtime_data = self._get_runtime_data(run_id)
|
||||
|
||||
# Generate fix suggestions
|
||||
if debug_info.error_category:
|
||||
category = ErrorCategory(debug_info.error_category)
|
||||
debug_info.suggested_fix = self.categorizer.get_fix_suggestion(category)
|
||||
debug_info.iteration_guidance = self.categorizer.get_iteration_guidance(category)
|
||||
|
||||
return debug_info
|
||||
|
||||
def analyze_result(
|
||||
self,
|
||||
test: Test,
|
||||
result: TestResult,
|
||||
run_id: str | None = None,
|
||||
) -> DebugInfo:
|
||||
"""
|
||||
Analyze a test result directly (without loading from storage).
|
||||
|
||||
Args:
|
||||
test: The Test that was run
|
||||
result: The TestResult to analyze
|
||||
run_id: Optional Runtime run ID
|
||||
|
||||
Returns:
|
||||
DebugInfo with debug data
|
||||
"""
|
||||
debug_info = DebugInfo(
|
||||
test_id=test.id,
|
||||
test_name=test.test_name,
|
||||
input=test.input,
|
||||
expected=test.expected_output,
|
||||
actual=result.actual_output,
|
||||
passed=result.passed,
|
||||
error_message=result.error_message,
|
||||
stack_trace=result.stack_trace,
|
||||
logs=result.runtime_logs,
|
||||
)
|
||||
|
||||
# Categorize
|
||||
if result.error_category:
|
||||
debug_info.error_category = result.error_category.value
|
||||
elif not result.passed:
|
||||
category = self.categorizer.categorize(result)
|
||||
if category:
|
||||
debug_info.error_category = category.value
|
||||
|
||||
# Runtime data
|
||||
if run_id and self.runtime_storage:
|
||||
debug_info.runtime_data = self._get_runtime_data(run_id)
|
||||
|
||||
# Fix suggestions
|
||||
if debug_info.error_category:
|
||||
category = ErrorCategory(debug_info.error_category)
|
||||
debug_info.suggested_fix = self.categorizer.get_fix_suggestion(category)
|
||||
debug_info.iteration_guidance = self.categorizer.get_iteration_guidance(category)
|
||||
|
||||
return debug_info
|
||||
|
||||
def get_failure_summary(
|
||||
self,
|
||||
goal_id: str,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Get summary of all failures for a goal.
|
||||
|
||||
Returns:
|
||||
Dict with failure counts by category and test IDs
|
||||
"""
|
||||
tests = self.test_storage.get_tests_by_goal(goal_id)
|
||||
|
||||
failures_by_category: dict[str, list[str]] = {
|
||||
"logic_error": [],
|
||||
"implementation_error": [],
|
||||
"edge_case": [],
|
||||
"uncategorized": [],
|
||||
}
|
||||
|
||||
for test in tests:
|
||||
if test.last_result == "failed":
|
||||
result = self.test_storage.get_latest_result(test.id)
|
||||
if result and result.error_category:
|
||||
failures_by_category[result.error_category.value].append(test.id)
|
||||
else:
|
||||
failures_by_category["uncategorized"].append(test.id)
|
||||
|
||||
return {
|
||||
"goal_id": goal_id,
|
||||
"total_failures": sum(len(ids) for ids in failures_by_category.values()),
|
||||
"by_category": failures_by_category,
|
||||
"iteration_suggestions": self._get_iteration_suggestions(failures_by_category),
|
||||
}
|
||||
|
||||
def _get_runtime_data(self, run_id: str) -> dict[str, Any]:
|
||||
"""Extract runtime data from Runtime storage."""
|
||||
if not self.runtime_storage:
|
||||
return {}
|
||||
|
||||
try:
|
||||
run = self.runtime_storage.load_run(run_id)
|
||||
if not run:
|
||||
return {"error": f"Run {run_id} not found"}
|
||||
|
||||
return {
|
||||
"execution_path": run.metrics.nodes_executed if hasattr(run, "metrics") else [],
|
||||
"decisions": [
|
||||
d.model_dump() if hasattr(d, "model_dump") else str(d)
|
||||
for d in getattr(run, "decisions", [])
|
||||
],
|
||||
"problems": [
|
||||
p.model_dump() if hasattr(p, "model_dump") else str(p)
|
||||
for p in getattr(run, "problems", [])
|
||||
],
|
||||
"status": run.status.value if hasattr(run, "status") else "unknown",
|
||||
}
|
||||
except Exception as e:
|
||||
return {"error": f"Failed to load runtime data: {e}"}
|
||||
|
||||
def _get_iteration_suggestions(
|
||||
self,
|
||||
failures_by_category: dict[str, list[str]],
|
||||
) -> list[str]:
|
||||
"""Generate iteration suggestions based on failure categories."""
|
||||
suggestions = []
|
||||
|
||||
if failures_by_category["logic_error"]:
|
||||
suggestions.append(
|
||||
f"Found {len(failures_by_category['logic_error'])} logic errors. "
|
||||
"Review and update Goal success_criteria/constraints, then restart "
|
||||
"the full Goal → Agent → Eval flow."
|
||||
)
|
||||
|
||||
if failures_by_category["implementation_error"]:
|
||||
suggestions.append(
|
||||
f"Found {len(failures_by_category['implementation_error'])} implementation errors. "
|
||||
"Fix agent node/edge code and re-run Eval."
|
||||
)
|
||||
|
||||
if failures_by_category["edge_case"]:
|
||||
suggestions.append(
|
||||
f"Found {len(failures_by_category['edge_case'])} edge cases. "
|
||||
"These are new scenarios - add tests for them."
|
||||
)
|
||||
|
||||
if failures_by_category["uncategorized"]:
|
||||
suggestions.append(
|
||||
f"Found {len(failures_by_category['uncategorized'])} uncategorized failures. "
|
||||
"Manual review required."
|
||||
)
|
||||
|
||||
return suggestions
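
# --- Usage sketch (illustrative only) -----------------------------------------------
# Assumes tests have already been executed and their results saved via TestStorage;
# the goal and test IDs below are placeholders.

def _example_debug_failed_test(storage: TestStorage) -> DebugInfo:
    """Summarize failures for a goal, then drill into one failed test."""
    tool = DebugTool(test_storage=storage)

    summary = tool.get_failure_summary(goal_id="goal_123")
    print(summary["total_failures"], summary["by_category"])

    info = tool.analyze(goal_id="goal_123", test_id="test_abcd1234")
    print(info.error_category, info.suggested_fix)
    return info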
|
||||
@@ -0,0 +1,407 @@
|
||||
"""
|
||||
Single test executor.
|
||||
|
||||
Executes a single test against an agent and returns a TestResult.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import inspect
|
||||
import time
|
||||
import traceback
|
||||
from typing import Any, Protocol, runtime_checkable
|
||||
|
||||
from framework.testing.test_case import Test
|
||||
from framework.testing.test_result import TestResult, ErrorCategory
|
||||
from framework.testing.categorizer import ErrorCategorizer
|
||||
|
||||
|
||||
class LLMJudge:
|
||||
"""
|
||||
LLM-based judge for semantic evaluation of test results.
|
||||
|
||||
Used by tests that need to evaluate semantic properties like
|
||||
"no hallucination" or "preserves meaning" that can't be checked
|
||||
with simple assertions.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the LLM judge."""
|
||||
self._client = None
|
||||
|
||||
def _get_client(self):
|
||||
"""Lazy-load the Anthropic client."""
|
||||
if self._client is None:
|
||||
try:
|
||||
import anthropic
|
||||
self._client = anthropic.Anthropic()
|
||||
except ImportError:
|
||||
raise RuntimeError("anthropic package required for LLM judge")
|
||||
return self._client
|
||||
|
||||
def evaluate(
|
||||
self,
|
||||
constraint: str,
|
||||
source_document: str,
|
||||
summary: str,
|
||||
criteria: str,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Evaluate whether a summary meets a constraint.
|
||||
|
||||
Args:
|
||||
constraint: The constraint being tested (e.g., "no-hallucination")
|
||||
source_document: The original document
|
||||
summary: The generated summary to evaluate
|
||||
criteria: Human-readable criteria for evaluation
|
||||
|
||||
Returns:
|
||||
Dict with 'passes' (bool) and 'explanation' (str)
|
||||
"""
|
||||
client = self._get_client()
|
||||
|
||||
prompt = f"""You are evaluating whether a summary meets a specific constraint.
|
||||
|
||||
CONSTRAINT: {constraint}
|
||||
CRITERIA: {criteria}
|
||||
|
||||
SOURCE DOCUMENT:
|
||||
{source_document}
|
||||
|
||||
SUMMARY TO EVALUATE:
|
||||
{summary}
|
||||
|
||||
Evaluate whether the summary meets the constraint. Be strict but fair.
|
||||
|
||||
Respond with JSON in this exact format:
|
||||
{{"passes": true/false, "explanation": "brief explanation of your judgment"}}
|
||||
|
||||
Only output the JSON, nothing else."""
|
||||
|
||||
try:
|
||||
response = client.messages.create(
|
||||
model="claude-haiku-4-5-20251001",
|
||||
max_tokens=500,
|
||||
messages=[{"role": "user", "content": prompt}]
|
||||
)
|
||||
|
||||
# Parse the response
|
||||
import json
|
||||
text = response.content[0].text.strip()
|
||||
# Handle potential markdown code blocks
|
||||
if text.startswith("```"):
|
||||
text = text.split("```")[1]
|
||||
if text.startswith("json"):
|
||||
text = text[4:]
|
||||
text = text.strip()
|
||||
|
||||
result = json.loads(text)
|
||||
return {
|
||||
"passes": bool(result.get("passes", False)),
|
||||
"explanation": result.get("explanation", "No explanation provided")
|
||||
}
|
||||
except Exception as e:
|
||||
# On error, fail the test with explanation
|
||||
return {
|
||||
"passes": False,
|
||||
"explanation": f"LLM judge error: {e}"
|
||||
}
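
# --- Illustrative judge usage (placeholder document, fields, and criteria) ----------
# A sketch of how generated test code can use the `llm_judge` that TestExecutor
# (defined below) injects by parameter name; the shape of the agent output is an
# assumption for the example.

def _example_judge_test(agent: Any, llm_judge: LLMJudge) -> None:
    source = "ACME Corp reported revenue of $10M in Q3."
    summary = agent.run({"document": source}).get("summary", "")
    verdict = llm_judge.evaluate(
        constraint="no-hallucination",
        source_document=source,
        summary=summary,
        criteria="Every claim in the summary must be supported by the source document",
    )
    assert verdict["passes"], verdict["explanation"]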
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class AgentProtocol(Protocol):
|
||||
"""Protocol for agent that can be tested."""
|
||||
|
||||
def run(self, input: dict[str, Any]) -> Any:
|
||||
"""Run the agent with input and return result."""
|
||||
...
|
||||
|
||||
|
||||
class SyncAgentWrapper:
|
||||
"""
|
||||
Wrapper that makes async agent.run() callable synchronously.
|
||||
|
||||
This allows tests to call agent.run() without async/await syntax,
|
||||
which simplifies test code generation and execution.
|
||||
"""
|
||||
|
||||
def __init__(self, agent: Any):
|
||||
self._agent = agent
|
||||
self._loop: asyncio.AbstractEventLoop | None = None
|
||||
|
||||
def run(self, input_data: dict[str, Any]) -> Any:
|
||||
"""
|
||||
Run agent synchronously by wrapping async call.
|
||||
|
||||
Args:
|
||||
input_data: Input data for the agent
|
||||
|
||||
Returns:
|
||||
Output dict from the agent's ExecutionResult
|
||||
"""
|
||||
coro = self._agent.run(input_data)
|
||||
|
||||
        # Refuse to run if we're already inside a running event loop
        # (run_until_complete would fail there); this shouldn't happen in
        # normal test execution.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop - safe to drive the coroutine below
            pass
        else:
            raise RuntimeError("Cannot run sync wrapper from async context")

        # Get or create the event loop; it is intentionally left open so it
        # can be reused for subsequent calls.
        if self._loop is None or self._loop.is_closed():
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)
        return self._loop.run_until_complete(coro).output
|
||||
|
||||
def __getattr__(self, name: str) -> Any:
|
||||
"""Forward other attribute access to wrapped agent."""
|
||||
return getattr(self._agent, name)
|
||||
|
||||
|
||||
class TestExecutor:
|
||||
"""
|
||||
Execute a single test against an agent.
|
||||
|
||||
Handles:
|
||||
- Test code compilation and execution
|
||||
- Timing measurement
|
||||
- Error capture and categorization
|
||||
- Result creation
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
categorizer: ErrorCategorizer | None = None,
|
||||
timeout: float = 60.0,
|
||||
):
|
||||
"""
|
||||
Initialize executor.
|
||||
|
||||
Args:
|
||||
categorizer: ErrorCategorizer for classifying failures
|
||||
timeout: Maximum test execution time in seconds
|
||||
"""
|
||||
self.categorizer = categorizer or ErrorCategorizer()
|
||||
self.timeout = timeout
|
||||
|
||||
def execute(
|
||||
self,
|
||||
test: Test,
|
||||
agent: AgentProtocol,
|
||||
capture_logs: bool = True,
|
||||
) -> TestResult:
|
||||
"""
|
||||
Execute a test against an agent.
|
||||
|
||||
Args:
|
||||
test: Test to execute
|
||||
agent: Agent instance to test
|
||||
capture_logs: Whether to capture runtime logs
|
||||
|
||||
Returns:
|
||||
TestResult with execution details
|
||||
"""
|
||||
start_time = time.perf_counter()
|
||||
|
||||
try:
|
||||
# Build test environment
|
||||
test_globals = self._build_test_globals(agent, test)
|
||||
|
||||
# Compile test code
|
||||
try:
|
||||
compiled = compile(test.test_code, f"<test:{test.test_name}>", "exec")
|
||||
except SyntaxError as e:
|
||||
return self._create_error_result(
|
||||
test=test,
|
||||
start_time=start_time,
|
||||
error_message=f"Test code syntax error: {e}",
|
||||
stack_trace=traceback.format_exc(),
|
||||
)
|
||||
|
||||
# Execute test
|
||||
try:
|
||||
exec(compiled, test_globals)
|
||||
|
||||
# Look for test function and call it
|
||||
test_func = test_globals.get(test.test_name)
|
||||
if test_func is None:
|
||||
# Try to find any function starting with test_
|
||||
for name, obj in test_globals.items():
|
||||
if name.startswith("test_") and callable(obj):
|
||||
test_func = obj
|
||||
break
|
||||
|
||||
if test_func is None:
|
||||
return self._create_error_result(
|
||||
test=test,
|
||||
start_time=start_time,
|
||||
error_message=f"Test function '{test.test_name}' not found in test code",
|
||||
)
|
||||
|
||||
# Call the test function with appropriate arguments
|
||||
# Inspect the function signature to determine what to pass
|
||||
sig = inspect.signature(test_func)
|
||||
params = list(sig.parameters.keys())
|
||||
|
||||
# Build arguments based on what the function expects
|
||||
call_args = []
|
||||
for param in params:
|
||||
if param == "agent":
|
||||
call_args.append(test_globals["agent"])
|
||||
elif param == "llm_judge":
|
||||
call_args.append(test_globals["llm_judge"])
|
||||
elif param in test_globals:
|
||||
call_args.append(test_globals[param])
|
||||
else:
|
||||
# Unknown parameter - this will likely cause an error
|
||||
# but we let it happen naturally
|
||||
break
|
||||
|
||||
test_func(*call_args)
|
||||
|
||||
# Test passed
|
||||
duration_ms = int((time.perf_counter() - start_time) * 1000)
|
||||
return TestResult(
|
||||
test_id=test.id,
|
||||
passed=True,
|
||||
duration_ms=duration_ms,
|
||||
expected_output=test.expected_output,
|
||||
actual_output={"status": "passed"},
|
||||
)
|
||||
|
||||
except AssertionError as e:
|
||||
return self._create_failure_result(
|
||||
test=test,
|
||||
start_time=start_time,
|
||||
error_message=str(e) or "Assertion failed",
|
||||
stack_trace=traceback.format_exc(),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
return self._create_failure_result(
|
||||
test=test,
|
||||
start_time=start_time,
|
||||
error_message=f"{type(e).__name__}: {e}",
|
||||
stack_trace=traceback.format_exc(),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
return self._create_error_result(
|
||||
test=test,
|
||||
start_time=start_time,
|
||||
error_message=f"Test execution error: {e}",
|
||||
stack_trace=traceback.format_exc(),
|
||||
)
|
||||
|
||||
def _build_test_globals(
|
||||
self,
|
||||
agent: AgentProtocol,
|
||||
test: Test,
|
||||
) -> dict[str, Any]:
|
||||
"""Build the globals dict for test execution."""
|
||||
# Wrap async agents in a sync wrapper so test code can call agent.run()
|
||||
# without async/await syntax
|
||||
wrapped_agent = self._wrap_agent_if_async(agent)
|
||||
|
||||
return {
|
||||
"__builtins__": __builtins__,
|
||||
"agent": wrapped_agent,
|
||||
"llm_judge": LLMJudge(), # For semantic evaluation tests
|
||||
"test_input": test.input,
|
||||
"expected_output": test.expected_output,
|
||||
# Common test utilities
|
||||
"assert": assert_, # Built-in
|
||||
"isinstance": isinstance,
|
||||
"len": len,
|
||||
"str": str,
|
||||
"int": int,
|
||||
"float": float,
|
||||
"list": list,
|
||||
"dict": dict,
|
||||
"set": set,
|
||||
"tuple": tuple,
|
||||
"any": any,
|
||||
"all": all,
|
||||
"print": print, # For debugging
|
||||
}
|
||||
|
||||
def _wrap_agent_if_async(self, agent: AgentProtocol) -> Any:
|
||||
"""
|
||||
Wrap agent if its run() method is async.
|
||||
|
||||
Args:
|
||||
agent: Agent to potentially wrap
|
||||
|
||||
Returns:
|
||||
SyncAgentWrapper if agent.run() is async, otherwise the original agent
|
||||
"""
|
||||
run_method = getattr(agent, "run", None)
|
||||
if run_method is None:
|
||||
return agent
|
||||
|
||||
# Check if run() is a coroutine function
|
||||
if inspect.iscoroutinefunction(run_method):
|
||||
return SyncAgentWrapper(agent)
|
||||
|
||||
return agent
|
||||
|
||||
def _create_failure_result(
|
||||
self,
|
||||
test: Test,
|
||||
start_time: float,
|
||||
error_message: str,
|
||||
stack_trace: str | None = None,
|
||||
) -> TestResult:
|
||||
"""Create a result for a test that failed assertions."""
|
||||
duration_ms = int((time.perf_counter() - start_time) * 1000)
|
||||
|
||||
result = TestResult(
|
||||
test_id=test.id,
|
||||
passed=False,
|
||||
duration_ms=duration_ms,
|
||||
expected_output=test.expected_output,
|
||||
error_message=error_message,
|
||||
stack_trace=stack_trace,
|
||||
)
|
||||
|
||||
# Categorize the error
|
||||
result.error_category = self.categorizer.categorize(result)
|
||||
|
||||
return result
|
||||
|
||||
def _create_error_result(
|
||||
self,
|
||||
test: Test,
|
||||
start_time: float,
|
||||
error_message: str,
|
||||
stack_trace: str | None = None,
|
||||
) -> TestResult:
|
||||
"""Create a result for a test that couldn't run."""
|
||||
duration_ms = int((time.perf_counter() - start_time) * 1000)
|
||||
|
||||
result = TestResult(
|
||||
test_id=test.id,
|
||||
passed=False,
|
||||
duration_ms=duration_ms,
|
||||
error_message=error_message,
|
||||
stack_trace=stack_trace,
|
||||
)
|
||||
|
||||
# Implementation error for test setup failures
|
||||
result.error_category = ErrorCategory.IMPLEMENTATION_ERROR
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def assert_(condition: bool, message: str = "") -> None:
|
||||
"""Assert helper with message."""
|
||||
if not condition:
|
||||
raise AssertionError(message)
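
# --- Example of the test code this executor runs (illustrative) ---------------------
# Test.test_code is exec()'d in a namespace containing `agent`, `llm_judge`,
# `test_input`, and `expected_output`, and the function named by Test.test_name is
# then called with arguments matched by parameter name. A generated constraint test
# might look like this (the input fields and limits are placeholders):

EXAMPLE_TEST_CODE = '''
def test_constraint_api_limits_respected(agent):
    result = agent.run({"query": "find videos", "max_results": 5})
    assert isinstance(result, dict), "agent.run() should return an output dict"
    assert len(result.get("videos", [])) <= 5, "must not exceed the requested limit"
'''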
|
||||
@@ -0,0 +1,344 @@
|
||||
"""
|
||||
Parallel test runner inspired by pytest-xdist.
|
||||
|
||||
Features:
|
||||
- Per-test parallelism: Each test runs independently with load balancing
|
||||
- Worker initialization: Agent created once per worker thread (not per test)
|
||||
- Thread-based parallelism: Uses ThreadPoolExecutor for I/O-bound LLM calls
|
||||
- Fail-fast option: Stop on first failure
|
||||
"""
|
||||
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass, field
|
||||
from multiprocessing import cpu_count
|
||||
from typing import Any, Callable, Protocol, runtime_checkable
|
||||
|
||||
from framework.testing.test_case import Test
|
||||
from framework.testing.test_result import TestResult, TestSuiteResult
|
||||
from framework.testing.test_storage import TestStorage
|
||||
from framework.testing.executor import TestExecutor, AgentProtocol
|
||||
from framework.testing.categorizer import ErrorCategorizer
|
||||
|
||||
|
||||
# Thread-local storage for worker agents
|
||||
# Each worker thread gets its own agent instance to avoid race conditions
|
||||
_thread_local = threading.local()
|
||||
|
||||
|
||||
def _init_worker(agent_factory: Any) -> None:
|
||||
"""
|
||||
Initialize worker thread with its own agent instance.
|
||||
|
||||
Called once per worker thread when the ThreadPoolExecutor starts.
|
||||
The agent is stored in thread-local storage and reused for all tests
|
||||
executed by this worker.
|
||||
"""
|
||||
if hasattr(agent_factory, "create"):
|
||||
_thread_local.agent = agent_factory.create()
|
||||
else:
|
||||
_thread_local.agent = agent_factory()
|
||||
|
||||
|
||||
def _run_single_test(test: Test, timeout: float) -> TestResult:
|
||||
"""
|
||||
Run a single test using the worker's pre-initialized agent.
|
||||
|
||||
Args:
|
||||
test: Test to execute
|
||||
timeout: Timeout per test in seconds
|
||||
|
||||
Returns:
|
||||
TestResult with execution details
|
||||
"""
|
||||
executor = TestExecutor(
|
||||
categorizer=ErrorCategorizer(),
|
||||
timeout=timeout,
|
||||
)
|
||||
return executor.execute(test, _thread_local.agent)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParallelConfig:
|
||||
"""Configuration for parallel test execution."""
|
||||
|
||||
num_workers: int = field(default_factory=cpu_count)
|
||||
timeout_per_test: float = 60.0 # seconds
|
||||
fail_fast: bool = False
|
||||
mock_external_apis: bool = True
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class AgentFactoryProtocol(Protocol):
|
||||
"""Protocol for creating agent instances."""
|
||||
|
||||
def create(self) -> AgentProtocol:
|
||||
"""Create a new agent instance."""
|
||||
...
|
||||
|
||||
|
||||
class AgentFactory:
|
||||
"""Picklable factory that creates AgentRunner instances from a path.
|
||||
|
||||
This class is used instead of a lambda for parallel test execution,
|
||||
since lambdas capturing local variables cannot be pickled by ProcessPoolExecutor.
|
||||
"""
|
||||
|
||||
def __init__(self, agent_path: str):
|
||||
self.agent_path = agent_path
|
||||
|
||||
def create(self):
|
||||
from framework.runner import AgentRunner
|
||||
return AgentRunner.load(self.agent_path)
|
||||
|
||||
|
||||
class ParallelTestRunner:
|
||||
"""
|
||||
Parallel test execution using ThreadPoolExecutor.
|
||||
|
||||
Key features:
|
||||
- Per-test distribution: Tests distributed individually for load balancing
|
||||
- Worker initialization: Each worker thread creates one agent at startup
|
||||
- Thread-based parallelism: Uses threads (not processes) for I/O-bound LLM calls
|
||||
- Thread-local storage: Each worker has isolated agent state via threading.local()
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: ParallelConfig | None = None,
|
||||
storage: TestStorage | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize parallel runner.
|
||||
|
||||
Args:
|
||||
config: Parallel execution configuration
|
||||
storage: TestStorage for saving results
|
||||
"""
|
||||
self.config = config or ParallelConfig()
|
||||
self.storage = storage
|
||||
self.categorizer = ErrorCategorizer()
|
||||
|
||||
def run_all(
|
||||
self,
|
||||
goal_id: str,
|
||||
agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol],
|
||||
tests: list[Test] | None = None,
|
||||
on_result: Callable[[TestResult], None] | None = None,
|
||||
) -> TestSuiteResult:
|
||||
"""
|
||||
Run all approved tests for a goal.
|
||||
|
||||
Args:
|
||||
goal_id: Goal ID to run tests for
|
||||
agent_factory: Factory for creating agent instances
|
||||
tests: Optional list of tests (loads from storage if not provided)
|
||||
on_result: Optional callback for each test result
|
||||
|
||||
Returns:
|
||||
TestSuiteResult with summary and individual results
|
||||
"""
|
||||
# Load tests if not provided
|
||||
if tests is None:
|
||||
if self.storage is None:
|
||||
raise ValueError("Either tests or storage must be provided")
|
||||
tests = self.storage.get_approved_tests(goal_id)
|
||||
|
||||
if not tests:
|
||||
return TestSuiteResult(
|
||||
goal_id=goal_id,
|
||||
total=0,
|
||||
passed=0,
|
||||
failed=0,
|
||||
)
|
||||
|
||||
# Execute tests
|
||||
results: list[TestResult] = []
|
||||
|
||||
if self.config.num_workers <= 1:
|
||||
# Sequential execution - create single agent and run all tests
|
||||
results = self._run_sequential(tests, agent_factory, on_result)
|
||||
else:
|
||||
# Parallel execution with per-test distribution
|
||||
results = self._run_parallel(tests, agent_factory, on_result)
|
||||
|
||||
# Save results if storage available
|
||||
if self.storage:
|
||||
# Create test_id -> test mapping for lookup
|
||||
test_map = {t.id: t for t in tests}
|
||||
|
||||
for result in results:
|
||||
# Update the Test object with execution result
|
||||
if result.test_id in test_map:
|
||||
test = test_map[result.test_id]
|
||||
test.record_result(result.passed)
|
||||
self.storage.update_test(test)
|
||||
|
||||
# Save the TestResult
|
||||
self.storage.save_result(result.test_id, result)
|
||||
|
||||
# Create suite result
|
||||
return self._create_suite_result(goal_id, results)
|
||||
|
||||
def run_tests(
|
||||
self,
|
||||
tests: list[Test],
|
||||
agent: AgentProtocol,
|
||||
on_result: Callable[[TestResult], None] | None = None,
|
||||
) -> list[TestResult]:
|
||||
"""
|
||||
Run a list of tests against an agent instance.
|
||||
|
||||
Args:
|
||||
tests: Tests to run
|
||||
agent: Agent instance to test
|
||||
on_result: Optional callback for each result
|
||||
|
||||
Returns:
|
||||
List of TestResult
|
||||
"""
|
||||
executor = TestExecutor(
|
||||
categorizer=self.categorizer,
|
||||
timeout=self.config.timeout_per_test,
|
||||
)
|
||||
|
||||
results = []
|
||||
for test in tests:
|
||||
result = executor.execute(test, agent)
|
||||
results.append(result)
|
||||
|
||||
if on_result:
|
||||
on_result(result)
|
||||
|
||||
# Fail-fast check
|
||||
if self.config.fail_fast and not result.passed:
|
||||
break
|
||||
|
||||
return results
|
||||
|
||||
def _run_sequential(
|
||||
self,
|
||||
tests: list[Test],
|
||||
agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol],
|
||||
on_result: Callable[[TestResult], None] | None = None,
|
||||
) -> list[TestResult]:
|
||||
"""Run tests sequentially with a single agent instance."""
|
||||
results = []
|
||||
executor = TestExecutor(
|
||||
categorizer=self.categorizer,
|
||||
timeout=self.config.timeout_per_test,
|
||||
)
|
||||
|
||||
# Create single agent for all tests
|
||||
if isinstance(agent_factory, AgentFactoryProtocol):
|
||||
agent = agent_factory.create()
|
||||
else:
|
||||
agent = agent_factory()
|
||||
|
||||
# Run all tests
|
||||
for test in tests:
|
||||
result = executor.execute(test, agent)
|
||||
results.append(result)
|
||||
|
||||
if on_result:
|
||||
on_result(result)
|
||||
|
||||
# Fail-fast
|
||||
if self.config.fail_fast and not result.passed:
|
||||
return results
|
||||
|
||||
return results
|
||||
|
||||
def _run_parallel(
|
||||
self,
|
||||
tests: list[Test],
|
||||
agent_factory: AgentFactoryProtocol | Callable[[], AgentProtocol],
|
||||
on_result: Callable[[TestResult], None] | None = None,
|
||||
) -> list[TestResult]:
|
||||
"""
|
||||
Run tests in parallel using ThreadPoolExecutor with worker initialization.
|
||||
|
||||
Each worker thread creates ONE agent instance at startup and reuses it
|
||||
for all tests assigned to that worker. Tests are distributed individually
|
||||
for true load-balanced parallelism.
|
||||
|
||||
Uses threads instead of processes because LLM API calls are I/O-bound,
|
||||
and threads have lower overhead (no pickling, shared memory).
|
||||
"""
|
||||
results = []
|
||||
failed = False
|
||||
|
||||
with ThreadPoolExecutor(
|
||||
max_workers=self.config.num_workers,
|
||||
initializer=_init_worker,
|
||||
initargs=(agent_factory,),
|
||||
) as executor:
|
||||
# Submit each test individually for true parallelism
|
||||
futures = {
|
||||
executor.submit(_run_single_test, test, self.config.timeout_per_test): test
|
||||
for test in tests
|
||||
}
|
||||
|
||||
for future in as_completed(futures):
|
||||
test = futures[future]
|
||||
try:
|
||||
result = future.result(timeout=self.config.timeout_per_test + 30)
|
||||
results.append(result)
|
||||
|
||||
if on_result:
|
||||
on_result(result)
|
||||
|
||||
if not result.passed:
|
||||
failed = True
|
||||
|
||||
except TimeoutError:
|
||||
result = TestResult(
|
||||
test_id=test.id,
|
||||
passed=False,
|
||||
duration_ms=int(self.config.timeout_per_test * 1000),
|
||||
error_message="Test timed out",
|
||||
)
|
||||
results.append(result)
|
||||
if on_result:
|
||||
on_result(result)
|
||||
failed = True
|
||||
|
||||
except Exception as e:
|
||||
result = TestResult(
|
||||
test_id=test.id,
|
||||
passed=False,
|
||||
duration_ms=0,
|
||||
error_message=f"Execution error: {e}",
|
||||
)
|
||||
results.append(result)
|
||||
if on_result:
|
||||
on_result(result)
|
||||
failed = True
|
||||
|
||||
# Fail-fast
|
||||
if self.config.fail_fast and failed:
|
||||
executor.shutdown(wait=False, cancel_futures=True)
|
||||
break
|
||||
|
||||
return results
|
||||
|
||||
def _create_suite_result(
|
||||
self,
|
||||
goal_id: str,
|
||||
results: list[TestResult],
|
||||
) -> TestSuiteResult:
|
||||
"""Create TestSuiteResult from individual results."""
|
||||
passed = sum(1 for r in results if r.passed)
|
||||
failed = len(results) - passed
|
||||
total_duration = sum(r.duration_ms for r in results)
|
||||
|
||||
return TestSuiteResult(
|
||||
goal_id=goal_id,
|
||||
total=len(results),
|
||||
passed=passed,
|
||||
failed=failed,
|
||||
results=results,
|
||||
duration_ms=total_duration,
|
||||
)
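
# --- Usage sketch (illustrative only) -----------------------------------------------
# Runs all approved tests for a goal with a small thread pool. The storage path,
# agent export path, and worker count are placeholders.

def _example_run_suite(goal_id: str) -> TestSuiteResult:
    """Run approved tests in parallel and print a one-line summary per result."""
    runner = ParallelTestRunner(
        config=ParallelConfig(num_workers=4, fail_fast=True),
        storage=TestStorage("core/data/tests"),  # placeholder path
    )
    return runner.run_all(
        goal_id=goal_id,
        agent_factory=AgentFactory("exports/my-agent"),  # placeholder export path
        on_result=lambda result: print(result.summary_dict()),
    )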
|
||||
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
LLM prompt templates for test generation.
|
||||
|
||||
These prompts instruct the LLM to generate pytest-compatible tests
|
||||
from Goal success_criteria and constraints using tool calling.
|
||||
"""
|
||||
|
||||
CONSTRAINT_TEST_PROMPT = """You are generating test cases for an AI agent's constraints.
|
||||
|
||||
## Goal
|
||||
Name: {goal_name}
|
||||
Description: {goal_description}
|
||||
|
||||
## Constraints to Test
|
||||
{constraints_formatted}
|
||||
|
||||
## Instructions
|
||||
For each constraint, generate pytest-compatible tests that verify the constraint is satisfied.
|
||||
|
||||
For EACH test, call the `submit_test` tool with:
|
||||
- constraint_id: The ID of the constraint being tested
|
||||
- test_name: A descriptive pytest function name (test_constraint_<constraint_id>_<scenario>)
|
||||
- test_code: Complete Python test function code
|
||||
- description: What the test validates
|
||||
- input: Test input data as an object
|
||||
- expected_output: Expected output as an object
|
||||
- confidence: 0-1 score based on how testable/well-defined the constraint is
|
||||
|
||||
Consider for each constraint:
|
||||
- Happy path: Normal execution that should satisfy the constraint
|
||||
- Boundary conditions: Inputs at the edge of constraint boundaries
|
||||
- Violation scenarios: Inputs that should trigger constraint violation
|
||||
|
||||
The test code should:
|
||||
- Be valid Python using pytest conventions
|
||||
- Use `agent.run(input)` to execute the agent
|
||||
- Include descriptive assertion messages
|
||||
- Handle potential exceptions appropriately
|
||||
|
||||
Generate tests now by calling submit_test for each test."""
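
# Illustrative example of a single submit_test tool call the prompt above should
# elicit (the constraint ID, input fields, code, and thresholds are placeholders):
EXAMPLE_CONSTRAINT_SUBMIT_TEST = {
    "constraint_id": "c1_api_limits",
    "test_name": "test_constraint_c1_api_limits_happy_path",
    "test_code": (
        "def test_constraint_c1_api_limits_happy_path(agent):\n"
        "    result = agent.run({'query': 'python tutorials', 'max_results': 3})\n"
        "    assert len(result.get('api_calls', [])) <= 10, 'stays within the API call budget'\n"
    ),
    "description": "A normal query stays within the per-run API call budget",
    "input": {"query": "python tutorials", "max_results": 3},
    "expected_output": {"max_api_calls": 10},
    "confidence": 0.8,
}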
|
||||
|
||||
SUCCESS_CRITERIA_TEST_PROMPT = """You are generating success criteria tests for an AI agent.
|
||||
|
||||
## Goal
|
||||
Name: {goal_name}
|
||||
Description: {goal_description}
|
||||
|
||||
## Success Criteria
|
||||
{success_criteria_formatted}
|
||||
|
||||
## Agent Flow (for context)
|
||||
Nodes: {node_names}
|
||||
Tools: {tool_names}
|
||||
|
||||
## Instructions
|
||||
For each success criterion, generate tests that verify the agent achieves its goals.
|
||||
|
||||
For EACH test, call the `submit_test` tool with:
|
||||
- criteria_id: The ID of the success criterion being tested
|
||||
- test_name: A descriptive pytest function name (test_<criteria_id>_<scenario>)
|
||||
- test_code: Complete Python test function code
|
||||
- description: What the test validates
|
||||
- input: Test input data as an object
|
||||
- expected_output: Expected output as an object
|
||||
- confidence: 0-1 score based on how measurable/specific the criterion is
|
||||
|
||||
Consider for each criterion:
|
||||
- Happy path: Normal successful execution
|
||||
- Boundary conditions: Exactly at target thresholds (if applicable)
|
||||
- Graceful handling: Near-misses and edge cases
|
||||
|
||||
The test code should:
|
||||
- Be valid Python using pytest conventions
|
||||
- Use `agent.run(input)` to execute the agent
|
||||
- Validate the metric defined in the success criterion
|
||||
- Include descriptive assertion messages
|
||||
|
||||
Generate tests now by calling submit_test for each test."""
|
||||
|
||||
EDGE_CASE_TEST_PROMPT = """You are generating edge case tests for an AI agent.
|
||||
|
||||
## Goal
|
||||
Name: {goal_name}
|
||||
Description: {goal_description}
|
||||
|
||||
## Existing Tests
|
||||
{existing_tests_summary}
|
||||
|
||||
## Recent Failures (if any)
|
||||
{failures_summary}
|
||||
|
||||
## Instructions
|
||||
Generate additional edge case tests that cover scenarios not addressed by existing tests.
|
||||
|
||||
Focus on:
|
||||
1. Unusual input formats or values
|
||||
2. Empty or null inputs
|
||||
3. Extremely large or small values
|
||||
4. Unicode and special characters
|
||||
5. Concurrent or timing-related scenarios
|
||||
6. Network/API failure simulations (if applicable)
|
||||
|
||||
For EACH test, call the `submit_test` tool with:
|
||||
- criteria_id: An identifier for the edge case category being tested
|
||||
- test_name: A descriptive pytest function name (test_edge_case_<scenario>)
|
||||
- test_code: Complete Python test function code
|
||||
- description: What the test validates
|
||||
- input: Test input data as an object
|
||||
- expected_output: Expected output as an object
|
||||
- confidence: 0-1 score
|
||||
|
||||
Generate edge case tests now by calling submit_test for each test."""
|
||||
@@ -0,0 +1,219 @@
|
||||
"""
|
||||
Success criteria test generator.
|
||||
|
||||
Generates tests for Goal success_criteria using LLM.
|
||||
Tests are returned with PENDING approval status.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from framework.graph.goal import Goal, SuccessCriterion
|
||||
from framework.testing.test_case import Test, TestType, ApprovalStatus
|
||||
from framework.testing.prompts import SUCCESS_CRITERIA_TEST_PROMPT
|
||||
from framework.llm.provider import Tool, ToolUse, ToolResult
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from framework.llm.provider import LLMProvider
|
||||
|
||||
|
||||
# Tool for collecting generated tests - Claude handles JSON escaping automatically
|
||||
SUBMIT_TEST_TOOL = Tool(
|
||||
name="submit_test",
|
||||
description="Submit a generated success criteria test. Call once per test.",
|
||||
parameters={
|
||||
"properties": {
|
||||
"criteria_id": {
|
||||
"type": "string",
|
||||
"description": "ID of the success criterion being tested",
|
||||
},
|
||||
"test_name": {
|
||||
"type": "string",
|
||||
"description": "pytest function name, e.g., test_find_videos_happy_path",
|
||||
},
|
||||
"test_code": {
|
||||
"type": "string",
|
||||
"description": "Complete Python test function code",
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"description": "What the test validates",
|
||||
},
|
||||
"input": {
|
||||
"type": "object",
|
||||
"description": "Test input data",
|
||||
},
|
||||
"expected_output": {
|
||||
"type": "object",
|
||||
"description": "Expected output",
|
||||
},
|
||||
"confidence": {
|
||||
"type": "number",
|
||||
"description": "Confidence score 0-1",
|
||||
},
|
||||
},
|
||||
"required": ["criteria_id", "test_name", "test_code", "description", "confidence"],
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
class SuccessCriteriaTestGenerator:
|
||||
"""
|
||||
Generate success criteria tests from Goal success_criteria.
|
||||
|
||||
Generated tests require user approval before being added to the test suite.
|
||||
Unlike constraint tests, success criteria tests are generated during the
|
||||
Eval stage (after the agent exists) and may reference agent nodes/tools.
|
||||
"""
|
||||
|
||||
def __init__(self, llm: "LLMProvider"):
|
||||
"""
|
||||
Initialize generator with LLM provider.
|
||||
|
||||
Args:
|
||||
llm: LLM provider for test generation (e.g., AnthropicProvider)
|
||||
"""
|
||||
self.llm = llm
|
||||
|
||||
def generate(
|
||||
self,
|
||||
goal: Goal,
|
||||
node_names: list[str] | None = None,
|
||||
tool_names: list[str] | None = None,
|
||||
) -> list[Test]:
|
||||
"""
|
||||
Generate tests for all success criteria in a goal.
|
||||
|
||||
Args:
|
||||
goal: Goal with success_criteria to test
|
||||
node_names: Names of agent nodes (for context)
|
||||
tool_names: Names of tools available to agent (for context)
|
||||
|
||||
Returns:
|
||||
List of Test objects with approval_status=PENDING.
|
||||
These MUST be approved before being added to the test suite.
|
||||
"""
|
||||
if not goal.success_criteria:
|
||||
return []
|
||||
|
||||
# Format prompt
|
||||
prompt = SUCCESS_CRITERIA_TEST_PROMPT.format(
|
||||
goal_name=goal.name,
|
||||
goal_description=goal.description,
|
||||
success_criteria_formatted=self._format_criteria(goal.success_criteria),
|
||||
node_names=", ".join(node_names or ["(not specified)"]),
|
||||
tool_names=", ".join(tool_names or ["(not specified)"]),
|
||||
)
|
||||
|
||||
# Collect tests via tool calls - Claude handles JSON escaping automatically
|
||||
collected_tests: list[dict] = []
|
||||
|
||||
def tool_executor(tool_use: ToolUse) -> ToolResult:
|
||||
if tool_use.name == "submit_test":
|
||||
collected_tests.append(tool_use.input)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Test recorded successfully"
|
||||
)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
|
||||
)
|
||||
|
||||
self.llm.complete_with_tools(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
system="You are a test generation expert. For each success criterion, call the submit_test tool with the test details.",
|
||||
tools=[SUBMIT_TEST_TOOL],
|
||||
tool_executor=tool_executor,
|
||||
max_iterations=20,
|
||||
)
|
||||
|
||||
return self._create_tests_from_collected(collected_tests, goal.id)
|
||||
|
||||
def generate_for_criterion(
|
||||
self,
|
||||
goal: Goal,
|
||||
criterion: SuccessCriterion,
|
||||
node_names: list[str] | None = None,
|
||||
tool_names: list[str] | None = None,
|
||||
) -> list[Test]:
|
||||
"""
|
||||
Generate tests for a single success criterion.
|
||||
|
||||
Args:
|
||||
goal: Goal containing the criterion
|
||||
criterion: Specific criterion to test
|
||||
node_names: Names of agent nodes
|
||||
tool_names: Names of tools available
|
||||
|
||||
Returns:
|
||||
List of Test objects for the criterion
|
||||
"""
|
||||
prompt = SUCCESS_CRITERIA_TEST_PROMPT.format(
|
||||
goal_name=goal.name,
|
||||
goal_description=goal.description,
|
||||
success_criteria_formatted=self._format_criterion(criterion),
|
||||
node_names=", ".join(node_names or ["(not specified)"]),
|
||||
tool_names=", ".join(tool_names or ["(not specified)"]),
|
||||
)
|
||||
|
||||
# Collect tests via tool calls
|
||||
collected_tests: list[dict] = []
|
||||
|
||||
def tool_executor(tool_use: ToolUse) -> ToolResult:
|
||||
if tool_use.name == "submit_test":
|
||||
collected_tests.append(tool_use.input)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Test recorded successfully"
|
||||
)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
|
||||
)
|
||||
|
||||
self.llm.complete_with_tools(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
system="You are a test generation expert. Call the submit_test tool with the test details.",
|
||||
tools=[SUBMIT_TEST_TOOL],
|
||||
tool_executor=tool_executor,
|
||||
max_iterations=10,
|
||||
)
|
||||
|
||||
return self._create_tests_from_collected(collected_tests, goal.id)
|
||||
|
||||
def _format_criteria(self, criteria: list[SuccessCriterion]) -> str:
|
||||
"""Format success criteria for prompt."""
|
||||
lines = []
|
||||
for c in criteria:
|
||||
lines.append(self._format_criterion(c))
|
||||
lines.append("")
|
||||
return "\n".join(lines)
|
||||
|
||||
def _format_criterion(self, criterion: SuccessCriterion) -> str:
|
||||
"""Format a single criterion for prompt."""
|
||||
return f"""### Success Criterion: {criterion.id}
|
||||
- Description: {criterion.description}
|
||||
- Metric: {criterion.metric}
|
||||
- Target: {criterion.target}
|
||||
- Weight: {criterion.weight}
|
||||
- Currently met: {criterion.met}"""
|
||||
|
||||
def _create_tests_from_collected(
|
||||
self, collected: list[dict], goal_id: str
|
||||
) -> list[Test]:
|
||||
"""Create Test objects from tool call data."""
|
||||
tests = []
|
||||
for td in collected:
|
||||
test = Test(
|
||||
id=f"test_{uuid.uuid4().hex[:8]}",
|
||||
goal_id=goal_id,
|
||||
parent_criteria_id=td.get("criteria_id", "unknown"),
|
||||
test_type=TestType.SUCCESS_CRITERIA,
|
||||
test_name=td.get("test_name", "unnamed_test"),
|
||||
test_code=td.get("test_code", ""),
|
||||
description=td.get("description", ""),
|
||||
input=td.get("input", {}),
|
||||
expected_output=td.get("expected_output", {}),
|
||||
generated_by="llm",
|
||||
llm_confidence=float(td.get("confidence", 0.5)),
|
||||
approval_status=ApprovalStatus.PENDING,
|
||||
)
|
||||
tests.append(test)
|
||||
return tests
|
||||
@@ -0,0 +1,150 @@
|
||||
"""
|
||||
Test case schema with approval tracking.
|
||||
|
||||
Tests are generated by LLM from Goal success_criteria and constraints,
|
||||
but require user approval before being stored.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ApprovalStatus(str, Enum):
|
||||
"""Status of user approval for a generated test."""
|
||||
PENDING = "pending" # Awaiting user review
|
||||
APPROVED = "approved" # User accepted as-is
|
||||
MODIFIED = "modified" # User edited before accepting
|
||||
REJECTED = "rejected" # User declined (with reason)
|
||||
|
||||
|
||||
class TestType(str, Enum):
|
||||
"""Type of test based on what it validates."""
|
||||
CONSTRAINT = "constraint" # Validates constraint boundaries
|
||||
SUCCESS_CRITERIA = "outcome" # Validates success criteria achievement
|
||||
EDGE_CASE = "edge_case" # Validates edge case handling
|
||||
|
||||
|
||||
class Test(BaseModel):
|
||||
"""
|
||||
A test case generated from Goal success_criteria or constraints.
|
||||
|
||||
Tests are either:
|
||||
- Generated by LLM during Goal stage (constraints) or Eval stage (success criteria)
|
||||
- Created manually by human engineers
|
||||
|
||||
All tests require approval before being added to the test suite.
|
||||
"""
|
||||
id: str
|
||||
goal_id: str
|
||||
parent_criteria_id: str = Field(
|
||||
description="Links to success_criteria.id or constraint.id"
|
||||
)
|
||||
test_type: TestType
|
||||
|
||||
# Test definition
|
||||
test_name: str = Field(
|
||||
description="Descriptive function name, e.g., test_constraint_api_limits_respected"
|
||||
)
|
||||
test_code: str = Field(
|
||||
description="Python test function code (pytest compatible)"
|
||||
)
|
||||
description: str = Field(
|
||||
description="Human-readable description of what the test validates"
|
||||
)
|
||||
input: dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
description="Test input data"
|
||||
)
|
||||
expected_output: dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
description="Expected output or assertions"
|
||||
)
|
||||
|
||||
# LLM generation metadata
|
||||
generated_by: str = Field(
|
||||
default="llm",
|
||||
description="Who created the test: 'llm' or 'human'"
|
||||
)
|
||||
llm_confidence: float = Field(
|
||||
default=0.0,
|
||||
ge=0.0,
|
||||
le=1.0,
|
||||
description="LLM's confidence in the test quality (0-1)"
|
||||
)
|
||||
|
||||
# Approval tracking (CRITICAL - tests are never used without approval)
|
||||
approval_status: ApprovalStatus = ApprovalStatus.PENDING
|
||||
approved_by: str | None = None
|
||||
approved_at: datetime | None = None
|
||||
rejection_reason: str | None = Field(
|
||||
default=None,
|
||||
description="Reason for rejection if status is REJECTED"
|
||||
)
|
||||
original_code: str | None = Field(
|
||||
default=None,
|
||||
description="Original LLM-generated code if user modified it"
|
||||
)
|
||||
|
||||
# Execution tracking
|
||||
last_run: datetime | None = None
|
||||
last_result: str | None = Field(
|
||||
default=None,
|
||||
description="Result of last run: 'passed', 'failed', 'error'"
|
||||
)
|
||||
run_count: int = 0
|
||||
pass_count: int = 0
|
||||
fail_count: int = 0
|
||||
|
||||
# Timestamps
|
||||
created_at: datetime = Field(default_factory=datetime.now)
|
||||
updated_at: datetime = Field(default_factory=datetime.now)
|
||||
|
||||
model_config = {"extra": "allow"}
|
||||
|
||||
def approve(self, approved_by: str = "user") -> None:
|
||||
"""Mark test as approved."""
|
||||
self.approval_status = ApprovalStatus.APPROVED
|
||||
self.approved_by = approved_by
|
||||
self.approved_at = datetime.now()
|
||||
self.updated_at = datetime.now()
|
||||
|
||||
def modify(self, new_code: str, approved_by: str = "user") -> None:
|
||||
"""Approve test with modifications."""
|
||||
self.original_code = self.test_code
|
||||
self.test_code = new_code
|
||||
self.approval_status = ApprovalStatus.MODIFIED
|
||||
self.approved_by = approved_by
|
||||
self.approved_at = datetime.now()
|
||||
self.updated_at = datetime.now()
|
||||
|
||||
def reject(self, reason: str) -> None:
|
||||
"""Reject the test with a reason."""
|
||||
self.approval_status = ApprovalStatus.REJECTED
|
||||
self.rejection_reason = reason
|
||||
self.updated_at = datetime.now()
|
||||
|
||||
def record_result(self, passed: bool) -> None:
|
||||
"""Record a test run result."""
|
||||
self.last_run = datetime.now()
|
||||
self.last_result = "passed" if passed else "failed"
|
||||
self.run_count += 1
|
||||
if passed:
|
||||
self.pass_count += 1
|
||||
else:
|
||||
self.fail_count += 1
|
||||
self.updated_at = datetime.now()
|
||||
|
||||
@property
|
||||
def is_approved(self) -> bool:
|
||||
"""Check if test has been approved (approved or modified)."""
|
||||
return self.approval_status in (ApprovalStatus.APPROVED, ApprovalStatus.MODIFIED)
|
||||
|
||||
@property
|
||||
def pass_rate(self) -> float | None:
|
||||
"""Calculate pass rate if test has been run."""
|
||||
if self.run_count == 0:
|
||||
return None
|
||||
return self.pass_count / self.run_count
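
# --- Approval workflow sketch (illustrative only) ------------------------------------
# Shows how the lifecycle methods above combine; the rejection reason and run outcome
# are placeholders.

def _example_review(test: Test, accept: bool) -> None:
    """Approve a pending test as-is or reject it, then record a run if approved."""
    if accept:
        test.approve(approved_by="user")
    else:
        test.reject(reason="Does not reflect the constraint as written")

    if test.is_approved:
        test.record_result(passed=True)  # recorded after an actual run in practice
        print(test.pass_rate)            # 1.0 after a single passing run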
|
||||
@@ -0,0 +1,153 @@
|
||||
"""
|
||||
Test result schemas for tracking test execution outcomes.
|
||||
|
||||
Results include detailed error information for debugging and
|
||||
categorization for guiding iteration strategy.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ErrorCategory(str, Enum):
|
||||
"""
|
||||
Category of test failure for guiding iteration.
|
||||
|
||||
Each category has different implications for how to fix:
|
||||
- LOGIC_ERROR: Goal definition is wrong → update success_criteria/constraints
|
||||
- IMPLEMENTATION_ERROR: Code bug → fix nodes/edges in Agent stage
|
||||
- EDGE_CASE: New scenario discovered → add new test only
|
||||
"""
|
||||
LOGIC_ERROR = "logic_error"
|
||||
IMPLEMENTATION_ERROR = "implementation_error"
|
||||
EDGE_CASE = "edge_case"
|
||||
|
||||
|
||||
class TestResult(BaseModel):
|
||||
"""
|
||||
    Result of a single test execution.

    Captures:
    - Pass/fail status with timing
    - Actual vs expected output
    - Error details for debugging
    - Runtime logs and execution path
    """
    test_id: str
    passed: bool
    duration_ms: int = Field(
        ge=0,
        description="Test execution time in milliseconds"
    )

    # Output comparison
    actual_output: Any = None
    expected_output: Any = None

    # Error details (populated on failure)
    error_message: str | None = None
    error_category: ErrorCategory | None = None
    stack_trace: str | None = None

    # Runtime data for debugging
    runtime_logs: list[dict[str, Any]] = Field(
        default_factory=list,
        description="Log entries from test execution"
    )
    node_outputs: dict[str, Any] = Field(
        default_factory=dict,
        description="Output from each node executed during test"
    )
    execution_path: list[str] = Field(
        default_factory=list,
        description="Sequence of nodes executed"
    )

    # Associated run ID (links to Runtime data)
    run_id: str | None = Field(
        default=None,
        description="Runtime run ID for detailed analysis"
    )

    timestamp: datetime = Field(default_factory=datetime.now)

    model_config = {"extra": "allow"}

    def summary_dict(self) -> dict[str, Any]:
        """Return a summary dict for quick overview."""
        return {
            "test_id": self.test_id,
            "passed": self.passed,
            "duration_ms": self.duration_ms,
            "error_category": self.error_category.value if self.error_category else None,
            "error_message": self.error_message[:100] if self.error_message else None,
        }


class TestSuiteResult(BaseModel):
    """
    Aggregate result from running a test suite.

    Provides summary statistics and individual results.
    """
    goal_id: str
    total: int
    passed: int
    failed: int
    errors: int = 0  # Tests that couldn't run (e.g., exceptions in setup)
    skipped: int = 0

    results: list[TestResult] = Field(default_factory=list)

    duration_ms: int = Field(
        default=0,
        description="Total execution time in milliseconds"
    )

    timestamp: datetime = Field(default_factory=datetime.now)

    model_config = {"extra": "allow"}

    @property
    def all_passed(self) -> bool:
        """Check if all tests passed."""
        return self.failed == 0 and self.errors == 0

    @property
    def pass_rate(self) -> float:
        """Calculate pass rate."""
        if self.total == 0:
            return 0.0
        return self.passed / self.total

    def summary_dict(self) -> dict[str, Any]:
        """Return summary for reporting."""
        return {
            "goal_id": self.goal_id,
            "overall_passed": self.all_passed,
            "summary": {
                "total": self.total,
                "passed": self.passed,
                "failed": self.failed,
                "errors": self.errors,
                "skipped": self.skipped,
            },
            "pass_rate": f"{self.pass_rate:.1%}",
            "duration_ms": self.duration_ms,
        }

    def get_failed_results(self) -> list[TestResult]:
        """Get all failed test results for debugging."""
        return [r for r in self.results if not r.passed]

    def get_results_by_category(
        self, category: ErrorCategory
    ) -> list[TestResult]:
        """Get failed results by error category."""
        return [
            r for r in self.results
            if not r.passed and r.error_category == category
        ]
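For orientation, a minimal sketch of how these two models compose. It uses only the fields and helpers defined above, and assumes the module is importable as `framework.testing.test_result` (the path used by the unit tests later in this commit); it is an illustration, not part of the committed module.

```python
# Illustrative sketch: aggregate individual TestResults into a TestSuiteResult.
from framework.testing.test_result import TestResult, TestSuiteResult

results = [
    TestResult(test_id="t1", passed=True, duration_ms=120),
    TestResult(test_id="t2", passed=False, duration_ms=80,
               error_message="Assertion failed"),
]

suite = TestSuiteResult(
    goal_id="goal_001",
    total=len(results),
    passed=sum(r.passed for r in results),
    failed=sum(not r.passed for r in results),
    results=results,
    duration_ms=sum(r.duration_ms for r in results),
)

print(suite.all_passed)                                  # False
print(suite.summary_dict()["pass_rate"])                 # "50.0%"
print([r.test_id for r in suite.get_failed_results()])   # ["t2"]
```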
@@ -0,0 +1,260 @@
"""
File-based storage backend for test data.

Follows the same pattern as framework/storage/backend.py (FileStorage),
storing tests as JSON files with indexes for efficient querying.
"""

import json
from pathlib import Path
from datetime import datetime

from framework.testing.test_case import Test, ApprovalStatus, TestType
from framework.testing.test_result import TestResult


class TestStorage:
    """
    File-based storage for tests and results.

    Directory structure:
        {base_path}/
            tests/
                {goal_id}/
                    {test_id}.json              # Full test data
            indexes/
                by_goal/{goal_id}.json          # List of test IDs for this goal
                by_approval/{status}.json       # Tests by approval status
                by_type/{test_type}.json        # Tests by type
                by_criteria/{criteria_id}.json  # Tests by parent criteria
            results/
                {test_id}/
                    {timestamp}.json            # Test run results
                    latest.json                 # Most recent result
            suites/
                {goal_id}_suite.json            # Test suite metadata
    """

    def __init__(self, base_path: str | Path):
        self.base_path = Path(base_path)
        self._ensure_dirs()

    def _ensure_dirs(self) -> None:
        """Create directory structure if it doesn't exist."""
        dirs = [
            self.base_path / "tests",
            self.base_path / "indexes" / "by_goal",
            self.base_path / "indexes" / "by_approval",
            self.base_path / "indexes" / "by_type",
            self.base_path / "indexes" / "by_criteria",
            self.base_path / "results",
            self.base_path / "suites",
        ]
        for d in dirs:
            d.mkdir(parents=True, exist_ok=True)

    # === TEST OPERATIONS ===

    def save_test(self, test: Test) -> None:
        """Save a test to storage."""
        # Ensure goal directory exists
        goal_dir = self.base_path / "tests" / test.goal_id
        goal_dir.mkdir(parents=True, exist_ok=True)

        # Save full test
        test_path = goal_dir / f"{test.id}.json"
        with open(test_path, "w") as f:
            f.write(test.model_dump_json(indent=2))

        # Update indexes
        self._add_to_index("by_goal", test.goal_id, test.id)
        self._add_to_index("by_approval", test.approval_status.value, test.id)
        self._add_to_index("by_type", test.test_type.value, test.id)
        self._add_to_index("by_criteria", test.parent_criteria_id, test.id)

    def load_test(self, goal_id: str, test_id: str) -> Test | None:
        """Load a test from storage."""
        test_path = self.base_path / "tests" / goal_id / f"{test_id}.json"
        if not test_path.exists():
            return None
        with open(test_path) as f:
            return Test.model_validate_json(f.read())

    def delete_test(self, goal_id: str, test_id: str) -> bool:
        """Delete a test from storage."""
        test_path = self.base_path / "tests" / goal_id / f"{test_id}.json"

        if not test_path.exists():
            return False

        # Load test to get index keys
        test = self.load_test(goal_id, test_id)
        if test:
            self._remove_from_index("by_goal", test.goal_id, test_id)
            self._remove_from_index("by_approval", test.approval_status.value, test_id)
            self._remove_from_index("by_type", test.test_type.value, test_id)
            self._remove_from_index("by_criteria", test.parent_criteria_id, test_id)

        test_path.unlink()

        # Also delete results
        results_dir = self.base_path / "results" / test_id
        if results_dir.exists():
            for f in results_dir.iterdir():
                f.unlink()
            results_dir.rmdir()

        return True

    def update_test(self, test: Test) -> None:
        """
        Update an existing test.

        Handles index updates if approval_status changed.
        """
        # Load old test to check for index changes
        old_test = self.load_test(test.goal_id, test.id)
        if old_test and old_test.approval_status != test.approval_status:
            self._remove_from_index("by_approval", old_test.approval_status.value, test.id)
            self._add_to_index("by_approval", test.approval_status.value, test.id)

        # Update timestamp
        test.updated_at = datetime.now()

        # Save
        self.save_test(test)

    # === QUERY OPERATIONS ===

    def get_tests_by_goal(self, goal_id: str) -> list[Test]:
        """Get all tests for a goal."""
        test_ids = self._get_index("by_goal", goal_id)
        tests = []
        for test_id in test_ids:
            test = self.load_test(goal_id, test_id)
            if test:
                tests.append(test)
        return tests

    def get_tests_by_approval_status(self, status: ApprovalStatus) -> list[str]:
        """Get test IDs by approval status."""
        return self._get_index("by_approval", status.value)

    def get_tests_by_type(self, test_type: TestType) -> list[str]:
        """Get test IDs by test type."""
        return self._get_index("by_type", test_type.value)

    def get_tests_by_criteria(self, criteria_id: str) -> list[str]:
        """Get test IDs for a specific criteria."""
        return self._get_index("by_criteria", criteria_id)

    def get_pending_tests(self, goal_id: str) -> list[Test]:
        """Get all pending tests for a goal."""
        tests = self.get_tests_by_goal(goal_id)
        return [t for t in tests if t.approval_status == ApprovalStatus.PENDING]

    def get_approved_tests(self, goal_id: str) -> list[Test]:
        """Get all approved tests for a goal (approved or modified)."""
        tests = self.get_tests_by_goal(goal_id)
        return [t for t in tests if t.is_approved]

    def list_all_goals(self) -> list[str]:
        """List all goal IDs that have tests."""
        goals_dir = self.base_path / "indexes" / "by_goal"
        return [f.stem for f in goals_dir.glob("*.json")]

    # === RESULT OPERATIONS ===

    def save_result(self, test_id: str, result: TestResult) -> None:
        """Save a test result."""
        results_dir = self.base_path / "results" / test_id
        results_dir.mkdir(parents=True, exist_ok=True)

        # Save with timestamp
        timestamp = result.timestamp.strftime("%Y%m%d_%H%M%S")
        result_path = results_dir / f"{timestamp}.json"
        with open(result_path, "w") as f:
            f.write(result.model_dump_json(indent=2))

        # Update latest
        latest_path = results_dir / "latest.json"
        with open(latest_path, "w") as f:
            f.write(result.model_dump_json(indent=2))

    def get_latest_result(self, test_id: str) -> TestResult | None:
        """Get the most recent result for a test."""
        latest_path = self.base_path / "results" / test_id / "latest.json"
        if not latest_path.exists():
            return None
        with open(latest_path) as f:
            return TestResult.model_validate_json(f.read())

    def get_result_history(self, test_id: str, limit: int = 10) -> list[TestResult]:
        """Get result history for a test, most recent first."""
        results_dir = self.base_path / "results" / test_id
        if not results_dir.exists():
            return []

        # Get all result files except latest.json
        result_files = sorted(
            [f for f in results_dir.glob("*.json") if f.name != "latest.json"],
            reverse=True
        )[:limit]

        results = []
        for f in result_files:
            with open(f) as file:
                results.append(TestResult.model_validate_json(file.read()))

        return results

    # === INDEX OPERATIONS ===

    def _get_index(self, index_type: str, key: str) -> list[str]:
        """Get values from an index."""
        index_path = self.base_path / "indexes" / index_type / f"{key}.json"
        if not index_path.exists():
            return []
        with open(index_path) as f:
            return json.load(f)

    def _add_to_index(self, index_type: str, key: str, value: str) -> None:
        """Add a value to an index."""
        index_path = self.base_path / "indexes" / index_type / f"{key}.json"
        values = self._get_index(index_type, key)
        if value not in values:
            values.append(value)
            with open(index_path, "w") as f:
                json.dump(values, f)

    def _remove_from_index(self, index_type: str, key: str, value: str) -> None:
        """Remove a value from an index."""
        index_path = self.base_path / "indexes" / index_type / f"{key}.json"
        values = self._get_index(index_type, key)
        if value in values:
            values.remove(value)
            with open(index_path, "w") as f:
                json.dump(values, f)

    # === UTILITY ===

    def get_stats(self) -> dict:
        """Get storage statistics."""
        goals = self.list_all_goals()
        total_tests = sum(len(self._get_index("by_goal", g)) for g in goals)
        pending = len(self._get_index("by_approval", "pending"))
        approved = len(self._get_index("by_approval", "approved"))
        modified = len(self._get_index("by_approval", "modified"))
        rejected = len(self._get_index("by_approval", "rejected"))

        return {
            "total_goals": len(goals),
            "total_tests": total_tests,
            "by_approval": {
                "pending": pending,
                "approved": approved,
                "modified": modified,
                "rejected": rejected,
            },
            "storage_path": str(self.base_path),
        }
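A minimal usage sketch for the storage backend. The `Test` constructor arguments mirror the ones used in the unit tests below, and the base path is an arbitrary example, not a required location; treat this as an illustration of the save/update/query flow rather than part of the committed module.

```python
# Illustrative sketch: round-trip a test through TestStorage and check the indexes.
from framework.testing.test_case import Test, TestType
from framework.testing.test_storage import TestStorage

storage = TestStorage("/tmp/agent_tests")  # arbitrary example path

test = Test(
    id="test_001",
    goal_id="goal_001",
    parent_criteria_id="constraint_001",
    test_type=TestType.CONSTRAINT,
    test_name="test_constraint_001",
    test_code="def test_constraint_001(agent): assert True",
    description="Example constraint test",
)
storage.save_test(test)        # writes tests/goal_001/test_001.json and updates indexes

test.approve("reviewer")
storage.update_test(test)      # moves the ID from the "pending" to the "approved" index

print(len(storage.get_approved_tests("goal_001")))   # 1
print(storage.get_stats()["by_approval"])
```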
@@ -0,0 +1,612 @@
"""
Unit tests for the goal-based testing framework.

Tests cover:
- Schema validation
- Storage CRUD operations
- Error categorization heuristics
- Parallel runner grouping logic
"""

import pytest
import tempfile
from pathlib import Path
from datetime import datetime

from framework.testing.test_case import (
    Test,
    TestType,
    ApprovalStatus,
)
from framework.testing.test_result import (
    TestResult,
    TestSuiteResult,
    ErrorCategory,
)
from framework.testing.test_storage import TestStorage
from framework.testing.categorizer import ErrorCategorizer
from framework.testing.parallel import ParallelTestRunner, ParallelConfig
from framework.testing.debug_tool import DebugTool


# ============================================================================
# Test Schema Tests
# ============================================================================

class TestTestCaseSchema:
    """Tests for Test schema."""

    def test_create_test(self):
        """Test creating a basic test."""
        test = Test(
            id="test_001",
            goal_id="goal_001",
            parent_criteria_id="constraint_api_limits",
            test_type=TestType.CONSTRAINT,
            test_name="test_constraint_api_limits",
            test_code="def test_constraint_api_limits(agent): pass",
            description="Tests API rate limits",
            input={"topic": "test"},
            expected_output={"count": 5},
        )

        assert test.id == "test_001"
        assert test.goal_id == "goal_001"
        assert test.test_type == TestType.CONSTRAINT
        assert test.approval_status == ApprovalStatus.PENDING
        assert not test.is_approved

    def test_approve_test(self):
        """Test approving a test."""
        test = Test(
            id="test_001",
            goal_id="goal_001",
            parent_criteria_id="constraint_001",
            test_type=TestType.CONSTRAINT,
            test_name="test_something",
            test_code="pass",
            description="test",
        )

        test.approve("test_user")

        assert test.approval_status == ApprovalStatus.APPROVED
        assert test.approved_by == "test_user"
        assert test.approved_at is not None
        assert test.is_approved

    def test_modify_test(self):
        """Test modifying a test before approval."""
        test = Test(
            id="test_001",
            goal_id="goal_001",
            parent_criteria_id="constraint_001",
            test_type=TestType.CONSTRAINT,
            test_name="test_something",
            test_code="original code",
            description="test",
        )

        test.modify("modified code", "test_user")

        assert test.approval_status == ApprovalStatus.MODIFIED
        assert test.original_code == "original code"
        assert test.test_code == "modified code"
        assert test.is_approved

    def test_reject_test(self):
        """Test rejecting a test."""
        test = Test(
            id="test_001",
            goal_id="goal_001",
            parent_criteria_id="constraint_001",
            test_type=TestType.CONSTRAINT,
            test_name="test_something",
            test_code="pass",
            description="test",
        )

        test.reject("Not a valid test case")

        assert test.approval_status == ApprovalStatus.REJECTED
        assert test.rejection_reason == "Not a valid test case"
        assert not test.is_approved

    def test_record_result(self):
        """Test recording test results."""
        test = Test(
            id="test_001",
            goal_id="goal_001",
            parent_criteria_id="constraint_001",
            test_type=TestType.CONSTRAINT,
            test_name="test_something",
            test_code="pass",
            description="test",
        )

        test.record_result(passed=True)
        assert test.last_result == "passed"
        assert test.run_count == 1
        assert test.pass_count == 1
        assert test.pass_rate == 1.0

        test.record_result(passed=False)
        assert test.last_result == "failed"
        assert test.run_count == 2
        assert test.pass_count == 1
        assert test.fail_count == 1
        assert test.pass_rate == 0.5


class TestTestResultSchema:
    """Tests for TestResult schema."""

    def test_create_passed_result(self):
        """Test creating a passed result."""
        result = TestResult(
            test_id="test_001",
            passed=True,
            duration_ms=100,
            actual_output={"status": "ok"},
            expected_output={"status": "ok"},
        )

        assert result.passed
        assert result.duration_ms == 100
        assert result.error_category is None

    def test_create_failed_result(self):
        """Test creating a failed result."""
        result = TestResult(
            test_id="test_001",
            passed=False,
            duration_ms=50,
            error_message="Assertion failed",
            error_category=ErrorCategory.IMPLEMENTATION_ERROR,
            stack_trace="Traceback...",
        )

        assert not result.passed
        assert result.error_category == ErrorCategory.IMPLEMENTATION_ERROR

    def test_summary_dict(self):
        """Test summary dict generation."""
        result = TestResult(
            test_id="test_001",
            passed=False,
            duration_ms=50,
            error_message="Very long error " * 20,
            error_category=ErrorCategory.LOGIC_ERROR,
        )

        summary = result.summary_dict()
        assert summary["test_id"] == "test_001"
        assert summary["passed"] is False
        assert summary["error_category"] == "logic_error"
        assert len(summary["error_message"]) == 100  # Truncated


class TestTestSuiteResult:
    """Tests for TestSuiteResult schema."""

    def test_suite_result_properties(self):
        """Test suite result calculation properties."""
        results = [
            TestResult(test_id="t1", passed=True, duration_ms=100),
            TestResult(test_id="t2", passed=True, duration_ms=50),
            TestResult(test_id="t3", passed=False, duration_ms=75,
                       error_category=ErrorCategory.IMPLEMENTATION_ERROR),
        ]

        suite = TestSuiteResult(
            goal_id="goal_001",
            total=3,
            passed=2,
            failed=1,
            results=results,
            duration_ms=225,
        )

        assert not suite.all_passed
        assert suite.pass_rate == pytest.approx(2/3)
        assert len(suite.get_failed_results()) == 1

    def test_get_results_by_category(self):
        """Test filtering results by error category."""
        results = [
            TestResult(test_id="t1", passed=False, duration_ms=100,
                       error_category=ErrorCategory.LOGIC_ERROR),
            TestResult(test_id="t2", passed=False, duration_ms=50,
                       error_category=ErrorCategory.IMPLEMENTATION_ERROR),
            TestResult(test_id="t3", passed=False, duration_ms=75,
                       error_category=ErrorCategory.IMPLEMENTATION_ERROR),
        ]

        suite = TestSuiteResult(
            goal_id="goal_001",
            total=3,
            passed=0,
            failed=3,
            results=results,
        )

        impl_errors = suite.get_results_by_category(ErrorCategory.IMPLEMENTATION_ERROR)
        assert len(impl_errors) == 2


# ============================================================================
# Storage Tests
# ============================================================================

class TestTestStorage:
    """Tests for TestStorage."""

    @pytest.fixture
    def storage(self, tmp_path):
        """Create a temporary storage instance."""
        return TestStorage(tmp_path)

    def test_save_and_load_test(self, storage):
        """Test saving and loading a test."""
        test = Test(
            id="test_001",
            goal_id="goal_001",
            parent_criteria_id="constraint_001",
            test_type=TestType.CONSTRAINT,
            test_name="test_something",
            test_code="def test_something(agent): pass",
            description="A test",
        )

        storage.save_test(test)

        loaded = storage.load_test("goal_001", "test_001")
        assert loaded is not None
        assert loaded.id == "test_001"
        assert loaded.test_name == "test_something"

    def test_delete_test(self, storage):
        """Test deleting a test."""
        test = Test(
            id="test_001",
            goal_id="goal_001",
            parent_criteria_id="constraint_001",
            test_type=TestType.CONSTRAINT,
            test_name="test_something",
            test_code="pass",
            description="test",
        )

        storage.save_test(test)
        assert storage.load_test("goal_001", "test_001") is not None

        storage.delete_test("goal_001", "test_001")
        assert storage.load_test("goal_001", "test_001") is None

    def test_get_tests_by_goal(self, storage):
        """Test querying tests by goal."""
        for i in range(3):
            test = Test(
                id=f"test_{i}",
                goal_id="goal_001",
                parent_criteria_id=f"constraint_{i}",
                test_type=TestType.CONSTRAINT,
                test_name=f"test_{i}",
                test_code="pass",
                description="test",
            )
            storage.save_test(test)

        tests = storage.get_tests_by_goal("goal_001")
        assert len(tests) == 3

    def test_get_approved_tests(self, storage):
        """Test querying approved tests."""
        # Create tests with different approval statuses
        test1 = Test(
            id="test_001",
            goal_id="goal_001",
            parent_criteria_id="c1",
            test_type=TestType.CONSTRAINT,
            test_name="test_1",
            test_code="pass",
            description="test",
        )
        test1.approve()
        storage.save_test(test1)

        test2 = Test(
            id="test_002",
            goal_id="goal_001",
            parent_criteria_id="c2",
            test_type=TestType.CONSTRAINT,
            test_name="test_2",
            test_code="pass",
            description="test",
        )
        # Leave pending
        storage.save_test(test2)

        test3 = Test(
            id="test_003",
            goal_id="goal_001",
            parent_criteria_id="c3",
            test_type=TestType.CONSTRAINT,
            test_name="test_3",
            test_code="pass",
            description="test",
        )
        test3.modify("modified", "user")
        storage.save_test(test3)

        approved = storage.get_approved_tests("goal_001")
        assert len(approved) == 2  # approved and modified

    def test_save_and_load_result(self, storage):
        """Test saving and loading test results."""
        result = TestResult(
            test_id="test_001",
            passed=True,
            duration_ms=100,
        )

        storage.save_result("test_001", result)

        loaded = storage.get_latest_result("test_001")
        assert loaded is not None
        assert loaded.passed is True
        assert loaded.duration_ms == 100

    def test_result_history(self, storage):
        """Test getting result history."""
        # Save multiple results
        for i in range(5):
            result = TestResult(
                test_id="test_001",
                passed=(i % 2 == 0),
                duration_ms=100 + i,
            )
            storage.save_result("test_001", result)

        history = storage.get_result_history("test_001", limit=3)
        assert len(history) <= 3

    def test_get_stats(self, storage):
        """Test getting storage statistics."""
        test = Test(
            id="test_001",
            goal_id="goal_001",
            parent_criteria_id="c1",
            test_type=TestType.CONSTRAINT,
            test_name="test_1",
            test_code="pass",
            description="test",
        )
        test.approve()
        storage.save_test(test)

        stats = storage.get_stats()
        assert stats["total_tests"] == 1
        assert stats["by_approval"]["approved"] == 1


# ============================================================================
# Error Categorizer Tests
# ============================================================================

class TestErrorCategorizer:
    """Tests for ErrorCategorizer."""

    @pytest.fixture
    def categorizer(self):
        return ErrorCategorizer()

    def test_categorize_passed(self, categorizer):
        """Test that passed results return None."""
        result = TestResult(test_id="t1", passed=True, duration_ms=100)
        assert categorizer.categorize(result) is None

    def test_categorize_logic_error(self, categorizer):
        """Test categorization of logic errors."""
        result = TestResult(
            test_id="t1",
            passed=False,
            duration_ms=100,
            error_message="goal not achieved: expected success criteria was not met",
        )
        assert categorizer.categorize(result) == ErrorCategory.LOGIC_ERROR

    def test_categorize_implementation_error(self, categorizer):
        """Test categorization of implementation errors."""
        result = TestResult(
            test_id="t1",
            passed=False,
            duration_ms=100,
            error_message="TypeError: 'NoneType' object has no attribute 'get'",
        )
        assert categorizer.categorize(result) == ErrorCategory.IMPLEMENTATION_ERROR

    def test_categorize_edge_case(self, categorizer):
        """Test categorization of edge cases."""
        result = TestResult(
            test_id="t1",
            passed=False,
            duration_ms=100,
            error_message="timeout: request took longer than expected",
        )
        assert categorizer.categorize(result) == ErrorCategory.EDGE_CASE

    def test_categorize_from_stack_trace(self, categorizer):
        """Test categorization from stack trace."""
        result = TestResult(
            test_id="t1",
            passed=False,
            duration_ms=100,
            error_message="Error occurred",
            stack_trace="KeyError: 'missing_key'\n at line 42",
        )
        assert categorizer.categorize(result) == ErrorCategory.IMPLEMENTATION_ERROR

    def test_get_fix_suggestion(self, categorizer):
        """Test fix suggestions for each category."""
        assert "Goal" in categorizer.get_fix_suggestion(ErrorCategory.LOGIC_ERROR)
        assert "code" in categorizer.get_fix_suggestion(ErrorCategory.IMPLEMENTATION_ERROR).lower()
        assert "test" in categorizer.get_fix_suggestion(ErrorCategory.EDGE_CASE).lower()

    def test_get_iteration_guidance(self, categorizer):
        """Test iteration guidance."""
        guidance = categorizer.get_iteration_guidance(ErrorCategory.LOGIC_ERROR)
        assert guidance["stage"] == "Goal"
        assert guidance["restart_required"] is True

        guidance = categorizer.get_iteration_guidance(ErrorCategory.IMPLEMENTATION_ERROR)
        assert guidance["stage"] == "Agent"
        assert guidance["restart_required"] is False


# ============================================================================
# Parallel Runner Tests
# ============================================================================

class TestParallelRunner:
    """Tests for ParallelTestRunner."""

    @pytest.fixture
    def runner(self, tmp_path):
        """Create a test runner with temporary storage."""
        storage = TestStorage(tmp_path)
        config = ParallelConfig(num_workers=1)  # Sequential for testing
        return ParallelTestRunner(config, storage)

    def test_create_suite_result(self, runner):
        """Test creating suite result from individual results."""
        results = [
            TestResult(test_id="t1", passed=True, duration_ms=100),
            TestResult(test_id="t2", passed=False, duration_ms=50),
        ]

        suite = runner._create_suite_result("goal_001", results)

        assert suite.goal_id == "goal_001"
        assert suite.total == 2
        assert suite.passed == 1
        assert suite.failed == 1
        assert suite.duration_ms == 150


# ============================================================================
# Debug Tool Tests
# ============================================================================

class TestDebugTool:
    """Tests for DebugTool."""

    @pytest.fixture
    def debug_tool(self, tmp_path):
        """Create a debug tool with temporary storage."""
        storage = TestStorage(tmp_path)
        return DebugTool(storage)

    def test_analyze_missing_test(self, debug_tool):
        """Test analyzing a non-existent test."""
        info = debug_tool.analyze("goal_001", "nonexistent")

        assert info.test_id == "nonexistent"
        assert "not found" in info.error_message.lower()

    def test_analyze_with_result(self, debug_tool, tmp_path):
        """Test analyzing a test with result."""
        storage = TestStorage(tmp_path)

        # Create and save test
        test = Test(
            id="test_001",
            goal_id="goal_001",
            parent_criteria_id="c1",
            test_type=TestType.CONSTRAINT,
            test_name="test_something",
            test_code="pass",
            description="A test",
            input={"key": "value"},
            expected_output={"result": "expected"},
        )
        storage.save_test(test)

        # Create and save result
        result = TestResult(
            test_id="test_001",
            passed=False,
            duration_ms=100,
            error_message="TypeError: something went wrong",
            error_category=ErrorCategory.IMPLEMENTATION_ERROR,
        )
        storage.save_result("test_001", result)

        # Create new debug tool with same storage
        debug_tool = DebugTool(storage)

        info = debug_tool.analyze("goal_001", "test_001")

        assert info.test_id == "test_001"
        assert info.test_name == "test_something"
        assert not info.passed
        assert info.error_category == "implementation_error"
        assert info.suggested_fix is not None


# ============================================================================
# Integration Tests
# ============================================================================

class TestIntegration:
    """Integration tests for the testing framework."""

    def test_full_workflow(self, tmp_path):
        """Test a simplified full workflow."""
        storage = TestStorage(tmp_path)

        # 1. Create tests (simulating generation)
        tests = []
        for i in range(3):
            test = Test(
                id=f"test_{i}",
                goal_id="goal_001",
                parent_criteria_id="constraint_001",
                test_type=TestType.CONSTRAINT,
                test_name=f"test_constraint_{i}",
                test_code=f"def test_constraint_{i}(agent): assert True",
                description=f"Test {i}",
            )
            tests.append(test)

        # 2. Approve tests
        for test in tests:
            test.approve("user")
            storage.save_test(test)

        # 3. Verify storage
        approved = storage.get_approved_tests("goal_001")
        assert len(approved) == 3

        # 4. Simulate running tests
        config = ParallelConfig(num_workers=1)
        runner = ParallelTestRunner(config, storage)

        class MockAgent:
            def run(self, input):
                return {"success": True}

        results = runner.run_tests(approved, MockAgent())
        assert len(results) == 3

        # 5. Save results
        for result in results:
            storage.save_result(result.test_id, result)

        # 6. Check stats
        stats = storage.get_stats()
        assert stats["total_tests"] == 3


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
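Besides running the module directly via the `__main__` block above, individual groups can be selected with pytest's `-k` expression. A minimal sketch; the file path is a placeholder, since the module's location in the repository is not shown here.

```python
# Run only the storage tests; adjust the path to where this test module lives.
import pytest

exit_code = pytest.main(["tests/test_testing_framework.py", "-v", "-k", "TestTestStorage"])
raise SystemExit(exit_code)
```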