feat: verified testing
@@ -0,0 +1,225 @@
# Integration Test Reporting Skill

Run the Level 2 dummy agent integration test suite and produce a detailed HTML report with per-test input → outcome analysis.

## Trigger

User wants to run integration tests and see results:
- `/test-reporting`
- `/test-reporting test_component_queen_live.py`
- `/test-reporting --all`

## SOP: Running Tests

### Step 1: Select Scope

If the user provides a specific test file or pattern, use it. Otherwise run the full suite.

```bash
# Full suite
cd core && echo "1" | uv run python tests/dummy_agents/run_all.py --interactive 2>&1

# Specific file (requires manual provider setup)
cd core && uv run python -c "
import sys
sys.path.insert(0, '.')
from tests.dummy_agents.run_all import detect_available
from tests.dummy_agents.conftest import set_llm_selection

avail = detect_available()
claude = [p for p in avail if 'Claude Code' in p['name']]
if not claude:
    avail_names = [p['name'] for p in avail]
    raise RuntimeError(f'No Claude Code subscription. Available: {avail_names}')
provider = claude[0]
set_llm_selection(
    model=provider['model'],
    api_key=provider['api_key'],
    extra_headers=provider.get('extra_headers'),
    api_base=provider.get('api_base'),
)

import pytest
sys.exit(pytest.main([
    'tests/dummy_agents/TEST_FILE_HERE',
    '-v', '--override-ini=asyncio_mode=auto', '--no-header', '--tb=long',
    '--log-cli-level=WARNING', '--junitxml=/tmp/hive_test_results.xml',
]))
"
```

### Step 2: Collect Results

After the test run completes, collect:
1. **JUnit XML** from `--junitxml` output (if available, parsed as sketched below)
2. **stdout/stderr** from the run
3. **Summary table** from `run_all.py` output (the Unicode table)
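
To turn the run into table rows, parse the JUnit XML. A minimal sketch (assumes the standard `testsuite`/`testcase` layout that pytest's `--junitxml` emits; `collect_results` is an illustrative helper name, not part of the suite):

```python
# Sketch: parse pytest's JUnit XML into per-test records.
import xml.etree.ElementTree as ET

def collect_results(xml_path="/tmp/hive_test_results.xml"):
    root = ET.parse(xml_path).getroot()
    records = []
    # Works whether the root is <testsuites> or a bare <testsuite>.
    for case in root.iter("testcase"):
        status, detail = "PASS", ""
        for tag, label in (("failure", "FAIL"), ("error", "ERROR"), ("skipped", "SKIP")):
            node = case.find(tag)
            if node is not None:
                status = label
                detail = ((node.get("message") or "") + "\n" + (node.text or "")).strip()
        records.append({
            "component": case.get("classname", ""),
            "test_name": case.get("name", ""),
            "status": status,
            "duration": float(case.get("time", "0")),
            "detail": detail,
        })
    return records
```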

### Step 3: Generate HTML Report

Write the report to `/tmp/hive_integration_test_report.html`.

The report MUST include these sections:

#### Header
- Run timestamp (ISO 8601)
- Provider used (model name, source)
- Total tests / passed / failed / skipped
- Total wall-clock time
- Overall verdict: PASS (all green) or FAIL (with count), computed as in the sketch below
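
The header fields fall straight out of the collected records; a small sketch (reusing the hypothetical `collect_results` helper from Step 2, and approximating wall-clock time as the sum of per-test durations):

```python
# Sketch: derive the header fields from parsed records.
from collections import Counter
from datetime import datetime, timezone

records = collect_results()  # hypothetical helper from Step 2's sketch
counts = Counter(r["status"] for r in records)
failed = counts["FAIL"] + counts["ERROR"]
header = {
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "total": len(records),
    "passed": counts["PASS"],
    "failed": failed,
    "skipped": counts["SKIP"],
    "duration": round(sum(r["duration"] for r in records), 2),  # approximation
    "verdict": "PASS" if failed == 0 else f"FAIL ({failed})",
}
```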

#### Per-Test Table

For EVERY test (not just failures), include a row with:

| Column | Description |
|--------|-------------|
| Component | Test file grouping (e.g., `component_queen_live`) |
| Test Name | Function name (e.g., `test_queen_starts_in_planning_without_worker`) |
| Status | PASS / FAIL / SKIP / ERROR with color badge |
| Duration | Wall-clock seconds |
| What | One-line description of what the test verifies |
| How | How it works (setup → action → assertion) |
| Why | Why this test matters (what bug/behavior it catches) |
| Input | The input data or configuration (graph spec, initial prompt, phase, etc.) |
| Expected Outcome | What the test asserts |
| Actual Outcome | What actually happened (PASS: matches expected / FAIL: actual vs expected) |
| Failure Detail | For failures only: full traceback + diagnosis |

#### What / How / Why Descriptions

These MUST be derived from the test function's docstring and code (a scripted approach is sketched after the mapping below). Read each test file to extract:
- **What**: From the docstring first line
- **How**: From the test body (what fixtures, what graph, what assertions)
- **Why**: From the docstring body or "Why this matters" section in the test module

Use these mappings for the component test files:

```
test_component_llm.py → "LLM Provider" — streaming, tool calling, tokens
test_component_tools.py → "Tool Registry + MCP" — connection, execution
test_component_event_loop.py → "EventLoopNode" — iteration, output, stall
test_component_edges.py → "Edge Evaluation" — conditional, priority
test_component_conversation.py → "Conversation Persistence" — storage, cursor
test_component_escalation.py → "Escalation Flow" — worker→queen signaling
test_component_continuous.py → "Continuous Mode" — conversation threading
test_component_queen.py → "Queen Phase (Unit)" — phase state, tools, events
test_component_queen_live.py → "Queen Phase (Live)" — real queen, real LLM
test_component_queen_state_machine.py → "Queen State Machine" — edge cases, races
test_component_worker_comms.py → "Worker Communication" — events, data flow
test_component_strict_outcomes.py → "Strict Outcomes" — exact path, output, quality
```
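
Extracting the docstring-derived fields can be scripted rather than read off by hand. A minimal sketch (assumes tests are module-level `def`/`async def` functions named `test_*`; `docstrings_for` is an illustrative name):

```python
# Sketch: pull What/Why material from test docstrings via ast.
import ast
from pathlib import Path

def docstrings_for(test_file):
    tree = ast.parse(Path(test_file).read_text())
    out = {}
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("test_"):
            doc = ast.get_docstring(node) or ""
            first, _, rest = doc.partition("\n")
            # First line feeds the What column; the body feeds Why.
            out[node.name] = (first.strip(), rest.strip())
    return out
```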

#### HTML Template

Use this structure:

```html
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>Hive Integration Test Report — {timestamp}</title>
  <style>
    :root { --pass: #22c55e; --fail: #ef4444; --skip: #f59e0b; --bg: #0f172a; --surface: #1e293b; --text: #e2e8f0; --muted: #94a3b8; --border: #334155; }
    * { box-sizing: border-box; margin: 0; padding: 0; }
    body { font-family: 'SF Mono', 'Fira Code', monospace; background: var(--bg); color: var(--text); padding: 2rem; line-height: 1.6; }
    h1, h2, h3 { font-weight: 600; }
    h1 { font-size: 1.5rem; margin-bottom: 1rem; }
    h2 { font-size: 1.2rem; margin: 2rem 0 1rem; border-bottom: 1px solid var(--border); padding-bottom: 0.5rem; }
    .summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
    .card { background: var(--surface); padding: 1rem; border-radius: 8px; border: 1px solid var(--border); }
    .card .label { color: var(--muted); font-size: 0.75rem; text-transform: uppercase; }
    .card .value { font-size: 1.5rem; font-weight: 700; margin-top: 0.25rem; }
    .card .value.pass { color: var(--pass); }
    .card .value.fail { color: var(--fail); }
    table { width: 100%; border-collapse: collapse; font-size: 0.8rem; }
    th { background: var(--surface); position: sticky; top: 0; text-align: left; padding: 0.5rem; border-bottom: 2px solid var(--border); color: var(--muted); text-transform: uppercase; font-size: 0.7rem; }
    td { padding: 0.5rem; border-bottom: 1px solid var(--border); vertical-align: top; }
    tr:hover { background: rgba(255,255,255,0.03); }
    .badge { display: inline-block; padding: 2px 8px; border-radius: 4px; font-size: 0.7rem; font-weight: 700; }
    .badge.pass { background: rgba(34,197,94,0.2); color: var(--pass); }
    .badge.fail { background: rgba(239,68,68,0.2); color: var(--fail); }
    .badge.skip { background: rgba(245,158,11,0.2); color: var(--skip); }
    .detail { background: #1a1a2e; padding: 0.75rem; border-radius: 4px; margin-top: 0.5rem; font-size: 0.75rem; white-space: pre-wrap; overflow-x: auto; max-height: 200px; overflow-y: auto; }
    .component-header { background: var(--surface); padding: 0.75rem 0.5rem; font-weight: 600; font-size: 0.85rem; }
    .meta { color: var(--muted); font-size: 0.75rem; }
  </style>
</head>
<body>
  <h1>Hive Integration Test Report</h1>
  <p class="meta">Generated: {timestamp} | Provider: {provider} | Duration: {duration}s</p>

  <div class="summary">
    <div class="card"><div class="label">Total</div><div class="value">{total}</div></div>
    <div class="card"><div class="label">Passed</div><div class="value pass">{passed}</div></div>
    <div class="card"><div class="label">Failed</div><div class="value fail">{failed}</div></div>
    <div class="card"><div class="label">Verdict</div><div class="value {verdict_class}">{verdict}</div></div>
  </div>

  <h2>Test Results</h2>
  <table>
    <thead>
      <tr>
        <th>Component</th>
        <th>Test</th>
        <th>Status</th>
        <th>Time</th>
        <th>What</th>
        <th>Input → Expected → Actual</th>
      </tr>
    </thead>
    <tbody>
      <!-- For each test: -->
      <tr>
        <td>{component}</td>
        <td>{test_name}</td>
        <td><span class="badge {status_class}">{status}</span></td>
        <td>{duration}s</td>
        <td>{what_description}</td>
        <td>
          <strong>Input:</strong> {input_description}<br>
          <strong>Expected:</strong> {expected_outcome}<br>
          <strong>Actual:</strong> {actual_outcome}
          <!-- If failed: -->
          <div class="detail">{failure_traceback}</div>
        </td>
      </tr>
    </tbody>
  </table>

  <h2>Failure Analysis</h2>
  <!-- Only if there are failures -->
  <p>For each failure, provide:</p>
  <ul>
    <li><strong>Root cause:</strong> Why it failed</li>
    <li><strong>Impact:</strong> What this means for the system</li>
    <li><strong>Suggested fix:</strong> How to address it</li>
  </ul>

</body>
</html>
```

### Step 4: Output

1. Write the HTML file to `/tmp/hive_integration_test_report.html`
2. Print the file path so the user can open it
3. Print a concise summary to the terminal:
```
Test Report: /tmp/hive_integration_test_report.html
Result: 74/76 PASSED (2 failures)
Failures:
- parallel_merge::test_parallel_disjoint_output_keys
- worker::test_worker_timestamped_note_artifact
```

## Key Rules

1. ALWAYS use `--junitxml` when running pytest to get structured results
2. ALWAYS read the test source files to populate What/How/Why columns — do not guess
3. For Input/Expected/Actual, extract from the test's graph spec, assertions, and result
4. Color-code everything: green for pass, red for fail, amber for skip
5. Include the full traceback for failures in a scrollable `<div class="detail">`
6. Group tests by component (file name) with a visual separator
7. The report must be self-contained HTML (no external CSS/JS dependencies)
@@ -7,6 +7,7 @@ Run via: cd core && uv run python tests/dummy_agents/run_all.py
from __future__ import annotations

import asyncio
import json
import os
from pathlib import Path

@@ -202,3 +203,130 @@ def make_executor(

    executor.execute = execute_with_timeout  # type: ignore[method-assign]
    return executor


# ── Artifact capture: raw output written to disk for every test ──────

ARTIFACTS_DIR = Path("/tmp/hive_test_artifacts")


class TestArtifact:
    """Collects raw output + expected behavior for a single test.

    Usage in tests:
        def test_foo(artifact, ...):
            result = await executor.execute(...)
            artifact.record(result, expected="path == ['a','b'], output['x'] == 'hello'")
    """

    def __init__(self, test_id: str):
        self.test_id = test_id
        self._data: dict = {"test_id": test_id, "raw_output": None, "expected": "", "checks": []}

    def record(self, result, *, expected: str = ""):
        """Record an ExecutionResult with expected behavior description."""
        self._data["expected"] = expected
        if result is None:
            self._data["raw_output"] = None
            return
        self._data["raw_output"] = {
            "success": getattr(result, "success", None),
            "output": _safe_serialize(getattr(result, "output", {})),
            "error": getattr(result, "error", None),
            "path": getattr(result, "path", []),
            "steps_executed": getattr(result, "steps_executed", 0),
            "total_tokens": getattr(result, "total_tokens", 0),
            "total_latency_ms": getattr(result, "total_latency_ms", 0),
            "execution_quality": getattr(result, "execution_quality", ""),
            "total_retries": getattr(result, "total_retries", 0),
            "node_visit_counts": getattr(result, "node_visit_counts", {}),
            "nodes_with_failures": getattr(result, "nodes_with_failures", []),
            "session_state_buffer": _safe_serialize(
                (getattr(result, "session_state", {}) or {}).get("data_buffer", {})
            ),
        }

    def record_value(self, key: str, value, *, expected: str = ""):
        """Record an arbitrary key-value (for non-ExecutionResult tests)."""
        self._data.setdefault("values", {})[key] = _safe_serialize(value)
        if expected:
            self._data["expected"] = expected

    def check(self, description: str, passed: bool, actual: str = "", expected_val: str = ""):
        """Record an individual assertion check."""
        self._data["checks"].append({
            "description": description,
            "passed": passed,
            "actual": actual,
            "expected": expected_val,
        })

    def save(self):
        """Write artifact to disk."""
        ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
        safe_name = self.test_id.replace("::", "__").replace("/", "_")
        path = ARTIFACTS_DIR / f"{safe_name}.json"
        with open(path, "w") as f:
            json.dump(self._data, f, indent=2, default=str)


def _safe_serialize(obj):
    """Convert to JSON-safe types."""
    if obj is None:
        return None
    if isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {str(k): _safe_serialize(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_safe_serialize(v) for v in obj]
    return str(obj)[:500]


@pytest.fixture
def artifact(request):
    """Fixture that captures raw test output to disk.

    Every test gets an artifact recorder. Call artifact.record(result)
    and artifact.check("description", passed, actual, expected) to
    capture data. Saved automatically on teardown.
    """
    test_id = request.node.nodeid
    art = TestArtifact(test_id)
    yield art
    art.save()


# Hook wrapper: stash the call-phase report on the item so teardown can
# create a minimal artifact for tests that DON'T use the artifact fixture.
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    outcome = yield
    rep = outcome.get_result()
    if rep.when == "call":
        item._test_report = rep


def pytest_runtest_teardown(item, nextitem):
    """Auto-save a minimal artifact for tests that didn't use the fixture."""
    report = getattr(item, "_test_report", None)
    if report is None:
        return
    # Check if the test already used the artifact fixture
    if "artifact" in item.fixturenames:
        return  # Already handled by fixture teardown
    ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
    safe_name = item.nodeid.replace("::", "__").replace("/", "_")
    path = ARTIFACTS_DIR / f"{safe_name}.json"
    data = {
        "test_id": item.nodeid,
        "raw_output": None,
        "expected": "",
        "checks": [],
        "auto_captured": True,
        "status": "PASS" if report.passed else ("FAIL" if report.failed else "SKIP"),
    }
    if report.failed and report.longreprtext:
        data["failure_text"] = report.longreprtext[:5000]
    with open(path, "w") as f:
        json.dump(data, f, indent=2, default=str)

@@ -1,4 +1,4 @@
"""Component tests: Continuous Conversation Mode — threading, buffer.

Exercises conversation threading across nodes to verify that downstream
nodes receive context from upstream nodes in continuous mode.
@@ -15,12 +15,15 @@ from .conftest import make_executor

SET_OUTPUT_INSTRUCTION = (
    "You MUST call the set_output tool to provide your answer. "
    "Do not just write text — call set_output with the correct "
    "key and value."
)


def _build_pipeline_graph(
    conversation_mode: str = "continuous",
) -> GraphSpec:
    """Two-node pipeline: intake captures, transform uppercases."""
    return GraphSpec(
        id="continuous-pipeline",
        goal_id="dummy",
@@ -37,8 +40,9 @@ def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:
                input_keys=["raw"],
                output_keys=["captured"],
                system_prompt=(
                    "Read the 'raw' input value and call "
                    "set_output with key='captured' and the "
                    "same value. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
            NodeSpec(
@@ -49,9 +53,9 @@ def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:
                input_keys=["value"],
                output_keys=["result"],
                system_prompt=(
                    "Read the 'value' input, convert it to "
                    "UPPERCASE, then call set_output with "
                    "key='result' and the uppercased value. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
        ],
@@ -69,53 +73,141 @@ def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:


@pytest.mark.asyncio
async def test_continuous_pipeline_traverses(runtime, goal, llm_provider, artifact):
    """Continuous mode pipeline should traverse both nodes."""
    graph = _build_pipeline_graph(conversation_mode="continuous")
    executor = make_executor(
        runtime,
        llm_provider,
        loop_config={"max_iterations": 5},
    )

    result = await executor.execute(
        graph,
        goal,
        {"raw": "hello"},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected=("success=True, path=['intake','transform'], output['result'] is set"),
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["intake", "transform"],
        actual=str(result.path),
        expected_val="['intake', 'transform']",
    )
    assert result.path == ["intake", "transform"]

    actual_output = result.output.get("result")
    artifact.check(
        "output['result'] is set",
        actual_output is not None,
        actual=repr(actual_output),
        expected_val="non-None value",
    )
    assert result.output.get("result") is not None


@pytest.mark.asyncio
async def test_continuous_data_flows_through(runtime, goal, llm_provider, artifact):
    """Data from node 1's output should be available to node 2."""
    graph = _build_pipeline_graph(conversation_mode="continuous")
    executor = make_executor(
        runtime,
        llm_provider,
        loop_config={"max_iterations": 5},
    )

    result = await executor.execute(
        graph,
        goal,
        {"raw": "test_data"},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, output['result'] is non-empty",
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    actual_output = result.output.get("result")
    artifact.check(
        "output['result'] is set",
        actual_output is not None,
        actual=repr(actual_output),
        expected_val="non-None value",
    )
    assert result.output.get("result") is not None

    # The transform node should have produced something based on the input
    output_len = len(str(result.output["result"]))
    artifact.check(
        "output is non-empty",
        output_len > 0,
        actual=str(output_len),
        expected_val=">0",
    )
    assert len(str(result.output["result"])) > 0


@pytest.mark.asyncio
async def test_isolated_pipeline_traverses(runtime, goal, llm_provider, artifact):
    """Isolated mode pipeline should also traverse both nodes."""
    graph = _build_pipeline_graph(conversation_mode="isolated")
    executor = make_executor(
        runtime,
        llm_provider,
        loop_config={"max_iterations": 5},
    )

    result = await executor.execute(
        graph,
        goal,
        {"raw": "data"},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, path=['intake','transform']",
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["intake", "transform"],
        actual=str(result.path),
        expected_val="['intake', 'transform']",
    )
    assert result.path == ["intake", "transform"]


@pytest.mark.asyncio
async def test_continuous_three_node_chain(runtime, goal, llm_provider, artifact):
    """Three-node continuous pipeline should thread end-to-end."""
    graph = GraphSpec(
        id="three-node-chain",
        goal_id="dummy",
@@ -132,8 +224,8 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
                input_keys=["input"],
                output_keys=["a_out"],
                system_prompt=(
                    "Read the 'input' value and call set_output "
                    "with key='a_out' and the same value. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
            NodeSpec(
@@ -144,9 +236,9 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
                input_keys=["b_in"],
                output_keys=["b_out"],
                system_prompt=(
                    "Read the 'b_in' value and call set_output "
                    "with key='b_out' and value='processed_' "
                    "followed by the input. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
            NodeSpec(
@@ -157,8 +249,8 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
                input_keys=["c_in"],
                output_keys=["result"],
                system_prompt=(
                    "Read the 'c_in' value and call set_output "
                    "with key='result' and the same value. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
        ],
@@ -178,14 +270,60 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
                input_mapping={"c_in": "b_out"},
            ),
        ],
        memory_keys=[
            "input",
            "a_out",
            "b_in",
            "b_out",
            "c_in",
            "result",
        ],
    )
    executor = make_executor(
        runtime,
        llm_provider,
        loop_config={"max_iterations": 5},
    )
    result = await executor.execute(
        graph,
        goal,
        {"input": "payload"},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected=("success=True, path=['a','b','c'], steps=3, output['result'] is set"),
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["a", "b", "c"],
        actual=str(result.path),
        expected_val="['a', 'b', 'c']",
    )
    assert result.path == ["a", "b", "c"]

    artifact.check(
        "steps_executed is 3",
        result.steps_executed == 3,
        actual=str(result.steps_executed),
        expected_val="3",
    )
    assert result.steps_executed == 3

    actual_output = result.output.get("result")
    artifact.check(
        "output['result'] is set",
        actual_output is not None,
        actual=repr(actual_output),
        expected_val="non-None value",
    )
    assert result.output.get("result") is not None

@@ -1,7 +1,7 @@
"""Component tests: Conversation Persistence — write-through, storage.

Exercises conversation persistence by running real LLM turns and
verifying that messages and state are written to disk correctly.
"""

from __future__ import annotations
@@ -31,8 +31,9 @@ def _build_echo_graph() -> GraphSpec:
                input_keys=["input"],
                output_keys=["output"],
                system_prompt=(
                    "Read the 'input' value and immediately call "
                    "set_output with key='output' and the same "
                    "value. Do not add any text."
                ),
            ),
        ],
@@ -43,48 +44,113 @@ def _build_echo_graph() -> GraphSpec:


@pytest.mark.asyncio
async def test_conversation_persists_messages(runtime, goal, llm_provider, tmp_path, artifact):
    """After execution, conversation data should exist on disk."""
    storage = tmp_path / "session"
    graph = _build_echo_graph()
    executor = make_executor(
        runtime,
        llm_provider,
        storage_path=storage,
    )

    result = await executor.execute(
        graph,
        goal,
        {"input": "hello"},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected=("success=True, conversations/ dir exists with data files"),
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    # Verify conversation directory was created with content
    conv_dir = storage / "conversations"

    artifact.check(
        "conversations/ dir exists",
        conv_dir.exists(),
        actual=str(conv_dir.exists()),
        expected_val="True",
    )
    assert conv_dir.exists(), "conversations/ directory should exist"

    # Should have at least one file (messages or cursor)
    all_files = list(conv_dir.rglob("*"))
    data_files = [f for f in all_files if f.is_file()]

    artifact.check(
        "at least one data file",
        len(data_files) > 0,
        actual=str(len(data_files)),
        expected_val=">0",
    )
    assert len(data_files) > 0, "Should have persisted at least one conversation file"


@pytest.mark.asyncio
async def test_conversation_output_matches_execution(
    runtime, goal, llm_provider, tmp_path, artifact
):
    """ExecutionResult output should be consistent with the node."""
    storage = tmp_path / "session"
    graph = _build_echo_graph()
    executor = make_executor(
        runtime,
        llm_provider,
        storage_path=storage,
    )

    result = await executor.execute(
        graph,
        goal,
        {"input": "test_value"},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, output['output'] is non-empty",
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    actual_output = result.output.get("output")
    artifact.check(
        "output['output'] is set",
        actual_output is not None,
        actual=repr(actual_output),
        expected_val="non-None value",
    )
    assert result.output.get("output") is not None

    # The echo node should produce some non-empty output
    output_len = len(str(result.output["output"]))
    artifact.check(
        "output is non-empty",
        output_len > 0,
        actual=str(output_len),
        expected_val=">0",
    )
    assert len(str(result.output["output"])) > 0


@pytest.mark.asyncio
async def test_conversation_multi_node_persistence(runtime, goal, llm_provider, tmp_path, artifact):
    """Multi-node graph should persist conversation data for each node."""
    from framework.graph.edge import EdgeCondition, EdgeSpec
@@ -104,8 +170,8 @@ async def test_conversation_multi_node_persistence(
                node_type="event_loop",
                output_keys=["intermediate"],
                system_prompt=(
                    "Call set_output with key='intermediate' "
                    "and value='step1_done'. Do not write text."
                ),
            ),
            NodeSpec(
@@ -116,8 +182,7 @@ async def test_conversation_multi_node_persistence(
                input_keys=["intermediate"],
                output_keys=["result"],
                system_prompt=(
                    "Call set_output with key='result' and value='step2_done'. Do not write text."
                ),
            ),
        ],
@@ -132,12 +197,45 @@ async def test_conversation_multi_node_persistence(
        ],
        memory_keys=["intermediate", "result"],
    )
    executor = make_executor(
        runtime,
        llm_provider,
        storage_path=storage,
    )
    result = await executor.execute(
        graph,
        goal,
        {},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected=("success=True, path=['step1','step2'], conversations/ dir exists"),
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["step1", "step2"],
        actual=str(result.path),
        expected_val="['step1', 'step2']",
    )
    assert result.path == ["step1", "step2"]

    # Both nodes should have written conversation data
    conv_dir = storage / "conversations"

    artifact.check(
        "conversations/ dir exists",
        conv_dir.exists(),
        actual=str(conv_dir.exists()),
        expected_val="True",
    )
    assert conv_dir.exists()

@@ -15,12 +15,13 @@ from .conftest import make_executor

SET_OUTPUT_INSTRUCTION = (
    "You MUST call the set_output tool to provide your answer. "
    "Do not just write text — call set_output with the correct "
    "key and value."
)


@pytest.mark.asyncio
async def test_edge_conditional_true_path(runtime, goal, llm_provider, artifact):
    """Conditional edge with True expression should be traversed."""
    graph = GraphSpec(
        id="cond-true",
@@ -37,8 +38,7 @@ async def test_edge_conditional_true_path(runtime, goal, llm_provider):
                node_type="event_loop",
                output_keys=["label"],
                system_prompt=(
                    "Call set_output with key='label' and value='yes'. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
            NodeSpec(
@@ -48,8 +48,8 @@ async def test_edge_conditional_true_path(runtime, goal, llm_provider):
                node_type="event_loop",
                output_keys=["result"],
                system_prompt=(
                    "Call set_output with key='result' and "
                    "value='reached'. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
        ],
@@ -64,15 +64,41 @@ async def test_edge_conditional_true_path(runtime, goal, llm_provider):
        ],
        memory_keys=["label", "result"],
    )
    executor = make_executor(
        runtime,
        llm_provider,
        loop_config={"max_iterations": 3},
    )
    result = await executor.execute(
        graph,
        goal,
        {},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, path=['source','target']",
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["source", "target"],
        actual=str(result.path),
        expected_val="['source', 'target']",
    )
    assert result.path == ["source", "target"]


@pytest.mark.asyncio
async def test_edge_conditional_false_path(runtime, goal, llm_provider, artifact):
    """Conditional edge with False expression should NOT be traversed."""
    graph = GraphSpec(
        id="cond-false",
@@ -89,8 +115,7 @@ async def test_edge_conditional_false_path(runtime, goal, llm_provider):
                node_type="event_loop",
                output_keys=["label"],
                system_prompt=(
                    "Call set_output with key='label' and value='no'. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
            NodeSpec(
@@ -99,7 +124,7 @@ async def test_edge_conditional_false_path(runtime, goal, llm_provider):
                description="Should not be reached",
                node_type="event_loop",
                output_keys=["result"],
                system_prompt=("Call set_output with key='result' and value='bad'."),
            ),
        ],
        edges=[
@@ -113,15 +138,41 @@ async def test_edge_conditional_false_path(runtime, goal, llm_provider):
        ],
        memory_keys=["label", "result"],
    )
    executor = make_executor(
        runtime,
        llm_provider,
        loop_config={"max_iterations": 3},
    )
    result = await executor.execute(
        graph,
        goal,
        {},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, 'target' not in path",
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    artifact.check(
        "target not in path",
        "target" not in result.path,
        actual=str(result.path),
        expected_val="path without 'target'",
    )
    assert "target" not in result.path


@pytest.mark.asyncio
async def test_edge_priority_selects_higher(runtime, goal, llm_provider, artifact):
    """When multiple conditional edges match, higher priority wins."""
    graph = GraphSpec(
        id="priority-test",
@@ -138,8 +189,7 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
                node_type="event_loop",
                output_keys=["value"],
                system_prompt=(
                    "Call set_output with key='value' and value='match'. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
            NodeSpec(
@@ -149,8 +199,7 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
                node_type="event_loop",
                output_keys=["result"],
                system_prompt=(
                    "Call set_output with key='result' and value='HIGH'. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
            NodeSpec(
@@ -160,8 +209,7 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
                node_type="event_loop",
                output_keys=["result"],
                system_prompt=(
                    "Call set_output with key='result' and value='LOW'. " + SET_OUTPUT_INSTRUCTION
                ),
            ),
        ],
@@ -185,8 +233,34 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
        ],
        memory_keys=["value", "result"],
    )
    executor = make_executor(
        runtime,
        llm_provider,
        loop_config={"max_iterations": 3},
    )
    result = await executor.execute(
        graph,
        goal,
        {},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, path=['source','high']",
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["source", "high"],
        actual=str(result.path),
        expected_val="['source', 'high']",
    )
    assert result.path == ["source", "high"]

@@ -16,7 +16,7 @@ from .conftest import make_executor


@pytest.mark.asyncio
async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp_path, artifact):
    """Worker LLM should call the escalate tool when instructed.

    After calling escalate, the worker blocks waiting for queen input.
@@ -40,8 +40,9 @@ async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp
                node_type="event_loop",
                output_keys=["result"],
                system_prompt=(
                    "You MUST immediately call the escalate tool "
                    "with reason='need human approval for "
                    "deployment'. "
                    "Do not call set_output. Do not write text."
                ),
            ),
@@ -74,17 +75,34 @@ async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp
    # Worker will block after escalate. Short timeout is fine.
    try:
        await _asyncio.wait_for(
            executor.execute(
                graph,
                goal,
                {},
                validate_graph=False,
            ),
            timeout=30,
        )
    except (TimeoutError, _asyncio.TimeoutError):
        pass  # Expected: worker hangs waiting for queen

    artifact.record_value(
        "escalation_count",
        len(escalations),
        expected=">=1 ESCALATION_REQUESTED event emitted",
    )

    artifact.check(
        "escalation event emitted",
        len(escalations) >= 1,
        actual=str(len(escalations)),
        expected_val=">=1",
    )
    assert len(escalations) >= 1, "No ESCALATION_REQUESTED event emitted"


@pytest.mark.asyncio
async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path, artifact):
    """Worker that escalates should still terminate (not hang forever)."""
    graph = GraphSpec(
        id="escalate-terminate",
@@ -100,8 +118,10 @@ async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path)
                node_type="event_loop",
                output_keys=["result"],
                system_prompt=(
                    "Call the escalate tool with "
                    "reason='blocked on credentials'. "
                    "Then call set_output with key='result' "
                    "and value='escalated'."
                ),
            ),
        ],
@@ -115,6 +135,21 @@ async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path)
        loop_config={"max_iterations": 5},
        storage_path=tmp_path / "session",
    )
    # Should terminate within timeout (make_executor wraps with asyncio.wait_for)
    result = await executor.execute(
        graph,
        goal,
        {},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="steps_executed=1 (terminates, does not hang)",
    )

    artifact.check(
        "steps_executed is 1",
        result.steps_executed == 1,
        actual=str(result.steps_executed),
        expected_val="1",
    )
    assert result.steps_executed == 1

@@ -1,4 +1,4 @@
"""Component tests: EventLoopNode — iteration limits, output, stall safety.

Exercises the core multi-turn LLM loop through single-node graphs with
real LLM calls to verify iteration control and termination behavior.
@@ -15,8 +15,8 @@ from .conftest import make_executor


@pytest.mark.asyncio
async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider, artifact):
    """LLM calls set_output on first turn — node terminates with output."""
    graph = GraphSpec(
        id="single-turn",
        goal_id="dummy",
@@ -31,7 +31,8 @@ async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider):
                node_type="event_loop",
                output_keys=["result"],
                system_prompt=(
                    "Call set_output with key='result' and "
                    "value='done'. "
                    "Do not write any text. Just call the tool."
                ),
            ),
@@ -40,19 +41,51 @@ async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider):
        memory_keys=["result"],
        conversation_mode="continuous",
    )
    executor = make_executor(
        runtime,
        llm_provider,
        loop_config={"max_iterations": 3},
    )
    result = await executor.execute(
        graph,
        goal,
        {},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, output['result'] set, steps=1",
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    actual_output = result.output.get("result")
    artifact.check(
        "output['result'] is set",
        actual_output is not None,
        actual=repr(actual_output),
        expected_val="non-None value",
    )
    assert result.output.get("result") is not None

    artifact.check(
        "steps_executed is 1",
        result.steps_executed == 1,
        actual=str(result.steps_executed),
        expected_val="1",
    )
    assert result.steps_executed == 1


@pytest.mark.asyncio
async def test_event_loop_multi_turn_tool_use(runtime, goal, llm_provider, tool_registry, artifact):
    """LLM calls a tool, gets result, then calls set_output."""
    graph = GraphSpec(
        id="multi-turn",
        goal_id="dummy",
@@ -68,9 +101,10 @@ async def test_event_loop_multi_turn_tool_use(
                output_keys=["result"],
                tools=["get_current_time"],
                system_prompt=(
                    "First call get_current_time with "
                    "timezone='UTC'. "
                    "Then call set_output with key='result' and "
                    "the day_of_week from the tool response."
                ),
            ),
        ],
@@ -79,18 +113,42 @@ async def test_event_loop_multi_turn_tool_use(
        conversation_mode="continuous",
    )
    executor = make_executor(
        runtime,
        llm_provider,
        tool_registry=tool_registry,
        loop_config={"max_iterations": 5},
    )
    result = await executor.execute(
        graph,
        goal,
        {},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, output['result'] is day_of_week",
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    actual_output = result.output.get("result")
    artifact.check(
        "output['result'] is set",
        actual_output is not None,
        actual=repr(actual_output),
        expected_val="non-None value",
    )
    assert result.output.get("result") is not None


@pytest.mark.asyncio
async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider, artifact):
    """Node must terminate after max_iterations even without set_output."""
    graph = GraphSpec(
        id="stuck-node",
@@ -106,8 +164,7 @@ async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider):
                node_type="event_loop",
                output_keys=["result"],
                system_prompt=(
                    "You are thinking deeply. Respond with a short thought. Never call set_output."
                ),
                max_tokens=32,
            ),
@@ -116,15 +173,34 @@ async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider):
        memory_keys=["result"],
        conversation_mode="continuous",
    )
    executor = make_executor(
        runtime,
        llm_provider,
        loop_config={"max_iterations": 3},
    )
    result = await executor.execute(
        graph,
        goal,
        {},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="terminates (not hang), steps_executed=1",
    )

    # Should terminate (not hang) — the node was visited
    artifact.check(
        "steps_executed is 1",
        result.steps_executed == 1,
        actual=str(result.steps_executed),
        expected_val="1",
    )
    assert result.steps_executed == 1


@pytest.mark.asyncio
async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider, artifact):
    """LLM should be able to set multiple output keys in a single node."""
    graph = GraphSpec(
        id="multi-output",
@@ -142,7 +218,8 @@ async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider):
                system_prompt=(
                    "Call set_output twice: "
                    "first with key='name' and value='Alice', "
                    "then with key='greeting' and "
                    "value='Hello Alice'. "
                    "Do not write any text."
                ),
            ),
@@ -151,9 +228,44 @@ async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider):
        memory_keys=["name", "greeting"],
        conversation_mode="continuous",
    )
    executor = make_executor(
        runtime,
        llm_provider,
        loop_config={"max_iterations": 5},
    )
    result = await executor.execute(
        graph,
        goal,
        {},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected=("success=True, output['name'] and output['greeting'] are set"),
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    actual_name = result.output.get("name")
    artifact.check(
        "output['name'] is set",
        actual_name is not None,
        actual=repr(actual_name),
        expected_val="non-None value",
    )
    assert result.output.get("name") is not None

    actual_greeting = result.output.get("greeting")
    artifact.check(
        "output['greeting'] is set",
        actual_greeting is not None,
        actual=repr(actual_greeting),
        expected_val="non-None value",
    )
    assert result.output.get("greeting") is not None

@@ -15,18 +15,39 @@ from framework.llm.stream_events import FinishEvent, TextDeltaEvent, ToolCallEve


@pytest.mark.asyncio
async def test_llm_acomplete_returns_content(llm_provider, artifact):
    """acomplete() should return a non-empty LLMResponse."""
    result = await llm_provider.acomplete(
        messages=[{"role": "user", "content": "Reply with exactly: OK"}],
        max_tokens=16,
    )
    artifact.record_value(
        "result_type",
        type(result).__name__,
        expected="LLMResponse with non-empty content",
    )
    artifact.record_value("content", result.content)

    artifact.check(
        "result is LLMResponse",
        isinstance(result, LLMResponse),
        actual=type(result).__name__,
        expected_val="LLMResponse",
    )
    assert isinstance(result, LLMResponse)

    content_ok = result.content and result.content.strip()
    artifact.check(
        "content is non-empty",
        bool(content_ok),
        actual=repr(result.content),
        expected_val="non-empty string",
    )
    assert result.content and result.content.strip()


@pytest.mark.asyncio
async def test_llm_stream_yields_text_delta(llm_provider, artifact):
    """stream() should yield at least one TextDeltaEvent and a FinishEvent."""
    text_deltas = []
    finish_events = []
@@ -39,12 +60,32 @@ async def test_llm_stream_yields_text_delta(llm_provider):
        elif isinstance(event, FinishEvent):
            finish_events.append(event)

    artifact.record_value(
        "text_delta_count",
        len(text_deltas),
        expected=">=1 TextDeltaEvent and exactly 1 FinishEvent",
    )
    artifact.record_value("finish_event_count", len(finish_events))

    artifact.check(
        "at least one TextDeltaEvent",
        len(text_deltas) >= 1,
        actual=str(len(text_deltas)),
        expected_val=">=1",
    )
    assert len(text_deltas) >= 1, "Expected at least one TextDeltaEvent"

    artifact.check(
        "exactly one FinishEvent",
        len(finish_events) == 1,
        actual=str(len(finish_events)),
        expected_val="1",
    )
    assert len(finish_events) == 1, "Expected exactly one FinishEvent"


@pytest.mark.asyncio
async def test_llm_stream_tool_call(llm_provider, artifact):
    """stream() with a tool definition should produce a ToolCallEvent."""
    tool = Tool(
        name="record_result",
@@ -52,7 +93,10 @@ async def test_llm_stream_tool_call(llm_provider):
        parameters={
            "type": "object",
            "properties": {
                "value": {
                    "type": "string",
                    "description": "The result to record.",
                },
            },
            "required": ["value"],
        },
@@ -63,7 +107,8 @@ async def test_llm_stream_tool_call(llm_provider):
        {
            "role": "user",
            "content": (
                "Call the record_result tool exactly once "
                "with value='OK'. "
                "Do not answer with plain text."
            ),
        }
@@ -74,30 +119,79 @@ async def test_llm_stream_tool_call(llm_provider):
        events.append(event)

    tool_calls = [e for e in events if isinstance(e, ToolCallEvent)]

    artifact.record_value(
        "tool_call_count",
        len(tool_calls),
        expected=">=1 ToolCallEvent, tool_name='record_result'",
    )
    artifact.record_value(
        "tool_names",
        [tc.tool_name for tc in tool_calls],
    )

    artifact.check(
        "LLM called record_result",
        len(tool_calls) >= 1,
        actual=str(len(tool_calls)),
        expected_val=">=1",
    )
    assert len(tool_calls) >= 1, "LLM should have called record_result"

    artifact.check(
        "tool_name is record_result",
        tool_calls[0].tool_name == "record_result",
        actual=tool_calls[0].tool_name,
        expected_val="record_result",
    )
    assert tool_calls[0].tool_name == "record_result"
@pytest.mark.asyncio
|
||||
async def test_llm_token_counts_populated(llm_provider):
|
||||
async def test_llm_token_counts_populated(llm_provider, artifact):
|
||||
"""LLMResponse should have positive input_tokens and output_tokens."""
|
||||
result = await llm_provider.acomplete(
|
||||
messages=[{"role": "user", "content": "Reply OK."}],
|
||||
max_tokens=16,
|
||||
)
|
||||
|
||||
artifact.record_value(
|
||||
"input_tokens",
|
||||
result.input_tokens,
|
||||
expected="positive input_tokens and output_tokens",
|
||||
)
|
||||
artifact.record_value("output_tokens", result.output_tokens)
|
||||
|
||||
artifact.check(
|
||||
"input_tokens positive",
|
||||
result.input_tokens > 0,
|
||||
actual=str(result.input_tokens),
|
||||
expected_val=">0",
|
||||
)
|
||||
assert result.input_tokens > 0, "input_tokens should be positive"
|
||||
|
||||
artifact.check(
|
||||
"output_tokens positive",
|
||||
result.output_tokens > 0,
|
||||
actual=str(result.output_tokens),
|
||||
expected_val=">0",
|
||||
)
|
||||
assert result.output_tokens > 0, "output_tokens should be positive"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_llm_json_mode(llm_provider):
|
||||
"""acomplete(json_mode=True) should return parseable JSON when supported."""
|
||||
async def test_llm_json_mode(llm_provider, artifact):
|
||||
"""acomplete(json_mode=True) should return parseable JSON."""
|
||||
try:
|
||||
result = await llm_provider.acomplete(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": 'Return a JSON object with key "status" and value "ok". Output only valid JSON, no other text.',
|
||||
"content": (
|
||||
'Return a JSON object with key "status" '
|
||||
'and value "ok". Output only valid JSON, '
|
||||
"no other text."
|
||||
),
|
||||
}
|
||||
],
|
||||
max_tokens=64,
|
||||
@@ -110,6 +204,26 @@ async def test_llm_json_mode(llm_provider):
|
||||
if not content:
|
||||
pytest.skip("Provider returned empty content for json_mode request")
|
||||
|
||||
artifact.record_value(
|
||||
"content",
|
||||
content,
|
||||
expected="parseable JSON dict with 'status' key",
|
||||
)
|
||||
|
||||
parsed = json.loads(content)
|
||||
|
||||
artifact.check(
|
||||
"parsed is dict",
|
||||
isinstance(parsed, dict),
|
||||
actual=type(parsed).__name__,
|
||||
expected_val="dict",
|
||||
)
|
||||
assert isinstance(parsed, dict)
|
||||
|
||||
artifact.check(
|
||||
"'status' key present",
|
||||
"status" in parsed,
|
||||
actual=str(list(parsed.keys())),
|
||||
expected_val="contains 'status'",
|
||||
)
|
||||
assert "status" in parsed
|
||||
|
||||
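The tests in this diff depend on an `artifact` fixture that the excerpt does not show. Inferred purely from its call sites (`record_value(name, value, expected=...)` and `check(label, passed, actual=..., expected_val=...)`), a minimal sketch could look like the following; the repository's real conftest implementation may differ.

```python
# Hypothetical sketch of the `artifact` fixture, reconstructed from usage only.
from dataclasses import dataclass, field
from typing import Any

import pytest


@dataclass
class Artifact:
    values: dict[str, Any] = field(default_factory=dict)
    checks: list[dict[str, Any]] = field(default_factory=list)

    def record_value(self, name: str, value: Any, expected: str | None = None) -> None:
        # Store an observed value (plus an optional expectation string) for the report.
        self.values[name] = {"value": value, "expected": expected}

    def check(self, label: str, passed: bool, *, actual: str, expected_val: str) -> None:
        # Record a named pass/fail row; the test's own assert still enforces the condition.
        self.checks.append(
            {"label": label, "passed": bool(passed), "actual": actual, "expected": expected_val}
        )


@pytest.fixture
def artifact(request):
    art = Artifact()
    yield art
    # Stash on the test item so a reporting hook can collect it after the run.
    request.node._artifact = art
```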
@@ -16,15 +16,24 @@ def _make_tools(*names: str) -> list[Tool]:
     return [Tool(name=n, description=f"Tool {n}", parameters={}) for n in names]


-def test_queen_phase_state_initial_phase():
+def test_queen_phase_state_initial_phase(artifact):
     """QueenPhaseState should default to 'building' phase."""
     from framework.tools.queen_lifecycle_tools import QueenPhaseState

     state = QueenPhaseState()

+    artifact.record_value("phase", state.phase, expected="default phase == 'building'")
+
+    artifact.check(
+        "default phase is building",
+        state.phase == "building",
+        actual=repr(state.phase),
+        expected_val="'building'",
+    )
     assert state.phase == "building"


-def test_queen_phase_state_planning_tools():
+def test_queen_phase_state_planning_tools(artifact):
     """Planning phase should return planning_tools."""
     from framework.tools.queen_lifecycle_tools import QueenPhaseState

@@ -34,11 +43,31 @@ def test_queen_phase_state_planning_tools():

     tools = state.get_current_tools()
     tool_names = {t.name for t in tools}

+    artifact.record_value(
+        "tool_names",
+        sorted(tool_names),
+        expected="planning tools include list_agent_tools, exclude edit_file",
+    )
+
+    artifact.check(
+        "list_agent_tools in tools",
+        "list_agent_tools" in tool_names,
+        actual=str(sorted(tool_names)),
+        expected_val="contains 'list_agent_tools'",
+    )
     assert "list_agent_tools" in tool_names

+    artifact.check(
+        "edit_file not in tools",
+        "edit_file" not in tool_names,
+        actual=str(sorted(tool_names)),
+        expected_val="does not contain 'edit_file'",
+    )
     assert "edit_file" not in tool_names


-def test_queen_phase_state_building_tools():
+def test_queen_phase_state_building_tools(artifact):
     """Building phase should return building_tools."""
     from framework.tools.queen_lifecycle_tools import QueenPhaseState

@@ -48,11 +77,31 @@ def test_queen_phase_state_building_tools():

     tools = state.get_current_tools()
     tool_names = {t.name for t in tools}

+    artifact.record_value(
+        "tool_names",
+        sorted(tool_names),
+        expected="building tools include edit_file, exclude list_agent_tools",
+    )
+
+    artifact.check(
+        "edit_file in tools",
+        "edit_file" in tool_names,
+        actual=str(sorted(tool_names)),
+        expected_val="contains 'edit_file'",
+    )
     assert "edit_file" in tool_names

+    artifact.check(
+        "list_agent_tools not in tools",
+        "list_agent_tools" not in tool_names,
+        actual=str(sorted(tool_names)),
+        expected_val="does not contain 'list_agent_tools'",
+    )
     assert "list_agent_tools" not in tool_names


-def test_queen_phase_state_tool_switching():
+def test_queen_phase_state_tool_switching(artifact):
     """Switching phase should change which tools are returned."""
     from framework.tools.queen_lifecycle_tools import QueenPhaseState

@@ -62,33 +111,96 @@ def test_queen_phase_state_tool_switching():
     state.staging_tools = _make_tools("c")
     state.running_tools = _make_tools("d")

+    planning_tool = state.get_current_tools()[0].name
+    artifact.check(
+        "planning returns tool 'a'",
+        planning_tool == "a",
+        actual=repr(planning_tool),
+        expected_val="'a'",
+    )
     assert state.get_current_tools()[0].name == "a"

     state.phase = "building"
+    building_tool = state.get_current_tools()[0].name
+    artifact.check(
+        "building returns tool 'b'",
+        building_tool == "b",
+        actual=repr(building_tool),
+        expected_val="'b'",
+    )
     assert state.get_current_tools()[0].name == "b"

     state.phase = "staging"
+    staging_tool = state.get_current_tools()[0].name
+    artifact.check(
+        "staging returns tool 'c'",
+        staging_tool == "c",
+        actual=repr(staging_tool),
+        expected_val="'c'",
+    )
     assert state.get_current_tools()[0].name == "c"

     state.phase = "running"
+    running_tool = state.get_current_tools()[0].name
+    artifact.check(
+        "running returns tool 'd'",
+        running_tool == "d",
+        actual=repr(running_tool),
+        expected_val="'d'",
+    )
     assert state.get_current_tools()[0].name == "d"

+    artifact.record_value(
+        "tool_per_phase",
+        {"planning": "a", "building": "b", "staging": "c", "running": "d"},
+        expected="each phase returns its own tool",
+    )

-def test_queen_initial_phase_no_worker():

+def test_queen_initial_phase_no_worker(artifact):
     """Without a worker identity, queen should start in 'planning'."""
     # This tests the logic in queen_orchestrator.py line 106:
     # initial_phase = "staging" if worker_identity else "planning"
     worker_identity = None
     initial_phase = "staging" if worker_identity else "planning"

+    artifact.record_value(
+        "initial_phase",
+        initial_phase,
+        expected="'planning' when worker_identity is None",
+    )
+
+    artifact.check(
+        "initial phase is planning",
+        initial_phase == "planning",
+        actual=repr(initial_phase),
+        expected_val="'planning'",
+    )
     assert initial_phase == "planning"


-def test_queen_initial_phase_with_worker():
+def test_queen_initial_phase_with_worker(artifact):
     """With a worker identity, queen should start in 'staging'."""
     worker_identity = "my_agent"
     initial_phase = "staging" if worker_identity else "planning"

+    artifact.record_value(
+        "initial_phase",
+        initial_phase,
+        expected="'staging' when worker_identity is set",
+    )
+
+    artifact.check(
+        "initial phase is staging",
+        initial_phase == "staging",
+        actual=repr(initial_phase),
+        expected_val="'staging'",
+    )
     assert initial_phase == "staging"


 @pytest.mark.asyncio
-async def test_queen_phase_switch_emits_event():
+async def test_queen_phase_switch_emits_event(artifact):
     """Phase transition should emit QUEEN_PHASE_CHANGED event."""
     from framework.runtime.event_bus import EventBus, EventType
     from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -110,12 +222,36 @@ async def test_queen_phase_switch_emits_event():

     await state.switch_to_building(source="tool")

+    artifact.record_value("phase", state.phase, expected="'building'")
+    artifact.record_value("event_count", len(phase_events))
+
+    artifact.check(
+        "phase is building",
+        state.phase == "building",
+        actual=repr(state.phase),
+        expected_val="'building'",
+    )
     assert state.phase == "building"

+    artifact.check(
+        "at least 1 phase event",
+        len(phase_events) >= 1,
+        actual=str(len(phase_events)),
+        expected_val=">=1",
+    )
     assert len(phase_events) >= 1

+    event_phase = phase_events[0].data.get("phase")
+    artifact.check(
+        "event reports building",
+        event_phase == "building",
+        actual=repr(event_phase),
+        expected_val="'building'",
+    )
     assert phase_events[0].data.get("phase") == "building"


-def test_queen_draft_graph_persists_across_turns():
+def test_queen_draft_graph_persists_across_turns(artifact):
     """Draft graph stored on phase_state should survive phase changes."""
     from framework.tools.queen_lifecycle_tools import QueenPhaseState

@@ -126,5 +262,24 @@ def test_queen_draft_graph_persists_across_turns():
     state.phase = "building"

     # Draft should still be available
+    artifact.record_value(
+        "draft_graph",
+        state.draft_graph,
+        expected="draft_graph survives phase change, nodes=['a','b']",
+    )
+
+    artifact.check(
+        "draft_graph is not None",
+        state.draft_graph is not None,
+        actual=repr(state.draft_graph),
+        expected_val="non-None",
+    )
     assert state.draft_graph is not None

+    artifact.check(
+        "draft has 2 nodes",
+        len(state.draft_graph["nodes"]) == 2,
+        actual=str(len(state.draft_graph["nodes"])),
+        expected_val="2",
+    )
     assert len(state.draft_graph["nodes"]) == 2

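One plausible way for the suite to sweep these per-test artifacts into report data is a `pytest_runtest_makereport` hook. This is a hypothetical sketch that pairs with the fixture sketch above, not code from this commit.

```python
# Hypothetical collection hook: gathers each test's stashed artifact so the
# HTML report generator can render per-test input -> outcome rows.
import pytest

collected_artifacts: dict[str, object] = {}


@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    outcome = yield
    report = outcome.get_result()
    if report.when == "call":
        # The fixture sketch above stashes the artifact on the test item.
        art = getattr(item, "_artifact", None)
        if art is not None:
            collected_artifacts[item.nodeid] = art
```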
@@ -0,0 +1,772 @@
"""Component tests: Queen Live Phase Switching — real LLM, real event bus.

Starts the actual queen via create_queen() with a real LLM provider and
verifies phase transitions, dynamic tool switching, prompt switching, and
event emission through the full queen lifecycle.
"""

from __future__ import annotations

import asyncio
import time
from dataclasses import dataclass, field
from pathlib import Path
from unittest.mock import MagicMock

import pytest

from framework.runtime.event_bus import AgentEvent, EventBus, EventType
from framework.server.session_manager import Session


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

QUEEN_STARTUP_TIMEOUT = 30  # seconds to wait for queen to initialize
QUEEN_RESPONSE_TIMEOUT = 60  # seconds to wait for queen to respond to a message


@dataclass
class PhaseCapture:
    """Captures QUEEN_PHASE_CHANGED events."""

    phases: list[str] = field(default_factory=list)
    events: list[AgentEvent] = field(default_factory=list)
    _waiters: list[tuple[str, asyncio.Event]] = field(default_factory=list)

    async def on_event(self, event: AgentEvent) -> None:
        phase = event.data.get("phase", "")
        self.phases.append(phase)
        self.events.append(event)
        # Wake any waiters for this phase
        for target_phase, evt in self._waiters:
            if phase == target_phase:
                evt.set()

    async def wait_for_phase(self, phase: str, timeout: float = 30) -> bool:
        """Wait until a specific phase change is observed."""
        if phase in self.phases:
            return True
        evt = asyncio.Event()
        self._waiters.append((phase, evt))
        try:
            await asyncio.wait_for(evt.wait(), timeout=timeout)
            return True
        except (TimeoutError, asyncio.TimeoutError):
            return False


@dataclass
class TextCapture:
    """Captures LLM text deltas to verify queen is responding."""

    chunks: list[str] = field(default_factory=list)
    _has_text: asyncio.Event = field(default_factory=asyncio.Event)

    async def on_event(self, event: AgentEvent) -> None:
        text = event.data.get("content", "")
        if text:
            self.chunks.append(text)
            self._has_text.set()

    async def wait_for_text(self, timeout: float = 30) -> bool:
        try:
            await asyncio.wait_for(self._has_text.wait(), timeout=timeout)
            return True
        except (TimeoutError, asyncio.TimeoutError):
            return False

    @property
    def full_text(self) -> str:
        return "".join(self.chunks)


def _make_mock_session_manager() -> MagicMock:
    """Create a minimal mock SessionManager that satisfies create_queen()."""
    mgr = MagicMock()
    # _subscribe_worker_handoffs needs to exist but can be a no-op for tests
    mgr._subscribe_worker_handoffs = MagicMock()
    return mgr


async def _start_queen(
    llm_provider,
    tmp_path: Path,
    *,
    worker_identity: str | None = None,
    initial_prompt: str | None = None,
) -> tuple[Session, asyncio.Task]:
    """Start a real queen and return (session, task)."""
    from framework.server.queen_orchestrator import create_queen

    event_bus = EventBus()
    session = Session(
        id=f"test_{int(time.time())}",
        event_bus=event_bus,
        llm=llm_provider,
        loaded_at=time.time(),
    )

    queen_dir = tmp_path / "queen"
    queen_dir.mkdir(parents=True, exist_ok=True)

    mgr = _make_mock_session_manager()

    task = await create_queen(
        session=session,
        session_manager=mgr,
        worker_identity=worker_identity,
        queen_dir=queen_dir,
        initial_prompt=initial_prompt,
    )

    # Wait for queen to initialize (queen_executor is set inside the task)
    for _ in range(QUEEN_STARTUP_TIMEOUT * 10):
        if session.queen_executor is not None:
            break
        await asyncio.sleep(0.1)

    assert session.queen_executor is not None, "Queen executor did not initialize"
    assert session.phase_state is not None, "Phase state not set"

    return session, task


async def _shutdown_queen(session: Session, task: asyncio.Task) -> None:
    """Cleanly shut down the queen."""
    # Signal the event loop node to stop
    node = session.queen_executor.node_registry.get("queen") if session.queen_executor else None
    if node and hasattr(node, "signal_shutdown"):
        node.signal_shutdown()

    # Cancel the task as backup
    if not task.done():
        task.cancel()
    try:
        await asyncio.wait_for(task, timeout=5)
    except (asyncio.CancelledError, TimeoutError, asyncio.TimeoutError):
        pass


# ---------------------------------------------------------------------------
# Tests: Initial Phase
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_queen_starts_in_planning_without_worker(llm_provider, tmp_path, artifact):
    """Queen with no worker_identity must start in 'planning' phase."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity=None,
        initial_prompt="Hello",
    )
    try:
        actual_phase = session.phase_state.phase
        artifact.record_value(
            "phase", actual_phase, expected="phase == 'planning' when no worker_identity"
        )

        artifact.check(
            "phase is planning",
            actual_phase == "planning",
            actual=repr(actual_phase),
            expected_val="'planning'",
        )
        assert session.phase_state.phase == "planning", (
            f"Expected planning, got {session.phase_state.phase}"
        )
    finally:
        await _shutdown_queen(session, task)


@pytest.mark.asyncio
async def test_queen_starts_in_staging_with_worker(llm_provider, tmp_path, artifact):
    """Queen with worker_identity must start in 'staging' phase."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity="test_agent",
        initial_prompt="Hello",
    )
    try:
        actual_phase = session.phase_state.phase
        artifact.record_value(
            "phase", actual_phase, expected="phase == 'staging' when worker_identity is set"
        )

        artifact.check(
            "phase is staging",
            actual_phase == "staging",
            actual=repr(actual_phase),
            expected_val="'staging'",
        )
        assert session.phase_state.phase == "staging", (
            f"Expected staging, got {session.phase_state.phase}"
        )
    finally:
        await _shutdown_queen(session, task)


# ---------------------------------------------------------------------------
# Tests: Tool Availability Per Phase
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_queen_planning_tools_available(llm_provider, tmp_path, artifact):
    """In planning phase, planning tools must be returned by get_current_tools()."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity=None,
        initial_prompt="Hello",
    )
    try:
        ps = session.phase_state
        artifact.record_value(
            "phase",
            ps.phase,
            expected="phase='planning', tools include list_agent_tools, exclude edit_file",
        )

        artifact.check(
            "phase is planning",
            ps.phase == "planning",
            actual=repr(ps.phase),
            expected_val="'planning'",
        )
        assert ps.phase == "planning"

        tool_names = {t.name for t in ps.get_current_tools()}
        artifact.record_value("tool_names", sorted(tool_names))

        # Planning phase must have agent discovery tools
        artifact.check(
            "list_agent_tools in tools",
            "list_agent_tools" in tool_names,
            actual=str(sorted(tool_names)),
            expected_val="contains 'list_agent_tools'",
        )
        assert "list_agent_tools" in tool_names, (
            f"list_agent_tools missing from planning tools: {tool_names}"
        )
        # Planning phase must NOT have building-only tools
        artifact.check(
            "edit_file not in tools",
            "edit_file" not in tool_names,
            actual=str(sorted(tool_names)),
            expected_val="does not contain 'edit_file'",
        )
        assert "edit_file" not in tool_names, (
            f"edit_file should not be in planning tools: {tool_names}"
        )
    finally:
        await _shutdown_queen(session, task)


@pytest.mark.asyncio
async def test_queen_tools_change_on_phase_switch(llm_provider, tmp_path, artifact):
    """Switching phase must change the tools returned by get_current_tools()."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity=None,
        initial_prompt="Hello",
    )
    try:
        ps = session.phase_state
        planning_tools = {t.name for t in ps.get_current_tools()}
        artifact.record_value(
            "planning_tools",
            sorted(planning_tools),
            expected="planning, building, and staging tool sets all differ",
        )

        # Switch to building
        await ps.switch_to_building(source="test")
        building_tools = {t.name for t in ps.get_current_tools()}
        artifact.record_value("building_tools", sorted(building_tools))

        artifact.check(
            "planning != building tools",
            planning_tools != building_tools,
            actual=f"planning={sorted(planning_tools)}, building={sorted(building_tools)}",
            expected_val="different sets",
        )
        assert planning_tools != building_tools, "Planning and building tools must differ"

        # Switch to staging
        await ps.switch_to_staging(source="test")
        staging_tools = {t.name for t in ps.get_current_tools()}
        artifact.record_value("staging_tools", sorted(staging_tools))

        artifact.check(
            "staging != building tools",
            staging_tools != building_tools,
            actual=f"staging={sorted(staging_tools)}, building={sorted(building_tools)}",
            expected_val="different sets",
        )
        assert staging_tools != building_tools, "Building and staging tools must differ"
    finally:
        await _shutdown_queen(session, task)


# ---------------------------------------------------------------------------
# Tests: Prompt Switching
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_queen_prompt_changes_on_phase_switch(llm_provider, tmp_path, artifact):
    """Switching phase must change the system prompt returned by get_current_prompt()."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity=None,
        initial_prompt="Hello",
    )
    try:
        ps = session.phase_state
        planning_prompt = ps.get_current_prompt()
        artifact.record_value(
            "planning_prompt_len",
            len(planning_prompt),
            expected="non-empty planning and building prompts that differ",
        )

        artifact.check(
            "planning prompt non-empty",
            len(planning_prompt) > 0,
            actual=str(len(planning_prompt)),
            expected_val=">0",
        )
        assert len(planning_prompt) > 0, "Planning prompt should not be empty"

        await ps.switch_to_building(source="test")
        building_prompt = ps.get_current_prompt()
        artifact.record_value("building_prompt_len", len(building_prompt))

        artifact.check(
            "building prompt non-empty",
            len(building_prompt) > 0,
            actual=str(len(building_prompt)),
            expected_val=">0",
        )
        assert len(building_prompt) > 0, "Building prompt should not be empty"

        artifact.check(
            "prompts differ",
            planning_prompt != building_prompt,
            actual=f"planning_len={len(planning_prompt)}, building_len={len(building_prompt)}",
            expected_val="different prompts",
        )
        assert planning_prompt != building_prompt, "Planning and building prompts must differ"
    finally:
        await _shutdown_queen(session, task)


# ---------------------------------------------------------------------------
# Tests: Phase Change Events
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_queen_emits_phase_change_events(llm_provider, tmp_path, artifact):
    """Each phase switch must emit a QUEEN_PHASE_CHANGED event."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity=None,
        initial_prompt="Hello",
    )
    capture = PhaseCapture()
    session.event_bus.subscribe(
        event_types=[EventType.QUEEN_PHASE_CHANGED],
        handler=capture.on_event,
    )
    try:
        ps = session.phase_state

        # planning -> building
        await ps.switch_to_building(source="test")
        assert await capture.wait_for_phase("building", timeout=5)

        # building -> staging
        await ps.switch_to_staging(source="test")
        assert await capture.wait_for_phase("staging", timeout=5)

        # staging -> running
        await ps.switch_to_running(source="test")
        assert await capture.wait_for_phase("running", timeout=5)

        # running -> planning
        await ps.switch_to_planning(source="test")
        assert await capture.wait_for_phase("planning", timeout=5)

        artifact.record_value(
            "phases", capture.phases, expected="['building', 'staging', 'running', 'planning']"
        )

        artifact.check(
            "phase sequence matches",
            capture.phases == ["building", "staging", "running", "planning"],
            actual=str(capture.phases),
            expected_val="['building', 'staging', 'running', 'planning']",
        )
        assert capture.phases == ["building", "staging", "running", "planning"], (
            f"Phase sequence was: {capture.phases}"
        )
    finally:
        await _shutdown_queen(session, task)


@pytest.mark.asyncio
async def test_queen_no_duplicate_phase_event_on_same_phase(llm_provider, tmp_path, artifact):
    """Switching to the same phase should NOT emit a duplicate event."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity=None,
        initial_prompt="Hello",
    )
    capture = PhaseCapture()
    session.event_bus.subscribe(
        event_types=[EventType.QUEEN_PHASE_CHANGED],
        handler=capture.on_event,
    )
    try:
        ps = session.phase_state

        artifact.check(
            "initial phase is planning",
            ps.phase == "planning",
            actual=repr(ps.phase),
            expected_val="'planning'",
        )
        assert ps.phase == "planning"

        # Switch to building twice
        await ps.switch_to_building(source="test")
        await asyncio.sleep(0.2)
        await ps.switch_to_building(source="test")  # no-op
        await asyncio.sleep(0.2)

        # Should only have one "building" event
        building_events = [p for p in capture.phases if p == "building"]

        artifact.record_value(
            "building_event_count",
            len(building_events),
            expected="exactly 1 building event (no duplicate)",
        )
        artifact.record_value("all_phases", capture.phases)

        artifact.check(
            "only 1 building event",
            len(building_events) == 1,
            actual=str(len(building_events)),
            expected_val="1",
        )
        assert len(building_events) == 1, (
            f"Expected 1 building event, got {len(building_events)}: {capture.phases}"
        )
    finally:
        await _shutdown_queen(session, task)


# ---------------------------------------------------------------------------
# Tests: Queen Responds in Correct Phase
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_queen_responds_to_message(llm_provider, tmp_path, artifact):
    """Queen must produce an LLM turn when started with an initial prompt."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity=None,
        initial_prompt="Hello, I want to build an agent.",
    )
    turn_complete = asyncio.Event()

    async def _on_turn(event: AgentEvent) -> None:
        turn_complete.set()

    session.event_bus.subscribe(
        event_types=[EventType.LLM_TURN_COMPLETE],
        handler=_on_turn,
        filter_stream="queen",
    )
    try:
        # Queen should complete at least one LLM turn (text or tool call)
        got_turn = False
        try:
            await asyncio.wait_for(turn_complete.wait(), timeout=QUEEN_RESPONSE_TIMEOUT)
            got_turn = True
        except (TimeoutError, asyncio.TimeoutError):
            pass

        artifact.record_value(
            "got_turn", got_turn, expected="queen completes at least one LLM turn"
        )

        artifact.check(
            "queen completed LLM turn", got_turn, actual=str(got_turn), expected_val="True"
        )
        assert got_turn, "Queen did not complete any LLM turn"
    finally:
        await _shutdown_queen(session, task)


@pytest.mark.asyncio
async def test_queen_responds_after_injected_message(llm_provider, tmp_path, artifact):
    """Injecting a user message must trigger a new queen LLM turn."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity=None,
        initial_prompt="Hello",
    )
    try:
        # Wait for initial response to settle
        first_turn = asyncio.Event()

        async def _on_first_turn(event: AgentEvent) -> None:
            first_turn.set()

        sub_id = session.event_bus.subscribe(
            event_types=[EventType.LLM_TURN_COMPLETE],
            handler=_on_first_turn,
            filter_stream="queen",
        )
        try:
            await asyncio.wait_for(first_turn.wait(), timeout=QUEEN_RESPONSE_TIMEOUT)
        except (TimeoutError, asyncio.TimeoutError):
            pass
        session.event_bus.unsubscribe(sub_id)

        # Now inject a follow-up and listen for a new turn
        second_turn = asyncio.Event()

        async def _on_second_turn(event: AgentEvent) -> None:
            second_turn.set()

        session.event_bus.subscribe(
            event_types=[EventType.LLM_TURN_COMPLETE],
            handler=_on_second_turn,
            filter_stream="queen",
        )

        node = session.queen_executor.node_registry.get("queen")
        assert node is not None
        await node.inject_event(
            "What tools do you have available?",
            is_client_input=True,
        )

        got_turn = False
        try:
            await asyncio.wait_for(second_turn.wait(), timeout=QUEEN_RESPONSE_TIMEOUT)
            got_turn = True
        except (TimeoutError, asyncio.TimeoutError):
            pass

        artifact.record_value(
            "got_second_turn", got_turn, expected="queen responds to injected message"
        )

        artifact.check(
            "queen responded to injected message",
            got_turn,
            actual=str(got_turn),
            expected_val="True",
        )
        assert got_turn, "Queen did not respond to injected message"
    finally:
        await _shutdown_queen(session, task)


# ---------------------------------------------------------------------------
# Tests: Phase Transition Cycle
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_queen_full_phase_cycle_with_events(llm_provider, tmp_path, artifact):
    """Walk through all 4 phases and verify state + events at each step."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity=None,
        initial_prompt="Hello",
    )
    capture = PhaseCapture()
    session.event_bus.subscribe(
        event_types=[EventType.QUEEN_PHASE_CHANGED],
        handler=capture.on_event,
    )
    try:
        ps = session.phase_state

        # Start: planning
        artifact.check(
            "initial phase is planning",
            ps.phase == "planning",
            actual=repr(ps.phase),
            expected_val="'planning'",
        )
        assert ps.phase == "planning"
        planning_tools = {t.name for t in ps.get_current_tools()}

        # -> building
        await ps.switch_to_building(source="test")
        artifact.check(
            "phase is building",
            ps.phase == "building",
            actual=repr(ps.phase),
            expected_val="'building'",
        )
        assert ps.phase == "building"
        building_tools = {t.name for t in ps.get_current_tools()}

        artifact.check(
            "building tools differ from planning",
            building_tools != planning_tools,
            actual=f"building={sorted(building_tools)}",
            expected_val="different from planning",
        )
        assert building_tools != planning_tools

        # -> staging
        await ps.switch_to_staging(source="test")
        artifact.check(
            "phase is staging",
            ps.phase == "staging",
            actual=repr(ps.phase),
            expected_val="'staging'",
        )
        assert ps.phase == "staging"
        staging_tools = {t.name for t in ps.get_current_tools()}

        # -> running
        await ps.switch_to_running(source="test")
        artifact.check(
            "phase is running",
            ps.phase == "running",
            actual=repr(ps.phase),
            expected_val="'running'",
        )
        assert ps.phase == "running"
        running_tools = {t.name for t in ps.get_current_tools()}

        # -> back to planning
        await ps.switch_to_planning(source="test")
        artifact.check(
            "phase is planning again",
            ps.phase == "planning",
            actual=repr(ps.phase),
            expected_val="'planning'",
        )
        assert ps.phase == "planning"
        final_tools = {t.name for t in ps.get_current_tools()}

        artifact.check(
            "final tools match original planning set",
            final_tools == planning_tools,
            actual=f"final={sorted(final_tools)}",
            expected_val=f"planning={sorted(planning_tools)}",
        )
        assert final_tools == planning_tools, "Tools should match original planning set"

        # Verify events
        await asyncio.sleep(0.3)
        artifact.record_value(
            "phase_events",
            capture.phases,
            expected="['building', 'staging', 'running', 'planning']",
        )

        artifact.check(
            "phase event sequence",
            capture.phases == ["building", "staging", "running", "planning"],
            actual=str(capture.phases),
            expected_val="['building', 'staging', 'running', 'planning']",
        )
        assert capture.phases == ["building", "staging", "running", "planning"]

        # Verify all 4 phase tool sets are distinct
        all_sets = [planning_tools, building_tools, staging_tools, running_tools]
        for i, a in enumerate(all_sets):
            for j, b in enumerate(all_sets):
                if i != j:
                    phase_names = ["planning", "building", "staging", "running"]
                    artifact.check(
                        f"{phase_names[i]} != {phase_names[j]} tools",
                        a != b,
                        actual=f"{phase_names[i]}={sorted(a)}, {phase_names[j]}={sorted(b)}",
                        expected_val="different",
                    )
                    assert a != b, f"Phase tool sets {i} and {j} should differ but are identical"
    finally:
        await _shutdown_queen(session, task)


@pytest.mark.asyncio
async def test_queen_phase_state_persists_draft(llm_provider, tmp_path, artifact):
    """Draft graph on phase_state must survive phase transitions."""
    session, task = await _start_queen(
        llm_provider,
        tmp_path,
        worker_identity=None,
        initial_prompt="Hello",
    )
    try:
        ps = session.phase_state
        ps.draft_graph = {"nodes": ["a", "b"], "edges": ["a->b"]}

        await ps.switch_to_building(source="test")
        artifact.check(
            "draft survives building switch",
            ps.draft_graph is not None,
            actual=repr(ps.draft_graph),
            expected_val="non-None",
        )
        assert ps.draft_graph is not None

        artifact.check(
            "draft nodes intact after building",
            ps.draft_graph["nodes"] == ["a", "b"],
            actual=str(ps.draft_graph["nodes"]),
            expected_val="['a', 'b']",
        )
        assert ps.draft_graph["nodes"] == ["a", "b"]

        await ps.switch_to_staging(source="test")
        artifact.check(
            "draft survives staging switch",
            ps.draft_graph is not None,
            actual=repr(ps.draft_graph),
            expected_val="non-None",
        )
        assert ps.draft_graph is not None

        await ps.switch_to_running(source="test")
        artifact.check(
            "draft survives running switch",
            ps.draft_graph is not None,
            actual=repr(ps.draft_graph),
            expected_val="non-None",
        )
        assert ps.draft_graph is not None

        artifact.record_value(
            "final_draft_graph",
            ps.draft_graph,
            expected="draft_graph survives all phase transitions",
        )
    finally:
        await _shutdown_queen(session, task)
@@ -0,0 +1,678 @@
"""Component tests: Queen State Machine Edge Cases.

Race conditions, invalid transitions, stale events.

These tests confirm real bugs and edge cases in the queen's phase
state machine:
- Non-atomic phase switch + event emission
- Stale worker completion events ignored during wrong phase
- No guards against invalid phase transitions
- Double phase switch deduplication
- inject_notification after executor teardown
- Empty tool lists per phase
- Phase persistence across rapid cycling
"""

from __future__ import annotations

import asyncio
import time
from pathlib import Path
from unittest.mock import MagicMock

import pytest

from framework.runtime.event_bus import AgentEvent, EventBus, EventType
from framework.server.session_manager import Session
from framework.tools.queen_lifecycle_tools import QueenPhaseState


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

QUEEN_STARTUP_TIMEOUT = 30


async def _start_queen_session(llm_provider, tmp_path, *, worker_identity=None):
    """Start a real queen and return (session, task)."""
    from framework.server.queen_orchestrator import create_queen

    event_bus = EventBus()
    session = Session(
        id=f"test_{int(time.time())}",
        event_bus=event_bus,
        llm=llm_provider,
        loaded_at=time.time(),
    )
    queen_dir = tmp_path / "queen"
    queen_dir.mkdir(parents=True, exist_ok=True)

    mgr = MagicMock()
    mgr._subscribe_worker_handoffs = MagicMock()

    task = await create_queen(
        session=session,
        session_manager=mgr,
        worker_identity=worker_identity,
        queen_dir=queen_dir,
        initial_prompt="Hello",
    )

    for _ in range(QUEEN_STARTUP_TIMEOUT * 10):
        if session.queen_executor is not None:
            break
        await asyncio.sleep(0.1)

    assert session.queen_executor is not None
    return session, task


async def _shutdown(session, task):
    node = session.queen_executor.node_registry.get("queen") if session.queen_executor else None
    if node and hasattr(node, "signal_shutdown"):
        node.signal_shutdown()
    if not task.done():
        task.cancel()
    try:
        await asyncio.wait_for(task, timeout=5)
    except (asyncio.CancelledError, TimeoutError, asyncio.TimeoutError):
        pass

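
# Editor's note: these tests observe QueenPhaseState only from the outside.
# The behavior they pin down (same-phase switch is a no-op, event_bus=None is
# tolerated, state commits before the event fires) is consistent with a switch
# implementation of roughly this shape. This is a sketch inferred from the
# assertions below, not the actual framework code:
#
#     async def switch_to_building(self, *, source: str) -> None:
#         if self.phase == "building":  # double switch is a no-op, no event
#             return
#         self.phase = "building"  # state commits first ...
#         if self.event_bus is not None:  # ... then the event is emitted,
#             await self.event_bus.publish(  # so the pair is not atomic
#                 AgentEvent(
#                     type=EventType.QUEEN_PHASE_CHANGED,
#                     stream_id="queen",  # assumed stream id
#                     data={"phase": "building", "source": source},
#                 )
#             )
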
# -----------------------------------------------------------------------
|
||||
# BUG #1: Concurrent phase switches — no crash or lost events
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_concurrent_phase_switches_no_crash(llm_provider, tmp_path, artifact):
|
||||
"""Firing multiple phase switches concurrently must not crash."""
|
||||
session, task = await _start_queen_session(llm_provider, tmp_path)
|
||||
phases_seen = []
|
||||
|
||||
async def _capture(event: AgentEvent):
|
||||
phases_seen.append(event.data.get("phase"))
|
||||
|
||||
session.event_bus.subscribe(
|
||||
event_types=[EventType.QUEEN_PHASE_CHANGED],
|
||||
handler=_capture,
|
||||
)
|
||||
try:
|
||||
ps = session.phase_state
|
||||
# Fire 4 phase switches concurrently
|
||||
await asyncio.gather(
|
||||
ps.switch_to_building(source="test"),
|
||||
ps.switch_to_staging(source="test"),
|
||||
ps.switch_to_running(source="test"),
|
||||
ps.switch_to_planning(source="test"),
|
||||
)
|
||||
await asyncio.sleep(0.3)
|
||||
|
||||
valid_phases = ("planning", "building", "staging", "running")
|
||||
|
||||
artifact.record_value(
|
||||
"final_phase",
|
||||
ps.phase,
|
||||
expected="valid phase (not corrupted)",
|
||||
)
|
||||
artifact.record_value("phases_seen", phases_seen)
|
||||
|
||||
artifact.check(
|
||||
"phase is valid",
|
||||
ps.phase in valid_phases,
|
||||
actual=repr(ps.phase),
|
||||
expected_val="one of planning/building/staging/running",
|
||||
)
|
||||
assert ps.phase in valid_phases, f"Phase corrupted: {ps.phase}"
|
||||
|
||||
artifact.check(
|
||||
"at least 1 phase event",
|
||||
len(phases_seen) >= 1,
|
||||
actual=str(len(phases_seen)),
|
||||
expected_val=">=1",
|
||||
)
|
||||
assert len(phases_seen) >= 1, "No phase change events"
|
||||
finally:
|
||||
await _shutdown(session, task)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# BUG #3: Non-atomic phase change + event
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_phase_changes_without_event_bus(artifact):
|
||||
"""Phase must still change when event_bus is None (no crash)."""
|
||||
ps = QueenPhaseState(phase="planning", event_bus=None)
|
||||
|
||||
await ps.switch_to_building(source="test")
|
||||
|
||||
artifact.record_value(
|
||||
"phase",
|
||||
ps.phase,
|
||||
expected="'building' even without event bus",
|
||||
)
|
||||
|
||||
artifact.check(
|
||||
"phase changed to building",
|
||||
ps.phase == "building",
|
||||
actual=repr(ps.phase),
|
||||
expected_val="'building'",
|
||||
)
|
||||
assert ps.phase == "building", "Phase should change even without event bus"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_phase_change_committed_before_event(artifact):
|
||||
"""Phase assignment before event emission — verify both occur."""
|
||||
bus = EventBus()
|
||||
phases_at_event_time = []
|
||||
|
||||
async def _capture(event: AgentEvent):
|
||||
phases_at_event_time.append(event.data.get("phase"))
|
||||
|
||||
bus.subscribe(
|
||||
event_types=[EventType.QUEEN_PHASE_CHANGED],
|
||||
handler=_capture,
|
||||
)
|
||||
|
||||
ps = QueenPhaseState(phase="planning", event_bus=bus)
|
||||
await ps.switch_to_building(source="test")
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
artifact.record_value(
|
||||
"phase",
|
||||
ps.phase,
|
||||
expected="'building', event reports 'building'",
|
||||
)
|
||||
artifact.record_value(
|
||||
"phases_at_event_time",
|
||||
phases_at_event_time,
|
||||
)
|
||||
|
||||
artifact.check(
|
||||
"phase is building",
|
||||
ps.phase == "building",
|
||||
actual=repr(ps.phase),
|
||||
expected_val="'building'",
|
||||
)
|
||||
assert ps.phase == "building"
|
||||
|
||||
artifact.check(
|
||||
"event reports building",
|
||||
phases_at_event_time == ["building"],
|
||||
actual=str(phases_at_event_time),
|
||||
expected_val="['building']",
|
||||
)
|
||||
assert phases_at_event_time == ["building"], (
|
||||
f"Event should report 'building', got: {phases_at_event_time}"
|
||||
)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# BUG #4: Stale worker done events during non-running phase
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_worker_done_ignored_in_non_running_phase(llm_provider, tmp_path, artifact):
|
||||
"""Worker completion in planning phase must be silently dropped.
|
||||
|
||||
This confirms BUG #4: the _on_worker_done handler only processes
|
||||
events when phase == 'running'. Events in other phases are lost.
|
||||
"""
|
||||
session, task = await _start_queen_session(llm_provider, tmp_path)
|
||||
phase_changes = []
|
||||
|
||||
async def _capture(event: AgentEvent):
|
||||
phase_changes.append(event.data.get("phase"))
|
||||
|
||||
session.event_bus.subscribe(
|
||||
event_types=[EventType.QUEEN_PHASE_CHANGED],
|
||||
handler=_capture,
|
||||
)
|
||||
try:
|
||||
ps = session.phase_state
|
||||
|
||||
artifact.check(
|
||||
"initial phase is planning",
|
||||
ps.phase == "planning",
|
||||
actual=repr(ps.phase),
|
||||
expected_val="'planning'",
|
||||
)
|
||||
assert ps.phase == "planning"
|
||||
|
||||
# Simulate a stale worker completion event
|
||||
await session.event_bus.publish(
|
||||
AgentEvent(
|
||||
type=EventType.EXECUTION_COMPLETED,
|
||||
stream_id="worker",
|
||||
data={"output": {"result": "stale output"}},
|
||||
)
|
||||
)
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
artifact.record_value(
|
||||
"phase_after_stale_event",
|
||||
ps.phase,
|
||||
expected="still 'planning' (stale event ignored)",
|
||||
)
|
||||
artifact.record_value("phase_changes", phase_changes)
|
||||
|
||||
artifact.check(
|
||||
"phase still planning",
|
||||
ps.phase == "planning",
|
||||
actual=repr(ps.phase),
|
||||
expected_val="'planning'",
|
||||
)
|
||||
assert ps.phase == "planning", f"Phase should still be planning, got: {ps.phase}"
|
||||
|
||||
artifact.check(
|
||||
"no auto-switch to staging",
|
||||
"staging" not in phase_changes,
|
||||
actual=str(phase_changes),
|
||||
expected_val="does not contain 'staging'",
|
||||
)
|
||||
assert "staging" not in phase_changes, (
|
||||
"Should not auto-switch to staging from planning phase"
|
||||
)
|
||||
finally:
|
||||
await _shutdown(session, task)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# BUG #10: No guards against invalid phase transitions
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_invalid_transition_planning_to_running(llm_provider, tmp_path, artifact):
|
||||
"""planning -> running should succeed (no guard).
|
||||
|
||||
This confirms BUG #10: the state machine allows any transition.
|
||||
"""
|
||||
session, task = await _start_queen_session(llm_provider, tmp_path)
|
||||
try:
|
||||
ps = session.phase_state
|
||||
|
||||
artifact.check(
|
||||
"initial phase is planning",
|
||||
ps.phase == "planning",
|
||||
actual=repr(ps.phase),
|
||||
expected_val="'planning'",
|
||||
)
|
||||
assert ps.phase == "planning"
|
||||
|
||||
await ps.switch_to_running(source="test")
|
||||
|
||||
artifact.record_value(
|
||||
"phase_after_invalid_transition",
|
||||
ps.phase,
|
||||
expected="'running' (no guard, transition allowed)",
|
||||
)
|
||||
|
||||
artifact.check(
|
||||
"phase is running",
|
||||
ps.phase == "running",
|
||||
actual=repr(ps.phase),
|
||||
expected_val="'running'",
|
||||
)
|
||||
assert ps.phase == "running", "switch_to_running should succeed from planning"
|
||||
finally:
|
||||
await _shutdown(session, task)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_invalid_transition_running_to_building(llm_provider, tmp_path, artifact):
|
||||
"""running -> building should succeed (no guard).
|
||||
|
||||
In production this could leave a running worker orphaned.
|
||||
"""
|
||||
session, task = await _start_queen_session(llm_provider, tmp_path)
|
||||
try:
|
||||
ps = session.phase_state
|
||||
await ps.switch_to_running(source="test")
|
||||
|
||||
artifact.check(
|
||||
"phase is running",
|
||||
ps.phase == "running",
|
||||
actual=repr(ps.phase),
|
||||
expected_val="'running'",
|
||||
)
|
||||
assert ps.phase == "running"
|
||||
|
||||
await ps.switch_to_building(source="test")
|
||||
|
||||
artifact.record_value(
|
||||
"phase_after_invalid_transition",
|
||||
ps.phase,
|
||||
expected="'building' (no guard)",
|
||||
)
|
||||
|
||||
artifact.check(
|
||||
"phase is building",
|
||||
ps.phase == "building",
|
||||
actual=repr(ps.phase),
|
||||
expected_val="'building'",
|
||||
)
|
||||
assert ps.phase == "building"
|
||||
finally:
|
||||
await _shutdown(session, task)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# BUG #1 supplement: Double phase switch deduplication
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_double_switch_to_same_phase_is_noop(llm_provider, tmp_path, artifact):
|
||||
"""switch_to_X when already in X must be a no-op (no event)."""
|
||||
session, task = await _start_queen_session(llm_provider, tmp_path)
|
||||
events = []
|
||||
|
||||
async def _capture(event: AgentEvent):
|
||||
events.append(event.data.get("phase"))
|
||||
|
||||
session.event_bus.subscribe(
|
||||
event_types=[EventType.QUEEN_PHASE_CHANGED],
|
||||
handler=_capture,
|
||||
)
|
||||
try:
|
||||
ps = session.phase_state
|
||||
await ps.switch_to_building(source="test")
|
||||
await asyncio.sleep(0.1)
|
||||
count_after_first = len(events)
|
||||
|
||||
# Second call to same phase
|
||||
await ps.switch_to_building(source="test")
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
artifact.record_value(
|
||||
"events_after_first",
|
||||
count_after_first,
|
||||
expected="no extra event after double switch",
|
||||
)
|
||||
artifact.record_value(
|
||||
"events_after_second",
|
||||
len(events),
|
||||
)
|
||||
artifact.record_value("all_events", events)
|
||||
|
||||
artifact.check(
|
||||
"no extra event on double switch",
|
||||
len(events) == count_after_first,
|
||||
actual=f"first={count_after_first}, second={len(events)}",
|
||||
expected_val="same count",
|
||||
)
|
||||
assert len(events) == count_after_first, (
|
||||
f"Double switch should not emit extra event. Events: {events}"
|
||||
)
|
||||
finally:
|
||||
await _shutdown(session, task)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# BUG #6: Phase with empty tool lists
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_phase_with_empty_tools_returns_empty(llm_provider, tmp_path, artifact):
|
||||
"""get_current_tools() with empty tool list returns [] not crash."""
|
||||
session, task = await _start_queen_session(llm_provider, tmp_path)
|
||||
try:
|
||||
ps = session.phase_state
|
||||
# Clear all running tools
|
||||
ps.running_tools = []
|
||||
await ps.switch_to_running(source="test")
|
||||
|
||||
tools = ps.get_current_tools()
|
||||
|
||||
artifact.record_value(
|
||||
"tool_count",
|
||||
len(tools),
|
||||
expected="0 (empty list, no crash)",
|
||||
)
|
||||
artifact.record_value(
|
||||
"tool_names",
|
||||
[t.name for t in tools],
|
||||
)
|
||||
|
||||
artifact.check(
|
||||
"empty tools returns []",
|
||||
tools == [],
|
||||
actual=str([t.name for t in tools]),
|
||||
expected_val="[]",
|
||||
)
|
||||
assert tools == [], f"Expected empty list, got: {[t.name for t in tools]}"
|
||||
finally:
|
||||
await _shutdown(session, task)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Rapid phase cycling — verify final state is consistent
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_rapid_phase_cycling_final_state(llm_provider, tmp_path, artifact):
|
||||
"""Rapidly cycling through phases must leave state consistent."""
|
||||
session, task = await _start_queen_session(llm_provider, tmp_path)
|
||||
all_events = []
|
||||
|
||||
async def _capture(event: AgentEvent):
|
||||
all_events.append(event.data.get("phase"))
|
||||
|
||||
session.event_bus.subscribe(
|
||||
event_types=[EventType.QUEEN_PHASE_CHANGED],
|
||||
handler=_capture,
|
||||
)
|
||||
try:
|
||||
ps = session.phase_state
|
||||
|
||||
# Cycle 3 times
|
||||
for _ in range(3):
|
||||
await ps.switch_to_building(source="test")
|
||||
await ps.switch_to_staging(source="test")
|
||||
await ps.switch_to_running(source="test")
|
||||
await ps.switch_to_planning(source="test")
|
||||
|
||||
await asyncio.sleep(0.3)
|
||||
|
||||
artifact.record_value(
|
||||
"final_phase",
|
||||
ps.phase,
|
||||
expected="'planning' after 3 full cycles",
|
||||
)
|
||||
artifact.record_value("event_count", len(all_events))
|
||||
artifact.record_value("all_events", all_events)
|
||||
|
||||
artifact.check(
|
||||
"final phase is planning",
|
||||
ps.phase == "planning",
|
||||
actual=repr(ps.phase),
|
||||
expected_val="'planning'",
|
||||
)
|
||||
assert ps.phase == "planning", f"Expected planning, got: {ps.phase}"
|
||||
|
||||
# Should have 12 phase change events (4 per cycle x 3)
|
||||
artifact.check(
|
||||
"12 phase events",
|
||||
len(all_events) == 12,
|
||||
actual=str(len(all_events)),
|
||||
expected_val="12",
|
||||
)
|
||||
assert len(all_events) == 12, f"Expected 12 events, got {len(all_events)}: {all_events}"
|
||||
|
||||
# Tools and prompt should match planning phase
|
||||
prompt = ps.get_current_prompt()
|
||||
|
||||
artifact.check(
|
||||
"prompt non-empty after cycling",
|
||||
len(prompt) > 0,
|
||||
actual=str(len(prompt)),
|
||||
expected_val=">0",
|
||||
)
|
||||
assert len(prompt) > 0, "Prompt should not be empty after cycling"
|
||||
finally:
|
||||
await _shutdown(session, task)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Tool availability is correct per phase (strict verification)
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tool_sets_are_disjoint_across_phases(llm_provider, tmp_path, artifact):
|
||||
"""Each phase must have a distinct non-empty tool set."""
|
||||
session, task = await _start_queen_session(llm_provider, tmp_path)
|
||||
try:
|
||||
ps = session.phase_state
|
||||
|
||||
phase_tools = {}
|
||||
for phase in ("planning", "building", "staging", "running"):
|
||||
getattr(
|
||||
ps,
|
||||
f"switch_to_{phase}",
|
||||
ps.switch_to_planning,
|
||||
)
|
||||
# Use direct assignment for simplicity
|
||||
ps.phase = phase
|
||||
            tools = {t.name for t in ps.get_current_tools()}
            phase_tools[phase] = tools

        # All phases should have at least 1 tool
        for phase, tools in phase_tools.items():
            artifact.check(
                f"{phase} has tools",
                len(tools) > 0,
                actual=str(len(tools)),
                expected_val=">0",
            )
            assert len(tools) > 0, f"{phase} has no tools"

        artifact.record_value(
            "phase_tools",
            {k: sorted(v) for k, v in phase_tools.items()},
            expected="all 4 phases have distinct tool sets",
        )

        # Pairwise comparison: all sets should differ
        phases = list(phase_tools.keys())
        for i in range(len(phases)):
            for j in range(i + 1, len(phases)):
                a, b = phases[i], phases[j]
                artifact.check(
                    f"{a} != {b} tools",
                    phase_tools[a] != phase_tools[b],
                    actual=(f"{a}={sorted(phase_tools[a])}, {b}={sorted(phase_tools[b])}"),
                    expected_val="different",
                )
                assert phase_tools[a] != phase_tools[b], (
                    f"{a} and {b} have identical tools: {phase_tools[a]}"
                )
    finally:
        await _shutdown(session, task)


# -----------------------------------------------------------------------
# Worker completion -> auto-staging transition
# -----------------------------------------------------------------------


@pytest.mark.asyncio
async def test_worker_completion_triggers_auto_staging(llm_provider, tmp_path, artifact):
    """EXECUTION_COMPLETED in running phase must auto-switch to staging."""
    session, task = await _start_queen_session(llm_provider, tmp_path)
    phase_changes = []

    async def _capture(event: AgentEvent):
        phase_changes.append(event.data.get("phase"))

    session.event_bus.subscribe(
        event_types=[EventType.QUEEN_PHASE_CHANGED],
        handler=_capture,
    )
    try:
        ps = session.phase_state
        # Move to running phase
        await ps.switch_to_running(source="test")
        await asyncio.sleep(0.3)
        phase_changes.clear()  # Reset after manual switch

        # Simulate worker completion event
        await session.event_bus.publish(
            AgentEvent(
                type=EventType.EXECUTION_COMPLETED,
                stream_id="worker",
                data={"output": {"result": "done"}},
            )
        )
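        # Give the session's completion listener time to perform the auto-switch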
        await asyncio.sleep(1.0)

        artifact.record_value(
            "phase_after_completion",
            ps.phase,
            expected="'staging' (auto-switch on completion)",
        )
        artifact.record_value("phase_changes", phase_changes)

        artifact.check(
            "auto-switched to staging",
            ps.phase == "staging",
            actual=repr(ps.phase),
            expected_val="'staging'",
        )
        assert ps.phase == "staging", f"Expected auto-switch to staging, got: {ps.phase}"

        artifact.check(
            "staging event emitted",
            "staging" in phase_changes,
            actual=str(phase_changes),
            expected_val="contains 'staging'",
        )
        assert "staging" in phase_changes, (
            f"QUEEN_PHASE_CHANGED(staging) not emitted. Events: {phase_changes}"
        )
    finally:
        await _shutdown(session, task)


@pytest.mark.asyncio
async def test_worker_failure_triggers_auto_staging(llm_provider, tmp_path, artifact):
    """EXECUTION_FAILED in running phase must auto-switch to staging."""
    session, task = await _start_queen_session(llm_provider, tmp_path)
    try:
        ps = session.phase_state
        await ps.switch_to_running(source="test")
        await asyncio.sleep(0.3)

        # Simulate worker failure event
        await session.event_bus.publish(
            AgentEvent(
                type=EventType.EXECUTION_FAILED,
                stream_id="worker",
                data={"error": "worker crashed"},
            )
        )
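        # The failure path should auto-switch just like the completion path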
        await asyncio.sleep(1.0)

        artifact.record_value(
            "phase_after_failure",
            ps.phase,
            expected="'staging' (auto-switch on failure)",
        )

        artifact.check(
            "auto-switched to staging on failure",
            ps.phase == "staging",
            actual=repr(ps.phase),
            expected_val="'staging'",
        )
        assert ps.phase == "staging", f"Expected auto-switch to staging on failure, got: {ps.phase}"
    finally:
        await _shutdown(session, task)
@@ -27,7 +27,7 @@ SET_OUTPUT = (


@pytest.mark.asyncio
async def test_strict_echo_exact_path_and_steps(runtime, goal, llm_provider):
async def test_strict_echo_exact_path_and_steps(runtime, goal, llm_provider, artifact):
    """Echo node: path must be exactly ['echo'], steps must be 1."""
    graph = GraphSpec(
        id="strict-echo",
@@ -54,21 +54,70 @@ async def test_strict_echo_exact_path_and_steps(runtime, goal, llm_provider):
        conversation_mode="continuous",
    )
    executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
    result = await executor.execute(
        graph, goal, {"input": "ECHO_TEST_42"}, validate_graph=False
    result = await executor.execute(graph, goal, {"input": "ECHO_TEST_42"}, validate_graph=False)
    artifact.record(
        result,
        expected=(
            "success=True, path=['echo'], steps=1, "
            "output['output'] set, quality='clean', "
            "retries=0, tokens>0"
        ),
    )

    artifact.check(
        "execution succeeds", result.success, actual=str(result.success), expected_val="True"
    )
    assert result.success

    artifact.check(
        "path matches", result.path == ["echo"], actual=str(result.path), expected_val="['echo']"
    )
    assert result.path == ["echo"]

    artifact.check(
        "steps_executed is 1",
        result.steps_executed == 1,
        actual=str(result.steps_executed),
        expected_val="1",
    )
    assert result.steps_executed == 1

    actual_output = result.output.get("output")
    artifact.check(
        "output['output'] is set",
        actual_output is not None,
        actual=repr(actual_output),
        expected_val="non-None value",
    )
    assert result.output.get("output") is not None

    artifact.check(
        "execution_quality is clean",
        result.execution_quality == "clean",
        actual=repr(result.execution_quality),
        expected_val="'clean'",
    )
    assert result.execution_quality == "clean"

    artifact.check(
        "total_retries is 0",
        result.total_retries == 0,
        actual=str(result.total_retries),
        expected_val="0",
    )
    assert result.total_retries == 0

    artifact.check(
        "total_tokens > 0",
        result.total_tokens > 0,
        actual=str(result.total_tokens),
        expected_val=">0",
    )
    assert result.total_tokens > 0


@pytest.mark.asyncio
async def test_strict_clean_execution_quality(runtime, goal, llm_provider):
async def test_strict_clean_execution_quality(runtime, goal, llm_provider, artifact):
    """A simple set_output call should produce 'clean' execution quality."""
    graph = GraphSpec(
        id="strict-clean",
@@ -92,12 +141,37 @@ async def test_strict_clean_execution_quality(runtime, goal, llm_provider):
    )
    executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
    result = await executor.execute(graph, goal, {}, validate_graph=False)
    artifact.record(result, expected="clean success, no partial failures, no nodes_with_failures")

    artifact.check(
        "is_clean_success",
        result.is_clean_success,
        actual=(
            f"quality={result.execution_quality}, "
            f"retries={result.total_retries}, "
            f"failures={result.nodes_with_failures}"
        ),
        expected_val="True",
    )
    assert result.is_clean_success, (
        f"Expected clean success, got quality={result.execution_quality}, "
        f"retries={result.total_retries}, failures={result.nodes_with_failures}"
    )

    artifact.check(
        "no partial failures",
        not result.had_partial_failures,
        actual=str(result.had_partial_failures),
        expected_val="False",
    )
    assert not result.had_partial_failures

    artifact.check(
        "no nodes_with_failures",
        len(result.nodes_with_failures) == 0,
        actual=str(result.nodes_with_failures),
        expected_val="[]",
    )
    assert len(result.nodes_with_failures) == 0


@@ -107,8 +181,8 @@ async def test_strict_clean_execution_quality(runtime, goal, llm_provider):


@pytest.mark.asyncio
async def test_strict_pipeline_path_ordering(runtime, goal, llm_provider):
    """Three-node pipeline must traverse in exact order: a → b → c."""
async def test_strict_pipeline_path_ordering(runtime, goal, llm_provider, artifact):
    """Three-node pipeline must traverse in exact order: a -> b -> c."""
    graph = GraphSpec(
        id="strict-pipeline",
        goal_id="dummy",
@@ -118,45 +192,106 @@ async def test_strict_pipeline_path_ordering(runtime, goal, llm_provider):
        conversation_mode="continuous",
        nodes=[
            NodeSpec(
                id="a", name="A", description="First",
                node_type="event_loop", output_keys=["a_out"],
                id="a",
                name="A",
                description="First",
                node_type="event_loop",
                output_keys=["a_out"],
                system_prompt="Call set_output with key='a_out' and value='from_a'. " + SET_OUTPUT,
            ),
            NodeSpec(
                id="b", name="B", description="Second",
                node_type="event_loop", input_keys=["b_in"], output_keys=["b_out"],
                id="b",
                name="B",
                description="Second",
                node_type="event_loop",
                input_keys=["b_in"],
                output_keys=["b_out"],
                system_prompt="Call set_output with key='b_out' and value='from_b'. " + SET_OUTPUT,
            ),
            NodeSpec(
                id="c", name="C", description="Third",
                node_type="event_loop", input_keys=["c_in"], output_keys=["result"],
                id="c",
                name="C",
                description="Third",
                node_type="event_loop",
                input_keys=["c_in"],
                output_keys=["result"],
                system_prompt="Call set_output with key='result' and value='from_c'. " + SET_OUTPUT,
            ),
        ],
        edges=[
            EdgeSpec(id="a-b", source="a", target="b",
                condition=EdgeCondition.ON_SUCCESS, input_mapping={"b_in": "a_out"}),
            EdgeSpec(id="b-c", source="b", target="c",
                condition=EdgeCondition.ON_SUCCESS, input_mapping={"c_in": "b_out"}),
            EdgeSpec(
                id="a-b",
                source="a",
                target="b",
                condition=EdgeCondition.ON_SUCCESS,
                input_mapping={"b_in": "a_out"},
            ),
            EdgeSpec(
                id="b-c",
                source="b",
                target="c",
                condition=EdgeCondition.ON_SUCCESS,
                input_mapping={"c_in": "b_out"},
            ),
        ],
        memory_keys=["a_out", "b_in", "b_out", "c_in", "result"],
    )
    executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
    result = await executor.execute(graph, goal, {}, validate_graph=False)
    artifact.record(
        result,
        expected=(
            "success=True, path=['a','b','c'], steps=3, "
            "output['result'] set, each node visited once"
        ),
    )

    artifact.check(
        "execution succeeds", result.success, actual=str(result.success), expected_val="True"
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["a", "b", "c"],
        actual=str(result.path),
        expected_val="['a', 'b', 'c']",
    )
    assert result.path == ["a", "b", "c"], f"Path was {result.path}"

    artifact.check(
        "steps_executed is 3",
        result.steps_executed == 3,
        actual=str(result.steps_executed),
        expected_val="3",
    )
    assert result.steps_executed == 3

    actual_output = result.output.get("result")
    artifact.check(
        "output['result'] is set",
        actual_output is not None,
        actual=repr(actual_output),
        expected_val="non-None value",
    )
    assert result.output.get("result") is not None

    # Visit counts: each node visited exactly once
    a_visits = result.node_visit_counts.get("a", 0)
    artifact.check("node 'a' visited once", a_visits == 1, actual=str(a_visits), expected_val="1")
    assert result.node_visit_counts.get("a", 0) == 1

    b_visits = result.node_visit_counts.get("b", 0)
    artifact.check("node 'b' visited once", b_visits == 1, actual=str(b_visits), expected_val="1")
    assert result.node_visit_counts.get("b", 0) == 1

    c_visits = result.node_visit_counts.get("c", 0)
    artifact.check("node 'c' visited once", c_visits == 1, actual=str(c_visits), expected_val="1")
    assert result.node_visit_counts.get("c", 0) == 1


@pytest.mark.asyncio
async def test_strict_branch_correct_terminal(runtime, goal, llm_provider):
async def test_strict_branch_correct_terminal(runtime, goal, llm_provider, artifact):
    """Classifier node must route 'I love it' to the positive terminal."""
    graph = GraphSpec(
        id="strict-branch",
@@ -167,8 +302,11 @@ async def test_strict_branch_correct_terminal(runtime, goal, llm_provider):
        conversation_mode="continuous",
        nodes=[
            NodeSpec(
                id="classify", name="Classify", description="Sentiment classifier",
                node_type="event_loop", input_keys=["text"],
                id="classify",
                name="Classify",
                description="Sentiment classifier",
                node_type="event_loop",
                input_keys=["text"],
                output_keys=["label"],
                system_prompt=(
                    "Read the 'text' input. Determine if sentiment is positive or negative. "
@@ -177,39 +315,87 @@ async def test_strict_branch_correct_terminal(runtime, goal, llm_provider):
                ),
            ),
            NodeSpec(
                id="positive", name="Positive", description="Positive handler",
                node_type="event_loop", output_keys=["result"],
                id="positive",
                name="Positive",
                description="Positive handler",
                node_type="event_loop",
                output_keys=["result"],
                system_prompt="Call set_output with key='result' and value='POS'. " + SET_OUTPUT,
            ),
            NodeSpec(
                id="negative", name="Negative", description="Negative handler",
                node_type="event_loop", output_keys=["result"],
                id="negative",
                name="Negative",
                description="Negative handler",
                node_type="event_loop",
                output_keys=["result"],
                system_prompt="Call set_output with key='result' and value='NEG'. " + SET_OUTPUT,
            ),
        ],
        edges=[
            EdgeSpec(id="to-pos", source="classify", target="positive",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('label') == 'positive'", priority=1),
            EdgeSpec(id="to-neg", source="classify", target="negative",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('label') == 'negative'", priority=0),
            EdgeSpec(
                id="to-pos",
                source="classify",
                target="positive",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('label') == 'positive'",
                priority=1,
            ),
            EdgeSpec(
                id="to-neg",
                source="classify",
                target="negative",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('label') == 'negative'",
                priority=0,
            ),
        ],
        memory_keys=["text", "label", "result"],
    )
    executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
    result = await executor.execute(
        graph, goal, {"text": "I absolutely love this product, it's fantastic!"}, validate_graph=False
        graph,
        goal,
        {"text": "I absolutely love this product, it's fantastic!"},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, path=['classify','positive'], steps=2, output['result']='POS'",
    )

    artifact.check(
        "execution succeeds", result.success, actual=str(result.success), expected_val="True"
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["classify", "positive"],
        actual=str(result.path),
        expected_val="['classify', 'positive']",
    )
    assert result.path == ["classify", "positive"], f"Path was {result.path}"

    artifact.check(
        "steps_executed is 2",
        result.steps_executed == 2,
        actual=str(result.steps_executed),
        expected_val="2",
    )
    assert result.steps_executed == 2

    actual_result = result.output.get("result")
    artifact.check(
        "output['result'] is 'POS'",
        actual_result == "POS",
        actual=repr(actual_result),
        expected_val="'POS'",
    )
    assert result.output.get("result") == "POS"


@pytest.mark.asyncio
async def test_strict_branch_negative_terminal(runtime, goal, llm_provider):
async def test_strict_branch_negative_terminal(runtime, goal, llm_provider, artifact):
    """Classifier node must route hateful text to the negative terminal."""
    graph = GraphSpec(
        id="strict-branch-neg",
@@ -220,8 +406,11 @@ async def test_strict_branch_negative_terminal(runtime, goal, llm_provider):
        conversation_mode="continuous",
        nodes=[
            NodeSpec(
                id="classify", name="Classify", description="Sentiment classifier",
                node_type="event_loop", input_keys=["text"],
                id="classify",
                name="Classify",
                description="Sentiment classifier",
                node_type="event_loop",
                input_keys=["text"],
                output_keys=["label"],
                system_prompt=(
                    "Read the 'text' input. Determine if sentiment is positive or negative. "
@@ -230,34 +419,82 @@ async def test_strict_branch_negative_terminal(runtime, goal, llm_provider):
                ),
            ),
            NodeSpec(
                id="positive", name="Positive", description="Positive handler",
                node_type="event_loop", output_keys=["result"],
                id="positive",
                name="Positive",
                description="Positive handler",
                node_type="event_loop",
                output_keys=["result"],
                system_prompt="Call set_output with key='result' and value='POS'. " + SET_OUTPUT,
            ),
            NodeSpec(
                id="negative", name="Negative", description="Negative handler",
                node_type="event_loop", output_keys=["result"],
                id="negative",
                name="Negative",
                description="Negative handler",
                node_type="event_loop",
                output_keys=["result"],
                system_prompt="Call set_output with key='result' and value='NEG'. " + SET_OUTPUT,
            ),
        ],
        edges=[
            EdgeSpec(id="to-pos", source="classify", target="positive",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('label') == 'positive'", priority=1),
            EdgeSpec(id="to-neg", source="classify", target="negative",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('label') == 'negative'", priority=0),
            EdgeSpec(
                id="to-pos",
                source="classify",
                target="positive",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('label') == 'positive'",
                priority=1,
            ),
            EdgeSpec(
                id="to-neg",
                source="classify",
                target="negative",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('label') == 'negative'",
                priority=0,
            ),
        ],
        memory_keys=["text", "label", "result"],
    )
    executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
    result = await executor.execute(
        graph, goal, {"text": "This is absolutely terrible and broken. Worst ever."}, validate_graph=False
        graph,
        goal,
        {"text": "This is absolutely terrible and broken. Worst ever."},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, path=['classify','negative'], steps=2, output['result']='NEG'",
    )

    artifact.check(
        "execution succeeds", result.success, actual=str(result.success), expected_val="True"
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["classify", "negative"],
        actual=str(result.path),
        expected_val="['classify', 'negative']",
    )
    assert result.path == ["classify", "negative"], f"Path was {result.path}"

    artifact.check(
        "steps_executed is 2",
        result.steps_executed == 2,
        actual=str(result.steps_executed),
        expected_val="2",
    )
    assert result.steps_executed == 2

    actual_result = result.output.get("result")
    artifact.check(
        "output['result'] is 'NEG'",
        actual_result == "NEG",
        actual=repr(actual_result),
        expected_val="'NEG'",
    )
    assert result.output.get("result") == "NEG"


@@ -268,7 +505,7 @@ async def test_strict_branch_negative_terminal(runtime, goal, llm_provider):

@pytest.mark.asyncio
async def test_strict_tool_output_format(
    runtime, goal, llm_provider, tool_registry, tmp_path
    runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
    """Worker must call get_current_time and produce output in STATUS|date|day format."""
    graph = GraphSpec(
@@ -290,8 +527,7 @@ async def test_strict_tool_output_format(
                    "Extract the 'date' and 'day_of_week' fields from the result. "
                    "Build this exact format: STATUS|<date>|<day_of_week> "
                    "(example: STATUS|2026-04-03|Thursday). "
                    "Call set_output with key='result' and this formatted string. "
                    + SET_OUTPUT
                    "Call set_output with key='result' and this formatted string. " + SET_OUTPUT
                ),
            ),
        ],
@@ -300,32 +536,77 @@ async def test_strict_tool_output_format(
        conversation_mode="continuous",
    )
    executor = make_executor(
        runtime, llm_provider,
        runtime,
        llm_provider,
        tool_registry=tool_registry,
        loop_config={"max_iterations": 5},
        storage_path=tmp_path / "session",
    )
    result = await executor.execute(graph, goal, {}, validate_graph=False)
    artifact.record(
        result, expected="success=True, output['result'] in STATUS|YYYY-MM-DD|DayName format"
    )

    artifact.check(
        "execution succeeds", result.success, actual=str(result.success), expected_val="True"
    )
    assert result.success

    output = result.output.get("result")
    artifact.check(
        "output['result'] is set",
        output is not None,
        actual=repr(output),
        expected_val="non-None value",
    )
    assert output is not None, "No result output"

    # Strict format verification: STATUS|date|day_of_week
    parts = output.split("|")
    artifact.check(
        "3 pipe-separated parts",
        len(parts) == 3,
        actual=f"{len(parts)} parts: {output}",
        expected_val="3 parts",
    )
    assert len(parts) == 3, f"Expected 3 pipe-separated parts, got {len(parts)}: {output}"

    artifact.check(
        "first part is STATUS", parts[0] == "STATUS", actual=repr(parts[0]), expected_val="'STATUS'"
    )
    assert parts[0] == "STATUS", f"First part should be STATUS, got: {parts[0]}"

    # Date part should look like YYYY-MM-DD
    artifact.check(
        "date part length >= 8",
        len(parts[1]) >= 8,
        actual=f"len={len(parts[1])}, value={parts[1]}",
        expected_val=">=8",
    )
    assert len(parts[1]) >= 8, f"Date part too short: {parts[1]}"

    artifact.check(
        "date part contains dashes",
        "-" in parts[1],
        actual=repr(parts[1]),
        expected_val="contains '-'",
    )
    assert "-" in parts[1], f"Date part should contain dashes: {parts[1]}"

    # Day of week should be a recognizable day name
    valid_days = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"}
    artifact.check(
        "valid day_of_week",
        parts[2] in valid_days,
        actual=repr(parts[2]),
        expected_val=f"one of {sorted(valid_days)}",
    )
    assert parts[2] in valid_days, f"Invalid day_of_week: {parts[2]}"


@pytest.mark.asyncio
async def test_strict_artifact_creation_and_verification(
    runtime, goal, llm_provider, tool_registry, tmp_path
    runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
    """Single-node: saves a file via save_data, then verifies the artifact on disk."""
    storage_path = tmp_path / "session"
@@ -359,7 +640,8 @@ async def test_strict_artifact_creation_and_verification(
        memory_keys=["task", "result"],
    )
    executor = make_executor(
        runtime, llm_provider,
        runtime,
        llm_provider,
        tool_registry=tool_registry,
        loop_config={"max_iterations": 5},
        storage_path=storage_path,
@@ -367,23 +649,72 @@ async def test_strict_artifact_creation_and_verification(
    result = await executor.execute(
        graph, goal, {"task": "Create and verify artifact"}, validate_graph=False
    )
    artifact.record(
        result,
        expected=(
            "success=True, path=['worker'], steps=1, "
            "output contains INTEGRATION_TEST_PAYLOAD_XYZ, "
            "file on disk matches"
        ),
    )

    # Strict outcome verification
    artifact.check(
        "execution succeeds", result.success, actual=str(result.success), expected_val="True"
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["worker"],
        actual=str(result.path),
        expected_val="['worker']",
    )
    assert result.path == ["worker"], f"Path was {result.path}"

    artifact.check(
        "steps_executed is 1",
        result.steps_executed == 1,
        actual=str(result.steps_executed),
        expected_val="1",
    )
    assert result.steps_executed == 1

    # Output must be the loaded content
    output = result.output.get("result")
    assert output is not None, "Worker did not set 'result'"
    assert "INTEGRATION_TEST_PAYLOAD_XYZ" in output, (
        f"Expected payload in output, got: {output}"
    artifact.check(
        "output['result'] is set",
        output is not None,
        actual=repr(output),
        expected_val="non-None value",
    )
    assert output is not None, "Worker did not set 'result'"

    artifact.check(
        "output contains payload",
        "INTEGRATION_TEST_PAYLOAD_XYZ" in output,
        actual=repr(output),
        expected_val="contains 'INTEGRATION_TEST_PAYLOAD_XYZ'",
    )
    assert "INTEGRATION_TEST_PAYLOAD_XYZ" in output, f"Expected payload in output, got: {output}"

    # Verify the actual file exists on disk (save_data uses storage_path/data/)
    artifact_path = storage_path / "data" / "test_artifact.txt"
    artifact.check(
        "artifact file exists",
        artifact_path.exists(),
        actual=str(artifact_path.exists()),
        expected_val="True",
    )
    assert artifact_path.exists(), f"Artifact not found at {artifact_path}"

    file_content = artifact_path.read_text(encoding="utf-8").strip()
    artifact.check(
        "file content matches payload",
        file_content == "INTEGRATION_TEST_PAYLOAD_XYZ",
        actual=repr(file_content),
        expected_val="'INTEGRATION_TEST_PAYLOAD_XYZ'",
    )
    assert file_content == "INTEGRATION_TEST_PAYLOAD_XYZ", (
        f"File content mismatch: {file_content!r}"
    )
@@ -395,7 +726,7 @@ async def test_strict_artifact_creation_and_verification(


@pytest.mark.asyncio
async def test_strict_feedback_loop_visit_counts(runtime, goal, llm_provider):
async def test_strict_feedback_loop_visit_counts(runtime, goal, llm_provider, artifact):
    """Feedback loop must respect max_node_visits and record visit counts."""
    from .nodes import StatefulNode, SuccessNode
    from framework.graph.node import NodeResult
@@ -406,23 +737,48 @@ async def test_strict_feedback_loop_visit_counts(runtime, goal, llm_provider):
        entry_node="draft",
        terminal_nodes=["done"],
        nodes=[
            NodeSpec(id="draft", name="Draft", description="Produces draft",
                node_type="event_loop", output_keys=["draft_output"], max_node_visits=3),
            NodeSpec(id="review", name="Review", description="Reviews draft",
                node_type="event_loop", input_keys=["draft_output"],
                output_keys=["approved"]),
            NodeSpec(id="done", name="Done", description="Terminal",
                node_type="event_loop", output_keys=["final"]),
            NodeSpec(
                id="draft",
                name="Draft",
                description="Produces draft",
                node_type="event_loop",
                output_keys=["draft_output"],
                max_node_visits=3,
            ),
            NodeSpec(
                id="review",
                name="Review",
                description="Reviews draft",
                node_type="event_loop",
                input_keys=["draft_output"],
                output_keys=["approved"],
            ),
            NodeSpec(
                id="done",
                name="Done",
                description="Terminal",
                node_type="event_loop",
                output_keys=["final"],
            ),
        ],
        edges=[
            EdgeSpec(id="d-r", source="draft", target="review",
                condition=EdgeCondition.ON_SUCCESS),
            EdgeSpec(id="r-d", source="review", target="draft",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('approved') == False", priority=1),
            EdgeSpec(id="r-done", source="review", target="done",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('approved') == True", priority=0),
            EdgeSpec(id="d-r", source="draft", target="review", condition=EdgeCondition.ON_SUCCESS),
            EdgeSpec(
                id="r-d",
                source="review",
                target="draft",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('approved') == False",
                priority=1,
            ),
            EdgeSpec(
                id="r-done",
                source="review",
                target="done",
                condition=EdgeCondition.CONDITIONAL,
                condition_expr="output.get('approved') == True",
                priority=0,
            ),
        ],
        memory_keys=["draft_output", "approved", "final"],
    )
@@ -430,28 +786,70 @@ async def test_strict_feedback_loop_visit_counts(runtime, goal, llm_provider):

    # Deterministic nodes: reject twice, then approve
    executor.register_node("draft", SuccessNode(output={"draft_output": "v1"}))
    executor.register_node("review", StatefulNode([
        NodeResult(success=True, output={"approved": False}),
        NodeResult(success=True, output={"approved": False}),
        NodeResult(success=True, output={"approved": True}),
    ]))
    executor.register_node(
        "review",
        StatefulNode(
            [
                NodeResult(success=True, output={"approved": False}),
                NodeResult(success=True, output={"approved": False}),
                NodeResult(success=True, output={"approved": True}),
            ]
        ),
    )
    executor.register_node("done", SuccessNode(output={"final": "complete"}))

    result = await executor.execute(graph, goal, {}, validate_graph=False)
    artifact.record(
        result,
        expected=(
            "success=True, 'done' in path, "
            "draft visited 3x, review visited 3x, "
            "done visited 1x, output['final']='complete'"
        ),
    )

    artifact.check(
        "execution succeeds", result.success, actual=str(result.success), expected_val="True"
    )
    assert result.success

    artifact.check(
        "'done' in path",
        "done" in result.path,
        actual=str(result.path),
        expected_val="contains 'done'",
    )
    assert "done" in result.path

    # Strict visit count verification
    draft_visits = result.node_visit_counts.get("draft", 0)
    artifact.check(
        "draft visited 3 times", draft_visits == 3, actual=str(draft_visits), expected_val="3"
    )
    assert result.node_visit_counts.get("draft", 0) == 3, (
        f"Draft should be visited 3 times, got {result.node_visit_counts.get('draft')}"
    )

    review_visits = result.node_visit_counts.get("review", 0)
    artifact.check(
        "review visited 3 times", review_visits == 3, actual=str(review_visits), expected_val="3"
    )
    assert result.node_visit_counts.get("review", 0) == 3, (
        f"Review should be visited 3 times, got {result.node_visit_counts.get('review')}"
    )

    done_visits = result.node_visit_counts.get("done", 0)
    artifact.check("done visited once", done_visits == 1, actual=str(done_visits), expected_val="1")
    assert result.node_visit_counts.get("done", 0) == 1, (
        f"Done should be visited once, got {result.node_visit_counts.get('done')}"
    )

    # Final output must be from the 'done' node
    final_output = result.output.get("final")
    artifact.check(
        "output['final'] is 'complete'",
        final_output == "complete",
        actual=repr(final_output),
        expected_val="'complete'",
    )
    assert result.output.get("final") == "complete"

@@ -15,47 +15,124 @@ from framework.llm.provider import ToolUse
from .conftest import make_executor


def test_tools_mcp_server_connects(tool_registry):
def test_tools_mcp_server_connects(tool_registry, artifact):
    """MCP server should start and expose tools."""
    tools = tool_registry.get_tools()

    artifact.record_value(
        "tool_count",
        len(tools),
        expected="at least 1 tool exposed by MCP server",
    )
    artifact.record_value("tool_names", list(tools.keys()))

    artifact.check(
        "MCP server exposes tools",
        len(tools) > 0,
        actual=str(len(tools)),
        expected_val=">0",
    )
    assert len(tools) > 0, "MCP server should expose at least one tool"


def test_tools_registry_has_expected_tools(tool_registry):
def test_tools_registry_has_expected_tools(tool_registry, artifact):
    """hive-tools should expose the expected tool names."""
    tool_names = set(tool_registry.get_tools().keys())
    expected = {"example_tool", "get_current_time"}
    assert expected.issubset(tool_names), (
        f"Missing expected tools: {expected - tool_names}"

    artifact.record_value(
        "tool_names",
        sorted(tool_names),
        expected="superset of {example_tool, get_current_time}",
    )
    artifact.record_value("expected_tools", sorted(expected))

    missing = expected - tool_names
    artifact.check(
        "expected tools present",
        expected.issubset(tool_names),
        actual=str(sorted(tool_names)),
        expected_val=f"superset of {sorted(expected)}",
    )
    assert expected.issubset(tool_names), f"Missing expected tools: {expected - tool_names}"


@pytest.mark.asyncio
async def test_tools_execute_example_tool(tool_registry):
async def test_tools_execute_example_tool(tool_registry, artifact):
    """Direct tool execution without LLM — verifies MCP round-trip."""
    executor = tool_registry.get_executor()
    tool_use = ToolUse(id="test-1", name="example_tool", input={"message": "hello", "uppercase": True})
    tool_use = ToolUse(
        id="test-1",
        name="example_tool",
        input={"message": "hello", "uppercase": True},
    )
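    # The registry's executor is called directly, so no LLM is involved here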
    result = executor(tool_use)

    artifact.record_value(
        "is_error",
        result.is_error,
        expected="not an error, content contains 'HELLO'",
    )
    artifact.record_value("content", result.content)

    artifact.check(
        "result is not error",
        not result.is_error,
        actual=str(result.is_error),
        expected_val="False",
    )
    assert not result.is_error

    artifact.check(
        "content contains HELLO",
        "HELLO" in result.content,
        actual=repr(result.content),
        expected_val="contains 'HELLO'",
    )
    assert "HELLO" in result.content


@pytest.mark.asyncio
async def test_tools_execute_get_current_time(tool_registry):
async def test_tools_execute_get_current_time(tool_registry, artifact):
    """get_current_time should return a dict with date/time fields."""
    executor = tool_registry.get_executor()
    tool_use = ToolUse(id="test-2", name="get_current_time", input={"timezone": "UTC"})
    tool_use = ToolUse(
        id="test-2",
        name="get_current_time",
        input={"timezone": "UTC"},
    )
    result = executor(tool_use)

    artifact.record_value(
        "is_error",
        result.is_error,
        expected="not an error, content contains year (202x)",
    )
    artifact.record_value("content", result.content)

    artifact.check(
        "result is not error",
        not result.is_error,
        actual=str(result.is_error),
        expected_val="False",
    )
    assert not result.is_error

    artifact.check(
        "content contains year",
        "202" in result.content,
        actual=repr(result.content),
        expected_val="contains '202'",
    )
    # Should contain date-like content
    assert "202" in result.content, "Should contain a year (202x)"


@pytest.mark.asyncio
async def test_tools_llm_calls_tool_and_gets_result(
    runtime, llm_provider, tool_registry, goal
    runtime, llm_provider, tool_registry, goal, artifact
):
    """Full round-trip: LLM calls a real tool and uses the result to set output."""
    """Full round-trip: LLM calls a tool and uses the result."""
    graph = GraphSpec(
        id="tool-roundtrip",
        goal_id="dummy",
@@ -72,8 +149,9 @@ async def test_tools_llm_calls_tool_and_gets_result(
                output_keys=["result"],
                tools=["example_tool"],
                system_prompt=(
                    "Use the example_tool to process the message from the task input "
                    "with uppercase=true. Then call set_output with key='result' and "
                    "Use the example_tool to process the message "
                    "from the task input with uppercase=true. Then "
                    "call set_output with key='result' and "
                    "the tool's return value."
                ),
            ),
@@ -83,12 +161,35 @@ async def test_tools_llm_calls_tool_and_gets_result(
        conversation_mode="continuous",
    )
    executor = make_executor(
        runtime, llm_provider,
        runtime,
        llm_provider,
        tool_registry=tool_registry,
        loop_config={"max_iterations": 5},
    )
    result = await executor.execute(
        graph, goal, {"task": "Process the word 'hello'"}, validate_graph=False
        graph,
        goal,
        {"task": "Process the word 'hello'"},
        validate_graph=False,
    )
    artifact.record(
        result,
        expected="success=True, output['result'] is set",
    )

    artifact.check(
        "execution succeeds",
        result.success,
        actual=str(result.success),
        expected_val="True",
    )
    assert result.success

    actual_output = result.output.get("result")
    artifact.check(
        "output['result'] is set",
        actual_output is not None,
        actual=repr(actual_output),
        expected_val="non-None value",
    )
    assert result.output.get("result") is not None

@@ -0,0 +1,713 @@
"""Component tests: Verified Outcomes — cross-checked, deterministic, no trust required.

These tests eliminate false positives by:
1. Using DETERMINISTIC inputs with KNOWN correct outputs
2. Cross-checking LLM output against ground truth (tool results, file contents)
3. Using REGEX validation instead of "is not None"
4. Running a VERIFIER node that independently checks the first node's work
5. Asserting on CONTENT, not just existence

If a test here passes, the output is provably correct — not just non-null.
"""

from __future__ import annotations

import json
import re

import pytest

from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
from framework.graph.node import NodeSpec

from .conftest import make_executor

SET_OUTPUT = (
    "You MUST call the set_output tool. "
    "Do not just write text — call set_output with the correct key and value."
)
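# Appended to each node's system prompt in this module so the model actually
# calls the set_output tool instead of replying with plain text.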


# ---------------------------------------------------------------------------
# 1. Echo round-trip: input == output (exact match, no LLM creativity)
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_verified_echo_exact_content(runtime, goal, llm_provider, artifact):
    """Echo test with EXACT content verification — not just 'is not None'.

    The input is a unique token. The output must contain that exact token.
    This catches LLMs that hallucinate or paraphrase instead of echoing.
    """
    UNIQUE_TOKEN = "XRAY_7742_BRAVO_ECHO"

    graph = GraphSpec(
        id="verified-echo",
        goal_id="dummy",
        entry_node="echo",
        entry_points={"start": "echo"},
        terminal_nodes=["echo"],
        nodes=[
            NodeSpec(
                id="echo",
                name="Echo",
                description="Echoes input exactly",
                node_type="event_loop",
                input_keys=["input"],
                output_keys=["output"],
                system_prompt=(
                    "Read the 'input' value. Call set_output with key='output' "
                    "and the EXACT same string. Do not modify it. Do not add quotes "
                    "or punctuation. Just the raw string." + SET_OUTPUT
                ),
            ),
        ],
        edges=[],
        memory_keys=["input", "output"],
        conversation_mode="continuous",
    )
    executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
    result = await executor.execute(graph, goal, {"input": UNIQUE_TOKEN}, validate_graph=False)
    artifact.record(
        result, expected="success=True, output['output'] contains exact token XRAY_7742_BRAVO_ECHO"
    )

    artifact.check(
        "execution succeeds", result.success, actual=str(result.success), expected_val="True"
    )
    assert result.success

    output = result.output.get("output", "")
    artifact.check(
        "output contains exact token",
        UNIQUE_TOKEN in output,
        actual=repr(output),
        expected_val=f"contains '{UNIQUE_TOKEN}'",
    )
    assert UNIQUE_TOKEN in output, f"Exact token '{UNIQUE_TOKEN}' not found in output: {output!r}"

# ---------------------------------------------------------------------------
# 2. Ground-truth verification: the LLM reports a value, we verify it independently
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verified_tool_result_matches_ground_truth(
|
||||
runtime, goal, llm_provider, tool_registry, artifact
|
||||
):
|
||||
"""get_current_time returns real data — verify output matches tool's actual return.
|
||||
|
||||
We call the tool directly (ground truth), then run the LLM graph,
|
||||
and verify the LLM's output contains the SAME day_of_week.
|
||||
This catches LLMs that hallucinate dates.
|
||||
"""
|
||||
from framework.llm.provider import ToolUse
|
||||
|
||||
# Step 1: Get ground truth by calling tool directly
|
||||
executor_fn = tool_registry.get_executor()
|
||||
tool_use = ToolUse(id="ground-truth", name="get_current_time", input={"timezone": "UTC"})
|
||||
ground_truth_result = executor_fn(tool_use)
|
||||
|
||||
artifact.record_value(
|
||||
"ground_truth_is_error",
|
||||
ground_truth_result.is_error,
|
||||
expected="ground truth tool returns day_of_week matching LLM output",
|
||||
)
|
||||
assert not ground_truth_result.is_error
|
||||
|
||||
# Parse the actual day_of_week from the tool
|
||||
gt_data = json.loads(ground_truth_result.content)
|
||||
actual_day = gt_data.get("day_of_week", "")
|
||||
artifact.record_value("ground_truth_day", actual_day)
|
||||
assert actual_day, f"Tool didn't return day_of_week: {gt_data}"
|
||||
|
||||
# Step 2: Run LLM graph that uses the same tool
|
||||
graph = GraphSpec(
|
||||
id="verified-time",
|
||||
goal_id="dummy",
|
||||
entry_node="worker",
|
||||
entry_points={"start": "worker"},
|
||||
terminal_nodes=["worker"],
|
||||
nodes=[
|
||||
NodeSpec(
|
||||
id="worker",
|
||||
name="Worker",
|
||||
description="Get current time and report day",
|
||||
node_type="event_loop",
|
||||
output_keys=["result"],
|
||||
tools=["get_current_time"],
|
||||
system_prompt=(
|
||||
"Call get_current_time with timezone='UTC'. "
|
||||
"Extract the day_of_week from the result. "
|
||||
"Call set_output with key='result' and ONLY the day_of_week string "
|
||||
"(e.g., 'Monday'). Nothing else." + SET_OUTPUT
|
||||
),
|
||||
),
|
||||
],
|
||||
edges=[],
|
||||
memory_keys=["result"],
|
||||
conversation_mode="continuous",
|
||||
)
|
||||
executor = make_executor(
|
||||
runtime,
|
||||
llm_provider,
|
||||
tool_registry=tool_registry,
|
||||
loop_config={"max_iterations": 5},
|
||||
)
|
||||
result = await executor.execute(graph, goal, {}, validate_graph=False)
|
||||
artifact.record(
|
||||
result,
|
||||
expected=f"success=True, output['result'] matches ground truth day_of_week='{actual_day}'",
|
||||
)
|
||||
|
||||
artifact.check(
|
||||
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
|
||||
)
|
||||
assert result.success
|
||||
|
||||
llm_day = (result.output.get("result") or "").strip()
|
||||
artifact.record_value("llm_day", llm_day)
|
||||
|
||||
# Step 3: Cross-check — LLM's answer must match ground truth
|
||||
artifact.check(
|
||||
"LLM day matches ground truth",
|
||||
actual_day.lower() in llm_day.lower(),
|
||||
actual=repr(llm_day),
|
||||
expected_val=f"contains '{actual_day}'",
|
||||
)
|
||||
assert actual_day.lower() in llm_day.lower(), (
|
||||
f"LLM reported '{llm_day}' but tool returned '{actual_day}'. "
|
||||
f"The LLM hallucinated or misread the tool result."
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. File artifact round-trip: write -> read -> binary compare
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verified_artifact_binary_match(
|
||||
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
|
||||
):
|
||||
"""Save a file, then verify the on-disk content matches EXACTLY.
|
||||
|
||||
Does NOT rely on LLM to verify — we read the file ourselves.
|
||||
This catches save_data bugs, encoding issues, or LLM adding extra content.
|
||||
"""
|
||||
PAYLOAD = "VERIFIED_PAYLOAD_99_ZULU"
|
||||
storage_path = tmp_path / "session"
|
||||
|
||||
graph = GraphSpec(
|
||||
id="verified-artifact",
|
||||
goal_id="dummy",
|
||||
entry_node="worker",
|
||||
entry_points={"start": "worker"},
|
||||
terminal_nodes=["worker"],
|
||||
nodes=[
|
||||
NodeSpec(
|
||||
id="worker",
|
||||
name="Writer",
|
||||
description="Saves exact payload to file",
|
||||
node_type="event_loop",
|
||||
input_keys=["task"],
|
||||
output_keys=["result"],
|
||||
tools=["save_data"],
|
||||
system_prompt=(
|
||||
f"Call save_data with filename='verified.txt' and data='{PAYLOAD}'. "
|
||||
"Then call set_output with key='result' and value='saved'. " + SET_OUTPUT
|
||||
),
|
||||
),
|
||||
],
|
||||
edges=[],
|
||||
memory_keys=["task", "result"],
|
||||
conversation_mode="continuous",
|
||||
)
|
||||
executor = make_executor(
|
||||
runtime,
|
||||
llm_provider,
|
||||
tool_registry=tool_registry,
|
||||
loop_config={"max_iterations": 5},
|
||||
storage_path=storage_path,
|
||||
)
|
||||
result = await executor.execute(graph, goal, {"task": "save the file"}, validate_graph=False)
|
||||
artifact.record(
|
||||
result,
|
||||
expected=(
|
||||
"success=True, file 'verified.txt' on disk "
|
||||
"matches VERIFIED_PAYLOAD_99_ZULU exactly"
|
||||
),
|
||||
)
|
||||
|
||||
artifact.check(
|
||||
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
|
||||
)
|
||||
assert result.success
|
||||
|
||||
# Cross-check: read the file ourselves — don't trust the LLM
|
||||
artifact_path = storage_path / "data" / "verified.txt"
|
||||
|
||||
artifact.check(
|
||||
"file exists on disk",
|
||||
artifact_path.exists(),
|
||||
actual=str(artifact_path.exists()),
|
||||
expected_val="True",
|
||||
)
|
||||
assert artifact_path.exists(), f"File not created at {artifact_path}"
|
||||
|
||||
actual_content = artifact_path.read_text(encoding="utf-8").strip()
|
||||
artifact.check(
|
||||
"file content matches payload",
|
||||
actual_content == PAYLOAD,
|
||||
actual=repr(actual_content),
|
||||
expected_val=repr(PAYLOAD),
|
||||
)
|
||||
assert actual_content == PAYLOAD, (
|
||||
f"File content mismatch.\n"
|
||||
f" Expected: {PAYLOAD!r}\n"
|
||||
f" Actual: {actual_content!r}\n"
|
||||
f"The LLM may have modified the payload or save_data encoded it differently."
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. Pipeline data integrity: track a token through N nodes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verified_pipeline_token_survives(runtime, goal, llm_provider, artifact):
|
||||
"""Pass a unique token through 3 nodes — verify it arrives at the end.
|
||||
|
||||
Each node is instructed to PRESERVE the token. If any node drops or
|
||||
modifies it, the final assertion catches it. This verifies input_mapping
|
||||
and continuous conversation actually deliver data correctly.
|
||||
"""
|
||||
TOKEN = "TRACKING_TOKEN_88X"
|
||||
|
||||
graph = GraphSpec(
|
||||
id="verified-pipeline",
|
||||
goal_id="dummy",
|
||||
entry_node="a",
|
||||
entry_points={"start": "a"},
|
||||
terminal_nodes=["c"],
|
||||
conversation_mode="continuous",
|
||||
nodes=[
|
||||
NodeSpec(
|
||||
id="a",
|
||||
name="Node A",
|
||||
description="First node",
|
||||
node_type="event_loop",
|
||||
input_keys=["token"],
|
||||
output_keys=["a_out"],
|
||||
system_prompt=(
|
||||
"Read the 'token' input. Call set_output with key='a_out' "
|
||||
"and the EXACT token value. Do not modify it." + SET_OUTPUT
|
||||
),
|
||||
),
|
||||
NodeSpec(
|
||||
id="b",
|
||||
name="Node B",
|
||||
description="Middle node",
|
||||
node_type="event_loop",
|
||||
input_keys=["b_in"],
|
||||
output_keys=["b_out"],
|
||||
system_prompt=(
|
||||
"Read the 'b_in' input. Call set_output with key='b_out' "
|
||||
"and the EXACT same value. Do not modify it." + SET_OUTPUT
|
||||
),
|
||||
),
|
||||
NodeSpec(
|
||||
id="c",
|
||||
name="Node C",
|
||||
description="Terminal node",
|
||||
node_type="event_loop",
|
||||
input_keys=["c_in"],
|
||||
output_keys=["result"],
|
||||
system_prompt=(
|
||||
"Read the 'c_in' input. Call set_output with key='result' "
|
||||
"and the EXACT same value. Do not modify it." + SET_OUTPUT
|
||||
),
|
||||
),
|
||||
],
|
||||
edges=[
|
||||
EdgeSpec(
|
||||
id="a-b",
|
||||
source="a",
|
||||
target="b",
|
||||
condition=EdgeCondition.ON_SUCCESS,
|
||||
input_mapping={"b_in": "a_out"},
|
||||
),
|
||||
EdgeSpec(
|
||||
id="b-c",
|
||||
source="b",
|
||||
target="c",
|
||||
condition=EdgeCondition.ON_SUCCESS,
|
||||
input_mapping={"c_in": "b_out"},
|
||||
),
|
||||
],
|
||||
memory_keys=["token", "a_out", "b_in", "b_out", "c_in", "result"],
|
||||
)
|
||||
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
|
||||
result = await executor.execute(graph, goal, {"token": TOKEN}, validate_graph=False)
|
||||
artifact.record(
|
||||
result,
|
||||
expected="success=True, path=['a','b','c'], output['result'] contains TRACKING_TOKEN_88X",
|
||||
)
|
||||
|
||||
artifact.check(
|
||||
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
|
||||
)
|
||||
assert result.success
|
||||
|
||||
artifact.check(
|
||||
"path matches",
|
||||
result.path == ["a", "b", "c"],
|
||||
actual=str(result.path),
|
||||
expected_val="['a', 'b', 'c']",
|
||||
)
|
||||
assert result.path == ["a", "b", "c"]
|
||||
|
||||
final_output = result.output.get("result", "")
|
||||
artifact.check(
|
||||
"token survives pipeline",
|
||||
TOKEN in final_output,
|
||||
actual=repr(final_output),
|
||||
expected_val=f"contains '{TOKEN}'",
|
||||
)
|
||||
assert TOKEN in final_output, (
|
||||
f"Token '{TOKEN}' lost in pipeline.\n"
|
||||
f" Input: {TOKEN}\n"
|
||||
f" Final output: {final_output!r}\n"
|
||||
f" Path: {result.path}\n"
|
||||
f"Data was corrupted or dropped during node transitions."
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 5. Structured format with regex validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verified_format_with_regex(runtime, goal, llm_provider, tool_registry, artifact):
|
||||
"""Output must match a strict regex — not just 'contains a pipe character'.
|
||||
|
||||
Format: STATUS|YYYY-MM-DD|DayName
|
||||
Regex validates each segment independently.
|
||||
"""
|
||||
graph = GraphSpec(
|
||||
id="verified-format",
|
||||
goal_id="dummy",
|
||||
entry_node="worker",
|
||||
entry_points={"start": "worker"},
|
||||
terminal_nodes=["worker"],
|
||||
nodes=[
|
||||
NodeSpec(
|
||||
id="worker",
|
||||
name="Worker",
|
||||
description="Produce formatted status string",
|
||||
node_type="event_loop",
|
||||
output_keys=["result"],
|
||||
tools=["get_current_time"],
|
||||
system_prompt=(
|
||||
"Call get_current_time with timezone='UTC'. "
|
||||
"Build this EXACT format: STATUS|<date>|<day_of_week>\n"
|
||||
"Where <date> is YYYY-MM-DD format and <day_of_week> is the full day name.\n"
|
||||
"Example: STATUS|2026-04-03|Thursday\n"
|
||||
"Call set_output with key='result' and the formatted string.\n"
|
||||
"Output ONLY the formatted string, nothing else." + SET_OUTPUT
|
||||
),
|
||||
),
|
||||
],
|
||||
edges=[],
|
||||
memory_keys=["result"],
|
||||
conversation_mode="continuous",
|
||||
)
|
||||
executor = make_executor(
|
||||
runtime,
|
||||
llm_provider,
|
||||
tool_registry=tool_registry,
|
||||
loop_config={"max_iterations": 5},
|
||||
)
|
||||
result = await executor.execute(graph, goal, {}, validate_graph=False)
|
||||
artifact.record(
|
||||
result, expected="success=True, output['result'] matches regex STATUS|YYYY-MM-DD|DayName"
|
||||
)
|
||||
|
||||
artifact.check(
|
||||
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
|
||||
)
|
||||
assert result.success
|
||||
|
||||
output = (result.output.get("result") or "").strip()
|
||||
artifact.record_value("raw_output", output)
|
||||
|
||||
# Strict regex: STATUS|YYYY-MM-DD|DayName
|
||||
pattern = (
|
||||
r"^STATUS\|\d{4}-\d{2}-\d{2}\|(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)$"
|
||||
)
    matches = bool(re.match(pattern, output))
    artifact.check(
        "output matches regex",
        matches,
        actual=repr(output),
        expected_val=f"matches pattern: {pattern}",
    )
    assert re.match(pattern, output), (
        f"Output does not match required format.\n"
        f"  Expected pattern: STATUS|YYYY-MM-DD|DayName\n"
        f"  Actual output: {output!r}\n"
        f"  Regex: {pattern}"
    )


# ---------------------------------------------------------------------------
# 6. Two-node cross-verification: writer + independent verifier
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_verified_two_node_cross_check(
    runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
    """Node 1 writes a file. Node 2 loads it and compares to expected.

    Both nodes operate INDEPENDENTLY on the same file. If the content
    doesn't match, the verifier reports MISMATCH. We also read the file
    ourselves as a triple-check.
    """
    EXPECTED = "CROSS_CHECK_ALPHA_42"
    storage_path = tmp_path / "session"
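    # One shared storage path: the writer saves under it, the verifier loads
    # from it, and the disk check below reads the same file directly.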
    graph = GraphSpec(
        id="verified-cross-check",
        goal_id="dummy",
        entry_node="writer",
        entry_points={"start": "writer"},
        terminal_nodes=["verifier"],
        conversation_mode="continuous",
        nodes=[
            NodeSpec(
                id="writer",
                name="Writer",
                description="Writes exact content to file",
                node_type="event_loop",
                output_keys=["filename"],
                tools=["save_data"],
                system_prompt=(
                    f"Call save_data with filename='crosscheck.txt' and data='{EXPECTED}'. "
                    "Then call set_output with key='filename' and value='crosscheck.txt'."
                    + SET_OUTPUT
                ),
            ),
            NodeSpec(
                id="verifier",
                name="Verifier",
                description="Loads file and verifies content",
                node_type="event_loop",
                input_keys=["filename"],
                output_keys=["result"],
                tools=["load_data"],
                system_prompt=(
                    "Load the file using load_data with the provided 'filename'. "
                    f"If the loaded content is exactly '{EXPECTED}', "
                    "call set_output with key='result' and value='VERIFIED'. "
                    "If it does NOT match, call set_output with key='result' "
                    "and value='MISMATCH:' followed by what you actually loaded." + SET_OUTPUT
                ),
            ),
        ],
        edges=[
            EdgeSpec(
                id="write-to-verify",
                source="writer",
                target="verifier",
                condition=EdgeCondition.ON_SUCCESS,
                input_mapping={"filename": "filename"},
            ),
        ],
        memory_keys=["filename", "result"],
    )
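    # The ON_SUCCESS edge above hands the writer's 'filename' output to the
    # verifier's 'filename' input via input_mapping.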
    executor = make_executor(
        runtime,
        llm_provider,
        tool_registry=tool_registry,
        loop_config={"max_iterations": 5},
        storage_path=storage_path,
    )
    result = await executor.execute(graph, goal, {}, validate_graph=False)
    artifact.record(
        result,
        expected=(
            "success=True, path=['writer','verifier'], "
            "verifier output='VERIFIED', disk content "
            "matches CROSS_CHECK_ALPHA_42"
        ),
    )

    artifact.check(
        "execution succeeds", result.success, actual=str(result.success), expected_val="True"
    )
    assert result.success

    artifact.check(
        "path matches",
        result.path == ["writer", "verifier"],
        actual=str(result.path),
        expected_val="['writer', 'verifier']",
    )
    assert result.path == ["writer", "verifier"]

    # LLM-side verification
    verifier_output = result.output.get("result", "")
    artifact.check(
        "verifier output is VERIFIED",
        verifier_output == "VERIFIED",
        actual=repr(verifier_output),
        expected_val="'VERIFIED'",
    )
    assert verifier_output == "VERIFIED", (
        f"Verifier node reported: {verifier_output!r} (expected 'VERIFIED')"
    )

    # Our own independent verification (triple-check)
    artifact_path = storage_path / "data" / "crosscheck.txt"
    artifact.check(
        "file exists on disk",
        artifact_path.exists(),
        actual=str(artifact_path.exists()),
        expected_val="True",
    )
    assert artifact_path.exists(), f"File not found at {artifact_path}"

    actual = artifact_path.read_text(encoding="utf-8").strip()
    artifact.check(
        "disk content matches expected",
        actual == EXPECTED,
        actual=repr(actual),
        expected_val=repr(EXPECTED),
    )
    assert actual == EXPECTED, f"Disk content mismatch: expected {EXPECTED!r}, got {actual!r}"


# ---------------------------------------------------------------------------
# 7. Event bus cross-check: verify events match execution result
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_verified_events_match_result(
    runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
    """Cross-check: events captured on bus must agree with ExecutionResult.

    If result says path=["a","b"], the events must show NODE_LOOP_COMPLETED
    for both "a" and "b". If result says tool X was called, TOOL_CALL_COMPLETED
    must contain X. This catches desynchronization between the event bus and
    the execution engine.
    """
    from framework.runtime.event_bus import EventBus, EventType

    bus = EventBus()
    completed_nodes = []
    tool_names = set()

    async def _capture_node(event):
        completed_nodes.append(event.node_id)

    async def _capture_tool(event):
        tool_names.add(event.data.get("tool_name", ""))

    bus.subscribe(event_types=[EventType.NODE_LOOP_COMPLETED], handler=_capture_node)
    bus.subscribe(event_types=[EventType.TOOL_CALL_COMPLETED], handler=_capture_tool)
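    # Both handlers are subscribed before execute(), so every matching event
    # published during the run is captured.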

    graph = GraphSpec(
        id="verified-events",
        goal_id="dummy",
        entry_node="worker",
        entry_points={"start": "worker"},
        terminal_nodes=["worker"],
        nodes=[
            NodeSpec(
                id="worker",
                name="Worker",
                description="Uses tool then sets output",
                node_type="event_loop",
                output_keys=["result"],
                tools=["get_current_time"],
                system_prompt=(
                    "Call get_current_time with timezone='UTC'. "
                    "Then call set_output with key='result' and value='done'." + SET_OUTPUT
                ),
            ),
        ],
        edges=[],
        memory_keys=["result"],
        conversation_mode="continuous",
    )
    executor = make_executor(
        runtime,
        llm_provider,
        tool_registry=tool_registry,
        loop_config={"max_iterations": 5},
        storage_path=tmp_path / "session",
        event_bus=bus,
        stream_id="worker",
    )
    result = await executor.execute(graph, goal, {}, validate_graph=False)
    artifact.record(
        result,
        expected=(
            "success=True, event bus nodes match "
            "result.path, tool events include "
            "get_current_time and set_output"
        ),
    )

    artifact.check(
        "execution succeeds", result.success, actual=str(result.success), expected_val="True"
    )
    assert result.success

    artifact.record_value("completed_nodes", completed_nodes)
    artifact.record_value("tool_names", sorted(tool_names))

    # Cross-check 1: path nodes match completed nodes
    for node_id in result.path:
        artifact.check(
            f"node '{node_id}' in completed events",
            node_id in completed_nodes,
            actual=str(completed_nodes),
            expected_val=f"contains '{node_id}'",
        )
        assert node_id in completed_nodes, (
            f"Node '{node_id}' in result.path but no NODE_LOOP_COMPLETED event. "
            f"Events saw: {completed_nodes}"
        )

    # Cross-check 2: get_current_time must appear in tool events
    artifact.check(
        "get_current_time in tool events",
        "get_current_time" in tool_names,
        actual=str(sorted(tool_names)),
        expected_val="contains 'get_current_time'",
    )
    assert "get_current_time" in tool_names, (
        f"get_current_time not in tool events. Captured: {tool_names}. "
        f"Result claims success but event bus disagrees."
    )

    # Cross-check 3: set_output must appear in tool events
    artifact.check(
        "set_output in tool events",
        "set_output" in tool_names,
        actual=str(sorted(tool_names)),
        expected_val="contains 'set_output'",
    )
    assert "set_output" in tool_names, (
        f"set_output not in tool events. Captured: {tool_names}. "
        f"Result has output but no set_output event."
    )
@@ -1,8 +1,9 @@
-"""Component tests: Worker Communication — event flow, completion, failure.
+"""Component tests: Worker Communication — event flow, completion.

-Exercises the full worker execution lifecycle with EventBus subscriptions
-to verify that the exact events are published in the correct order, with
-correct data, simulating the queen-worker communication contract.
+Exercises the full worker execution lifecycle with EventBus
+subscriptions to verify that the exact events are published in
+the correct order, with correct data, simulating the queen-worker
+communication contract.
"""

from __future__ import annotations
@@ -20,7 +21,8 @@ from .conftest import make_executor

SET_OUTPUT = (
    "You MUST call the set_output tool. "
-    "Do not just write text — call set_output with the correct key and value."
+    "Do not just write text — call set_output with the correct "
+    "key and value."
)

@@ -34,7 +36,7 @@ class EventCapture:
        return [e for e in self.events if e.type in event_types]

    def tool_calls(self) -> list[dict]:
-        """Extract tool call data from TOOL_CALL_COMPLETED events."""
+        """Extract tool call data from TOOL_CALL_COMPLETED."""
        return [e.data for e in self.of_type(EventType.TOOL_CALL_COMPLETED)]

    def tool_names_called(self) -> set[str]:
@@ -51,14 +53,13 @@ class EventCapture:


def _make_event_bus_and_capture() -> tuple[EventBus, EventCapture]:
-    """Create an EventBus with a capture handler subscribed to all events."""
+    """Create an EventBus with a capture handler."""
    bus = EventBus()
    capture = EventCapture()

    async def _capture_all(event: AgentEvent) -> None:
        capture.events.append(event)

-    # Subscribe to the key event types we want to verify
    bus.subscribe(
        event_types=[
            EventType.NODE_LOOP_STARTED,
@@ -79,14 +80,14 @@ def _make_event_bus_and_capture() -> tuple[EventBus, EventCapture]:
    return bus, capture


-# ---------------------------------------------------------------------------
+# -------------------------------------------------------------------
# Tests: Worker Completion Events
-# ---------------------------------------------------------------------------
+# -------------------------------------------------------------------


@pytest.mark.asyncio
-async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, tmp_path):
-    """Worker execution must emit LOOP_STARTED → iterations → LOOP_COMPLETED."""
+async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, tmp_path, artifact):
+    """Worker must emit STARTED -> iterations -> COMPLETED."""
    bus, capture = _make_event_bus_and_capture()

    graph = GraphSpec(
@@ -102,7 +103,7 @@ async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, t
                description="Simple output",
                node_type="event_loop",
                output_keys=["result"],
-                system_prompt="Call set_output with key='result' and value='done'. " + SET_OUTPUT,
+                system_prompt=("Call set_output with key='result' and value='done'. " + SET_OUTPUT),
            ),
        ],
        edges=[],
@@ -110,34 +111,78 @@ async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, t
        conversation_mode="continuous",
    )
    executor = make_executor(
-        runtime, llm_provider,
+        runtime,
+        llm_provider,
        loop_config={"max_iterations": 5},
        storage_path=tmp_path / "session",
        event_bus=bus,
        stream_id="worker",
    )
-    result = await executor.execute(graph, goal, {}, validate_graph=False)
+    result = await executor.execute(
+        graph,
+        goal,
+        {},
+        validate_graph=False,
+    )
+    artifact.record(
+        result,
+        expected=(
+            "success=True, lifecycle events in correct order: STARTED -> iterations -> COMPLETED"
+        ),
+    )

+    artifact.check(
+        "execution succeeds",
+        result.success,
+        actual=str(result.success),
+        expected_val="True",
+    )
    assert result.success

    # Verify lifecycle event ordering
    loop_started = capture.of_type(EventType.NODE_LOOP_STARTED)
-    loop_completed = capture.of_type(EventType.NODE_LOOP_COMPLETED)
+    loop_completed = capture.of_type(
+        EventType.NODE_LOOP_COMPLETED,
+    )
    iterations = capture.of_type(EventType.NODE_LOOP_ITERATION)

+    artifact.check(
+        "NODE_LOOP_STARTED emitted",
+        len(loop_started) >= 1,
+        actual=str(len(loop_started)),
+        expected_val=">=1",
+    )
    assert len(loop_started) >= 1, "Missing NODE_LOOP_STARTED"

+    artifact.check(
+        "NODE_LOOP_COMPLETED emitted",
+        len(loop_completed) >= 1,
+        actual=str(len(loop_completed)),
+        expected_val=">=1",
+    )
    assert len(loop_completed) >= 1, "Missing NODE_LOOP_COMPLETED"

+    artifact.check(
+        "NODE_LOOP_ITERATION emitted",
+        len(iterations) >= 1,
+        actual=str(len(iterations)),
+        expected_val=">=1",
+    )
    assert len(iterations) >= 1, "Missing NODE_LOOP_ITERATION"

    # STARTED must come before COMPLETED
    start_idx = capture.events.index(loop_started[0])
    end_idx = capture.events.index(loop_completed[0])
+    artifact.check(
+        "STARTED precedes COMPLETED",
+        start_idx < end_idx,
+        actual=f"start={start_idx}, end={end_idx}",
+        expected_val="start < end",
+    )
    assert start_idx < end_idx, "LOOP_STARTED must precede LOOP_COMPLETED"


@pytest.mark.asyncio
async def test_worker_emits_llm_turn_with_token_counts(
-    runtime, goal, llm_provider, tmp_path
+    runtime, goal, llm_provider, tmp_path, artifact
):
    """Each LLM turn must emit LLM_TURN_COMPLETE with token counts."""
    bus, capture = _make_event_bus_and_capture()
@@ -155,7 +200,7 @@ async def test_worker_emits_llm_turn_with_token_counts(
                description="Simple output",
                node_type="event_loop",
                output_keys=["result"],
-                system_prompt="Call set_output with key='result' and value='ok'. " + SET_OUTPUT,
+                system_prompt=("Call set_output with key='result' and value='ok'. " + SET_OUTPUT),
            ),
        ],
        edges=[],
@@ -163,30 +208,82 @@ async def test_worker_emits_llm_turn_with_token_counts(
        conversation_mode="continuous",
    )
    executor = make_executor(
-        runtime, llm_provider,
+        runtime,
+        llm_provider,
        loop_config={"max_iterations": 3},
        storage_path=tmp_path / "session",
        event_bus=bus,
        stream_id="worker",
    )
-    result = await executor.execute(graph, goal, {}, validate_graph=False)
+    result = await executor.execute(
+        graph,
+        goal,
+        {},
+        validate_graph=False,
+    )
+    artifact.record(
+        result,
+        expected=("success=True, LLM_TURN_COMPLETE events with positive token counts and model"),
+    )

+    artifact.check(
+        "execution succeeds",
+        result.success,
+        actual=str(result.success),
+        expected_val="True",
+    )
    assert result.success

    llm_turns = capture.of_type(EventType.LLM_TURN_COMPLETE)

+    artifact.check(
+        "LLM_TURN_COMPLETE emitted",
+        len(llm_turns) >= 1,
+        actual=str(len(llm_turns)),
+        expected_val=">=1",
+    )
    assert len(llm_turns) >= 1, "No LLM_TURN_COMPLETE events"

-    for turn in llm_turns:
-        assert turn.data.get("input_tokens", 0) > 0, "input_tokens should be > 0"
-        assert turn.data.get("output_tokens", 0) > 0, "output_tokens should be > 0"
+    for i, turn in enumerate(llm_turns):
+        in_tok = turn.data.get("input_tokens", 0)
+        out_tok = turn.data.get("output_tokens", 0)
+        model = turn.data.get("model", "")

+        artifact.check(
+            f"turn[{i}] input_tokens > 0",
+            in_tok > 0,
+            actual=str(in_tok),
+            expected_val=">0",
+        )
+        assert in_tok > 0, "input_tokens should be > 0"
+
+        artifact.check(
+            f"turn[{i}] output_tokens > 0",
+            out_tok > 0,
+            actual=str(out_tok),
+            expected_val=">0",
+        )
+        assert out_tok > 0, "output_tokens should be > 0"
+
+        artifact.check(
+            f"turn[{i}] model populated",
+            bool(model),
+            actual=repr(model),
+            expected_val="non-empty string",
+        )
        assert turn.data.get("model"), "model should be populated"


@pytest.mark.asyncio
async def test_worker_tool_calls_emit_events(
-    runtime, goal, llm_provider, tool_registry, tmp_path
+    runtime,
+    goal,
+    llm_provider,
+    tool_registry,
+    tmp_path,
+    artifact,
):
-    """Tool calls must emit TOOL_CALL_STARTED and TOOL_CALL_COMPLETED events."""
+    """Tool calls must emit STARTED and COMPLETED events."""
    bus, capture = _make_event_bus_and_capture()

    graph = GraphSpec(
@@ -205,8 +302,8 @@ async def test_worker_tool_calls_emit_events(
                tools=["get_current_time"],
                system_prompt=(
                    "Call get_current_time with timezone='UTC'. "
-                    "Then call set_output with key='result' and the day_of_week. "
-                    + SET_OUTPUT
+                    "Then call set_output with key='result' and "
+                    "the day_of_week. " + SET_OUTPUT
                ),
            ),
        ],
@@ -215,38 +312,97 @@ async def test_worker_tool_calls_emit_events(
        conversation_mode="continuous",
    )
    executor = make_executor(
-        runtime, llm_provider,
+        runtime,
+        llm_provider,
        tool_registry=tool_registry,
        loop_config={"max_iterations": 5},
        storage_path=tmp_path / "session",
        event_bus=bus,
        stream_id="worker",
    )
-    result = await executor.execute(graph, goal, {}, validate_graph=False)
+    result = await executor.execute(
+        graph,
+        goal,
+        {},
+        validate_graph=False,
+    )
+    artifact.record(
+        result,
+        expected=(
+            "success=True, output['result'] set, tool events for get_current_time and set_output"
+        ),
+    )

+    artifact.check(
+        "execution succeeds",
+        result.success,
+        actual=str(result.success),
+        expected_val="True",
+    )
    assert result.success

+    actual_output = result.output.get("result")
+    artifact.check(
+        "output['result'] is set",
+        actual_output is not None,
+        actual=repr(actual_output),
+        expected_val="non-None value",
+    )
    assert result.output.get("result") is not None

    # Verify tool events
    tool_started = capture.of_type(EventType.TOOL_CALL_STARTED)
-    tool_completed = capture.of_type(EventType.TOOL_CALL_COMPLETED)
+    tool_completed = capture.of_type(
+        EventType.TOOL_CALL_COMPLETED,
+    )

+    artifact.check(
+        "TOOL_CALL_STARTED emitted",
+        len(tool_started) >= 1,
+        actual=str(len(tool_started)),
+        expected_val=">=1",
+    )
    assert len(tool_started) >= 1, "No TOOL_CALL_STARTED events"
-    assert len(tool_completed) >= 1, "No TOOL_CALL_COMPLETED events"

-    # get_current_time must be among the tools called
-    assert "get_current_time" in capture.tool_names_called()
+    artifact.check(
+        "TOOL_CALL_COMPLETED emitted",
+        len(tool_completed) >= 1,
+        actual=str(len(tool_completed)),
+        expected_val=">=1",
+    )
+    assert len(tool_completed) >= 1, "No TOOL_CALL_COMPLETED"

-    # set_output must also appear (synthetic tool)
-    assert "set_output" in capture.tool_names_called()
+    tool_names = capture.tool_names_called()
+    artifact.check(
+        "get_current_time called",
+        "get_current_time" in tool_names,
+        actual=str(sorted(tool_names)),
+        expected_val="contains 'get_current_time'",
+    )
+    assert "get_current_time" in tool_names
+
+    artifact.check(
+        "set_output called",
+        "set_output" in tool_names,
+        actual=str(sorted(tool_names)),
+        expected_val="contains 'set_output'",
+    )
+    assert "set_output" in tool_names

    # Tool calls should not have errors
    for tc in capture.tool_calls():
-        if tc.get("tool_name") in ("get_current_time", "set_output"):
-            assert not tc.get("is_error"), f"Tool {tc.get('tool_name')} errored"
+        tn = tc.get("tool_name")
+        if tn in ("get_current_time", "set_output"):
+            is_err = tc.get("is_error")
+            artifact.check(
+                f"tool {tn} no error",
+                not is_err,
+                actual=str(is_err),
+                expected_val="False",
+            )
+            assert not is_err, f"Tool {tn} errored"


@pytest.mark.asyncio
-async def test_worker_output_key_set_event(runtime, goal, llm_provider, tmp_path):
+async def test_worker_output_key_set_event(runtime, goal, llm_provider, tmp_path, artifact):
    """set_output must emit OUTPUT_KEY_SET event with the key name."""
    bus, capture = _make_event_bus_and_capture()

@@ -275,34 +431,84 @@ async def test_worker_output_key_set_event(runtime, goal, llm_provider, tmp_path
        conversation_mode="continuous",
    )
    executor = make_executor(
-        runtime, llm_provider,
+        runtime,
+        llm_provider,
        loop_config={"max_iterations": 5},
        storage_path=tmp_path / "session",
        event_bus=bus,
        stream_id="worker",
    )
-    result = await executor.execute(graph, goal, {}, validate_graph=False)
+    result = await executor.execute(
+        graph,
+        goal,
+        {},
+        validate_graph=False,
+    )
+    artifact.record(
+        result,
+        expected=("success=True, output['name'] and output['status'] set, OUTPUT_KEY_SET for both"),
+    )

+    artifact.check(
+        "execution succeeds",
+        result.success,
+        actual=str(result.success),
+        expected_val="True",
+    )
    assert result.success

+    actual_name = result.output.get("name")
+    artifact.check(
+        "output['name'] is set",
+        actual_name is not None,
+        actual=repr(actual_name),
+        expected_val="non-None value",
+    )
    assert result.output.get("name") is not None

+    actual_status = result.output.get("status")
+    artifact.check(
+        "output['status'] is set",
+        actual_status is not None,
+        actual=repr(actual_status),
+        expected_val="non-None value",
+    )
    assert result.output.get("status") is not None

    # Verify OUTPUT_KEY_SET events for both keys
    keys_set = capture.output_keys_set()

+    artifact.check(
+        "OUTPUT_KEY_SET for 'name'",
+        "name" in keys_set,
+        actual=str(sorted(keys_set)),
+        expected_val="contains 'name'",
+    )
    assert "name" in keys_set, f"Missing OUTPUT_KEY_SET for 'name', got: {keys_set}"

+    artifact.check(
+        "OUTPUT_KEY_SET for 'status'",
+        "status" in keys_set,
+        actual=str(sorted(keys_set)),
+        expected_val="contains 'status'",
+    )
    assert "status" in keys_set, f"Missing OUTPUT_KEY_SET for 'status', got: {keys_set}"


-# ---------------------------------------------------------------------------
+# -------------------------------------------------------------------
# Tests: Multi-Node Worker Communication
-# ---------------------------------------------------------------------------
+# -------------------------------------------------------------------


@pytest.mark.asyncio
async def test_worker_pipeline_data_integrity(
-    runtime, goal, llm_provider, tool_registry, tmp_path
+    runtime,
+    goal,
+    llm_provider,
+    tool_registry,
+    tmp_path,
+    artifact,
):
-    """Data produced by node 1 must arrive at node 2 via input_mapping, verified end-to-end."""
+    """Data from node 1 must arrive at node 2, verified end-to-end."""
    bus, capture = _make_event_bus_and_capture()

    graph = GraphSpec(
@@ -316,28 +522,30 @@ async def test_worker_pipeline_data_integrity(
            NodeSpec(
                id="producer",
                name="Producer",
-                description="Produces a timestamped value using a real tool",
+                description="Produces a timestamped value",
                node_type="event_loop",
                output_keys=["payload"],
                tools=["get_current_time"],
                system_prompt=(
                    "Call get_current_time with timezone='UTC'. "
                    "Extract the 'date' field from the result. "
-                    "Call set_output with key='payload' and the date string as value. "
-                    + SET_OUTPUT
+                    "Call set_output with key='payload' and the "
+                    "date string as value. " + SET_OUTPUT
                ),
            ),
            NodeSpec(
                id="consumer",
                name="Consumer",
-                description="Verifies received data contains a date",
+                description="Verifies received data",
                node_type="event_loop",
                input_keys=["data"],
                output_keys=["result"],
                system_prompt=(
-                    "Read the 'data' input. It should contain a date string. "
-                    "Call set_output with key='result' and value='VERIFIED|' followed by "
-                    "the first 10 characters of the data input. " + SET_OUTPUT
+                    "Read the 'data' input. It should contain a "
+                    "date string. Call set_output with "
+                    "key='result' and value='VERIFIED|' followed "
+                    "by the first 10 characters of the data "
+                    "input. " + SET_OUTPUT
                ),
            ),
        ],
@@ -353,44 +561,127 @@ async def test_worker_pipeline_data_integrity(
        memory_keys=["payload", "data", "result"],
    )
    executor = make_executor(
-        runtime, llm_provider,
+        runtime,
+        llm_provider,
        tool_registry=tool_registry,
        loop_config={"max_iterations": 5},
        storage_path=tmp_path / "session",
        event_bus=bus,
        stream_id="worker",
    )
-    result = await executor.execute(graph, goal, {}, validate_graph=False)
+    result = await executor.execute(
+        graph,
+        goal,
+        {},
+        validate_graph=False,
+    )
+    artifact.record(
+        result,
+        expected=(
+            "success=True, clean, "
+            "path=['producer','consumer'], steps=2, "
+            "output starts with VERIFIED|"
+        ),
+    )

    # Strict outcome verification
+    artifact.check(
+        "execution succeeds",
+        result.success,
+        actual=str(result.success),
+        expected_val="True",
+    )
    assert result.success

+    artifact.check(
+        "clean success",
+        result.is_clean_success,
+        actual=str(result.execution_quality),
+        expected_val="clean",
+    )
    assert result.is_clean_success, f"quality={result.execution_quality}"

+    artifact.check(
+        "path matches",
+        result.path == ["producer", "consumer"],
+        actual=str(result.path),
+        expected_val="['producer', 'consumer']",
+    )
    assert result.path == ["producer", "consumer"]

+    artifact.check(
+        "steps_executed is 2",
+        result.steps_executed == 2,
+        actual=str(result.steps_executed),
+        expected_val="2",
+    )
    assert result.steps_executed == 2

    # Output must be present and correctly structured
    output = result.output.get("result")
+    artifact.check(
+        "consumer set 'result'",
+        output is not None,
+        actual=repr(output),
+        expected_val="non-None value",
+    )
    assert output is not None, "Consumer did not set 'result'"

+    artifact.check(
+        "output starts with VERIFIED|",
+        output.startswith("VERIFIED|"),
+        actual=repr(output),
+        expected_val="starts with 'VERIFIED|'",
+    )
    assert output.startswith("VERIFIED|"), f"Expected VERIFIED|..., got: {output}"

    # Token counts should be reasonable (not zero, not astronomical)
+    artifact.check(
+        "total_tokens > 0",
+        result.total_tokens > 0,
+        actual=str(result.total_tokens),
+        expected_val=">0",
+    )
    assert result.total_tokens > 0

+    artifact.check(
+        "total_tokens < 100000",
+        result.total_tokens < 100_000,
+        actual=str(result.total_tokens),
+        expected_val="<100000",
+    )
    assert result.total_tokens < 100_000, f"Unexpectedly high tokens: {result.total_tokens}"

    # Both nodes should have set their output keys
    keys_set = capture.output_keys_set()

+    artifact.check(
+        "producer set 'payload'",
+        "payload" in keys_set,
+        actual=str(sorted(keys_set)),
+        expected_val="contains 'payload'",
+    )
    assert "payload" in keys_set, "Producer didn't set 'payload'"

+    artifact.check(
+        "consumer set 'result' key",
+        "result" in keys_set,
+        actual=str(sorted(keys_set)),
+        expected_val="contains 'result'",
+    )
    assert "result" in keys_set, "Consumer didn't set 'result'"

    # get_current_time must have been called (in producer)
-    assert "get_current_time" in capture.tool_names_called()
+    tool_names = capture.tool_names_called()
+    artifact.check(
+        "get_current_time called",
+        "get_current_time" in tool_names,
+        actual=str(sorted(tool_names)),
+        expected_val="contains 'get_current_time'",
+    )
+    assert "get_current_time" in tool_names


@pytest.mark.asyncio
async def test_worker_multi_node_output_propagation(
-    runtime, goal, llm_provider, tmp_path
+    runtime, goal, llm_provider, tmp_path, artifact
):
-    """Data from node A's output must arrive at node B and be reflected in final output."""
+    """Data from node A must arrive at node B in final output."""
    bus, capture = _make_event_bus_and_capture()

    graph = GraphSpec(
@@ -408,22 +699,25 @@ async def test_worker_multi_node_output_propagation(
                node_type="event_loop",
                output_keys=["code"],
                system_prompt=(
-                    "Call set_output with key='code' and value='ALPHA_BRAVO_42'. "
+                    "Call set_output with key='code' and "
+                    "value='ALPHA_BRAVO_42'. "
                    "Do not write any text." + SET_OUTPUT
                ),
            ),
            NodeSpec(
                id="formatter",
                name="Formatter",
-                description="Wraps received code in brackets",
+                description="Wraps code in brackets",
                node_type="event_loop",
                input_keys=["raw_code"],
                output_keys=["result"],
                system_prompt=(
                    "Read the 'raw_code' input value. "
-                    "Call set_output with key='result' and value='[' followed by "
-                    "the raw_code value followed by ']'. "
-                    "Example: if raw_code is 'XYZ', output should be '[XYZ]'. " + SET_OUTPUT
+                    "Call set_output with key='result' and "
+                    "value='[' followed by the raw_code value "
+                    "followed by ']'. "
+                    "Example: if raw_code is 'XYZ', output "
+                    "should be '[XYZ]'. " + SET_OUTPUT
                ),
            ),
        ],
@@ -439,44 +733,110 @@ async def test_worker_multi_node_output_propagation(
        memory_keys=["code", "raw_code", "result"],
    )
    executor = make_executor(
-        runtime, llm_provider,
+        runtime,
+        llm_provider,
        loop_config={"max_iterations": 5},
        storage_path=tmp_path / "session",
        event_bus=bus,
        stream_id="worker",
    )
-    result = await executor.execute(graph, goal, {}, validate_graph=False)
+    result = await executor.execute(
+        graph,
+        goal,
+        {},
+        validate_graph=False,
+    )
+    artifact.record(
+        result,
+        expected=(
+            "success=True, "
+            "path=['generator','formatter'], steps=2, "
+            "output contains [ALPHA_BRAVO_42]"
+        ),
+    )

+    artifact.check(
+        "execution succeeds",
+        result.success,
+        actual=str(result.success),
+        expected_val="True",
+    )
    assert result.success

+    artifact.check(
+        "path matches",
+        result.path == ["generator", "formatter"],
+        actual=str(result.path),
+        expected_val="['generator', 'formatter']",
+    )
    assert result.path == ["generator", "formatter"]

+    artifact.check(
+        "steps_executed is 2",
+        result.steps_executed == 2,
+        actual=str(result.steps_executed),
+        expected_val="2",
+    )
    assert result.steps_executed == 2

    # Verify output structure
    output = result.output.get("result")
+    artifact.check(
+        "formatter set 'result'",
+        output is not None,
+        actual=repr(output),
+        expected_val="non-None value",
+    )
    assert output is not None, "Formatter did not set 'result'"
-    assert "[" in output and "]" in output, f"Expected bracket wrapping, got: {output}"

+    has_brackets = "[" in output and "]" in output
+    artifact.check(
+        "output has bracket wrapping",
+        has_brackets,
+        actual=repr(output),
+        expected_val="contains '[' and ']'",
+    )
+    assert has_brackets, f"Expected bracket wrapping, got: {output}"
+
+    artifact.check(
+        "output contains ALPHA_BRAVO_42",
+        "ALPHA_BRAVO_42" in output,
+        actual=repr(output),
+        expected_val="contains 'ALPHA_BRAVO_42'",
+    )
    assert "ALPHA_BRAVO_42" in output, f"Code word missing from output: {output}"

    # Both nodes should have set their output keys
    keys_set = capture.output_keys_set()
+    artifact.check(
+        "'code' in keys_set",
+        "code" in keys_set,
+        actual=str(sorted(keys_set)),
+        expected_val="contains 'code'",
+    )
    assert "code" in keys_set
+
+    artifact.check(
+        "'result' in keys_set",
+        "result" in keys_set,
+        actual=str(sorted(keys_set)),
+        expected_val="contains 'result'",
+    )
    assert "result" in keys_set


-# ---------------------------------------------------------------------------
+# -------------------------------------------------------------------
# Tests: Escalation Event Flow
-# ---------------------------------------------------------------------------
+# -------------------------------------------------------------------


@pytest.mark.asyncio
async def test_worker_escalation_emits_event_with_reason(
-    runtime, goal, llm_provider, tmp_path
+    runtime, goal, llm_provider, tmp_path, artifact
):
-    """Worker calling escalate must emit ESCALATION_REQUESTED with the reason.
+    """Worker calling escalate must emit ESCALATION_REQUESTED.

-    After calling escalate, the worker blocks waiting for queen input.
-    Since there's no queen in this test, we run with a short timeout and
-    verify the escalation event was emitted before the timeout.
+    After calling escalate, the worker blocks waiting for queen
+    input. Since there's no queen in this test, we run with a
+    short timeout and verify the escalation event was emitted.
    """
    bus, capture = _make_event_bus_and_capture()

@@ -495,8 +855,10 @@ async def test_worker_escalation_emits_event_with_reason(
                output_keys=["result"],
                system_prompt=(
                    "You are blocked and need human help. "
-                    "Call the escalate tool with reason='missing credentials for API'. "
-                    "Do not call set_output. Do not write any text first."
+                    "Call the escalate tool with "
+                    "reason='missing credentials for API'. "
+                    "Do not call set_output. "
+                    "Do not write any text first."
                ),
            ),
        ],
@@ -515,24 +877,59 @@ async def test_worker_escalation_emits_event_with_reason(
        stream_id="worker",
    )

    # Worker will block after escalate (waiting for queen).
    # Use a short timeout — we only need the escalation event to fire.
    try:
        await asyncio.wait_for(
-            executor.execute(graph, goal, {}, validate_graph=False),
+            executor.execute(
+                graph,
+                goal,
+                {},
+                validate_graph=False,
+            ),
            timeout=30,
        )
    except (TimeoutError, asyncio.TimeoutError):
-        pass  # Expected: worker hangs waiting for queen input
+        pass  # Expected: worker hangs waiting for queen

    # Verify escalation event was emitted before the timeout
    escalations = capture.of_type(EventType.ESCALATION_REQUESTED)
-    assert len(escalations) >= 1, (
-        f"No ESCALATION_REQUESTED event emitted. "
-        f"Events captured: {[e.type.value for e in capture.events]}"
-    )
+    all_types = [e.type.value for e in capture.events]
+
+    artifact.record_value(
+        "escalation_count",
+        len(escalations),
+        expected=(">=1 ESCALATION_REQUESTED with non-empty reason, stream_id='worker'"),
+    )
+    artifact.record_value("all_event_types", all_types)
+
+    artifact.check(
+        "escalation event emitted",
+        len(escalations) >= 1,
+        actual=str(len(escalations)),
+        expected_val=">=1",
+    )
+    assert len(escalations) >= 1, f"No ESCALATION_REQUESTED event emitted. Events: {all_types}"

    esc_data = escalations[0].data
-    assert esc_data.get("reason"), "Escalation reason should not be empty"
+    reason = esc_data.get("reason", "")
+    artifact.check(
+        "reason is non-empty",
+        bool(reason),
+        actual=repr(reason),
+        expected_val="non-empty string",
+    )
+    assert esc_data.get("reason"), "Escalation reason empty"
+
+    artifact.check(
+        "stream_id is 'worker'",
+        escalations[0].stream_id == "worker",
+        actual=repr(escalations[0].stream_id),
+        expected_val="'worker'",
+    )
    assert escalations[0].stream_id == "worker"
+
+    artifact.check(
+        "node_id is 'worker'",
+        escalations[0].node_id == "worker",
+        actual=repr(escalations[0].node_id),
+        expected_val="'worker'",
+    )
    assert escalations[0].node_id == "worker"
Block a user