feat: verified testing

Timothy
2026-04-03 13:00:49 -07:00
parent 674454cc5b
commit 8f56b8b068
15 changed files with 4452 additions and 314 deletions
+225
@@ -0,0 +1,225 @@
# Integration Test Reporting Skill
Run the Level 2 dummy agent integration test suite and produce a detailed HTML report with per-test input → outcome analysis.
## Trigger
User wants to run integration tests and see results:
- `/test-reporting`
- `/test-reporting test_component_queen_live.py`
- `/test-reporting --all`
## SOP: Running Tests
### Step 1: Select Scope
If the user provides a specific test file or pattern, use it. Otherwise run the full suite.
```bash
# Full suite
cd core && echo "1" | uv run python tests/dummy_agents/run_all.py --interactive 2>&1
# Specific file (requires manual provider setup)
cd core && uv run python -c "
import sys
sys.path.insert(0, '.')
from tests.dummy_agents.run_all import detect_available
from tests.dummy_agents.conftest import set_llm_selection

avail = detect_available()
claude = [p for p in avail if 'Claude Code' in p['name']]
if not claude:
    avail_names = [p['name'] for p in avail]
    raise RuntimeError(f'No Claude Code subscription. Available: {avail_names}')
provider = claude[0]
set_llm_selection(
    model=provider['model'],
    api_key=provider['api_key'],
    extra_headers=provider.get('extra_headers'),
    api_base=provider.get('api_base'),
)

import pytest
sys.exit(pytest.main([
    'tests/dummy_agents/TEST_FILE_HERE',
    '-v', '--override-ini=asyncio_mode=auto', '--no-header', '--tb=long',
    '--log-cli-level=WARNING', '--junitxml=/tmp/hive_test_results.xml',
]))
"
```
### Step 2: Collect Results
After the test run completes, collect the following (a parsing sketch follows this list):
1. **JUnit XML** from `--junitxml` output (if available)
2. **stdout/stderr** from the run
3. **Summary table** from `run_all.py` output (the Unicode table)
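A minimal sketch of this step, assuming the standard pytest JUnit schema and the `/tmp/hive_test_results.xml` path used above; `collect_results` and the row-dict keys are illustrative names, not part of the suite:
```python
import xml.etree.ElementTree as ET

def collect_results(xml_path: str = "/tmp/hive_test_results.xml") -> list[dict]:
    """Parse pytest's JUnit XML into row dicts for the report."""
    root = ET.parse(xml_path).getroot()
    # pytest wraps results in <testsuites><testsuite>; older versions emit <testsuite> directly
    suite = root if root.tag == "testsuite" else root.find("testsuite")
    rows = []
    for case in suite.iter("testcase"):
        status, detail = "PASS", ""
        for tag in ("failure", "error", "skipped"):
            node = case.find(tag)
            if node is not None:
                status = {"failure": "FAIL", "error": "ERROR", "skipped": "SKIP"}[tag]
                detail = (node.get("message") or "") + "\n" + (node.text or "")
        rows.append({
            "component": case.get("classname", "").split(".")[-1],  # e.g. test_component_edges
            "test_name": case.get("name"),
            "duration": float(case.get("time", 0)),
            "status": status,
            "detail": detail.strip(),
        })
    return rows
```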
### Step 3: Generate HTML Report
Write the report to `/tmp/hive_integration_test_report.html`.
The report MUST include these sections:
#### Header
- Run timestamp (ISO 8601)
- Provider used (model name, source)
- Total tests / passed / failed / skipped
- Total wall-clock time
- Overall verdict: PASS (all green) or FAIL (with count)
#### Per-Test Table
For EVERY test (not just failures), include a row with:
| Column | Description |
|--------|-------------|
| Component | Test file grouping (e.g., `component_queen_live`) |
| Test Name | Function name (e.g., `test_queen_starts_in_planning_without_worker`) |
| Status | PASS / FAIL / SKIP / ERROR with color badge |
| Duration | Wall-clock seconds |
| What | One-line description of what the test verifies |
| How | How it works (setup → action → assertion) |
| Why | Why this test matters (what bug/behavior it catches) |
| Input | The input data or configuration (graph spec, initial prompt, phase, etc.) |
| Expected Outcome | What the test asserts |
| Actual Outcome | What actually happened (PASS: matches expected / FAIL: actual vs expected) |
| Failure Detail | For failures only: full traceback + diagnosis |
#### What / How / Why Descriptions
These MUST be derived from the test function's docstring and code. Read each test file to extract:
- **What**: From the docstring first line
- **How**: From the test body (what fixtures, what graph, what assertions)
- **Why**: From the docstring body or "Why this matters" section in the test module
Use these mappings for the component test files (a docstring-extraction sketch follows the block):
```
test_component_llm.py → "LLM Provider" — streaming, tool calling, tokens
test_component_tools.py → "Tool Registry + MCP" — connection, execution
test_component_event_loop.py → "EventLoopNode" — iteration, output, stall
test_component_edges.py → "Edge Evaluation" — conditional, priority
test_component_conversation.py → "Conversation Persistence" — storage, cursor
test_component_escalation.py → "Escalation Flow" — worker→queen signaling
test_component_continuous.py → "Continuous Mode" — conversation threading
test_component_queen.py → "Queen Phase (Unit)" — phase state, tools, events
test_component_queen_live.py → "Queen Phase (Live)" — real queen, real LLM
test_component_queen_state_machine.py → "Queen State Machine" — edge cases, races
test_component_worker_comms.py → "Worker Communication" — events, data flow
test_component_strict_outcomes.py → "Strict Outcomes" — exact path, output, quality
```
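To populate What/Why without importing the test modules, the docstrings can be read statically with `ast`. A sketch, assuming the docstring conventions described above:
```python
import ast
from pathlib import Path

def extract_what_why(test_file: str) -> dict[str, tuple[str, str]]:
    """Map test function name to (what, why) from docstrings, without importing."""
    tree = ast.parse(Path(test_file).read_text())
    info = {}
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("test_"):
            doc = ast.get_docstring(node) or ""
            first, _, rest = doc.partition("\n")  # What = first line, Why = remainder
            info[node.name] = (first.strip(), " ".join(rest.split()))
    return info
```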
#### HTML Template
Use this structure:
```html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Hive Integration Test Report — {timestamp}</title>
<style>
:root { --pass: #22c55e; --fail: #ef4444; --skip: #f59e0b; --bg: #0f172a; --surface: #1e293b; --text: #e2e8f0; --muted: #94a3b8; --border: #334155; }
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: 'SF Mono', 'Fira Code', monospace; background: var(--bg); color: var(--text); padding: 2rem; line-height: 1.6; }
h1, h2, h3 { font-weight: 600; }
h1 { font-size: 1.5rem; margin-bottom: 1rem; }
h2 { font-size: 1.2rem; margin: 2rem 0 1rem; border-bottom: 1px solid var(--border); padding-bottom: 0.5rem; }
.summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
.card { background: var(--surface); padding: 1rem; border-radius: 8px; border: 1px solid var(--border); }
.card .label { color: var(--muted); font-size: 0.75rem; text-transform: uppercase; }
.card .value { font-size: 1.5rem; font-weight: 700; margin-top: 0.25rem; }
.card .value.pass { color: var(--pass); }
.card .value.fail { color: var(--fail); }
table { width: 100%; border-collapse: collapse; font-size: 0.8rem; }
th { background: var(--surface); position: sticky; top: 0; text-align: left; padding: 0.5rem; border-bottom: 2px solid var(--border); color: var(--muted); text-transform: uppercase; font-size: 0.7rem; }
td { padding: 0.5rem; border-bottom: 1px solid var(--border); vertical-align: top; }
tr:hover { background: rgba(255,255,255,0.03); }
.badge { display: inline-block; padding: 2px 8px; border-radius: 4px; font-size: 0.7rem; font-weight: 700; }
.badge.pass { background: rgba(34,197,94,0.2); color: var(--pass); }
.badge.fail { background: rgba(239,68,68,0.2); color: var(--fail); }
.badge.skip { background: rgba(245,158,11,0.2); color: var(--skip); }
.detail { background: #1a1a2e; padding: 0.75rem; border-radius: 4px; margin-top: 0.5rem; font-size: 0.75rem; white-space: pre-wrap; overflow-x: auto; max-height: 200px; overflow-y: auto; }
.component-header { background: var(--surface); padding: 0.75rem 0.5rem; font-weight: 600; font-size: 0.85rem; }
.meta { color: var(--muted); font-size: 0.75rem; }
</style>
</head>
<body>
<h1>Hive Integration Test Report</h1>
<p class="meta">Generated: {timestamp} | Provider: {provider} | Duration: {duration}s</p>
<div class="summary">
<div class="card"><div class="label">Total</div><div class="value">{total}</div></div>
<div class="card"><div class="label">Passed</div><div class="value pass">{passed}</div></div>
<div class="card"><div class="label">Failed</div><div class="value fail">{failed}</div></div>
<div class="card"><div class="label">Verdict</div><div class="value {verdict_class}">{verdict}</div></div>
</div>
<h2>Test Results</h2>
<table>
<thead>
<tr>
<th>Component</th>
<th>Test</th>
<th>Status</th>
<th>Time</th>
<th>What</th>
<th>Input → Expected → Actual</th>
</tr>
</thead>
<tbody>
<!-- For each test: -->
<tr>
<td>{component}</td>
<td>{test_name}</td>
<td><span class="badge {status_class}">{status}</span></td>
<td>{duration}s</td>
<td>{what_description}</td>
<td>
<strong>Input:</strong> {input_description}<br>
<strong>Expected:</strong> {expected_outcome}<br>
<strong>Actual:</strong> {actual_outcome}
<!-- If failed: -->
<div class="detail">{failure_traceback}</div>
</td>
</tr>
</tbody>
</table>
<h2>Failure Analysis</h2>
<!-- Only if there are failures -->
<p>For each failure, provide:</p>
<ul>
<li><strong>Root cause:</strong> Why it failed</li>
<li><strong>Impact:</strong> What this means for the system</li>
<li><strong>Suggested fix:</strong> How to address it</li>
</ul>
</body>
</html>
```
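Note that the CSS above contains literal braces, so `str.format` over the whole template would raise; substituting `{placeholder}` tokens with a per-key `replace` is safer. A sketch of the summary fill, where `fill` and `render_summary` are illustrative helpers and the per-test rows are substituted the same way:
```python
from datetime import datetime, timezone

def fill(template: str, values: dict) -> str:
    """Substitute {key} tokens one by one, leaving CSS braces intact."""
    for key, val in values.items():
        template = template.replace("{" + key + "}", str(val))
    return template

def render_summary(template: str, rows: list[dict], provider: str, duration: float) -> str:
    passed = sum(r["status"] == "PASS" for r in rows)
    failed = sum(r["status"] == "FAIL" for r in rows)
    return fill(template, {
        "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "provider": provider,
        "duration": f"{duration:.1f}",
        "total": len(rows),
        "passed": passed,
        "failed": failed,
        "verdict": "PASS" if failed == 0 else f"FAIL ({failed})",
        "verdict_class": "pass" if failed == 0 else "fail",
    })
```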
### Step 4: Output
1. Write the HTML file to `/tmp/hive_integration_test_report.html`
2. Print the file path so the user can open it
3. Print a concise summary to the terminal:
```
Test Report: /tmp/hive_integration_test_report.html
Result: 74/76 PASSED (2 failures)
Failures:
- parallel_merge::test_parallel_disjoint_output_keys
- worker::test_worker_timestamped_note_artifact
```
## Key Rules
1. ALWAYS use `--junitxml` when running pytest to get structured results
2. ALWAYS read the test source files to populate What/How/Why columns — do not guess
3. For Input/Expected/Actual, extract from the test's graph spec, assertions, and result
4. Color-code everything: green for pass, red for fail, amber for skip
5. Include the full traceback for failures in a scrollable `<div class="detail">`
6. Group tests by component (file name) with a visual separator (see the sketch after this list)
7. The report must be self-contained HTML (no external CSS/JS dependencies)
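For rule 6, grouping reduces to a sort plus one `component-header` row per file. A sketch over the row dicts from Step 2:
```python
from itertools import groupby

def grouped(rows: list[dict]):
    """Yield (component, tests) pairs so each group gets one separator <tr>."""
    ordered = sorted(rows, key=lambda r: r["component"])
    for component, tests in groupby(ordered, key=lambda r: r["component"]):
        yield component, list(tests)
```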
+128
@@ -7,6 +7,7 @@ Run via: cd core && uv run python tests/dummy_agents/run_all.py
from __future__ import annotations
import asyncio
import json
import os
from pathlib import Path
@@ -202,3 +203,130 @@ def make_executor(
    executor.execute = execute_with_timeout  # type: ignore[method-assign]
    return executor


# ── Artifact capture: raw output written to disk for every test ──────

ARTIFACTS_DIR = Path("/tmp/hive_test_artifacts")


class TestArtifact:
    """Collects raw output + expected behavior for a single test.

    Usage in tests:
        def test_foo(artifact, ...):
            result = await executor.execute(...)
            artifact.record(result, expected="path == ['a','b'], output['x'] == 'hello'")
    """

    def __init__(self, test_id: str):
        self.test_id = test_id
        self._data: dict = {"test_id": test_id, "raw_output": None, "expected": "", "checks": []}

    def record(self, result, *, expected: str = ""):
        """Record an ExecutionResult with expected behavior description."""
        self._data["expected"] = expected
        if result is None:
            self._data["raw_output"] = None
            return
        self._data["raw_output"] = {
            "success": getattr(result, "success", None),
            "output": _safe_serialize(getattr(result, "output", {})),
            "error": getattr(result, "error", None),
            "path": getattr(result, "path", []),
            "steps_executed": getattr(result, "steps_executed", 0),
            "total_tokens": getattr(result, "total_tokens", 0),
            "total_latency_ms": getattr(result, "total_latency_ms", 0),
            "execution_quality": getattr(result, "execution_quality", ""),
            "total_retries": getattr(result, "total_retries", 0),
            "node_visit_counts": getattr(result, "node_visit_counts", {}),
            "nodes_with_failures": getattr(result, "nodes_with_failures", []),
            "session_state_buffer": _safe_serialize(
                (getattr(result, "session_state", {}) or {}).get("data_buffer", {})
            ),
        }

    def record_value(self, key: str, value, *, expected: str = ""):
        """Record an arbitrary key-value (for non-ExecutionResult tests)."""
        self._data.setdefault("values", {})[key] = _safe_serialize(value)
        if expected:
            self._data["expected"] = expected

    def check(self, description: str, passed: bool, actual: str = "", expected_val: str = ""):
        """Record an individual assertion check."""
        self._data["checks"].append({
            "description": description,
            "passed": passed,
            "actual": actual,
            "expected": expected_val,
        })

    def save(self):
        """Write artifact to disk."""
        ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
        safe_name = self.test_id.replace("::", "__").replace("/", "_")
        path = ARTIFACTS_DIR / f"{safe_name}.json"
        with open(path, "w") as f:
            json.dump(self._data, f, indent=2, default=str)


def _safe_serialize(obj):
    """Convert to JSON-safe types."""
    if obj is None:
        return None
    if isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {str(k): _safe_serialize(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_safe_serialize(v) for v in obj]
    return str(obj)[:500]


@pytest.fixture
def artifact(request):
    """Fixture that captures raw test output to disk.

    Every test gets an artifact recorder. Call artifact.record(result)
    and artifact.check("description", passed, actual, expected) to
    capture data. Saved automatically on teardown.
    """
    test_id = request.node.nodeid
    art = TestArtifact(test_id)
    yield art
    art.save()


# Autouse hook: for tests that DON'T use the artifact fixture,
# create a minimal artifact from pass/fail status.
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    outcome = yield
    rep = outcome.get_result()
    if rep.when == "call":
        item._test_report = rep


def pytest_runtest_teardown(item, nextitem):
    """Auto-save a minimal artifact for tests that didn't use the fixture."""
    report = getattr(item, "_test_report", None)
    if report is None:
        return
    # Check if the test already used the artifact fixture
    if "artifact" in item.fixturenames:
        return  # Already handled by fixture teardown
    ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
    safe_name = item.nodeid.replace("::", "__").replace("/", "_")
    path = ARTIFACTS_DIR / f"{safe_name}.json"
    data = {
        "test_id": item.nodeid,
        "raw_output": None,
        "expected": "",
        "checks": [],
        "auto_captured": True,
        "status": "PASS" if report.passed else ("FAIL" if report.failed else "SKIP"),
    }
    if report.failed and report.longreprtext:
        data["failure_text"] = report.longreprtext[:5000]
    with open(path, "w") as f:
        json.dump(data, f, indent=2, default=str)
@@ -1,4 +1,4 @@
"""Component tests: Continuous Conversation Mode — threading, buffer passing.
"""Component tests: Continuous Conversation Mode — threading, buffer.
Exercises conversation threading across nodes to verify that downstream
nodes receive context from upstream nodes in continuous mode.
@@ -15,12 +15,15 @@ from .conftest import make_executor
SET_OUTPUT_INSTRUCTION = (
"You MUST call the set_output tool to provide your answer. "
"Do not just write text — call set_output with the correct key and value."
"Do not just write text — call set_output with the correct "
"key and value."
)
def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:
"""Two-node pipeline: intake captures input, transform uppercases it."""
def _build_pipeline_graph(
conversation_mode: str = "continuous",
) -> GraphSpec:
"""Two-node pipeline: intake captures, transform uppercases."""
return GraphSpec(
id="continuous-pipeline",
goal_id="dummy",
@@ -37,8 +40,9 @@ def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:
input_keys=["raw"],
output_keys=["captured"],
system_prompt=(
"Read the 'raw' input value and call set_output with "
"key='captured' and the same value. " + SET_OUTPUT_INSTRUCTION
"Read the 'raw' input value and call "
"set_output with key='captured' and the "
"same value. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -49,9 +53,9 @@ def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:
input_keys=["value"],
output_keys=["result"],
system_prompt=(
"Read the 'value' input, convert it to UPPERCASE, "
"then call set_output with key='result' and the uppercased value. "
+ SET_OUTPUT_INSTRUCTION
"Read the 'value' input, convert it to "
"UPPERCASE, then call set_output with "
"key='result' and the uppercased value. " + SET_OUTPUT_INSTRUCTION
),
),
],
@@ -69,53 +73,141 @@ def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:
@pytest.mark.asyncio
async def test_continuous_pipeline_traverses(runtime, goal, llm_provider):
async def test_continuous_pipeline_traverses(runtime, goal, llm_provider, artifact):
"""Continuous mode pipeline should traverse both nodes."""
graph = _build_pipeline_graph(conversation_mode="continuous")
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(
graph, goal, {"raw": "hello"}, validate_graph=False
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 5},
)
result = await executor.execute(
graph,
goal,
{"raw": "hello"},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, path=['intake','transform'], output['result'] is set"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["intake", "transform"],
actual=str(result.path),
expected_val="['intake', 'transform']",
)
assert result.path == ["intake", "transform"]
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
@pytest.mark.asyncio
async def test_continuous_data_flows_through(runtime, goal, llm_provider):
"""Data from node 1's output should be available to node 2 via input_mapping."""
async def test_continuous_data_flows_through(runtime, goal, llm_provider, artifact):
"""Data from node 1's output should be available to node 2."""
graph = _build_pipeline_graph(conversation_mode="continuous")
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(
graph, goal, {"raw": "test_data"}, validate_graph=False
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 5},
)
result = await executor.execute(
graph,
goal,
{"raw": "test_data"},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, output['result'] is non-empty",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
# The transform node should have produced something based on the input
output_len = len(str(result.output["result"]))
artifact.check(
"output is non-empty",
output_len > 0,
actual=str(output_len),
expected_val=">0",
)
assert len(str(result.output["result"])) > 0
@pytest.mark.asyncio
async def test_isolated_pipeline_traverses(runtime, goal, llm_provider):
async def test_isolated_pipeline_traverses(runtime, goal, llm_provider, artifact):
"""Isolated mode pipeline should also traverse both nodes."""
graph = _build_pipeline_graph(conversation_mode="isolated")
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(
graph, goal, {"raw": "data"}, validate_graph=False
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 5},
)
result = await executor.execute(
graph,
goal,
{"raw": "data"},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, path=['intake','transform']",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["intake", "transform"],
actual=str(result.path),
expected_val="['intake', 'transform']",
)
assert result.path == ["intake", "transform"]
@pytest.mark.asyncio
async def test_continuous_three_node_chain(runtime, goal, llm_provider):
"""Three-node continuous pipeline should thread conversation end-to-end."""
async def test_continuous_three_node_chain(runtime, goal, llm_provider, artifact):
"""Three-node continuous pipeline should thread end-to-end."""
graph = GraphSpec(
id="three-node-chain",
goal_id="dummy",
@@ -132,8 +224,8 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
input_keys=["input"],
output_keys=["a_out"],
system_prompt=(
"Read the 'input' value and call set_output with "
"key='a_out' and the same value. " + SET_OUTPUT_INSTRUCTION
"Read the 'input' value and call set_output "
"with key='a_out' and the same value. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -144,9 +236,9 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
input_keys=["b_in"],
output_keys=["b_out"],
system_prompt=(
"Read the 'b_in' value and call set_output with "
"key='b_out' and value='processed_' followed by the input. "
+ SET_OUTPUT_INSTRUCTION
"Read the 'b_in' value and call set_output "
"with key='b_out' and value='processed_' "
"followed by the input. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -157,8 +249,8 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
input_keys=["c_in"],
output_keys=["result"],
system_prompt=(
"Read the 'c_in' value and call set_output with "
"key='result' and the same value. " + SET_OUTPUT_INSTRUCTION
"Read the 'c_in' value and call set_output "
"with key='result' and the same value. " + SET_OUTPUT_INSTRUCTION
),
),
],
@@ -178,14 +270,60 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
input_mapping={"c_in": "b_out"},
),
],
memory_keys=["input", "a_out", "b_in", "b_out", "c_in", "result"],
memory_keys=[
"input",
"a_out",
"b_in",
"b_out",
"c_in",
"result",
],
)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 5},
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(
graph, goal, {"input": "payload"}, validate_graph=False
graph,
goal,
{"input": "payload"},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, path=['a','b','c'], steps=3, output['result'] is set"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["a", "b", "c"],
actual=str(result.path),
expected_val="['a', 'b', 'c']",
)
assert result.path == ["a", "b", "c"]
artifact.check(
"steps_executed is 3",
result.steps_executed == 3,
actual=str(result.steps_executed),
expected_val="3",
)
assert result.steps_executed == 3
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
@@ -1,7 +1,7 @@
"""Component tests: Conversation Persistence — write-through, cursor, storage.
"""Component tests: Conversation Persistence — write-through, storage.
Exercises conversation persistence by running real LLM turns and verifying
that messages and state are written to disk correctly.
Exercises conversation persistence by running real LLM turns and
verifying that messages and state are written to disk correctly.
"""
from __future__ import annotations
@@ -31,8 +31,9 @@ def _build_echo_graph() -> GraphSpec:
input_keys=["input"],
output_keys=["output"],
system_prompt=(
"Read the 'input' value and immediately call set_output "
"with key='output' and the same value. Do not add any text."
"Read the 'input' value and immediately call "
"set_output with key='output' and the same "
"value. Do not add any text."
),
),
],
@@ -43,48 +44,113 @@ def _build_echo_graph() -> GraphSpec:
@pytest.mark.asyncio
async def test_conversation_persists_messages(runtime, goal, llm_provider, tmp_path):
async def test_conversation_persists_messages(runtime, goal, llm_provider, tmp_path, artifact):
"""After execution, conversation data should exist on disk."""
storage = tmp_path / "session"
graph = _build_echo_graph()
executor = make_executor(runtime, llm_provider, storage_path=storage)
executor = make_executor(
runtime,
llm_provider,
storage_path=storage,
)
result = await executor.execute(
graph, goal, {"input": "hello"}, validate_graph=False
graph,
goal,
{"input": "hello"},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, conversations/ dir exists with data files"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
# Verify conversation directory was created with content
conv_dir = storage / "conversations"
artifact.check(
"conversations/ dir exists",
conv_dir.exists(),
actual=str(conv_dir.exists()),
expected_val="True",
)
assert conv_dir.exists(), "conversations/ directory should exist"
# Should have at least one file (messages or cursor)
all_files = list(conv_dir.rglob("*"))
data_files = [f for f in all_files if f.is_file()]
artifact.check(
"at least one data file",
len(data_files) > 0,
actual=str(len(data_files)),
expected_val=">0",
)
assert len(data_files) > 0, "Should have persisted at least one conversation file"
@pytest.mark.asyncio
async def test_conversation_output_matches_execution(
runtime, goal, llm_provider, tmp_path
runtime, goal, llm_provider, tmp_path, artifact
):
"""ExecutionResult output should be consistent with what the node produced."""
"""ExecutionResult output should be consistent with the node."""
storage = tmp_path / "session"
graph = _build_echo_graph()
executor = make_executor(runtime, llm_provider, storage_path=storage)
executor = make_executor(
runtime,
llm_provider,
storage_path=storage,
)
result = await executor.execute(
graph, goal, {"input": "test_value"}, validate_graph=False
graph,
goal,
{"input": "test_value"},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, output['output'] is non-empty",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("output")
artifact.check(
"output['output'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("output") is not None
# The echo node should produce some non-empty output
output_len = len(str(result.output["output"]))
artifact.check(
"output is non-empty",
output_len > 0,
actual=str(output_len),
expected_val=">0",
)
assert len(str(result.output["output"])) > 0
@pytest.mark.asyncio
async def test_conversation_multi_node_persistence(
runtime, goal, llm_provider, tmp_path
):
async def test_conversation_multi_node_persistence(runtime, goal, llm_provider, tmp_path, artifact):
"""Multi-node graph should persist conversation data for each node."""
from framework.graph.edge import EdgeCondition, EdgeSpec
@@ -104,8 +170,8 @@ async def test_conversation_multi_node_persistence(
node_type="event_loop",
output_keys=["intermediate"],
system_prompt=(
"Call set_output with key='intermediate' and value='step1_done'. "
"Do not write text."
"Call set_output with key='intermediate' "
"and value='step1_done'. Do not write text."
),
),
NodeSpec(
@@ -116,8 +182,7 @@ async def test_conversation_multi_node_persistence(
input_keys=["intermediate"],
output_keys=["result"],
system_prompt=(
"Call set_output with key='result' and value='step2_done'. "
"Do not write text."
"Call set_output with key='result' and value='step2_done'. Do not write text."
),
),
],
@@ -132,12 +197,45 @@ async def test_conversation_multi_node_persistence(
],
memory_keys=["intermediate", "result"],
)
executor = make_executor(runtime, llm_provider, storage_path=storage)
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
storage_path=storage,
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, path=['step1','step2'], conversations/ dir exists"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["step1", "step2"],
actual=str(result.path),
expected_val="['step1', 'step2']",
)
assert result.path == ["step1", "step2"]
# Both nodes should have written conversation data
conv_dir = storage / "conversations"
artifact.check(
"conversations/ dir exists",
conv_dir.exists(),
actual=str(conv_dir.exists()),
expected_val="True",
)
assert conv_dir.exists()
+97 -23
@@ -15,12 +15,13 @@ from .conftest import make_executor
SET_OUTPUT_INSTRUCTION = (
"You MUST call the set_output tool to provide your answer. "
"Do not just write text — call set_output with the correct key and value."
"Do not just write text — call set_output with the correct "
"key and value."
)
@pytest.mark.asyncio
async def test_edge_conditional_true_path(runtime, goal, llm_provider):
async def test_edge_conditional_true_path(runtime, goal, llm_provider, artifact):
"""Conditional edge with True expression should be traversed."""
graph = GraphSpec(
id="cond-true",
@@ -37,8 +38,7 @@ async def test_edge_conditional_true_path(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["label"],
system_prompt=(
"Call set_output with key='label' and value='yes'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='label' and value='yes'. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -48,8 +48,8 @@ async def test_edge_conditional_true_path(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"Call set_output with key='result' and value='reached'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='result' and "
"value='reached'. " + SET_OUTPUT_INSTRUCTION
),
),
],
@@ -64,15 +64,41 @@ async def test_edge_conditional_true_path(runtime, goal, llm_provider):
],
memory_keys=["label", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 3},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, path=['source','target']",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["source", "target"],
actual=str(result.path),
expected_val="['source', 'target']",
)
assert result.path == ["source", "target"]
@pytest.mark.asyncio
async def test_edge_conditional_false_path(runtime, goal, llm_provider):
async def test_edge_conditional_false_path(runtime, goal, llm_provider, artifact):
"""Conditional edge with False expression should NOT be traversed."""
graph = GraphSpec(
id="cond-false",
@@ -89,8 +115,7 @@ async def test_edge_conditional_false_path(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["label"],
system_prompt=(
"Call set_output with key='label' and value='no'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='label' and value='no'. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -99,7 +124,7 @@ async def test_edge_conditional_false_path(runtime, goal, llm_provider):
description="Should not be reached",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='bad'.",
system_prompt=("Call set_output with key='result' and value='bad'."),
),
],
edges=[
@@ -113,15 +138,41 @@ async def test_edge_conditional_false_path(runtime, goal, llm_provider):
],
memory_keys=["label", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 3},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, 'target' not in path",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"target not in path",
"target" not in result.path,
actual=str(result.path),
expected_val="path without 'target'",
)
assert "target" not in result.path
@pytest.mark.asyncio
async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
async def test_edge_priority_selects_higher(runtime, goal, llm_provider, artifact):
"""When multiple conditional edges match, higher priority wins."""
graph = GraphSpec(
id="priority-test",
@@ -138,8 +189,7 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["value"],
system_prompt=(
"Call set_output with key='value' and value='match'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='value' and value='match'. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -149,8 +199,7 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"Call set_output with key='result' and value='HIGH'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='result' and value='HIGH'. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -160,8 +209,7 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"Call set_output with key='result' and value='LOW'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='result' and value='LOW'. " + SET_OUTPUT_INSTRUCTION
),
),
],
@@ -185,8 +233,34 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
],
memory_keys=["value", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 3},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, path=['source','high']",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["source", "high"],
actual=str(result.path),
expected_val="['source', 'high']",
)
assert result.path == ["source", "high"]
@@ -16,7 +16,7 @@ from .conftest import make_executor
@pytest.mark.asyncio
async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp_path):
async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp_path, artifact):
"""Worker LLM should call the escalate tool when instructed.
After calling escalate, the worker blocks waiting for queen input.
@@ -40,8 +40,9 @@ async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"You MUST immediately call the escalate tool with "
"reason='need human approval for deployment'. "
"You MUST immediately call the escalate tool "
"with reason='need human approval for "
"deployment'. "
"Do not call set_output. Do not write text."
),
),
@@ -74,17 +75,34 @@ async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp
# Worker will block after escalate. Short timeout is fine.
try:
await _asyncio.wait_for(
executor.execute(graph, goal, {}, validate_graph=False),
executor.execute(
graph,
goal,
{},
validate_graph=False,
),
timeout=30,
)
except (TimeoutError, _asyncio.TimeoutError):
pass # Expected: worker hangs waiting for queen
artifact.record_value(
"escalation_count",
len(escalations),
expected=">=1 ESCALATION_REQUESTED event emitted",
)
artifact.check(
"escalation event emitted",
len(escalations) >= 1,
actual=str(len(escalations)),
expected_val=">=1",
)
assert len(escalations) >= 1, "No ESCALATION_REQUESTED event emitted"
@pytest.mark.asyncio
async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path):
async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path, artifact):
"""Worker that escalates should still terminate (not hang forever)."""
graph = GraphSpec(
id="escalate-terminate",
@@ -100,8 +118,10 @@ async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path)
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"Call the escalate tool with reason='blocked on credentials'. "
"Then call set_output with key='result' and value='escalated'."
"Call the escalate tool with "
"reason='blocked on credentials'. "
"Then call set_output with key='result' "
"and value='escalated'."
),
),
],
@@ -115,6 +135,21 @@ async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path)
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
)
# Should terminate within timeout (make_executor wraps with asyncio.wait_for)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="steps_executed=1 (terminates, does not hang)",
)
artifact.check(
"steps_executed is 1",
result.steps_executed == 1,
actual=str(result.steps_executed),
expected_val="1",
)
assert result.steps_executed == 1
@@ -1,4 +1,4 @@
"""Component tests: EventLoopNode — iteration limits, output accumulation, stall safety.
"""Component tests: EventLoopNode — iteration limits, output, stall safety.
Exercises the core multi-turn LLM loop through single-node graphs with
real LLM calls to verify iteration control and termination behavior.
@@ -15,8 +15,8 @@ from .conftest import make_executor
@pytest.mark.asyncio
async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider):
"""LLM calls set_output on first turn — node should terminate with output."""
async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider, artifact):
"""LLM calls set_output on first turn — node terminates with output."""
graph = GraphSpec(
id="single-turn",
goal_id="dummy",
@@ -31,7 +31,8 @@ async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"Call set_output with key='result' and value='done'. "
"Call set_output with key='result' and "
"value='done'. "
"Do not write any text. Just call the tool."
),
),
@@ -40,19 +41,51 @@ async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider):
memory_keys=["result"],
conversation_mode="continuous",
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 3},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, output['result'] set, steps=1",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
artifact.check(
"steps_executed is 1",
result.steps_executed == 1,
actual=str(result.steps_executed),
expected_val="1",
)
assert result.steps_executed == 1
@pytest.mark.asyncio
async def test_event_loop_multi_turn_tool_use(
runtime, goal, llm_provider, tool_registry
):
"""LLM calls a tool, gets result, then calls set_output — multi-turn flow."""
async def test_event_loop_multi_turn_tool_use(runtime, goal, llm_provider, tool_registry, artifact):
"""LLM calls a tool, gets result, then calls set_output."""
graph = GraphSpec(
id="multi-turn",
goal_id="dummy",
@@ -68,9 +101,10 @@ async def test_event_loop_multi_turn_tool_use(
output_keys=["result"],
tools=["get_current_time"],
system_prompt=(
"First call get_current_time with timezone='UTC'. "
"Then call set_output with key='result' and the day_of_week "
"from the tool response."
"First call get_current_time with "
"timezone='UTC'. "
"Then call set_output with key='result' and "
"the day_of_week from the tool response."
),
),
],
@@ -79,18 +113,42 @@ async def test_event_loop_multi_turn_tool_use(
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, output['result'] is day_of_week",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
@pytest.mark.asyncio
async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider):
async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider, artifact):
"""Node must terminate after max_iterations even without set_output."""
graph = GraphSpec(
id="stuck-node",
@@ -106,8 +164,7 @@ async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"You are thinking deeply. Respond with a short thought. "
"Never call set_output."
"You are thinking deeply. Respond with a short thought. Never call set_output."
),
max_tokens=32,
),
@@ -116,15 +173,34 @@ async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider):
memory_keys=["result"],
conversation_mode="continuous",
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 3},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="terminates (not hang), steps_executed=1",
)
# Should terminate (not hang) — the node was visited
artifact.check(
"steps_executed is 1",
result.steps_executed == 1,
actual=str(result.steps_executed),
expected_val="1",
)
assert result.steps_executed == 1
@pytest.mark.asyncio
async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider):
async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider, artifact):
"""LLM should be able to set multiple output keys in a single node."""
graph = GraphSpec(
id="multi-output",
@@ -142,7 +218,8 @@ async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider):
system_prompt=(
"Call set_output twice: "
"first with key='name' and value='Alice', "
"then with key='greeting' and value='Hello Alice'. "
"then with key='greeting' and "
"value='Hello Alice'. "
"Do not write any text."
),
),
@@ -151,9 +228,44 @@ async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider):
memory_keys=["name", "greeting"],
conversation_mode="continuous",
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 5},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, output['name'] and output['greeting'] are set"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_name = result.output.get("name")
artifact.check(
"output['name'] is set",
actual_name is not None,
actual=repr(actual_name),
expected_val="non-None value",
)
assert result.output.get("name") is not None
actual_greeting = result.output.get("greeting")
artifact.check(
"output['greeting'] is set",
actual_greeting is not None,
actual=repr(actual_greeting),
expected_val="non-None value",
)
assert result.output.get("greeting") is not None
+123 -9
@@ -15,18 +15,39 @@ from framework.llm.stream_events import FinishEvent, TextDeltaEvent, ToolCallEve
@pytest.mark.asyncio
async def test_llm_acomplete_returns_content(llm_provider):
async def test_llm_acomplete_returns_content(llm_provider, artifact):
"""acomplete() should return a non-empty LLMResponse."""
result = await llm_provider.acomplete(
messages=[{"role": "user", "content": "Reply with exactly: OK"}],
max_tokens=16,
)
artifact.record_value(
"result_type",
type(result).__name__,
expected="LLMResponse with non-empty content",
)
artifact.record_value("content", result.content)
artifact.check(
"result is LLMResponse",
isinstance(result, LLMResponse),
actual=type(result).__name__,
expected_val="LLMResponse",
)
assert isinstance(result, LLMResponse)
content_ok = result.content and result.content.strip()
artifact.check(
"content is non-empty",
bool(content_ok),
actual=repr(result.content),
expected_val="non-empty string",
)
assert result.content and result.content.strip()
@pytest.mark.asyncio
async def test_llm_stream_yields_text_delta(llm_provider):
async def test_llm_stream_yields_text_delta(llm_provider, artifact):
"""stream() should yield at least one TextDeltaEvent and a FinishEvent."""
text_deltas = []
finish_events = []
@@ -39,12 +60,32 @@ async def test_llm_stream_yields_text_delta(llm_provider):
elif isinstance(event, FinishEvent):
finish_events.append(event)
artifact.record_value(
"text_delta_count",
len(text_deltas),
expected=">=1 TextDeltaEvent and exactly 1 FinishEvent",
)
artifact.record_value("finish_event_count", len(finish_events))
artifact.check(
"at least one TextDeltaEvent",
len(text_deltas) >= 1,
actual=str(len(text_deltas)),
expected_val=">=1",
)
assert len(text_deltas) >= 1, "Expected at least one TextDeltaEvent"
artifact.check(
"exactly one FinishEvent",
len(finish_events) == 1,
actual=str(len(finish_events)),
expected_val="1",
)
assert len(finish_events) == 1, "Expected exactly one FinishEvent"
@pytest.mark.asyncio
async def test_llm_stream_tool_call(llm_provider):
async def test_llm_stream_tool_call(llm_provider, artifact):
"""stream() with a tool definition should produce a ToolCallEvent."""
tool = Tool(
name="record_result",
@@ -52,7 +93,10 @@ async def test_llm_stream_tool_call(llm_provider):
parameters={
"type": "object",
"properties": {
"value": {"type": "string", "description": "The result to record."},
"value": {
"type": "string",
"description": "The result to record.",
},
},
"required": ["value"],
},
@@ -63,7 +107,8 @@ async def test_llm_stream_tool_call(llm_provider):
{
"role": "user",
"content": (
"Call the record_result tool exactly once with value='OK'. "
"Call the record_result tool exactly once "
"with value='OK'. "
"Do not answer with plain text."
),
}
@@ -74,30 +119,79 @@ async def test_llm_stream_tool_call(llm_provider):
events.append(event)
tool_calls = [e for e in events if isinstance(e, ToolCallEvent)]
artifact.record_value(
"tool_call_count",
len(tool_calls),
expected=">=1 ToolCallEvent, tool_name='record_result'",
)
artifact.record_value(
"tool_names",
[tc.tool_name for tc in tool_calls],
)
artifact.check(
"LLM called record_result",
len(tool_calls) >= 1,
actual=str(len(tool_calls)),
expected_val=">=1",
)
assert len(tool_calls) >= 1, "LLM should have called record_result"
artifact.check(
"tool_name is record_result",
tool_calls[0].tool_name == "record_result",
actual=tool_calls[0].tool_name,
expected_val="record_result",
)
assert tool_calls[0].tool_name == "record_result"
@pytest.mark.asyncio
async def test_llm_token_counts_populated(llm_provider):
async def test_llm_token_counts_populated(llm_provider, artifact):
"""LLMResponse should have positive input_tokens and output_tokens."""
result = await llm_provider.acomplete(
messages=[{"role": "user", "content": "Reply OK."}],
max_tokens=16,
)
artifact.record_value(
"input_tokens",
result.input_tokens,
expected="positive input_tokens and output_tokens",
)
artifact.record_value("output_tokens", result.output_tokens)
artifact.check(
"input_tokens positive",
result.input_tokens > 0,
actual=str(result.input_tokens),
expected_val=">0",
)
assert result.input_tokens > 0, "input_tokens should be positive"
artifact.check(
"output_tokens positive",
result.output_tokens > 0,
actual=str(result.output_tokens),
expected_val=">0",
)
assert result.output_tokens > 0, "output_tokens should be positive"
@pytest.mark.asyncio
async def test_llm_json_mode(llm_provider):
"""acomplete(json_mode=True) should return parseable JSON when supported."""
async def test_llm_json_mode(llm_provider, artifact):
"""acomplete(json_mode=True) should return parseable JSON."""
try:
result = await llm_provider.acomplete(
messages=[
{
"role": "user",
"content": 'Return a JSON object with key "status" and value "ok". Output only valid JSON, no other text.',
"content": (
'Return a JSON object with key "status" '
'and value "ok". Output only valid JSON, '
"no other text."
),
}
],
max_tokens=64,
@@ -110,6 +204,26 @@ async def test_llm_json_mode(llm_provider):
if not content:
pytest.skip("Provider returned empty content for json_mode request")
artifact.record_value(
"content",
content,
expected="parseable JSON dict with 'status' key",
)
parsed = json.loads(content)
artifact.check(
"parsed is dict",
isinstance(parsed, dict),
actual=type(parsed).__name__,
expected_val="dict",
)
assert isinstance(parsed, dict)
artifact.check(
"'status' key present",
"status" in parsed,
actual=str(list(parsed.keys())),
expected_val="contains 'status'",
)
assert "status" in parsed
+163 -8
@@ -16,15 +16,24 @@ def _make_tools(*names: str) -> list[Tool]:
return [Tool(name=n, description=f"Tool {n}", parameters={}) for n in names]
def test_queen_phase_state_initial_phase():
def test_queen_phase_state_initial_phase(artifact):
"""QueenPhaseState should default to 'building' phase."""
from framework.tools.queen_lifecycle_tools import QueenPhaseState
state = QueenPhaseState()
artifact.record_value("phase", state.phase, expected="default phase == 'building'")
artifact.check(
"default phase is building",
state.phase == "building",
actual=repr(state.phase),
expected_val="'building'",
)
assert state.phase == "building"
def test_queen_phase_state_planning_tools():
def test_queen_phase_state_planning_tools(artifact):
"""Planning phase should return planning_tools."""
from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -34,11 +43,31 @@ def test_queen_phase_state_planning_tools():
tools = state.get_current_tools()
tool_names = {t.name for t in tools}
artifact.record_value(
"tool_names",
sorted(tool_names),
expected="planning tools include list_agent_tools, exclude edit_file",
)
artifact.check(
"list_agent_tools in tools",
"list_agent_tools" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'list_agent_tools'",
)
assert "list_agent_tools" in tool_names
artifact.check(
"edit_file not in tools",
"edit_file" not in tool_names,
actual=str(sorted(tool_names)),
expected_val="does not contain 'edit_file'",
)
assert "edit_file" not in tool_names
def test_queen_phase_state_building_tools():
def test_queen_phase_state_building_tools(artifact):
"""Building phase should return building_tools."""
from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -48,11 +77,31 @@ def test_queen_phase_state_building_tools():
tools = state.get_current_tools()
tool_names = {t.name for t in tools}
artifact.record_value(
"tool_names",
sorted(tool_names),
expected="building tools include edit_file, exclude list_agent_tools",
)
artifact.check(
"edit_file in tools",
"edit_file" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'edit_file'",
)
assert "edit_file" in tool_names
artifact.check(
"list_agent_tools not in tools",
"list_agent_tools" not in tool_names,
actual=str(sorted(tool_names)),
expected_val="does not contain 'list_agent_tools'",
)
assert "list_agent_tools" not in tool_names
def test_queen_phase_state_tool_switching():
def test_queen_phase_state_tool_switching(artifact):
"""Switching phase should change which tools are returned."""
from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -62,33 +111,96 @@ def test_queen_phase_state_tool_switching():
state.staging_tools = _make_tools("c")
state.running_tools = _make_tools("d")
planning_tool = state.get_current_tools()[0].name
artifact.check(
"planning returns tool 'a'",
planning_tool == "a",
actual=repr(planning_tool),
expected_val="'a'",
)
assert state.get_current_tools()[0].name == "a"
state.phase = "building"
building_tool = state.get_current_tools()[0].name
artifact.check(
"building returns tool 'b'",
building_tool == "b",
actual=repr(building_tool),
expected_val="'b'",
)
assert state.get_current_tools()[0].name == "b"
state.phase = "staging"
staging_tool = state.get_current_tools()[0].name
artifact.check(
"staging returns tool 'c'",
staging_tool == "c",
actual=repr(staging_tool),
expected_val="'c'",
)
assert state.get_current_tools()[0].name == "c"
state.phase = "running"
running_tool = state.get_current_tools()[0].name
artifact.check(
"running returns tool 'd'",
running_tool == "d",
actual=repr(running_tool),
expected_val="'d'",
)
assert state.get_current_tools()[0].name == "d"
artifact.record_value(
"tool_per_phase",
{"planning": "a", "building": "b", "staging": "c", "running": "d"},
expected="each phase returns its own tool",
)
def test_queen_initial_phase_no_worker():
def test_queen_initial_phase_no_worker(artifact):
"""Without a worker identity, queen should start in 'planning'."""
# This tests the logic in queen_orchestrator.py line 106:
# initial_phase = "staging" if worker_identity else "planning"
worker_identity = None
initial_phase = "staging" if worker_identity else "planning"
artifact.record_value(
"initial_phase",
initial_phase,
expected="'planning' when worker_identity is None",
)
artifact.check(
"initial phase is planning",
initial_phase == "planning",
actual=repr(initial_phase),
expected_val="'planning'",
)
assert initial_phase == "planning"
def test_queen_initial_phase_with_worker():
def test_queen_initial_phase_with_worker(artifact):
"""With a worker identity, queen should start in 'staging'."""
worker_identity = "my_agent"
initial_phase = "staging" if worker_identity else "planning"
artifact.record_value(
"initial_phase",
initial_phase,
expected="'staging' when worker_identity is set",
)
artifact.check(
"initial phase is staging",
initial_phase == "staging",
actual=repr(initial_phase),
expected_val="'staging'",
)
assert initial_phase == "staging"
@pytest.mark.asyncio
async def test_queen_phase_switch_emits_event():
async def test_queen_phase_switch_emits_event(artifact):
"""Phase transition should emit QUEEN_PHASE_CHANGED event."""
from framework.runtime.event_bus import EventBus, EventType
from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -110,12 +222,36 @@ async def test_queen_phase_switch_emits_event():
await state.switch_to_building(source="tool")
artifact.record_value("phase", state.phase, expected="'building'")
artifact.record_value("event_count", len(phase_events))
artifact.check(
"phase is building",
state.phase == "building",
actual=repr(state.phase),
expected_val="'building'",
)
assert state.phase == "building"
artifact.check(
"at least 1 phase event",
len(phase_events) >= 1,
actual=str(len(phase_events)),
expected_val=">=1",
)
assert len(phase_events) >= 1
event_phase = phase_events[0].data.get("phase")
artifact.check(
"event reports building",
event_phase == "building",
actual=repr(event_phase),
expected_val="'building'",
)
assert phase_events[0].data.get("phase") == "building"
def test_queen_draft_graph_persists_across_turns():
def test_queen_draft_graph_persists_across_turns(artifact):
"""Draft graph stored on phase_state should survive phase changes."""
from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -126,5 +262,24 @@ def test_queen_draft_graph_persists_across_turns():
state.phase = "building"
# Draft should still be available
artifact.record_value(
"draft_graph",
state.draft_graph,
expected="draft_graph survives phase change, nodes=['a','b']",
)
artifact.check(
"draft_graph is not None",
state.draft_graph is not None,
actual=repr(state.draft_graph),
expected_val="non-None",
)
assert state.draft_graph is not None
artifact.check(
"draft has 2 nodes",
len(state.draft_graph["nodes"]) == 2,
actual=str(len(state.draft_graph["nodes"])),
expected_val="2",
)
assert len(state.draft_graph["nodes"]) == 2
@@ -0,0 +1,772 @@
"""Component tests: Queen Live Phase Switching — real LLM, real event bus.
Starts the actual queen via create_queen() with a real LLM provider and
verifies phase transitions, dynamic tool switching, prompt switching, and
event emission through the full queen lifecycle.
"""
from __future__ import annotations
import asyncio
import time
from dataclasses import dataclass, field
from pathlib import Path
from unittest.mock import MagicMock
import pytest
from framework.runtime.event_bus import AgentEvent, EventBus, EventType
from framework.server.session_manager import Session
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
QUEEN_STARTUP_TIMEOUT = 30 # seconds to wait for queen to initialize
QUEEN_RESPONSE_TIMEOUT = 60 # seconds to wait for queen to respond to a message
@dataclass
class PhaseCapture:
"""Captures QUEEN_PHASE_CHANGED events."""
phases: list[str] = field(default_factory=list)
events: list[AgentEvent] = field(default_factory=list)
_waiters: list[tuple[str, asyncio.Event]] = field(default_factory=list)
async def on_event(self, event: AgentEvent) -> None:
phase = event.data.get("phase", "")
self.phases.append(phase)
self.events.append(event)
# Wake any waiters for this phase
for target_phase, evt in self._waiters:
if phase == target_phase:
evt.set()
async def wait_for_phase(self, phase: str, timeout: float = 30) -> bool:
"""Wait until a specific phase change is observed."""
if phase in self.phases:
return True
evt = asyncio.Event()
self._waiters.append((phase, evt))
try:
await asyncio.wait_for(evt.wait(), timeout=timeout)
return True
except (TimeoutError, asyncio.TimeoutError):
return False
@dataclass
class TextCapture:
"""Captures LLM text deltas to verify queen is responding."""
chunks: list[str] = field(default_factory=list)
_has_text: asyncio.Event = field(default_factory=asyncio.Event)
async def on_event(self, event: AgentEvent) -> None:
text = event.data.get("content", "")
if text:
self.chunks.append(text)
self._has_text.set()
async def wait_for_text(self, timeout: float = 30) -> bool:
try:
await asyncio.wait_for(self._has_text.wait(), timeout=timeout)
return True
except (TimeoutError, asyncio.TimeoutError):
return False
@property
def full_text(self) -> str:
return "".join(self.chunks)
def _make_mock_session_manager() -> MagicMock:
"""Create a minimal mock SessionManager that satisfies create_queen()."""
mgr = MagicMock()
# _subscribe_worker_handoffs needs to exist but can be a no-op for tests
mgr._subscribe_worker_handoffs = MagicMock()
return mgr
async def _start_queen(
llm_provider,
tmp_path: Path,
*,
worker_identity: str | None = None,
initial_prompt: str | None = None,
) -> tuple[Session, asyncio.Task]:
"""Start a real queen and return (session, task)."""
from framework.server.queen_orchestrator import create_queen
event_bus = EventBus()
session = Session(
id=f"test_{int(time.time())}",
event_bus=event_bus,
llm=llm_provider,
loaded_at=time.time(),
)
queen_dir = tmp_path / "queen"
queen_dir.mkdir(parents=True, exist_ok=True)
mgr = _make_mock_session_manager()
task = await create_queen(
session=session,
session_manager=mgr,
worker_identity=worker_identity,
queen_dir=queen_dir,
initial_prompt=initial_prompt,
)
# Wait for queen to initialize (queen_executor is set inside the task)
for _ in range(QUEEN_STARTUP_TIMEOUT * 10):
if session.queen_executor is not None:
break
await asyncio.sleep(0.1)
assert session.queen_executor is not None, "Queen executor did not initialize"
assert session.phase_state is not None, "Phase state not set"
return session, task
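# The startup poll above can also be phrased as a reusable helper. This is an
# illustrative sketch with the same semantics (not wired into the tests below):
async def _wait_until(predicate, timeout: float, interval: float = 0.1) -> bool:
    """Poll `predicate` until it is truthy or `timeout` seconds elapse."""

    async def _poll() -> None:
        while not predicate():
            await asyncio.sleep(interval)

    try:
        await asyncio.wait_for(_poll(), timeout=timeout)
        return True
    except (TimeoutError, asyncio.TimeoutError):
        return False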
async def _shutdown_queen(session: Session, task: asyncio.Task) -> None:
"""Cleanly shut down the queen."""
# Signal the event loop node to stop
node = session.queen_executor.node_registry.get("queen") if session.queen_executor else None
if node and hasattr(node, "signal_shutdown"):
node.signal_shutdown()
# Cancel the task as backup
if not task.done():
task.cancel()
try:
await asyncio.wait_for(task, timeout=5)
except (asyncio.CancelledError, TimeoutError, asyncio.TimeoutError):
pass
# ---------------------------------------------------------------------------
# Tests: Initial Phase
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_starts_in_planning_without_worker(llm_provider, tmp_path, artifact):
"""Queen with no worker_identity must start in 'planning' phase."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
actual_phase = session.phase_state.phase
artifact.record_value(
"phase", actual_phase, expected="phase == 'planning' when no worker_identity"
)
artifact.check(
"phase is planning",
actual_phase == "planning",
actual=repr(actual_phase),
expected_val="'planning'",
)
assert session.phase_state.phase == "planning", (
f"Expected planning, got {session.phase_state.phase}"
)
finally:
await _shutdown_queen(session, task)
@pytest.mark.asyncio
async def test_queen_starts_in_staging_with_worker(llm_provider, tmp_path, artifact):
"""Queen with worker_identity must start in 'staging' phase."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity="test_agent",
initial_prompt="Hello",
)
try:
actual_phase = session.phase_state.phase
artifact.record_value(
"phase", actual_phase, expected="phase == 'staging' when worker_identity is set"
)
artifact.check(
"phase is staging",
actual_phase == "staging",
actual=repr(actual_phase),
expected_val="'staging'",
)
assert session.phase_state.phase == "staging", (
f"Expected staging, got {session.phase_state.phase}"
)
finally:
await _shutdown_queen(session, task)
# ---------------------------------------------------------------------------
# Tests: Tool Availability Per Phase
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_planning_tools_available(llm_provider, tmp_path, artifact):
"""In planning phase, planning tools must be returned by get_current_tools()."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
ps = session.phase_state
artifact.record_value(
"phase",
ps.phase,
expected="phase='planning', tools include list_agent_tools, exclude edit_file",
)
artifact.check(
"phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
tool_names = {t.name for t in ps.get_current_tools()}
artifact.record_value("tool_names", sorted(tool_names))
# Planning phase must have agent discovery tools
artifact.check(
"list_agent_tools in tools",
"list_agent_tools" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'list_agent_tools'",
)
assert "list_agent_tools" in tool_names, (
f"list_agent_tools missing from planning tools: {tool_names}"
)
# Planning phase must NOT have building-only tools
artifact.check(
"edit_file not in tools",
"edit_file" not in tool_names,
actual=str(sorted(tool_names)),
expected_val="does not contain 'edit_file'",
)
assert "edit_file" not in tool_names, (
f"edit_file should not be in planning tools: {tool_names}"
)
finally:
await _shutdown_queen(session, task)
@pytest.mark.asyncio
async def test_queen_tools_change_on_phase_switch(llm_provider, tmp_path, artifact):
"""Switching phase must change the tools returned by get_current_tools()."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
ps = session.phase_state
planning_tools = {t.name for t in ps.get_current_tools()}
artifact.record_value(
"planning_tools",
sorted(planning_tools),
expected="planning, building, and staging tool sets all differ",
)
# Switch to building
await ps.switch_to_building(source="test")
building_tools = {t.name for t in ps.get_current_tools()}
artifact.record_value("building_tools", sorted(building_tools))
artifact.check(
"planning != building tools",
planning_tools != building_tools,
actual=f"planning={sorted(planning_tools)}, building={sorted(building_tools)}",
expected_val="different sets",
)
assert planning_tools != building_tools, "Planning and building tools must differ"
# Switch to staging
await ps.switch_to_staging(source="test")
staging_tools = {t.name for t in ps.get_current_tools()}
artifact.record_value("staging_tools", sorted(staging_tools))
artifact.check(
"staging != building tools",
staging_tools != building_tools,
actual=f"staging={sorted(staging_tools)}, building={sorted(building_tools)}",
expected_val="different sets",
)
assert staging_tools != building_tools, "Building and staging tools must differ"
finally:
await _shutdown_queen(session, task)
# ---------------------------------------------------------------------------
# Tests: Prompt Switching
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_prompt_changes_on_phase_switch(llm_provider, tmp_path, artifact):
"""Switching phase must change the system prompt returned by get_current_prompt()."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
ps = session.phase_state
planning_prompt = ps.get_current_prompt()
artifact.record_value(
"planning_prompt_len",
len(planning_prompt),
expected="non-empty planning and building prompts that differ",
)
artifact.check(
"planning prompt non-empty",
len(planning_prompt) > 0,
actual=str(len(planning_prompt)),
expected_val=">0",
)
assert len(planning_prompt) > 0, "Planning prompt should not be empty"
await ps.switch_to_building(source="test")
building_prompt = ps.get_current_prompt()
artifact.record_value("building_prompt_len", len(building_prompt))
artifact.check(
"building prompt non-empty",
len(building_prompt) > 0,
actual=str(len(building_prompt)),
expected_val=">0",
)
assert len(building_prompt) > 0, "Building prompt should not be empty"
artifact.check(
"prompts differ",
planning_prompt != building_prompt,
actual=f"planning_len={len(planning_prompt)}, building_len={len(building_prompt)}",
expected_val="different prompts",
)
assert planning_prompt != building_prompt, "Planning and building prompts must differ"
finally:
await _shutdown_queen(session, task)
# ---------------------------------------------------------------------------
# Tests: Phase Change Events
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_emits_phase_change_events(llm_provider, tmp_path, artifact):
"""Each phase switch must emit a QUEEN_PHASE_CHANGED event."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
capture = PhaseCapture()
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=capture.on_event,
)
try:
ps = session.phase_state
# planning -> building
await ps.switch_to_building(source="test")
assert await capture.wait_for_phase("building", timeout=5)
# building -> staging
await ps.switch_to_staging(source="test")
assert await capture.wait_for_phase("staging", timeout=5)
# staging -> running
await ps.switch_to_running(source="test")
assert await capture.wait_for_phase("running", timeout=5)
# running -> planning
await ps.switch_to_planning(source="test")
assert await capture.wait_for_phase("planning", timeout=5)
artifact.record_value(
"phases", capture.phases, expected="['building', 'staging', 'running', 'planning']"
)
artifact.check(
"phase sequence matches",
capture.phases == ["building", "staging", "running", "planning"],
actual=str(capture.phases),
expected_val="['building', 'staging', 'running', 'planning']",
)
assert capture.phases == ["building", "staging", "running", "planning"], (
f"Phase sequence was: {capture.phases}"
)
finally:
await _shutdown_queen(session, task)
@pytest.mark.asyncio
async def test_queen_no_duplicate_phase_event_on_same_phase(llm_provider, tmp_path, artifact):
"""Switching to the same phase should NOT emit a duplicate event."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
capture = PhaseCapture()
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=capture.on_event,
)
try:
ps = session.phase_state
artifact.check(
"initial phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
# Switch to building twice
await ps.switch_to_building(source="test")
await asyncio.sleep(0.2)
await ps.switch_to_building(source="test") # no-op
await asyncio.sleep(0.2)
# Should only have one "building" event
building_events = [p for p in capture.phases if p == "building"]
artifact.record_value(
"building_event_count",
len(building_events),
expected="exactly 1 building event (no duplicate)",
)
artifact.record_value("all_phases", capture.phases)
artifact.check(
"only 1 building event",
len(building_events) == 1,
actual=str(len(building_events)),
expected_val="1",
)
assert len(building_events) == 1, (
f"Expected 1 building event, got {len(building_events)}: {capture.phases}"
)
finally:
await _shutdown_queen(session, task)
# ---------------------------------------------------------------------------
# Tests: Queen Responds in Correct Phase
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_responds_to_message(llm_provider, tmp_path, artifact):
"""Queen must produce an LLM turn when started with an initial prompt."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello, I want to build an agent.",
)
turn_complete = asyncio.Event()
async def _on_turn(event: AgentEvent) -> None:
turn_complete.set()
session.event_bus.subscribe(
event_types=[EventType.LLM_TURN_COMPLETE],
handler=_on_turn,
filter_stream="queen",
)
try:
# Queen should complete at least one LLM turn (text or tool call)
got_turn = False
try:
await asyncio.wait_for(turn_complete.wait(), timeout=QUEEN_RESPONSE_TIMEOUT)
got_turn = True
except (TimeoutError, asyncio.TimeoutError):
pass
artifact.record_value(
"got_turn", got_turn, expected="queen completes at least one LLM turn"
)
artifact.check(
"queen completed LLM turn", got_turn, actual=str(got_turn), expected_val="True"
)
assert got_turn, "Queen did not complete any LLM turn"
finally:
await _shutdown_queen(session, task)
@pytest.mark.asyncio
async def test_queen_responds_after_injected_message(llm_provider, tmp_path, artifact):
"""Injecting a user message must trigger a new queen LLM turn."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
# Wait for initial response to settle
first_turn = asyncio.Event()
async def _on_first_turn(event: AgentEvent) -> None:
first_turn.set()
sub_id = session.event_bus.subscribe(
event_types=[EventType.LLM_TURN_COMPLETE],
handler=_on_first_turn,
filter_stream="queen",
)
try:
await asyncio.wait_for(first_turn.wait(), timeout=QUEEN_RESPONSE_TIMEOUT)
except (TimeoutError, asyncio.TimeoutError):
pass
session.event_bus.unsubscribe(sub_id)
# Now inject a follow-up and listen for a new turn
second_turn = asyncio.Event()
async def _on_second_turn(event: AgentEvent) -> None:
second_turn.set()
session.event_bus.subscribe(
event_types=[EventType.LLM_TURN_COMPLETE],
handler=_on_second_turn,
filter_stream="queen",
)
node = session.queen_executor.node_registry.get("queen")
assert node is not None
await node.inject_event(
"What tools do you have available?",
is_client_input=True,
)
got_turn = False
try:
await asyncio.wait_for(second_turn.wait(), timeout=QUEEN_RESPONSE_TIMEOUT)
got_turn = True
except (TimeoutError, asyncio.TimeoutError):
pass
artifact.record_value(
"got_second_turn", got_turn, expected="queen responds to injected message"
)
artifact.check(
"queen responded to injected message",
got_turn,
actual=str(got_turn),
expected_val="True",
)
assert got_turn, "Queen did not respond to injected message"
finally:
await _shutdown_queen(session, task)
# ---------------------------------------------------------------------------
# Tests: Phase Transition Cycle
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_full_phase_cycle_with_events(llm_provider, tmp_path, artifact):
"""Walk through all 4 phases and verify state + events at each step."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
capture = PhaseCapture()
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=capture.on_event,
)
try:
ps = session.phase_state
# Start: planning
artifact.check(
"initial phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
planning_tools = {t.name for t in ps.get_current_tools()}
# -> building
await ps.switch_to_building(source="test")
artifact.check(
"phase is building",
ps.phase == "building",
actual=repr(ps.phase),
expected_val="'building'",
)
assert ps.phase == "building"
building_tools = {t.name for t in ps.get_current_tools()}
artifact.check(
"building tools differ from planning",
building_tools != planning_tools,
actual=f"building={sorted(building_tools)}",
expected_val="different from planning",
)
assert building_tools != planning_tools
# -> staging
await ps.switch_to_staging(source="test")
artifact.check(
"phase is staging",
ps.phase == "staging",
actual=repr(ps.phase),
expected_val="'staging'",
)
assert ps.phase == "staging"
staging_tools = {t.name for t in ps.get_current_tools()}
# -> running
await ps.switch_to_running(source="test")
artifact.check(
"phase is running",
ps.phase == "running",
actual=repr(ps.phase),
expected_val="'running'",
)
assert ps.phase == "running"
running_tools = {t.name for t in ps.get_current_tools()}
# -> back to planning
await ps.switch_to_planning(source="test")
artifact.check(
"phase is planning again",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
final_tools = {t.name for t in ps.get_current_tools()}
artifact.check(
"final tools match original planning set",
final_tools == planning_tools,
actual=f"final={sorted(final_tools)}",
expected_val=f"planning={sorted(planning_tools)}",
)
assert final_tools == planning_tools, "Tools should match original planning set"
# Verify events
await asyncio.sleep(0.3)
artifact.record_value(
"phase_events",
capture.phases,
expected="['building', 'staging', 'running', 'planning']",
)
artifact.check(
"phase event sequence",
capture.phases == ["building", "staging", "running", "planning"],
actual=str(capture.phases),
expected_val="['building', 'staging', 'running', 'planning']",
)
assert capture.phases == ["building", "staging", "running", "planning"]
# Verify all 4 phase tool sets are distinct
all_sets = [planning_tools, building_tools, staging_tools, running_tools]
for i, a in enumerate(all_sets):
for j, b in enumerate(all_sets):
if i != j:
phase_names = ["planning", "building", "staging", "running"]
artifact.check(
f"{phase_names[i]} != {phase_names[j]} tools",
a != b,
actual=f"{phase_names[i]}={sorted(a)}, {phase_names[j]}={sorted(b)}",
expected_val="different",
)
assert a != b, f"Phase tool sets {i} and {j} should differ but are identical"
finally:
await _shutdown_queen(session, task)
@pytest.mark.asyncio
async def test_queen_phase_state_persists_draft(llm_provider, tmp_path, artifact):
"""Draft graph on phase_state must survive phase transitions."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
ps = session.phase_state
ps.draft_graph = {"nodes": ["a", "b"], "edges": ["a->b"]}
await ps.switch_to_building(source="test")
artifact.check(
"draft survives building switch",
ps.draft_graph is not None,
actual=repr(ps.draft_graph),
expected_val="non-None",
)
assert ps.draft_graph is not None
artifact.check(
"draft nodes intact after building",
ps.draft_graph["nodes"] == ["a", "b"],
actual=str(ps.draft_graph["nodes"]),
expected_val="['a', 'b']",
)
assert ps.draft_graph["nodes"] == ["a", "b"]
await ps.switch_to_staging(source="test")
artifact.check(
"draft survives staging switch",
ps.draft_graph is not None,
actual=repr(ps.draft_graph),
expected_val="non-None",
)
assert ps.draft_graph is not None
await ps.switch_to_running(source="test")
artifact.check(
"draft survives running switch",
ps.draft_graph is not None,
actual=repr(ps.draft_graph),
expected_val="non-None",
)
assert ps.draft_graph is not None
artifact.record_value(
"final_draft_graph",
ps.draft_graph,
expected="draft_graph survives all phase transitions",
)
finally:
await _shutdown_queen(session, task)
@@ -0,0 +1,678 @@
"""Component tests: Queen State Machine Edge Cases.
Race conditions, invalid transitions, stale events.
These tests confirm real bugs and edge cases in the queen's phase
state machine:
- Non-atomic phase switch + event emission
- Stale worker completion events ignored during wrong phase
- No guards against invalid phase transitions
- Double phase switch deduplication
- inject_notification after executor teardown
- Empty tool lists per phase
- Phase persistence across rapid cycling
"""
from __future__ import annotations
import asyncio
import time
from pathlib import Path
from unittest.mock import MagicMock
import pytest
from framework.runtime.event_bus import AgentEvent, EventBus, EventType
from framework.server.session_manager import Session
from framework.tools.queen_lifecycle_tools import QueenPhaseState
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
QUEEN_STARTUP_TIMEOUT = 30
async def _start_queen_session(llm_provider, tmp_path, *, worker_identity=None):
"""Start a real queen and return (session, task)."""
from framework.server.queen_orchestrator import create_queen
event_bus = EventBus()
session = Session(
id=f"test_{int(time.time())}",
event_bus=event_bus,
llm=llm_provider,
loaded_at=time.time(),
)
queen_dir = tmp_path / "queen"
queen_dir.mkdir(parents=True, exist_ok=True)
mgr = MagicMock()
mgr._subscribe_worker_handoffs = MagicMock()
task = await create_queen(
session=session,
session_manager=mgr,
worker_identity=worker_identity,
queen_dir=queen_dir,
initial_prompt="Hello",
)
for _ in range(QUEEN_STARTUP_TIMEOUT * 10):
if session.queen_executor is not None:
break
await asyncio.sleep(0.1)
assert session.queen_executor is not None
return session, task
async def _shutdown(session, task):
node = session.queen_executor.node_registry.get("queen") if session.queen_executor else None
if node and hasattr(node, "signal_shutdown"):
node.signal_shutdown()
if not task.done():
task.cancel()
try:
await asyncio.wait_for(task, timeout=5)
except (asyncio.CancelledError, TimeoutError, asyncio.TimeoutError):
pass
# -----------------------------------------------------------------------
# BUG #1: Concurrent phase switches — no crash or lost events
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_concurrent_phase_switches_no_crash(llm_provider, tmp_path, artifact):
"""Firing multiple phase switches concurrently must not crash."""
session, task = await _start_queen_session(llm_provider, tmp_path)
phases_seen = []
async def _capture(event: AgentEvent):
phases_seen.append(event.data.get("phase"))
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
try:
ps = session.phase_state
# Fire 4 phase switches concurrently
await asyncio.gather(
ps.switch_to_building(source="test"),
ps.switch_to_staging(source="test"),
ps.switch_to_running(source="test"),
ps.switch_to_planning(source="test"),
)
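        # Note: asyncio.gather interleaves these four coroutines at await points
        # on a single event loop; "concurrent" here means task interleaving, not
        # parallel threads.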
await asyncio.sleep(0.3)
valid_phases = ("planning", "building", "staging", "running")
artifact.record_value(
"final_phase",
ps.phase,
expected="valid phase (not corrupted)",
)
artifact.record_value("phases_seen", phases_seen)
artifact.check(
"phase is valid",
ps.phase in valid_phases,
actual=repr(ps.phase),
expected_val="one of planning/building/staging/running",
)
assert ps.phase in valid_phases, f"Phase corrupted: {ps.phase}"
artifact.check(
"at least 1 phase event",
len(phases_seen) >= 1,
actual=str(len(phases_seen)),
expected_val=">=1",
)
assert len(phases_seen) >= 1, "No phase change events"
finally:
await _shutdown(session, task)
# -----------------------------------------------------------------------
# BUG #3: Non-atomic phase change + event
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_phase_changes_without_event_bus(artifact):
"""Phase must still change when event_bus is None (no crash)."""
ps = QueenPhaseState(phase="planning", event_bus=None)
await ps.switch_to_building(source="test")
artifact.record_value(
"phase",
ps.phase,
expected="'building' even without event bus",
)
artifact.check(
"phase changed to building",
ps.phase == "building",
actual=repr(ps.phase),
expected_val="'building'",
)
assert ps.phase == "building", "Phase should change even without event bus"
@pytest.mark.asyncio
async def test_phase_change_committed_before_event(artifact):
"""Phase assignment before event emission — verify both occur."""
bus = EventBus()
phases_at_event_time = []
async def _capture(event: AgentEvent):
phases_at_event_time.append(event.data.get("phase"))
bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
ps = QueenPhaseState(phase="planning", event_bus=bus)
await ps.switch_to_building(source="test")
await asyncio.sleep(0.1)
artifact.record_value(
"phase",
ps.phase,
expected="'building', event reports 'building'",
)
artifact.record_value(
"phases_at_event_time",
phases_at_event_time,
)
artifact.check(
"phase is building",
ps.phase == "building",
actual=repr(ps.phase),
expected_val="'building'",
)
assert ps.phase == "building"
artifact.check(
"event reports building",
phases_at_event_time == ["building"],
actual=str(phases_at_event_time),
expected_val="['building']",
)
assert phases_at_event_time == ["building"], (
f"Event should report 'building', got: {phases_at_event_time}"
)
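# Illustrative sketch only (an assumption, not the framework's implementation):
# a phase switch that commits state before emitting and dedupes same-phase
# switches, matching the ordering the two tests above pin down.
class _SketchPhaseState:
    def __init__(self, phase: str, event_bus: EventBus | None) -> None:
        self.phase = phase
        self.event_bus = event_bus

    async def switch_to(self, new_phase: str, *, source: str) -> None:
        if self.phase == new_phase:
            return  # same-phase switch is a no-op: no duplicate event
        self.phase = new_phase  # commit state first...
        if self.event_bus is not None:  # ...then emit, so handlers see new state
            await self.event_bus.publish(
                AgentEvent(
                    type=EventType.QUEEN_PHASE_CHANGED,
                    stream_id="queen",
                    data={"phase": new_phase, "source": source},
                )
            )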
# -----------------------------------------------------------------------
# BUG #4: Stale worker done events during non-running phase
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_worker_done_ignored_in_non_running_phase(llm_provider, tmp_path, artifact):
"""Worker completion in planning phase must be silently dropped.
This confirms BUG #4: the _on_worker_done handler only processes
events when phase == 'running'. Events in other phases are lost.
"""
session, task = await _start_queen_session(llm_provider, tmp_path)
phase_changes = []
async def _capture(event: AgentEvent):
phase_changes.append(event.data.get("phase"))
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
try:
ps = session.phase_state
artifact.check(
"initial phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
# Simulate a stale worker completion event
await session.event_bus.publish(
AgentEvent(
type=EventType.EXECUTION_COMPLETED,
stream_id="worker",
data={"output": {"result": "stale output"}},
)
)
await asyncio.sleep(0.5)
artifact.record_value(
"phase_after_stale_event",
ps.phase,
expected="still 'planning' (stale event ignored)",
)
artifact.record_value("phase_changes", phase_changes)
artifact.check(
"phase still planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning", f"Phase should still be planning, got: {ps.phase}"
artifact.check(
"no auto-switch to staging",
"staging" not in phase_changes,
actual=str(phase_changes),
expected_val="does not contain 'staging'",
)
assert "staging" not in phase_changes, (
"Should not auto-switch to staging from planning phase"
)
finally:
await _shutdown(session, task)
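# Sketch of the guard this test exercises (assumed shape; the real
# _on_worker_done handler is not shown in this file): worker completions only
# matter while the queen is in the running phase.
async def _sketch_on_worker_done(ps: QueenPhaseState, event: AgentEvent) -> None:
    if ps.phase != "running":
        return  # stale event from a non-running phase: silently dropped
    await ps.switch_to_staging(source="worker_done")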
# -----------------------------------------------------------------------
# BUG #10: No guards against invalid phase transitions
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_invalid_transition_planning_to_running(llm_provider, tmp_path, artifact):
"""planning -> running should succeed (no guard).
This confirms BUG #10: the state machine allows any transition.
"""
session, task = await _start_queen_session(llm_provider, tmp_path)
try:
ps = session.phase_state
artifact.check(
"initial phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
await ps.switch_to_running(source="test")
artifact.record_value(
"phase_after_invalid_transition",
ps.phase,
expected="'running' (no guard, transition allowed)",
)
artifact.check(
"phase is running",
ps.phase == "running",
actual=repr(ps.phase),
expected_val="'running'",
)
assert ps.phase == "running", "switch_to_running should succeed from planning"
finally:
await _shutdown(session, task)
@pytest.mark.asyncio
async def test_invalid_transition_running_to_building(llm_provider, tmp_path, artifact):
"""running -> building should succeed (no guard).
In production this could leave a running worker orphaned.
"""
session, task = await _start_queen_session(llm_provider, tmp_path)
try:
ps = session.phase_state
await ps.switch_to_running(source="test")
artifact.check(
"phase is running",
ps.phase == "running",
actual=repr(ps.phase),
expected_val="'running'",
)
assert ps.phase == "running"
await ps.switch_to_building(source="test")
artifact.record_value(
"phase_after_invalid_transition",
ps.phase,
expected="'building' (no guard)",
)
artifact.check(
"phase is building",
ps.phase == "building",
actual=repr(ps.phase),
expected_val="'building'",
)
assert ps.phase == "building"
finally:
await _shutdown(session, task)
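# One shape a transition guard could take if BUG #10 were fixed (an assumption;
# today any transition is allowed, as the two tests above confirm):
_ALLOWED_TRANSITIONS: dict[str, set[str]] = {
    "planning": {"building"},
    "building": {"staging", "planning"},
    "staging": {"running", "building"},
    "running": {"staging"},
}


def _transition_allowed(current: str, target: str) -> bool:
    """Return True only for whitelisted phase edges (illustrative policy)."""
    return target in _ALLOWED_TRANSITIONS.get(current, set())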
# -----------------------------------------------------------------------
# BUG #1 supplement: Double phase switch deduplication
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_double_switch_to_same_phase_is_noop(llm_provider, tmp_path, artifact):
"""switch_to_X when already in X must be a no-op (no event)."""
session, task = await _start_queen_session(llm_provider, tmp_path)
events = []
async def _capture(event: AgentEvent):
events.append(event.data.get("phase"))
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
try:
ps = session.phase_state
await ps.switch_to_building(source="test")
await asyncio.sleep(0.1)
count_after_first = len(events)
# Second call to same phase
await ps.switch_to_building(source="test")
await asyncio.sleep(0.1)
artifact.record_value(
"events_after_first",
count_after_first,
expected="no extra event after double switch",
)
artifact.record_value(
"events_after_second",
len(events),
)
artifact.record_value("all_events", events)
artifact.check(
"no extra event on double switch",
len(events) == count_after_first,
actual=f"first={count_after_first}, second={len(events)}",
expected_val="same count",
)
assert len(events) == count_after_first, (
f"Double switch should not emit extra event. Events: {events}"
)
finally:
await _shutdown(session, task)
# -----------------------------------------------------------------------
# BUG #6: Phase with empty tool lists
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_phase_with_empty_tools_returns_empty(llm_provider, tmp_path, artifact):
"""get_current_tools() with empty tool list returns [] not crash."""
session, task = await _start_queen_session(llm_provider, tmp_path)
try:
ps = session.phase_state
# Clear all running tools
ps.running_tools = []
await ps.switch_to_running(source="test")
tools = ps.get_current_tools()
artifact.record_value(
"tool_count",
len(tools),
expected="0 (empty list, no crash)",
)
artifact.record_value(
"tool_names",
[t.name for t in tools],
)
artifact.check(
"empty tools returns []",
tools == [],
actual=str([t.name for t in tools]),
expected_val="[]",
)
assert tools == [], f"Expected empty list, got: {[t.name for t in tools]}"
finally:
await _shutdown(session, task)
# -----------------------------------------------------------------------
# Rapid phase cycling — verify final state is consistent
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_rapid_phase_cycling_final_state(llm_provider, tmp_path, artifact):
"""Rapidly cycling through phases must leave state consistent."""
session, task = await _start_queen_session(llm_provider, tmp_path)
all_events = []
async def _capture(event: AgentEvent):
all_events.append(event.data.get("phase"))
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
try:
ps = session.phase_state
# Cycle 3 times
for _ in range(3):
await ps.switch_to_building(source="test")
await ps.switch_to_staging(source="test")
await ps.switch_to_running(source="test")
await ps.switch_to_planning(source="test")
await asyncio.sleep(0.3)
artifact.record_value(
"final_phase",
ps.phase,
expected="'planning' after 3 full cycles",
)
artifact.record_value("event_count", len(all_events))
artifact.record_value("all_events", all_events)
artifact.check(
"final phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning", f"Expected planning, got: {ps.phase}"
# Should have 12 phase change events (4 per cycle x 3)
artifact.check(
"12 phase events",
len(all_events) == 12,
actual=str(len(all_events)),
expected_val="12",
)
assert len(all_events) == 12, f"Expected 12 events, got {len(all_events)}: {all_events}"
# Tools and prompt should match planning phase
prompt = ps.get_current_prompt()
artifact.check(
"prompt non-empty after cycling",
len(prompt) > 0,
actual=str(len(prompt)),
expected_val=">0",
)
assert len(prompt) > 0, "Prompt should not be empty after cycling"
finally:
await _shutdown(session, task)
# -----------------------------------------------------------------------
# Tool availability is correct per phase (strict verification)
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_tool_sets_are_disjoint_across_phases(llm_provider, tmp_path, artifact):
"""Each phase must have a distinct non-empty tool set."""
session, task = await _start_queen_session(llm_provider, tmp_path)
try:
ps = session.phase_state
phase_tools = {}
for phase in ("planning", "building", "staging", "running"):
            # Direct phase assignment keeps this loop simple; get_current_tools()
            # is keyed off ps.phase, so awaiting switch_to_<phase> is not needed.
            ps.phase = phase
tools = {t.name for t in ps.get_current_tools()}
phase_tools[phase] = tools
# All phases should have at least 1 tool
for phase, tools in phase_tools.items():
artifact.check(
f"{phase} has tools",
len(tools) > 0,
actual=str(len(tools)),
expected_val=">0",
)
assert len(tools) > 0, f"{phase} has no tools"
artifact.record_value(
"phase_tools",
{k: sorted(v) for k, v in phase_tools.items()},
expected="all 4 phases have distinct tool sets",
)
# Pairwise comparison: all sets should differ
phases = list(phase_tools.keys())
for i in range(len(phases)):
for j in range(i + 1, len(phases)):
a, b = phases[i], phases[j]
artifact.check(
f"{a} != {b} tools",
phase_tools[a] != phase_tools[b],
actual=(f"{a}={sorted(phase_tools[a])}, {b}={sorted(phase_tools[b])}"),
expected_val="different",
)
assert phase_tools[a] != phase_tools[b], (
f"{a} and {b} have identical tools: {phase_tools[a]}"
)
finally:
await _shutdown(session, task)
# -----------------------------------------------------------------------
# Worker completion -> auto-staging transition
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_worker_completion_triggers_auto_staging(llm_provider, tmp_path, artifact):
"""EXECUTION_COMPLETED in running phase must auto-switch to staging."""
session, task = await _start_queen_session(llm_provider, tmp_path)
phase_changes = []
async def _capture(event: AgentEvent):
phase_changes.append(event.data.get("phase"))
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
try:
ps = session.phase_state
# Move to running phase
await ps.switch_to_running(source="test")
await asyncio.sleep(0.3)
phase_changes.clear() # Reset after manual switch
# Simulate worker completion event
await session.event_bus.publish(
AgentEvent(
type=EventType.EXECUTION_COMPLETED,
stream_id="worker",
data={"output": {"result": "done"}},
)
)
await asyncio.sleep(1.0)
artifact.record_value(
"phase_after_completion",
ps.phase,
expected="'staging' (auto-switch on completion)",
)
artifact.record_value("phase_changes", phase_changes)
artifact.check(
"auto-switched to staging",
ps.phase == "staging",
actual=repr(ps.phase),
expected_val="'staging'",
)
assert ps.phase == "staging", f"Expected auto-switch to staging, got: {ps.phase}"
artifact.check(
"staging event emitted",
"staging" in phase_changes,
actual=str(phase_changes),
expected_val="contains 'staging'",
)
assert "staging" in phase_changes, (
f"QUEEN_PHASE_CHANGED(staging) not emitted. Events: {phase_changes}"
)
finally:
await _shutdown(session, task)
@pytest.mark.asyncio
async def test_worker_failure_triggers_auto_staging(llm_provider, tmp_path, artifact):
"""EXECUTION_FAILED in running phase must auto-switch to staging."""
session, task = await _start_queen_session(llm_provider, tmp_path)
try:
ps = session.phase_state
await ps.switch_to_running(source="test")
await asyncio.sleep(0.3)
# Simulate worker failure event
await session.event_bus.publish(
AgentEvent(
type=EventType.EXECUTION_FAILED,
stream_id="worker",
data={"error": "worker crashed"},
)
)
await asyncio.sleep(1.0)
artifact.record_value(
"phase_after_failure",
ps.phase,
expected="'staging' (auto-switch on failure)",
)
artifact.check(
"auto-switched to staging on failure",
ps.phase == "staging",
actual=repr(ps.phase),
expected_val="'staging'",
)
assert ps.phase == "staging", f"Expected auto-switch to staging on failure, got: {ps.phase}"
finally:
await _shutdown(session, task)
@@ -27,7 +27,7 @@ SET_OUTPUT = (
@pytest.mark.asyncio
async def test_strict_echo_exact_path_and_steps(runtime, goal, llm_provider, artifact):
"""Echo node: path must be exactly ['echo'], steps must be 1."""
graph = GraphSpec(
id="strict-echo",
@@ -54,21 +54,70 @@ async def test_strict_echo_exact_path_and_steps(runtime, goal, llm_provider):
conversation_mode="continuous",
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(graph, goal, {"input": "ECHO_TEST_42"}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, path=['echo'], steps=1, "
"output['output'] set, quality='clean', "
"retries=0, tokens>0"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches", result.path == ["echo"], actual=str(result.path), expected_val="['echo']"
)
assert result.path == ["echo"]
artifact.check(
"steps_executed is 1",
result.steps_executed == 1,
actual=str(result.steps_executed),
expected_val="1",
)
assert result.steps_executed == 1
actual_output = result.output.get("output")
artifact.check(
"output['output'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("output") is not None
artifact.check(
"execution_quality is clean",
result.execution_quality == "clean",
actual=repr(result.execution_quality),
expected_val="'clean'",
)
assert result.execution_quality == "clean"
artifact.check(
"total_retries is 0",
result.total_retries == 0,
actual=str(result.total_retries),
expected_val="0",
)
assert result.total_retries == 0
artifact.check(
"total_tokens > 0",
result.total_tokens > 0,
actual=str(result.total_tokens),
expected_val=">0",
)
assert result.total_tokens > 0
@pytest.mark.asyncio
async def test_strict_clean_execution_quality(runtime, goal, llm_provider, artifact):
"""A simple set_output call should produce 'clean' execution quality."""
graph = GraphSpec(
id="strict-clean",
@@ -92,12 +141,37 @@ async def test_strict_clean_execution_quality(runtime, goal, llm_provider):
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(result, expected="clean success, no partial failures, no nodes_with_failures")
artifact.check(
"is_clean_success",
result.is_clean_success,
actual=(
f"quality={result.execution_quality}, "
f"retries={result.total_retries}, "
f"failures={result.nodes_with_failures}"
),
expected_val="True",
)
assert result.is_clean_success, (
f"Expected clean success, got quality={result.execution_quality}, "
f"retries={result.total_retries}, failures={result.nodes_with_failures}"
)
artifact.check(
"no partial failures",
not result.had_partial_failures,
actual=str(result.had_partial_failures),
expected_val="False",
)
assert not result.had_partial_failures
artifact.check(
"no nodes_with_failures",
len(result.nodes_with_failures) == 0,
actual=str(result.nodes_with_failures),
expected_val="[]",
)
assert len(result.nodes_with_failures) == 0
@@ -107,8 +181,8 @@ async def test_strict_clean_execution_quality(runtime, goal, llm_provider):
@pytest.mark.asyncio
async def test_strict_pipeline_path_ordering(runtime, goal, llm_provider, artifact):
"""Three-node pipeline must traverse in exact order: a -> b -> c."""
graph = GraphSpec(
id="strict-pipeline",
goal_id="dummy",
@@ -118,45 +192,106 @@ async def test_strict_pipeline_path_ordering(runtime, goal, llm_provider):
conversation_mode="continuous",
nodes=[
NodeSpec(
id="a", name="A", description="First",
node_type="event_loop", output_keys=["a_out"],
id="a",
name="A",
description="First",
node_type="event_loop",
output_keys=["a_out"],
system_prompt="Call set_output with key='a_out' and value='from_a'. " + SET_OUTPUT,
),
NodeSpec(
id="b", name="B", description="Second",
node_type="event_loop", input_keys=["b_in"], output_keys=["b_out"],
id="b",
name="B",
description="Second",
node_type="event_loop",
input_keys=["b_in"],
output_keys=["b_out"],
system_prompt="Call set_output with key='b_out' and value='from_b'. " + SET_OUTPUT,
),
NodeSpec(
id="c", name="C", description="Third",
node_type="event_loop", input_keys=["c_in"], output_keys=["result"],
id="c",
name="C",
description="Third",
node_type="event_loop",
input_keys=["c_in"],
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='from_c'. " + SET_OUTPUT,
),
],
edges=[
EdgeSpec(id="a-b", source="a", target="b",
condition=EdgeCondition.ON_SUCCESS, input_mapping={"b_in": "a_out"}),
EdgeSpec(id="b-c", source="b", target="c",
condition=EdgeCondition.ON_SUCCESS, input_mapping={"c_in": "b_out"}),
EdgeSpec(
id="a-b",
source="a",
target="b",
condition=EdgeCondition.ON_SUCCESS,
input_mapping={"b_in": "a_out"},
),
EdgeSpec(
id="b-c",
source="b",
target="c",
condition=EdgeCondition.ON_SUCCESS,
input_mapping={"c_in": "b_out"},
),
],
memory_keys=["a_out", "b_in", "b_out", "c_in", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, path=['a','b','c'], steps=3, "
"output['result'] set, each node visited once"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["a", "b", "c"],
actual=str(result.path),
expected_val="['a', 'b', 'c']",
)
assert result.path == ["a", "b", "c"], f"Path was {result.path}"
artifact.check(
"steps_executed is 3",
result.steps_executed == 3,
actual=str(result.steps_executed),
expected_val="3",
)
assert result.steps_executed == 3
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
# Visit counts: each node visited exactly once
a_visits = result.node_visit_counts.get("a", 0)
artifact.check("node 'a' visited once", a_visits == 1, actual=str(a_visits), expected_val="1")
assert result.node_visit_counts.get("a", 0) == 1
b_visits = result.node_visit_counts.get("b", 0)
artifact.check("node 'b' visited once", b_visits == 1, actual=str(b_visits), expected_val="1")
assert result.node_visit_counts.get("b", 0) == 1
c_visits = result.node_visit_counts.get("c", 0)
artifact.check("node 'c' visited once", c_visits == 1, actual=str(c_visits), expected_val="1")
assert result.node_visit_counts.get("c", 0) == 1
@pytest.mark.asyncio
async def test_strict_branch_correct_terminal(runtime, goal, llm_provider, artifact):
"""Classifier node must route 'I love it' to the positive terminal."""
graph = GraphSpec(
id="strict-branch",
@@ -167,8 +302,11 @@ async def test_strict_branch_correct_terminal(runtime, goal, llm_provider):
conversation_mode="continuous",
nodes=[
NodeSpec(
id="classify", name="Classify", description="Sentiment classifier",
node_type="event_loop", input_keys=["text"],
id="classify",
name="Classify",
description="Sentiment classifier",
node_type="event_loop",
input_keys=["text"],
output_keys=["label"],
system_prompt=(
"Read the 'text' input. Determine if sentiment is positive or negative. "
@@ -177,39 +315,87 @@ async def test_strict_branch_correct_terminal(runtime, goal, llm_provider):
),
),
NodeSpec(
id="positive", name="Positive", description="Positive handler",
node_type="event_loop", output_keys=["result"],
id="positive",
name="Positive",
description="Positive handler",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='POS'. " + SET_OUTPUT,
),
NodeSpec(
id="negative", name="Negative", description="Negative handler",
node_type="event_loop", output_keys=["result"],
id="negative",
name="Negative",
description="Negative handler",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='NEG'. " + SET_OUTPUT,
),
],
edges=[
EdgeSpec(id="to-pos", source="classify", target="positive",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'positive'", priority=1),
EdgeSpec(id="to-neg", source="classify", target="negative",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'negative'", priority=0),
EdgeSpec(
id="to-pos",
source="classify",
target="positive",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'positive'",
priority=1,
),
EdgeSpec(
id="to-neg",
source="classify",
target="negative",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'negative'",
priority=0,
),
],
memory_keys=["text", "label", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(
graph, goal, {"text": "I absolutely love this product, it's fantastic!"}, validate_graph=False
graph,
goal,
{"text": "I absolutely love this product, it's fantastic!"},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, path=['classify','positive'], steps=2, output['result']='POS'",
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["classify", "positive"],
actual=str(result.path),
expected_val="['classify', 'positive']",
)
assert result.path == ["classify", "positive"], f"Path was {result.path}"
artifact.check(
"steps_executed is 2",
result.steps_executed == 2,
actual=str(result.steps_executed),
expected_val="2",
)
assert result.steps_executed == 2
actual_result = result.output.get("result")
artifact.check(
"output['result'] is 'POS'",
actual_result == "POS",
actual=repr(actual_result),
expected_val="'POS'",
)
assert result.output.get("result") == "POS"
@pytest.mark.asyncio
async def test_strict_branch_negative_terminal(runtime, goal, llm_provider, artifact):
"""Classifier node must route hateful text to the negative terminal."""
graph = GraphSpec(
id="strict-branch-neg",
@@ -220,8 +406,11 @@ async def test_strict_branch_negative_terminal(runtime, goal, llm_provider):
conversation_mode="continuous",
nodes=[
NodeSpec(
id="classify", name="Classify", description="Sentiment classifier",
node_type="event_loop", input_keys=["text"],
id="classify",
name="Classify",
description="Sentiment classifier",
node_type="event_loop",
input_keys=["text"],
output_keys=["label"],
system_prompt=(
"Read the 'text' input. Determine if sentiment is positive or negative. "
@@ -230,34 +419,82 @@ async def test_strict_branch_negative_terminal(runtime, goal, llm_provider):
),
),
NodeSpec(
id="positive", name="Positive", description="Positive handler",
node_type="event_loop", output_keys=["result"],
id="positive",
name="Positive",
description="Positive handler",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='POS'. " + SET_OUTPUT,
),
NodeSpec(
id="negative", name="Negative", description="Negative handler",
node_type="event_loop", output_keys=["result"],
id="negative",
name="Negative",
description="Negative handler",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='NEG'. " + SET_OUTPUT,
),
],
edges=[
EdgeSpec(id="to-pos", source="classify", target="positive",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'positive'", priority=1),
EdgeSpec(id="to-neg", source="classify", target="negative",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'negative'", priority=0),
EdgeSpec(
id="to-pos",
source="classify",
target="positive",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'positive'",
priority=1,
),
EdgeSpec(
id="to-neg",
source="classify",
target="negative",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'negative'",
priority=0,
),
],
memory_keys=["text", "label", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(
graph, goal, {"text": "This is absolutely terrible and broken. Worst ever."}, validate_graph=False
graph,
goal,
{"text": "This is absolutely terrible and broken. Worst ever."},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, path=['classify','negative'], steps=2, output['result']='NEG'",
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["classify", "negative"],
actual=str(result.path),
expected_val="['classify', 'negative']",
)
assert result.path == ["classify", "negative"], f"Path was {result.path}"
artifact.check(
"steps_executed is 2",
result.steps_executed == 2,
actual=str(result.steps_executed),
expected_val="2",
)
assert result.steps_executed == 2
actual_result = result.output.get("result")
artifact.check(
"output['result'] is 'NEG'",
actual_result == "NEG",
actual=repr(actual_result),
expected_val="'NEG'",
)
assert result.output.get("result") == "NEG"
@@ -268,7 +505,7 @@ async def test_strict_branch_negative_terminal(runtime, goal, llm_provider):
@pytest.mark.asyncio
async def test_strict_tool_output_format(
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
"""Worker must call get_current_time and produce output in STATUS|date|day format."""
graph = GraphSpec(
@@ -290,8 +527,7 @@ async def test_strict_tool_output_format(
"Extract the 'date' and 'day_of_week' fields from the result. "
"Build this exact format: STATUS|<date>|<day_of_week> "
"(example: STATUS|2026-04-03|Thursday). "
"Call set_output with key='result' and this formatted string. "
+ SET_OUTPUT
"Call set_output with key='result' and this formatted string. " + SET_OUTPUT
),
),
],
@@ -300,32 +536,77 @@ async def test_strict_tool_output_format(
conversation_mode="continuous",
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result, expected="success=True, output['result'] in STATUS|YYYY-MM-DD|DayName format"
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
output = result.output.get("result")
artifact.check(
"output['result'] is set",
output is not None,
actual=repr(output),
expected_val="non-None value",
)
assert output is not None, "No result output"
# Strict format verification: STATUS|date|day_of_week
parts = output.split("|")
artifact.check(
"3 pipe-separated parts",
len(parts) == 3,
actual=f"{len(parts)} parts: {output}",
expected_val="3 parts",
)
assert len(parts) == 3, f"Expected 3 pipe-separated parts, got {len(parts)}: {output}"
artifact.check(
"first part is STATUS", parts[0] == "STATUS", actual=repr(parts[0]), expected_val="'STATUS'"
)
assert parts[0] == "STATUS", f"First part should be STATUS, got: {parts[0]}"
# Date part should look like YYYY-MM-DD
artifact.check(
"date part length >= 8",
len(parts[1]) >= 8,
actual=f"len={len(parts[1])}, value={parts[1]}",
expected_val=">=8",
)
assert len(parts[1]) >= 8, f"Date part too short: {parts[1]}"
artifact.check(
"date part contains dashes",
"-" in parts[1],
actual=repr(parts[1]),
expected_val="contains '-'",
)
assert "-" in parts[1], f"Date part should contain dashes: {parts[1]}"
# Day of week should be a recognizable day name
valid_days = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"}
artifact.check(
"valid day_of_week",
parts[2] in valid_days,
actual=repr(parts[2]),
expected_val=f"one of {sorted(valid_days)}",
)
assert parts[2] in valid_days, f"Invalid day_of_week: {parts[2]}"
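    # Tighter alternative (illustrative sketch; intentionally recorded rather
    # than asserted, since it is stricter than the checks above): validate the
    # whole STATUS|YYYY-MM-DD|DayName string with one regex.
    import re

    _status_re = re.compile(
        r"STATUS\|\d{4}-\d{2}-\d{2}\|"
        r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)"
    )
    artifact.record_value("strict_format_match", bool(_status_re.fullmatch(output)))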
@pytest.mark.asyncio
async def test_strict_artifact_creation_and_verification(
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
"""Single-node: saves a file via save_data, then verifies the artifact on disk."""
storage_path = tmp_path / "session"
@@ -359,7 +640,8 @@ async def test_strict_artifact_creation_and_verification(
memory_keys=["task", "result"],
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=storage_path,
@@ -367,23 +649,72 @@ async def test_strict_artifact_creation_and_verification(
result = await executor.execute(
graph, goal, {"task": "Create and verify artifact"}, validate_graph=False
)
artifact.record(
result,
expected=(
"success=True, path=['worker'], steps=1, "
"output contains INTEGRATION_TEST_PAYLOAD_XYZ, "
"file on disk matches"
),
)
# Strict outcome verification
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["worker"],
actual=str(result.path),
expected_val="['worker']",
)
assert result.path == ["worker"], f"Path was {result.path}"
artifact.check(
"steps_executed is 1",
result.steps_executed == 1,
actual=str(result.steps_executed),
expected_val="1",
)
assert result.steps_executed == 1
# Output must be the loaded content
output = result.output.get("result")
assert output is not None, "Worker did not set 'result'"
assert "INTEGRATION_TEST_PAYLOAD_XYZ" in output, (
f"Expected payload in output, got: {output}"
artifact.check(
"output['result'] is set",
output is not None,
actual=repr(output),
expected_val="non-None value",
)
assert output is not None, "Worker did not set 'result'"
artifact.check(
"output contains payload",
"INTEGRATION_TEST_PAYLOAD_XYZ" in output,
actual=repr(output),
expected_val="contains 'INTEGRATION_TEST_PAYLOAD_XYZ'",
)
assert "INTEGRATION_TEST_PAYLOAD_XYZ" in output, f"Expected payload in output, got: {output}"
# Verify the actual file exists on disk (save_data uses storage_path/data/)
artifact_path = storage_path / "data" / "test_artifact.txt"
artifact.check(
"artifact file exists",
artifact_path.exists(),
actual=str(artifact_path.exists()),
expected_val="True",
)
assert artifact_path.exists(), f"Artifact not found at {artifact_path}"
file_content = artifact_path.read_text(encoding="utf-8").strip()
artifact.check(
"file content matches payload",
file_content == "INTEGRATION_TEST_PAYLOAD_XYZ",
actual=repr(file_content),
expected_val="'INTEGRATION_TEST_PAYLOAD_XYZ'",
)
assert file_content == "INTEGRATION_TEST_PAYLOAD_XYZ", (
f"File content mismatch: {file_content!r}"
)
@@ -395,7 +726,7 @@ async def test_strict_artifact_creation_and_verification(
@pytest.mark.asyncio
async def test_strict_feedback_loop_visit_counts(runtime, goal, llm_provider, artifact):
"""Feedback loop must respect max_node_visits and record visit counts."""
from .nodes import StatefulNode, SuccessNode
from framework.graph.node import NodeResult
@@ -406,23 +737,48 @@ async def test_strict_feedback_loop_visit_counts(runtime, goal, llm_provider):
entry_node="draft",
terminal_nodes=["done"],
nodes=[
NodeSpec(id="draft", name="Draft", description="Produces draft",
node_type="event_loop", output_keys=["draft_output"], max_node_visits=3),
NodeSpec(id="review", name="Review", description="Reviews draft",
node_type="event_loop", input_keys=["draft_output"],
output_keys=["approved"]),
NodeSpec(id="done", name="Done", description="Terminal",
node_type="event_loop", output_keys=["final"]),
NodeSpec(
id="draft",
name="Draft",
description="Produces draft",
node_type="event_loop",
output_keys=["draft_output"],
max_node_visits=3,
),
NodeSpec(
id="review",
name="Review",
description="Reviews draft",
node_type="event_loop",
input_keys=["draft_output"],
output_keys=["approved"],
),
NodeSpec(
id="done",
name="Done",
description="Terminal",
node_type="event_loop",
output_keys=["final"],
),
],
edges=[
EdgeSpec(id="d-r", source="draft", target="review",
condition=EdgeCondition.ON_SUCCESS),
EdgeSpec(id="r-d", source="review", target="draft",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('approved') == False", priority=1),
EdgeSpec(id="r-done", source="review", target="done",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('approved') == True", priority=0),
EdgeSpec(id="d-r", source="draft", target="review", condition=EdgeCondition.ON_SUCCESS),
EdgeSpec(
id="r-d",
source="review",
target="draft",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('approved') == False",
priority=1,
),
EdgeSpec(
id="r-done",
source="review",
target="done",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('approved') == True",
priority=0,
),
],
memory_keys=["draft_output", "approved", "final"],
)
@@ -430,28 +786,70 @@ async def test_strict_feedback_loop_visit_counts(runtime, goal, llm_provider):
# Deterministic nodes: reject twice, then approve
executor.register_node("draft", SuccessNode(output={"draft_output": "v1"}))
executor.register_node("review", StatefulNode([
NodeResult(success=True, output={"approved": False}),
NodeResult(success=True, output={"approved": False}),
NodeResult(success=True, output={"approved": True}),
]))
executor.register_node(
"review",
StatefulNode(
[
NodeResult(success=True, output={"approved": False}),
NodeResult(success=True, output={"approved": False}),
NodeResult(success=True, output={"approved": True}),
]
),
)
executor.register_node("done", SuccessNode(output={"final": "complete"}))
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, 'done' in path, "
"draft visited 3x, review visited 3x, "
"done visited 1x, output['final']='complete'"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"'done' in path",
"done" in result.path,
actual=str(result.path),
expected_val="contains 'done'",
)
assert "done" in result.path
# Strict visit count verification
draft_visits = result.node_visit_counts.get("draft", 0)
artifact.check(
"draft visited 3 times", draft_visits == 3, actual=str(draft_visits), expected_val="3"
)
assert result.node_visit_counts.get("draft", 0) == 3, (
f"Draft should be visited 3 times, got {result.node_visit_counts.get('draft')}"
)
review_visits = result.node_visit_counts.get("review", 0)
artifact.check(
"review visited 3 times", review_visits == 3, actual=str(review_visits), expected_val="3"
)
assert result.node_visit_counts.get("review", 0) == 3, (
f"Review should be visited 3 times, got {result.node_visit_counts.get('review')}"
)
done_visits = result.node_visit_counts.get("done", 0)
artifact.check("done visited once", done_visits == 1, actual=str(done_visits), expected_val="1")
assert result.node_visit_counts.get("done", 0) == 1, (
f"Done should be visited once, got {result.node_visit_counts.get('done')}"
)
# Final output must be from the 'done' node
final_output = result.output.get("final")
artifact.check(
"output['final'] is 'complete'",
final_output == "complete",
actual=repr(final_output),
expected_val="'complete'",
)
assert result.output.get("final") == "complete"
+115 -14
View File
@@ -15,47 +15,124 @@ from framework.llm.provider import ToolUse
from .conftest import make_executor
def test_tools_mcp_server_connects(tool_registry):
def test_tools_mcp_server_connects(tool_registry, artifact):
"""MCP server should start and expose tools."""
tools = tool_registry.get_tools()
artifact.record_value(
"tool_count",
len(tools),
expected="at least 1 tool exposed by MCP server",
)
artifact.record_value("tool_names", list(tools.keys()))
artifact.check(
"MCP server exposes tools",
len(tools) > 0,
actual=str(len(tools)),
expected_val=">0",
)
assert len(tools) > 0, "MCP server should expose at least one tool"
def test_tools_registry_has_expected_tools(tool_registry):
def test_tools_registry_has_expected_tools(tool_registry, artifact):
"""hive-tools should expose the expected tool names."""
tool_names = set(tool_registry.get_tools().keys())
expected = {"example_tool", "get_current_time"}
assert expected.issubset(tool_names), (
f"Missing expected tools: {expected - tool_names}"
artifact.record_value(
"tool_names",
sorted(tool_names),
expected="superset of {example_tool, get_current_time}",
)
artifact.record_value("expected_tools", sorted(expected))
missing = expected - tool_names
artifact.check(
"expected tools present",
expected.issubset(tool_names),
actual=str(sorted(tool_names)),
expected_val=f"superset of {sorted(expected)}",
)
assert expected.issubset(tool_names), f"Missing expected tools: {missing}"
@pytest.mark.asyncio
async def test_tools_execute_example_tool(tool_registry):
async def test_tools_execute_example_tool(tool_registry, artifact):
"""Direct tool execution without LLM — verifies MCP round-trip."""
executor = tool_registry.get_executor()
tool_use = ToolUse(id="test-1", name="example_tool", input={"message": "hello", "uppercase": True})
tool_use = ToolUse(
id="test-1",
name="example_tool",
input={"message": "hello", "uppercase": True},
)
result = executor(tool_use)
artifact.record_value(
"is_error",
result.is_error,
expected="not an error, content contains 'HELLO'",
)
artifact.record_value("content", result.content)
artifact.check(
"result is not error",
not result.is_error,
actual=str(result.is_error),
expected_val="False",
)
assert not result.is_error
artifact.check(
"content contains HELLO",
"HELLO" in result.content,
actual=repr(result.content),
expected_val="contains 'HELLO'",
)
assert "HELLO" in result.content
@pytest.mark.asyncio
async def test_tools_execute_get_current_time(tool_registry):
async def test_tools_execute_get_current_time(tool_registry, artifact):
"""get_current_time should return a dict with date/time fields."""
executor = tool_registry.get_executor()
tool_use = ToolUse(id="test-2", name="get_current_time", input={"timezone": "UTC"})
tool_use = ToolUse(
id="test-2",
name="get_current_time",
input={"timezone": "UTC"},
)
result = executor(tool_use)
artifact.record_value(
"is_error",
result.is_error,
expected="not an error, content contains year (202x)",
)
artifact.record_value("content", result.content)
artifact.check(
"result is not error",
not result.is_error,
actual=str(result.is_error),
expected_val="False",
)
assert not result.is_error
artifact.check(
"content contains year",
"202" in result.content,
actual=repr(result.content),
expected_val="contains '202'",
)
# Should contain date-like content
assert "202" in result.content, "Should contain a year (202x)"
@pytest.mark.asyncio
async def test_tools_llm_calls_tool_and_gets_result(
runtime, llm_provider, tool_registry, goal
runtime, llm_provider, tool_registry, goal, artifact
):
"""Full round-trip: LLM calls a real tool and uses the result to set output."""
"""Full round-trip: LLM calls a tool and uses the result."""
graph = GraphSpec(
id="tool-roundtrip",
goal_id="dummy",
@@ -72,8 +149,9 @@ async def test_tools_llm_calls_tool_and_gets_result(
output_keys=["result"],
tools=["example_tool"],
system_prompt=(
"Use the example_tool to process the message from the task input "
"with uppercase=true. Then call set_output with key='result' and "
"Use the example_tool to process the message "
"from the task input with uppercase=true. Then "
"call set_output with key='result' and "
"the tool's return value."
),
),
@@ -83,12 +161,35 @@ async def test_tools_llm_calls_tool_and_gets_result(
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
)
result = await executor.execute(
graph, goal, {"task": "Process the word 'hello'"}, validate_graph=False
graph,
goal,
{"task": "Process the word 'hello'"},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, output['result'] is set",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
@@ -0,0 +1,713 @@
"""Component tests: Verified Outcomes — cross-checked, deterministic, no trust required.
These tests eliminate false positives by:
1. Using DETERMINISTIC inputs with KNOWN correct outputs
2. Cross-checking LLM output against ground truth (tool results, file contents)
3. Using REGEX validation instead of "is not None"
4. Running a VERIFIER node that independently checks the first node's work
5. Asserting on CONTENT, not just existence
If a test here passes, the output is provably correct, not just non-null.
"""
from __future__ import annotations
import json
import re
import pytest
from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
from framework.graph.node import NodeSpec
from .conftest import make_executor
SET_OUTPUT = (
"You MUST call the set_output tool. "
"Do not just write text — call set_output with the correct key and value."
)
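# Appended to every system_prompt below: a blunt nudge that pushes the model
# toward a structured set_output call instead of free-text answers, which is
# what the exact-match assertions in this module depend on.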
# ---------------------------------------------------------------------------
# 1. Echo round-trip: input == output (exact match, no LLM creativity)
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_echo_exact_content(runtime, goal, llm_provider, artifact):
"""Echo test with EXACT content verification — not just 'is not None'.
The input is a unique token. The output must contain that exact token.
This catches LLMs that hallucinate or paraphrase instead of echoing.
"""
UNIQUE_TOKEN = "XRAY_7742_BRAVO_ECHO"
graph = GraphSpec(
id="verified-echo",
goal_id="dummy",
entry_node="echo",
entry_points={"start": "echo"},
terminal_nodes=["echo"],
nodes=[
NodeSpec(
id="echo",
name="Echo",
description="Echoes input exactly",
node_type="event_loop",
input_keys=["input"],
output_keys=["output"],
system_prompt=(
"Read the 'input' value. Call set_output with key='output' "
"and the EXACT same string. Do not modify it. Do not add quotes "
"or punctuation. Just the raw string." + SET_OUTPUT
),
),
],
edges=[],
memory_keys=["input", "output"],
conversation_mode="continuous",
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(graph, goal, {"input": UNIQUE_TOKEN}, validate_graph=False)
artifact.record(
result, expected="success=True, output['output'] contains exact token XRAY_7742_BRAVO_ECHO"
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
output = result.output.get("output", "")
artifact.check(
"output contains exact token",
UNIQUE_TOKEN in output,
actual=repr(output),
expected_val=f"contains '{UNIQUE_TOKEN}'",
)
assert UNIQUE_TOKEN in output, f"Exact token '{UNIQUE_TOKEN}' not found in output: {output!r}"
# ---------------------------------------------------------------------------
# 2. Math verification: LLM computes, we verify the answer independently
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_tool_result_matches_ground_truth(
runtime, goal, llm_provider, tool_registry, artifact
):
"""get_current_time returns real data — verify output matches tool's actual return.
We call the tool directly (ground truth), then run the LLM graph,
and verify the LLM's output contains the SAME day_of_week.
This catches LLMs that hallucinate dates.
"""
from framework.llm.provider import ToolUse
# Step 1: Get ground truth by calling tool directly
executor_fn = tool_registry.get_executor()
tool_use = ToolUse(id="ground-truth", name="get_current_time", input={"timezone": "UTC"})
ground_truth_result = executor_fn(tool_use)
artifact.record_value(
"ground_truth_is_error",
ground_truth_result.is_error,
expected="ground truth tool returns day_of_week matching LLM output",
)
assert not ground_truth_result.is_error
# Parse the actual day_of_week from the tool
gt_data = json.loads(ground_truth_result.content)
actual_day = gt_data.get("day_of_week", "")
artifact.record_value("ground_truth_day", actual_day)
assert actual_day, f"Tool didn't return day_of_week: {gt_data}"
# Step 2: Run LLM graph that uses the same tool
graph = GraphSpec(
id="verified-time",
goal_id="dummy",
entry_node="worker",
entry_points={"start": "worker"},
terminal_nodes=["worker"],
nodes=[
NodeSpec(
id="worker",
name="Worker",
description="Get current time and report day",
node_type="event_loop",
output_keys=["result"],
tools=["get_current_time"],
system_prompt=(
"Call get_current_time with timezone='UTC'. "
"Extract the day_of_week from the result. "
"Call set_output with key='result' and ONLY the day_of_week string "
"(e.g., 'Monday'). Nothing else." + SET_OUTPUT
),
),
],
edges=[],
memory_keys=["result"],
conversation_mode="continuous",
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result,
expected=f"success=True, output['result'] matches ground truth day_of_week='{actual_day}'",
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
llm_day = (result.output.get("result") or "").strip()
artifact.record_value("llm_day", llm_day)
# Step 3: Cross-check — LLM's answer must match ground truth
artifact.check(
"LLM day matches ground truth",
actual_day.lower() in llm_day.lower(),
actual=repr(llm_day),
expected_val=f"contains '{actual_day}'",
)
assert actual_day.lower() in llm_day.lower(), (
f"LLM reported '{llm_day}' but tool returned '{actual_day}'. "
f"The LLM hallucinated or misread the tool result."
)
# ---------------------------------------------------------------------------
# 3. File artifact round-trip: write -> read -> binary compare
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_artifact_binary_match(
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
"""Save a file, then verify the on-disk content matches EXACTLY.
Does NOT rely on the LLM to verify; we read the file ourselves.
This catches save_data bugs, encoding issues, or the LLM adding extra content.
"""
PAYLOAD = "VERIFIED_PAYLOAD_99_ZULU"
storage_path = tmp_path / "session"
graph = GraphSpec(
id="verified-artifact",
goal_id="dummy",
entry_node="worker",
entry_points={"start": "worker"},
terminal_nodes=["worker"],
nodes=[
NodeSpec(
id="worker",
name="Writer",
description="Saves exact payload to file",
node_type="event_loop",
input_keys=["task"],
output_keys=["result"],
tools=["save_data"],
system_prompt=(
f"Call save_data with filename='verified.txt' and data='{PAYLOAD}'. "
"Then call set_output with key='result' and value='saved'. " + SET_OUTPUT
),
),
],
edges=[],
memory_keys=["task", "result"],
conversation_mode="continuous",
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=storage_path,
)
result = await executor.execute(graph, goal, {"task": "save the file"}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, file 'verified.txt' on disk "
"matches VERIFIED_PAYLOAD_99_ZULU exactly"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
# Cross-check: read the file ourselves — don't trust the LLM
artifact_path = storage_path / "data" / "verified.txt"
artifact.check(
"file exists on disk",
artifact_path.exists(),
actual=str(artifact_path.exists()),
expected_val="True",
)
assert artifact_path.exists(), f"File not created at {artifact_path}"
actual_content = artifact_path.read_text(encoding="utf-8").strip()
artifact.check(
"file content matches payload",
actual_content == PAYLOAD,
actual=repr(actual_content),
expected_val=repr(PAYLOAD),
)
assert actual_content == PAYLOAD, (
f"File content mismatch.\n"
f" Expected: {PAYLOAD!r}\n"
f" Actual: {actual_content!r}\n"
f"The LLM may have modified the payload or save_data encoded it differently."
)
# ---------------------------------------------------------------------------
# 4. Pipeline data integrity: track a token through N nodes
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_pipeline_token_survives(runtime, goal, llm_provider, artifact):
"""Pass a unique token through 3 nodes — verify it arrives at the end.
Each node is instructed to PRESERVE the token. If any node drops or
modifies it, the final assertion catches it. This verifies that
input_mapping and continuous conversation mode actually deliver data correctly.
"""
TOKEN = "TRACKING_TOKEN_88X"
graph = GraphSpec(
id="verified-pipeline",
goal_id="dummy",
entry_node="a",
entry_points={"start": "a"},
terminal_nodes=["c"],
conversation_mode="continuous",
nodes=[
NodeSpec(
id="a",
name="Node A",
description="First node",
node_type="event_loop",
input_keys=["token"],
output_keys=["a_out"],
system_prompt=(
"Read the 'token' input. Call set_output with key='a_out' "
"and the EXACT token value. Do not modify it." + SET_OUTPUT
),
),
NodeSpec(
id="b",
name="Node B",
description="Middle node",
node_type="event_loop",
input_keys=["b_in"],
output_keys=["b_out"],
system_prompt=(
"Read the 'b_in' input. Call set_output with key='b_out' "
"and the EXACT same value. Do not modify it." + SET_OUTPUT
),
),
NodeSpec(
id="c",
name="Node C",
description="Terminal node",
node_type="event_loop",
input_keys=["c_in"],
output_keys=["result"],
system_prompt=(
"Read the 'c_in' input. Call set_output with key='result' "
"and the EXACT same value. Do not modify it." + SET_OUTPUT
),
),
],
edges=[
EdgeSpec(
id="a-b",
source="a",
target="b",
condition=EdgeCondition.ON_SUCCESS,
input_mapping={"b_in": "a_out"},
),
EdgeSpec(
id="b-c",
source="b",
target="c",
condition=EdgeCondition.ON_SUCCESS,
input_mapping={"c_in": "b_out"},
),
],
memory_keys=["token", "a_out", "b_in", "b_out", "c_in", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(graph, goal, {"token": TOKEN}, validate_graph=False)
artifact.record(
result,
expected="success=True, path=['a','b','c'], output['result'] contains TRACKING_TOKEN_88X",
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["a", "b", "c"],
actual=str(result.path),
expected_val="['a', 'b', 'c']",
)
assert result.path == ["a", "b", "c"]
final_output = result.output.get("result", "")
artifact.check(
"token survives pipeline",
TOKEN in final_output,
actual=repr(final_output),
expected_val=f"contains '{TOKEN}'",
)
assert TOKEN in final_output, (
f"Token '{TOKEN}' lost in pipeline.\n"
f" Input: {TOKEN}\n"
f" Final output: {final_output!r}\n"
f" Path: {result.path}\n"
f"Data was corrupted or dropped during node transitions."
)
# ---------------------------------------------------------------------------
# 5. Structured format with regex validation
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_format_with_regex(runtime, goal, llm_provider, tool_registry, artifact):
"""Output must match a strict regex — not just 'contains a pipe character'.
Format: STATUS|YYYY-MM-DD|DayName
Regex validates each segment independently.
"""
graph = GraphSpec(
id="verified-format",
goal_id="dummy",
entry_node="worker",
entry_points={"start": "worker"},
terminal_nodes=["worker"],
nodes=[
NodeSpec(
id="worker",
name="Worker",
description="Produce formatted status string",
node_type="event_loop",
output_keys=["result"],
tools=["get_current_time"],
system_prompt=(
"Call get_current_time with timezone='UTC'. "
"Build this EXACT format: STATUS|<date>|<day_of_week>\n"
"Where <date> is YYYY-MM-DD format and <day_of_week> is the full day name.\n"
"Example: STATUS|2026-04-03|Thursday\n"
"Call set_output with key='result' and the formatted string.\n"
"Output ONLY the formatted string, nothing else." + SET_OUTPUT
),
),
],
edges=[],
memory_keys=["result"],
conversation_mode="continuous",
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result, expected="success=True, output['result'] matches regex STATUS|YYYY-MM-DD|DayName"
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
output = (result.output.get("result") or "").strip()
artifact.record_value("raw_output", output)
# Strict regex: STATUS|YYYY-MM-DD|DayName
pattern = (
r"^STATUS\|\d{4}-\d{2}-\d{2}\|(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)$"
)
matches = bool(re.match(pattern, output))
artifact.check(
"output matches regex",
matches,
actual=repr(output),
expected_val=f"matches pattern: {pattern}",
)
assert re.match(pattern, output), (
f"Output does not match required format.\n"
f" Expected pattern: STATUS|YYYY-MM-DD|DayName\n"
f" Actual output: {output!r}\n"
f" Regex: {pattern}"
)
# ---------------------------------------------------------------------------
# 6. Two-node cross-verification: writer + independent verifier
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_two_node_cross_check(
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
"""Node 1 writes a file. Node 2 loads it and compares to expected.
Both nodes operate INDEPENDENTLY on the same file. If the content
doesn't match, the verifier reports MISMATCH. We also read the file
ourselves as a triple-check.
"""
EXPECTED = "CROSS_CHECK_ALPHA_42"
storage_path = tmp_path / "session"
graph = GraphSpec(
id="verified-cross-check",
goal_id="dummy",
entry_node="writer",
entry_points={"start": "writer"},
terminal_nodes=["verifier"],
conversation_mode="continuous",
nodes=[
NodeSpec(
id="writer",
name="Writer",
description="Writes exact content to file",
node_type="event_loop",
output_keys=["filename"],
tools=["save_data"],
system_prompt=(
f"Call save_data with filename='crosscheck.txt' and data='{EXPECTED}'. "
"Then call set_output with key='filename' and value='crosscheck.txt'."
+ SET_OUTPUT
),
),
NodeSpec(
id="verifier",
name="Verifier",
description="Loads file and verifies content",
node_type="event_loop",
input_keys=["filename"],
output_keys=["result"],
tools=["load_data"],
system_prompt=(
"Load the file using load_data with the provided 'filename'. "
f"If the loaded content is exactly '{EXPECTED}', "
"call set_output with key='result' and value='VERIFIED'. "
"If it does NOT match, call set_output with key='result' "
"and value='MISMATCH:' followed by what you actually loaded." + SET_OUTPUT
),
),
],
edges=[
EdgeSpec(
id="write-to-verify",
source="writer",
target="verifier",
condition=EdgeCondition.ON_SUCCESS,
input_mapping={"filename": "filename"},
),
],
memory_keys=["filename", "result"],
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=storage_path,
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, path=['writer','verifier'], "
"verifier output='VERIFIED', disk content "
"matches CROSS_CHECK_ALPHA_42"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["writer", "verifier"],
actual=str(result.path),
expected_val="['writer', 'verifier']",
)
assert result.path == ["writer", "verifier"]
# LLM-side verification
verifier_output = result.output.get("result", "")
artifact.check(
"verifier output is VERIFIED",
verifier_output == "VERIFIED",
actual=repr(verifier_output),
expected_val="'VERIFIED'",
)
assert verifier_output == "VERIFIED", (
f"Verifier node reported: {verifier_output!r} (expected 'VERIFIED')"
)
# Our own independent verification (triple-check)
artifact_path = storage_path / "data" / "crosscheck.txt"
artifact.check(
"file exists on disk",
artifact_path.exists(),
actual=str(artifact_path.exists()),
expected_val="True",
)
assert artifact_path.exists(), f"File not found at {artifact_path}"
actual = artifact_path.read_text(encoding="utf-8").strip()
artifact.check(
"disk content matches expected",
actual == EXPECTED,
actual=repr(actual),
expected_val=repr(EXPECTED),
)
assert actual == EXPECTED, f"Disk content mismatch: expected {EXPECTED!r}, got {actual!r}"
# ---------------------------------------------------------------------------
# 7. Event bus cross-check: verify events match execution result
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_events_match_result(
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
"""Cross-check: events captured on bus must agree with ExecutionResult.
If result says path=["a","b"], the events must show NODE_LOOP_COMPLETED
for both "a" and "b". If result says tool X was called, TOOL_CALL_COMPLETED
must contain X. This catches desynchronization between the event bus and
the execution engine.
"""
from framework.runtime.event_bus import EventBus, EventType
bus = EventBus()
completed_nodes = []
tool_names = set()
async def _capture_node(event):
completed_nodes.append(event.node_id)
async def _capture_tool(event):
tool_names.add(event.data.get("tool_name", ""))
bus.subscribe(event_types=[EventType.NODE_LOOP_COMPLETED], handler=_capture_node)
bus.subscribe(event_types=[EventType.TOOL_CALL_COMPLETED], handler=_capture_tool)
graph = GraphSpec(
id="verified-events",
goal_id="dummy",
entry_node="worker",
entry_points={"start": "worker"},
terminal_nodes=["worker"],
nodes=[
NodeSpec(
id="worker",
name="Worker",
description="Uses tool then sets output",
node_type="event_loop",
output_keys=["result"],
tools=["get_current_time"],
system_prompt=(
"Call get_current_time with timezone='UTC'. "
"Then call set_output with key='result' and value='done'." + SET_OUTPUT
),
),
],
edges=[],
memory_keys=["result"],
conversation_mode="continuous",
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, event bus nodes match "
"result.path, tool events include "
"get_current_time and set_output"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.record_value("completed_nodes", completed_nodes)
artifact.record_value("tool_names", sorted(tool_names))
# Cross-check 1: path nodes match completed nodes
for node_id in result.path:
artifact.check(
f"node '{node_id}' in completed events",
node_id in completed_nodes,
actual=str(completed_nodes),
expected_val=f"contains '{node_id}'",
)
assert node_id in completed_nodes, (
f"Node '{node_id}' in result.path but no NODE_LOOP_COMPLETED event. "
f"Events saw: {completed_nodes}"
)
# Cross-check 2: get_current_time must appear in tool events
artifact.check(
"get_current_time in tool events",
"get_current_time" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'get_current_time'",
)
assert "get_current_time" in tool_names, (
f"get_current_time not in tool events. Captured: {tool_names}. "
f"Result claims success but event bus disagrees."
)
# Cross-check 3: set_output must appear in tool events
artifact.check(
"set_output in tool events",
"set_output" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'set_output'",
)
assert "set_output" in tool_names, (
f"set_output not in tool events. Captured: {tool_names}. "
f"Result has output but no set_output event."
)
@@ -1,8 +1,9 @@
"""Component tests: Worker Communication — event flow, completion, failure.
"""Component tests: Worker Communication — event flow, completion.
Exercises the full worker execution lifecycle with EventBus subscriptions
to verify that the exact events are published in the correct order, with
correct data, simulating the queen-worker communication contract.
Exercises the full worker execution lifecycle with EventBus
subscriptions to verify that the exact events are published in
the correct order, with correct data, simulating the queen-worker
communication contract.
"""
from __future__ import annotations
@@ -20,7 +21,8 @@ from .conftest import make_executor
SET_OUTPUT = (
"You MUST call the set_output tool. "
"Do not just write text — call set_output with the correct key and value."
"Do not just write text — call set_output with the correct "
"key and value."
)
@@ -34,7 +36,7 @@ class EventCapture:
return [e for e in self.events if e.type in event_types]
def tool_calls(self) -> list[dict]:
"""Extract tool call data from TOOL_CALL_COMPLETED events."""
"""Extract tool call data from TOOL_CALL_COMPLETED."""
return [e.data for e in self.of_type(EventType.TOOL_CALL_COMPLETED)]
def tool_names_called(self) -> set[str]:
@@ -51,14 +53,13 @@ class EventCapture:
def _make_event_bus_and_capture() -> tuple[EventBus, EventCapture]:
"""Create an EventBus with a capture handler subscribed to all events."""
"""Create an EventBus with a capture handler."""
bus = EventBus()
capture = EventCapture()
async def _capture_all(event: AgentEvent) -> None:
capture.events.append(event)
# Subscribe to the key event types we want to verify
bus.subscribe(
event_types=[
EventType.NODE_LOOP_STARTED,
@@ -79,14 +80,14 @@ def _make_event_bus_and_capture() -> tuple[EventBus, EventCapture]:
return bus, capture
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
# Tests: Worker Completion Events
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
@pytest.mark.asyncio
async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, tmp_path):
"""Worker execution must emit LOOP_STARTED iterations → LOOP_COMPLETED."""
async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, tmp_path, artifact):
"""Worker must emit STARTED -> iterations -> COMPLETED."""
bus, capture = _make_event_bus_and_capture()
graph = GraphSpec(
@@ -102,7 +103,7 @@ async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, t
description="Simple output",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='done'. " + SET_OUTPUT,
system_prompt=("Call set_output with key='result' and value='done'. " + SET_OUTPUT),
),
],
edges=[],
@@ -110,34 +111,78 @@ async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, t
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=(
"success=True, lifecycle events in correct order: STARTED -> iterations -> COMPLETED"
),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
# Verify lifecycle event ordering
loop_started = capture.of_type(EventType.NODE_LOOP_STARTED)
loop_completed = capture.of_type(EventType.NODE_LOOP_COMPLETED)
loop_completed = capture.of_type(
EventType.NODE_LOOP_COMPLETED,
)
iterations = capture.of_type(EventType.NODE_LOOP_ITERATION)
artifact.check(
"NODE_LOOP_STARTED emitted",
len(loop_started) >= 1,
actual=str(len(loop_started)),
expected_val=">=1",
)
assert len(loop_started) >= 1, "Missing NODE_LOOP_STARTED"
artifact.check(
"NODE_LOOP_COMPLETED emitted",
len(loop_completed) >= 1,
actual=str(len(loop_completed)),
expected_val=">=1",
)
assert len(loop_completed) >= 1, "Missing NODE_LOOP_COMPLETED"
artifact.check(
"NODE_LOOP_ITERATION emitted",
len(iterations) >= 1,
actual=str(len(iterations)),
expected_val=">=1",
)
assert len(iterations) >= 1, "Missing NODE_LOOP_ITERATION"
# STARTED must come before COMPLETED
start_idx = capture.events.index(loop_started[0])
end_idx = capture.events.index(loop_completed[0])
artifact.check(
"STARTED precedes COMPLETED",
start_idx < end_idx,
actual=f"start={start_idx}, end={end_idx}",
expected_val="start < end",
)
assert start_idx < end_idx, "LOOP_STARTED must precede LOOP_COMPLETED"
@pytest.mark.asyncio
async def test_worker_emits_llm_turn_with_token_counts(
runtime, goal, llm_provider, tmp_path
runtime, goal, llm_provider, tmp_path, artifact
):
"""Each LLM turn must emit LLM_TURN_COMPLETE with token counts."""
bus, capture = _make_event_bus_and_capture()
@@ -155,7 +200,7 @@ async def test_worker_emits_llm_turn_with_token_counts(
description="Simple output",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='ok'. " + SET_OUTPUT,
system_prompt=("Call set_output with key='result' and value='ok'. " + SET_OUTPUT),
),
],
edges=[],
@@ -163,30 +208,82 @@ async def test_worker_emits_llm_turn_with_token_counts(
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
loop_config={"max_iterations": 3},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, LLM_TURN_COMPLETE events with positive token counts and model"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
llm_turns = capture.of_type(EventType.LLM_TURN_COMPLETE)
artifact.check(
"LLM_TURN_COMPLETE emitted",
len(llm_turns) >= 1,
actual=str(len(llm_turns)),
expected_val=">=1",
)
assert len(llm_turns) >= 1, "No LLM_TURN_COMPLETE events"
for turn in llm_turns:
assert turn.data.get("input_tokens", 0) > 0, "input_tokens should be > 0"
assert turn.data.get("output_tokens", 0) > 0, "output_tokens should be > 0"
for i, turn in enumerate(llm_turns):
in_tok = turn.data.get("input_tokens", 0)
out_tok = turn.data.get("output_tokens", 0)
model = turn.data.get("model", "")
artifact.check(
f"turn[{i}] input_tokens > 0",
in_tok > 0,
actual=str(in_tok),
expected_val=">0",
)
assert in_tok > 0, "input_tokens should be > 0"
artifact.check(
f"turn[{i}] output_tokens > 0",
out_tok > 0,
actual=str(out_tok),
expected_val=">0",
)
assert out_tok > 0, "output_tokens should be > 0"
artifact.check(
f"turn[{i}] model populated",
bool(model),
actual=repr(model),
expected_val="non-empty string",
)
assert turn.data.get("model"), "model should be populated"
@pytest.mark.asyncio
async def test_worker_tool_calls_emit_events(
runtime, goal, llm_provider, tool_registry, tmp_path
runtime,
goal,
llm_provider,
tool_registry,
tmp_path,
artifact,
):
"""Tool calls must emit TOOL_CALL_STARTED and TOOL_CALL_COMPLETED events."""
"""Tool calls must emit STARTED and COMPLETED events."""
bus, capture = _make_event_bus_and_capture()
graph = GraphSpec(
@@ -205,8 +302,8 @@ async def test_worker_tool_calls_emit_events(
tools=["get_current_time"],
system_prompt=(
"Call get_current_time with timezone='UTC'. "
"Then call set_output with key='result' and the day_of_week. "
+ SET_OUTPUT
"Then call set_output with key='result' and "
"the day_of_week. " + SET_OUTPUT
),
),
],
@@ -215,38 +312,97 @@ async def test_worker_tool_calls_emit_events(
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=(
"success=True, output['result'] set, tool events for get_current_time and set_output"
),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
# Verify tool events
tool_started = capture.of_type(EventType.TOOL_CALL_STARTED)
tool_completed = capture.of_type(EventType.TOOL_CALL_COMPLETED)
tool_completed = capture.of_type(
EventType.TOOL_CALL_COMPLETED,
)
artifact.check(
"TOOL_CALL_STARTED emitted",
len(tool_started) >= 1,
actual=str(len(tool_started)),
expected_val=">=1",
)
assert len(tool_started) >= 1, "No TOOL_CALL_STARTED events"
assert len(tool_completed) >= 1, "No TOOL_CALL_COMPLETED events"
# get_current_time must be among the tools called
assert "get_current_time" in capture.tool_names_called()
artifact.check(
"TOOL_CALL_COMPLETED emitted",
len(tool_completed) >= 1,
actual=str(len(tool_completed)),
expected_val=">=1",
)
assert len(tool_completed) >= 1, "No TOOL_CALL_COMPLETED"
# set_output must also appear (synthetic tool)
assert "set_output" in capture.tool_names_called()
tool_names = capture.tool_names_called()
artifact.check(
"get_current_time called",
"get_current_time" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'get_current_time'",
)
assert "get_current_time" in tool_names
artifact.check(
"set_output called",
"set_output" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'set_output'",
)
assert "set_output" in tool_names
# Tool calls should not have errors
for tc in capture.tool_calls():
if tc.get("tool_name") in ("get_current_time", "set_output"):
assert not tc.get("is_error"), f"Tool {tc.get('tool_name')} errored"
tn = tc.get("tool_name")
if tn in ("get_current_time", "set_output"):
is_err = tc.get("is_error")
artifact.check(
f"tool {tn} no error",
not is_err,
actual=str(is_err),
expected_val="False",
)
assert not is_err, f"Tool {tn} errored"
@pytest.mark.asyncio
async def test_worker_output_key_set_event(runtime, goal, llm_provider, tmp_path):
async def test_worker_output_key_set_event(runtime, goal, llm_provider, tmp_path, artifact):
"""set_output must emit OUTPUT_KEY_SET event with the key name."""
bus, capture = _make_event_bus_and_capture()
@@ -275,34 +431,84 @@ async def test_worker_output_key_set_event(runtime, goal, llm_provider, tmp_path
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, output['name'] and output['status'] set, OUTPUT_KEY_SET for both"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_name = result.output.get("name")
artifact.check(
"output['name'] is set",
actual_name is not None,
actual=repr(actual_name),
expected_val="non-None value",
)
assert result.output.get("name") is not None
actual_status = result.output.get("status")
artifact.check(
"output['status'] is set",
actual_status is not None,
actual=repr(actual_status),
expected_val="non-None value",
)
assert result.output.get("status") is not None
# Verify OUTPUT_KEY_SET events for both keys
keys_set = capture.output_keys_set()
artifact.check(
"OUTPUT_KEY_SET for 'name'",
"name" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'name'",
)
assert "name" in keys_set, f"Missing OUTPUT_KEY_SET for 'name', got: {keys_set}"
artifact.check(
"OUTPUT_KEY_SET for 'status'",
"status" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'status'",
)
assert "status" in keys_set, f"Missing OUTPUT_KEY_SET for 'status', got: {keys_set}"
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
# Tests: Multi-Node Worker Communication
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
@pytest.mark.asyncio
async def test_worker_pipeline_data_integrity(
runtime, goal, llm_provider, tool_registry, tmp_path
runtime,
goal,
llm_provider,
tool_registry,
tmp_path,
artifact,
):
"""Data produced by node 1 must arrive at node 2 via input_mapping, verified end-to-end."""
"""Data from node 1 must arrive at node 2, verified end-to-end."""
bus, capture = _make_event_bus_and_capture()
graph = GraphSpec(
@@ -316,28 +522,30 @@ async def test_worker_pipeline_data_integrity(
NodeSpec(
id="producer",
name="Producer",
description="Produces a timestamped value using a real tool",
description="Produces a timestamped value",
node_type="event_loop",
output_keys=["payload"],
tools=["get_current_time"],
system_prompt=(
"Call get_current_time with timezone='UTC'. "
"Extract the 'date' field from the result. "
"Call set_output with key='payload' and the date string as value. "
+ SET_OUTPUT
"Call set_output with key='payload' and the "
"date string as value. " + SET_OUTPUT
),
),
NodeSpec(
id="consumer",
name="Consumer",
description="Verifies received data contains a date",
description="Verifies received data",
node_type="event_loop",
input_keys=["data"],
output_keys=["result"],
system_prompt=(
"Read the 'data' input. It should contain a date string. "
"Call set_output with key='result' and value='VERIFIED|' followed by "
"the first 10 characters of the data input. " + SET_OUTPUT
"Read the 'data' input. It should contain a "
"date string. Call set_output with "
"key='result' and value='VERIFIED|' followed "
"by the first 10 characters of the data "
"input. " + SET_OUTPUT
),
),
],
@@ -353,44 +561,127 @@ async def test_worker_pipeline_data_integrity(
memory_keys=["payload", "data", "result"],
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=(
"success=True, clean, "
"path=['producer','consumer'], steps=2, "
"output starts with VERIFIED|"
),
)
# Strict outcome verification
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"clean success",
result.is_clean_success,
actual=str(result.execution_quality),
expected_val="clean",
)
assert result.is_clean_success, f"quality={result.execution_quality}"
artifact.check(
"path matches",
result.path == ["producer", "consumer"],
actual=str(result.path),
expected_val="['producer', 'consumer']",
)
assert result.path == ["producer", "consumer"]
artifact.check(
"steps_executed is 2",
result.steps_executed == 2,
actual=str(result.steps_executed),
expected_val="2",
)
assert result.steps_executed == 2
# Output must be present and correctly structured
output = result.output.get("result")
artifact.check(
"consumer set 'result'",
output is not None,
actual=repr(output),
expected_val="non-None value",
)
assert output is not None, "Consumer did not set 'result'"
artifact.check(
"output starts with VERIFIED|",
output.startswith("VERIFIED|"),
actual=repr(output),
expected_val="starts with 'VERIFIED|'",
)
assert output.startswith("VERIFIED|"), f"Expected VERIFIED|..., got: {output}"
# Token counts should be reasonable (not zero, not astronomical)
artifact.check(
"total_tokens > 0",
result.total_tokens > 0,
actual=str(result.total_tokens),
expected_val=">0",
)
assert result.total_tokens > 0
artifact.check(
"total_tokens < 100000",
result.total_tokens < 100_000,
actual=str(result.total_tokens),
expected_val="<100000",
)
assert result.total_tokens < 100_000, f"Unexpectedly high tokens: {result.total_tokens}"
# Both nodes should have set their output keys
keys_set = capture.output_keys_set()
artifact.check(
"producer set 'payload'",
"payload" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'payload'",
)
assert "payload" in keys_set, "Producer didn't set 'payload'"
artifact.check(
"consumer set 'result' key",
"result" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'result'",
)
assert "result" in keys_set, "Consumer didn't set 'result'"
# get_current_time must have been called (in producer)
assert "get_current_time" in capture.tool_names_called()
tool_names = capture.tool_names_called()
artifact.check(
"get_current_time called",
"get_current_time" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'get_current_time'",
)
assert "get_current_time" in tool_names
@pytest.mark.asyncio
async def test_worker_multi_node_output_propagation(
runtime, goal, llm_provider, tmp_path
runtime, goal, llm_provider, tmp_path, artifact
):
"""Data from node A's output must arrive at node B and be reflected in final output."""
"""Data from node A must arrive at node B in final output."""
bus, capture = _make_event_bus_and_capture()
graph = GraphSpec(
@@ -408,22 +699,25 @@ async def test_worker_multi_node_output_propagation(
node_type="event_loop",
output_keys=["code"],
system_prompt=(
"Call set_output with key='code' and value='ALPHA_BRAVO_42'. "
"Call set_output with key='code' and "
"value='ALPHA_BRAVO_42'. "
"Do not write any text." + SET_OUTPUT
),
),
NodeSpec(
id="formatter",
name="Formatter",
description="Wraps received code in brackets",
description="Wraps code in brackets",
node_type="event_loop",
input_keys=["raw_code"],
output_keys=["result"],
system_prompt=(
"Read the 'raw_code' input value. "
"Call set_output with key='result' and value='[' followed by "
"the raw_code value followed by ']'. "
"Example: if raw_code is 'XYZ', output should be '[XYZ]'. " + SET_OUTPUT
"Call set_output with key='result' and "
"value='[' followed by the raw_code value "
"followed by ']'. "
"Example: if raw_code is 'XYZ', output "
"should be '[XYZ]'. " + SET_OUTPUT
),
),
],
@@ -439,44 +733,110 @@ async def test_worker_multi_node_output_propagation(
memory_keys=["code", "raw_code", "result"],
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=(
"success=True, "
"path=['generator','formatter'], steps=2, "
"output contains [ALPHA_BRAVO_42]"
),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["generator", "formatter"],
actual=str(result.path),
expected_val="['generator', 'formatter']",
)
assert result.path == ["generator", "formatter"]
artifact.check(
"steps_executed is 2",
result.steps_executed == 2,
actual=str(result.steps_executed),
expected_val="2",
)
assert result.steps_executed == 2
# Verify output structure
output = result.output.get("result")
artifact.check(
"formatter set 'result'",
output is not None,
actual=repr(output),
expected_val="non-None value",
)
assert output is not None, "Formatter did not set 'result'"
assert "[" in output and "]" in output, f"Expected bracket wrapping, got: {output}"
has_brackets = "[" in output and "]" in output
artifact.check(
"output has bracket wrapping",
has_brackets,
actual=repr(output),
expected_val="contains '[' and ']'",
)
assert has_brackets, f"Expected bracket wrapping, got: {output}"
artifact.check(
"output contains ALPHA_BRAVO_42",
"ALPHA_BRAVO_42" in output,
actual=repr(output),
expected_val="contains 'ALPHA_BRAVO_42'",
)
assert "ALPHA_BRAVO_42" in output, f"Code word missing from output: {output}"
# Both nodes should have set their output keys
keys_set = capture.output_keys_set()
artifact.check(
"'code' in keys_set",
"code" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'code'",
)
assert "code" in keys_set
artifact.check(
"'result' in keys_set",
"result" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'result'",
)
assert "result" in keys_set
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
# Tests: Escalation Event Flow
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
@pytest.mark.asyncio
async def test_worker_escalation_emits_event_with_reason(
runtime, goal, llm_provider, tmp_path
runtime, goal, llm_provider, tmp_path, artifact
):
"""Worker calling escalate must emit ESCALATION_REQUESTED with the reason.
"""Worker calling escalate must emit ESCALATION_REQUESTED.
After calling escalate, the worker blocks waiting for queen input.
Since there's no queen in this test, we run with a short timeout and
verify the escalation event was emitted before the timeout.
After calling escalate, the worker blocks waiting for queen
input. Since there's no queen in this test, we run with a
short timeout and verify the escalation event was emitted.
"""
bus, capture = _make_event_bus_and_capture()
@@ -495,8 +855,10 @@ async def test_worker_escalation_emits_event_with_reason(
output_keys=["result"],
system_prompt=(
"You are blocked and need human help. "
"Call the escalate tool with reason='missing credentials for API'. "
"Do not call set_output. Do not write any text first."
"Call the escalate tool with "
"reason='missing credentials for API'. "
"Do not call set_output. "
"Do not write any text first."
),
),
],
@@ -515,24 +877,59 @@ async def test_worker_escalation_emits_event_with_reason(
stream_id="worker",
)
# Worker will block after escalate (waiting for queen).
# Use a short timeout — we only need the escalation event to fire.
try:
await asyncio.wait_for(
executor.execute(graph, goal, {}, validate_graph=False),
executor.execute(
graph,
goal,
{},
validate_graph=False,
),
timeout=30,
)
except (TimeoutError, asyncio.TimeoutError):
pass # Expected: worker hangs waiting for queen input
pass # Expected: worker hangs waiting for queen
# Verify escalation event was emitted before the timeout
escalations = capture.of_type(EventType.ESCALATION_REQUESTED)
assert len(escalations) >= 1, (
f"No ESCALATION_REQUESTED event emitted. "
f"Events captured: {[e.type.value for e in capture.events]}"
all_types = [e.type.value for e in capture.events]
artifact.record_value(
"escalation_count",
len(escalations),
expected=(">=1 ESCALATION_REQUESTED with non-empty reason, stream_id='worker'"),
)
artifact.record_value("all_event_types", all_types)
artifact.check(
"escalation event emitted",
len(escalations) >= 1,
actual=str(len(escalations)),
expected_val=">=1",
)
assert len(escalations) >= 1, f"No ESCALATION_REQUESTED event emitted. Events: {all_types}"
esc_data = escalations[0].data
assert esc_data.get("reason"), "Escalation reason should not be empty"
reason = esc_data.get("reason", "")
artifact.check(
"reason is non-empty",
bool(reason),
actual=repr(reason),
expected_val="non-empty string",
)
assert esc_data.get("reason"), "Escalation reason empty"
artifact.check(
"stream_id is 'worker'",
escalations[0].stream_id == "worker",
actual=repr(escalations[0].stream_id),
expected_val="'worker'",
)
assert escalations[0].stream_id == "worker"
artifact.check(
"node_id is 'worker'",
escalations[0].node_id == "worker",
actual=repr(escalations[0].node_id),
expected_val="'worker'",
)
assert escalations[0].node_id == "worker"