feat: verified testing

Timothy
2026-04-03 13:00:49 -07:00
parent 674454cc5b
commit 8f56b8b068
15 changed files with 4452 additions and 314 deletions
+225
@@ -0,0 +1,225 @@
# Integration Test Reporting Skill
Run the Level 2 dummy agent integration test suite and produce a detailed HTML report with per-test input → outcome analysis.
## Trigger
User wants to run integration tests and see results:
- `/test-reporting`
- `/test-reporting test_component_queen_live.py`
- `/test-reporting --all`
## SOP: Running Tests
### Step 1: Select Scope
If the user provides a specific test file or pattern, use it. Otherwise run the full suite.
```bash
# Full suite
cd core && echo "1" | uv run python tests/dummy_agents/run_all.py --interactive 2>&1
# Specific file (requires manual provider setup)
cd core && uv run python -c "
import sys
sys.path.insert(0, '.')
from tests.dummy_agents.run_all import detect_available
from tests.dummy_agents.conftest import set_llm_selection

avail = detect_available()
claude = [p for p in avail if 'Claude Code' in p['name']]
if not claude:
    avail_names = [p['name'] for p in avail]
    raise RuntimeError(f'No Claude Code subscription. Available: {avail_names}')
provider = claude[0]
set_llm_selection(
    model=provider['model'],
    api_key=provider['api_key'],
    extra_headers=provider.get('extra_headers'),
    api_base=provider.get('api_base'),
)

import pytest
sys.exit(pytest.main([
    'tests/dummy_agents/TEST_FILE_HERE',
    '-v', '--override-ini=asyncio_mode=auto', '--no-header', '--tb=long',
    '--log-cli-level=WARNING', '--junitxml=/tmp/hive_test_results.xml',
]))
"
```
### Step 2: Collect Results
After the test run completes, collect the following (a parsing sketch follows this list):
1. **JUnit XML** from `--junitxml` output (if available)
2. **stdout/stderr** from the run
3. **Summary table** from `run_all.py` output (the Unicode table)
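A minimal sketch of this step, assuming the standard pytest JUnit schema and the `/tmp/hive_test_results.xml` path used above; `collect_results` and the row-dict keys are illustrative names, not part of the suite:
```python
import xml.etree.ElementTree as ET

def collect_results(xml_path: str = "/tmp/hive_test_results.xml") -> list[dict]:
    """Parse pytest's JUnit XML into row dicts for the report."""
    root = ET.parse(xml_path).getroot()
    # pytest wraps results in <testsuites><testsuite>; older versions emit <testsuite> directly
    suite = root if root.tag == "testsuite" else root.find("testsuite")
    rows = []
    for case in suite.iter("testcase"):
        status, detail = "PASS", ""
        for tag in ("failure", "error", "skipped"):
            node = case.find(tag)
            if node is not None:
                status = {"failure": "FAIL", "error": "ERROR", "skipped": "SKIP"}[tag]
                detail = (node.get("message") or "") + "\n" + (node.text or "")
        rows.append({
            "component": case.get("classname", "").split(".")[-1],  # e.g. test_component_edges
            "test_name": case.get("name"),
            "duration": float(case.get("time", 0)),
            "status": status,
            "detail": detail.strip(),
        })
    return rows
```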
### Step 3: Generate HTML Report
Write the report to `/tmp/hive_integration_test_report.html`.
The report MUST include these sections:
#### Header
- Run timestamp (ISO 8601)
- Provider used (model name, source)
- Total tests / passed / failed / skipped
- Total wall-clock time
- Overall verdict: PASS (all green) or FAIL (with count)
#### Per-Test Table
For EVERY test (not just failures), include a row with:
| Column | Description |
|--------|-------------|
| Component | Test file grouping (e.g., `component_queen_live`) |
| Test Name | Function name (e.g., `test_queen_starts_in_planning_without_worker`) |
| Status | PASS / FAIL / SKIP / ERROR with color badge |
| Duration | Wall-clock seconds |
| What | One-line description of what the test verifies |
| How | How it works (setup → action → assertion) |
| Why | Why this test matters (what bug/behavior it catches) |
| Input | The input data or configuration (graph spec, initial prompt, phase, etc.) |
| Expected Outcome | What the test asserts |
| Actual Outcome | What actually happened (PASS: matches expected / FAIL: actual vs expected) |
| Failure Detail | For failures only: full traceback + diagnosis |
#### What / How / Why Descriptions
These MUST be derived from the test function's docstring and code. Read each test file to extract:
- **What**: From the docstring first line
- **How**: From the test body (what fixtures, what graph, what assertions)
- **Why**: From the docstring body or "Why this matters" section in the test module
Use these mappings for the component test files (a docstring-extraction sketch follows the block):
```
test_component_llm.py → "LLM Provider" — streaming, tool calling, tokens
test_component_tools.py → "Tool Registry + MCP" — connection, execution
test_component_event_loop.py → "EventLoopNode" — iteration, output, stall
test_component_edges.py → "Edge Evaluation" — conditional, priority
test_component_conversation.py → "Conversation Persistence" — storage, cursor
test_component_escalation.py → "Escalation Flow" — worker→queen signaling
test_component_continuous.py → "Continuous Mode" — conversation threading
test_component_queen.py → "Queen Phase (Unit)" — phase state, tools, events
test_component_queen_live.py → "Queen Phase (Live)" — real queen, real LLM
test_component_queen_state_machine.py → "Queen State Machine" — edge cases, races
test_component_worker_comms.py → "Worker Communication" — events, data flow
test_component_strict_outcomes.py → "Strict Outcomes" — exact path, output, quality
```
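To populate What/Why without importing the test modules, the docstrings can be read statically with `ast`. A sketch, assuming the docstring conventions described above:
```python
import ast
from pathlib import Path

def extract_what_why(test_file: str) -> dict[str, tuple[str, str]]:
    """Map test function name to (what, why) from docstrings, without importing."""
    tree = ast.parse(Path(test_file).read_text())
    info = {}
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("test_"):
            doc = ast.get_docstring(node) or ""
            first, _, rest = doc.partition("\n")  # What = first line, Why = remainder
            info[node.name] = (first.strip(), " ".join(rest.split()))
    return info
```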
#### HTML Template
Use this structure:
```html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Hive Integration Test Report — {timestamp}</title>
<style>
:root { --pass: #22c55e; --fail: #ef4444; --skip: #f59e0b; --bg: #0f172a; --surface: #1e293b; --text: #e2e8f0; --muted: #94a3b8; --border: #334155; }
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: 'SF Mono', 'Fira Code', monospace; background: var(--bg); color: var(--text); padding: 2rem; line-height: 1.6; }
h1, h2, h3 { font-weight: 600; }
h1 { font-size: 1.5rem; margin-bottom: 1rem; }
h2 { font-size: 1.2rem; margin: 2rem 0 1rem; border-bottom: 1px solid var(--border); padding-bottom: 0.5rem; }
.summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
.card { background: var(--surface); padding: 1rem; border-radius: 8px; border: 1px solid var(--border); }
.card .label { color: var(--muted); font-size: 0.75rem; text-transform: uppercase; }
.card .value { font-size: 1.5rem; font-weight: 700; margin-top: 0.25rem; }
.card .value.pass { color: var(--pass); }
.card .value.fail { color: var(--fail); }
table { width: 100%; border-collapse: collapse; font-size: 0.8rem; }
th { background: var(--surface); position: sticky; top: 0; text-align: left; padding: 0.5rem; border-bottom: 2px solid var(--border); color: var(--muted); text-transform: uppercase; font-size: 0.7rem; }
td { padding: 0.5rem; border-bottom: 1px solid var(--border); vertical-align: top; }
tr:hover { background: rgba(255,255,255,0.03); }
.badge { display: inline-block; padding: 2px 8px; border-radius: 4px; font-size: 0.7rem; font-weight: 700; }
.badge.pass { background: rgba(34,197,94,0.2); color: var(--pass); }
.badge.fail { background: rgba(239,68,68,0.2); color: var(--fail); }
.badge.skip { background: rgba(245,158,11,0.2); color: var(--skip); }
.detail { background: #1a1a2e; padding: 0.75rem; border-radius: 4px; margin-top: 0.5rem; font-size: 0.75rem; white-space: pre-wrap; overflow-x: auto; max-height: 200px; overflow-y: auto; }
.component-header { background: var(--surface); padding: 0.75rem 0.5rem; font-weight: 600; font-size: 0.85rem; }
.meta { color: var(--muted); font-size: 0.75rem; }
</style>
</head>
<body>
<h1>Hive Integration Test Report</h1>
<p class="meta">Generated: {timestamp} | Provider: {provider} | Duration: {duration}s</p>
<div class="summary">
<div class="card"><div class="label">Total</div><div class="value">{total}</div></div>
<div class="card"><div class="label">Passed</div><div class="value pass">{passed}</div></div>
<div class="card"><div class="label">Failed</div><div class="value fail">{failed}</div></div>
<div class="card"><div class="label">Verdict</div><div class="value {verdict_class}">{verdict}</div></div>
</div>
<h2>Test Results</h2>
<table>
<thead>
<tr>
<th>Component</th>
<th>Test</th>
<th>Status</th>
<th>Time</th>
<th>What</th>
<th>Input → Expected → Actual</th>
</tr>
</thead>
<tbody>
<!-- For each test: -->
<tr>
<td>{component}</td>
<td>{test_name}</td>
<td><span class="badge {status_class}">{status}</span></td>
<td>{duration}s</td>
<td>{what_description}</td>
<td>
<strong>Input:</strong> {input_description}<br>
<strong>Expected:</strong> {expected_outcome}<br>
<strong>Actual:</strong> {actual_outcome}
<!-- If failed: -->
<div class="detail">{failure_traceback}</div>
</td>
</tr>
</tbody>
</table>
<h2>Failure Analysis</h2>
<!-- Only if there are failures -->
<p>For each failure, provide:</p>
<ul>
<li><strong>Root cause:</strong> Why it failed</li>
<li><strong>Impact:</strong> What this means for the system</li>
<li><strong>Suggested fix:</strong> How to address it</li>
</ul>
</body>
</html>
```
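Note that the CSS above contains literal braces, so `str.format` over the whole template would raise; substituting `{placeholder}` tokens with a per-key `replace` is safer. A sketch of the summary fill, where `fill` and `render_summary` are illustrative helpers and the per-test rows are substituted the same way:
```python
from datetime import datetime, timezone

def fill(template: str, values: dict) -> str:
    """Substitute {key} tokens one by one, leaving CSS braces intact."""
    for key, val in values.items():
        template = template.replace("{" + key + "}", str(val))
    return template

def render_summary(template: str, rows: list[dict], provider: str, duration: float) -> str:
    passed = sum(r["status"] == "PASS" for r in rows)
    failed = sum(r["status"] == "FAIL" for r in rows)
    return fill(template, {
        "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "provider": provider,
        "duration": f"{duration:.1f}",
        "total": len(rows),
        "passed": passed,
        "failed": failed,
        "verdict": "PASS" if failed == 0 else f"FAIL ({failed})",
        "verdict_class": "pass" if failed == 0 else "fail",
    })
```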
### Step 4: Output
1. Write the HTML file to `/tmp/hive_integration_test_report.html`
2. Print the file path so the user can open it
3. Print a concise summary to the terminal:
```
Test Report: /tmp/hive_integration_test_report.html
Result: 74/76 PASSED (2 failures)
Failures:
- parallel_merge::test_parallel_disjoint_output_keys
- worker::test_worker_timestamped_note_artifact
```
## Key Rules
1. ALWAYS use `--junitxml` when running pytest to get structured results
2. ALWAYS read the test source files to populate What/How/Why columns — do not guess
3. For Input/Expected/Actual, extract from the test's graph spec, assertions, and result
4. Color-code everything: green for pass, red for fail, amber for skip
5. Include the full traceback for failures in a scrollable `<div class="detail">`
6. Group tests by component (file name) with a visual separator (see the sketch after this list)
7. The report must be self-contained HTML (no external CSS/JS dependencies)
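For rule 6, grouping reduces to a sort plus one `component-header` row per file. A sketch over the row dicts from Step 2:
```python
from itertools import groupby

def grouped(rows: list[dict]):
    """Yield (component, tests) pairs so each group gets one separator <tr>."""
    ordered = sorted(rows, key=lambda r: r["component"])
    for component, tests in groupby(ordered, key=lambda r: r["component"]):
        yield component, list(tests)
```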
+128
@@ -7,6 +7,7 @@ Run via: cd core && uv run python tests/dummy_agents/run_all.py
from __future__ import annotations
import asyncio
import json
import os
from pathlib import Path
@@ -202,3 +203,130 @@ def make_executor(
    executor.execute = execute_with_timeout  # type: ignore[method-assign]
    return executor


# ── Artifact capture: raw output written to disk for every test ──────

ARTIFACTS_DIR = Path("/tmp/hive_test_artifacts")


class TestArtifact:
    """Collects raw output + expected behavior for a single test.

    Usage in tests:
        def test_foo(artifact, ...):
            result = await executor.execute(...)
            artifact.record(result, expected="path == ['a','b'], output['x'] == 'hello'")
    """

    def __init__(self, test_id: str):
        self.test_id = test_id
        self._data: dict = {"test_id": test_id, "raw_output": None, "expected": "", "checks": []}

    def record(self, result, *, expected: str = ""):
        """Record an ExecutionResult with expected behavior description."""
        self._data["expected"] = expected
        if result is None:
            self._data["raw_output"] = None
            return
        self._data["raw_output"] = {
            "success": getattr(result, "success", None),
            "output": _safe_serialize(getattr(result, "output", {})),
            "error": getattr(result, "error", None),
            "path": getattr(result, "path", []),
            "steps_executed": getattr(result, "steps_executed", 0),
            "total_tokens": getattr(result, "total_tokens", 0),
            "total_latency_ms": getattr(result, "total_latency_ms", 0),
            "execution_quality": getattr(result, "execution_quality", ""),
            "total_retries": getattr(result, "total_retries", 0),
            "node_visit_counts": getattr(result, "node_visit_counts", {}),
            "nodes_with_failures": getattr(result, "nodes_with_failures", []),
            "session_state_buffer": _safe_serialize(
                (getattr(result, "session_state", {}) or {}).get("data_buffer", {})
            ),
        }

    def record_value(self, key: str, value, *, expected: str = ""):
        """Record an arbitrary key-value (for non-ExecutionResult tests)."""
        self._data.setdefault("values", {})[key] = _safe_serialize(value)
        if expected:
            self._data["expected"] = expected

    def check(self, description: str, passed: bool, actual: str = "", expected_val: str = ""):
        """Record an individual assertion check."""
        self._data["checks"].append({
            "description": description,
            "passed": passed,
            "actual": actual,
            "expected": expected_val,
        })

    def save(self):
        """Write artifact to disk."""
        ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
        safe_name = self.test_id.replace("::", "__").replace("/", "_")
        path = ARTIFACTS_DIR / f"{safe_name}.json"
        with open(path, "w") as f:
            json.dump(self._data, f, indent=2, default=str)


def _safe_serialize(obj):
    """Convert to JSON-safe types."""
    if obj is None:
        return None
    if isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {str(k): _safe_serialize(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_safe_serialize(v) for v in obj]
    return str(obj)[:500]


@pytest.fixture
def artifact(request):
    """Fixture that captures raw test output to disk.

    Every test gets an artifact recorder. Call artifact.record(result)
    and artifact.check("description", passed, actual, expected) to
    capture data. Saved automatically on teardown.
    """
    test_id = request.node.nodeid
    art = TestArtifact(test_id)
    yield art
    art.save()


# Autouse hook: for tests that DON'T use the artifact fixture,
# create a minimal artifact from pass/fail status.
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    outcome = yield
    rep = outcome.get_result()
    if rep.when == "call":
        item._test_report = rep


def pytest_runtest_teardown(item, nextitem):
    """Auto-save a minimal artifact for tests that didn't use the fixture."""
    report = getattr(item, "_test_report", None)
    if report is None:
        return
    # Check if the test already used the artifact fixture
    if "artifact" in item.fixturenames:
        return  # Already handled by fixture teardown
    ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
    safe_name = item.nodeid.replace("::", "__").replace("/", "_")
    path = ARTIFACTS_DIR / f"{safe_name}.json"
    data = {
        "test_id": item.nodeid,
        "raw_output": None,
        "expected": "",
        "checks": [],
        "auto_captured": True,
        "status": "PASS" if report.passed else ("FAIL" if report.failed else "SKIP"),
    }
    if report.failed and report.longreprtext:
        data["failure_text"] = report.longreprtext[:5000]
    with open(path, "w") as f:
        json.dump(data, f, indent=2, default=str)
@@ -1,4 +1,4 @@
"""Component tests: Continuous Conversation Mode — threading, buffer passing.
"""Component tests: Continuous Conversation Mode — threading, buffer.
Exercises conversation threading across nodes to verify that downstream
nodes receive context from upstream nodes in continuous mode.
@@ -15,12 +15,15 @@ from .conftest import make_executor
SET_OUTPUT_INSTRUCTION = (
"You MUST call the set_output tool to provide your answer. "
"Do not just write text — call set_output with the correct key and value."
"Do not just write text — call set_output with the correct "
"key and value."
)
def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:
"""Two-node pipeline: intake captures input, transform uppercases it."""
def _build_pipeline_graph(
conversation_mode: str = "continuous",
) -> GraphSpec:
"""Two-node pipeline: intake captures, transform uppercases."""
return GraphSpec(
id="continuous-pipeline",
goal_id="dummy",
@@ -37,8 +40,9 @@ def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:
input_keys=["raw"],
output_keys=["captured"],
system_prompt=(
"Read the 'raw' input value and call set_output with "
"key='captured' and the same value. " + SET_OUTPUT_INSTRUCTION
"Read the 'raw' input value and call "
"set_output with key='captured' and the "
"same value. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -49,9 +53,9 @@ def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:
input_keys=["value"],
output_keys=["result"],
system_prompt=(
"Read the 'value' input, convert it to UPPERCASE, "
"then call set_output with key='result' and the uppercased value. "
+ SET_OUTPUT_INSTRUCTION
"Read the 'value' input, convert it to "
"UPPERCASE, then call set_output with "
"key='result' and the uppercased value. " + SET_OUTPUT_INSTRUCTION
),
),
],
@@ -69,53 +73,141 @@ def _build_pipeline_graph(conversation_mode: str = "continuous") -> GraphSpec:
@pytest.mark.asyncio
async def test_continuous_pipeline_traverses(runtime, goal, llm_provider):
async def test_continuous_pipeline_traverses(runtime, goal, llm_provider, artifact):
"""Continuous mode pipeline should traverse both nodes."""
graph = _build_pipeline_graph(conversation_mode="continuous")
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(
graph, goal, {"raw": "hello"}, validate_graph=False
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 5},
)
result = await executor.execute(
graph,
goal,
{"raw": "hello"},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, path=['intake','transform'], output['result'] is set"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["intake", "transform"],
actual=str(result.path),
expected_val="['intake', 'transform']",
)
assert result.path == ["intake", "transform"]
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
@pytest.mark.asyncio
async def test_continuous_data_flows_through(runtime, goal, llm_provider):
"""Data from node 1's output should be available to node 2 via input_mapping."""
async def test_continuous_data_flows_through(runtime, goal, llm_provider, artifact):
"""Data from node 1's output should be available to node 2."""
graph = _build_pipeline_graph(conversation_mode="continuous")
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(
graph, goal, {"raw": "test_data"}, validate_graph=False
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 5},
)
result = await executor.execute(
graph,
goal,
{"raw": "test_data"},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, output['result'] is non-empty",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
# The transform node should have produced something based on the input
output_len = len(str(result.output["result"]))
artifact.check(
"output is non-empty",
output_len > 0,
actual=str(output_len),
expected_val=">0",
)
assert len(str(result.output["result"])) > 0
@pytest.mark.asyncio
async def test_isolated_pipeline_traverses(runtime, goal, llm_provider):
async def test_isolated_pipeline_traverses(runtime, goal, llm_provider, artifact):
"""Isolated mode pipeline should also traverse both nodes."""
graph = _build_pipeline_graph(conversation_mode="isolated")
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(
graph, goal, {"raw": "data"}, validate_graph=False
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 5},
)
result = await executor.execute(
graph,
goal,
{"raw": "data"},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, path=['intake','transform']",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["intake", "transform"],
actual=str(result.path),
expected_val="['intake', 'transform']",
)
assert result.path == ["intake", "transform"]
@pytest.mark.asyncio
async def test_continuous_three_node_chain(runtime, goal, llm_provider):
"""Three-node continuous pipeline should thread conversation end-to-end."""
async def test_continuous_three_node_chain(runtime, goal, llm_provider, artifact):
"""Three-node continuous pipeline should thread end-to-end."""
graph = GraphSpec(
id="three-node-chain",
goal_id="dummy",
@@ -132,8 +224,8 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
input_keys=["input"],
output_keys=["a_out"],
system_prompt=(
"Read the 'input' value and call set_output with "
"key='a_out' and the same value. " + SET_OUTPUT_INSTRUCTION
"Read the 'input' value and call set_output "
"with key='a_out' and the same value. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -144,9 +236,9 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
input_keys=["b_in"],
output_keys=["b_out"],
system_prompt=(
"Read the 'b_in' value and call set_output with "
"key='b_out' and value='processed_' followed by the input. "
+ SET_OUTPUT_INSTRUCTION
"Read the 'b_in' value and call set_output "
"with key='b_out' and value='processed_' "
"followed by the input. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -157,8 +249,8 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
input_keys=["c_in"],
output_keys=["result"],
system_prompt=(
"Read the 'c_in' value and call set_output with "
"key='result' and the same value. " + SET_OUTPUT_INSTRUCTION
"Read the 'c_in' value and call set_output "
"with key='result' and the same value. " + SET_OUTPUT_INSTRUCTION
),
),
],
@@ -178,14 +270,60 @@ async def test_continuous_three_node_chain(runtime, goal, llm_provider):
input_mapping={"c_in": "b_out"},
),
],
memory_keys=["input", "a_out", "b_in", "b_out", "c_in", "result"],
memory_keys=[
"input",
"a_out",
"b_in",
"b_out",
"c_in",
"result",
],
)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 5},
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(
graph, goal, {"input": "payload"}, validate_graph=False
graph,
goal,
{"input": "payload"},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, path=['a','b','c'], steps=3, output['result'] is set"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["a", "b", "c"],
actual=str(result.path),
expected_val="['a', 'b', 'c']",
)
assert result.path == ["a", "b", "c"]
artifact.check(
"steps_executed is 3",
result.steps_executed == 3,
actual=str(result.steps_executed),
expected_val="3",
)
assert result.steps_executed == 3
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
@@ -1,7 +1,7 @@
"""Component tests: Conversation Persistence — write-through, cursor, storage.
"""Component tests: Conversation Persistence — write-through, storage.
Exercises conversation persistence by running real LLM turns and verifying
that messages and state are written to disk correctly.
Exercises conversation persistence by running real LLM turns and
verifying that messages and state are written to disk correctly.
"""
from __future__ import annotations
@@ -31,8 +31,9 @@ def _build_echo_graph() -> GraphSpec:
input_keys=["input"],
output_keys=["output"],
system_prompt=(
"Read the 'input' value and immediately call set_output "
"with key='output' and the same value. Do not add any text."
"Read the 'input' value and immediately call "
"set_output with key='output' and the same "
"value. Do not add any text."
),
),
],
@@ -43,48 +44,113 @@ def _build_echo_graph() -> GraphSpec:
@pytest.mark.asyncio
async def test_conversation_persists_messages(runtime, goal, llm_provider, tmp_path):
async def test_conversation_persists_messages(runtime, goal, llm_provider, tmp_path, artifact):
"""After execution, conversation data should exist on disk."""
storage = tmp_path / "session"
graph = _build_echo_graph()
executor = make_executor(runtime, llm_provider, storage_path=storage)
executor = make_executor(
runtime,
llm_provider,
storage_path=storage,
)
result = await executor.execute(
graph, goal, {"input": "hello"}, validate_graph=False
graph,
goal,
{"input": "hello"},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, conversations/ dir exists with data files"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
# Verify conversation directory was created with content
conv_dir = storage / "conversations"
artifact.check(
"conversations/ dir exists",
conv_dir.exists(),
actual=str(conv_dir.exists()),
expected_val="True",
)
assert conv_dir.exists(), "conversations/ directory should exist"
# Should have at least one file (messages or cursor)
all_files = list(conv_dir.rglob("*"))
data_files = [f for f in all_files if f.is_file()]
artifact.check(
"at least one data file",
len(data_files) > 0,
actual=str(len(data_files)),
expected_val=">0",
)
assert len(data_files) > 0, "Should have persisted at least one conversation file"
@pytest.mark.asyncio
async def test_conversation_output_matches_execution(
runtime, goal, llm_provider, tmp_path
runtime, goal, llm_provider, tmp_path, artifact
):
"""ExecutionResult output should be consistent with what the node produced."""
"""ExecutionResult output should be consistent with the node."""
storage = tmp_path / "session"
graph = _build_echo_graph()
executor = make_executor(runtime, llm_provider, storage_path=storage)
executor = make_executor(
runtime,
llm_provider,
storage_path=storage,
)
result = await executor.execute(
graph, goal, {"input": "test_value"}, validate_graph=False
graph,
goal,
{"input": "test_value"},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, output['output'] is non-empty",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("output")
artifact.check(
"output['output'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("output") is not None
# The echo node should produce some non-empty output
output_len = len(str(result.output["output"]))
artifact.check(
"output is non-empty",
output_len > 0,
actual=str(output_len),
expected_val=">0",
)
assert len(str(result.output["output"])) > 0
@pytest.mark.asyncio
async def test_conversation_multi_node_persistence(
runtime, goal, llm_provider, tmp_path
):
async def test_conversation_multi_node_persistence(runtime, goal, llm_provider, tmp_path, artifact):
"""Multi-node graph should persist conversation data for each node."""
from framework.graph.edge import EdgeCondition, EdgeSpec
@@ -104,8 +170,8 @@ async def test_conversation_multi_node_persistence(
node_type="event_loop",
output_keys=["intermediate"],
system_prompt=(
"Call set_output with key='intermediate' and value='step1_done'. "
"Do not write text."
"Call set_output with key='intermediate' "
"and value='step1_done'. Do not write text."
),
),
NodeSpec(
@@ -116,8 +182,7 @@ async def test_conversation_multi_node_persistence(
input_keys=["intermediate"],
output_keys=["result"],
system_prompt=(
"Call set_output with key='result' and value='step2_done'. "
"Do not write text."
"Call set_output with key='result' and value='step2_done'. Do not write text."
),
),
],
@@ -132,12 +197,45 @@ async def test_conversation_multi_node_persistence(
],
memory_keys=["intermediate", "result"],
)
executor = make_executor(runtime, llm_provider, storage_path=storage)
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
storage_path=storage,
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, path=['step1','step2'], conversations/ dir exists"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["step1", "step2"],
actual=str(result.path),
expected_val="['step1', 'step2']",
)
assert result.path == ["step1", "step2"]
# Both nodes should have written conversation data
conv_dir = storage / "conversations"
artifact.check(
"conversations/ dir exists",
conv_dir.exists(),
actual=str(conv_dir.exists()),
expected_val="True",
)
assert conv_dir.exists()
+97 -23
@@ -15,12 +15,13 @@ from .conftest import make_executor
SET_OUTPUT_INSTRUCTION = (
"You MUST call the set_output tool to provide your answer. "
"Do not just write text — call set_output with the correct key and value."
"Do not just write text — call set_output with the correct "
"key and value."
)
@pytest.mark.asyncio
async def test_edge_conditional_true_path(runtime, goal, llm_provider):
async def test_edge_conditional_true_path(runtime, goal, llm_provider, artifact):
"""Conditional edge with True expression should be traversed."""
graph = GraphSpec(
id="cond-true",
@@ -37,8 +38,7 @@ async def test_edge_conditional_true_path(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["label"],
system_prompt=(
"Call set_output with key='label' and value='yes'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='label' and value='yes'. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -48,8 +48,8 @@ async def test_edge_conditional_true_path(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"Call set_output with key='result' and value='reached'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='result' and "
"value='reached'. " + SET_OUTPUT_INSTRUCTION
),
),
],
@@ -64,15 +64,41 @@ async def test_edge_conditional_true_path(runtime, goal, llm_provider):
],
memory_keys=["label", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 3},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, path=['source','target']",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["source", "target"],
actual=str(result.path),
expected_val="['source', 'target']",
)
assert result.path == ["source", "target"]
@pytest.mark.asyncio
async def test_edge_conditional_false_path(runtime, goal, llm_provider):
async def test_edge_conditional_false_path(runtime, goal, llm_provider, artifact):
"""Conditional edge with False expression should NOT be traversed."""
graph = GraphSpec(
id="cond-false",
@@ -89,8 +115,7 @@ async def test_edge_conditional_false_path(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["label"],
system_prompt=(
"Call set_output with key='label' and value='no'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='label' and value='no'. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -99,7 +124,7 @@ async def test_edge_conditional_false_path(runtime, goal, llm_provider):
description="Should not be reached",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='bad'.",
system_prompt=("Call set_output with key='result' and value='bad'."),
),
],
edges=[
@@ -113,15 +138,41 @@ async def test_edge_conditional_false_path(runtime, goal, llm_provider):
],
memory_keys=["label", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 3},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, 'target' not in path",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"target not in path",
"target" not in result.path,
actual=str(result.path),
expected_val="path without 'target'",
)
assert "target" not in result.path
@pytest.mark.asyncio
async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
async def test_edge_priority_selects_higher(runtime, goal, llm_provider, artifact):
"""When multiple conditional edges match, higher priority wins."""
graph = GraphSpec(
id="priority-test",
@@ -138,8 +189,7 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["value"],
system_prompt=(
"Call set_output with key='value' and value='match'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='value' and value='match'. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -149,8 +199,7 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"Call set_output with key='result' and value='HIGH'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='result' and value='HIGH'. " + SET_OUTPUT_INSTRUCTION
),
),
NodeSpec(
@@ -160,8 +209,7 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"Call set_output with key='result' and value='LOW'. "
+ SET_OUTPUT_INSTRUCTION
"Call set_output with key='result' and value='LOW'. " + SET_OUTPUT_INSTRUCTION
),
),
],
@@ -185,8 +233,34 @@ async def test_edge_priority_selects_higher(runtime, goal, llm_provider):
],
memory_keys=["value", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 3},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, path=['source','high']",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["source", "high"],
actual=str(result.path),
expected_val="['source', 'high']",
)
assert result.path == ["source", "high"]
@@ -16,7 +16,7 @@ from .conftest import make_executor
@pytest.mark.asyncio
async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp_path):
async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp_path, artifact):
"""Worker LLM should call the escalate tool when instructed.
After calling escalate, the worker blocks waiting for queen input.
@@ -40,8 +40,9 @@ async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"You MUST immediately call the escalate tool with "
"reason='need human approval for deployment'. "
"You MUST immediately call the escalate tool "
"with reason='need human approval for "
"deployment'. "
"Do not call set_output. Do not write text."
),
),
@@ -74,17 +75,34 @@ async def test_escalation_worker_calls_escalate(runtime, goal, llm_provider, tmp
# Worker will block after escalate. Short timeout is fine.
try:
await _asyncio.wait_for(
executor.execute(graph, goal, {}, validate_graph=False),
executor.execute(
graph,
goal,
{},
validate_graph=False,
),
timeout=30,
)
except (TimeoutError, _asyncio.TimeoutError):
pass # Expected: worker hangs waiting for queen
artifact.record_value(
"escalation_count",
len(escalations),
expected=">=1 ESCALATION_REQUESTED event emitted",
)
artifact.check(
"escalation event emitted",
len(escalations) >= 1,
actual=str(len(escalations)),
expected_val=">=1",
)
assert len(escalations) >= 1, "No ESCALATION_REQUESTED event emitted"
@pytest.mark.asyncio
async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path):
async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path, artifact):
"""Worker that escalates should still terminate (not hang forever)."""
graph = GraphSpec(
id="escalate-terminate",
@@ -100,8 +118,10 @@ async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path)
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"Call the escalate tool with reason='blocked on credentials'. "
"Then call set_output with key='result' and value='escalated'."
"Call the escalate tool with "
"reason='blocked on credentials'. "
"Then call set_output with key='result' "
"and value='escalated'."
),
),
],
@@ -115,6 +135,21 @@ async def test_escalation_node_terminates(runtime, goal, llm_provider, tmp_path)
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
)
# Should terminate within timeout (make_executor wraps with asyncio.wait_for)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="steps_executed=1 (terminates, does not hang)",
)
artifact.check(
"steps_executed is 1",
result.steps_executed == 1,
actual=str(result.steps_executed),
expected_val="1",
)
assert result.steps_executed == 1
@@ -1,4 +1,4 @@
"""Component tests: EventLoopNode — iteration limits, output accumulation, stall safety.
"""Component tests: EventLoopNode — iteration limits, output, stall safety.
Exercises the core multi-turn LLM loop through single-node graphs with
real LLM calls to verify iteration control and termination behavior.
@@ -15,8 +15,8 @@ from .conftest import make_executor
@pytest.mark.asyncio
async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider):
"""LLM calls set_output on first turn — node should terminate with output."""
async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider, artifact):
"""LLM calls set_output on first turn — node terminates with output."""
graph = GraphSpec(
id="single-turn",
goal_id="dummy",
@@ -31,7 +31,8 @@ async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"Call set_output with key='result' and value='done'. "
"Call set_output with key='result' and "
"value='done'. "
"Do not write any text. Just call the tool."
),
),
@@ -40,19 +41,51 @@ async def test_event_loop_single_turn_set_output(runtime, goal, llm_provider):
memory_keys=["result"],
conversation_mode="continuous",
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 3},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, output['result'] set, steps=1",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
artifact.check(
"steps_executed is 1",
result.steps_executed == 1,
actual=str(result.steps_executed),
expected_val="1",
)
assert result.steps_executed == 1
@pytest.mark.asyncio
async def test_event_loop_multi_turn_tool_use(
runtime, goal, llm_provider, tool_registry
):
"""LLM calls a tool, gets result, then calls set_output — multi-turn flow."""
async def test_event_loop_multi_turn_tool_use(runtime, goal, llm_provider, tool_registry, artifact):
"""LLM calls a tool, gets result, then calls set_output."""
graph = GraphSpec(
id="multi-turn",
goal_id="dummy",
@@ -68,9 +101,10 @@ async def test_event_loop_multi_turn_tool_use(
output_keys=["result"],
tools=["get_current_time"],
system_prompt=(
"First call get_current_time with timezone='UTC'. "
"Then call set_output with key='result' and the day_of_week "
"from the tool response."
"First call get_current_time with "
"timezone='UTC'. "
"Then call set_output with key='result' and "
"the day_of_week from the tool response."
),
),
],
@@ -79,18 +113,42 @@ async def test_event_loop_multi_turn_tool_use(
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, output['result'] is day_of_week",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
@pytest.mark.asyncio
async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider):
async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider, artifact):
"""Node must terminate after max_iterations even without set_output."""
graph = GraphSpec(
id="stuck-node",
@@ -106,8 +164,7 @@ async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider):
node_type="event_loop",
output_keys=["result"],
system_prompt=(
"You are thinking deeply. Respond with a short thought. "
"Never call set_output."
"You are thinking deeply. Respond with a short thought. Never call set_output."
),
max_tokens=32,
),
@@ -116,15 +173,34 @@ async def test_event_loop_max_iterations_respected(runtime, goal, llm_provider):
memory_keys=["result"],
conversation_mode="continuous",
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 3},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected="terminates (not hang), steps_executed=1",
)
# Should terminate (not hang) — the node was visited
artifact.check(
"steps_executed is 1",
result.steps_executed == 1,
actual=str(result.steps_executed),
expected_val="1",
)
assert result.steps_executed == 1
@pytest.mark.asyncio
async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider):
async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider, artifact):
"""LLM should be able to set multiple output keys in a single node."""
graph = GraphSpec(
id="multi-output",
@@ -142,7 +218,8 @@ async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider):
system_prompt=(
"Call set_output twice: "
"first with key='name' and value='Alice', "
"then with key='greeting' and value='Hello Alice'. "
"then with key='greeting' and "
"value='Hello Alice'. "
"Do not write any text."
),
),
@@ -151,9 +228,44 @@ async def test_event_loop_multiple_output_keys(runtime, goal, llm_provider):
memory_keys=["name", "greeting"],
conversation_mode="continuous",
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(graph, goal, {}, validate_graph=False)
executor = make_executor(
runtime,
llm_provider,
loop_config={"max_iterations": 5},
)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, output['name'] and output['greeting'] are set"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_name = result.output.get("name")
artifact.check(
"output['name'] is set",
actual_name is not None,
actual=repr(actual_name),
expected_val="non-None value",
)
assert result.output.get("name") is not None
actual_greeting = result.output.get("greeting")
artifact.check(
"output['greeting'] is set",
actual_greeting is not None,
actual=repr(actual_greeting),
expected_val="non-None value",
)
assert result.output.get("greeting") is not None
+123 -9
@@ -15,18 +15,39 @@ from framework.llm.stream_events import FinishEvent, TextDeltaEvent, ToolCallEve
@pytest.mark.asyncio
async def test_llm_acomplete_returns_content(llm_provider):
async def test_llm_acomplete_returns_content(llm_provider, artifact):
"""acomplete() should return a non-empty LLMResponse."""
result = await llm_provider.acomplete(
messages=[{"role": "user", "content": "Reply with exactly: OK"}],
max_tokens=16,
)
artifact.record_value(
"result_type",
type(result).__name__,
expected="LLMResponse with non-empty content",
)
artifact.record_value("content", result.content)
artifact.check(
"result is LLMResponse",
isinstance(result, LLMResponse),
actual=type(result).__name__,
expected_val="LLMResponse",
)
assert isinstance(result, LLMResponse)
content_ok = result.content and result.content.strip()
artifact.check(
"content is non-empty",
bool(content_ok),
actual=repr(result.content),
expected_val="non-empty string",
)
assert result.content and result.content.strip()
@pytest.mark.asyncio
async def test_llm_stream_yields_text_delta(llm_provider):
async def test_llm_stream_yields_text_delta(llm_provider, artifact):
"""stream() should yield at least one TextDeltaEvent and a FinishEvent."""
text_deltas = []
finish_events = []
@@ -39,12 +60,32 @@ async def test_llm_stream_yields_text_delta(llm_provider):
elif isinstance(event, FinishEvent):
finish_events.append(event)
artifact.record_value(
"text_delta_count",
len(text_deltas),
expected=">=1 TextDeltaEvent and exactly 1 FinishEvent",
)
artifact.record_value("finish_event_count", len(finish_events))
artifact.check(
"at least one TextDeltaEvent",
len(text_deltas) >= 1,
actual=str(len(text_deltas)),
expected_val=">=1",
)
assert len(text_deltas) >= 1, "Expected at least one TextDeltaEvent"
artifact.check(
"exactly one FinishEvent",
len(finish_events) == 1,
actual=str(len(finish_events)),
expected_val="1",
)
assert len(finish_events) == 1, "Expected exactly one FinishEvent"
@pytest.mark.asyncio
async def test_llm_stream_tool_call(llm_provider):
async def test_llm_stream_tool_call(llm_provider, artifact):
"""stream() with a tool definition should produce a ToolCallEvent."""
tool = Tool(
name="record_result",
@@ -52,7 +93,10 @@ async def test_llm_stream_tool_call(llm_provider):
parameters={
"type": "object",
"properties": {
"value": {"type": "string", "description": "The result to record."},
"value": {
"type": "string",
"description": "The result to record.",
},
},
"required": ["value"],
},
@@ -63,7 +107,8 @@ async def test_llm_stream_tool_call(llm_provider):
{
"role": "user",
"content": (
"Call the record_result tool exactly once with value='OK'. "
"Call the record_result tool exactly once "
"with value='OK'. "
"Do not answer with plain text."
),
}
@@ -74,30 +119,79 @@ async def test_llm_stream_tool_call(llm_provider):
events.append(event)
tool_calls = [e for e in events if isinstance(e, ToolCallEvent)]
artifact.record_value(
"tool_call_count",
len(tool_calls),
expected=">=1 ToolCallEvent, tool_name='record_result'",
)
artifact.record_value(
"tool_names",
[tc.tool_name for tc in tool_calls],
)
artifact.check(
"LLM called record_result",
len(tool_calls) >= 1,
actual=str(len(tool_calls)),
expected_val=">=1",
)
assert len(tool_calls) >= 1, "LLM should have called record_result"
artifact.check(
"tool_name is record_result",
tool_calls[0].tool_name == "record_result",
actual=tool_calls[0].tool_name,
expected_val="record_result",
)
assert tool_calls[0].tool_name == "record_result"
@pytest.mark.asyncio
async def test_llm_token_counts_populated(llm_provider):
async def test_llm_token_counts_populated(llm_provider, artifact):
"""LLMResponse should have positive input_tokens and output_tokens."""
result = await llm_provider.acomplete(
messages=[{"role": "user", "content": "Reply OK."}],
max_tokens=16,
)
artifact.record_value(
"input_tokens",
result.input_tokens,
expected="positive input_tokens and output_tokens",
)
artifact.record_value("output_tokens", result.output_tokens)
artifact.check(
"input_tokens positive",
result.input_tokens > 0,
actual=str(result.input_tokens),
expected_val=">0",
)
assert result.input_tokens > 0, "input_tokens should be positive"
artifact.check(
"output_tokens positive",
result.output_tokens > 0,
actual=str(result.output_tokens),
expected_val=">0",
)
assert result.output_tokens > 0, "output_tokens should be positive"
@pytest.mark.asyncio
async def test_llm_json_mode(llm_provider):
"""acomplete(json_mode=True) should return parseable JSON when supported."""
async def test_llm_json_mode(llm_provider, artifact):
"""acomplete(json_mode=True) should return parseable JSON."""
try:
result = await llm_provider.acomplete(
messages=[
{
"role": "user",
"content": 'Return a JSON object with key "status" and value "ok". Output only valid JSON, no other text.',
"content": (
'Return a JSON object with key "status" '
'and value "ok". Output only valid JSON, '
"no other text."
),
}
],
max_tokens=64,
@@ -110,6 +204,26 @@ async def test_llm_json_mode(llm_provider):
if not content:
pytest.skip("Provider returned empty content for json_mode request")
artifact.record_value(
"content",
content,
expected="parseable JSON dict with 'status' key",
)
parsed = json.loads(content)
artifact.check(
"parsed is dict",
isinstance(parsed, dict),
actual=type(parsed).__name__,
expected_val="dict",
)
assert isinstance(parsed, dict)
artifact.check(
"'status' key present",
"status" in parsed,
actual=str(list(parsed.keys())),
expected_val="contains 'status'",
)
assert "status" in parsed
+163 -8
@@ -16,15 +16,24 @@ def _make_tools(*names: str) -> list[Tool]:
return [Tool(name=n, description=f"Tool {n}", parameters={}) for n in names]
def test_queen_phase_state_initial_phase():
def test_queen_phase_state_initial_phase(artifact):
"""QueenPhaseState should default to 'building' phase."""
from framework.tools.queen_lifecycle_tools import QueenPhaseState
state = QueenPhaseState()
artifact.record_value("phase", state.phase, expected="default phase == 'building'")
artifact.check(
"default phase is building",
state.phase == "building",
actual=repr(state.phase),
expected_val="'building'",
)
assert state.phase == "building"
def test_queen_phase_state_planning_tools():
def test_queen_phase_state_planning_tools(artifact):
"""Planning phase should return planning_tools."""
from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -34,11 +43,31 @@ def test_queen_phase_state_planning_tools():
tools = state.get_current_tools()
tool_names = {t.name for t in tools}
artifact.record_value(
"tool_names",
sorted(tool_names),
expected="planning tools include list_agent_tools, exclude edit_file",
)
artifact.check(
"list_agent_tools in tools",
"list_agent_tools" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'list_agent_tools'",
)
assert "list_agent_tools" in tool_names
artifact.check(
"edit_file not in tools",
"edit_file" not in tool_names,
actual=str(sorted(tool_names)),
expected_val="does not contain 'edit_file'",
)
assert "edit_file" not in tool_names
def test_queen_phase_state_building_tools():
def test_queen_phase_state_building_tools(artifact):
"""Building phase should return building_tools."""
from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -48,11 +77,31 @@ def test_queen_phase_state_building_tools():
tools = state.get_current_tools()
tool_names = {t.name for t in tools}
artifact.record_value(
"tool_names",
sorted(tool_names),
expected="building tools include edit_file, exclude list_agent_tools",
)
artifact.check(
"edit_file in tools",
"edit_file" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'edit_file'",
)
assert "edit_file" in tool_names
artifact.check(
"list_agent_tools not in tools",
"list_agent_tools" not in tool_names,
actual=str(sorted(tool_names)),
expected_val="does not contain 'list_agent_tools'",
)
assert "list_agent_tools" not in tool_names
def test_queen_phase_state_tool_switching():
def test_queen_phase_state_tool_switching(artifact):
"""Switching phase should change which tools are returned."""
from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -62,33 +111,96 @@ def test_queen_phase_state_tool_switching():
state.staging_tools = _make_tools("c")
state.running_tools = _make_tools("d")
planning_tool = state.get_current_tools()[0].name
artifact.check(
"planning returns tool 'a'",
planning_tool == "a",
actual=repr(planning_tool),
expected_val="'a'",
)
assert state.get_current_tools()[0].name == "a"
state.phase = "building"
building_tool = state.get_current_tools()[0].name
artifact.check(
"building returns tool 'b'",
building_tool == "b",
actual=repr(building_tool),
expected_val="'b'",
)
assert state.get_current_tools()[0].name == "b"
state.phase = "staging"
staging_tool = state.get_current_tools()[0].name
artifact.check(
"staging returns tool 'c'",
staging_tool == "c",
actual=repr(staging_tool),
expected_val="'c'",
)
assert state.get_current_tools()[0].name == "c"
state.phase = "running"
running_tool = state.get_current_tools()[0].name
artifact.check(
"running returns tool 'd'",
running_tool == "d",
actual=repr(running_tool),
expected_val="'d'",
)
assert state.get_current_tools()[0].name == "d"
artifact.record_value(
"tool_per_phase",
{"planning": "a", "building": "b", "staging": "c", "running": "d"},
expected="each phase returns its own tool",
)
def test_queen_initial_phase_no_worker():
def test_queen_initial_phase_no_worker(artifact):
"""Without a worker identity, queen should start in 'planning'."""
# This tests the logic in queen_orchestrator.py line 106:
# initial_phase = "staging" if worker_identity else "planning"
worker_identity = None
initial_phase = "staging" if worker_identity else "planning"
artifact.record_value(
"initial_phase",
initial_phase,
expected="'planning' when worker_identity is None",
)
artifact.check(
"initial phase is planning",
initial_phase == "planning",
actual=repr(initial_phase),
expected_val="'planning'",
)
assert initial_phase == "planning"
def test_queen_initial_phase_with_worker():
def test_queen_initial_phase_with_worker(artifact):
"""With a worker identity, queen should start in 'staging'."""
worker_identity = "my_agent"
initial_phase = "staging" if worker_identity else "planning"
artifact.record_value(
"initial_phase",
initial_phase,
expected="'staging' when worker_identity is set",
)
artifact.check(
"initial phase is staging",
initial_phase == "staging",
actual=repr(initial_phase),
expected_val="'staging'",
)
assert initial_phase == "staging"
@pytest.mark.asyncio
async def test_queen_phase_switch_emits_event():
async def test_queen_phase_switch_emits_event(artifact):
"""Phase transition should emit QUEEN_PHASE_CHANGED event."""
from framework.runtime.event_bus import EventBus, EventType
from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -110,12 +222,36 @@ async def test_queen_phase_switch_emits_event():
await state.switch_to_building(source="tool")
artifact.record_value("phase", state.phase, expected="'building'")
artifact.record_value("event_count", len(phase_events))
artifact.check(
"phase is building",
state.phase == "building",
actual=repr(state.phase),
expected_val="'building'",
)
assert state.phase == "building"
artifact.check(
"at least 1 phase event",
len(phase_events) >= 1,
actual=str(len(phase_events)),
expected_val=">=1",
)
assert len(phase_events) >= 1
event_phase = phase_events[0].data.get("phase")
artifact.check(
"event reports building",
event_phase == "building",
actual=repr(event_phase),
expected_val="'building'",
)
assert phase_events[0].data.get("phase") == "building"
def test_queen_draft_graph_persists_across_turns():
def test_queen_draft_graph_persists_across_turns(artifact):
"""Draft graph stored on phase_state should survive phase changes."""
from framework.tools.queen_lifecycle_tools import QueenPhaseState
@@ -126,5 +262,24 @@ def test_queen_draft_graph_persists_across_turns():
state.phase = "building"
# Draft should still be available
artifact.record_value(
"draft_graph",
state.draft_graph,
expected="draft_graph survives phase change, nodes=['a','b']",
)
artifact.check(
"draft_graph is not None",
state.draft_graph is not None,
actual=repr(state.draft_graph),
expected_val="non-None",
)
assert state.draft_graph is not None
artifact.check(
"draft has 2 nodes",
len(state.draft_graph["nodes"]) == 2,
actual=str(len(state.draft_graph["nodes"])),
expected_val="2",
)
assert len(state.draft_graph["nodes"]) == 2
@@ -0,0 +1,772 @@
"""Component tests: Queen Live Phase Switching — real LLM, real event bus.
Starts the actual queen via create_queen() with a real LLM provider and
verifies phase transitions, dynamic tool switching, prompt switching, and
event emission through the full queen lifecycle.
"""
from __future__ import annotations
import asyncio
import time
from dataclasses import dataclass, field
from pathlib import Path
from unittest.mock import MagicMock
import pytest
from framework.runtime.event_bus import AgentEvent, EventBus, EventType
from framework.server.session_manager import Session
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
QUEEN_STARTUP_TIMEOUT = 30 # seconds to wait for queen to initialize
QUEEN_RESPONSE_TIMEOUT = 60 # seconds to wait for queen to respond to a message
@dataclass
class PhaseCapture:
"""Captures QUEEN_PHASE_CHANGED events."""
phases: list[str] = field(default_factory=list)
events: list[AgentEvent] = field(default_factory=list)
_waiters: list[tuple[str, asyncio.Event]] = field(default_factory=list)
async def on_event(self, event: AgentEvent) -> None:
phase = event.data.get("phase", "")
self.phases.append(phase)
self.events.append(event)
# Wake any waiters for this phase
for target_phase, evt in self._waiters:
if phase == target_phase:
evt.set()
async def wait_for_phase(self, phase: str, timeout: float = 30) -> bool:
"""Wait until a specific phase change is observed."""
if phase in self.phases:
return True
evt = asyncio.Event()
self._waiters.append((phase, evt))
try:
await asyncio.wait_for(evt.wait(), timeout=timeout)
return True
except (TimeoutError, asyncio.TimeoutError):
return False
@dataclass
class TextCapture:
"""Captures LLM text deltas to verify queen is responding."""
chunks: list[str] = field(default_factory=list)
_has_text: asyncio.Event = field(default_factory=asyncio.Event)
async def on_event(self, event: AgentEvent) -> None:
text = event.data.get("content", "")
if text:
self.chunks.append(text)
self._has_text.set()
async def wait_for_text(self, timeout: float = 30) -> bool:
try:
await asyncio.wait_for(self._has_text.wait(), timeout=timeout)
return True
except (TimeoutError, asyncio.TimeoutError):
return False
@property
def full_text(self) -> str:
return "".join(self.chunks)
def _make_mock_session_manager() -> MagicMock:
"""Create a minimal mock SessionManager that satisfies create_queen()."""
mgr = MagicMock()
# _subscribe_worker_handoffs needs to exist but can be a no-op for tests
mgr._subscribe_worker_handoffs = MagicMock()
return mgr
async def _start_queen(
llm_provider,
tmp_path: Path,
*,
worker_identity: str | None = None,
initial_prompt: str | None = None,
) -> tuple[Session, asyncio.Task]:
"""Start a real queen and return (session, task)."""
from framework.server.queen_orchestrator import create_queen
event_bus = EventBus()
session = Session(
id=f"test_{int(time.time())}",
event_bus=event_bus,
llm=llm_provider,
loaded_at=time.time(),
)
queen_dir = tmp_path / "queen"
queen_dir.mkdir(parents=True, exist_ok=True)
mgr = _make_mock_session_manager()
task = await create_queen(
session=session,
session_manager=mgr,
worker_identity=worker_identity,
queen_dir=queen_dir,
initial_prompt=initial_prompt,
)
# Wait for queen to initialize (queen_executor is set inside the task)
for _ in range(QUEEN_STARTUP_TIMEOUT * 10):
if session.queen_executor is not None:
break
await asyncio.sleep(0.1)
assert session.queen_executor is not None, "Queen executor did not initialize"
assert session.phase_state is not None, "Phase state not set"
return session, task
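# The startup poll above can also be phrased as a reusable helper. This is an
# illustrative sketch with the same semantics (not wired into the tests below):
async def _wait_until(predicate, timeout: float, interval: float = 0.1) -> bool:
    """Poll `predicate` until it is truthy or `timeout` seconds elapse."""

    async def _poll() -> None:
        while not predicate():
            await asyncio.sleep(interval)

    try:
        await asyncio.wait_for(_poll(), timeout=timeout)
        return True
    except (TimeoutError, asyncio.TimeoutError):
        return False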
async def _shutdown_queen(session: Session, task: asyncio.Task) -> None:
"""Cleanly shut down the queen."""
# Signal the event loop node to stop
node = session.queen_executor.node_registry.get("queen") if session.queen_executor else None
if node and hasattr(node, "signal_shutdown"):
node.signal_shutdown()
# Cancel the task as backup
if not task.done():
task.cancel()
try:
await asyncio.wait_for(task, timeout=5)
except (asyncio.CancelledError, TimeoutError, asyncio.TimeoutError):
pass
# ---------------------------------------------------------------------------
# Tests: Initial Phase
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_starts_in_planning_without_worker(llm_provider, tmp_path, artifact):
"""Queen with no worker_identity must start in 'planning' phase."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
actual_phase = session.phase_state.phase
artifact.record_value(
"phase", actual_phase, expected="phase == 'planning' when no worker_identity"
)
artifact.check(
"phase is planning",
actual_phase == "planning",
actual=repr(actual_phase),
expected_val="'planning'",
)
assert session.phase_state.phase == "planning", (
f"Expected planning, got {session.phase_state.phase}"
)
finally:
await _shutdown_queen(session, task)
@pytest.mark.asyncio
async def test_queen_starts_in_staging_with_worker(llm_provider, tmp_path, artifact):
"""Queen with worker_identity must start in 'staging' phase."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity="test_agent",
initial_prompt="Hello",
)
try:
actual_phase = session.phase_state.phase
artifact.record_value(
"phase", actual_phase, expected="phase == 'staging' when worker_identity is set"
)
artifact.check(
"phase is staging",
actual_phase == "staging",
actual=repr(actual_phase),
expected_val="'staging'",
)
assert session.phase_state.phase == "staging", (
f"Expected staging, got {session.phase_state.phase}"
)
finally:
await _shutdown_queen(session, task)
# ---------------------------------------------------------------------------
# Tests: Tool Availability Per Phase
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_planning_tools_available(llm_provider, tmp_path, artifact):
"""In planning phase, planning tools must be returned by get_current_tools()."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
ps = session.phase_state
artifact.record_value(
"phase",
ps.phase,
expected="phase='planning', tools include list_agent_tools, exclude edit_file",
)
artifact.check(
"phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
tool_names = {t.name for t in ps.get_current_tools()}
artifact.record_value("tool_names", sorted(tool_names))
# Planning phase must have agent discovery tools
artifact.check(
"list_agent_tools in tools",
"list_agent_tools" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'list_agent_tools'",
)
assert "list_agent_tools" in tool_names, (
f"list_agent_tools missing from planning tools: {tool_names}"
)
# Planning phase must NOT have building-only tools
artifact.check(
"edit_file not in tools",
"edit_file" not in tool_names,
actual=str(sorted(tool_names)),
expected_val="does not contain 'edit_file'",
)
assert "edit_file" not in tool_names, (
f"edit_file should not be in planning tools: {tool_names}"
)
finally:
await _shutdown_queen(session, task)
@pytest.mark.asyncio
async def test_queen_tools_change_on_phase_switch(llm_provider, tmp_path, artifact):
"""Switching phase must change the tools returned by get_current_tools()."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
ps = session.phase_state
planning_tools = {t.name for t in ps.get_current_tools()}
artifact.record_value(
"planning_tools",
sorted(planning_tools),
expected="planning, building, and staging tool sets all differ",
)
# Switch to building
await ps.switch_to_building(source="test")
building_tools = {t.name for t in ps.get_current_tools()}
artifact.record_value("building_tools", sorted(building_tools))
artifact.check(
"planning != building tools",
planning_tools != building_tools,
actual=f"planning={sorted(planning_tools)}, building={sorted(building_tools)}",
expected_val="different sets",
)
assert planning_tools != building_tools, "Planning and building tools must differ"
# Switch to staging
await ps.switch_to_staging(source="test")
staging_tools = {t.name for t in ps.get_current_tools()}
artifact.record_value("staging_tools", sorted(staging_tools))
artifact.check(
"staging != building tools",
staging_tools != building_tools,
actual=f"staging={sorted(staging_tools)}, building={sorted(building_tools)}",
expected_val="different sets",
)
assert staging_tools != building_tools, "Building and staging tools must differ"
finally:
await _shutdown_queen(session, task)
# ---------------------------------------------------------------------------
# Tests: Prompt Switching
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_prompt_changes_on_phase_switch(llm_provider, tmp_path, artifact):
"""Switching phase must change the system prompt returned by get_current_prompt()."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
ps = session.phase_state
planning_prompt = ps.get_current_prompt()
artifact.record_value(
"planning_prompt_len",
len(planning_prompt),
expected="non-empty planning and building prompts that differ",
)
artifact.check(
"planning prompt non-empty",
len(planning_prompt) > 0,
actual=str(len(planning_prompt)),
expected_val=">0",
)
assert len(planning_prompt) > 0, "Planning prompt should not be empty"
await ps.switch_to_building(source="test")
building_prompt = ps.get_current_prompt()
artifact.record_value("building_prompt_len", len(building_prompt))
artifact.check(
"building prompt non-empty",
len(building_prompt) > 0,
actual=str(len(building_prompt)),
expected_val=">0",
)
assert len(building_prompt) > 0, "Building prompt should not be empty"
artifact.check(
"prompts differ",
planning_prompt != building_prompt,
actual=f"planning_len={len(planning_prompt)}, building_len={len(building_prompt)}",
expected_val="different prompts",
)
assert planning_prompt != building_prompt, "Planning and building prompts must differ"
finally:
await _shutdown_queen(session, task)
# ---------------------------------------------------------------------------
# Tests: Phase Change Events
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_emits_phase_change_events(llm_provider, tmp_path, artifact):
"""Each phase switch must emit a QUEEN_PHASE_CHANGED event."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
capture = PhaseCapture()
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=capture.on_event,
)
try:
ps = session.phase_state
# planning -> building
await ps.switch_to_building(source="test")
assert await capture.wait_for_phase("building", timeout=5)
# building -> staging
await ps.switch_to_staging(source="test")
assert await capture.wait_for_phase("staging", timeout=5)
# staging -> running
await ps.switch_to_running(source="test")
assert await capture.wait_for_phase("running", timeout=5)
# running -> planning
await ps.switch_to_planning(source="test")
assert await capture.wait_for_phase("planning", timeout=5)
artifact.record_value(
"phases", capture.phases, expected="['building', 'staging', 'running', 'planning']"
)
artifact.check(
"phase sequence matches",
capture.phases == ["building", "staging", "running", "planning"],
actual=str(capture.phases),
expected_val="['building', 'staging', 'running', 'planning']",
)
assert capture.phases == ["building", "staging", "running", "planning"], (
f"Phase sequence was: {capture.phases}"
)
finally:
await _shutdown_queen(session, task)
@pytest.mark.asyncio
async def test_queen_no_duplicate_phase_event_on_same_phase(llm_provider, tmp_path, artifact):
"""Switching to the same phase should NOT emit a duplicate event."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
capture = PhaseCapture()
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=capture.on_event,
)
try:
ps = session.phase_state
artifact.check(
"initial phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
# Switch to building twice
await ps.switch_to_building(source="test")
await asyncio.sleep(0.2)
await ps.switch_to_building(source="test") # no-op
await asyncio.sleep(0.2)
# Should only have one "building" event
building_events = [p for p in capture.phases if p == "building"]
artifact.record_value(
"building_event_count",
len(building_events),
expected="exactly 1 building event (no duplicate)",
)
artifact.record_value("all_phases", capture.phases)
artifact.check(
"only 1 building event",
len(building_events) == 1,
actual=str(len(building_events)),
expected_val="1",
)
assert len(building_events) == 1, (
f"Expected 1 building event, got {len(building_events)}: {capture.phases}"
)
finally:
await _shutdown_queen(session, task)
# ---------------------------------------------------------------------------
# Tests: Queen Responds in Correct Phase
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_responds_to_message(llm_provider, tmp_path, artifact):
"""Queen must produce an LLM turn when started with an initial prompt."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello, I want to build an agent.",
)
turn_complete = asyncio.Event()
async def _on_turn(event: AgentEvent) -> None:
turn_complete.set()
session.event_bus.subscribe(
event_types=[EventType.LLM_TURN_COMPLETE],
handler=_on_turn,
filter_stream="queen",
)
try:
# Queen should complete at least one LLM turn (text or tool call)
got_turn = False
try:
await asyncio.wait_for(turn_complete.wait(), timeout=QUEEN_RESPONSE_TIMEOUT)
got_turn = True
except (TimeoutError, asyncio.TimeoutError):
pass
artifact.record_value(
"got_turn", got_turn, expected="queen completes at least one LLM turn"
)
artifact.check(
"queen completed LLM turn", got_turn, actual=str(got_turn), expected_val="True"
)
assert got_turn, "Queen did not complete any LLM turn"
finally:
await _shutdown_queen(session, task)
@pytest.mark.asyncio
async def test_queen_responds_after_injected_message(llm_provider, tmp_path, artifact):
"""Injecting a user message must trigger a new queen LLM turn."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
# Wait for initial response to settle
first_turn = asyncio.Event()
async def _on_first_turn(event: AgentEvent) -> None:
first_turn.set()
sub_id = session.event_bus.subscribe(
event_types=[EventType.LLM_TURN_COMPLETE],
handler=_on_first_turn,
filter_stream="queen",
)
try:
await asyncio.wait_for(first_turn.wait(), timeout=QUEEN_RESPONSE_TIMEOUT)
except (TimeoutError, asyncio.TimeoutError):
pass
session.event_bus.unsubscribe(sub_id)
# Now inject a follow-up and listen for a new turn
second_turn = asyncio.Event()
async def _on_second_turn(event: AgentEvent) -> None:
second_turn.set()
session.event_bus.subscribe(
event_types=[EventType.LLM_TURN_COMPLETE],
handler=_on_second_turn,
filter_stream="queen",
)
node = session.queen_executor.node_registry.get("queen")
assert node is not None
await node.inject_event(
"What tools do you have available?",
is_client_input=True,
)
got_turn = False
try:
await asyncio.wait_for(second_turn.wait(), timeout=QUEEN_RESPONSE_TIMEOUT)
got_turn = True
except (TimeoutError, asyncio.TimeoutError):
pass
artifact.record_value(
"got_second_turn", got_turn, expected="queen responds to injected message"
)
artifact.check(
"queen responded to injected message",
got_turn,
actual=str(got_turn),
expected_val="True",
)
assert got_turn, "Queen did not respond to injected message"
finally:
await _shutdown_queen(session, task)
# ---------------------------------------------------------------------------
# Tests: Phase Transition Cycle
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_queen_full_phase_cycle_with_events(llm_provider, tmp_path, artifact):
"""Walk through all 4 phases and verify state + events at each step."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
capture = PhaseCapture()
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=capture.on_event,
)
try:
ps = session.phase_state
# Start: planning
artifact.check(
"initial phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
planning_tools = {t.name for t in ps.get_current_tools()}
# -> building
await ps.switch_to_building(source="test")
artifact.check(
"phase is building",
ps.phase == "building",
actual=repr(ps.phase),
expected_val="'building'",
)
assert ps.phase == "building"
building_tools = {t.name for t in ps.get_current_tools()}
artifact.check(
"building tools differ from planning",
building_tools != planning_tools,
actual=f"building={sorted(building_tools)}",
expected_val="different from planning",
)
assert building_tools != planning_tools
# -> staging
await ps.switch_to_staging(source="test")
artifact.check(
"phase is staging",
ps.phase == "staging",
actual=repr(ps.phase),
expected_val="'staging'",
)
assert ps.phase == "staging"
staging_tools = {t.name for t in ps.get_current_tools()}
# -> running
await ps.switch_to_running(source="test")
artifact.check(
"phase is running",
ps.phase == "running",
actual=repr(ps.phase),
expected_val="'running'",
)
assert ps.phase == "running"
running_tools = {t.name for t in ps.get_current_tools()}
# -> back to planning
await ps.switch_to_planning(source="test")
artifact.check(
"phase is planning again",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
final_tools = {t.name for t in ps.get_current_tools()}
artifact.check(
"final tools match original planning set",
final_tools == planning_tools,
actual=f"final={sorted(final_tools)}",
expected_val=f"planning={sorted(planning_tools)}",
)
assert final_tools == planning_tools, "Tools should match original planning set"
# Verify events
await asyncio.sleep(0.3)
artifact.record_value(
"phase_events",
capture.phases,
expected="['building', 'staging', 'running', 'planning']",
)
artifact.check(
"phase event sequence",
capture.phases == ["building", "staging", "running", "planning"],
actual=str(capture.phases),
expected_val="['building', 'staging', 'running', 'planning']",
)
assert capture.phases == ["building", "staging", "running", "planning"]
# Verify all 4 phase tool sets are distinct
all_sets = [planning_tools, building_tools, staging_tools, running_tools]
for i, a in enumerate(all_sets):
for j, b in enumerate(all_sets):
if i != j:
phase_names = ["planning", "building", "staging", "running"]
artifact.check(
f"{phase_names[i]} != {phase_names[j]} tools",
a != b,
actual=f"{phase_names[i]}={sorted(a)}, {phase_names[j]}={sorted(b)}",
expected_val="different",
)
assert a != b, f"Phase tool sets {i} and {j} should differ but are identical"
finally:
await _shutdown_queen(session, task)
@pytest.mark.asyncio
async def test_queen_phase_state_persists_draft(llm_provider, tmp_path, artifact):
"""Draft graph on phase_state must survive phase transitions."""
session, task = await _start_queen(
llm_provider,
tmp_path,
worker_identity=None,
initial_prompt="Hello",
)
try:
ps = session.phase_state
ps.draft_graph = {"nodes": ["a", "b"], "edges": ["a->b"]}
await ps.switch_to_building(source="test")
artifact.check(
"draft survives building switch",
ps.draft_graph is not None,
actual=repr(ps.draft_graph),
expected_val="non-None",
)
assert ps.draft_graph is not None
artifact.check(
"draft nodes intact after building",
ps.draft_graph["nodes"] == ["a", "b"],
actual=str(ps.draft_graph["nodes"]),
expected_val="['a', 'b']",
)
assert ps.draft_graph["nodes"] == ["a", "b"]
await ps.switch_to_staging(source="test")
artifact.check(
"draft survives staging switch",
ps.draft_graph is not None,
actual=repr(ps.draft_graph),
expected_val="non-None",
)
assert ps.draft_graph is not None
await ps.switch_to_running(source="test")
artifact.check(
"draft survives running switch",
ps.draft_graph is not None,
actual=repr(ps.draft_graph),
expected_val="non-None",
)
assert ps.draft_graph is not None
artifact.record_value(
"final_draft_graph",
ps.draft_graph,
expected="draft_graph survives all phase transitions",
)
finally:
await _shutdown_queen(session, task)
@@ -0,0 +1,678 @@
"""Component tests: Queen State Machine Edge Cases.
Race conditions, invalid transitions, stale events.
These tests confirm real bugs and edge cases in the queen's phase
state machine:
- Non-atomic phase switch + event emission
- Stale worker completion events ignored during wrong phase
- No guards against invalid phase transitions
- Double phase switch deduplication
- inject_notification after executor teardown
- Empty tool lists per phase
- Phase persistence across rapid cycling
"""
from __future__ import annotations
import asyncio
import time
from pathlib import Path
from unittest.mock import MagicMock
import pytest
from framework.runtime.event_bus import AgentEvent, EventBus, EventType
from framework.server.session_manager import Session
from framework.tools.queen_lifecycle_tools import QueenPhaseState
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
QUEEN_STARTUP_TIMEOUT = 30
async def _start_queen_session(llm_provider, tmp_path, *, worker_identity=None):
"""Start a real queen and return (session, task)."""
from framework.server.queen_orchestrator import create_queen
event_bus = EventBus()
session = Session(
id=f"test_{int(time.time())}",
event_bus=event_bus,
llm=llm_provider,
loaded_at=time.time(),
)
queen_dir = tmp_path / "queen"
queen_dir.mkdir(parents=True, exist_ok=True)
mgr = MagicMock()
mgr._subscribe_worker_handoffs = MagicMock()
task = await create_queen(
session=session,
session_manager=mgr,
worker_identity=worker_identity,
queen_dir=queen_dir,
initial_prompt="Hello",
)
for _ in range(QUEEN_STARTUP_TIMEOUT * 10):
if session.queen_executor is not None:
break
await asyncio.sleep(0.1)
assert session.queen_executor is not None
return session, task
async def _shutdown(session, task):
node = session.queen_executor.node_registry.get("queen") if session.queen_executor else None
if node and hasattr(node, "signal_shutdown"):
node.signal_shutdown()
if not task.done():
task.cancel()
try:
await asyncio.wait_for(task, timeout=5)
except (asyncio.CancelledError, TimeoutError, asyncio.TimeoutError):
pass
# -----------------------------------------------------------------------
# BUG #1: Concurrent phase switches — no crash or lost events
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_concurrent_phase_switches_no_crash(llm_provider, tmp_path, artifact):
"""Firing multiple phase switches concurrently must not crash."""
session, task = await _start_queen_session(llm_provider, tmp_path)
phases_seen = []
async def _capture(event: AgentEvent):
phases_seen.append(event.data.get("phase"))
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
try:
ps = session.phase_state
# Fire 4 phase switches concurrently
await asyncio.gather(
ps.switch_to_building(source="test"),
ps.switch_to_staging(source="test"),
ps.switch_to_running(source="test"),
ps.switch_to_planning(source="test"),
)
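        # Note: asyncio.gather interleaves these four coroutines at await points
        # on a single event loop; "concurrent" here means task interleaving, not
        # parallel threads.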
await asyncio.sleep(0.3)
valid_phases = ("planning", "building", "staging", "running")
artifact.record_value(
"final_phase",
ps.phase,
expected="valid phase (not corrupted)",
)
artifact.record_value("phases_seen", phases_seen)
artifact.check(
"phase is valid",
ps.phase in valid_phases,
actual=repr(ps.phase),
expected_val="one of planning/building/staging/running",
)
assert ps.phase in valid_phases, f"Phase corrupted: {ps.phase}"
artifact.check(
"at least 1 phase event",
len(phases_seen) >= 1,
actual=str(len(phases_seen)),
expected_val=">=1",
)
assert len(phases_seen) >= 1, "No phase change events"
finally:
await _shutdown(session, task)
# -----------------------------------------------------------------------
# BUG #3: Non-atomic phase change + event
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_phase_changes_without_event_bus(artifact):
"""Phase must still change when event_bus is None (no crash)."""
ps = QueenPhaseState(phase="planning", event_bus=None)
await ps.switch_to_building(source="test")
artifact.record_value(
"phase",
ps.phase,
expected="'building' even without event bus",
)
artifact.check(
"phase changed to building",
ps.phase == "building",
actual=repr(ps.phase),
expected_val="'building'",
)
assert ps.phase == "building", "Phase should change even without event bus"
@pytest.mark.asyncio
async def test_phase_change_committed_before_event(artifact):
"""Phase assignment before event emission — verify both occur."""
bus = EventBus()
phases_at_event_time = []
async def _capture(event: AgentEvent):
phases_at_event_time.append(event.data.get("phase"))
bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
ps = QueenPhaseState(phase="planning", event_bus=bus)
await ps.switch_to_building(source="test")
await asyncio.sleep(0.1)
artifact.record_value(
"phase",
ps.phase,
expected="'building', event reports 'building'",
)
artifact.record_value(
"phases_at_event_time",
phases_at_event_time,
)
artifact.check(
"phase is building",
ps.phase == "building",
actual=repr(ps.phase),
expected_val="'building'",
)
assert ps.phase == "building"
artifact.check(
"event reports building",
phases_at_event_time == ["building"],
actual=str(phases_at_event_time),
expected_val="['building']",
)
assert phases_at_event_time == ["building"], (
f"Event should report 'building', got: {phases_at_event_time}"
)
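# Illustrative sketch only (an assumption, not the framework's implementation):
# a phase switch that commits state before emitting and dedupes same-phase
# switches, matching the ordering the two tests above pin down.
class _SketchPhaseState:
    def __init__(self, phase: str, event_bus: EventBus | None) -> None:
        self.phase = phase
        self.event_bus = event_bus

    async def switch_to(self, new_phase: str, *, source: str) -> None:
        if self.phase == new_phase:
            return  # same-phase switch is a no-op: no duplicate event
        self.phase = new_phase  # commit state first...
        if self.event_bus is not None:  # ...then emit, so handlers see new state
            await self.event_bus.publish(
                AgentEvent(
                    type=EventType.QUEEN_PHASE_CHANGED,
                    stream_id="queen",
                    data={"phase": new_phase, "source": source},
                )
            )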
# -----------------------------------------------------------------------
# BUG #4: Stale worker done events during non-running phase
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_worker_done_ignored_in_non_running_phase(llm_provider, tmp_path, artifact):
"""Worker completion in planning phase must be silently dropped.
This confirms BUG #4: the _on_worker_done handler only processes
events when phase == 'running'. Events in other phases are lost.
"""
session, task = await _start_queen_session(llm_provider, tmp_path)
phase_changes = []
async def _capture(event: AgentEvent):
phase_changes.append(event.data.get("phase"))
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
try:
ps = session.phase_state
artifact.check(
"initial phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
# Simulate a stale worker completion event
await session.event_bus.publish(
AgentEvent(
type=EventType.EXECUTION_COMPLETED,
stream_id="worker",
data={"output": {"result": "stale output"}},
)
)
await asyncio.sleep(0.5)
artifact.record_value(
"phase_after_stale_event",
ps.phase,
expected="still 'planning' (stale event ignored)",
)
artifact.record_value("phase_changes", phase_changes)
artifact.check(
"phase still planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning", f"Phase should still be planning, got: {ps.phase}"
artifact.check(
"no auto-switch to staging",
"staging" not in phase_changes,
actual=str(phase_changes),
expected_val="does not contain 'staging'",
)
assert "staging" not in phase_changes, (
"Should not auto-switch to staging from planning phase"
)
finally:
await _shutdown(session, task)
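# Sketch of the guard this test exercises (assumed shape; the real
# _on_worker_done handler is not shown in this file): worker completions only
# matter while the queen is in the running phase.
async def _sketch_on_worker_done(ps: QueenPhaseState, event: AgentEvent) -> None:
    if ps.phase != "running":
        return  # stale event from a non-running phase: silently dropped
    await ps.switch_to_staging(source="worker_done")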
# -----------------------------------------------------------------------
# BUG #10: No guards against invalid phase transitions
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_invalid_transition_planning_to_running(llm_provider, tmp_path, artifact):
"""planning -> running should succeed (no guard).
This confirms BUG #10: the state machine allows any transition.
"""
session, task = await _start_queen_session(llm_provider, tmp_path)
try:
ps = session.phase_state
artifact.check(
"initial phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning"
await ps.switch_to_running(source="test")
artifact.record_value(
"phase_after_invalid_transition",
ps.phase,
expected="'running' (no guard, transition allowed)",
)
artifact.check(
"phase is running",
ps.phase == "running",
actual=repr(ps.phase),
expected_val="'running'",
)
assert ps.phase == "running", "switch_to_running should succeed from planning"
finally:
await _shutdown(session, task)
@pytest.mark.asyncio
async def test_invalid_transition_running_to_building(llm_provider, tmp_path, artifact):
"""running -> building should succeed (no guard).
In production this could leave a running worker orphaned.
"""
session, task = await _start_queen_session(llm_provider, tmp_path)
try:
ps = session.phase_state
await ps.switch_to_running(source="test")
artifact.check(
"phase is running",
ps.phase == "running",
actual=repr(ps.phase),
expected_val="'running'",
)
assert ps.phase == "running"
await ps.switch_to_building(source="test")
artifact.record_value(
"phase_after_invalid_transition",
ps.phase,
expected="'building' (no guard)",
)
artifact.check(
"phase is building",
ps.phase == "building",
actual=repr(ps.phase),
expected_val="'building'",
)
assert ps.phase == "building"
finally:
await _shutdown(session, task)
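# One shape a transition guard could take if BUG #10 were fixed (an assumption;
# today any transition is allowed, as the two tests above confirm):
_ALLOWED_TRANSITIONS: dict[str, set[str]] = {
    "planning": {"building"},
    "building": {"staging", "planning"},
    "staging": {"running", "building"},
    "running": {"staging"},
}


def _transition_allowed(current: str, target: str) -> bool:
    """Return True only for whitelisted phase edges (illustrative policy)."""
    return target in _ALLOWED_TRANSITIONS.get(current, set())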
# -----------------------------------------------------------------------
# BUG #1 supplement: Double phase switch deduplication
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_double_switch_to_same_phase_is_noop(llm_provider, tmp_path, artifact):
"""switch_to_X when already in X must be a no-op (no event)."""
session, task = await _start_queen_session(llm_provider, tmp_path)
events = []
async def _capture(event: AgentEvent):
events.append(event.data.get("phase"))
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
try:
ps = session.phase_state
await ps.switch_to_building(source="test")
await asyncio.sleep(0.1)
count_after_first = len(events)
# Second call to same phase
await ps.switch_to_building(source="test")
await asyncio.sleep(0.1)
artifact.record_value(
"events_after_first",
count_after_first,
expected="no extra event after double switch",
)
artifact.record_value(
"events_after_second",
len(events),
)
artifact.record_value("all_events", events)
artifact.check(
"no extra event on double switch",
len(events) == count_after_first,
actual=f"first={count_after_first}, second={len(events)}",
expected_val="same count",
)
assert len(events) == count_after_first, (
f"Double switch should not emit extra event. Events: {events}"
)
finally:
await _shutdown(session, task)
# -----------------------------------------------------------------------
# BUG #6: Phase with empty tool lists
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_phase_with_empty_tools_returns_empty(llm_provider, tmp_path, artifact):
"""get_current_tools() with empty tool list returns [] not crash."""
session, task = await _start_queen_session(llm_provider, tmp_path)
try:
ps = session.phase_state
# Clear all running tools
ps.running_tools = []
await ps.switch_to_running(source="test")
tools = ps.get_current_tools()
artifact.record_value(
"tool_count",
len(tools),
expected="0 (empty list, no crash)",
)
artifact.record_value(
"tool_names",
[t.name for t in tools],
)
artifact.check(
"empty tools returns []",
tools == [],
actual=str([t.name for t in tools]),
expected_val="[]",
)
assert tools == [], f"Expected empty list, got: {[t.name for t in tools]}"
finally:
await _shutdown(session, task)
# -----------------------------------------------------------------------
# Rapid phase cycling — verify final state is consistent
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_rapid_phase_cycling_final_state(llm_provider, tmp_path, artifact):
"""Rapidly cycling through phases must leave state consistent."""
session, task = await _start_queen_session(llm_provider, tmp_path)
all_events = []
async def _capture(event: AgentEvent):
all_events.append(event.data.get("phase"))
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
try:
ps = session.phase_state
# Cycle 3 times
for _ in range(3):
await ps.switch_to_building(source="test")
await ps.switch_to_staging(source="test")
await ps.switch_to_running(source="test")
await ps.switch_to_planning(source="test")
await asyncio.sleep(0.3)
artifact.record_value(
"final_phase",
ps.phase,
expected="'planning' after 3 full cycles",
)
artifact.record_value("event_count", len(all_events))
artifact.record_value("all_events", all_events)
artifact.check(
"final phase is planning",
ps.phase == "planning",
actual=repr(ps.phase),
expected_val="'planning'",
)
assert ps.phase == "planning", f"Expected planning, got: {ps.phase}"
# Should have 12 phase change events (4 per cycle x 3)
artifact.check(
"12 phase events",
len(all_events) == 12,
actual=str(len(all_events)),
expected_val="12",
)
assert len(all_events) == 12, f"Expected 12 events, got {len(all_events)}: {all_events}"
# Tools and prompt should match planning phase
prompt = ps.get_current_prompt()
artifact.check(
"prompt non-empty after cycling",
len(prompt) > 0,
actual=str(len(prompt)),
expected_val=">0",
)
assert len(prompt) > 0, "Prompt should not be empty after cycling"
finally:
await _shutdown(session, task)
# -----------------------------------------------------------------------
# Tool availability is correct per phase (strict verification)
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_tool_sets_are_disjoint_across_phases(llm_provider, tmp_path, artifact):
"""Each phase must have a distinct non-empty tool set."""
session, task = await _start_queen_session(llm_provider, tmp_path)
try:
ps = session.phase_state
phase_tools = {}
for phase in ("planning", "building", "staging", "running"):
            # Direct phase assignment keeps this loop simple; get_current_tools()
            # is keyed off ps.phase, so awaiting switch_to_<phase> is not needed.
            ps.phase = phase
tools = {t.name for t in ps.get_current_tools()}
phase_tools[phase] = tools
# All phases should have at least 1 tool
for phase, tools in phase_tools.items():
artifact.check(
f"{phase} has tools",
len(tools) > 0,
actual=str(len(tools)),
expected_val=">0",
)
assert len(tools) > 0, f"{phase} has no tools"
artifact.record_value(
"phase_tools",
{k: sorted(v) for k, v in phase_tools.items()},
expected="all 4 phases have distinct tool sets",
)
# Pairwise comparison: all sets should differ
phases = list(phase_tools.keys())
for i in range(len(phases)):
for j in range(i + 1, len(phases)):
a, b = phases[i], phases[j]
artifact.check(
f"{a} != {b} tools",
phase_tools[a] != phase_tools[b],
actual=(f"{a}={sorted(phase_tools[a])}, {b}={sorted(phase_tools[b])}"),
expected_val="different",
)
assert phase_tools[a] != phase_tools[b], (
f"{a} and {b} have identical tools: {phase_tools[a]}"
)
finally:
await _shutdown(session, task)
# -----------------------------------------------------------------------
# Worker completion -> auto-staging transition
# -----------------------------------------------------------------------
@pytest.mark.asyncio
async def test_worker_completion_triggers_auto_staging(llm_provider, tmp_path, artifact):
"""EXECUTION_COMPLETED in running phase must auto-switch to staging."""
session, task = await _start_queen_session(llm_provider, tmp_path)
phase_changes = []
async def _capture(event: AgentEvent):
phase_changes.append(event.data.get("phase"))
session.event_bus.subscribe(
event_types=[EventType.QUEEN_PHASE_CHANGED],
handler=_capture,
)
try:
ps = session.phase_state
# Move to running phase
await ps.switch_to_running(source="test")
await asyncio.sleep(0.3)
phase_changes.clear() # Reset after manual switch
# Simulate worker completion event
await session.event_bus.publish(
AgentEvent(
type=EventType.EXECUTION_COMPLETED,
stream_id="worker",
data={"output": {"result": "done"}},
)
)
await asyncio.sleep(1.0)
artifact.record_value(
"phase_after_completion",
ps.phase,
expected="'staging' (auto-switch on completion)",
)
artifact.record_value("phase_changes", phase_changes)
artifact.check(
"auto-switched to staging",
ps.phase == "staging",
actual=repr(ps.phase),
expected_val="'staging'",
)
assert ps.phase == "staging", f"Expected auto-switch to staging, got: {ps.phase}"
artifact.check(
"staging event emitted",
"staging" in phase_changes,
actual=str(phase_changes),
expected_val="contains 'staging'",
)
assert "staging" in phase_changes, (
f"QUEEN_PHASE_CHANGED(staging) not emitted. Events: {phase_changes}"
)
finally:
await _shutdown(session, task)
@pytest.mark.asyncio
async def test_worker_failure_triggers_auto_staging(llm_provider, tmp_path, artifact):
"""EXECUTION_FAILED in running phase must auto-switch to staging."""
session, task = await _start_queen_session(llm_provider, tmp_path)
try:
ps = session.phase_state
await ps.switch_to_running(source="test")
await asyncio.sleep(0.3)
# Simulate worker failure event
await session.event_bus.publish(
AgentEvent(
type=EventType.EXECUTION_FAILED,
stream_id="worker",
data={"error": "worker crashed"},
)
)
await asyncio.sleep(1.0)
artifact.record_value(
"phase_after_failure",
ps.phase,
expected="'staging' (auto-switch on failure)",
)
artifact.check(
"auto-switched to staging on failure",
ps.phase == "staging",
actual=repr(ps.phase),
expected_val="'staging'",
)
assert ps.phase == "staging", f"Expected auto-switch to staging on failure, got: {ps.phase}"
finally:
await _shutdown(session, task)
@@ -27,7 +27,7 @@ SET_OUTPUT = (
@pytest.mark.asyncio
async def test_strict_echo_exact_path_and_steps(runtime, goal, llm_provider, artifact):
"""Echo node: path must be exactly ['echo'], steps must be 1."""
graph = GraphSpec(
id="strict-echo",
@@ -54,21 +54,70 @@ async def test_strict_echo_exact_path_and_steps(runtime, goal, llm_provider):
conversation_mode="continuous",
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(graph, goal, {"input": "ECHO_TEST_42"}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, path=['echo'], steps=1, "
"output['output'] set, quality='clean', "
"retries=0, tokens>0"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches", result.path == ["echo"], actual=str(result.path), expected_val="['echo']"
)
assert result.path == ["echo"]
artifact.check(
"steps_executed is 1",
result.steps_executed == 1,
actual=str(result.steps_executed),
expected_val="1",
)
assert result.steps_executed == 1
actual_output = result.output.get("output")
artifact.check(
"output['output'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("output") is not None
artifact.check(
"execution_quality is clean",
result.execution_quality == "clean",
actual=repr(result.execution_quality),
expected_val="'clean'",
)
assert result.execution_quality == "clean"
artifact.check(
"total_retries is 0",
result.total_retries == 0,
actual=str(result.total_retries),
expected_val="0",
)
assert result.total_retries == 0
artifact.check(
"total_tokens > 0",
result.total_tokens > 0,
actual=str(result.total_tokens),
expected_val=">0",
)
assert result.total_tokens > 0
@pytest.mark.asyncio
async def test_strict_clean_execution_quality(runtime, goal, llm_provider, artifact):
"""A simple set_output call should produce 'clean' execution quality."""
graph = GraphSpec(
id="strict-clean",
@@ -92,12 +141,37 @@ async def test_strict_clean_execution_quality(runtime, goal, llm_provider):
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(result, expected="clean success, no partial failures, no nodes_with_failures")
artifact.check(
"is_clean_success",
result.is_clean_success,
actual=(
f"quality={result.execution_quality}, "
f"retries={result.total_retries}, "
f"failures={result.nodes_with_failures}"
),
expected_val="True",
)
assert result.is_clean_success, (
f"Expected clean success, got quality={result.execution_quality}, "
f"retries={result.total_retries}, failures={result.nodes_with_failures}"
)
artifact.check(
"no partial failures",
not result.had_partial_failures,
actual=str(result.had_partial_failures),
expected_val="False",
)
assert not result.had_partial_failures
artifact.check(
"no nodes_with_failures",
len(result.nodes_with_failures) == 0,
actual=str(result.nodes_with_failures),
expected_val="[]",
)
assert len(result.nodes_with_failures) == 0
@@ -107,8 +181,8 @@ async def test_strict_clean_execution_quality(runtime, goal, llm_provider):
@pytest.mark.asyncio
async def test_strict_pipeline_path_ordering(runtime, goal, llm_provider, artifact):
"""Three-node pipeline must traverse in exact order: a -> b -> c."""
graph = GraphSpec(
id="strict-pipeline",
goal_id="dummy",
@@ -118,45 +192,106 @@ async def test_strict_pipeline_path_ordering(runtime, goal, llm_provider):
conversation_mode="continuous",
nodes=[
NodeSpec(
id="a", name="A", description="First",
node_type="event_loop", output_keys=["a_out"],
id="a",
name="A",
description="First",
node_type="event_loop",
output_keys=["a_out"],
system_prompt="Call set_output with key='a_out' and value='from_a'. " + SET_OUTPUT,
),
NodeSpec(
id="b", name="B", description="Second",
node_type="event_loop", input_keys=["b_in"], output_keys=["b_out"],
id="b",
name="B",
description="Second",
node_type="event_loop",
input_keys=["b_in"],
output_keys=["b_out"],
system_prompt="Call set_output with key='b_out' and value='from_b'. " + SET_OUTPUT,
),
NodeSpec(
id="c", name="C", description="Third",
node_type="event_loop", input_keys=["c_in"], output_keys=["result"],
id="c",
name="C",
description="Third",
node_type="event_loop",
input_keys=["c_in"],
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='from_c'. " + SET_OUTPUT,
),
],
edges=[
EdgeSpec(id="a-b", source="a", target="b",
condition=EdgeCondition.ON_SUCCESS, input_mapping={"b_in": "a_out"}),
EdgeSpec(id="b-c", source="b", target="c",
condition=EdgeCondition.ON_SUCCESS, input_mapping={"c_in": "b_out"}),
EdgeSpec(
id="a-b",
source="a",
target="b",
condition=EdgeCondition.ON_SUCCESS,
input_mapping={"b_in": "a_out"},
),
EdgeSpec(
id="b-c",
source="b",
target="c",
condition=EdgeCondition.ON_SUCCESS,
input_mapping={"c_in": "b_out"},
),
],
memory_keys=["a_out", "b_in", "b_out", "c_in", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, path=['a','b','c'], steps=3, "
"output['result'] set, each node visited once"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["a", "b", "c"],
actual=str(result.path),
expected_val="['a', 'b', 'c']",
)
assert result.path == ["a", "b", "c"], f"Path was {result.path}"
artifact.check(
"steps_executed is 3",
result.steps_executed == 3,
actual=str(result.steps_executed),
expected_val="3",
)
assert result.steps_executed == 3
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
# Visit counts: each node visited exactly once
a_visits = result.node_visit_counts.get("a", 0)
artifact.check("node 'a' visited once", a_visits == 1, actual=str(a_visits), expected_val="1")
assert result.node_visit_counts.get("a", 0) == 1
b_visits = result.node_visit_counts.get("b", 0)
artifact.check("node 'b' visited once", b_visits == 1, actual=str(b_visits), expected_val="1")
assert result.node_visit_counts.get("b", 0) == 1
c_visits = result.node_visit_counts.get("c", 0)
artifact.check("node 'c' visited once", c_visits == 1, actual=str(c_visits), expected_val="1")
assert result.node_visit_counts.get("c", 0) == 1
@pytest.mark.asyncio
async def test_strict_branch_correct_terminal(runtime, goal, llm_provider, artifact):
"""Classifier node must route 'I love it' to the positive terminal."""
graph = GraphSpec(
id="strict-branch",
@@ -167,8 +302,11 @@ async def test_strict_branch_correct_terminal(runtime, goal, llm_provider):
conversation_mode="continuous",
nodes=[
NodeSpec(
id="classify", name="Classify", description="Sentiment classifier",
node_type="event_loop", input_keys=["text"],
id="classify",
name="Classify",
description="Sentiment classifier",
node_type="event_loop",
input_keys=["text"],
output_keys=["label"],
system_prompt=(
"Read the 'text' input. Determine if sentiment is positive or negative. "
@@ -177,39 +315,87 @@ async def test_strict_branch_correct_terminal(runtime, goal, llm_provider):
),
),
NodeSpec(
id="positive", name="Positive", description="Positive handler",
node_type="event_loop", output_keys=["result"],
id="positive",
name="Positive",
description="Positive handler",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='POS'. " + SET_OUTPUT,
),
NodeSpec(
id="negative", name="Negative", description="Negative handler",
node_type="event_loop", output_keys=["result"],
id="negative",
name="Negative",
description="Negative handler",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='NEG'. " + SET_OUTPUT,
),
],
edges=[
EdgeSpec(id="to-pos", source="classify", target="positive",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'positive'", priority=1),
EdgeSpec(id="to-neg", source="classify", target="negative",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'negative'", priority=0),
EdgeSpec(
id="to-pos",
source="classify",
target="positive",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'positive'",
priority=1,
),
EdgeSpec(
id="to-neg",
source="classify",
target="negative",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'negative'",
priority=0,
),
],
memory_keys=["text", "label", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(
graph, goal, {"text": "I absolutely love this product, it's fantastic!"}, validate_graph=False
graph,
goal,
{"text": "I absolutely love this product, it's fantastic!"},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, path=['classify','positive'], steps=2, output['result']='POS'",
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["classify", "positive"],
actual=str(result.path),
expected_val="['classify', 'positive']",
)
assert result.path == ["classify", "positive"], f"Path was {result.path}"
artifact.check(
"steps_executed is 2",
result.steps_executed == 2,
actual=str(result.steps_executed),
expected_val="2",
)
assert result.steps_executed == 2
actual_result = result.output.get("result")
artifact.check(
"output['result'] is 'POS'",
actual_result == "POS",
actual=repr(actual_result),
expected_val="'POS'",
)
assert result.output.get("result") == "POS"
@pytest.mark.asyncio
async def test_strict_branch_negative_terminal(runtime, goal, llm_provider, artifact):
"""Classifier node must route hateful text to the negative terminal."""
graph = GraphSpec(
id="strict-branch-neg",
@@ -220,8 +406,11 @@ async def test_strict_branch_negative_terminal(runtime, goal, llm_provider):
conversation_mode="continuous",
nodes=[
NodeSpec(
id="classify", name="Classify", description="Sentiment classifier",
node_type="event_loop", input_keys=["text"],
id="classify",
name="Classify",
description="Sentiment classifier",
node_type="event_loop",
input_keys=["text"],
output_keys=["label"],
system_prompt=(
"Read the 'text' input. Determine if sentiment is positive or negative. "
@@ -230,34 +419,82 @@ async def test_strict_branch_negative_terminal(runtime, goal, llm_provider):
),
),
NodeSpec(
id="positive", name="Positive", description="Positive handler",
node_type="event_loop", output_keys=["result"],
id="positive",
name="Positive",
description="Positive handler",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='POS'. " + SET_OUTPUT,
),
NodeSpec(
id="negative", name="Negative", description="Negative handler",
node_type="event_loop", output_keys=["result"],
id="negative",
name="Negative",
description="Negative handler",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='NEG'. " + SET_OUTPUT,
),
],
edges=[
EdgeSpec(id="to-pos", source="classify", target="positive",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'positive'", priority=1),
EdgeSpec(id="to-neg", source="classify", target="negative",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'negative'", priority=0),
EdgeSpec(
id="to-pos",
source="classify",
target="positive",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'positive'",
priority=1,
),
EdgeSpec(
id="to-neg",
source="classify",
target="negative",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('label') == 'negative'",
priority=0,
),
],
memory_keys=["text", "label", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 3})
result = await executor.execute(
graph, goal, {"text": "This is absolutely terrible and broken. Worst ever."}, validate_graph=False
graph,
goal,
{"text": "This is absolutely terrible and broken. Worst ever."},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, path=['classify','negative'], steps=2, output['result']='NEG'",
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["classify", "negative"],
actual=str(result.path),
expected_val="['classify', 'negative']",
)
assert result.path == ["classify", "negative"], f"Path was {result.path}"
artifact.check(
"steps_executed is 2",
result.steps_executed == 2,
actual=str(result.steps_executed),
expected_val="2",
)
assert result.steps_executed == 2
actual_result = result.output.get("result")
artifact.check(
"output['result'] is 'NEG'",
actual_result == "NEG",
actual=repr(actual_result),
expected_val="'NEG'",
)
assert result.output.get("result") == "NEG"
@@ -268,7 +505,7 @@ async def test_strict_branch_negative_terminal(runtime, goal, llm_provider):
@pytest.mark.asyncio
async def test_strict_tool_output_format(
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
"""Worker must call get_current_time and produce output in STATUS|date|day format."""
graph = GraphSpec(
@@ -290,8 +527,7 @@ async def test_strict_tool_output_format(
"Extract the 'date' and 'day_of_week' fields from the result. "
"Build this exact format: STATUS|<date>|<day_of_week> "
"(example: STATUS|2026-04-03|Thursday). "
"Call set_output with key='result' and this formatted string. "
+ SET_OUTPUT
"Call set_output with key='result' and this formatted string. " + SET_OUTPUT
),
),
],
@@ -300,32 +536,77 @@ async def test_strict_tool_output_format(
conversation_mode="continuous",
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result, expected="success=True, output['result'] in STATUS|YYYY-MM-DD|DayName format"
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
output = result.output.get("result")
artifact.check(
"output['result'] is set",
output is not None,
actual=repr(output),
expected_val="non-None value",
)
assert output is not None, "No result output"
# Strict format verification: STATUS|date|day_of_week
parts = output.split("|")
artifact.check(
"3 pipe-separated parts",
len(parts) == 3,
actual=f"{len(parts)} parts: {output}",
expected_val="3 parts",
)
assert len(parts) == 3, f"Expected 3 pipe-separated parts, got {len(parts)}: {output}"
artifact.check(
"first part is STATUS", parts[0] == "STATUS", actual=repr(parts[0]), expected_val="'STATUS'"
)
assert parts[0] == "STATUS", f"First part should be STATUS, got: {parts[0]}"
# Date part should look like YYYY-MM-DD
artifact.check(
"date part length >= 8",
len(parts[1]) >= 8,
actual=f"len={len(parts[1])}, value={parts[1]}",
expected_val=">=8",
)
assert len(parts[1]) >= 8, f"Date part too short: {parts[1]}"
artifact.check(
"date part contains dashes",
"-" in parts[1],
actual=repr(parts[1]),
expected_val="contains '-'",
)
assert "-" in parts[1], f"Date part should contain dashes: {parts[1]}"
# Day of week should be a recognizable day name
valid_days = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"}
artifact.check(
"valid day_of_week",
parts[2] in valid_days,
actual=repr(parts[2]),
expected_val=f"one of {sorted(valid_days)}",
)
assert parts[2] in valid_days, f"Invalid day_of_week: {parts[2]}"
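    # Tighter alternative (illustrative sketch; intentionally recorded rather
    # than asserted, since it is stricter than the checks above): validate the
    # whole STATUS|YYYY-MM-DD|DayName string with one regex.
    import re

    _status_re = re.compile(
        r"STATUS\|\d{4}-\d{2}-\d{2}\|"
        r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)"
    )
    artifact.record_value("strict_format_match", bool(_status_re.fullmatch(output)))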
@pytest.mark.asyncio
async def test_strict_artifact_creation_and_verification(
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
"""Single-node: saves a file via save_data, then verifies the artifact on disk."""
storage_path = tmp_path / "session"
@@ -359,7 +640,8 @@ async def test_strict_artifact_creation_and_verification(
memory_keys=["task", "result"],
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=storage_path,
@@ -367,23 +649,72 @@ async def test_strict_artifact_creation_and_verification(
result = await executor.execute(
graph, goal, {"task": "Create and verify artifact"}, validate_graph=False
)
artifact.record(
result,
expected=(
"success=True, path=['worker'], steps=1, "
"output contains INTEGRATION_TEST_PAYLOAD_XYZ, "
"file on disk matches"
),
)
# Strict outcome verification
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["worker"],
actual=str(result.path),
expected_val="['worker']",
)
assert result.path == ["worker"], f"Path was {result.path}"
artifact.check(
"steps_executed is 1",
result.steps_executed == 1,
actual=str(result.steps_executed),
expected_val="1",
)
assert result.steps_executed == 1
# Output must be the loaded content
output = result.output.get("result")
assert output is not None, "Worker did not set 'result'"
assert "INTEGRATION_TEST_PAYLOAD_XYZ" in output, (
f"Expected payload in output, got: {output}"
artifact.check(
"output['result'] is set",
output is not None,
actual=repr(output),
expected_val="non-None value",
)
assert output is not None, "Worker did not set 'result'"
artifact.check(
"output contains payload",
"INTEGRATION_TEST_PAYLOAD_XYZ" in output,
actual=repr(output),
expected_val="contains 'INTEGRATION_TEST_PAYLOAD_XYZ'",
)
assert "INTEGRATION_TEST_PAYLOAD_XYZ" in output, f"Expected payload in output, got: {output}"
# Verify the actual file exists on disk (save_data uses storage_path/data/)
artifact_path = storage_path / "data" / "test_artifact.txt"
artifact.check(
"artifact file exists",
artifact_path.exists(),
actual=str(artifact_path.exists()),
expected_val="True",
)
assert artifact_path.exists(), f"Artifact not found at {artifact_path}"
file_content = artifact_path.read_text(encoding="utf-8").strip()
artifact.check(
"file content matches payload",
file_content == "INTEGRATION_TEST_PAYLOAD_XYZ",
actual=repr(file_content),
expected_val="'INTEGRATION_TEST_PAYLOAD_XYZ'",
)
assert file_content == "INTEGRATION_TEST_PAYLOAD_XYZ", (
f"File content mismatch: {file_content!r}"
)
@@ -395,7 +726,7 @@ async def test_strict_artifact_creation_and_verification(
@pytest.mark.asyncio
async def test_strict_feedback_loop_visit_counts(runtime, goal, llm_provider, artifact):
"""Feedback loop must respect max_node_visits and record visit counts."""
from .nodes import StatefulNode, SuccessNode
from framework.graph.node import NodeResult
@@ -406,23 +737,48 @@ async def test_strict_feedback_loop_visit_counts(runtime, goal, llm_provider):
entry_node="draft",
terminal_nodes=["done"],
nodes=[
NodeSpec(id="draft", name="Draft", description="Produces draft",
node_type="event_loop", output_keys=["draft_output"], max_node_visits=3),
NodeSpec(id="review", name="Review", description="Reviews draft",
node_type="event_loop", input_keys=["draft_output"],
output_keys=["approved"]),
NodeSpec(id="done", name="Done", description="Terminal",
node_type="event_loop", output_keys=["final"]),
NodeSpec(
id="draft",
name="Draft",
description="Produces draft",
node_type="event_loop",
output_keys=["draft_output"],
max_node_visits=3,
),
NodeSpec(
id="review",
name="Review",
description="Reviews draft",
node_type="event_loop",
input_keys=["draft_output"],
output_keys=["approved"],
),
NodeSpec(
id="done",
name="Done",
description="Terminal",
node_type="event_loop",
output_keys=["final"],
),
],
edges=[
EdgeSpec(id="d-r", source="draft", target="review",
condition=EdgeCondition.ON_SUCCESS),
EdgeSpec(id="r-d", source="review", target="draft",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('approved') == False", priority=1),
EdgeSpec(id="r-done", source="review", target="done",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('approved') == True", priority=0),
EdgeSpec(id="d-r", source="draft", target="review", condition=EdgeCondition.ON_SUCCESS),
EdgeSpec(
id="r-d",
source="review",
target="draft",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('approved') == False",
priority=1,
),
EdgeSpec(
id="r-done",
source="review",
target="done",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('approved') == True",
priority=0,
),
],
memory_keys=["draft_output", "approved", "final"],
)
@@ -430,28 +786,70 @@ async def test_strict_feedback_loop_visit_counts(runtime, goal, llm_provider):
# Deterministic nodes: reject twice, then approve
executor.register_node("draft", SuccessNode(output={"draft_output": "v1"}))
executor.register_node("review", StatefulNode([
NodeResult(success=True, output={"approved": False}),
NodeResult(success=True, output={"approved": False}),
NodeResult(success=True, output={"approved": True}),
]))
executor.register_node(
"review",
StatefulNode(
[
NodeResult(success=True, output={"approved": False}),
NodeResult(success=True, output={"approved": False}),
NodeResult(success=True, output={"approved": True}),
]
),
)
executor.register_node("done", SuccessNode(output={"final": "complete"}))
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, 'done' in path, "
"draft visited 3x, review visited 3x, "
"done visited 1x, output['final']='complete'"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"'done' in path",
"done" in result.path,
actual=str(result.path),
expected_val="contains 'done'",
)
assert "done" in result.path
# Strict visit count verification
draft_visits = result.node_visit_counts.get("draft", 0)
artifact.check(
"draft visited 3 times", draft_visits == 3, actual=str(draft_visits), expected_val="3"
)
assert result.node_visit_counts.get("draft", 0) == 3, (
f"Draft should be visited 3 times, got {result.node_visit_counts.get('draft')}"
)
review_visits = result.node_visit_counts.get("review", 0)
artifact.check(
"review visited 3 times", review_visits == 3, actual=str(review_visits), expected_val="3"
)
assert result.node_visit_counts.get("review", 0) == 3, (
f"Review should be visited 3 times, got {result.node_visit_counts.get('review')}"
)
done_visits = result.node_visit_counts.get("done", 0)
artifact.check("done visited once", done_visits == 1, actual=str(done_visits), expected_val="1")
assert result.node_visit_counts.get("done", 0) == 1, (
f"Done should be visited once, got {result.node_visit_counts.get('done')}"
)
# Final output must be from the 'done' node
final_output = result.output.get("final")
artifact.check(
"output['final'] is 'complete'",
final_output == "complete",
actual=repr(final_output),
expected_val="'complete'",
)
assert result.output.get("final") == "complete"
+115 -14
View File
@@ -15,47 +15,124 @@ from framework.llm.provider import ToolUse
from .conftest import make_executor
def test_tools_mcp_server_connects(tool_registry):
def test_tools_mcp_server_connects(tool_registry, artifact):
"""MCP server should start and expose tools."""
tools = tool_registry.get_tools()
artifact.record_value(
"tool_count",
len(tools),
expected="at least 1 tool exposed by MCP server",
)
artifact.record_value("tool_names", list(tools.keys()))
artifact.check(
"MCP server exposes tools",
len(tools) > 0,
actual=str(len(tools)),
expected_val=">0",
)
assert len(tools) > 0, "MCP server should expose at least one tool"
def test_tools_registry_has_expected_tools(tool_registry):
def test_tools_registry_has_expected_tools(tool_registry, artifact):
"""hive-tools should expose the expected tool names."""
tool_names = set(tool_registry.get_tools().keys())
expected = {"example_tool", "get_current_time"}
assert expected.issubset(tool_names), (
f"Missing expected tools: {expected - tool_names}"
artifact.record_value(
"tool_names",
sorted(tool_names),
expected="superset of {example_tool, get_current_time}",
)
artifact.record_value("expected_tools", sorted(expected))
missing = expected - tool_names
artifact.check(
"expected tools present",
expected.issubset(tool_names),
actual=str(sorted(tool_names)),
expected_val=f"superset of {sorted(expected)}",
)
assert expected.issubset(tool_names), f"Missing expected tools: {missing}"
@pytest.mark.asyncio
async def test_tools_execute_example_tool(tool_registry):
async def test_tools_execute_example_tool(tool_registry, artifact):
"""Direct tool execution without LLM — verifies MCP round-trip."""
executor = tool_registry.get_executor()
tool_use = ToolUse(id="test-1", name="example_tool", input={"message": "hello", "uppercase": True})
tool_use = ToolUse(
id="test-1",
name="example_tool",
input={"message": "hello", "uppercase": True},
)
result = executor(tool_use)
artifact.record_value(
"is_error",
result.is_error,
expected="not an error, content contains 'HELLO'",
)
artifact.record_value("content", result.content)
artifact.check(
"result is not error",
not result.is_error,
actual=str(result.is_error),
expected_val="False",
)
assert not result.is_error
artifact.check(
"content contains HELLO",
"HELLO" in result.content,
actual=repr(result.content),
expected_val="contains 'HELLO'",
)
assert "HELLO" in result.content
@pytest.mark.asyncio
async def test_tools_execute_get_current_time(tool_registry):
async def test_tools_execute_get_current_time(tool_registry, artifact):
"""get_current_time should return a dict with date/time fields."""
executor = tool_registry.get_executor()
tool_use = ToolUse(id="test-2", name="get_current_time", input={"timezone": "UTC"})
tool_use = ToolUse(
id="test-2",
name="get_current_time",
input={"timezone": "UTC"},
)
result = executor(tool_use)
artifact.record_value(
"is_error",
result.is_error,
expected="not an error, content contains year (202x)",
)
artifact.record_value("content", result.content)
artifact.check(
"result is not error",
not result.is_error,
actual=str(result.is_error),
expected_val="False",
)
assert not result.is_error
artifact.check(
"content contains year",
"202" in result.content,
actual=repr(result.content),
expected_val="contains '202'",
)
# Should contain date-like content
assert "202" in result.content, "Should contain a year (202x)"
@pytest.mark.asyncio
async def test_tools_llm_calls_tool_and_gets_result(
runtime, llm_provider, tool_registry, goal
runtime, llm_provider, tool_registry, goal, artifact
):
"""Full round-trip: LLM calls a real tool and uses the result to set output."""
"""Full round-trip: LLM calls a tool and uses the result."""
graph = GraphSpec(
id="tool-roundtrip",
goal_id="dummy",
@@ -72,8 +149,9 @@ async def test_tools_llm_calls_tool_and_gets_result(
output_keys=["result"],
tools=["example_tool"],
system_prompt=(
"Use the example_tool to process the message from the task input "
"with uppercase=true. Then call set_output with key='result' and "
"Use the example_tool to process the message "
"from the task input with uppercase=true. Then "
"call set_output with key='result' and "
"the tool's return value."
),
),
@@ -83,12 +161,35 @@ async def test_tools_llm_calls_tool_and_gets_result(
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
)
result = await executor.execute(
graph, goal, {"task": "Process the word 'hello'"}, validate_graph=False
graph,
goal,
{"task": "Process the word 'hello'"},
validate_graph=False,
)
artifact.record(
result,
expected="success=True, output['result'] is set",
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
@@ -0,0 +1,713 @@
"""Component tests: Verified Outcomes — cross-checked, deterministic, no trust required.
These tests eliminate false positives by:
1. Using DETERMINISTIC inputs with KNOWN correct outputs
2. Cross-checking LLM output against ground truth (tool results, file contents)
3. Using REGEX validation instead of "is not None"
4. Running a VERIFIER node that independently checks the first node's work
5. Asserting on CONTENT, not just existence
If a test here passes, the output is provably correct, not just non-null.
"""
from __future__ import annotations
import json
import re
import pytest
from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
from framework.graph.node import NodeSpec
from .conftest import make_executor
SET_OUTPUT = (
"You MUST call the set_output tool. "
"Do not just write text — call set_output with the correct key and value."
)
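# Appended to every system_prompt below: a blunt nudge that pushes the model
# toward a structured set_output call instead of free-text answers, which is
# what the exact-match assertions in this module depend on.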
# ---------------------------------------------------------------------------
# 1. Echo round-trip: input == output (exact match, no LLM creativity)
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_echo_exact_content(runtime, goal, llm_provider, artifact):
"""Echo test with EXACT content verification — not just 'is not None'.
The input is a unique token. The output must contain that exact token.
This catches LLMs that hallucinate or paraphrase instead of echoing.
"""
UNIQUE_TOKEN = "XRAY_7742_BRAVO_ECHO"
graph = GraphSpec(
id="verified-echo",
goal_id="dummy",
entry_node="echo",
entry_points={"start": "echo"},
terminal_nodes=["echo"],
nodes=[
NodeSpec(
id="echo",
name="Echo",
description="Echoes input exactly",
node_type="event_loop",
input_keys=["input"],
output_keys=["output"],
system_prompt=(
"Read the 'input' value. Call set_output with key='output' "
"and the EXACT same string. Do not modify it. Do not add quotes "
"or punctuation. Just the raw string." + SET_OUTPUT
),
),
],
edges=[],
memory_keys=["input", "output"],
conversation_mode="continuous",
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(graph, goal, {"input": UNIQUE_TOKEN}, validate_graph=False)
artifact.record(
result, expected="success=True, output['output'] contains exact token XRAY_7742_BRAVO_ECHO"
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
output = result.output.get("output", "")
artifact.check(
"output contains exact token",
UNIQUE_TOKEN in output,
actual=repr(output),
expected_val=f"contains '{UNIQUE_TOKEN}'",
)
assert UNIQUE_TOKEN in output, f"Exact token '{UNIQUE_TOKEN}' not found in output: {output!r}"
# ---------------------------------------------------------------------------
# 2. Math verification: LLM computes, we verify the answer independently
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_tool_result_matches_ground_truth(
runtime, goal, llm_provider, tool_registry, artifact
):
"""get_current_time returns real data — verify output matches tool's actual return.
We call the tool directly (ground truth), then run the LLM graph,
and verify the LLM's output contains the SAME day_of_week.
This catches LLMs that hallucinate dates.
"""
from framework.llm.provider import ToolUse
# Step 1: Get ground truth by calling tool directly
executor_fn = tool_registry.get_executor()
tool_use = ToolUse(id="ground-truth", name="get_current_time", input={"timezone": "UTC"})
ground_truth_result = executor_fn(tool_use)
artifact.record_value(
"ground_truth_is_error",
ground_truth_result.is_error,
expected="ground truth tool returns day_of_week matching LLM output",
)
assert not ground_truth_result.is_error
# Parse the actual day_of_week from the tool
gt_data = json.loads(ground_truth_result.content)
actual_day = gt_data.get("day_of_week", "")
artifact.record_value("ground_truth_day", actual_day)
assert actual_day, f"Tool didn't return day_of_week: {gt_data}"
# Step 2: Run LLM graph that uses the same tool
graph = GraphSpec(
id="verified-time",
goal_id="dummy",
entry_node="worker",
entry_points={"start": "worker"},
terminal_nodes=["worker"],
nodes=[
NodeSpec(
id="worker",
name="Worker",
description="Get current time and report day",
node_type="event_loop",
output_keys=["result"],
tools=["get_current_time"],
system_prompt=(
"Call get_current_time with timezone='UTC'. "
"Extract the day_of_week from the result. "
"Call set_output with key='result' and ONLY the day_of_week string "
"(e.g., 'Monday'). Nothing else." + SET_OUTPUT
),
),
],
edges=[],
memory_keys=["result"],
conversation_mode="continuous",
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result,
expected=f"success=True, output['result'] matches ground truth day_of_week='{actual_day}'",
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
llm_day = (result.output.get("result") or "").strip()
artifact.record_value("llm_day", llm_day)
# Step 3: Cross-check — LLM's answer must match ground truth
artifact.check(
"LLM day matches ground truth",
actual_day.lower() in llm_day.lower(),
actual=repr(llm_day),
expected_val=f"contains '{actual_day}'",
)
assert actual_day.lower() in llm_day.lower(), (
f"LLM reported '{llm_day}' but tool returned '{actual_day}'. "
f"The LLM hallucinated or misread the tool result."
)
# ---------------------------------------------------------------------------
# 3. File artifact round-trip: write -> read -> binary compare
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_artifact_binary_match(
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
"""Save a file, then verify the on-disk content matches EXACTLY.
Does NOT rely on the LLM to verify; we read the file ourselves.
This catches save_data bugs, encoding issues, or the LLM adding extra content.
"""
PAYLOAD = "VERIFIED_PAYLOAD_99_ZULU"
storage_path = tmp_path / "session"
graph = GraphSpec(
id="verified-artifact",
goal_id="dummy",
entry_node="worker",
entry_points={"start": "worker"},
terminal_nodes=["worker"],
nodes=[
NodeSpec(
id="worker",
name="Writer",
description="Saves exact payload to file",
node_type="event_loop",
input_keys=["task"],
output_keys=["result"],
tools=["save_data"],
system_prompt=(
f"Call save_data with filename='verified.txt' and data='{PAYLOAD}'. "
"Then call set_output with key='result' and value='saved'. " + SET_OUTPUT
),
),
],
edges=[],
memory_keys=["task", "result"],
conversation_mode="continuous",
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=storage_path,
)
result = await executor.execute(graph, goal, {"task": "save the file"}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, file 'verified.txt' on disk "
"matches VERIFIED_PAYLOAD_99_ZULU exactly"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
# Cross-check: read the file ourselves — don't trust the LLM
artifact_path = storage_path / "data" / "verified.txt"
artifact.check(
"file exists on disk",
artifact_path.exists(),
actual=str(artifact_path.exists()),
expected_val="True",
)
assert artifact_path.exists(), f"File not created at {artifact_path}"
actual_content = artifact_path.read_text(encoding="utf-8").strip()
artifact.check(
"file content matches payload",
actual_content == PAYLOAD,
actual=repr(actual_content),
expected_val=repr(PAYLOAD),
)
assert actual_content == PAYLOAD, (
f"File content mismatch.\n"
f" Expected: {PAYLOAD!r}\n"
f" Actual: {actual_content!r}\n"
f"The LLM may have modified the payload or save_data encoded it differently."
)
# ---------------------------------------------------------------------------
# 4. Pipeline data integrity: track a token through N nodes
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_pipeline_token_survives(runtime, goal, llm_provider, artifact):
"""Pass a unique token through 3 nodes — verify it arrives at the end.
Each node is instructed to PRESERVE the token. If any node drops or
modifies it, the final assertion catches it. This verifies that
input_mapping and continuous conversation mode actually deliver data correctly.
"""
TOKEN = "TRACKING_TOKEN_88X"
graph = GraphSpec(
id="verified-pipeline",
goal_id="dummy",
entry_node="a",
entry_points={"start": "a"},
terminal_nodes=["c"],
conversation_mode="continuous",
nodes=[
NodeSpec(
id="a",
name="Node A",
description="First node",
node_type="event_loop",
input_keys=["token"],
output_keys=["a_out"],
system_prompt=(
"Read the 'token' input. Call set_output with key='a_out' "
"and the EXACT token value. Do not modify it." + SET_OUTPUT
),
),
NodeSpec(
id="b",
name="Node B",
description="Middle node",
node_type="event_loop",
input_keys=["b_in"],
output_keys=["b_out"],
system_prompt=(
"Read the 'b_in' input. Call set_output with key='b_out' "
"and the EXACT same value. Do not modify it." + SET_OUTPUT
),
),
NodeSpec(
id="c",
name="Node C",
description="Terminal node",
node_type="event_loop",
input_keys=["c_in"],
output_keys=["result"],
system_prompt=(
"Read the 'c_in' input. Call set_output with key='result' "
"and the EXACT same value. Do not modify it." + SET_OUTPUT
),
),
],
edges=[
EdgeSpec(
id="a-b",
source="a",
target="b",
condition=EdgeCondition.ON_SUCCESS,
input_mapping={"b_in": "a_out"},
),
EdgeSpec(
id="b-c",
source="b",
target="c",
condition=EdgeCondition.ON_SUCCESS,
input_mapping={"c_in": "b_out"},
),
],
memory_keys=["token", "a_out", "b_in", "b_out", "c_in", "result"],
)
executor = make_executor(runtime, llm_provider, loop_config={"max_iterations": 5})
result = await executor.execute(graph, goal, {"token": TOKEN}, validate_graph=False)
artifact.record(
result,
expected="success=True, path=['a','b','c'], output['result'] contains TRACKING_TOKEN_88X",
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["a", "b", "c"],
actual=str(result.path),
expected_val="['a', 'b', 'c']",
)
assert result.path == ["a", "b", "c"]
final_output = result.output.get("result", "")
artifact.check(
"token survives pipeline",
TOKEN in final_output,
actual=repr(final_output),
expected_val=f"contains '{TOKEN}'",
)
assert TOKEN in final_output, (
f"Token '{TOKEN}' lost in pipeline.\n"
f" Input: {TOKEN}\n"
f" Final output: {final_output!r}\n"
f" Path: {result.path}\n"
f"Data was corrupted or dropped during node transitions."
)
# ---------------------------------------------------------------------------
# 5. Structured format with regex validation
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_format_with_regex(runtime, goal, llm_provider, tool_registry, artifact):
"""Output must match a strict regex — not just 'contains a pipe character'.
Format: STATUS|YYYY-MM-DD|DayName
Regex validates each segment independently.
"""
graph = GraphSpec(
id="verified-format",
goal_id="dummy",
entry_node="worker",
entry_points={"start": "worker"},
terminal_nodes=["worker"],
nodes=[
NodeSpec(
id="worker",
name="Worker",
description="Produce formatted status string",
node_type="event_loop",
output_keys=["result"],
tools=["get_current_time"],
system_prompt=(
"Call get_current_time with timezone='UTC'. "
"Build this EXACT format: STATUS|<date>|<day_of_week>\n"
"Where <date> is YYYY-MM-DD format and <day_of_week> is the full day name.\n"
"Example: STATUS|2026-04-03|Thursday\n"
"Call set_output with key='result' and the formatted string.\n"
"Output ONLY the formatted string, nothing else." + SET_OUTPUT
),
),
],
edges=[],
memory_keys=["result"],
conversation_mode="continuous",
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result, expected="success=True, output['result'] matches regex STATUS|YYYY-MM-DD|DayName"
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
output = (result.output.get("result") or "").strip()
artifact.record_value("raw_output", output)
# Strict regex: STATUS|YYYY-MM-DD|DayName
pattern = (
r"^STATUS\|\d{4}-\d{2}-\d{2}\|(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)$"
)
matches = bool(re.match(pattern, output))
artifact.check(
"output matches regex",
matches,
actual=repr(output),
expected_val=f"matches pattern: {pattern}",
)
assert re.match(pattern, output), (
f"Output does not match required format.\n"
f" Expected pattern: STATUS|YYYY-MM-DD|DayName\n"
f" Actual output: {output!r}\n"
f" Regex: {pattern}"
)
# ---------------------------------------------------------------------------
# 6. Two-node cross-verification: writer + independent verifier
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_two_node_cross_check(
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
"""Node 1 writes a file. Node 2 loads it and compares to expected.
Both nodes operate INDEPENDENTLY on the same file. If the content
doesn't match, the verifier reports MISMATCH. We also read the file
ourselves as a triple-check.
"""
EXPECTED = "CROSS_CHECK_ALPHA_42"
storage_path = tmp_path / "session"
graph = GraphSpec(
id="verified-cross-check",
goal_id="dummy",
entry_node="writer",
entry_points={"start": "writer"},
terminal_nodes=["verifier"],
conversation_mode="continuous",
nodes=[
NodeSpec(
id="writer",
name="Writer",
description="Writes exact content to file",
node_type="event_loop",
output_keys=["filename"],
tools=["save_data"],
system_prompt=(
f"Call save_data with filename='crosscheck.txt' and data='{EXPECTED}'. "
"Then call set_output with key='filename' and value='crosscheck.txt'."
+ SET_OUTPUT
),
),
NodeSpec(
id="verifier",
name="Verifier",
description="Loads file and verifies content",
node_type="event_loop",
input_keys=["filename"],
output_keys=["result"],
tools=["load_data"],
system_prompt=(
"Load the file using load_data with the provided 'filename'. "
f"If the loaded content is exactly '{EXPECTED}', "
"call set_output with key='result' and value='VERIFIED'. "
"If it does NOT match, call set_output with key='result' "
"and value='MISMATCH:' followed by what you actually loaded." + SET_OUTPUT
),
),
],
edges=[
EdgeSpec(
id="write-to-verify",
source="writer",
target="verifier",
condition=EdgeCondition.ON_SUCCESS,
input_mapping={"filename": "filename"},
),
],
memory_keys=["filename", "result"],
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=storage_path,
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, path=['writer','verifier'], "
"verifier output='VERIFIED', disk content "
"matches CROSS_CHECK_ALPHA_42"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.check(
"path matches",
result.path == ["writer", "verifier"],
actual=str(result.path),
expected_val="['writer', 'verifier']",
)
assert result.path == ["writer", "verifier"]
# LLM-side verification
verifier_output = result.output.get("result", "")
artifact.check(
"verifier output is VERIFIED",
verifier_output == "VERIFIED",
actual=repr(verifier_output),
expected_val="'VERIFIED'",
)
assert verifier_output == "VERIFIED", (
f"Verifier node reported: {verifier_output!r} (expected 'VERIFIED')"
)
# Our own independent verification (triple-check)
artifact_path = storage_path / "data" / "crosscheck.txt"
artifact.check(
"file exists on disk",
artifact_path.exists(),
actual=str(artifact_path.exists()),
expected_val="True",
)
assert artifact_path.exists(), f"File not found at {artifact_path}"
actual = artifact_path.read_text(encoding="utf-8").strip()
artifact.check(
"disk content matches expected",
actual == EXPECTED,
actual=repr(actual),
expected_val=repr(EXPECTED),
)
assert actual == EXPECTED, f"Disk content mismatch: expected {EXPECTED!r}, got {actual!r}"
# ---------------------------------------------------------------------------
# 7. Event bus cross-check: verify events match execution result
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_verified_events_match_result(
runtime, goal, llm_provider, tool_registry, tmp_path, artifact
):
"""Cross-check: events captured on bus must agree with ExecutionResult.
If result says path=["a","b"], the events must show NODE_LOOP_COMPLETED
for both "a" and "b". If result says tool X was called, TOOL_CALL_COMPLETED
must contain X. This catches desynchronization between the event bus and
the execution engine.
"""
from framework.runtime.event_bus import EventBus, EventType
bus = EventBus()
completed_nodes = []
tool_names = set()
async def _capture_node(event):
completed_nodes.append(event.node_id)
async def _capture_tool(event):
tool_names.add(event.data.get("tool_name", ""))
bus.subscribe(event_types=[EventType.NODE_LOOP_COMPLETED], handler=_capture_node)
bus.subscribe(event_types=[EventType.TOOL_CALL_COMPLETED], handler=_capture_tool)
graph = GraphSpec(
id="verified-events",
goal_id="dummy",
entry_node="worker",
entry_points={"start": "worker"},
terminal_nodes=["worker"],
nodes=[
NodeSpec(
id="worker",
name="Worker",
description="Uses tool then sets output",
node_type="event_loop",
output_keys=["result"],
tools=["get_current_time"],
system_prompt=(
"Call get_current_time with timezone='UTC'. "
"Then call set_output with key='result' and value='done'." + SET_OUTPUT
),
),
],
edges=[],
memory_keys=["result"],
conversation_mode="continuous",
)
executor = make_executor(
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
artifact.record(
result,
expected=(
"success=True, event bus nodes match "
"result.path, tool events include "
"get_current_time and set_output"
),
)
artifact.check(
"execution succeeds", result.success, actual=str(result.success), expected_val="True"
)
assert result.success
artifact.record_value("completed_nodes", completed_nodes)
artifact.record_value("tool_names", sorted(tool_names))
# Cross-check 1: path nodes match completed nodes
for node_id in result.path:
artifact.check(
f"node '{node_id}' in completed events",
node_id in completed_nodes,
actual=str(completed_nodes),
expected_val=f"contains '{node_id}'",
)
assert node_id in completed_nodes, (
f"Node '{node_id}' in result.path but no NODE_LOOP_COMPLETED event. "
f"Events saw: {completed_nodes}"
)
# Cross-check 2: get_current_time must appear in tool events
artifact.check(
"get_current_time in tool events",
"get_current_time" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'get_current_time'",
)
assert "get_current_time" in tool_names, (
f"get_current_time not in tool events. Captured: {tool_names}. "
f"Result claims success but event bus disagrees."
)
# Cross-check 3: set_output must appear in tool events
artifact.check(
"set_output in tool events",
"set_output" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'set_output'",
)
assert "set_output" in tool_names, (
f"set_output not in tool events. Captured: {tool_names}. "
f"Result has output but no set_output event."
)
@@ -1,8 +1,9 @@
"""Component tests: Worker Communication — event flow, completion, failure.
"""Component tests: Worker Communication — event flow, completion.
Exercises the full worker execution lifecycle with EventBus subscriptions
to verify that the exact events are published in the correct order, with
correct data, simulating the queen-worker communication contract.
Exercises the full worker execution lifecycle with EventBus
subscriptions to verify that the exact events are published in
the correct order, with correct data, simulating the queen-worker
communication contract.
"""
from __future__ import annotations
@@ -20,7 +21,8 @@ from .conftest import make_executor
SET_OUTPUT = (
"You MUST call the set_output tool. "
"Do not just write text — call set_output with the correct key and value."
"Do not just write text — call set_output with the correct "
"key and value."
)
@@ -34,7 +36,7 @@ class EventCapture:
return [e for e in self.events if e.type in event_types]
def tool_calls(self) -> list[dict]:
"""Extract tool call data from TOOL_CALL_COMPLETED events."""
"""Extract tool call data from TOOL_CALL_COMPLETED."""
return [e.data for e in self.of_type(EventType.TOOL_CALL_COMPLETED)]
def tool_names_called(self) -> set[str]:
@@ -51,14 +53,13 @@ class EventCapture:
def _make_event_bus_and_capture() -> tuple[EventBus, EventCapture]:
"""Create an EventBus with a capture handler subscribed to all events."""
"""Create an EventBus with a capture handler."""
bus = EventBus()
capture = EventCapture()
async def _capture_all(event: AgentEvent) -> None:
capture.events.append(event)
# Subscribe to the key event types we want to verify
bus.subscribe(
event_types=[
EventType.NODE_LOOP_STARTED,
@@ -79,14 +80,14 @@ def _make_event_bus_and_capture() -> tuple[EventBus, EventCapture]:
return bus, capture
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
# Tests: Worker Completion Events
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
@pytest.mark.asyncio
async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, tmp_path):
"""Worker execution must emit LOOP_STARTED iterations → LOOP_COMPLETED."""
async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, tmp_path, artifact):
"""Worker must emit STARTED -> iterations -> COMPLETED."""
bus, capture = _make_event_bus_and_capture()
graph = GraphSpec(
@@ -102,7 +103,7 @@ async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, t
description="Simple output",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='done'. " + SET_OUTPUT,
system_prompt=("Call set_output with key='result' and value='done'. " + SET_OUTPUT),
),
],
edges=[],
@@ -110,34 +111,78 @@ async def test_worker_emits_loop_lifecycle_events(runtime, goal, llm_provider, t
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=(
"success=True, lifecycle events in correct order: STARTED -> iterations -> COMPLETED"
),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
# Verify lifecycle event ordering
loop_started = capture.of_type(EventType.NODE_LOOP_STARTED)
loop_completed = capture.of_type(EventType.NODE_LOOP_COMPLETED)
loop_completed = capture.of_type(
EventType.NODE_LOOP_COMPLETED,
)
iterations = capture.of_type(EventType.NODE_LOOP_ITERATION)
artifact.check(
"NODE_LOOP_STARTED emitted",
len(loop_started) >= 1,
actual=str(len(loop_started)),
expected_val=">=1",
)
assert len(loop_started) >= 1, "Missing NODE_LOOP_STARTED"
artifact.check(
"NODE_LOOP_COMPLETED emitted",
len(loop_completed) >= 1,
actual=str(len(loop_completed)),
expected_val=">=1",
)
assert len(loop_completed) >= 1, "Missing NODE_LOOP_COMPLETED"
artifact.check(
"NODE_LOOP_ITERATION emitted",
len(iterations) >= 1,
actual=str(len(iterations)),
expected_val=">=1",
)
assert len(iterations) >= 1, "Missing NODE_LOOP_ITERATION"
# STARTED must come before COMPLETED
start_idx = capture.events.index(loop_started[0])
end_idx = capture.events.index(loop_completed[0])
artifact.check(
"STARTED precedes COMPLETED",
start_idx < end_idx,
actual=f"start={start_idx}, end={end_idx}",
expected_val="start < end",
)
assert start_idx < end_idx, "LOOP_STARTED must precede LOOP_COMPLETED"
@pytest.mark.asyncio
async def test_worker_emits_llm_turn_with_token_counts(
runtime, goal, llm_provider, tmp_path
runtime, goal, llm_provider, tmp_path, artifact
):
"""Each LLM turn must emit LLM_TURN_COMPLETE with token counts."""
bus, capture = _make_event_bus_and_capture()
@@ -155,7 +200,7 @@ async def test_worker_emits_llm_turn_with_token_counts(
description="Simple output",
node_type="event_loop",
output_keys=["result"],
system_prompt="Call set_output with key='result' and value='ok'. " + SET_OUTPUT,
system_prompt=("Call set_output with key='result' and value='ok'. " + SET_OUTPUT),
),
],
edges=[],
@@ -163,30 +208,82 @@ async def test_worker_emits_llm_turn_with_token_counts(
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
loop_config={"max_iterations": 3},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, LLM_TURN_COMPLETE events with positive token counts and model"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
llm_turns = capture.of_type(EventType.LLM_TURN_COMPLETE)
artifact.check(
"LLM_TURN_COMPLETE emitted",
len(llm_turns) >= 1,
actual=str(len(llm_turns)),
expected_val=">=1",
)
assert len(llm_turns) >= 1, "No LLM_TURN_COMPLETE events"
for turn in llm_turns:
assert turn.data.get("input_tokens", 0) > 0, "input_tokens should be > 0"
assert turn.data.get("output_tokens", 0) > 0, "output_tokens should be > 0"
for i, turn in enumerate(llm_turns):
in_tok = turn.data.get("input_tokens", 0)
out_tok = turn.data.get("output_tokens", 0)
model = turn.data.get("model", "")
artifact.check(
f"turn[{i}] input_tokens > 0",
in_tok > 0,
actual=str(in_tok),
expected_val=">0",
)
assert in_tok > 0, "input_tokens should be > 0"
artifact.check(
f"turn[{i}] output_tokens > 0",
out_tok > 0,
actual=str(out_tok),
expected_val=">0",
)
assert out_tok > 0, "output_tokens should be > 0"
artifact.check(
f"turn[{i}] model populated",
bool(model),
actual=repr(model),
expected_val="non-empty string",
)
assert turn.data.get("model"), "model should be populated"
@pytest.mark.asyncio
async def test_worker_tool_calls_emit_events(
runtime, goal, llm_provider, tool_registry, tmp_path
runtime,
goal,
llm_provider,
tool_registry,
tmp_path,
artifact,
):
"""Tool calls must emit TOOL_CALL_STARTED and TOOL_CALL_COMPLETED events."""
"""Tool calls must emit STARTED and COMPLETED events."""
bus, capture = _make_event_bus_and_capture()
graph = GraphSpec(
@@ -205,8 +302,8 @@ async def test_worker_tool_calls_emit_events(
tools=["get_current_time"],
system_prompt=(
"Call get_current_time with timezone='UTC'. "
"Then call set_output with key='result' and the day_of_week. "
+ SET_OUTPUT
"Then call set_output with key='result' and "
"the day_of_week. " + SET_OUTPUT
),
),
],
@@ -215,38 +312,97 @@ async def test_worker_tool_calls_emit_events(
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=(
"success=True, output['result'] set, tool events for get_current_time and set_output"
),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_output = result.output.get("result")
artifact.check(
"output['result'] is set",
actual_output is not None,
actual=repr(actual_output),
expected_val="non-None value",
)
assert result.output.get("result") is not None
# Verify tool events
tool_started = capture.of_type(EventType.TOOL_CALL_STARTED)
tool_completed = capture.of_type(EventType.TOOL_CALL_COMPLETED)
tool_completed = capture.of_type(
EventType.TOOL_CALL_COMPLETED,
)
artifact.check(
"TOOL_CALL_STARTED emitted",
len(tool_started) >= 1,
actual=str(len(tool_started)),
expected_val=">=1",
)
assert len(tool_started) >= 1, "No TOOL_CALL_STARTED events"
assert len(tool_completed) >= 1, "No TOOL_CALL_COMPLETED events"
# get_current_time must be among the tools called
assert "get_current_time" in capture.tool_names_called()
artifact.check(
"TOOL_CALL_COMPLETED emitted",
len(tool_completed) >= 1,
actual=str(len(tool_completed)),
expected_val=">=1",
)
assert len(tool_completed) >= 1, "No TOOL_CALL_COMPLETED"
# set_output must also appear (synthetic tool)
assert "set_output" in capture.tool_names_called()
tool_names = capture.tool_names_called()
artifact.check(
"get_current_time called",
"get_current_time" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'get_current_time'",
)
assert "get_current_time" in tool_names
artifact.check(
"set_output called",
"set_output" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'set_output'",
)
assert "set_output" in tool_names
# Tool calls should not have errors
for tc in capture.tool_calls():
if tc.get("tool_name") in ("get_current_time", "set_output"):
assert not tc.get("is_error"), f"Tool {tc.get('tool_name')} errored"
tn = tc.get("tool_name")
if tn in ("get_current_time", "set_output"):
is_err = tc.get("is_error")
artifact.check(
f"tool {tn} no error",
not is_err,
actual=str(is_err),
expected_val="False",
)
assert not is_err, f"Tool {tn} errored"
@pytest.mark.asyncio
async def test_worker_output_key_set_event(runtime, goal, llm_provider, tmp_path):
async def test_worker_output_key_set_event(runtime, goal, llm_provider, tmp_path, artifact):
"""set_output must emit OUTPUT_KEY_SET event with the key name."""
bus, capture = _make_event_bus_and_capture()
@@ -275,34 +431,84 @@ async def test_worker_output_key_set_event(runtime, goal, llm_provider, tmp_path
conversation_mode="continuous",
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=("success=True, output['name'] and output['status'] set, OUTPUT_KEY_SET for both"),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
actual_name = result.output.get("name")
artifact.check(
"output['name'] is set",
actual_name is not None,
actual=repr(actual_name),
expected_val="non-None value",
)
assert result.output.get("name") is not None
actual_status = result.output.get("status")
artifact.check(
"output['status'] is set",
actual_status is not None,
actual=repr(actual_status),
expected_val="non-None value",
)
assert result.output.get("status") is not None
# Verify OUTPUT_KEY_SET events for both keys
keys_set = capture.output_keys_set()
artifact.check(
"OUTPUT_KEY_SET for 'name'",
"name" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'name'",
)
assert "name" in keys_set, f"Missing OUTPUT_KEY_SET for 'name', got: {keys_set}"
artifact.check(
"OUTPUT_KEY_SET for 'status'",
"status" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'status'",
)
assert "status" in keys_set, f"Missing OUTPUT_KEY_SET for 'status', got: {keys_set}"
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
# Tests: Multi-Node Worker Communication
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
@pytest.mark.asyncio
async def test_worker_pipeline_data_integrity(
runtime, goal, llm_provider, tool_registry, tmp_path
runtime,
goal,
llm_provider,
tool_registry,
tmp_path,
artifact,
):
"""Data produced by node 1 must arrive at node 2 via input_mapping, verified end-to-end."""
"""Data from node 1 must arrive at node 2, verified end-to-end."""
bus, capture = _make_event_bus_and_capture()
graph = GraphSpec(
@@ -316,28 +522,30 @@ async def test_worker_pipeline_data_integrity(
NodeSpec(
id="producer",
name="Producer",
description="Produces a timestamped value using a real tool",
description="Produces a timestamped value",
node_type="event_loop",
output_keys=["payload"],
tools=["get_current_time"],
system_prompt=(
"Call get_current_time with timezone='UTC'. "
"Extract the 'date' field from the result. "
"Call set_output with key='payload' and the date string as value. "
+ SET_OUTPUT
"Call set_output with key='payload' and the "
"date string as value. " + SET_OUTPUT
),
),
NodeSpec(
id="consumer",
name="Consumer",
description="Verifies received data contains a date",
description="Verifies received data",
node_type="event_loop",
input_keys=["data"],
output_keys=["result"],
system_prompt=(
"Read the 'data' input. It should contain a date string. "
"Call set_output with key='result' and value='VERIFIED|' followed by "
"the first 10 characters of the data input. " + SET_OUTPUT
"Read the 'data' input. It should contain a "
"date string. Call set_output with "
"key='result' and value='VERIFIED|' followed "
"by the first 10 characters of the data "
"input. " + SET_OUTPUT
),
),
],
@@ -353,44 +561,127 @@ async def test_worker_pipeline_data_integrity(
memory_keys=["payload", "data", "result"],
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
tool_registry=tool_registry,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=(
"success=True, clean, "
"path=['producer','consumer'], steps=2, "
"output starts with VERIFIED|"
),
)
# Strict outcome verification
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"clean success",
result.is_clean_success,
actual=str(result.execution_quality),
expected_val="clean",
)
assert result.is_clean_success, f"quality={result.execution_quality}"
artifact.check(
"path matches",
result.path == ["producer", "consumer"],
actual=str(result.path),
expected_val="['producer', 'consumer']",
)
assert result.path == ["producer", "consumer"]
artifact.check(
"steps_executed is 2",
result.steps_executed == 2,
actual=str(result.steps_executed),
expected_val="2",
)
assert result.steps_executed == 2
# Output must be present and correctly structured
output = result.output.get("result")
artifact.check(
"consumer set 'result'",
output is not None,
actual=repr(output),
expected_val="non-None value",
)
assert output is not None, "Consumer did not set 'result'"
artifact.check(
"output starts with VERIFIED|",
output.startswith("VERIFIED|"),
actual=repr(output),
expected_val="starts with 'VERIFIED|'",
)
assert output.startswith("VERIFIED|"), f"Expected VERIFIED|..., got: {output}"
# Token counts should be reasonable (not zero, not astronomical)
artifact.check(
"total_tokens > 0",
result.total_tokens > 0,
actual=str(result.total_tokens),
expected_val=">0",
)
assert result.total_tokens > 0
artifact.check(
"total_tokens < 100000",
result.total_tokens < 100_000,
actual=str(result.total_tokens),
expected_val="<100000",
)
assert result.total_tokens < 100_000, f"Unexpectedly high tokens: {result.total_tokens}"
# Both nodes should have set their output keys
keys_set = capture.output_keys_set()
artifact.check(
"producer set 'payload'",
"payload" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'payload'",
)
assert "payload" in keys_set, "Producer didn't set 'payload'"
artifact.check(
"consumer set 'result' key",
"result" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'result'",
)
assert "result" in keys_set, "Consumer didn't set 'result'"
# get_current_time must have been called (in producer)
assert "get_current_time" in capture.tool_names_called()
tool_names = capture.tool_names_called()
artifact.check(
"get_current_time called",
"get_current_time" in tool_names,
actual=str(sorted(tool_names)),
expected_val="contains 'get_current_time'",
)
assert "get_current_time" in tool_names
@pytest.mark.asyncio
async def test_worker_multi_node_output_propagation(
runtime, goal, llm_provider, tmp_path
runtime, goal, llm_provider, tmp_path, artifact
):
"""Data from node A's output must arrive at node B and be reflected in final output."""
"""Data from node A must arrive at node B in final output."""
bus, capture = _make_event_bus_and_capture()
graph = GraphSpec(
@@ -408,22 +699,25 @@ async def test_worker_multi_node_output_propagation(
node_type="event_loop",
output_keys=["code"],
system_prompt=(
"Call set_output with key='code' and value='ALPHA_BRAVO_42'. "
"Call set_output with key='code' and "
"value='ALPHA_BRAVO_42'. "
"Do not write any text." + SET_OUTPUT
),
),
NodeSpec(
id="formatter",
name="Formatter",
description="Wraps received code in brackets",
description="Wraps code in brackets",
node_type="event_loop",
input_keys=["raw_code"],
output_keys=["result"],
system_prompt=(
"Read the 'raw_code' input value. "
"Call set_output with key='result' and value='[' followed by "
"the raw_code value followed by ']'. "
"Example: if raw_code is 'XYZ', output should be '[XYZ]'. " + SET_OUTPUT
"Call set_output with key='result' and "
"value='[' followed by the raw_code value "
"followed by ']'. "
"Example: if raw_code is 'XYZ', output "
"should be '[XYZ]'. " + SET_OUTPUT
),
),
],
@@ -439,44 +733,110 @@ async def test_worker_multi_node_output_propagation(
memory_keys=["code", "raw_code", "result"],
)
executor = make_executor(
runtime, llm_provider,
runtime,
llm_provider,
loop_config={"max_iterations": 5},
storage_path=tmp_path / "session",
event_bus=bus,
stream_id="worker",
)
result = await executor.execute(graph, goal, {}, validate_graph=False)
result = await executor.execute(
graph,
goal,
{},
validate_graph=False,
)
artifact.record(
result,
expected=(
"success=True, "
"path=['generator','formatter'], steps=2, "
"output contains [ALPHA_BRAVO_42]"
),
)
artifact.check(
"execution succeeds",
result.success,
actual=str(result.success),
expected_val="True",
)
assert result.success
artifact.check(
"path matches",
result.path == ["generator", "formatter"],
actual=str(result.path),
expected_val="['generator', 'formatter']",
)
assert result.path == ["generator", "formatter"]
artifact.check(
"steps_executed is 2",
result.steps_executed == 2,
actual=str(result.steps_executed),
expected_val="2",
)
assert result.steps_executed == 2
# Verify output structure
output = result.output.get("result")
artifact.check(
"formatter set 'result'",
output is not None,
actual=repr(output),
expected_val="non-None value",
)
assert output is not None, "Formatter did not set 'result'"
assert "[" in output and "]" in output, f"Expected bracket wrapping, got: {output}"
has_brackets = "[" in output and "]" in output
artifact.check(
"output has bracket wrapping",
has_brackets,
actual=repr(output),
expected_val="contains '[' and ']'",
)
assert has_brackets, f"Expected bracket wrapping, got: {output}"
artifact.check(
"output contains ALPHA_BRAVO_42",
"ALPHA_BRAVO_42" in output,
actual=repr(output),
expected_val="contains 'ALPHA_BRAVO_42'",
)
assert "ALPHA_BRAVO_42" in output, f"Code word missing from output: {output}"
# Both nodes should have set their output keys
keys_set = capture.output_keys_set()
artifact.check(
"'code' in keys_set",
"code" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'code'",
)
assert "code" in keys_set
artifact.check(
"'result' in keys_set",
"result" in keys_set,
actual=str(sorted(keys_set)),
expected_val="contains 'result'",
)
assert "result" in keys_set
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
# Tests: Escalation Event Flow
# ---------------------------------------------------------------------------
# -------------------------------------------------------------------
@pytest.mark.asyncio
async def test_worker_escalation_emits_event_with_reason(
runtime, goal, llm_provider, tmp_path
runtime, goal, llm_provider, tmp_path, artifact
):
"""Worker calling escalate must emit ESCALATION_REQUESTED with the reason.
"""Worker calling escalate must emit ESCALATION_REQUESTED.
After calling escalate, the worker blocks waiting for queen input.
Since there's no queen in this test, we run with a short timeout and
verify the escalation event was emitted before the timeout.
After calling escalate, the worker blocks waiting for queen
input. Since there's no queen in this test, we run with a
short timeout and verify the escalation event was emitted.
"""
bus, capture = _make_event_bus_and_capture()
@@ -495,8 +855,10 @@ async def test_worker_escalation_emits_event_with_reason(
output_keys=["result"],
system_prompt=(
"You are blocked and need human help. "
"Call the escalate tool with reason='missing credentials for API'. "
"Do not call set_output. Do not write any text first."
"Call the escalate tool with "
"reason='missing credentials for API'. "
"Do not call set_output. "
"Do not write any text first."
),
),
],
@@ -515,24 +877,59 @@ async def test_worker_escalation_emits_event_with_reason(
stream_id="worker",
)
# Worker will block after escalate (waiting for queen).
# Use a short timeout — we only need the escalation event to fire.
try:
await asyncio.wait_for(
executor.execute(graph, goal, {}, validate_graph=False),
executor.execute(
graph,
goal,
{},
validate_graph=False,
),
timeout=30,
)
except (TimeoutError, asyncio.TimeoutError):
pass # Expected: worker hangs waiting for queen input
pass # Expected: worker hangs waiting for queen
# Verify escalation event was emitted before the timeout
escalations = capture.of_type(EventType.ESCALATION_REQUESTED)
assert len(escalations) >= 1, (
f"No ESCALATION_REQUESTED event emitted. "
f"Events captured: {[e.type.value for e in capture.events]}"
all_types = [e.type.value for e in capture.events]
artifact.record_value(
"escalation_count",
len(escalations),
expected=(">=1 ESCALATION_REQUESTED with non-empty reason, stream_id='worker'"),
)
artifact.record_value("all_event_types", all_types)
artifact.check(
"escalation event emitted",
len(escalations) >= 1,
actual=str(len(escalations)),
expected_val=">=1",
)
assert len(escalations) >= 1, f"No ESCALATION_REQUESTED event emitted. Events: {all_types}"
esc_data = escalations[0].data
assert esc_data.get("reason"), "Escalation reason should not be empty"
reason = esc_data.get("reason", "")
artifact.check(
"reason is non-empty",
bool(reason),
actual=repr(reason),
expected_val="non-empty string",
)
assert esc_data.get("reason"), "Escalation reason empty"
artifact.check(
"stream_id is 'worker'",
escalations[0].stream_id == "worker",
actual=repr(escalations[0].stream_id),
expected_val="'worker'",
)
assert escalations[0].stream_id == "worker"
artifact.check(
"node_id is 'worker'",
escalations[0].node_id == "worker",
actual=repr(escalations[0].node_id),
expected_val="'worker'",
)
assert escalations[0].node_id == "worker"