Merge remote-tracking branch 'origin/main' into fix/first-success
@@ -0,0 +1,34 @@
{
  "permissions": {
    "allow": [
      "mcp__agent-builder__create_session",
      "mcp__agent-builder__set_goal",
      "mcp__agent-builder__add_node",
      "mcp__agent-builder__add_edge",
      "mcp__agent-builder__configure_loop",
      "mcp__agent-builder__add_mcp_server",
      "mcp__agent-builder__validate_graph",
      "mcp__agent-builder__export_graph",
      "mcp__agent-builder__load_session_by_id",
      "Bash(git status:*)",
      "Bash(gh run view:*)",
      "Bash(uv run:*)",
      "Bash(env:*)",
      "mcp__agent-builder__test_node",
      "mcp__agent-builder__list_mcp_tools",
      "Bash(python -m py_compile:*)",
      "Bash(python -m pytest:*)",
      "Bash(source:*)",
      "mcp__agent-builder__update_node",
      "mcp__agent-builder__check_missing_credentials",
      "mcp__agent-builder__list_stored_credentials",
      "Bash(find:*)",
      "mcp__agent-builder__run_tests",
      "Bash(PYTHONPATH=core:exports:tools/src uv run pytest:*)",
      "mcp__agent-builder__list_agent_sessions",
      "mcp__agent-builder__generate_constraint_tests",
      "mcp__agent-builder__generate_success_tests"
    ]
  },
  "enabledMcpjsonServers": ["agent-builder", "tools"]
}
@@ -562,15 +562,33 @@ PYTHONPATH=core:exports python -m {agent_name} --tui

### Find Available Checkpoints

Use MCP tools to programmatically find and inspect checkpoints:

```python
# List all sessions to find the failed one
list_agent_sessions(agent_work_dir="~/.hive/agents/{agent_name}", status="failed")

# Inspect session state
get_agent_session_state(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}")

# Find clean checkpoints to resume from
list_agent_checkpoints(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}", is_clean="true")

# Compare checkpoints to understand what changed
compare_agent_checkpoints(
    agent_work_dir="~/.hive/agents/{agent_name}",
    session_id="{session_id}",
    checkpoint_id_before="cp_node_complete_intake_143030",
    checkpoint_id_after="cp_node_complete_research_143115"
)

# Inspect memory at a specific checkpoint
get_agent_checkpoint(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}", checkpoint_id="cp_node_complete_intake_143030")
```

Or in TUI:
```bash
/sessions {session_id}
```

This lists all checkpoints with timestamps:

```
Available Checkpoints: (3)
1. cp_node_complete_intake_143030
2. cp_node_complete_research_143115
3. cp_pause_research_143130
```

**Verification:**
@@ -717,6 +735,80 @@ Let me know when you've run it and I'll help check the logs!"
)
```

### Session & Checkpoint Tools

**list_agent_sessions** - Browse sessions with filtering
- **When to use:** Finding resumable sessions, identifying failed sessions, Stage 3 triage
- **Returns:** Session list with status, timestamps, is_resumable, current_node, quality
- **Example:**
```
list_agent_sessions(
    agent_work_dir="/home/user/.hive/agents/twitter_outreach",
    status="failed",
    limit=10
)
```

**get_agent_session_state** - Load full session state (excludes memory values)
- **When to use:** Inspecting session progress, checking is_resumable, examining path
- **Returns:** Full state with memory_keys/memory_size instead of memory values
- **Example:**
```
get_agent_session_state(
    agent_work_dir="/home/user/.hive/agents/twitter_outreach",
    session_id="session_20260208_143022_abc12345"
)
```

**get_agent_session_memory** - Get memory contents from a session
- **When to use:** Stage 5 root cause analysis, inspecting produced data
- **Returns:** All memory keys+values, or a single key's value
- **Example:**
```
get_agent_session_memory(
    agent_work_dir="/home/user/.hive/agents/twitter_outreach",
    session_id="session_20260208_143022_abc12345",
    key="twitter_handles"
)
```

**list_agent_checkpoints** - List checkpoints for a session
- **When to use:** Stage 6 recovery, finding clean checkpoints to resume from
- **Returns:** Checkpoint summaries with type, node, clean status
- **Example:**
```
list_agent_checkpoints(
    agent_work_dir="/home/user/.hive/agents/twitter_outreach",
    session_id="session_20260208_143022_abc12345",
    is_clean="true"
)
```

**get_agent_checkpoint** - Load a specific checkpoint with full state
- **When to use:** Inspecting exact state at a checkpoint, comparing to current state
- **Returns:** Full checkpoint: memory snapshot, execution path, metrics
- **Example:**
```
get_agent_checkpoint(
    agent_work_dir="/home/user/.hive/agents/twitter_outreach",
    session_id="session_20260208_143022_abc12345",
    checkpoint_id="cp_node_complete_intake_143030"
)
```

**compare_agent_checkpoints** - Diff memory between two checkpoints
- **When to use:** Understanding data flow, finding where state diverged
- **Returns:** Memory diff (added/removed/changed keys) + execution path diff
- **Example:**
```
compare_agent_checkpoints(
    agent_work_dir="/home/user/.hive/agents/twitter_outreach",
    session_id="session_20260208_143022_abc12345",
    checkpoint_id_before="cp_node_complete_intake_143030",
    checkpoint_id_after="cp_node_complete_research_143115"
)
```

### Query Patterns

**Pattern 1: Top-Down Investigation** (Most common)
@@ -739,6 +831,16 @@ Loop every 10 seconds:
2. If found: Alert and drill into L2
```

**Pattern 4: Session State + Checkpoint Recovery**
```
1. list_agent_sessions: Find failed/paused sessions
2. get_agent_session_state: Check is_resumable, see execution path
3. get_agent_session_memory: Inspect what data was produced
4. list_agent_checkpoints: Find clean checkpoints before failure
5. compare_agent_checkpoints: Understand what changed between checkpoints
6. Recommend resume command with specific checkpoint
```
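A minimal sketch of Pattern 4 as concrete tool calls (the work dir, session ID, and checkpoint IDs are illustrative placeholders):

```python
# 1-2. Find the failed session and confirm it is resumable
list_agent_sessions(agent_work_dir="~/.hive/agents/my_agent", status="failed", limit=5)
get_agent_session_state(agent_work_dir="~/.hive/agents/my_agent", session_id="session_20260208_143022_abc12345")

# 3-4. Inspect produced data and find a clean checkpoint before the failure
get_agent_session_memory(agent_work_dir="~/.hive/agents/my_agent", session_id="session_20260208_143022_abc12345")
list_agent_checkpoints(agent_work_dir="~/.hive/agents/my_agent", session_id="session_20260208_143022_abc12345", is_clean="true")

# 5. Diff two checkpoints to see where state diverged, then recommend
#    a resume command anchored at the last clean checkpoint
compare_agent_checkpoints(
    agent_work_dir="~/.hive/agents/my_agent",
    session_id="session_20260208_143022_abc12345",
    checkpoint_id_before="cp_node_complete_intake_143030",
    checkpoint_id_after="cp_node_complete_research_143115"
)
```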

---

## Complete Example Walkthrough

@@ -1,351 +1,333 @@
# Example: Iterative Testing of a Research Agent

This example walks through the full iterative test loop for a research agent that searches the web, reviews findings, and produces a cited report.

## Agent Structure

```
exports/deep_research_agent/
├── agent.py             # Goal + graph: intake → research → review → report
├── nodes/__init__.py    # Node definitions (system_prompt, input/output keys)
├── config.py            # Model config
├── mcp_servers.json     # Tools: web_search, web_scrape
└── tests/               # Test files (we'll create these)
```

**Goal:** "Rigorous Interactive Research" — find 5+ diverse sources, cite every claim, produce a complete report.
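
For orientation, here is a sketch of how the research node might have been registered during the build stage. The parameter names mirror the agent-builder `add_node` tool, and the values are illustrative assumptions, not the agent's actual definition:

```python
# Hypothetical add_node call (illustrative values; the real node lives in
# nodes/__init__.py of the exported agent)
add_node(
    node_id="research",
    name="Research",
    description="Search the web and gather diverse sources",
    node_type="function",
    input_keys=["query"],
    output_keys=["research_results"],
    system_prompt="Search for information on the user's topic using web search."
)
```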

---

## Phase 1: Generate Tests

### Read the goal

```python
Read(file_path="exports/deep_research_agent/agent.py")
# Extract: goal_id="rigorous-interactive-research"
# success_criteria: source-diversity (>=5), citation-coverage (100%), report-completeness (90%)
# constraints: no-hallucination, source-attribution
```

### Get test guidelines

```python
result = generate_success_tests(
    goal_id="rigorous-interactive-research",
    goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "description": "Use multiple diverse sources", "target": ">=5"}, {"id": "citation-coverage", "description": "Every claim cites its source", "target": "100%"}, {"id": "report-completeness", "description": "Report answers the research questions", "target": "90%"}]}',
    node_names="intake,research,review,report",
    tool_names="web_search,web_scrape",
    agent_path="exports/deep_research_agent"
)
```

The result contains guidelines (not generated tests): `output_file` (where to write tests), `file_header` (imports and fixtures to use), `test_template` (the format for test functions), and `test_guidelines` (rules for writing tests).

### Write tests

```python
Write(
    file_path="exports/deep_research_agent/tests/test_success_criteria.py",
    content=result["file_header"] + '''

@pytest.mark.asyncio
async def test_success_source_diversity(runner, auto_responder, mock_mode):
    """At least 5 diverse sources are found."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "impact of remote work on productivity"})
    finally:
        await auto_responder.stop()
    assert result.success, f"Agent failed: {result.error}"
    output = result.output or {}
    sources = output.get("sources", [])
    if isinstance(sources, list):
        assert len(sources) >= 5, f"Expected >= 5 sources, got {len(sources)}"


@pytest.mark.asyncio
async def test_success_citation_coverage(runner, auto_responder, mock_mode):
    """Every factual claim in the report cites its source."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "climate change effects on agriculture"})
    finally:
        await auto_responder.stop()
    assert result.success, f"Agent failed: {result.error}"
    output = result.output or {}
    report = output.get("report", "")
    # Check that report contains numbered references
    assert "[1]" in str(report) or "[source" in str(report).lower(), "Report lacks citations"


@pytest.mark.asyncio
async def test_success_report_completeness(runner, auto_responder, mock_mode):
    """Report addresses the original research question."""
    query = "pros and cons of nuclear energy"
    await auto_responder.start()
    try:
        result = await runner.run({"query": query})
    finally:
        await auto_responder.stop()
    assert result.success, f"Agent failed: {result.error}"
    output = result.output or {}
    report = output.get("report", "")
    assert len(str(report)) > 200, f"Report too short: {len(str(report))} chars"


@pytest.mark.asyncio
async def test_empty_query_handling(runner, auto_responder, mock_mode):
    """Agent handles empty input gracefully."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": ""})
    finally:
        await auto_responder.stop()
    output = result.output or {}
    assert not result.success or output.get("error"), "Should handle empty query"


@pytest.mark.asyncio
async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
    """Feedback loop between review and research terminates."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "quantum computing basics"})
    finally:
        await auto_responder.stop()
    visits = result.node_visit_counts or {}
    for node_id, count in visits.items():
        assert count <= 5, f"Node {node_id} visited {count} times"
'''
)
```

---

## Phase 2: First Execution

```python
run_tests(
    goal_id="rigorous-interactive-research",
    agent_path="exports/deep_research_agent",
    fail_fast=True
)
```

**Result:**
```json
{
  "overall_passed": false,
  "summary": {"total": 5, "passed": 3, "failed": 2, "pass_rate": "60.0%"},
  "failures": [
    {"test_name": "test_success_source_diversity", "details": "AssertionError: Expected >= 5 sources, got 2"},
    {"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
  ]
}
```

---

## Phase 3: Analyze (Iteration 1)

### Debug the first failure

```python
debug_test(
    goal_id="rigorous-interactive-research",
    test_name="test_success_source_diversity",
    agent_path="exports/deep_research_agent"
)
# Category: ASSERTION_FAILURE — Expected >= 5 sources, got 2
```

### Find the session and inspect memory

```python
list_agent_sessions(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    status="completed",
    limit=1
)
# → session_20260209_150000_abc12345

get_agent_session_memory(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    session_id="session_20260209_150000_abc12345",
    key="research_results"
)
# → Only 2 sources found. LLM stopped searching after 2 queries.
```

### Check LLM behavior in the research node

```python
query_runtime_log_raw(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    run_id="session_20260209_150000_abc12345",
    node_id="research"
)
# → LLM called web_search twice, got results, immediately called set_output.
# → Prompt doesn't instruct it to find at least 5 sources.
```

**Root cause:** The research node's system_prompt doesn't specify minimum source requirements.

---

## Phase 4: Fix (Iteration 1)

```python
Read(file_path="exports/deep_research_agent/nodes/__init__.py")

# Fix the research node prompt
Edit(
    file_path="exports/deep_research_agent/nodes/__init__.py",
    old_string='system_prompt="Search for information on the user\'s topic using web search."',
    new_string='system_prompt="Search for information on the user\'s topic using web search. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries with varied keywords. Do NOT call set_output until you have gathered at least 5 distinct sources from different domains."'
)
```

---

## Phase 5: Recover & Resume (Iteration 1)

The fix is to the `research` node. Since this was a `run_tests` execution (no checkpoints), we re-run from scratch:

```python
run_tests(
    goal_id="rigorous-interactive-research",
    agent_path="exports/deep_research_agent",
    fail_fast=True
)
```

**Result:**
```json
{
  "overall_passed": false,
  "summary": {"total": 5, "passed": 4, "failed": 1, "pass_rate": "80.0%"},
  "failures": [
    {"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
  ]
}
```

Source diversity now passes. Citation coverage still fails.

---

## Phase 3: Analyze (Iteration 2)

```python
debug_test(
    goal_id="rigorous-interactive-research",
    test_name="test_success_citation_coverage",
    agent_path="exports/deep_research_agent"
)
# Category: ASSERTION_FAILURE — Report lacks citations

# Check what the report node produced
list_agent_sessions(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    status="completed",
    limit=1
)
# → session_20260209_151500_def67890

get_agent_session_memory(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    session_id="session_20260209_151500_def67890",
    key="report"
)
# → Report text exists but uses no numbered references.
# → Sources are in memory but report node doesn't cite them.
```

**Root cause:** The report node's prompt doesn't instruct the LLM to include numbered citations.

---

## Phase 4: Fix (Iteration 2)

```python
Edit(
    file_path="exports/deep_research_agent/nodes/__init__.py",
    old_string='system_prompt="Write a comprehensive report based on the research findings."',
    new_string='system_prompt="Write a comprehensive report based on the research findings. You MUST include numbered citations [1], [2], etc. for every factual claim. At the end, include a References section listing all sources with their URLs. Every claim must be traceable to a specific source."'
)
```

---

## Phase 5: Resume (Iteration 2)

The fix is to the `report` node (the last node). To demonstrate checkpoint recovery, run via CLI:

```bash
# Run via CLI to get checkpoints
uv run hive run exports/deep_research_agent --input '{"topic": "climate change effects"}'

# After it runs, find the clean checkpoint before report
list_agent_checkpoints(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    session_id="session_20260209_152000_ghi34567",
    is_clean="true"
)
# → cp_node_complete_review_152100 (after review, before report)

# Resume — skips intake, research, review entirely
uv run hive run exports/deep_research_agent \
    --resume-session session_20260209_152000_ghi34567 \
    --checkpoint cp_node_complete_review_152100
```

Only the `report` node re-runs with the fixed prompt, using research data from the checkpoint.
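
To confirm the resume actually reused prior work, you can inspect the checkpoint it started from (session and checkpoint IDs as above):

```python
get_agent_checkpoint(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    session_id="session_20260209_152000_ghi34567",
    checkpoint_id="cp_node_complete_review_152100"
)
# → The memory snapshot should already hold research_results gathered
#   before the fix; only the report node executes after this checkpoint.
```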

---

## Phase 6: Final Verification

```python
run_tests(
    goal_id="rigorous-interactive-research",
    agent_path="exports/deep_research_agent"
)
```

**Result:**
```json
{
  "overall_passed": true,
  "summary": {"total": 5, "passed": 5, "failed": 0, "pass_rate": "100.0%"}
}
```

All tests pass.

---

## Summary

| Iteration | Failure | Root Cause | Fix | Recovery |
|-----------|---------|------------|-----|----------|
| 1 | Source diversity (2 < 5) | Research prompt too vague | Added "at least 5 sources" to prompt | Re-run (no checkpoints) |
| 2 | No citations in report | Report prompt lacks citation instructions | Added citation requirements | Checkpoint resume (skipped 3 nodes) |

The agent is now validated and ready for production use.

**Key takeaways:**
- Phase 3 analysis (session memory + L3 logs) identified root causes without guessing
- Checkpoint recovery in iteration 2 saved time by skipping 3 expensive nodes
- Final `run_tests` confirms all scenarios pass end-to-end

@@ -0,0 +1,20 @@
---
name: hive
description: Hive Agent Builder & Manager
mode: primary
tools:
  agent-builder: true
  tools: true
---

# Hive Agent
You are the Hive Agent Builder. Your goal is to help the user construct, configure, and deploy AI agents using the Hive framework.

## Capabilities
1. **Scaffold Agents:** Create new agent directories/configs.
2. **Manage Tools:** Add/remove tools via MCP.
3. **Debug:** Analyze agent workflows.

## Context
- You are an expert in the Hive framework architecture.
- Always use the `agent-builder` MCP server for filesystem operations.
@@ -0,0 +1,30 @@
{
  "mcpServers": {
    "agent-builder": {
      "command": "uv",
      "args": [
        "run",
        "python",
        "-m",
        "framework.mcp.agent_builder_server"
      ],
      "cwd": "core",
      "env": {
        "PYTHONPATH": "../tools/src"
      }
    },
    "tools": {
      "command": "uv",
      "args": [
        "run",
        "python",
        "mcp_server.py",
        "--stdio"
      ],
      "cwd": "tools",
      "env": {
        "PYTHONPATH": "src"
      }
    }
  }
}
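
For reference, these are the manual equivalents of the two server launches above (derived directly from this config; handy when debugging a server outside the MCP client):

```bash
# agent-builder server: run from core/ with tools/src on PYTHONPATH
cd core && PYTHONPATH=../tools/src uv run python -m framework.mcp.agent_builder_server

# tools server: run from tools/ over stdio
cd tools && PYTHONPATH=src uv run python mcp_server.py --stdio
```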

Symlink
+1
@@ -0,0 +1 @@
../../.claude/skills/hive

Symlink
+1
@@ -0,0 +1 @@
../../.claude/skills/hive-concepts

Symlink
+1
@@ -0,0 +1 @@
../../.claude/skills/hive-create

Symlink
+1
@@ -0,0 +1 @@
../../.claude/skills/hive-credentials

Symlink
+1
@@ -0,0 +1 @@
../../.claude/skills/hive-debugger

Symlink
+1
@@ -0,0 +1 @@
../../.claude/skills/hive-patterns

Symlink
+1
@@ -0,0 +1 @@
../../.claude/skills/hive-test

Symlink
+1
@@ -0,0 +1 @@
../../.claude/skills/triage-issue

+5
-2
@@ -49,8 +49,8 @@ You may submit PRs without prior assignment for:
make check   # Lint and format checks (ruff check + ruff format --check on core/ and tools/)
make test    # Core tests (cd core && pytest tests/ -v)
```
8. Commit your changes following our commit conventions
9. Push to your fork and submit a Pull Request

## Development Setup

@@ -145,6 +145,9 @@ make test
# Or run tests directly
cd core && pytest tests/ -v

# Run tools package tests (when contributing to tools/)
cd tools && uv run pytest tests/ -v

# Run tests for a specific agent
PYTHONPATH=exports uv run python -m agent_name test
```

@@ -120,6 +120,16 @@ hive tui
# Or run directly
hive run exports/your_agent_name --input '{"key": "value"}'
```

## Coding Agent Support

### Opencode

Hive includes native support for [Opencode](https://github.com/opencode-ai/opencode).

1. **Setup:** Run the quickstart script.
2. **Launch:** Open Opencode in the project root.
3. **Activate:** Type `/hive` in the chat to switch to the Hive Agent.
4. **Verify:** Ask the agent *"List your tools"* to confirm the connection.

The agent has access to all Hive skills and can scaffold agents, add tools, and debug workflows directly from the chat.

**[📖 Complete Setup Guide](docs/environment-setup.md)** - Detailed instructions for agent development

@@ -274,6 +274,7 @@ class EventLoopNode(NodeProtocol):

        # 5. Stall detection state
        recent_responses: list[str] = []
        user_interaction_count = 0  # tracks how many times this node blocked for user input

        # 6. Main loop
        for iteration in range(start_iteration, self._config.max_iterations):
@@ -485,13 +486,11 @@ class EventLoopNode(NodeProtocol):

            # 6h. Client-facing input blocking
            #
            # Block ONLY when the LLM explicitly calls ask_user().
            # Text-only turns and set_output-only turns flow through
            # without blocking, allowing progress updates and summaries
            # to stream freely. After user input arrives, fall through
            # to judge evaluation (6i) — the judge handles acceptance.
            if ctx.node_spec.client_facing and user_input_requested:
                if self._shutdown:
                    await self._publish_loop_completed(stream_id, node_id, iteration + 1)
@@ -578,6 +577,7 @@ class EventLoopNode(NodeProtocol):
                    latency_ms=latency_ms,
                )

                user_interaction_count += 1
                recent_responses.clear()
                # Fall through to judge evaluation (6i)

@@ -824,6 +824,12 @@ class EventLoopNode(NodeProtocol):

        Returns True if input arrived, False if shutdown was signaled.
        """
        # Clear BEFORE emitting so that synchronous handlers (e.g. the
        # headless stdin handler) can call inject_event() during the emit
        # and the signal won't be lost. TUI handlers return immediately
        # without injecting, so the wait still blocks until the user types.
        self._input_ready.clear()

        if self._event_bus:
            await self._event_bus.emit_client_input_requested(
                stream_id=ctx.node_id,
@@ -831,7 +837,6 @@ class EventLoopNode(NodeProtocol):
                prompt="",
            )

        await self._input_ready.wait()
        return not self._shutdown

@@ -989,7 +994,7 @@ class EventLoopNode(NodeProtocol):
                    is_error=result.is_error,
                )
                if not result.is_error:
                    value = tc.tool_input.get("value", "")
                    # Parse JSON strings into native types so downstream
                    # consumers get lists/dicts instead of serialised JSON,
                    # and the hallucination validator skips non-string values.
@@ -1000,8 +1005,9 @@ class EventLoopNode(NodeProtocol):
                        value = parsed
                    except (json.JSONDecodeError, TypeError):
                        pass
                    key = tc.tool_input.get("key", "")
                    await accumulator.set(key, value)
                    outputs_set_this_turn.append(key)
                    logged_tool_calls.append(
                        {
                            "tool_use_id": tc.tool_use_id,
@@ -1283,6 +1289,24 @@ class EventLoopNode(NodeProtocol):
            accumulator, ctx.node_spec.output_keys, ctx.node_spec.nullable_output_keys
        )
        if not missing:
            # Safety check: when ALL output keys are nullable and NONE
            # have been set, the node produced nothing useful. Retry
            # instead of accepting an empty result — this prevents
            # client-facing nodes from terminating before the user
            # ever interacts, and non-client-facing nodes from
            # short-circuiting without doing their work.
            output_keys = ctx.node_spec.output_keys or []
            nullable_keys = set(ctx.node_spec.nullable_output_keys or [])
            all_nullable = output_keys and nullable_keys >= set(output_keys)
            none_set = not any(accumulator.get(k) is not None for k in output_keys)
            if all_nullable and none_set:
                return JudgeVerdict(
                    action="RETRY",
                    feedback=(
                        f"No output keys have been set yet. "
                        f"Use set_output to set at least one of: {output_keys}"
                    ),
                )
            return JudgeVerdict(action="ACCEPT")
        else:
            return JudgeVerdict(

@@ -368,7 +368,7 @@ class GraphExecutor:
        # Check if resuming from paused_at (session state resume)
        paused_at = session_state.get("paused_at") if session_state else None
        node_ids = [n.id for n in graph.nodes]
        self.logger.debug(f"paused_at={paused_at}, available node IDs={node_ids}")

        if paused_at and graph.get_node(paused_at) is not None:
            # Resume from paused_at node directly (works for any node, not just pause_nodes)
@@ -505,6 +505,21 @@ class GraphExecutor:

            path.append(current_node_id)

            # Clear stale nullable outputs from previous visits.
            # When a node is re-visited (e.g. review → process-batch → review),
            # nullable outputs from the PREVIOUS visit linger in shared memory.
            # This causes stale edge conditions to fire (e.g. "feedback is not None"
            # from visit 1 triggers even when visit 2 sets "final_summary" instead).
            # Clearing them ensures only the CURRENT visit's outputs affect routing.
            if node_visit_counts.get(current_node_id, 0) > 1:
                nullable_keys = getattr(node_spec, "nullable_output_keys", None) or []
                for key in nullable_keys:
                    if memory.read(key) is not None:
                        memory.write(key, None, validate=False)
                        self.logger.info(
                            f"  🧹 Cleared stale nullable output '{key}' from previous visit"
                        )

            # Check if pause (HITL) before execution
            if current_node_id in graph.pause_nodes:
                self.logger.info(f"⏸ Paused at HITL node: {node_spec.name}")

@@ -1134,7 +1134,7 @@ Keep the same JSON structure but with shorter content values.
            decision_id=decision_id,
            success=True,
            result=response.content,
            tokens_used=total_input_tokens + total_output_tokens,
            latency_ms=latency_ms,
        )

@@ -1233,7 +1233,7 @@ Keep the same JSON structure but with shorter content values.
            success=False,
            error=_extraction_error,
            output={},
            tokens_used=total_input_tokens + total_output_tokens,
            latency_ms=latency_ms,
        )
        # JSON extraction failed completely - still strip code blocks
@@ -1275,7 +1275,7 @@ Keep the same JSON structure but with shorter content values.
        return NodeResult(
            success=True,
            output=output,
            tokens_used=total_input_tokens + total_output_tokens,
            latency_ms=latency_ms,
        )

@@ -14,13 +14,15 @@ from datetime import datetime
from pathlib import Path
from typing import Annotated

# Project root resolution. This file lives at core/framework/mcp/agent_builder_server.py,
# so the project root (where exports/ lives) is four parents up.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent.parent

# Ensure exports/ is on sys.path so AgentRunner can import agent modules.
_exports_dir = _PROJECT_ROOT / "exports"
if _exports_dir.is_dir() and str(_exports_dir) not in sys.path:
    sys.path.insert(0, str(_exports_dir))
del _exports_dir

from mcp.server import FastMCP  # noqa: E402
from pydantic import ValidationError  # noqa: E402
@@ -542,6 +544,9 @@ def _validate_agent_path(agent_path: str) -> tuple[Path | None, str | None]:
    """
    Validate and normalize agent_path.

    Resolves relative paths against _PROJECT_ROOT since the MCP server's
    cwd (core/) differs from the user's cwd (project root).

    Returns:
        (Path, None) if valid
        (None, error_json) if invalid
@@ -556,6 +561,12 @@ def _validate_agent_path(agent_path: str) -> tuple[Path | None, str | None]:

    path = Path(agent_path)

    # Resolve relative paths against project root (not MCP server's cwd)
    if not path.is_absolute() and not path.exists():
        resolved = _PROJECT_ROOT / path
        if resolved.exists():
            path = resolved

    if not path.exists():
        return None, json.dumps(
            {
@@ -3019,18 +3030,15 @@ def _format_success_criteria(criteria: list[SuccessCriterion]) -> str:

# Test template for Claude to use when writing tests
CONSTRAINT_TEST_TEMPLATE = '''@pytest.mark.asyncio
async def test_constraint_{constraint_id}_{scenario}(runner, auto_responder, mock_mode):
    """Test: {description}"""
    await auto_responder.start()
    try:
        result = await runner.run({{"key": "value"}})
    finally:
        await auto_responder.stop()

    assert result.success, f"Agent failed: {{result.error}}"

    # Access output data via result.output
    output_data = result.output or {{}}

    # Add constraint-specific assertions here
@@ -3038,18 +3046,15 @@ async def test_constraint_{constraint_id}_{scenario}(mock_mode):
'''

SUCCESS_TEST_TEMPLATE = '''@pytest.mark.asyncio
async def test_success_{criteria_id}_{scenario}(runner, auto_responder, mock_mode):
    """Test: {description}"""
    await auto_responder.start()
    try:
        result = await runner.run({{"key": "value"}})
    finally:
        await auto_responder.stop()

    assert result.success, f"Agent failed: {{result.error}}"

    # Access output data via result.output
    output_data = result.output or {{}}

    # Add success criteria-specific assertions here
@@ -3105,7 +3110,6 @@ def generate_constraint_tests(
        test_type="Constraint",
        description=f"Tests for constraints defined in goal: {goal.name}",
        agent_module=agent_module,
    )

    # Return guidelines + data for Claude to write tests directly
@@ -3121,14 +3125,22 @@
        "max_tests": 5,
        "naming_convention": "test_constraint_<constraint_id>_<scenario>",
        "required_decorator": "@pytest.mark.asyncio",
        "required_fixtures": "runner, auto_responder, mock_mode",
        "agent_call_pattern": "await runner.run(input_dict)",
        "auto_responder_pattern": (
            "await auto_responder.start()\n"
            "try:\n"
            "    result = await runner.run(input_dict)\n"
            "finally:\n"
            "    await auto_responder.stop()"
        ),
        "result_type": "ExecutionResult with .success, .output (dict), .error",
        "critical_rules": [
            "Every test function MUST be async with @pytest.mark.asyncio",
            "Every test MUST accept runner, auto_responder, and mock_mode fixtures",
            "Use await runner.run(input) -- NOT default_agent.run()",
            "Start auto_responder before running, stop in finally block",
            "runner and auto_responder are from conftest.py -- do NOT import them",
            "NEVER call result.get() - use result.output.get() instead",
            "Always check result.success before accessing result.output",
        ],
@@ -3192,7 +3204,6 @@ def generate_success_tests(
        test_type="Success criteria",
        description=f"Tests for success criteria defined in goal: {goal.name}",
        agent_module=agent_module,
    )

    # Return guidelines + data for Claude to write tests directly
@@ -3214,14 +3225,22 @@
        "max_tests": 12,
        "naming_convention": "test_success_<criteria_id>_<scenario>",
        "required_decorator": "@pytest.mark.asyncio",
        "required_fixtures": "runner, auto_responder, mock_mode",
        "agent_call_pattern": "await runner.run(input_dict)",
        "auto_responder_pattern": (
            "await auto_responder.start()\n"
            "try:\n"
            "    result = await runner.run(input_dict)\n"
            "finally:\n"
            "    await auto_responder.stop()"
        ),
        "result_type": "ExecutionResult with .success, .output (dict), .error",
        "critical_rules": [
            "Every test function MUST be async with @pytest.mark.asyncio",
            "Every test MUST accept runner, auto_responder, and mock_mode fixtures",
            "Use await runner.run(input) -- NOT default_agent.run()",
            "Start auto_responder before running, stop in finally block",
            "runner and auto_responder are from conftest.py -- do NOT import them",
            "NEVER call result.get() - use result.output.get() instead",
            "Always check result.success before accessing result.output",
        ],
@@ -3318,11 +3337,13 @@ def run_tests(
    # Add short traceback and quiet summary
    cmd.append("--tb=short")

    # Set PYTHONPATH so framework and agent packages are importable
    env = os.environ.copy()
    pythonpath = env.get("PYTHONPATH", "")
    project_root = Path(__file__).parent.parent.parent.parent.resolve()
    core_path = project_root / "core"
    exports_path = project_root / "exports"
    env["PYTHONPATH"] = f"{core_path}:{exports_path}:{project_root}:{pythonpath}"

    # Run pytest
    try:
@@ -3792,7 +3813,11 @@ def check_missing_credentials(

    from framework.runner import AgentRunner

    path, err = _validate_agent_path(agent_path)
    if err:
        return err

    runner = AgentRunner.load(str(path))
    runner.validate()

    store = _get_credential_store()
@@ -3992,7 +4017,11 @@ def verify_credentials(
    try:
        from framework.runner import AgentRunner

        path, err = _validate_agent_path(agent_path)
        if err:
            return err

        runner = AgentRunner.load(str(path))
        validation = runner.validate()

        return json.dumps(
@@ -4009,6 +4038,382 @@
        return json.dumps({"error": str(e)})


# =============================================================================
# SESSION & CHECKPOINT TOOLS (read-only, no build session required)
# =============================================================================

_MAX_DIFF_VALUE_LEN = 500


def _read_session_json(path: Path) -> dict | None:
    """Read a JSON file, returning None on failure."""
    if not path.exists():
        return None
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        return None


def _scan_agent_sessions(agent_work_dir: Path) -> list[tuple[str, Path]]:
    """Find session directories with state.json, sorted most-recent-first."""
    sessions: list[tuple[str, Path]] = []
    sessions_dir = agent_work_dir / "sessions"
    if not sessions_dir.exists():
        return sessions
    for session_dir in sessions_dir.iterdir():
        if session_dir.is_dir() and session_dir.name.startswith("session_"):
            state_path = session_dir / "state.json"
            if state_path.exists():
                sessions.append((session_dir.name, state_path))
    sessions.sort(key=lambda t: t[0], reverse=True)
    return sessions


def _truncate_value(value: object, max_len: int = _MAX_DIFF_VALUE_LEN) -> object:
    """Truncate a value's JSON representation if too long."""
    s = json.dumps(value, default=str)
    if len(s) <= max_len:
        return value
    return {"_truncated": True, "_preview": s[:max_len] + "...", "_length": len(s)}


@mcp.tool()
def list_agent_sessions(
    agent_work_dir: Annotated[
        str,
        "Path to the agent's working directory (e.g., ~/.hive/agents/my_agent)",
    ],
    status: Annotated[
        str,
        "Filter by status: 'active', 'paused', 'completed', 'failed', 'cancelled'. Empty for all.",
    ] = "",
    limit: Annotated[int, "Maximum number of results (default 20)"] = 20,
    offset: Annotated[int, "Number of sessions to skip for pagination"] = 0,
) -> str:
    """
    List sessions for an agent with optional status filter.

    Use this to discover which sessions exist, find resumable sessions,
    or identify failed sessions for debugging. Combines well with
    query_runtime_logs for correlating session state with log data.
    """
    work_dir = Path(agent_work_dir)
    all_sessions = _scan_agent_sessions(work_dir)

    if not all_sessions:
        return json.dumps({"sessions": [], "total": 0, "offset": offset, "limit": limit})

    summaries = []
    for session_id, state_path in all_sessions:
        data = _read_session_json(state_path)
        if data is None:
            continue

        session_status = data.get("status", "")
        if status and session_status != status:
            continue

        timestamps = data.get("timestamps", {})
        progress = data.get("progress", {})
        checkpoint_dir = state_path.parent / "checkpoints"

        summaries.append(
            {
                "session_id": session_id,
                "status": session_status,
                "goal_id": data.get("goal_id", ""),
                "started_at": timestamps.get("started_at", ""),
                "updated_at": timestamps.get("updated_at", ""),
                "completed_at": timestamps.get("completed_at"),
                "is_resumable": data.get("is_resumable", False),
                "is_resumable_from_checkpoint": data.get("is_resumable_from_checkpoint", False),
                "current_node": progress.get("current_node"),
                "paused_at": progress.get("paused_at"),
                "steps_executed": progress.get("steps_executed", 0),
                "execution_quality": progress.get("execution_quality", ""),
                "has_checkpoints": checkpoint_dir.exists()
                and any(checkpoint_dir.glob("cp_*.json")),
            }
        )

    total = len(summaries)
    page = summaries[offset : offset + limit]
    return json.dumps(
        {"sessions": page, "total": total, "offset": offset, "limit": limit}, indent=2
    )


@mcp.tool()
def get_agent_session_state(
    agent_work_dir: Annotated[str, "Path to the agent's working directory"],
    session_id: Annotated[str, "The session ID (e.g., 'session_20260208_143022_abc12345')"],
) -> str:
    """
    Load full session state for a specific session.

    Returns complete session data including status, progress, result,
    metrics, and checkpoint info. Memory values are excluded to prevent
    context bloat -- use get_agent_session_memory to retrieve memory contents.
    """
    state_path = Path(agent_work_dir) / "sessions" / session_id / "state.json"
    data = _read_session_json(state_path)
    if data is None:
        return json.dumps({"error": f"Session not found: {session_id}"})

    memory = data.get("memory", {})
    data["memory_keys"] = list(memory.keys()) if isinstance(memory, dict) else []
    data["memory_size"] = len(memory) if isinstance(memory, dict) else 0
    data.pop("memory", None)

    return json.dumps(data, indent=2, default=str)


@mcp.tool()
def get_agent_session_memory(
    agent_work_dir: Annotated[str, "Path to the agent's working directory"],
    session_id: Annotated[str, "The session ID"],
    key: Annotated[str, "Specific memory key to retrieve. Empty for all."] = "",
) -> str:
    """
    Get memory contents from a session.

    Memory stores intermediate results passed between nodes. Use this
    to inspect what data was produced during execution.

    If key is provided, returns only that memory key's value.
    If key is empty, returns all memory keys and their values.
    """
    state_path = Path(agent_work_dir) / "sessions" / session_id / "state.json"
    data = _read_session_json(state_path)
    if data is None:
        return json.dumps({"error": f"Session not found: {session_id}"})

    memory = data.get("memory", {})
    if not isinstance(memory, dict):
        memory = {}

    if key:
        if key not in memory:
            return json.dumps(
                {
                    "error": f"Memory key not found: '{key}'",
                    "available_keys": list(memory.keys()),
                }
            )
        value = memory[key]
        return json.dumps(
            {
                "session_id": session_id,
                "key": key,
                "value": value,
                "value_type": type(value).__name__,
            },
            indent=2,
            default=str,
        )

    return json.dumps(
        {"session_id": session_id, "memory": memory, "total_keys": len(memory)},
        indent=2,
        default=str,
    )


@mcp.tool()
def list_agent_checkpoints(
    agent_work_dir: Annotated[str, "Path to the agent's working directory"],
    session_id: Annotated[str, "The session ID to list checkpoints for"],
    checkpoint_type: Annotated[
        str,
        "Filter by type: 'node_start', 'node_complete', 'loop_iteration'. Empty for all.",
    ] = "",
    is_clean: Annotated[str, "Filter by clean status: 'true', 'false', or empty for all."] = "",
) -> str:
    """
    List checkpoints for a specific session.

    Checkpoints capture execution state at node boundaries for
    crash recovery and resume. Use with get_agent_checkpoint for
    detailed checkpoint inspection.
    """
    session_dir = Path(agent_work_dir) / "sessions" / session_id
    checkpoint_dir = session_dir / "checkpoints"

    if not session_dir.exists():
        return json.dumps({"error": f"Session not found: {session_id}"})

    if not checkpoint_dir.exists():
        return json.dumps(
            {
                "session_id": session_id,
                "checkpoints": [],
                "total": 0,
                "latest_checkpoint_id": None,
            }
        )

    # Try index.json first
    index_data = _read_session_json(checkpoint_dir / "index.json")
    if index_data and "checkpoints" in index_data:
        checkpoints = index_data["checkpoints"]
    else:
        # Fallback: scan individual checkpoint files
        checkpoints = []
        for cp_file in sorted(checkpoint_dir.glob("cp_*.json")):
            cp_data = _read_session_json(cp_file)
            if cp_data:
                checkpoints.append(
                    {
                        "checkpoint_id": cp_data.get("checkpoint_id", cp_file.stem),
                        "checkpoint_type": cp_data.get("checkpoint_type", ""),
                        "created_at": cp_data.get("created_at", ""),
                        "current_node": cp_data.get("current_node"),
                        "next_node": cp_data.get("next_node"),
                        "is_clean": cp_data.get("is_clean", True),
                        "description": cp_data.get("description", ""),
                    }
                )

    # Apply filters
    if checkpoint_type:
        checkpoints = [c for c in checkpoints if c.get("checkpoint_type") == checkpoint_type]
    if is_clean:
        clean_val = is_clean.lower() == "true"
        checkpoints = [c for c in checkpoints if c.get("is_clean") == clean_val]

    latest_id = None
    if index_data:
        latest_id = index_data.get("latest_checkpoint_id")
    elif checkpoints:
        latest_id = checkpoints[-1].get("checkpoint_id")

    return json.dumps(
        {
            "session_id": session_id,
            "checkpoints": checkpoints,
            "total": len(checkpoints),
            "latest_checkpoint_id": latest_id,
        },
        indent=2,
    )


@mcp.tool()
def get_agent_checkpoint(
    agent_work_dir: Annotated[str, "Path to the agent's working directory"],
    session_id: Annotated[str, "The session ID"],
    checkpoint_id: Annotated[str, "Specific checkpoint ID, or empty for latest"] = "",
) -> str:
    """
    Load a specific checkpoint with full state data.

    Returns the complete checkpoint including shared memory snapshot,
    execution path, accumulated outputs, and metrics. If checkpoint_id
    is empty, loads the latest checkpoint.
    """
    session_dir = Path(agent_work_dir) / "sessions" / session_id
    checkpoint_dir = session_dir / "checkpoints"

    if not checkpoint_dir.exists():
        return json.dumps({"error": f"No checkpoints found for session: {session_id}"})

    if not checkpoint_id:
        index_data = _read_session_json(checkpoint_dir / "index.json")
        if index_data and index_data.get("latest_checkpoint_id"):
            checkpoint_id = index_data["latest_checkpoint_id"]
|
||||
else:
|
||||
cp_files = sorted(checkpoint_dir.glob("cp_*.json"))
|
||||
if not cp_files:
|
||||
return json.dumps({"error": f"No checkpoints found for session: {session_id}"})
|
||||
checkpoint_id = cp_files[-1].stem
|
||||
|
||||
cp_path = checkpoint_dir / f"{checkpoint_id}.json"
|
||||
data = _read_session_json(cp_path)
|
||||
if data is None:
|
||||
return json.dumps({"error": f"Checkpoint not found: {checkpoint_id}"})
|
||||
|
||||
return json.dumps(data, indent=2, default=str)
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def compare_agent_checkpoints(
|
||||
agent_work_dir: Annotated[str, "Path to the agent's working directory"],
|
||||
session_id: Annotated[str, "The session ID"],
|
||||
checkpoint_id_before: Annotated[str, "The earlier checkpoint ID"],
|
||||
checkpoint_id_after: Annotated[str, "The later checkpoint ID"],
|
||||
) -> str:
|
||||
"""
|
||||
Compare memory state between two checkpoints.
|
||||
|
||||
Shows what memory keys were added, removed, or changed between
|
||||
two points in execution. Useful for understanding how data flows
|
||||
through the agent graph.
|
||||
"""
|
||||
checkpoint_dir = Path(agent_work_dir) / "sessions" / session_id / "checkpoints"
|
||||
|
||||
before = _read_session_json(checkpoint_dir / f"{checkpoint_id_before}.json")
|
||||
if before is None:
|
||||
return json.dumps({"error": f"Checkpoint not found: {checkpoint_id_before}"})
|
||||
|
||||
after = _read_session_json(checkpoint_dir / f"{checkpoint_id_after}.json")
|
||||
if after is None:
|
||||
return json.dumps({"error": f"Checkpoint not found: {checkpoint_id_after}"})
|
||||
|
||||
mem_before = before.get("shared_memory", {})
|
||||
mem_after = after.get("shared_memory", {})
|
||||
|
||||
keys_before = set(mem_before.keys())
|
||||
keys_after = set(mem_after.keys())
|
||||
|
||||
added = {k: _truncate_value(mem_after[k]) for k in keys_after - keys_before}
|
||||
removed = list(keys_before - keys_after)
|
||||
unchanged = []
|
||||
changed = {}
|
||||
|
||||
for k in keys_before & keys_after:
|
||||
if mem_before[k] == mem_after[k]:
|
||||
unchanged.append(k)
|
||||
else:
|
||||
changed[k] = {
|
||||
"before": _truncate_value(mem_before[k]),
|
||||
"after": _truncate_value(mem_after[k]),
|
||||
}
|
||||
|
||||
path_before = before.get("execution_path", [])
|
||||
path_after = after.get("execution_path", [])
|
||||
new_nodes = path_after[len(path_before) :]
|
||||
|
||||
return json.dumps(
|
||||
{
|
||||
"session_id": session_id,
|
||||
"before": {
|
||||
"checkpoint_id": checkpoint_id_before,
|
||||
"current_node": before.get("current_node"),
|
||||
"created_at": before.get("created_at", ""),
|
||||
},
|
||||
"after": {
|
||||
"checkpoint_id": checkpoint_id_after,
|
||||
"current_node": after.get("current_node"),
|
||||
"created_at": after.get("created_at", ""),
|
||||
},
|
||||
"memory_diff": {
|
||||
"added": added,
|
||||
"removed": removed,
|
||||
"changed": changed,
|
||||
"unchanged": unchanged,
|
||||
},
|
||||
"execution_path_diff": {
|
||||
"new_nodes": new_nodes,
|
||||
"path_before": path_before,
|
||||
"path_after": path_after,
|
||||
},
|
||||
},
|
||||
indent=2,
|
||||
default=str,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN
|
||||
# =============================================================================
|
||||
|
||||
@@ -332,6 +332,60 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
    resume_parser.set_defaults(func=cmd_resume)


def _load_resume_state(
    agent_path: str, session_id: str, checkpoint_id: str | None = None
) -> dict | None:
    """Load session or checkpoint state for headless resume.

    Args:
        agent_path: Path to the agent folder (e.g., exports/my_agent)
        session_id: Session ID to resume from
        checkpoint_id: Optional checkpoint ID within the session

    Returns:
        session_state dict for executor, or None if not found
    """
    agent_name = Path(agent_path).name
    agent_work_dir = Path.home() / ".hive" / "agents" / agent_name
    session_dir = agent_work_dir / "sessions" / session_id

    if not session_dir.exists():
        return None

    if checkpoint_id:
        # Checkpoint-based resume: load checkpoint and extract state
        cp_path = session_dir / "checkpoints" / f"{checkpoint_id}.json"
        if not cp_path.exists():
            return None
        try:
            cp_data = json.loads(cp_path.read_text())
        except (json.JSONDecodeError, OSError):
            return None
        return {
            "memory": cp_data.get("shared_memory", {}),
            "paused_at": cp_data.get("next_node") or cp_data.get("current_node"),
            "execution_path": cp_data.get("execution_path", []),
            "node_visit_counts": {},
        }
    else:
        # Session state resume: load state.json
        state_path = session_dir / "state.json"
        if not state_path.exists():
            return None
        try:
            state_data = json.loads(state_path.read_text())
        except (json.JSONDecodeError, OSError):
            return None
        progress = state_data.get("progress", {})
        paused_at = progress.get("paused_at") or progress.get("resume_from")
        return {
            "memory": state_data.get("memory", {}),
            "paused_at": paused_at,
            "execution_path": progress.get("path", []),
            "node_visit_counts": progress.get("node_visit_counts", {}),
        }


def cmd_run(args: argparse.Namespace) -> int:
    """Run an exported agent."""
    import logging

@@ -375,7 +429,6 @@ def cmd_run(args: argparse.Namespace) -> int:
        runner = AgentRunner.load(
            args.agent_path,
            model=args.model,
            enable_tui=True,
        )
    except CredentialError as e:
        print(f"\n{e}", file=sys.stderr)

@@ -419,7 +472,6 @@ def cmd_run(args: argparse.Namespace) -> int:
        runner = AgentRunner.load(
            args.agent_path,
            model=args.model,
            enable_tui=False,
        )
    except CredentialError as e:
        print(f"\n{e}", file=sys.stderr)

@@ -428,6 +480,27 @@ def cmd_run(args: argparse.Namespace) -> int:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # Load session/checkpoint state for resume (headless mode)
    session_state = None
    resume_session = getattr(args, "resume_session", None)
    checkpoint = getattr(args, "checkpoint", None)
    if resume_session:
        session_state = _load_resume_state(args.agent_path, resume_session, checkpoint)
        if session_state is None:
            print(
                f"Error: Could not load session state for {resume_session}",
                file=sys.stderr,
            )
            return 1
        if not args.quiet:
            resume_node = session_state.get("paused_at", "unknown")
            if checkpoint:
                print(f"Resuming from checkpoint: {checkpoint}")
            else:
                print(f"Resuming session: {resume_session}")
            print(f"Resume point: {resume_node}")
            print()

    # Auto-inject user_id if the agent expects it but it's not provided
    entry_input_keys = runner.graph.nodes[0].input_keys if runner.graph.nodes else []
    if "user_id" in entry_input_keys and context.get("user_id") is None:

@@ -447,7 +520,7 @@ def cmd_run(args: argparse.Namespace) -> int:
    print("=" * 60)
    print()

    result = asyncio.run(runner.run(context))
    result = asyncio.run(runner.run(context, session_state=session_state))

    # Format output
    output = {

@@ -1205,7 +1278,6 @@ def cmd_tui(args: argparse.Namespace) -> int:
        runner = AgentRunner.load(
            agent_path,
            model=args.model,
            enable_tui=True,
        )
    except CredentialError as e:
        print(f"\n{e}", file=sys.stderr)

@@ -17,17 +17,13 @@ from framework.graph.edge import (
    EdgeSpec,
    GraphSpec,
)
from framework.graph.executor import ExecutionResult, GraphExecutor
from framework.graph.executor import ExecutionResult
from framework.graph.node import NodeSpec
from framework.llm.provider import LLMProvider, Tool
from framework.runner.tool_registry import ToolRegistry

# Multi-entry-point runtime imports
from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
from framework.runtime.core import Runtime
from framework.runtime.execution_stream import EntryPointSpec
from framework.runtime.runtime_log_store import RuntimeLogStore
from framework.runtime.runtime_logger import RuntimeLogger

if TYPE_CHECKING:
    from framework.runner.protocol import AgentMessage, CapabilityResponse

@@ -271,7 +267,6 @@ class AgentRunner:
        mock_mode: bool = False,
        storage_path: Path | None = None,
        model: str | None = None,
        enable_tui: bool = False,
        intro_message: str = "",
    ):
        """

@@ -284,7 +279,6 @@ class AgentRunner:
            mock_mode: If True, use mock LLM responses
            storage_path: Path for runtime storage (defaults to temp)
            model: Model to use (reads from agent config or ~/.hive/configuration.json if None)
            enable_tui: If True, forces use of AgentRuntime with EventBus
            intro_message: Optional greeting shown to user on TUI load
        """
        self.agent_path = agent_path

@@ -292,7 +286,6 @@ class AgentRunner:
        self.goal = goal
        self.mock_mode = mock_mode
        self.model = model or self._resolve_default_model()
        self.enable_tui = enable_tui
        self.intro_message = intro_message

        # Set up storage

@@ -313,12 +306,10 @@ class AgentRunner:

        # Initialize components
        self._tool_registry = ToolRegistry()
        self._runtime: Runtime | None = None
        self._llm: LLMProvider | None = None
        self._executor: GraphExecutor | None = None
        self._approval_callback: Callable | None = None

        # Multi-entry-point support (AgentRuntime)
        # AgentRuntime — unified execution path for all agents
        self._agent_runtime: AgentRuntime | None = None
        self._uses_async_entry_points = self.graph.has_async_entry_points()

@@ -466,7 +457,6 @@ class AgentRunner:
        mock_mode: bool = False,
        storage_path: Path | None = None,
        model: str | None = None,
        enable_tui: bool = False,
    ) -> "AgentRunner":
        """
        Load an agent from an export folder.

@@ -480,7 +470,6 @@ class AgentRunner:
            mock_mode: If True, use mock LLM responses
            storage_path: Path for runtime storage (defaults to ~/.hive/agents/{name})
            model: LLM model to use (reads from agent's default_config if None)
            enable_tui: If True, forces use of AgentRuntime with EventBus

        Returns:
            AgentRunner instance ready to run

@@ -541,7 +530,6 @@ class AgentRunner:
            mock_mode=mock_mode,
            storage_path=storage_path,
            model=model,
            enable_tui=enable_tui,
            intro_message=intro_message,
        )

@@ -560,7 +548,6 @@ class AgentRunner:
            mock_mode=mock_mode,
            storage_path=storage_path,
            model=model,
            enable_tui=enable_tui,
        )

    def register_tool(

@@ -650,9 +637,6 @@ class AgentRunner:
            callback: Function to call for approval (receives node info, returns bool)
        """
        self._approval_callback = callback
        # If executor already exists, update it
        if self._executor is not None:
            self._executor.approval_callback = callback

    def _setup(self) -> None:
        """Set up runtime, LLM, and executor."""

@@ -717,16 +701,11 @@ class AgentRunner:
            print(f"Warning: {api_key_env} not set. LLM calls will fail.")
            print(f"Set it with: export {api_key_env}=your-api-key")

        # Get tools for executor/runtime
        # Get tools for runtime
        tools = list(self._tool_registry.get_tools().values())
        tool_executor = self._tool_registry.get_executor()

        if self._uses_async_entry_points or self.enable_tui:
            # Multi-entry-point mode or TUI mode: use AgentRuntime
            self._setup_agent_runtime(tools, tool_executor)
        else:
            # Single-entry-point mode: use legacy GraphExecutor
            self._setup_legacy_executor(tools, tool_executor)

    def _get_api_key_env_var(self, model: str) -> str | None:
        """Get the environment variable name for the API key based on model name."""

@@ -741,7 +720,7 @@ class AgentRunner:
        elif model_lower.startswith("anthropic/") or model_lower.startswith("claude"):
            return "ANTHROPIC_API_KEY"
        elif model_lower.startswith("gemini/") or model_lower.startswith("google/"):
            return "GOOGLE_API_KEY"
            return "GEMINI_API_KEY"
        elif model_lower.startswith("mistral/"):
            return "MISTRAL_API_KEY"
        elif model_lower.startswith("groq/"):

@@ -787,26 +766,6 @@ class AgentRunner:
        except Exception:
            return None

    def _setup_legacy_executor(self, tools: list, tool_executor: Callable | None) -> None:
        """Set up legacy single-entry-point execution using GraphExecutor."""
        # Create runtime
        self._runtime = Runtime(storage_path=self._storage_path)

        # Create runtime logger
        log_store = RuntimeLogStore(base_path=self._storage_path / "runtime_logs")
        runtime_logger = RuntimeLogger(store=log_store, agent_id=self.graph.id)

        # Create executor
        self._executor = GraphExecutor(
            runtime=self._runtime,
            llm=self._llm,
            tools=tools,
            tool_executor=tool_executor,
            approval_callback=self._approval_callback,
            runtime_logger=runtime_logger,
            loop_config=self.graph.loop_config,
        )

    def _setup_agent_runtime(self, tools: list, tool_executor: Callable | None) -> None:
        """Set up multi-entry-point execution using AgentRuntime."""
        # Convert AsyncEntryPointSpec to EntryPointSpec for AgentRuntime

@@ -824,9 +783,9 @@ class AgentRunner:
            )
            entry_points.append(ep)

        # If TUI enabled but no entry points (single-entry agent), create default
        if not entry_points and self.enable_tui and self.graph.entry_node:
            logger.info("Creating default entry point for TUI")
        # Single-entry agent with no async entry points: create a default entry point
        if not entry_points and self.graph.entry_node:
            logger.info("Creating default entry point for single-entry agent")
            entry_points.append(
                EntryPointSpec(
                    id="default",

@@ -905,32 +864,9 @@ class AgentRunner:
                error=error_msg,
            )

        if self._uses_async_entry_points or self.enable_tui:
            # Multi-entry-point mode: use AgentRuntime
            return await self._run_with_agent_runtime(
                input_data=input_data or {},
                entry_point_id=entry_point_id,
            )
        else:
            # Legacy single-entry-point mode
            return await self._run_with_executor(
                input_data=input_data or {},
                session_state=session_state,
            )

    async def _run_with_executor(
        self,
        input_data: dict,
        session_state: dict | None = None,
    ) -> ExecutionResult:
        """Run using legacy GraphExecutor (single entry point)."""
        if self._executor is None:
            self._setup()

        return await self._executor.execute(
            graph=self.graph,
            goal=self.goal,
            input_data=input_data,
            session_state=session_state,
        )

@@ -938,8 +874,11 @@ class AgentRunner:
        self,
        input_data: dict,
        entry_point_id: str | None = None,
        session_state: dict | None = None,
    ) -> ExecutionResult:
        """Run using AgentRuntime (multi-entry-point)."""
        """Run using AgentRuntime."""
        import sys

        if self._agent_runtime is None:
            self._setup()

@@ -947,6 +886,52 @@ class AgentRunner:
        if not self._agent_runtime.is_running:
            await self._agent_runtime.start()

        # Set up stdin-based I/O for client-facing nodes in headless mode.
        # When a client_facing EventLoopNode calls ask_user(), it emits
        # CLIENT_INPUT_REQUESTED on the event bus and blocks. We subscribe
        # a handler that prints the prompt and reads from stdin, then injects
        # the user's response back into the node to unblock it.
        has_client_facing = any(n.client_facing for n in self.graph.nodes)
        sub_ids: list[str] = []

        if has_client_facing and sys.stdin.isatty():
            from framework.runtime.event_bus import EventType

            runtime = self._agent_runtime

            async def _handle_client_output(event):
                """Print agent output to stdout as it streams."""
                content = event.data.get("content", "")
                if content:
                    print(content, end="", flush=True)

            async def _handle_input_requested(event):
                """Read user input from stdin and inject it into the node."""
                import asyncio

                node_id = event.node_id
                try:
                    loop = asyncio.get_event_loop()
                    user_input = await loop.run_in_executor(None, input, "\n>>> ")
                except EOFError:
                    user_input = ""

                # Inject into the waiting EventLoopNode via runtime
                await runtime.inject_input(node_id, user_input)

            sub_ids.append(
                runtime.subscribe_to_events(
                    event_types=[EventType.CLIENT_OUTPUT_DELTA],
                    handler=_handle_client_output,
                )
            )
            sub_ids.append(
                runtime.subscribe_to_events(
                    event_types=[EventType.CLIENT_INPUT_REQUESTED],
                    handler=_handle_input_requested,
                )
            )

        # Determine entry point
        if entry_point_id is None:
            # Use first entry point or "default" if no entry points defined

@@ -956,10 +941,12 @@
        else:
            entry_point_id = "default"

        try:
            # Trigger and wait for result
            result = await self._agent_runtime.trigger_and_wait(
                entry_point_id=entry_point_id,
                input_data=input_data,
                session_state=session_state,
            )

            # Return result or create error result

@@ -970,30 +957,22 @@
                success=False,
                error="Execution timed out or failed to complete",
            )
        finally:
            # Clean up subscriptions
            for sub_id in sub_ids:
                self._agent_runtime.unsubscribe_from_events(sub_id)

    # === Multi-Entry-Point API (for agents with async_entry_points) ===
    # === Runtime API ===

    async def start(self) -> None:
        """
        Start the agent runtime (for multi-entry-point agents).

        This starts all registered entry points and allows concurrent execution.
        For single-entry-point agents, this is a no-op.
        """
        if not self._uses_async_entry_points:
            return

        """Start the agent runtime."""
        if self._agent_runtime is None:
            self._setup()

        await self._agent_runtime.start()

    async def stop(self) -> None:
        """
        Stop the agent runtime (for multi-entry-point agents).

        For single-entry-point agents, this is a no-op.
        """
        """Stop the agent runtime."""
        if self._agent_runtime is not None:
            await self._agent_runtime.stop()

@@ -1006,7 +985,7 @@
        """
        Trigger execution at a specific entry point (non-blocking).

        For multi-entry-point agents only. Returns execution ID for tracking.
        Returns execution ID for tracking.

        Args:
            entry_point_id: Which entry point to trigger

@@ -1015,16 +994,7 @@

        Returns:
            Execution ID for tracking

        Raises:
            RuntimeError: If agent doesn't use async entry points
        """
        if not self._uses_async_entry_points:
            raise RuntimeError(
                "trigger() is only available for multi-entry-point agents. "
                "Use run() for single-entry-point agents."
            )

        if self._agent_runtime is None:
            self._setup()

@@ -1041,19 +1011,9 @@
        """
        Get goal progress across all execution streams.

        For multi-entry-point agents only.

        Returns:
            Dict with overall_progress, criteria_status, constraint_violations, etc.

        Raises:
            RuntimeError: If agent doesn't use async entry points
        """
        if not self._uses_async_entry_points:
            raise RuntimeError(
                "get_goal_progress() is only available for multi-entry-point agents."
            )

        if self._agent_runtime is None:
            self._setup()

@@ -1061,14 +1021,11 @@

    def get_entry_points(self) -> list[EntryPointSpec]:
        """
        Get all registered entry points (for multi-entry-point agents).
        Get all registered entry points.

        Returns:
            List of EntryPointSpec objects
        """
        if not self._uses_async_entry_points:
            return []

        if self._agent_runtime is None:
            self._setup()

@@ -1492,7 +1449,7 @@ Respond with JSON only:
        self._temp_dir = None

    async def cleanup_async(self) -> None:
        """Clean up resources (asynchronous - for multi-entry-point agents)."""
        """Clean up resources (asynchronous)."""
        # Stop agent runtime if running
        if self._agent_runtime is not None and self._agent_runtime.is_running:
            await self._agent_runtime.stop()

@@ -1503,8 +1460,7 @@ Respond with JSON only:
    async def __aenter__(self) -> "AgentRunner":
        """Context manager entry."""
        self._setup()
        # Start runtime for multi-entry-point agents
        if self._uses_async_entry_points and self._agent_runtime is not None:
        if self._agent_runtime is not None:
            await self._agent_runtime.start()
        return self

@@ -0,0 +1,172 @@
# Agent Runtime

Unified execution system for all Hive agents. Every agent — single-entry or multi-entry, headless or TUI — runs through the same runtime stack.

## Topology

```
AgentRunner.load(agent_path)
            |
      AgentRunner
 (factory + public API)
            |
 _setup_agent_runtime()
            |
      AgentRuntime
(lifecycle + orchestration)
     /      |      \
Stream A  Stream B  Stream C   ← one per entry point
    |         |         |
GraphExecutor GraphExecutor GraphExecutor
    |         |         |
Node → Node → Node   (graph traversal)
```

Single-entry agents get a `"default"` entry point automatically. There is no separate code path.

## Components

| Component | File | Role |
|---|---|---|
| `AgentRunner` | `runner/runner.py` | Load agents, configure tools/LLM, expose high-level API |
| `AgentRuntime` | `runtime/agent_runtime.py` | Lifecycle management, entry point routing, event bus |
| `ExecutionStream` | `runtime/execution_stream.py` | Per-entry-point execution queue, session persistence |
| `GraphExecutor` | `graph/executor.py` | Node traversal, tool dispatch, checkpointing |
| `EventBus` | `runtime/event_bus.py` | Pub/sub for execution events (streaming, I/O) |
| `SharedStateManager` | `runtime/shared_state.py` | Cross-stream state with isolation levels |
| `OutcomeAggregator` | `runtime/outcome_aggregator.py` | Goal progress tracking across streams |
| `SessionStore` | `storage/session_store.py` | Session state persistence (`sessions/{id}/state.json`) |

## Programming Interface

### AgentRunner (high-level)

```python
from framework.runner import AgentRunner

# Load and run
runner = AgentRunner.load("exports/my_agent", model="anthropic/claude-sonnet-4-20250514")
result = await runner.run({"query": "hello"})

# Resume from paused session
result = await runner.run({"query": "continue"}, session_state=saved_state)

# Lifecycle
await runner.start()                           # Start the runtime
await runner.stop()                            # Stop the runtime
exec_id = await runner.trigger("default", {})  # Non-blocking trigger
progress = await runner.get_goal_progress()    # Goal evaluation
entry_points = runner.get_entry_points()       # List entry points

# Context manager
async with AgentRunner.load("exports/my_agent") as runner:
    result = await runner.run({"query": "hello"})

# Cleanup
runner.cleanup()              # Synchronous
await runner.cleanup_async()  # Asynchronous
```

### AgentRuntime (lower-level)

```python
from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
from framework.runtime.execution_stream import EntryPointSpec

# Create runtime with entry points
runtime = create_agent_runtime(
    graph=graph,
    goal=goal,
    storage_path=Path("~/.hive/agents/my_agent"),
    entry_points=[
        EntryPointSpec(id="default", name="Default", entry_node="start", trigger_type="manual"),
    ],
    llm=llm,
    tools=tools,
    tool_executor=tool_executor,
    checkpoint_config=checkpoint_config,
)

# Lifecycle
await runtime.start()
await runtime.stop()

# Execution
exec_id = await runtime.trigger("default", {"query": "hello"})                # Non-blocking
result = await runtime.trigger_and_wait("default", {"query": "hello"})        # Blocking
result = await runtime.trigger_and_wait("default", {}, session_state=state)   # Resume

# Client-facing node I/O
await runtime.inject_input(node_id="chat", content="user response")

# Events
sub_id = runtime.subscribe_to_events(
    event_types=[EventType.CLIENT_OUTPUT_DELTA],
    handler=my_handler,
)
runtime.unsubscribe_from_events(sub_id)

# Inspection
runtime.is_running     # bool
runtime.event_bus      # EventBus
runtime.state_manager  # SharedStateManager
runtime.get_stats()    # Runtime statistics
```

## Execution Flow

1. `AgentRunner.run()` calls `AgentRuntime.trigger_and_wait()`
2. `AgentRuntime` routes to the `ExecutionStream` for the entry point
3. `ExecutionStream` creates a `GraphExecutor` and calls `execute()`
4. `GraphExecutor` traverses nodes, dispatches tools, manages checkpoints
5. `ExecutionResult` flows back up through the stack
6. `ExecutionStream` writes session state to disk

## Session Resume

All execution paths support session resume:

```python
# First run (agent pauses at a client-facing node)
result = await runner.run({"query": "start task"})
# result.paused_at = "review-node"
# result.session_state = {"memory": {...}, "paused_at": "review-node", ...}

# Resume
result = await runner.run({"input": "approved"}, session_state=result.session_state)
```

Session state flows: `AgentRunner.run()` → `AgentRuntime.trigger_and_wait()` → `ExecutionStream.execute()` → `GraphExecutor.execute()`.

Checkpoints are saved at node boundaries (`sessions/{id}/checkpoints/`) for crash recovery.
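
On resume, these on-disk files are turned back into the `session_state` dict that `runner.run()` accepts. A minimal sketch of the checkpoint branch, mirroring the CLI's `_load_resume_state` shown earlier in this diff (the helper name is illustrative and error handling is omitted):

```python
import json
from pathlib import Path

def state_from_checkpoint(agent_name: str, session_id: str, checkpoint_id: str) -> dict:
    """Build a resume-ready session_state dict from a saved checkpoint file."""
    cp_path = (
        Path.home() / ".hive" / "agents" / agent_name
        / "sessions" / session_id / "checkpoints" / f"{checkpoint_id}.json"
    )
    cp = json.loads(cp_path.read_text())
    return {
        "memory": cp.get("shared_memory", {}),
        # Prefer the node the checkpoint was about to run; fall back to
        # the node it captured.
        "paused_at": cp.get("next_node") or cp.get("current_node"),
        "execution_path": cp.get("execution_path", []),
        "node_visit_counts": {},
    }
```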

## Event Bus

The `EventBus` provides real-time execution visibility:

| Event | When |
|---|---|
| `NODE_STARTED` | Node begins execution |
| `NODE_COMPLETED` | Node finishes |
| `TOOL_CALL_STARTED` | Tool invocation begins |
| `TOOL_CALL_COMPLETED` | Tool invocation finishes |
| `CLIENT_OUTPUT_DELTA` | Agent streams text to user |
| `CLIENT_INPUT_REQUESTED` | Agent needs user input |
| `EXECUTION_COMPLETED` | Full execution finishes |

In headless mode, `AgentRunner` subscribes to `CLIENT_OUTPUT_DELTA` and `CLIENT_INPUT_REQUESTED` to print output and read stdin. In TUI mode, `AdenTUI` subscribes to route events to UI widgets.
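
A minimal headless-style subscriber, assuming `runtime` is the `AgentRuntime` from the example above (the handler bodies are illustrative; the subscribe/inject calls are the ones shown in this document):

```python
from framework.runtime.event_bus import EventType

async def print_deltas(event):
    # CLIENT_OUTPUT_DELTA events carry streamed text in event.data["content"]
    print(event.data.get("content", ""), end="", flush=True)

async def answer_prompts(event):
    # Unblock a waiting client-facing node with a canned reply
    await runtime.inject_input(event.node_id, "yes, proceed")

sub_a = runtime.subscribe_to_events(
    event_types=[EventType.CLIENT_OUTPUT_DELTA], handler=print_deltas
)
sub_b = runtime.subscribe_to_events(
    event_types=[EventType.CLIENT_INPUT_REQUESTED], handler=answer_prompts
)
# ... run the agent ...
runtime.unsubscribe_from_events(sub_a)
runtime.unsubscribe_from_events(sub_b)
```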

## Storage Layout

```
~/.hive/agents/{agent_name}/
    sessions/
        session_YYYYMMDD_HHMMSS_{uuid}/
            state.json            # Session state (status, memory, progress)
            checkpoints/          # Node-boundary snapshots
            logs/
                summary.json      # Execution summary
                details.jsonl     # Detailed event log
                tool_logs.jsonl   # Tool call log
    runtime_logs/                 # Cross-session runtime logs
```
@@ -3,6 +3,10 @@ Pytest templates for test file generation.

These templates provide headers and fixtures for pytest-compatible async tests.
Tests are written to exports/{agent}/tests/ as Python files and run with pytest.

Tests use AgentRunner.load() — the canonical runtime path — which creates
AgentRuntime, ExecutionStream, and proper session/log storage. For agents
with client-facing nodes, an auto_responder fixture handles input injection.
"""

# Template for the test file header (imports and fixtures)

@@ -11,17 +15,19 @@ PYTEST_TEST_FILE_HEADER = '''"""

{description}

REQUIRES: API_KEY (OpenAI or Anthropic) for real testing.
REQUIRES: API_KEY for execution tests. Structure tests run without keys.
"""

import os
import pytest
from {agent_module} import default_agent
from pathlib import Path

# Agent path resolved from this test file's location
AGENT_PATH = Path(__file__).resolve().parents[1]


def _get_api_key():
    """Get API key from CredentialStoreAdapter or environment."""
    # 1. Try CredentialStoreAdapter for Anthropic
    try:
        from aden_tools.credentials import CredentialStoreAdapter
        creds = CredentialStoreAdapter.default()

@@ -29,28 +35,43 @@ def _get_api_key():
            return creds.get("anthropic")
    except (ImportError, KeyError):
        pass

    # 2. Fallback to standard environment variables for OpenAI and others
    return (
        os.environ.get("OPENAI_API_KEY") or
        os.environ.get("ANTHROPIC_API_KEY") or
        os.environ.get("CEREBRAS_API_KEY") or
        os.environ.get("GROQ_API_KEY")
        os.environ.get("GROQ_API_KEY") or
        os.environ.get("GEMINI_API_KEY")
    )


# Skip all tests if no API key and not in mock mode
pytestmark = pytest.mark.skipif(
    not _get_api_key() and not os.environ.get("MOCK_MODE"),
    reason="API key required. Please set OPENAI_API_KEY, ANTHROPIC_API_KEY, or use MOCK_MODE=1."
    reason="API key required. Set ANTHROPIC_API_KEY or use MOCK_MODE=1 for structure tests."
)
'''

# Template for conftest.py with shared fixtures
PYTEST_CONFTEST_TEMPLATE = '''"""Shared test fixtures for {agent_name} tests."""

import json
import os
import re
import sys
from pathlib import Path

# Add exports/ and core/ to sys.path so the agent package and framework are importable
_repo_root = Path(__file__).resolve().parents[3]
for _p in ["exports", "core"]:
    _path = str(_repo_root / _p)
    if _path not in sys.path:
        sys.path.insert(0, _path)

import pytest
from framework.runner.runner import AgentRunner
from framework.runtime.event_bus import EventType

AGENT_PATH = Path(__file__).resolve().parents[1]


def _get_api_key():

@@ -62,19 +83,80 @@ def _get_api_key():
            return creds.get("anthropic")
    except (ImportError, KeyError):
        pass

    return (
        os.environ.get("OPENAI_API_KEY") or
        os.environ.get("ANTHROPIC_API_KEY") or
        os.environ.get("CEREBRAS_API_KEY") or
        os.environ.get("GROQ_API_KEY")
        os.environ.get("GROQ_API_KEY") or
        os.environ.get("GEMINI_API_KEY")
    )


@pytest.fixture
@pytest.fixture(scope="session")
def mock_mode():
    """Check if running in mock mode."""
    return bool(os.environ.get("MOCK_MODE"))
    """Return True if running in mock mode (no API key or MOCK_MODE=1)."""
    if os.environ.get("MOCK_MODE"):
        return True
    return not bool(_get_api_key())


@pytest.fixture(scope="session")
async def runner(tmp_path_factory, mock_mode):
    """Create an AgentRunner using the canonical runtime path.

    Uses tmp_path_factory for storage so tests don't pollute ~/.hive/agents/.
    Goes through AgentRunner.load() -> _setup() -> AgentRuntime, the same
    path as ``hive run``.
    """
    storage = tmp_path_factory.mktemp("agent_storage")
    r = AgentRunner.load(
        AGENT_PATH,
        mock_mode=mock_mode,
        storage_path=storage,
    )
    r._setup()
    yield r
    await r.cleanup_async()


@pytest.fixture
def auto_responder(runner):
    """Auto-respond to client-facing node input requests.

    Subscribes to CLIENT_INPUT_REQUESTED events and injects a response
    to unblock the node. Customize the response before calling start():

        auto_responder.response = "approve the report"
        await auto_responder.start()
    """
    class AutoResponder:
        def __init__(self, runner_instance):
            self._runner = runner_instance
            self.response = "yes, proceed"
            self.interactions = []
            self._sub_id = None

        async def start(self):
            runtime = self._runner._agent_runtime
            if runtime is None:
                return

            async def _handle(event):
                self.interactions.append(event.node_id)
                await runtime.inject_input(event.node_id, self.response)

            self._sub_id = runtime.subscribe_to_events(
                event_types=[EventType.CLIENT_INPUT_REQUESTED],
                handler=_handle,
            )

        async def stop(self):
            runtime = self._runner._agent_runtime
            if self._sub_id and runtime:
                runtime.unsubscribe_from_events(self._sub_id)
                self._sub_id = None

    return AutoResponder(runner)


@pytest.fixture(scope="session", autouse=True)

@@ -82,19 +164,51 @@ def check_api_key():
    """Ensure API key is set for real testing."""
    if not _get_api_key():
        if os.environ.get("MOCK_MODE"):
            print("\\n⚠️  Running in MOCK MODE - structure validation only")
            print("   This does NOT test LLM behavior or agent quality")
            print("   Set OPENAI_API_KEY or ANTHROPIC_API_KEY for real testing\\n")
            print("\\n  Running in MOCK MODE - structure validation only")
            print("  Set ANTHROPIC_API_KEY for real testing\\n")
        else:
            pytest.fail(
                "\\n❌ No API key found!\\n\\n"
                "Real testing requires an API key. Choose one:\\n"
                "1. Set OpenAI key:\\n"
                "   export OPENAI_API_KEY='your-key-here'\\n"
                "2. Set Anthropic key:\\n"
                "   export ANTHROPIC_API_KEY='your-key-here'\\n"
                "3. Run structure validation only:\\n"
                "   MOCK_MODE=1 pytest exports/{agent_name}/tests/\\n\\n"
                "Note: Mock mode does NOT validate agent behavior or quality."
                "\\nNo API key found!\\n"
                "Set ANTHROPIC_API_KEY or use MOCK_MODE=1 for structure tests.\\n"
            )


def parse_json_from_output(result, key):
    """Parse JSON from agent output (framework may store full LLM response as string)."""
    val = result.output.get(key, "")
    if isinstance(val, (dict, list)):
        return val
    if isinstance(val, str):
        json_text = re.sub(r"```json\\s*|\\s*```", "", val).strip()
        try:
            return json.loads(json_text)
        except (json.JSONDecodeError, TypeError):
            return val
    return val


def safe_get_nested(result, key_path, default=None):
    """Safely get nested value from result.output."""
    output = result.output or {{}}
    current = output
    for key in key_path:
        if isinstance(current, dict):
            current = current.get(key)
        elif isinstance(current, str):
            try:
                json_text = re.sub(r"```json\\s*|\\s*```", "", current).strip()
                parsed = json.loads(json_text)
                if isinstance(parsed, dict):
                    current = parsed.get(key)
                else:
                    return default
            except json.JSONDecodeError:
                return default
        else:
            return default
    return current if current is not None else default


pytest.parse_json_from_output = parse_json_from_output
pytest.safe_get_nested = safe_get_nested
'''

@@ -951,7 +951,7 @@ async def test_client_facing_node_streams_output():
        config=LoopConfig(max_iterations=5),
    )

    # Text-only on client_facing no longer blocks (no ask_user called),
    # Text-only on client_facing does not block (no ask_user called),
    # so the node completes without needing a shutdown workaround.
    result = await node.execute(ctx)

|
||||
3. Restart Cursor to load the MCP servers from `.cursor/mcp.json`
|
||||
4. Type `/` in Agent chat and search for skills (e.g., `/hive-create`)
|
||||
|
||||
|
||||
### Opencode Support
|
||||
To enable Opencode integration:
|
||||
|
||||
1. Create/Ensure `.opencode/` directory exists
|
||||
2. Configure MCP servers in `.opencode/mcp.json`
|
||||
3. Restart Opencode to load the MCP servers
|
||||
4. Switch to the Hive agent
|
||||
* **Tools:** Accesses `agent-builder` and standard `tools` via standard MCP protocols over stdio.
|
||||
|
||||
### Verify Setup
|
||||
|
||||
```bash
|
||||
|
||||
@@ -65,28 +65,26 @@ source .venv/bin/activate

If you prefer to set up manually or the script fails:

### 1. Install Core Framework
### 1. Sync Workspace Dependencies

```bash
cd core
uv pip install -e .
# From repository root - this creates a single .venv at the root
uv sync
```

### 2. Install Tools Package
> **Note:** The `uv sync` command uses the workspace configuration in `pyproject.toml` to install both the `core` (framework) and `tools` (aden_tools) packages together. This is the recommended approach over individual `pip install -e` commands, which may fail due to circular dependencies.

### 2. Activate the Virtual Environment

```bash
cd tools
uv pip install -e .
# Linux/macOS
source .venv/bin/activate

# Windows (PowerShell)
.venv\Scripts\Activate.ps1
```

### 3. Upgrade OpenAI Package

```bash
# litellm requires openai >= 1.0.0
uv pip install --upgrade "openai>=1.0.0"
```

### 4. Verify Installation
### 3. Verify Installation

```bash
uv run python -c "import framework; print('✓ framework OK')"

@@ -281,18 +279,20 @@ Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass

### "ModuleNotFoundError: No module named 'framework'"

**Solution:** Install the core package:
**Solution:** Sync the workspace dependencies:

```bash
cd core && uv pip install -e .
# From repository root
uv sync
```

### "ModuleNotFoundError: No module named 'aden_tools'"

**Solution:** Install the tools package:
**Solution:** Sync the workspace dependencies:

```bash
cd tools && uv pip install -e .
# From repository root
uv sync
```

Or run the setup script:

@@ -350,15 +350,14 @@ The Hive framework consists of three Python packages:

```
hive/
├── .venv/                 # Single workspace venv (created by uv sync)
├── core/                  # Core framework (runtime, graph executor, LLM providers)
│   ├── framework/
│   ├── .venv/             # Created by quickstart.sh
│   └── pyproject.toml
│
├── tools/                 # Tools and MCP servers
│   ├── src/
│   │   └── aden_tools/    # Actual package location
│   ├── .venv/             # Created by quickstart.sh
│   └── pyproject.toml
│
├── exports/               # Agent packages (user-created, gitignored)

@@ -368,28 +367,29 @@ hive/
└── templates/             # Pre-built template agents
```

## Separate Virtual Environments
## Virtual Environment Setup

Hive primarily uses **uv** to create and manage separate virtual environments for `core` and `tools`.
Hive uses **uv workspaces** to manage dependencies. When you run `uv sync` from the repository root, a **single `.venv`** is created at the root containing both packages.

The project uses separate virtual environments to:
### Benefits of Workspace Mode

- Isolate dependencies and avoid conflicts
- Allow independent development and testing of each package
- Enable MCP servers to run with their specific dependencies
- **Single environment** - No need to switch between multiple venvs
- **Unified dependencies** - Consistent package versions across core and tools
- **Simpler development** - One activation, access to everything

### How It Works

When you run `./quickstart.sh`, `uv` sets up:
When you run `./quickstart.sh` or `uv sync`:

1. **core/.venv/** - Contains the `framework` package and its dependencies (anthropic, litellm, mcp, etc.)
2. **tools/.venv/** - Contains the `aden_tools` package and its dependencies (beautifulsoup4, pandas, etc.)
1. **/.venv/** - A single root virtual environment is created
2. Both `framework` (from core/) and `aden_tools` (from tools/) are installed
3. All dependencies (anthropic, litellm, beautifulsoup4, pandas, etc.) are resolved together

If you need to refresh environments manually, use `uv`:
If you need to refresh the environment:

```bash
cd core && uv sync
cd ../tools && uv sync
# From repository root
uv sync
```

### Cross-Package Imports

@@ -521,7 +521,15 @@ export ADEN_CREDENTIALS_PATH="/custom/path"

# Agent storage location (default: /tmp)
export AGENT_STORAGE_PATH="/custom/storage"
```

## Opencode Setup

[Opencode](https://github.com/opencode-ai/opencode) is fully supported as a coding agent.

### Automatic Setup

Run the quickstart script from the repository root:

```bash
./quickstart.sh
```

## Additional Resources

- **Framework Documentation:** [core/README.md](../core/README.md)

@@ -40,7 +40,7 @@ Welcome to the Aden Engineering Challenges! These quizzes are designed for stude
After completing challenges, submit your work by:

1. Creating a GitHub Gist with your answers
2. Emailing the link to `careers@adenhq.com` with subject: `[Engineering Challenge] Your Name - Track Name`
2. Emailing the link to `contact@adenhq.com` with subject: `[Engineering Challenge] Your Name - Track Name`
3. Include your GitHub username in the email

## Getting Help

@@ -0,0 +1,42 @@
# Why Conditional Edges Need Priority (Function Nodes)

## The problem

Function nodes return everything they computed. They don't pick one output key — they return all of them.

```python
def score_lead(inputs):
    score = compute_score(inputs["profile"])
    return {
        "score": score,
        "is_high_value": score > 80,
        "needs_enrichment": score > 50 and not inputs["profile"].get("company"),
    }
```

Lead comes in: score 92, no company on file. Output: `{"score": 92, "is_high_value": True, "needs_enrichment": True}`.

Two conditional edges leaving this node:

```
Edge A: needs_enrichment == True   → enrichment node
Edge B: is_high_value == True      → outreach node
```

Both are true. Without priority, the graph either fans out to both (wrong — you'd email someone while still enriching their data) or picks one randomly (wrong — non-deterministic).

## Priority fixes it

```
Edge A: needs_enrichment == True   priority=2   (higher = checked first)
Edge B: is_high_value == True      priority=1
Edge C: is_high_value == False     priority=0
```

Executor keeps only the highest-priority matching group. A wins. Lead gets enriched first, loops back, gets re-scored — now `needs_enrichment` is false, B wins, outreach happens.
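
A minimal sketch of that selection rule (`CondEdge`, `select_edges`, and the `archive` target are illustrative names for this note, not the framework's API):

```python
from dataclasses import dataclass

@dataclass
class CondEdge:
    key: str          # memory key the condition reads
    expected: object  # value that makes the edge match
    priority: int     # higher = preferred
    target: str       # node to route to

def select_edges(edges: list[CondEdge], output: dict) -> list[CondEdge]:
    """Keep only the highest-priority group of matching conditional edges."""
    matched = [e for e in edges if output.get(e.key) == e.expected]
    if not matched:
        return []
    top = max(e.priority for e in matched)
    return [e for e in matched if e.priority == top]

edges = [
    CondEdge("needs_enrichment", True, priority=2, target="enrichment"),
    CondEdge("is_high_value", True, priority=1, target="outreach"),
    CondEdge("is_high_value", False, priority=0, target="archive"),
]
out = {"score": 92, "is_high_value": True, "needs_enrichment": True}
assert [e.target for e in select_edges(edges, out)] == ["enrichment"]
```

Edges A and B both match here, but only the priority-2 group survives, so the lead is routed to enrichment alone.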

## Why event loop nodes don't need this

The LLM understands "if/else." You tell it in the prompt: "if needs enrichment, set `needs_enrichment`. Otherwise if high value, set `approved`." It picks one. Only one conditional edge matches.

A function just returns a dict. It doesn't do "otherwise." Priority is the "otherwise" for function nodes.
@@ -303,8 +303,8 @@ if [ "$USE_ASSOC_ARRAYS" = true ]; then
    )

    declare -A DEFAULT_MODELS=(
        ["anthropic"]="claude-opus-4-6"
        ["openai"]="gpt-5.2"
        ["anthropic"]="claude-haiku-4-5"
        ["openai"]="gpt-5-mini"
        ["gemini"]="gemini-3-flash-preview"
        ["groq"]="moonshotai/kimi-k2-instruct-0905"
        ["cerebras"]="zai-glm-4.7"

@@ -945,6 +945,16 @@ else
    echo -e "${YELLOW}--${NC}"
fi

echo -n "  ⬡ local settings... "
if [ -f "$SCRIPT_DIR/.claude/settings.local.json" ]; then
    echo -e "${GREEN}ok${NC}"
elif [ -f "$SCRIPT_DIR/.claude/settings.local.json.example" ]; then
    cp "$SCRIPT_DIR/.claude/settings.local.json.example" "$SCRIPT_DIR/.claude/settings.local.json"
    echo -e "${GREEN}copied from example${NC}"
else
    echo -e "${YELLOW}--${NC}"
fi

echo -n "  ⬡ credential store... "
if [ -n "$HIVE_CREDENTIAL_KEY" ] && [ -d "$HOME/.hive/credentials/credentials" ]; then
    echo -e "${GREEN}ok${NC}"