fix: flowchart display

fix: draft flowchart display
feature: hive flowchart at planning phase
2026-03-11 15:41:55 -07:00 · 2026-03-11 11:05:33 -07:00 · 2026-03-10 19:54:02 -07:00 · 2026-03-10 16:25:07 -07:00 · 2026-03-10 15:31:34 -07:00 · 2026-03-10 15:29:21 -07:00
314 changed files with 19944 additions and 23391 deletions
@@ -1,9 +0,0 @@
-{
-  "mcpServers": {
-    "agent-builder": {
-      "command": "uv",
-      "args": ["run", "--directory", "core", "-m", "framework.mcp.agent_builder_server"],
-      "disabled": false
-    }
-  }
-}
@@ -1 +0,0 @@
-../../.claude/skills/hive
@@ -1 +0,0 @@
-../../.claude/skills/hive-concepts
@@ -1 +0,0 @@
-../../.claude/skills/hive-create
@@ -1 +0,0 @@
-../../.claude/skills/hive-credentials
@@ -1 +0,0 @@
-../../.claude/skills/hive-patterns
@@ -1 +0,0 @@
-../../.claude/skills/hive-test
@@ -1,5 +0,0 @@
---
-description: hive-concepts
---
-
-use hive-concepts skill
@@ -1,5 +0,0 @@
---
-description: hive-create
---
-
-use hive-create skill
@@ -1,5 +0,0 @@
---
-description: hive-credentials
---
-
-use hive-credentials skill
@@ -1,5 +0,0 @@
---
-description: hive-patterns
---
-
-use hive-patterns skill
@@ -1,5 +0,0 @@
---
-description: hive-test
---
-
-use hive-test skill
@@ -1,5 +0,0 @@
---
-description: hive
---
-
-use hive skill
@@ -1 +0,0 @@
-../../.claude/skills/hive
@@ -1 +0,0 @@
-../../.claude/skills/hive-concepts
@@ -1 +0,0 @@
-../../.claude/skills/hive-create
@@ -1 +0,0 @@
-../../.claude/skills/hive-credentials
@@ -1 +0,0 @@
-../../.claude/skills/hive-patterns
@@ -1 +0,0 @@
-../../.claude/skills/hive-test
@@ -1,34 +1,16 @@
 {
  "permissions": {
    "allow": [
-      "mcp__agent-builder__create_session",
-      "mcp__agent-builder__set_goal",
-      "mcp__agent-builder__add_node",
-      "mcp__agent-builder__add_edge",
-      "mcp__agent-builder__configure_loop",
-      "mcp__agent-builder__add_mcp_server",
-      "mcp__agent-builder__validate_graph",
-      "mcp__agent-builder__export_graph",
-      "mcp__agent-builder__load_session_by_id",
      "Bash(git status:*)",
      "Bash(gh run view:*)",
      "Bash(uv run:*)",
      "Bash(env:*)",
-      "mcp__agent-builder__test_node",
-      "mcp__agent-builder__list_mcp_tools",
      "Bash(python -m py_compile:*)",
      "Bash(python -m pytest:*)",
      "Bash(source:*)",
-      "mcp__agent-builder__update_node",
-      "mcp__agent-builder__check_missing_credentials",
-      "mcp__agent-builder__list_stored_credentials",
      "Bash(find:*)",
-      "mcp__agent-builder__run_tests",
-      "Bash(PYTHONPATH=core:exports:tools/src uv run pytest:*)",
-      "mcp__agent-builder__list_agent_sessions",
-      "mcp__agent-builder__generate_constraint_tests",
-      "mcp__agent-builder__generate_success_tests"
+      "Bash(PYTHONPATH=core:exports:tools/src uv run pytest:*)"
    ]
  },
-  "enabledMcpjsonServers": ["agent-builder", "tools"]
+  "enabledMcpjsonServers": ["tools"]
 }
@@ -1,399 +0,0 @@
---
-name: hive-concepts
-description: Core concepts for goal-driven agents - architecture, node types (event_loop, function), tool discovery, and workflow overview. Use when starting agent development or need to understand agent fundamentals.
-license: Apache-2.0
-metadata:
-  author: hive
-  version: "2.0"
-  type: foundational
-  part_of: hive
---
-
-# Building Agents - Core Concepts
-
-Foundational knowledge for building goal-driven agents as Python packages.
-
-## Architecture: Python Services (Not JSON Configs)
-
-Agents are built as Python packages:
-
-```
-exports/my_agent/
-├── __init__.py          # Package exports
-├── __main__.py          # CLI (run, info, validate, shell)
-├── agent.py             # Graph construction (goal, edges, agent class)
-├── nodes/__init__.py    # Node definitions (NodeSpec)
-├── config.py            # Runtime config
-└── README.md            # Documentation
-```
-
-**Key Principle: Agent is visible and editable during build**
-
- Files created immediately as components are approved
- User can watch files grow in their editor
- No session state - just direct file writes
- No "export" step - agent is ready when build completes
-
-## Core Concepts
-
-### Goal
-
-Success criteria and constraints (written to agent.py)
-
-```python
-goal = Goal(
-    id="research-goal",
-    name="Technical Research Agent",
-    description="Research technical topics thoroughly",
-    success_criteria=[
-        SuccessCriterion(
-            id="completeness",
-            description="Cover all aspects of topic",
-            metric="coverage_score",
-            target=">=0.9",
-            weight=0.4,
-        ),
-        # 3-5 success criteria total
-    ],
-    constraints=[
-        Constraint(
-            id="accuracy",
-            description="All information must be verified",
-            constraint_type="hard",
-            category="quality",
-        ),
-        # 1-5 constraints total
-    ],
-)
-```
-
-### Node
-
-Unit of work (written to nodes/__init__.py)
-
-**Node Types:**
-
- `event_loop` — Multi-turn streaming loop with tool execution and judge-based evaluation. Works with or without tools.
- `function` — Deterministic Python operations. No LLM involved.
-
-```python
-search_node = NodeSpec(
-    id="search-web",
-    name="Search Web",
-    description="Search for information and extract results",
-    node_type="event_loop",
-    input_keys=["query"],
-    output_keys=["search_results"],
-    system_prompt="Search the web for: {query}. Use the web_search tool to find results, then call set_output to store them.",
-    tools=["web_search"],
-)
-```
-
-**NodeSpec Fields for Event Loop Nodes:**
-
-| Field | Default | Description |
-|-------|---------|-------------|
-| `client_facing` | `False` | If True, streams output to user and blocks for input between turns |
-| `nullable_output_keys` | `[]` | Output keys that may remain unset (for mutually exclusive outputs) |
-| `max_node_visits` | `1` | Max times this node executes per run. Set >1 for feedback loop targets |
-
-### Edge
-
-Connection between nodes (written to agent.py)
-
-**Edge Conditions:**
-
- `on_success` — Proceed if node succeeds (most common)
- `on_failure` — Handle errors
- `always` — Always proceed
- `conditional` — Based on expression evaluating node output
-
-**Edge Priority:**
-
-Priority controls evaluation order when multiple edges leave the same node. Higher priority edges are evaluated first. Use negative priority for feedback edges (edges that loop back to earlier nodes).
-
-```python
-# Forward edge (evaluated first)
-EdgeSpec(
-    id="review-to-campaign",
-    source="review",
-    target="campaign-builder",
-    condition=EdgeCondition.CONDITIONAL,
-    condition_expr="output.get('approved_contacts') is not None",
-    priority=1,
-)
-
-# Feedback edge (evaluated after forward edges)
-EdgeSpec(
-    id="review-feedback",
-    source="review",
-    target="extractor",
-    condition=EdgeCondition.CONDITIONAL,
-    condition_expr="output.get('redo_extraction') is not None",
-    priority=-1,
-)
-```
-
-### Client-Facing Nodes
-
-For multi-turn conversations with the user, set `client_facing=True` on a node. The node will:
- Stream its LLM output directly to the end user
- Block for user input between conversational turns
- Resume when new input is injected via `inject_event()`
-
-```python
-intake_node = NodeSpec(
-    id="intake",
-    name="Intake",
-    description="Gather requirements from the user",
-    node_type="event_loop",
-    client_facing=True,
-    input_keys=[],
-    output_keys=["repo_url", "project_url"],
-    system_prompt="You are the intake agent. Ask the user for the repo URL and project URL.",
-)
-```
-
-> **Legacy Note:** The old `pause_nodes` / `entry_points` pattern still works but `client_facing=True` is preferred for new agents.
-
-**STEP 1 / STEP 2 Prompt Pattern:** For client-facing nodes, structure the system prompt with two explicit phases:
-
-```python
-system_prompt="""\
-**STEP 1 — Respond to the user (text only, NO tool calls):**
-[Present information, ask questions, etc.]
-
-**STEP 2 — After the user responds, call set_output:**
-[Call set_output with the structured outputs]
-"""
-```
-
-This prevents the LLM from calling `set_output` prematurely before the user has had a chance to respond.
-
-### Node Design: Fewer, Richer Nodes
-
-Prefer fewer nodes that do more work over many thin single-purpose nodes:
-
- **Bad**: 8 thin nodes (parse query → search → fetch → evaluate → synthesize → write → check → save)
- **Good**: 4 rich nodes (intake → research → review → report)
-
-Why: Each node boundary requires serializing outputs and passing context. Fewer nodes means the LLM retains full context of its work within the node. A research node that searches, fetches, and analyzes keeps all the source material in its conversation history.
-
-### nullable_output_keys for Cross-Edge Inputs
-
-When a node receives inputs that only arrive on certain edges (e.g., `feedback` only comes from a review → research feedback loop, not from intake → research), mark those keys as `nullable_output_keys`:
-
-```python
-research_node = NodeSpec(
-    id="research",
-    input_keys=["research_brief", "feedback"],
-    nullable_output_keys=["feedback"],  # Not present on first visit
-    max_node_visits=3,
-    ...
-)
-```
-
-## Event Loop Architecture Concepts
-
-### How EventLoopNode Works
-
-An event loop node runs a multi-turn loop:
-1. LLM receives system prompt + conversation history
-2. LLM responds (text and/or tool calls)
-3. Tool calls are executed, results added to conversation
-4. Judge evaluates: ACCEPT (exit loop), RETRY (loop again), or ESCALATE
-5. Repeat until judge ACCEPTs or max_iterations reached
-
-### EventLoopNode Runtime
-
-EventLoopNodes are **auto-created** by `GraphExecutor` at runtime. You do NOT need to manually register them. Both `GraphExecutor` (direct) and `AgentRuntime` / `create_agent_runtime()` handle event_loop nodes automatically.
-
-```python
-# Direct execution — executor auto-creates EventLoopNodes
-from framework.graph.executor import GraphExecutor
-from framework.runtime.core import Runtime
-
-runtime = Runtime(storage_path)
-executor = GraphExecutor(
-    runtime=runtime,
-    llm=llm,
-    tools=tools,
-    tool_executor=tool_executor,
-    storage_path=storage_path,
-)
-result = await executor.execute(graph=graph, goal=goal, input_data=input_data)
-
-# TUI execution — AgentRuntime also works
-from framework.runtime.agent_runtime import create_agent_runtime
-runtime = create_agent_runtime(
-    graph=graph, goal=goal, storage_path=storage_path,
-    entry_points=[...], llm=llm, tools=tools, tool_executor=tool_executor,
-)
-```
-
-### set_output
-
-Nodes produce structured outputs by calling `set_output(key, value)` — a synthetic tool injected by the framework. When the LLM calls `set_output`, the value is stored in the output accumulator and made available to downstream nodes via shared memory.
-
-`set_output` is NOT a real tool — it is excluded from `real_tool_results`. For client-facing nodes, this means a turn where the LLM only calls `set_output` (no other tools) is treated as a conversational boundary and will block for user input.
-
-### JudgeProtocol
-
-**The judge is the SOLE mechanism for acceptance decisions.** Do not add ad-hoc framework gating, output rollback, or premature rejection logic. If the LLM calls `set_output` too early, fix it with better prompts or a custom judge — not framework-level guards.
-
-The judge controls when a node's loop exits:
- **Implicit judge** (default, no judge configured): ACCEPTs when the LLM finishes with no tool calls and all required output keys are set
- **SchemaJudge**: Validates outputs against a Pydantic model
- **Custom judges**: Implement `evaluate(context) -> JudgeVerdict`
-
-### LoopConfig
-
-Controls loop behavior:
- `max_iterations` (default 50) — prevents infinite loops
- `max_tool_calls_per_turn` (default 10) — limits tool calls per LLM response
- `tool_call_overflow_margin` (default 0.5) — wiggle room before discarding extra tool calls (50% means hard cutoff at 150% of limit)
- `stall_detection_threshold` (default 3) — detects repeated identical responses
- `max_history_tokens` (default 32000) — triggers conversation compaction
-
-### Data Tools (Spillover Management)
-
-When tool results exceed the context window, the framework automatically saves them to a spillover directory and truncates with a hint. Nodes that produce or consume large data should include the data tools:
-
- `save_data(filename, data)` — Write data to a file in the data directory
- `load_data(filename, offset=0, limit=50)` — Read data with line-based pagination
- `list_data_files()` — List available data files
- `serve_file_to_user(filename, label="")` — Get a clickable file:// URI for the user
-
-Note: `data_dir` is a framework-injected context parameter — the LLM never sees or passes it. `GraphExecutor.execute()` sets it per-execution via `contextvars`, so data tools and spillover always share the same session-scoped directory.
-
-These are real MCP tools (not synthetic). Add them to nodes that handle large tool results:
-
-```python
-research_node = NodeSpec(
-    ...
-    tools=["web_search", "web_scrape", "load_data", "save_data", "list_data_files"],
-)
-```
-
-### Fan-Out / Fan-In
-
-Multiple ON_SUCCESS edges from the same source create parallel execution. All branches run concurrently via `asyncio.gather()`. Parallel event_loop nodes must have disjoint `output_keys`.
-
-### max_node_visits
-
-Controls how many times a node can execute in one graph run. Default is 1. Set higher for nodes that are targets of feedback edges (review-reject loops). Set 0 for unlimited (guarded by max_steps).
-
-## Tool Discovery & Validation
-
-**CRITICAL:** Before adding a node with tools, you MUST verify the tools exist.
-
-Tools are provided by MCP servers. Never assume a tool exists - always discover dynamically.
-
-### Step 1: Register MCP Server (if not already done)
-
-```python
-mcp__agent-builder__add_mcp_server(
-    name="tools",
-    transport="stdio",
-    command="python",
-    args='["mcp_server.py", "--stdio"]',
-    cwd="../tools"
-)
-```
-
-### Step 2: Discover Available Tools
-
-```python
-# List all tools from all registered servers
-mcp__agent-builder__list_mcp_tools()
-
-# Or list tools from a specific server
-mcp__agent-builder__list_mcp_tools(server_name="tools")
-```
-
-### Step 3: Validate Before Adding Nodes
-
-Before writing a node with `tools=[...]`:
-
-1. Call `list_mcp_tools()` to get available tools
-2. Check each tool in your node exists in the response
-3. If a tool doesn't exist:
-   - **DO NOT proceed** with the node
-   - Inform the user: "The tool 'X' is not available. Available tools are: ..."
-   - Ask if they want to use an alternative or proceed without the tool
-
-### Tool Validation Anti-Patterns
-
- **Never assume a tool exists** - always call `list_mcp_tools()` first
- **Never write a node with unverified tools** - validate before writing
- **Never silently drop tools** - if a tool doesn't exist, inform the user
- **Never guess tool names** - use exact names from discovery response
-
-## Workflow Overview: Incremental File Construction
-
-```
-1. CREATE PACKAGE → mkdir + write skeletons
-2. DEFINE GOAL → Write to agent.py + config.py
-3. FOR EACH NODE:
-   - Propose design (event_loop for LLM work, function for deterministic)
-   - User approves
-   - Write to nodes/__init__.py IMMEDIATELY
-   - (Optional) Validate with test_node
-4. CONNECT EDGES → Update agent.py
-   - Use priority for feedback edges (negative priority)
-   - (Optional) Validate with validate_graph
-5. FINALIZE → Write agent class to agent.py
-6. DONE - Agent ready at exports/my_agent/
-```
-
-**Files written immediately. MCP tools optional for validation/testing bookkeeping.**
-
-## When to Use This Skill
-
-Use hive-concepts when:
- Starting a new agent project and need to understand fundamentals
- Need to understand agent architecture before building
- Want to validate tool availability before proceeding
- Learning about node types, edges, and graph execution
-
-**Next Steps:**
- Ready to build? → Use `hive-create` skill
- Need patterns and examples? → Use `hive-patterns` skill
-
-## MCP Tools for Validation
-
-After writing files, optionally use MCP tools for validation:
-
-**test_node** - Validate node configuration with mock inputs
-```python
-mcp__agent-builder__test_node(
-    node_id="search-web",
-    test_input='{"query": "test query"}',
-    mock_llm_response='{"results": "mock output"}'
-)
-```
-
-**validate_graph** - Check graph structure
-```python
-mcp__agent-builder__validate_graph()
-# Returns: unreachable nodes, missing connections, event_loop validation, etc.
-```
-
-**configure_loop** - Set event loop parameters
-```python
-mcp__agent-builder__configure_loop(
-    max_iterations=50,
-    max_tool_calls_per_turn=10,
-    stall_detection_threshold=3,
-    max_history_tokens=32000
-)
-```
-
-**Key Point:** Files are written FIRST. MCP tools are for validation only.
-
-## Related Skills
-
- **hive-create** - Step-by-step building process
- **hive-patterns** - Best practices: judges, feedback edges, fan-out, context management
- **hive** - Complete workflow orchestrator
- **hive-test** - Test and validate completed agents
@@ -1,24 +0,0 @@
-"""
-Deep Research Agent - Interactive, rigorous research with TUI conversation.
-
-Research any topic through multi-source web search, quality evaluation,
-and synthesis. Features client-facing TUI interaction at key checkpoints
-for user guidance and iterative deepening.
-"""
-
-from .agent import DeepResearchAgent, default_agent, goal, nodes, edges
-from .config import RuntimeConfig, AgentMetadata, default_config, metadata
-
-__version__ = "1.0.0"
-
-__all__ = [
-    "DeepResearchAgent",
-    "default_agent",
-    "goal",
-    "nodes",
-    "edges",
-    "RuntimeConfig",
-    "AgentMetadata",
-    "default_config",
-    "metadata",
-]
@@ -1,241 +0,0 @@
-"""
-CLI entry point for Deep Research Agent.
-
-Uses AgentRuntime for multi-entrypoint support with HITL pause/resume.
-"""
-
-import asyncio
-import json
-import logging
-import sys
-import click
-
-from .agent import default_agent, DeepResearchAgent
-
-
-def setup_logging(verbose=False, debug=False):
-    """Configure logging for execution visibility."""
-    if debug:
-        level, fmt = logging.DEBUG, "%(asctime)s %(name)s: %(message)s"
-    elif verbose:
-        level, fmt = logging.INFO, "%(message)s"
-    else:
-        level, fmt = logging.WARNING, "%(levelname)s: %(message)s"
-    logging.basicConfig(level=level, format=fmt, stream=sys.stderr)
-    logging.getLogger("framework").setLevel(level)
-
-
-@click.group()
-@click.version_option(version="1.0.0")
-def cli():
-    """Deep Research Agent - Interactive, rigorous research with TUI conversation."""
-    pass
-
-
-@cli.command()
-@click.option("--topic", "-t", type=str, required=True, help="Research topic")
-@click.option("--mock", is_flag=True, help="Run in mock mode")
-@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
-@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
-@click.option("--debug", is_flag=True, help="Show debug logging")
-def run(topic, mock, quiet, verbose, debug):
-    """Execute research on a topic."""
-    if not quiet:
-        setup_logging(verbose=verbose, debug=debug)
-
-    context = {"topic": topic}
-
-    result = asyncio.run(default_agent.run(context, mock_mode=mock))
-
-    output_data = {
-        "success": result.success,
-        "steps_executed": result.steps_executed,
-        "output": result.output,
-    }
-    if result.error:
-        output_data["error"] = result.error
-
-    click.echo(json.dumps(output_data, indent=2, default=str))
-    sys.exit(0 if result.success else 1)
-
-
-@cli.command()
-@click.option("--mock", is_flag=True, help="Run in mock mode")
-@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
-@click.option("--debug", is_flag=True, help="Show debug logging")
-def tui(mock, verbose, debug):
-    """Launch the TUI dashboard for interactive research."""
-    setup_logging(verbose=verbose, debug=debug)
-
-    try:
-        from framework.tui.app import AdenTUI
-    except ImportError:
-        click.echo(
-            "TUI requires the 'textual' package. Install with: pip install textual"
-        )
-        sys.exit(1)
-
-    from pathlib import Path
-
-    from framework.llm import LiteLLMProvider
-    from framework.runner.tool_registry import ToolRegistry
-    from framework.runtime.agent_runtime import create_agent_runtime
-    from framework.runtime.event_bus import EventBus
-    from framework.runtime.execution_stream import EntryPointSpec
-
-    async def run_with_tui():
-        agent = DeepResearchAgent()
-
-        # Build graph and tools
-        agent._event_bus = EventBus()
-        agent._tool_registry = ToolRegistry()
-
-        storage_path = Path.home() / ".hive" / "agents" / "deep_research_agent"
-        storage_path.mkdir(parents=True, exist_ok=True)
-
-        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
-        if mcp_config_path.exists():
-            agent._tool_registry.load_mcp_config(mcp_config_path)
-
-        llm = None
-        if not mock:
-            llm = LiteLLMProvider(
-                model=agent.config.model,
-                api_key=agent.config.api_key,
-                api_base=agent.config.api_base,
-            )
-
-        tools = list(agent._tool_registry.get_tools().values())
-        tool_executor = agent._tool_registry.get_executor()
-        graph = agent._build_graph()
-
-        runtime = create_agent_runtime(
-            graph=graph,
-            goal=agent.goal,
-            storage_path=storage_path,
-            entry_points=[
-                EntryPointSpec(
-                    id="start",
-                    name="Start Research",
-                    entry_node="intake",
-                    trigger_type="manual",
-                    isolation_level="isolated",
-                ),
-            ],
-            llm=llm,
-            tools=tools,
-            tool_executor=tool_executor,
-        )
-
-        await runtime.start()
-
-        try:
-            app = AdenTUI(runtime)
-            await app.run_async()
-        finally:
-            await runtime.stop()
-
-    asyncio.run(run_with_tui())
-
-
-@cli.command()
-@click.option("--json", "output_json", is_flag=True)
-def info(output_json):
-    """Show agent information."""
-    info_data = default_agent.info()
-    if output_json:
-        click.echo(json.dumps(info_data, indent=2))
-    else:
-        click.echo(f"Agent: {info_data['name']}")
-        click.echo(f"Version: {info_data['version']}")
-        click.echo(f"Description: {info_data['description']}")
-        click.echo(f"\nNodes: {', '.join(info_data['nodes'])}")
-        click.echo(f"Client-facing: {', '.join(info_data['client_facing_nodes'])}")
-        click.echo(f"Entry: {info_data['entry_node']}")
-        click.echo(f"Terminal: {', '.join(info_data['terminal_nodes'])}")
-
-
-@cli.command()
-def validate():
-    """Validate agent structure."""
-    validation = default_agent.validate()
-    if validation["valid"]:
-        click.echo("Agent is valid")
-        if validation["warnings"]:
-            for warning in validation["warnings"]:
-                click.echo(f"  WARNING: {warning}")
-    else:
-        click.echo("Agent has errors:")
-        for error in validation["errors"]:
-            click.echo(f"  ERROR: {error}")
-    sys.exit(0 if validation["valid"] else 1)
-
-
-@cli.command()
-@click.option("--verbose", "-v", is_flag=True)
-def shell(verbose):
-    """Interactive research session (CLI, no TUI)."""
-    asyncio.run(_interactive_shell(verbose))
-
-
-async def _interactive_shell(verbose=False):
-    """Async interactive shell."""
-    setup_logging(verbose=verbose)
-
-    click.echo("=== Deep Research Agent ===")
-    click.echo("Enter a topic to research (or 'quit' to exit):\n")
-
-    agent = DeepResearchAgent()
-    await agent.start()
-
-    try:
-        while True:
-            try:
-                topic = await asyncio.get_event_loop().run_in_executor(
-                    None, input, "Topic> "
-                )
-                if topic.lower() in ["quit", "exit", "q"]:
-                    click.echo("Goodbye!")
-                    break
-
-                if not topic.strip():
-                    continue
-
-                click.echo("\nResearching...\n")
-
-                result = await agent.trigger_and_wait("start", {"topic": topic})
-
-                if result is None:
-                    click.echo("\n[Execution timed out]\n")
-                    continue
-
-                if result.success:
-                    output = result.output
-                    if "report_content" in output:
-                        click.echo("\n--- Report ---\n")
-                        click.echo(output["report_content"])
-                        click.echo("\n")
-                    if "references" in output:
-                        click.echo("--- References ---\n")
-                        for ref in output.get("references", []):
-                            click.echo(
-                                f"  [{ref.get('number', '?')}] {ref.get('title', '')} - {ref.get('url', '')}"
-                            )
-                        click.echo("\n")
-                else:
-                    click.echo(f"\nResearch failed: {result.error}\n")
-
-            except KeyboardInterrupt:
-                click.echo("\nGoodbye!")
-                break
-            except Exception as e:
-                click.echo(f"Error: {e}", err=True)
-                import traceback
-
-                traceback.print_exc()
-    finally:
-        await agent.stop()
-
-
-if __name__ == "__main__":
-    cli()
@@ -1,358 +0,0 @@
-"""Agent graph construction for Deep Research Agent."""
-
-from pathlib import Path
-
-from framework.graph import EdgeSpec, EdgeCondition, Goal, SuccessCriterion, Constraint
-from framework.graph.edge import GraphSpec
-from framework.graph.executor import ExecutionResult
-from framework.graph.checkpoint_config import CheckpointConfig
-from framework.llm import LiteLLMProvider
-from framework.runner.tool_registry import ToolRegistry
-from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
-from framework.runtime.execution_stream import EntryPointSpec
-
-from .config import default_config, metadata
-from .nodes import (
-    intake_node,
-    research_node,
-    review_node,
-    report_node,
-)
-
-# Goal definition
-goal = Goal(
-    id="rigorous-interactive-research",
-    name="Rigorous Interactive Research",
-    description=(
-        "Research any topic by searching diverse sources, analyzing findings, "
-        "and producing a cited report — with user checkpoints to guide direction."
-    ),
-    success_criteria=[
-        SuccessCriterion(
-            id="source-diversity",
-            description="Use multiple diverse, authoritative sources",
-            metric="source_count",
-            target=">=5",
-            weight=0.25,
-        ),
-        SuccessCriterion(
-            id="citation-coverage",
-            description="Every factual claim in the report cites its source",
-            metric="citation_coverage",
-            target="100%",
-            weight=0.25,
-        ),
-        SuccessCriterion(
-            id="user-satisfaction",
-            description="User reviews findings before report generation",
-            metric="user_approval",
-            target="true",
-            weight=0.25,
-        ),
-        SuccessCriterion(
-            id="report-completeness",
-            description="Final report answers the original research questions",
-            metric="question_coverage",
-            target="90%",
-            weight=0.25,
-        ),
-    ],
-    constraints=[
-        Constraint(
-            id="no-hallucination",
-            description="Only include information found in fetched sources",
-            constraint_type="quality",
-            category="accuracy",
-        ),
-        Constraint(
-            id="source-attribution",
-            description="Every claim must cite its source with a numbered reference",
-            constraint_type="quality",
-            category="accuracy",
-        ),
-        Constraint(
-            id="user-checkpoint",
-            description="Present findings to the user before writing the final report",
-            constraint_type="functional",
-            category="interaction",
-        ),
-    ],
-)
-
-# Node list
-nodes = [
-    intake_node,
-    research_node,
-    review_node,
-    report_node,
-]
-
-# Edge definitions
-edges = [
-    # intake -> research
-    EdgeSpec(
-        id="intake-to-research",
-        source="intake",
-        target="research",
-        condition=EdgeCondition.ON_SUCCESS,
-        priority=1,
-    ),
-    # research -> review
-    EdgeSpec(
-        id="research-to-review",
-        source="research",
-        target="review",
-        condition=EdgeCondition.ON_SUCCESS,
-        priority=1,
-    ),
-    # review -> research (feedback loop)
-    EdgeSpec(
-        id="review-to-research-feedback",
-        source="review",
-        target="research",
-        condition=EdgeCondition.CONDITIONAL,
-        condition_expr="needs_more_research == True",
-        priority=1,
-    ),
-    # review -> report (user satisfied)
-    EdgeSpec(
-        id="review-to-report",
-        source="review",
-        target="report",
-        condition=EdgeCondition.CONDITIONAL,
-        condition_expr="needs_more_research == False",
-        priority=2,
-    ),
-    # report -> research (user wants deeper research on current topic)
-    EdgeSpec(
-        id="report-to-research",
-        source="report",
-        target="research",
-        condition=EdgeCondition.CONDITIONAL,
-        condition_expr="str(next_action).lower() == 'more_research'",
-        priority=2,
-    ),
-    # report -> intake (user wants a new topic — default when not more_research)
-    EdgeSpec(
-        id="report-to-intake",
-        source="report",
-        target="intake",
-        condition=EdgeCondition.CONDITIONAL,
-        condition_expr="str(next_action).lower() != 'more_research'",
-        priority=1,
-    ),
-]
-
-# Graph configuration
-entry_node = "intake"
-entry_points = {"start": "intake"}
-pause_nodes = []
-terminal_nodes = []
-
-
-class DeepResearchAgent:
-    """
-    Deep Research Agent — 4-node pipeline with user checkpoints.
-
-    Flow: intake -> research -> review -> report
-                      ^           |
-                      +-- feedback loop (if user wants more)
-
-    Uses AgentRuntime for proper session management:
-    - Session-scoped storage (sessions/{session_id}/)
-    - Checkpointing for resume capability
-    - Runtime logging
-    - Data folder for save_data/load_data
-    """
-
-    def __init__(self, config=None):
-        self.config = config or default_config
-        self.goal = goal
-        self.nodes = nodes
-        self.edges = edges
-        self.entry_node = entry_node
-        self.entry_points = entry_points
-        self.pause_nodes = pause_nodes
-        self.terminal_nodes = terminal_nodes
-        self._graph: GraphSpec | None = None
-        self._agent_runtime: AgentRuntime | None = None
-        self._tool_registry: ToolRegistry | None = None
-        self._storage_path: Path | None = None
-
-    def _build_graph(self) -> GraphSpec:
-        """Build the GraphSpec."""
-        return GraphSpec(
-            id="deep-research-agent-graph",
-            goal_id=self.goal.id,
-            version="1.0.0",
-            entry_node=self.entry_node,
-            entry_points=self.entry_points,
-            terminal_nodes=self.terminal_nodes,
-            pause_nodes=self.pause_nodes,
-            nodes=self.nodes,
-            edges=self.edges,
-            default_model=self.config.model,
-            max_tokens=self.config.max_tokens,
-            loop_config={
-                "max_iterations": 100,
-                "max_tool_calls_per_turn": 30,
-                "max_history_tokens": 32000,
-            },
-            conversation_mode="continuous",
-            identity_prompt=(
-                "You are a rigorous research agent. You search for information "
-                "from diverse, authoritative sources, analyze findings critically, "
-                "and produce well-cited reports. You never fabricate information — "
-                "every claim must trace back to a source you actually retrieved."
-            ),
-        )
-
-    def _setup(self, mock_mode=False) -> None:
-        """Set up the agent runtime with sessions, checkpoints, and logging."""
-        self._storage_path = Path.home() / ".hive" / "agents" / "deep_research_agent"
-        self._storage_path.mkdir(parents=True, exist_ok=True)
-
-        self._tool_registry = ToolRegistry()
-
-        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
-        if mcp_config_path.exists():
-            self._tool_registry.load_mcp_config(mcp_config_path)
-
-        llm = None
-        if not mock_mode:
-            llm = LiteLLMProvider(
-                model=self.config.model,
-                api_key=self.config.api_key,
-                api_base=self.config.api_base,
-            )
-
-        tool_executor = self._tool_registry.get_executor()
-        tools = list(self._tool_registry.get_tools().values())
-
-        self._graph = self._build_graph()
-
-        checkpoint_config = CheckpointConfig(
-            enabled=True,
-            checkpoint_on_node_start=False,
-            checkpoint_on_node_complete=True,
-            checkpoint_max_age_days=7,
-            async_checkpoint=True,
-        )
-
-        entry_point_specs = [
-            EntryPointSpec(
-                id="default",
-                name="Default",
-                entry_node=self.entry_node,
-                trigger_type="manual",
-                isolation_level="shared",
-            )
-        ]
-
-        self._agent_runtime = create_agent_runtime(
-            graph=self._graph,
-            goal=self.goal,
-            storage_path=self._storage_path,
-            entry_points=entry_point_specs,
-            llm=llm,
-            tools=tools,
-            tool_executor=tool_executor,
-            checkpoint_config=checkpoint_config,
-        )
-
-    async def start(self, mock_mode=False) -> None:
-        """Set up and start the agent runtime."""
-        if self._agent_runtime is None:
-            self._setup(mock_mode=mock_mode)
-        if not self._agent_runtime.is_running:
-            await self._agent_runtime.start()
-
-    async def stop(self) -> None:
-        """Stop the agent runtime and clean up."""
-        if self._agent_runtime and self._agent_runtime.is_running:
-            await self._agent_runtime.stop()
-        self._agent_runtime = None
-
-    async def trigger_and_wait(
-        self,
-        entry_point: str = "default",
-        input_data: dict | None = None,
-        timeout: float | None = None,
-        session_state: dict | None = None,
-    ) -> ExecutionResult | None:
-        """Execute the graph and wait for completion."""
-        if self._agent_runtime is None:
-            raise RuntimeError("Agent not started. Call start() first.")
-
-        return await self._agent_runtime.trigger_and_wait(
-            entry_point_id=entry_point,
-            input_data=input_data or {},
-            session_state=session_state,
-        )
-
-    async def run(
-        self, context: dict, mock_mode=False, session_state=None
-    ) -> ExecutionResult:
-        """Run the agent (convenience method for single execution)."""
-        await self.start(mock_mode=mock_mode)
-        try:
-            result = await self.trigger_and_wait(
-                "default", context, session_state=session_state
-            )
-            return result or ExecutionResult(success=False, error="Execution timeout")
-        finally:
-            await self.stop()
-
-    def info(self):
-        """Get agent information."""
-        return {
-            "name": metadata.name,
-            "version": metadata.version,
-            "description": metadata.description,
-            "goal": {
-                "name": self.goal.name,
-                "description": self.goal.description,
-            },
-            "nodes": [n.id for n in self.nodes],
-            "edges": [e.id for e in self.edges],
-            "entry_node": self.entry_node,
-            "entry_points": self.entry_points,
-            "pause_nodes": self.pause_nodes,
-            "terminal_nodes": self.terminal_nodes,
-            "client_facing_nodes": [n.id for n in self.nodes if n.client_facing],
-        }
-
-    def validate(self):
-        """Validate agent structure."""
-        errors = []
-        warnings = []
-
-        node_ids = {node.id for node in self.nodes}
-        for edge in self.edges:
-            if edge.source not in node_ids:
-                errors.append(f"Edge {edge.id}: source '{edge.source}' not found")
-            if edge.target not in node_ids:
-                errors.append(f"Edge {edge.id}: target '{edge.target}' not found")
-
-        if self.entry_node not in node_ids:
-            errors.append(f"Entry node '{self.entry_node}' not found")
-
-        for terminal in self.terminal_nodes:
-            if terminal not in node_ids:
-                errors.append(f"Terminal node '{terminal}' not found")
-
-        for ep_id, node_id in self.entry_points.items():
-            if node_id not in node_ids:
-                errors.append(
-                    f"Entry point '{ep_id}' references unknown node '{node_id}'"
-                )
-
-        return {
-            "valid": len(errors) == 0,
-            "errors": errors,
-            "warnings": warnings,
-        }
-
-
-# Create default instance
-default_agent = DeepResearchAgent()
@@ -1,26 +0,0 @@
-"""Runtime configuration."""
-
-from dataclasses import dataclass
-
-from framework.config import RuntimeConfig
-
-default_config = RuntimeConfig()
-
-
-@dataclass
-class AgentMetadata:
-    name: str = "Deep Research Agent"
-    version: str = "1.0.0"
-    description: str = (
-        "Interactive research agent that rigorously investigates topics through "
-        "multi-source search, quality evaluation, and synthesis - with TUI conversation "
-        "at key checkpoints for user guidance and feedback."
-    )
-    intro_message: str = (
-        "Hi! I'm your deep research assistant. Tell me a topic and I'll investigate it "
-        "thoroughly — searching multiple sources, evaluating quality, and synthesizing "
-        "a comprehensive report. What would you like me to research?"
-    )
-
-
-metadata = AgentMetadata()
@@ -1,9 +0,0 @@
-{
-  "hive-tools": {
-    "transport": "stdio",
-    "command": "uv",
-    "args": ["run", "python", "mcp_server.py", "--stdio"],
-    "cwd": "../../tools",
-    "description": "Hive tools MCP server providing web_search, web_scrape, and write_to_file"
-  }
-}
@@ -1,213 +0,0 @@
-"""Node definitions for Deep Research Agent."""
-
-from framework.graph import NodeSpec
-
-# Node 1: Intake (client-facing)
-# Brief conversation to clarify what the user wants researched.
-intake_node = NodeSpec(
-    id="intake",
-    name="Research Intake",
-    description="Discuss the research topic with the user, clarify scope, and confirm direction",
-    node_type="event_loop",
-    client_facing=True,
-    max_node_visits=0,
-    input_keys=["topic"],
-    output_keys=["research_brief"],
-    success_criteria=(
-        "The research brief is specific and actionable: it states the topic, "
-        "the key questions to answer, the desired scope, and depth."
-    ),
-    system_prompt="""\
-You are a research intake specialist. The user wants to research a topic.
-Have a brief conversation to clarify what they need.
-
-**STEP 1 — Read and respond (text only, NO tool calls):**
-1. Read the topic provided
-2. If it's vague, ask 1-2 clarifying questions (scope, angle, depth)
-3. If it's already clear, confirm your understanding and ask the user to confirm
-
-Keep it short. Don't over-ask.
-
-**STEP 2 — After the user confirms, call set_output:**
-- set_output("research_brief", "A clear paragraph describing exactly what to research, \
-what questions to answer, what scope to cover, and how deep to go.")
-""",
-    tools=[],
-)
-
-# Node 2: Research
-# The workhorse — searches the web, fetches content, analyzes sources.
-# One node with both tools avoids the context-passing overhead of 5 separate nodes.
-research_node = NodeSpec(
-    id="research",
-    name="Research",
-    description="Search the web, fetch source content, and compile findings",
-    node_type="event_loop",
-    max_node_visits=0,
-    input_keys=["research_brief", "feedback"],
-    output_keys=["findings", "sources", "gaps"],
-    nullable_output_keys=["feedback"],
-    success_criteria=(
-        "Findings reference at least 3 distinct sources with URLs. "
-        "Key claims are substantiated by fetched content, not generated."
-    ),
-    system_prompt="""\
-You are a research agent. Given a research brief, find and analyze sources.
-
-If feedback is provided, this is a follow-up round — focus on the gaps identified.
-
-Work in phases:
-1. **Search**: Use web_search with 3-5 diverse queries covering different angles.
-   Prioritize authoritative sources (.edu, .gov, established publications).
-2. **Fetch**: Use web_scrape on the most promising URLs (aim for 5-8 sources).
-   Skip URLs that fail. Extract the substantive content.
-3. **Analyze**: Review what you've collected. Identify key findings, themes,
-   and any contradictions between sources.
-
-Important:
-- Work in batches of 3-4 tool calls at a time — never more than 10 per turn
-- After each batch, assess whether you have enough material
-- Prefer quality over quantity — 5 good sources beat 15 thin ones
-- Track which URL each finding comes from (you'll need citations later)
-- Call set_output for each key in a SEPARATE turn (not in the same turn as other tool calls)
-
-Context management:
-- Your tool results are automatically saved to files. After compaction, the file \
-references remain in the conversation — use load_data() to recover any content you need.
-- Use append_data('research_notes.md', ...) to maintain a running log of key findings \
-as you go. This survives compaction and helps the report node produce a detailed report.
-
-When done, use set_output (one key at a time, separate turns):
-- set_output("findings", "Structured summary: key findings with source URLs for each claim. \
-Include themes, contradictions, and confidence levels.")
-- set_output("sources", [{"url": "...", "title": "...", "summary": "..."}])
-- set_output("gaps", "What aspects of the research brief are NOT well-covered yet, if any.")
-""",
-    tools=[
-        "web_search",
-        "web_scrape",
-        "load_data",
-        "save_data",
-        "append_data",
-        "list_data_files",
-    ],
-)
-
-# Node 3: Review (client-facing)
-# Shows the user what was found and asks whether to dig deeper or proceed.
-review_node = NodeSpec(
-    id="review",
-    name="Review Findings",
-    description="Present findings to user and decide whether to research more or write the report",
-    node_type="event_loop",
-    client_facing=True,
-    max_node_visits=0,
-    input_keys=["findings", "sources", "gaps", "research_brief"],
-    output_keys=["needs_more_research", "feedback"],
-    success_criteria=(
-        "The user has been presented with findings and has explicitly indicated "
-        "whether they want more research or are ready for the report."
-    ),
-    system_prompt="""\
-Present the research findings to the user clearly and concisely.
-
-**STEP 1 — Present (your first message, text only, NO tool calls):**
-1. **Summary** (2-3 sentences of what was found)
-2. **Key Findings** (bulleted, with confidence levels)
-3. **Sources Used** (count and quality assessment)
-4. **Gaps** (what's still unclear or under-covered)
-
-End by asking: Are they satisfied, or do they want deeper research? \
-Should we proceed to writing the final report?
-
-**STEP 2 — After the user responds, call set_output:**
-- set_output("needs_more_research", "true")  — if they want more
-- set_output("needs_more_research", "false") — if they're satisfied
-- set_output("feedback", "What the user wants explored further, or empty string")
-""",
-    tools=[],
-)
-
-# Node 4: Report (client-facing)
-# Writes an HTML report, serves the link to the user, and answers follow-ups.
-report_node = NodeSpec(
-    id="report",
-    name="Write & Deliver Report",
-    description="Write a cited HTML report from the findings and present it to the user",
-    node_type="event_loop",
-    client_facing=True,
-    max_node_visits=0,
-    input_keys=["findings", "sources", "research_brief"],
-    output_keys=["delivery_status", "next_action"],
-    success_criteria=(
-        "An HTML report has been saved, the file link has been presented to the user, "
-        "and the user has indicated what they want to do next."
-    ),
-    system_prompt="""\
-Write a research report as an HTML file and present it to the user.
-
-IMPORTANT: save_data requires TWO separate arguments: filename and data.
-Call it like: save_data(filename="report.html", data="<html>...</html>")
-Do NOT use _raw, do NOT nest arguments inside a JSON string.
-
-**STEP 1 — Write and save the HTML report (tool calls, NO text to user yet):**
-
-Build a clean HTML document. Keep the HTML concise — aim for clarity over length.
-Use minimal embedded CSS (a few lines of style, not a full framework).
-
-Report structure:
-- Title & date
-- Executive Summary (2-3 paragraphs)
-- Key Findings (organized by theme, with [n] citation links)
-- Analysis (synthesis, implications)
-- Conclusion (key takeaways)
-- References (numbered list with clickable URLs)
-
-Requirements:
-- Every factual claim must cite its source with [n] notation
-- Be objective — present multiple viewpoints where sources disagree
-- Answer the original research questions from the brief
-- If findings appear incomplete or summarized, call list_data_files() and load_data() \
-to access the detailed source material from the research phase. The research node's \
-tool results and research_notes.md contain the full data.
-
-Save the HTML:
-  save_data(filename="report.html", data="<html>...</html>")
-
-Then get the clickable link:
-  serve_file_to_user(filename="report.html", label="Research Report")
-
-If save_data fails, simplify and shorten the HTML, then retry.
-
-**STEP 2 — Present the link to the user (text only, NO tool calls):**
-
-Tell the user the report is ready and include the file:// URI from
-serve_file_to_user so they can click it to open. Give a brief summary
-of what the report covers. Ask if they have questions or want to continue.
-
-**STEP 3 — After the user responds:**
-- Answer any follow-up questions from the research material
-- When the user is ready to move on, ask what they'd like to do next:
-  - Research a new topic?
-  - Dig deeper into the current topic?
-- Then call set_output:
-  - set_output("delivery_status", "completed")
-  - set_output("next_action", "new_topic")       — if they want a new topic
-  - set_output("next_action", "more_research")   — if they want deeper research
-""",
-    tools=[
-        "save_data",
-        "append_data",
-        "edit_data",
-        "serve_file_to_user",
-        "load_data",
-        "list_data_files",
-    ],
-)
-
-__all__ = [
-    "intake_node",
-    "research_node",
-    "review_node",
-    "report_node",
-]
@@ -1,640 +0,0 @@
----
-name: hive-credentials
-description: Set up and install credentials for an agent. Detects missing credentials from agent config, collects them from the user, and stores them securely in the local encrypted store at ~/.hive/credentials.
-license: Apache-2.0
-metadata:
-  author: hive
-  version: "2.3"
-  type: utility
----
-
-# Setup Credentials
-
-Interactive credential setup for agents with multiple authentication options. Detects what's missing, offers auth method choices, validates with health checks, and stores credentials securely.
-
-## When to Use
-
-- Before running or testing an agent for the first time
-- When `AgentRunner.run()` fails with "missing required credentials"
-- When a user asks to configure credentials for an agent
-- After building a new agent that uses tools requiring API keys
-
-## Workflow
-
-### Step 1: Identify the Agent
-
-Determine which agent needs credentials. The user will either:
-
-- Name the agent directly (e.g., "set up credentials for hubspot-agent")
-- Have an agent directory open (check `exports/` for agent dirs)
-- Be working on an agent in the current session
-
-Locate the agent's directory under `exports/{agent_name}/`.
-
-### Step 2: Detect Missing Credentials
-
-Use the `check_missing_credentials` MCP tool to detect what the agent needs and what's already configured. This tool loads the agent, inspects its required tools and node types, maps them to credentials via `CREDENTIAL_SPECS`, and checks both the encrypted store and environment variables.
-
-```
-check_missing_credentials(agent_path="exports/{agent_name}")
-```
-
-The tool returns a JSON response:
-
-```json
-{
-  "agent": "exports/{agent_name}",
-  "missing": [
-    {
-      "credential_name": "brave_search",
-      "env_var": "BRAVE_SEARCH_API_KEY",
-      "description": "Brave Search API key for web search",
-      "help_url": "https://brave.com/search/api/",
-      "tools": ["web_search"]
-    }
-  ],
-  "available": [
-    {
-      "credential_name": "anthropic",
-      "env_var": "ANTHROPIC_API_KEY",
-      "source": "encrypted_store"
-    }
-  ],
-  "total_missing": 1,
-  "ready": false
-}
-```
-
-**If `ready` is true (nothing missing):** Report all credentials as configured and skip Steps 3-5. Example:
-
-```
-All required credentials are already configured:
-  ✓ anthropic (ANTHROPIC_API_KEY)
-  ✓ brave_search (BRAVE_SEARCH_API_KEY)
-Your agent is ready to run!
-```
-
-**If credentials are missing:** Continue to Step 3 with the `missing` list.
-
-### Step 3: Present Auth Options for Each Missing Credential
-
-For each missing credential, check what authentication methods are available:
-
-```python
-from aden_tools.credentials import CREDENTIAL_SPECS
-
-spec = CREDENTIAL_SPECS.get("hubspot")
-if spec:
-    # Determine available auth options
-    auth_options = []
-    if spec.aden_supported:
-        auth_options.append("aden")
-    if spec.direct_api_key_supported:
-        auth_options.append("direct")
-    auth_options.append("custom")  # Always available
-
-    # Get setup info
-    setup_info = {
-        "env_var": spec.env_var,
-        "description": spec.description,
-        "help_url": spec.help_url,
-        "api_key_instructions": spec.api_key_instructions,
-    }
-```
-
-Present the available options using AskUserQuestion:
-
-```
-Choose how to configure HUBSPOT_ACCESS_TOKEN:
-
-  1) Aden Platform (OAuth) (Recommended)
-     Secure OAuth2 flow via hive.adenhq.com
-     - Quick setup with automatic token refresh
-     - No need to manage API keys manually
-
-  2) Direct API Key
-     Enter your own API key manually
-     - Requires creating a HubSpot Private App
-     - Full control over scopes and permissions
-
-  3) Local Credential Setup (Advanced)
-     Programmatic configuration for CI/CD
-     - For automated deployments
-     - Requires manual API calls
-```
-
-### Step 4: Execute Auth Flow Based on User Choice
-
-#### Prerequisite: Ensure HIVE_CREDENTIAL_KEY Is Available
-
-Before storing any credentials, verify `HIVE_CREDENTIAL_KEY` is set (needed to encrypt/decrypt the local store). Check both the current session and shell config:
-
-```bash
-# Check current session
-printenv HIVE_CREDENTIAL_KEY > /dev/null 2>&1 && echo "session: set" || echo "session: not set"
-
-# Check shell config files
-for f in ~/.zshrc ~/.bashrc ~/.profile; do [ -f "$f" ] && grep -q 'HIVE_CREDENTIAL_KEY' "$f" && echo "$f"; done
-```
-
-- **In current session** — proceed to store credentials
-- **In shell config but NOT in current session** — run `source ~/.zshrc` (or `~/.bashrc`) first, then proceed
-- **Not set anywhere** — `EncryptedFileStorage` will auto-generate one. After storing, tell the user to persist it: `export HIVE_CREDENTIAL_KEY="{generated_key}"` in their shell profile
-
-> **⚠️ IMPORTANT: After adding `HIVE_CREDENTIAL_KEY` to the user's shell config, always display:**
-> ```
-> ⚠️  Environment variables were added to your shell config.
->     Open a NEW TERMINAL for them to take effect outside this session.
-> ```
-
-#### Option 1: Aden Platform (OAuth)
-
-This is the recommended flow for supported integrations (HubSpot, etc.).
-
-**How Aden OAuth Works:**
-
-The ADEN_API_KEY represents a user who has already completed OAuth authorization on Aden's platform. When users sign up and connect integrations on Aden, those OAuth tokens are stored server-side. Having an ADEN_API_KEY means:
-
-1. User has an Aden account
-2. User has already authorized integrations (HubSpot, etc.) via OAuth on Aden
-3. We just need to sync those credentials down to the local credential store
-
-**4.1a. Check for ADEN_API_KEY**
-
-```python
-import os
-aden_key = os.environ.get("ADEN_API_KEY")
-```
-
-If not set, guide user to get one from Aden (this is where they do OAuth):
-
-```python
-from aden_tools.credentials import open_browser, get_aden_setup_url
-
-# Open browser to Aden - user will sign up and connect integrations there
-url = get_aden_setup_url()  # https://hive.adenhq.com
-success, msg = open_browser(url)
-
-print("Please sign in to Aden and connect your integrations (HubSpot, etc.).")
-print("Once done, copy your API key and return here.")
-```
-
-Ask user to provide the ADEN_API_KEY they received.
-
-**4.1b. Save ADEN_API_KEY to Shell Config**
-
-With user approval, persist ADEN_API_KEY to their shell config:
-
-```python
-from aden_tools.credentials import (
-    detect_shell,
-    add_env_var_to_shell_config,
-    get_shell_source_command,
-)
-
-shell_type = detect_shell()  # 'bash', 'zsh', or 'unknown'
-
-# Ask user for approval before modifying shell config
-# If approved:
-success, config_path = add_env_var_to_shell_config(
-    "ADEN_API_KEY",
-    user_provided_key,
-    comment="Aden Platform (OAuth) API key"
-)
-
-if success:
-    source_cmd = get_shell_source_command()
-    print(f"Saved to {config_path}")
-    print(f"Run: {source_cmd}")
-```
-
-> **⚠️ IMPORTANT: After adding `ADEN_API_KEY` to the user's shell config, always display:**
-> ```
-> ⚠️  Environment variables were added to your shell config.
->     Open a NEW TERMINAL for them to take effect outside this session.
-> ```
-
-Also save to `~/.hive/configuration.json` for the framework:
-
-```python
-import json
-from pathlib import Path
-
-config_path = Path.home() / ".hive" / "configuration.json"
-config = json.loads(config_path.read_text()) if config_path.exists() else {}
-
-config["aden"] = {
-    "api_key_configured": True,
-    "api_url": "https://api.adenhq.com"
-}
-
-config_path.parent.mkdir(parents=True, exist_ok=True)
-config_path.write_text(json.dumps(config, indent=2))
-```
-
-**4.1c. Sync Credentials from Aden Server**
-
-Since the user has already authorized integrations on Aden, use the one-liner factory method:
-
-```python
-from core.framework.credentials import CredentialStore
-
-# This single call handles everything:
-# - Creates encrypted local storage at ~/.hive/credentials
-# - Configures Aden client from ADEN_API_KEY env var
-# - Syncs all credentials from Aden server automatically
-store = CredentialStore.with_aden_sync(
-    base_url="https://api.adenhq.com",
-    auto_sync=True,  # Syncs on creation
-)
-
-# Check what was synced
-synced = store.list_credentials()
-print(f"Synced credentials: {synced}")
-
-# If the required credential wasn't synced, the user hasn't authorized it on Aden yet
-if "hubspot" not in synced:
-    print("HubSpot not found in your Aden account.")
-    print("Please visit https://hive.adenhq.com to connect HubSpot, then try again.")
-```
-
-For more control over the sync process:
-
-```python
-from core.framework.credentials import CredentialStore
-from core.framework.credentials.aden import (
-    AdenCredentialClient,
-    AdenClientConfig,
-    AdenSyncProvider,
-)
-
-# Create client (API key loaded from ADEN_API_KEY env var)
-client = AdenCredentialClient(AdenClientConfig(
-    base_url="https://api.adenhq.com",
-))
-
-# Create provider and store
-provider = AdenSyncProvider(client=client)
-store = CredentialStore.with_encrypted_storage()
-
-# Manual sync
-synced_count = provider.sync_all(store)
-print(f"Synced {synced_count} credentials from Aden")
-```
-
-**4.1d. Run Health Check**
-
-```python
-from aden_tools.credentials import check_credential_health
-
-# Get the token from the store
-cred = store.get_credential("hubspot")
-token = cred.keys["access_token"].value.get_secret_value()
-
-result = check_credential_health("hubspot", token)
-if result.valid:
-    print("HubSpot credentials validated successfully!")
-else:
-    print(f"Validation failed: {result.message}")
-    # Offer to retry the OAuth flow
-```
-
-#### Option 2: Direct API Key
-
-For users who prefer manual API key management.
-
-**4.2a. Show Setup Instructions**
-
-```python
-from aden_tools.credentials import CREDENTIAL_SPECS
-
-spec = CREDENTIAL_SPECS.get("hubspot")
-if spec and spec.api_key_instructions:
-    print(spec.api_key_instructions)
-# Output:
-# To get a HubSpot Private App token:
-# 1. Go to HubSpot Settings > Integrations > Private Apps
-# 2. Click "Create a private app"
-# 3. Name your app (e.g., "Hive Agent")
-# ...
-
-if spec and spec.help_url:
-    print(f"More info: {spec.help_url}")
-```
-
-**4.2b. Collect API Key from User**
-
-Use AskUserQuestion to securely collect the API key:
-
-```
-Please provide your HubSpot access token:
-(This will be stored securely in ~/.hive/credentials)
-```
-
-**4.2c. Run Health Check Before Storing**
-
-```python
-from aden_tools.credentials import check_credential_health
-
-result = check_credential_health("hubspot", user_provided_token)
-if not result.valid:
-    print(f"Warning: {result.message}")
-    # Ask user if they want to:
-    # 1. Try a different token
-    # 2. Continue anyway (not recommended)
-```
-
-**4.2d. Store in Local Encrypted Store**
-
-```python
-from core.framework.credentials import CredentialStore, CredentialObject, CredentialKey
-from pydantic import SecretStr
-
-store = CredentialStore.with_encrypted_storage()
-
-cred = CredentialObject(
-    id="hubspot",
-    name="HubSpot Access Token",
-    keys={
-        "access_token": CredentialKey(
-            name="access_token",
-            value=SecretStr(user_provided_token),
-        )
-    },
-)
-store.save_credential(cred)
-```
-
-**4.2e. Export to Current Session**
-
-```bash
-export HUBSPOT_ACCESS_TOKEN="the-value"
-```
-
-#### Option 3: Local Credential Setup (Advanced)
-
-For programmatic/CI/CD setups.
-
-**4.3a. Show Documentation**
-
-```
-For advanced credential management, you can use the CredentialStore API directly:
-
-  from core.framework.credentials import CredentialStore, CredentialObject, CredentialKey
-  from pydantic import SecretStr
-
-  store = CredentialStore.with_encrypted_storage()
-
-  cred = CredentialObject(
-      id="hubspot",
-      name="HubSpot Access Token",
-      keys={"access_token": CredentialKey(name="access_token", value=SecretStr("..."))}
-  )
-  store.save_credential(cred)
-
-For CI/CD environments:
-  - Set HIVE_CREDENTIAL_KEY for encryption
-  - Pre-populate ~/.hive/credentials programmatically
-  - Or use environment variables directly (HUBSPOT_ACCESS_TOKEN)
-
-Documentation: See core/framework/credentials/README.md
-```
-
-### Step 5: Record Configuration Method
-
-Track which auth method was used for each credential in `~/.hive/configuration.json`:
-
-```python
-import json
-from pathlib import Path
-from datetime import datetime
-
-config_path = Path.home() / ".hive" / "configuration.json"
-config = json.loads(config_path.read_text()) if config_path.exists() else {}
-
-if "credential_methods" not in config:
-    config["credential_methods"] = {}
-
-config["credential_methods"]["hubspot"] = {
-    "method": "aden",  # or "direct" or "custom"
-    "configured_at": datetime.now().isoformat(),
-}
-
-config_path.write_text(json.dumps(config, indent=2))
-```
-
-### Step 6: Verify All Credentials
-
-Use the `verify_credentials` MCP tool to confirm everything is properly configured:
-
-```
-verify_credentials(agent_path="exports/{agent_name}")
-```
-
-The tool returns:
-
-```json
-{
-  "agent": "exports/{agent_name}",
-  "ready": true,
-  "missing_credentials": [],
-  "warnings": [],
-  "errors": []
-}
-```
-
-If `ready` is true, report success. If `missing_credentials` is non-empty, identify what failed and loop back to Step 3 for the remaining credentials.
-
-## Health Check Reference
-
-Health checks validate credentials by making lightweight API calls:
-
-| Credential      | Endpoint                                | What It Checks                    |
-| --------------- | --------------------------------------- | --------------------------------- |
-| `anthropic`     | `POST /v1/messages`                     | API key validity                  |
-| `brave_search`  | `GET /res/v1/web/search?q=test&count=1` | API key validity                  |
-| `google_search` | `GET /customsearch/v1?q=test&num=1`     | API key + CSE ID validity         |
-| `github`        | `GET /user`                             | Token validity, user identity     |
-| `hubspot`       | `GET /crm/v3/objects/contacts?limit=1`  | Bearer token validity, CRM scopes |
-| `resend`        | `GET /domains`                          | API key validity                  |
-
-```python
-from aden_tools.credentials import check_credential_health, HealthCheckResult
-
-result: HealthCheckResult = check_credential_health("hubspot", token_value)
-# result.valid: bool
-# result.message: str
-# result.details: dict (status_code, rate_limited, etc.)
-```
-
-## Encryption Key (HIVE_CREDENTIAL_KEY)
-
-The local encrypted store requires `HIVE_CREDENTIAL_KEY` to encrypt/decrypt credentials.
-
-- If the user doesn't have one, `EncryptedFileStorage` will auto-generate one and log it
-- The user MUST persist this key (e.g., in `~/.bashrc`/`~/.zshrc` or a secrets manager)
-- Without this key, stored credentials cannot be decrypted
-
-**Shell config rule:** Only TWO keys belong in shell config (`~/.zshrc`/`~/.bashrc`):
-- `HIVE_CREDENTIAL_KEY` — encryption key for the credential store
-- `ADEN_API_KEY` — Aden platform auth key (needed before the store can sync)
-
-All other API keys (Brave, Google, HubSpot, etc.) must go in the encrypted store only. **Never offer to add them to shell config.**
-
-If `HIVE_CREDENTIAL_KEY` is not set:
-
-1. Let the store generate one
-2. Tell the user to save it: `export HIVE_CREDENTIAL_KEY="{generated_key}"`
-3. Recommend adding it to `~/.bashrc` or their shell profile
-
-## Security Rules
-
-- **NEVER** log, print, or echo credential values in tool output
-- **NEVER** store credentials in plaintext files, git-tracked files, or agent configs
-- **NEVER** hardcode credentials in source code
-- **NEVER** offer to save API keys to shell config (`~/.zshrc`/`~/.bashrc`) — the **only** keys that belong in shell config are `HIVE_CREDENTIAL_KEY` and `ADEN_API_KEY`. All other credentials (Brave, Google, HubSpot, GitHub, Resend, etc.) go in the encrypted store only.
-- **ALWAYS** use `SecretStr` from Pydantic when handling credential values in Python
-- **ALWAYS** use the local encrypted store (`~/.hive/credentials`) for persistence
-- **ALWAYS** run health checks before storing credentials (when possible)
-- **ALWAYS** verify credentials were stored by re-running validation, not by reading them back
-- When modifying `~/.bashrc` or `~/.zshrc`, confirm with the user first
-
-## Credential Sources Reference
-
-All credential specs are defined in `tools/src/aden_tools/credentials/`:
-
-| File              | Category      | Credentials                                   | Aden Supported |
-| ----------------- | ------------- | --------------------------------------------- | -------------- |
-| `llm.py`          | LLM Providers | `anthropic`                                   | No             |
-| `search.py`       | Search Tools  | `brave_search`, `google_search`, `google_cse` | No             |
-| `email.py`        | Email         | `resend`                                      | No             |
-| `integrations.py` | Integrations  | `github`, `hubspot`, `google_calendar_oauth`  | No / Yes       |
-
-**Note:** Additional LLM providers (Cerebras, Groq, OpenAI) are handled by LiteLLM via environment
-variables (`CEREBRAS_API_KEY`, `GROQ_API_KEY`, `OPENAI_API_KEY`) but are not yet in CREDENTIAL_SPECS.
-Add them to `llm.py` as needed.
-
-To check what's registered:
-
-```python
-from aden_tools.credentials import CREDENTIAL_SPECS
-for name, spec in CREDENTIAL_SPECS.items():
-    print(f"{name}: aden={spec.aden_supported}, direct={spec.direct_api_key_supported}")
-```
-
-## Migration: CredentialManager → CredentialStore
-
-**CredentialManager is deprecated.** Use CredentialStore instead.
-
-| Old (Deprecated)                          | New (Recommended)                                                    |
-| ----------------------------------------- | -------------------------------------------------------------------- |
-| `CredentialManager()`                     | `CredentialStore.with_encrypted_storage()`                           |
-| `creds.get("hubspot")`                    | `store.get("hubspot")` or `store.get_key("hubspot", "access_token")` |
-| `creds.validate_for_tools(tools)`         | Use `store.is_available(cred_id)` per credential                     |
-| `creds.get_auth_options("hubspot")`       | Check `CREDENTIAL_SPECS["hubspot"].aden_supported`                   |
-| `creds.get_setup_instructions("hubspot")` | Access `CREDENTIAL_SPECS["hubspot"]` directly                        |
-
-**Why migrate?**
-
- **CredentialStore** supports encrypted storage, multi-key credentials, template resolution, and automatic token refresh
- **CredentialManager** only reads from environment variables and .env files (no encryption, no refresh)
- **CredentialStoreAdapter** exists for backward compatibility during migration
-
-```python
-# Old way (deprecated)
-from aden_tools.credentials import CredentialManager
-creds = CredentialManager()
-token = creds.get("hubspot")
-
-# New way (recommended)
-from core.framework.credentials import CredentialStore
-store = CredentialStore.with_encrypted_storage()
-token = store.get("hubspot")
-
-# With Aden sync (recommended for OAuth integrations)
-store = CredentialStore.with_aden_sync()
-token = store.get_key("hubspot", "access_token")
-```
-
-## Example Session
-
-```
-User: /hive-credentials for my research-agent
-
-Agent: Let me check what credentials your research-agent needs.
-
-[Calls check_missing_credentials(agent_path="exports/research-agent")]
-→ Returns:
-  available: anthropic (encrypted_store), brave_search (encrypted_store)
-  missing: google_search (GOOGLE_API_KEY), google_cse (GOOGLE_CSE_ID)
-  ready: false
-
-Agent: 2 of 4 required credentials are already configured. Only Google Custom
-Search needs setup (2 values).
-
--- Setting up Google Custom Search (google_search + google_cse) ---
-
-This requires two values that work together.
-
-[Checks HIVE_CREDENTIAL_KEY before storing]
-$ printenv HIVE_CREDENTIAL_KEY > /dev/null 2>&1 && echo "set" || echo "not set"
-set
-
-First, the Google API Key:
-1. Go to https://console.cloud.google.com/apis/credentials
-2. Create a new project (or select an existing one)
-3. Enable the "Custom Search API" from the API Library
-4. Go to Credentials > Create Credentials > API Key
-5. Copy the generated API key
-
-[AskUserQuestion: "Please provide your Google API key:"]
-[User provides key]
-
-Now, the Custom Search Engine ID:
-1. Go to https://programmablesearchengine.google.com/controlpanel/all
-2. Click "Add" to create a new search engine
-3. Under "What to search", select "Search the entire web"
-4. Give your search engine a name
-5. Click "Create"
-6. Copy the Search Engine ID (cx value)
-
-[AskUserQuestion: "Please provide your Google CSE ID:"]
-[User provides ID]
-
-[Runs health check with both values - GET /customsearch/v1?q=test&num=1 → 200 OK]
-[Stores both in local encrypted store, exports to env]
-
-✓ Google Custom Search credentials valid
-
-[Calls verify_credentials(agent_path="exports/research-agent")]
-→ Returns: ready: true, missing_credentials: []
-
-All credentials are now configured:
-  ✓ anthropic (ANTHROPIC_API_KEY) — already in encrypted store
-  ✓ brave_search (BRAVE_SEARCH_API_KEY) — already in encrypted store
-  ✓ google_search (GOOGLE_API_KEY) — stored in encrypted store
-  ✓ google_cse (GOOGLE_CSE_ID) — stored in encrypted store
-
-┌─────────────────────────────────────────────────────────────────────────────┐
-│                      ✅ CREDENTIALS CONFIGURED                              │
-├─────────────────────────────────────────────────────────────────────────────┤
-│                                                                             │
-│     OPEN A NEW TERMINAL before running commands below.                      │
-│     If HIVE_CREDENTIAL_KEY was just added to your shell config, it          │
-│     only takes effect in new terminal sessions.                             │
-│                                                                             │
-│  NEXT STEPS:                                                                │
-│                                                                             │
-│  1. RUN YOUR AGENT:                                                         │
-│                                                                             │
-│     hive tui                                                                │
-│                                                                             │
-│  2. IF YOU ENCOUNTER ISSUES, USE THE DEBUGGER:                              │
-│                                                                             │
-│     /hive-debugger                                                          │
-│                                                                             │
-│     The debugger analyzes runtime logs, identifies retry loops, tool        │
-│     failures, stalled execution, and provides actionable fix suggestions.   │
-│                                                                             │
-└─────────────────────────────────────────────────────────────────────────────┘
-```
@@ -1,385 +0,0 @@
---
-name: hive-patterns
-description: Best practices, patterns, and examples for building goal-driven agents. Includes client-facing interaction, feedback edges, judge patterns, fan-out/fan-in, context management, and anti-patterns.
-license: Apache-2.0
-metadata:
-  author: hive
-  version: "2.0"
-  type: reference
-  part_of: hive
---
-
-# Building Agents - Patterns & Best Practices
-
-Design patterns, examples, and best practices for building robust goal-driven agents.
-
-**Prerequisites:** A complete agent structure built with `hive-create`.
-
-## Practical Example: Hybrid Workflow
-
-How to build a node using both direct file writes and optional MCP validation:
-
-```python
-# 1. WRITE TO FILE FIRST (Primary - makes it visible)
-node_code = '''
-search_node = NodeSpec(
-    id="search-web",
-    node_type="event_loop",
-    input_keys=["query"],
-    output_keys=["search_results"],
-    system_prompt="Search the web for: {query}. Use web_search, then call set_output to store results.",
-    tools=["web_search"],
-)
-'''
-
-Edit(
-    file_path="exports/research_agent/nodes/__init__.py",
-    old_string="# Nodes will be added here",
-    new_string=node_code
-)
-
-# 2. OPTIONALLY VALIDATE WITH MCP (Secondary - bookkeeping)
-validation = mcp__agent-builder__test_node(
-    node_id="search-web",
-    test_input='{"query": "python tutorials"}',
-    mock_llm_response='{"search_results": [...mock results...]}'
-)
-```
-
-**User experience:**
-
- Immediately sees node in their editor (from step 1)
- Gets validation feedback (from step 2)
- Can edit the file directly if needed
-
-## Multi-Turn Interaction Patterns
-
-For agents needing multi-turn conversations with users, use `client_facing=True` on event_loop nodes.
-
-### Client-Facing Nodes
-
-A client-facing node streams LLM output to the user and blocks for user input between conversational turns. This replaces the old pause/resume pattern.
-
-```python
-# Client-facing node with STEP 1/STEP 2 prompt pattern
-intake_node = NodeSpec(
-    id="intake",
-    name="Intake",
-    description="Gather requirements from the user",
-    node_type="event_loop",
-    client_facing=True,
-    input_keys=["topic"],
-    output_keys=["research_brief"],
-    system_prompt="""\
-You are an intake specialist.
-
-**STEP 1 — Read and respond (text only, NO tool calls):**
-1. Read the topic provided
-2. If it's vague, ask 1-2 clarifying questions
-3. If it's clear, confirm your understanding
-
-**STEP 2 — After the user confirms, call set_output:**
- set_output("research_brief", "Clear description of what to research")
-""",
-)
-
-# Internal node runs without user interaction
-research_node = NodeSpec(
-    id="research",
-    name="Research",
-    description="Search and analyze sources",
-    node_type="event_loop",
-    input_keys=["research_brief"],
-    output_keys=["findings", "sources"],
-    system_prompt="Research the topic using web_search and web_scrape...",
-    tools=["web_search", "web_scrape", "load_data", "save_data"],
-)
-```
-
-**How it works:**
-
- Client-facing nodes stream LLM text to the user and block for input after each response
- User input is injected via `node.inject_event(text)`
- When the LLM calls `set_output` to produce structured outputs, the judge evaluates and ACCEPTs
- Internal nodes (non-client-facing) run their entire loop without blocking
- `set_output` is a synthetic tool — a turn with only `set_output` calls (no real tools) triggers user input blocking
-
-**STEP 1/STEP 2 pattern:** Always structure client-facing prompts with explicit phases. STEP 1 is text-only conversation. STEP 2 calls `set_output` after user confirmation. This prevents the LLM from calling `set_output` prematurely before the user responds.
-
-### When to Use client_facing
-
-| Scenario                            | client_facing | Why                    |
-| ----------------------------------- | :-----------: | ---------------------- |
-| Gathering user requirements         |      Yes      | Need user input        |
-| Human review/approval checkpoint    |      Yes      | Need human decision    |
-| Data processing (scanning, scoring) |      No       | Runs autonomously      |
-| Report generation                   |      No       | No user input needed   |
-| Final confirmation before action    |      Yes      | Need explicit approval |
-
-> **Legacy Note:** The `pause_nodes` / `entry_points` pattern still works for backward compatibility but `client_facing=True` is preferred for new agents.
-
-## Edge-Based Routing and Feedback Loops
-
-### Conditional Edge Routing
-
-Multiple conditional edges from the same source replace the old `router` node type. Each edge checks a condition on the node's output.
-
-```python
-# Node with mutually exclusive outputs
-review_node = NodeSpec(
-    id="review",
-    name="Review",
-    node_type="event_loop",
-    client_facing=True,
-    output_keys=["approved_contacts", "redo_extraction"],
-    nullable_output_keys=["approved_contacts", "redo_extraction"],
-    max_node_visits=3,
-    system_prompt="Present the contact list to the operator. If they approve, call set_output('approved_contacts', ...). If they want changes, call set_output('redo_extraction', 'true').",
-)
-
-# Forward edge (positive priority, evaluated first)
-EdgeSpec(
-    id="review-to-campaign",
-    source="review",
-    target="campaign-builder",
-    condition=EdgeCondition.CONDITIONAL,
-    condition_expr="output.get('approved_contacts') is not None",
-    priority=1,
-)
-
-# Feedback edge (negative priority, evaluated after forward edges)
-EdgeSpec(
-    id="review-feedback",
-    source="review",
-    target="extractor",
-    condition=EdgeCondition.CONDITIONAL,
-    condition_expr="output.get('redo_extraction') is not None",
-    priority=-1,
-)
-```
-
-**Key concepts:**
-
- `nullable_output_keys`: Lists output keys that may remain unset. The node sets exactly one of the mutually exclusive keys per execution.
- `max_node_visits`: Must be >1 on the feedback target (extractor) so it can re-execute. Default is 1.
- `priority`: Positive = forward edge (evaluated first). Negative = feedback edge. The executor tries forward edges first; if none match, falls back to feedback edges.
-
-### Routing Decision Table
-
-| Pattern                | Old Approach            | New Approach                                  |
-| ---------------------- | ----------------------- | --------------------------------------------- |
-| Conditional branching  | `router` node           | Conditional edges with `condition_expr`       |
-| Binary approve/reject  | `pause_nodes` + resume  | `client_facing=True` + `nullable_output_keys` |
-| Loop-back on rejection | Manual entry_points     | Feedback edge with `priority=-1`              |
-| Multi-way routing      | Router with routes dict | Multiple conditional edges with priorities    |
-
-## Judge Patterns
-
-**Core Principle: The judge is the SOLE mechanism for acceptance decisions.** Never add ad-hoc framework gating to compensate for LLM behavior. If the LLM calls `set_output` prematurely, fix the system prompt or use a custom judge. Anti-patterns to avoid:
-
- Output rollback logic
- `_user_has_responded` flags
- Premature set_output rejection
- Interaction protocol injection into system prompts
-
-Judges control when an event_loop node's loop exits. Choose based on validation needs.
-
-### Implicit Judge (Default)
-
-When no judge is configured, the implicit judge ACCEPTs when:
-
- The LLM finishes its response with no tool calls
- All required output keys have been set via `set_output`
-
-Best for simple nodes where "all outputs set" is sufficient validation.
-
-### SchemaJudge
-
-Validates outputs against a Pydantic model. Use when you need structural validation.
-
-```python
-from pydantic import BaseModel, ValidationError
-
-class ScannerOutput(BaseModel):
-    github_users: list[dict]  # Must be a list of user objects
-
-class SchemaJudge:
-    def __init__(self, output_model: type[BaseModel]):
-        self._model = output_model
-
-    async def evaluate(self, context: dict) -> JudgeVerdict:
-        missing = context.get("missing_keys", [])
-        if missing:
-            return JudgeVerdict(
-                action="RETRY",
-                feedback=f"Missing output keys: {missing}. Use set_output to provide them.",
-            )
-        try:
-            self._model.model_validate(context["output_accumulator"])
-            return JudgeVerdict(action="ACCEPT")
-        except ValidationError as e:
-            return JudgeVerdict(action="RETRY", feedback=str(e))
-```
-
-### When to Use Which Judge
-
-| Judge           | Use When                              | Example                |
-| --------------- | ------------------------------------- | ---------------------- |
-| Implicit (None) | Output keys are sufficient validation | Simple data extraction |
-| SchemaJudge     | Need structural validation of outputs | API response parsing   |
-| Custom          | Domain-specific validation logic      | Score must be 0.0-1.0  |
-
-## Fan-Out / Fan-In (Parallel Execution)
-
-Multiple ON_SUCCESS edges from the same source trigger parallel execution. All branches run concurrently via `asyncio.gather()`.
-
-```python
-# Scanner fans out to Profiler and Scorer in parallel
-EdgeSpec(id="scanner-to-profiler", source="scanner", target="profiler",
-         condition=EdgeCondition.ON_SUCCESS)
-EdgeSpec(id="scanner-to-scorer", source="scanner", target="scorer",
-         condition=EdgeCondition.ON_SUCCESS)
-
-# Both fan in to Extractor
-EdgeSpec(id="profiler-to-extractor", source="profiler", target="extractor",
-         condition=EdgeCondition.ON_SUCCESS)
-EdgeSpec(id="scorer-to-extractor", source="scorer", target="extractor",
-         condition=EdgeCondition.ON_SUCCESS)
-```
-
-**Requirements:**
-
- Parallel event_loop nodes must have **disjoint output_keys** (no key written by both)
- Only one parallel branch may contain a `client_facing` node
- Fan-in node receives outputs from all completed branches in shared memory
-
-## Context Management Patterns
-
-### Tiered Compaction
-
-EventLoopNode automatically manages context window usage with tiered compaction:
-
-1. **Pruning** — Old tool results replaced with compact placeholders (zero-cost, no LLM call)
-2. **Normal compaction** — LLM summarizes older messages
-3. **Aggressive compaction** — Keeps only recent messages + summary
-4. **Emergency** — Hard reset with tool history preservation
-
-### Spillover Pattern
-
-The framework automatically truncates large tool results and saves full content to a spillover directory. The LLM receives a truncation message with instructions to use `load_data` to read the full result.
-
-For explicit data management, use the data tools (real MCP tools, not synthetic):
-
-```python
-# save_data, load_data, list_data_files, serve_file_to_user are real MCP tools
-# data_dir is auto-injected by the framework — the LLM never sees it
-
-# Saving large results
-save_data(filename="sources.json", data=large_json_string)
-
-# Reading with pagination (line-based offset/limit)
-load_data(filename="sources.json", offset=0, limit=50)
-
-# Listing available files
-list_data_files()
-
-# Serving a file to the user as a clickable link
-serve_file_to_user(filename="report.html", label="Research Report")
-```
-
-Add data tools to nodes that handle large tool results:
-
-```python
-research_node = NodeSpec(
-    ...
-    tools=["web_search", "web_scrape", "load_data", "save_data", "list_data_files"],
-)
-```
-
-`data_dir` is a framework context parameter — auto-injected at call time. `GraphExecutor.execute()` sets it per-execution via `ToolRegistry.set_execution_context(data_dir=...)` (using `contextvars` for concurrency safety), ensuring it matches the session-scoped spillover directory.
-
-## Anti-Patterns
-
-### What NOT to Do
-
- **Don't rely on `export_graph`** — Write files immediately, not at the end
- **Don't hide code in session** — Write to files as components are approved
- **Don't wait to write files** — Make the agent visible on disk from the first step
- **Don't batch everything** — Write incrementally, one component at a time
- **Don't create too many thin nodes** — Prefer fewer, richer nodes (see below)
- **Don't add framework gating for LLM behavior** — Fix prompts or use judges instead
-
-### Fewer, Richer Nodes
-
-A common mistake is splitting work into too many small single-purpose nodes. Each node boundary requires serializing outputs, losing in-context information, and adding edge complexity.
-
-| Bad (8 thin nodes)  | Good (4 rich nodes)                 |
-| ------------------- | ----------------------------------- |
-| parse-query         | intake (client-facing)              |
-| search-sources      | research (search + fetch + analyze) |
-| fetch-content       | review (client-facing)              |
-| evaluate-sources    | report (write + deliver)            |
-| synthesize-findings |                                     |
-| write-report        |                                     |
-| quality-check       |                                     |
-| save-report         |                                     |
-
-**Why fewer nodes are better:**
-
- The LLM retains full context of its work within a single node
- A research node that searches, fetches, and analyzes keeps all source material in its conversation history
- Fewer edges means simpler graph and fewer failure points
- Data tools (`save_data`/`load_data`) handle context window limits within a single node
-
-### MCP Tools - Correct Usage
-
-**MCP tools OK for:**
-
- `test_node` — Validate node configuration with mock inputs
- `validate_graph` — Check graph structure
- `configure_loop` — Set event loop parameters
- `create_session` — Track session state for bookkeeping
-
-**Just don't:** Use MCP as the primary construction method or rely on export_graph
-
-## Error Handling Patterns
-
-### Graceful Failure with Fallback
-
-```python
-edges = [
-    # Success path
-    EdgeSpec(id="api-success", source="api-call", target="process-results",
-             condition=EdgeCondition.ON_SUCCESS),
-    # Fallback on failure
-    EdgeSpec(id="api-to-fallback", source="api-call", target="fallback-cache",
-             condition=EdgeCondition.ON_FAILURE, priority=1),
-    # Report if fallback also fails
-    EdgeSpec(id="fallback-to-error", source="fallback-cache", target="report-error",
-             condition=EdgeCondition.ON_FAILURE, priority=1),
-]
-```
-
-## Handoff to Testing
-
-When the agent is complete, transition to the testing phase:
-
-### Pre-Testing Checklist
-
- [ ] Agent structure validates: `uv run python -m agent_name validate`
- [ ] All nodes defined in `nodes/__init__.py`
- [ ] All edges connect valid nodes with correct priorities
- [ ] Feedback edge targets have `max_node_visits > 1`
- [ ] Client-facing nodes have meaningful system prompts
- [ ] Agent can be imported: `from exports.agent_name import default_agent`
-
-## Related Skills
-
- **hive-concepts** — Fundamental concepts (node types, edges, event loop architecture)
- **hive-create** — Step-by-step building process
- **hive-test** — Test and validate agents
- **hive** — Complete workflow orchestrator
-
---
-
-**Remember: Agent is actively constructed, visible the whole time. No hidden state. No surprise exports. Just transparent, incremental file building.**
@@ -1,940 +0,0 @@
---
-name: hive-test
-description: Iterative agent testing with session recovery. Execute, analyze, fix, resume from checkpoints. Use when testing an agent, debugging test failures, or verifying fixes without re-running from scratch.
---
-
-# Agent Testing
-
-Test agents iteratively: execute, analyze failures, fix, resume from checkpoint, repeat.
-
-## When to Use
-
- Testing a newly built agent against its goal
- Debugging a failing agent iteratively
- Verifying fixes without re-running expensive early nodes
- Running final regression tests before deployment
-
-## Prerequisites
-
-1. Agent package at `exports/{agent_name}/` (built with `/hive-create`)
-2. Credentials configured (`/hive-credentials`)
-3. `ANTHROPIC_API_KEY` set (or appropriate LLM provider key)
-
-**Path distinction** (critical — don't confuse these):
- `exports/{agent_name}/` — agent source code (edit here)
- `~/.hive/agents/{agent_name}/` — runtime data: sessions, checkpoints, logs (read here)
-
---
-
-## The Iterative Test Loop
-
-This is the core workflow. Don't re-run the entire agent when a late node fails — analyze, fix, and resume from the last clean checkpoint.
-
-```
-┌──────────────────────────────────────┐
-│ PHASE 1: Generate Test Scenarios     │
-│ Goal → synthetic test inputs + tests │
-└──────────────┬───────────────────────┘
-               ↓
-┌──────────────────────────────────────┐
-│ PHASE 2: Execute                     │◄────────────────┐
-│ Run agent (CLI or pytest)            │                 │
-└──────────────┬───────────────────────┘                 │
-               ↓                                         │
-          Pass? ──yes──► PHASE 6: Final Verification     │
-               │                                         │
-               no                                        │
-               ↓                                         │
-┌──────────────────────────────────────┐                 │
-│ PHASE 3: Analyze                     │                 │
-│ Session + runtime logs + checkpoints │                 │
-└──────────────┬───────────────────────┘                 │
-               ↓                                         │
-┌──────────────────────────────────────┐                 │
-│ PHASE 4: Fix                         │                 │
-│ Prompt / code / graph / goal         │                 │
-└──────────────┬───────────────────────┘                 │
-               ↓                                         │
-┌──────────────────────────────────────┐                 │
-│ PHASE 5: Recover & Resume            │─────────────────┘
-│ Checkpoint resume OR fresh re-run    │
-└──────────────────────────────────────┘
-```
-
---
-
-### Phase 1: Generate Test Scenarios
-
-Create synthetic tests from the agent's goal, constraints, and success criteria.
-
-#### Step 1a: Read the goal
-
-```python
-# Read goal from agent.py
-Read(file_path="exports/{agent_name}/agent.py")
-# Extract the Goal definition and convert to JSON string
-```
-
-#### Step 1b: Get test guidelines
-
-```python
-# Get constraint test guidelines
-generate_constraint_tests(
-    goal_id="your-goal-id",
-    goal_json='{"id": "...", "constraints": [...]}',
-    agent_path="exports/{agent_name}"
-)
-
-# Get success criteria test guidelines
-generate_success_tests(
-    goal_id="your-goal-id",
-    goal_json='{"id": "...", "success_criteria": [...]}',
-    node_names="intake,research,review,report",
-    tool_names="web_search,web_scrape",
-    agent_path="exports/{agent_name}"
-)
-```
-
-These return `output_file`, `file_header`, `test_template`, `constraints_formatted`/`success_criteria_formatted`, and `test_guidelines`. They do NOT generate test code — you write the tests.
-
-#### Step 1c: Write tests
-
-```python
-Write(
-    file_path=result["output_file"],
-    content=result["file_header"] + "\n\n" + your_test_code
-)
-```
-
-#### Test writing rules
-
- Every test MUST be `async` with `@pytest.mark.asyncio`
- Every test MUST accept `runner, auto_responder, mock_mode` fixtures
- Use `await auto_responder.start()` before running, `await auto_responder.stop()` in `finally`
- Use `await runner.run(input_dict)` — this goes through AgentRunner → AgentRuntime → ExecutionStream
- Access output via `result.output.get("key")` — NEVER `result.output["key"]`
- `result.success=True` means no exception, NOT goal achieved — always check output
- Write 8-15 tests total, not 30+
- Each real test costs ~3 seconds + LLM tokens
- NEVER use `default_agent.run()` — it bypasses the runtime (no sessions, no logs, client-facing nodes hang)
-
-#### Step 1d: Check existing tests
-
-Before generating tests (Steps 1b–1c), check whether tests already exist:
-
-```python
-list_tests(
-    goal_id="your-goal-id",
-    agent_path="exports/{agent_name}"
-)
-```
-
---
-
-### Phase 2: Execute
-
-Two execution paths, use the right one for your situation.
-
-#### Iterative debugging (for complex agents)
-
-Run the agent via CLI. This creates sessions with checkpoints at `~/.hive/agents/{agent_name}/sessions/`:
-
-```bash
-uv run hive run exports/{agent_name} --input '{"query": "test topic"}'
-```
-
-Sessions and checkpoints are saved automatically.
-
-**Client-facing nodes**: Agents with `client_facing=True` nodes (interactive conversation) work in headless mode when run from a real terminal — the agent streams output to stdout and reads user input from stdin via a `>>> ` prompt. In non-interactive shells (like Claude Code's Bash tool), client-facing nodes will hang because there is no stdin. For testing interactive agents from Claude Code, use `run_tests` with mock mode or have the user run the agent manually in their terminal.
-
-#### Automated regression (for CI or final verification)
-
-Use the `run_tests` MCP tool to run all pytest tests:
-
-```python
-run_tests(
-    goal_id="your-goal-id",
-    agent_path="exports/{agent_name}"
-)
-```
-
-Returns structured results:
-```json
-{
-  "overall_passed": false,
-  "summary": {"total": 12, "passed": 10, "failed": 2, "pass_rate": "83.3%"},
-  "test_results": [{"test_name": "test_success_source_diversity", "status": "failed"}],
-  "failures": [{"test_name": "test_success_source_diversity", "details": "..."}]
-}
-```
-
-**Options:**
-```python
-# Run only constraint tests
-run_tests(goal_id, agent_path, test_types='["constraint"]')
-
-# Stop on first failure
-run_tests(goal_id, agent_path, fail_fast=True)
-
-# Parallel execution
-run_tests(goal_id, agent_path, parallel=4)
-```
-
-**Note:** `run_tests` uses `AgentRunner` with `tmp_path` storage, so sessions are isolated per test run. For checkpoint-based recovery with persistent sessions, use CLI execution. Use `run_tests` for quick regression checks and final verification.
-
---
-
-### Phase 3: Analyze Failures
-
-When a test fails, drill down systematically. Don't guess — use the tools.
-
-#### Step 3a: Get error category
-
-```python
-debug_test(
-    goal_id="your-goal-id",
-    test_name="test_success_source_diversity",
-    agent_path="exports/{agent_name}"
-)
-```
-
-Returns error category (`IMPLEMENTATION_ERROR`, `ASSERTION_FAILURE`, `TIMEOUT`, `IMPORT_ERROR`, `API_ERROR`) plus full traceback and suggestions.
-
-#### Step 3b: Find the failed session
-
-```python
-list_agent_sessions(
-    agent_work_dir="~/.hive/agents/{agent_name}",
-    status="failed",
-    limit=5
-)
-```
-
-Returns session list with IDs, timestamps, current_node (where it failed), execution_quality.
-
-#### Step 3c: Inspect session state
-
-```python
-get_agent_session_state(
-    agent_work_dir="~/.hive/agents/{agent_name}",
-    session_id="session_20260209_143022_abc12345"
-)
-```
-
-Returns execution path, which node was current, step count, timestamps — but excludes memory values (to avoid context bloat). Shows `memory_keys` and `memory_size` instead.
-
-#### Step 3d: Examine runtime logs (L2/L3)
-
-```python
-# L2: Per-node success/failure, retry counts
-query_runtime_log_details(
-    agent_work_dir="~/.hive/agents/{agent_name}",
-    run_id="session_20260209_143022_abc12345",
-    needs_attention_only=True
-)
-
-# L3: Exact LLM responses, tool call inputs/outputs
-query_runtime_log_raw(
-    agent_work_dir="~/.hive/agents/{agent_name}",
-    run_id="session_20260209_143022_abc12345",
-    node_id="research"
-)
-```
-
-#### Step 3e: Inspect memory data
-
-```python
-# See what data a node actually produced
-get_agent_session_memory(
-    agent_work_dir="~/.hive/agents/{agent_name}",
-    session_id="session_20260209_143022_abc12345",
-    key="research_results"
-)
-```
-
-#### Step 3f: Find recovery points
-
-```python
-list_agent_checkpoints(
-    agent_work_dir="~/.hive/agents/{agent_name}",
-    session_id="session_20260209_143022_abc12345",
-    is_clean="true"
-)
-```
-
-Returns checkpoint summaries with IDs, types (`node_start`, `node_complete`), which node, and `is_clean` flag. Clean checkpoints are safe resume points.
-
-#### Step 3g: Compare checkpoints (optional)
-
-To understand what changed between two points in execution:
-
-```python
-compare_agent_checkpoints(
-    agent_work_dir="~/.hive/agents/{agent_name}",
-    session_id="session_20260209_143022_abc12345",
-    checkpoint_id_before="cp_node_complete_research_143030",
-    checkpoint_id_after="cp_node_complete_review_143115"
-)
-```
-
-Returns memory diff (added/removed/changed keys) and execution path diff.
-
---
-
-### Phase 4: Fix Based on Root Cause
-
-Use the analysis from Phase 3 to determine what to fix and where.
-
-| Root Cause | What to Fix | Where to Edit |
-|------------|------------|---------------|
-| **Prompt issue** — LLM produces wrong output format, misses instructions | Node `system_prompt` | `exports/{agent}/nodes/__init__.py` |
-| **Code bug** — TypeError, KeyError, logic error in Python | Agent code | `exports/{agent}/agent.py`, `nodes/__init__.py` |
-| **Graph issue** — wrong routing, missing edge, bad condition_expr | Edges, node config | `exports/{agent}/agent.py` |
-| **Tool issue** — MCP tool fails, wrong config, missing credential | Tool config | `exports/{agent}/mcp_servers.json`, `/hive-credentials` |
-| **Goal issue** — success criteria too strict/vague, wrong constraints | Goal definition | `exports/{agent}/agent.py` (goal section) |
-| **Test issue** — test expectations don't match actual agent behavior | Test code | `exports/{agent}/tests/test_*.py` |
-
-#### Fix strategies by error category
-
-**IMPLEMENTATION_ERROR** (TypeError, AttributeError, KeyError):
-```python
-# Read the failing code
-Read(file_path="exports/{agent_name}/nodes/__init__.py")
-
-# Fix the bug
-Edit(
-    file_path="exports/{agent_name}/nodes/__init__.py",
-    old_string="results.get('videos')",
-    new_string="(results or {}).get('videos', [])"
-)
-```
-
-**ASSERTION_FAILURE** (test assertions fail but agent ran successfully):
- Check if the agent's output is actually wrong → fix the prompt
- Check if the test's expectations are unrealistic → fix the test
- Use `get_agent_session_memory` to see what the agent actually produced
-
-**TIMEOUT / STALL** (agent runs too long):
- Check `node_visit_counts` for feedback loops hitting max_node_visits
- Check L3 logs for tool calls that hang
- Reduce `max_iterations` in loop_config or fix the prompt to converge faster
-
-**API_ERROR** (connection, rate limit, auth):
- Verify credentials with `/hive-credentials`
- Check MCP server configuration
-
---
-
-### Phase 5: Recover & Resume
-
-After fixing the agent, decide whether to resume or re-run.
-
-#### When to resume from checkpoint
-
-Resume when ALL of these are true:
- The fix is to a node that comes AFTER existing clean checkpoints
- Clean checkpoints exist (from a CLI execution with checkpointing)
- The early nodes are expensive (web scraping, API calls, long LLM chains)
-
-```bash
-# Resume from the last clean checkpoint before the failing node
-uv run hive run exports/{agent_name} \
-  --resume-session session_20260209_143022_abc12345 \
-  --checkpoint cp_node_complete_research_143030
-```
-
-This skips every node that completed before the checkpoint and re-runs execution from the fixed node onward.
-
-#### When to re-run from scratch
-
-Re-run when ANY of these are true:
- The fix is to the entry node or an early node
- No checkpoints exist (e.g., agent was run via `run_tests`)
- The agent is fast (2-3 nodes, completes in seconds)
- You changed the graph structure (added/removed nodes/edges)
-
-```bash
-uv run hive run exports/{agent_name} --input '{"query": "test topic"}'
-```
-
-#### Inspecting a checkpoint before resuming
-
-```python
-get_agent_checkpoint(
-    agent_work_dir="~/.hive/agents/{agent_name}",
-    session_id="session_20260209_143022_abc12345",
-    checkpoint_id="cp_node_complete_research_143030"
-)
-```
-
-Returns the full checkpoint: shared_memory snapshot, execution_path, current_node, next_node, is_clean.
-
-#### Loop back to Phase 2
-
-After resuming or re-running (a new Phase 2 execution), check whether the fix worked. If the failure persists or a new one appears, return to Phase 3 and analyze again.
-
---
-
-### Phase 6: Final Verification
-
-Once the iterative fix loop converges (the agent produces correct output), run the full automated test suite:
-
-```python
-run_tests(
-    goal_id="your-goal-id",
-    agent_path="exports/{agent_name}"
-)
-```
-
-All tests should pass. If not, repeat the loop for remaining failures.
-
---
-
-## Credential Requirements
-
-**CRITICAL: Testing requires ALL credentials the agent depends on.** This includes both the LLM API key AND any tool-specific credentials (HubSpot, Brave Search, etc.).
-
-### Prerequisites
-
-Before running agent tests, you MUST collect ALL required credentials from the user.
-
-**Step 1: LLM API Key (always required)**
-```bash
-export ANTHROPIC_API_KEY="your-key-here"
-```
-
-**Step 2: Tool-specific credentials (depends on agent's tools)**
-
-Inspect the agent's `mcp_servers.json` and tool configuration to determine which tools the agent uses, then check for all required credentials:
-
-```python
-from aden_tools.credentials import CredentialManager, CREDENTIAL_SPECS
-
-creds = CredentialManager()
-
-# Determine which tools the agent uses (from mcp_servers.json and the agent's tool configuration)
-agent_tools = [...]  # e.g., ["hubspot_search_contacts", "web_search", ...]
-
-# Find all missing credentials for those tools
-missing = creds.get_missing_for_tools(agent_tools)
-```
-
-Common tool credentials:
-| Tool | Env Var | Help URL |
-|------|---------|----------|
-| HubSpot CRM | `HUBSPOT_ACCESS_TOKEN` | https://developers.hubspot.com/docs/api/private-apps |
-| Brave Search | `BRAVE_SEARCH_API_KEY` | https://brave.com/search/api/ |
-| Google Search | `GOOGLE_SEARCH_API_KEY` + `GOOGLE_SEARCH_CX` | https://developers.google.com/custom-search |
-
-**Why ALL credentials are required:**
- Tests need to execute the agent's LLM nodes to validate behavior
- Tools with missing credentials will return error dicts instead of real data
- Mock mode bypasses LLM reasoning and real tool/API integrations, so it provides no confidence in real-world performance
-
-### Mock Mode Limitations
-
-Mock mode (`--mock` flag or `MOCK_MODE=1`) is **ONLY for structure validation**:
-
- Validates graph structure (nodes, edges, connections)
- Validates that `AgentRunner.load()` succeeds and the agent is importable
- Does NOT execute event_loop agents — MockLLMProvider never calls `set_output`, so event_loop nodes loop forever
- Does NOT test LLM reasoning, content quality, or constraint validation
- Does NOT test real API integrations or tool use
-
-**Bottom line:** If you're testing whether an agent achieves its goal, you MUST use real credentials.
-
-### Enforcing Credentials in Tests
-
-When writing tests, **ALWAYS include credential checks**:
-
-```python
-import os
-import pytest
-from aden_tools.credentials import CredentialManager
-
-pytestmark = pytest.mark.skipif(
-    not CredentialManager().is_available("anthropic") and not os.environ.get("MOCK_MODE"),
-    reason="API key required for real testing. Set ANTHROPIC_API_KEY or use MOCK_MODE=1."
-)
-
-
-@pytest.fixture(scope="session", autouse=True)
-def check_credentials():
-    """Ensure ALL required credentials are set for real testing."""
-    creds = CredentialManager()
-    mock_mode = os.environ.get("MOCK_MODE")
-
-    if not creds.is_available("anthropic"):
-        if mock_mode:
-            print("\nRunning in MOCK MODE - structure validation only")
-        else:
-            pytest.fail(
-                "\nANTHROPIC_API_KEY not set!\n"
-                "Set API key: export ANTHROPIC_API_KEY='your-key-here'\n"
-                "Or run structure validation: MOCK_MODE=1 pytest exports/{agent}/tests/"
-            )
-
-    if not mock_mode:
-        agent_tools = []  # Update per agent
-        missing = creds.get_missing_for_tools(agent_tools)
-        if missing:
-            lines = ["\nMissing tool credentials!"]
-            for name in missing:
-                spec = creds.specs.get(name)
-                if spec:
-                    lines.append(f"  {spec.env_var} - {spec.description}")
-            pytest.fail("\n".join(lines))
-```
-
-### User Communication
-
-When the user asks to test an agent, **ALWAYS check for ALL credentials first**:
-
-1. **Identify the agent's tools** from `mcp_servers.json`
-2. **Check ALL required credentials** using `CredentialManager`
-3. **Ask the user to provide any missing credentials** before proceeding
-4. Collect ALL missing credentials in a single prompt — not one at a time
-
---
-
-## Safe Test Patterns
-
-### OutputCleaner
-
-The framework automatically validates and cleans node outputs using a fast LLM at edge traversal time. Tests should still use safe patterns because OutputCleaner may not catch all issues.
-
-### Safe Access (REQUIRED)
-
-```python
-# UNSAFE - will crash on missing keys
-approval = result.output["approval_decision"]
-category = result.output["analysis"]["category"]
-
-# SAFE - use .get() with defaults
-output = result.output or {}
-approval = output.get("approval_decision", "UNKNOWN")
-
-# SAFE - type check before operations
-analysis = output.get("analysis", {})
-if isinstance(analysis, dict):
-    category = analysis.get("category", "unknown")
-
-# SAFE - handle JSON parsing trap (LLM response as string)
-import json
-recommendation = output.get("recommendation", "{}")
-if isinstance(recommendation, str):
-    try:
-        parsed = json.loads(recommendation)
-        if isinstance(parsed, dict):
-            approval = parsed.get("approval_decision", "UNKNOWN")
-    except json.JSONDecodeError:
-        approval = "UNKNOWN"
-elif isinstance(recommendation, dict):
-    approval = recommendation.get("approval_decision", "UNKNOWN")
-
-# SAFE - type check before iteration
-items = output.get("items", [])
-if isinstance(items, list):
-    for item in items:
-        ...
-```
-
-### Helper Functions for conftest.py
-
-```python
-import json
-import re
-
-def _parse_json_from_output(result, key):
-    """Parse JSON from agent output (framework may store full LLM response as string)."""
-    response_text = result.output.get(key, "")
-    json_text = re.sub(r'```json\s*|\s*```', '', response_text).strip()
-    try:
-        return json.loads(json_text)
-    except (json.JSONDecodeError, AttributeError, TypeError):
-        return result.output.get(key)
-
-def safe_get_nested(result, key_path, default=None):
-    """Safely get nested value from result.output."""
-    output = result.output or {}
-    current = output
-    for key in key_path:
-        if isinstance(current, dict):
-            current = current.get(key)
-        elif isinstance(current, str):
-            try:
-                json_text = re.sub(r'```json\s*|\s*```', '', current).strip()
-                parsed = json.loads(json_text)
-                if isinstance(parsed, dict):
-                    current = parsed.get(key)
-                else:
-                    return default
-            except json.JSONDecodeError:
-                return default
-        else:
-            return default
-    return current if current is not None else default
-
-# Make available in tests
-pytest.parse_json_from_output = _parse_json_from_output
-pytest.safe_get_nested = safe_get_nested
-```
-
-### ExecutionResult Fields
-
-**`result.success=True` means NO exception, NOT goal achieved**
-
-```python
-# WRONG
-assert result.success
-
-# RIGHT
-assert result.success, f"Agent failed: {result.error}"
-output = result.output or {}
-approval = output.get("approval_decision")
-assert approval == "APPROVED", f"Expected APPROVED, got {approval}"
-```
-
-All fields:
- `success: bool` — Completed without exception (NOT goal achieved!)
- `output: dict` — Complete memory snapshot (may contain raw strings)
- `error: str | None` — Error message if failed
- `steps_executed: int` — Number of nodes executed
- `total_tokens: int` — Cumulative token usage
- `total_latency_ms: int` — Total execution time
- `path: list[str]` — Node IDs traversed (may repeat in feedback loops)
- `paused_at: str | None` — Node ID if paused
- `session_state: dict` — State for resuming
- `node_visit_counts: dict[str, int]` — Visit counts per node (feedback loop testing)
- `execution_quality: str` — "clean", "degraded", or "failed"
-
-### Test Count Guidance
-
-**Write 8-15 tests, not 30+**
-
- 2-3 tests per success criterion
- 1 happy path test
- 1 boundary/edge case test
- 1 error handling test (optional)
-
-Each real test costs ~3 seconds + LLM tokens. 12 tests = ~36 seconds, $0.12.
-
---
-
-## Test Patterns
-
-### Happy Path
-```python
-@pytest.mark.asyncio
-async def test_happy_path(runner, auto_responder, mock_mode):
-    """Test normal successful execution."""
-    await auto_responder.start()
-    try:
-        result = await runner.run({"query": "python tutorials"})
-    finally:
-        await auto_responder.stop()
-    assert result.success, f"Agent failed: {result.error}"
-    output = result.output or {}
-    assert output.get("report"), "No report produced"
-```
-
-### Boundary Condition
-```python
-@pytest.mark.asyncio
-async def test_minimum_sources(runner, auto_responder, mock_mode):
-    """Test at minimum source threshold."""
-    await auto_responder.start()
-    try:
-        result = await runner.run({"query": "niche topic"})
-    finally:
-        await auto_responder.stop()
-    assert result.success, f"Agent failed: {result.error}"
-    output = result.output or {}
-    sources = output.get("sources", [])
-    if isinstance(sources, list):
-        assert len(sources) >= 3, f"Expected >= 3 sources, got {len(sources)}"
-```
-
-### Error Handling
-```python
-@pytest.mark.asyncio
-async def test_empty_input(runner, auto_responder, mock_mode):
-    """Test graceful handling of empty input."""
-    await auto_responder.start()
-    try:
-        result = await runner.run({"query": ""})
-    finally:
-        await auto_responder.stop()
-    # Agent should either fail gracefully or produce an error message
-    output = result.output or {}
-    assert not result.success or output.get("error"), "Should handle empty input"
-```
-
-### Feedback Loop
-```python
-@pytest.mark.asyncio
-async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
-    """Test that feedback loops don't run forever."""
-    await auto_responder.start()
-    try:
-        result = await runner.run({"query": "test"})
-    finally:
-        await auto_responder.stop()
-    visits = result.node_visit_counts or {}
-    for node_id, count in visits.items():
-        assert count <= 5, f"Node {node_id} visited {count} times — possible infinite loop"
-```
-
---
-
-## MCP Tool Reference
-
-### Phase 1: Test Generation
-
-```python
-# Check existing tests
-list_tests(goal_id, agent_path)
-
-# Get constraint test guidelines (returns templates, NOT generated tests)
-generate_constraint_tests(goal_id, goal_json, agent_path)
-# Returns: output_file, file_header, test_template, constraints_formatted, test_guidelines
-
-# Get success criteria test guidelines
-generate_success_tests(goal_id, goal_json, node_names, tool_names, agent_path)
-# Returns: output_file, file_header, test_template, success_criteria_formatted, test_guidelines
-```
-
-### Phase 2: Execution
-
-```python
-# Automated regression (no checkpoints, fresh runs)
-run_tests(goal_id, agent_path, test_types='["all"]', parallel=-1, fail_fast=False)
-
-# Run only specific test types
-run_tests(goal_id, agent_path, test_types='["constraint"]')
-run_tests(goal_id, agent_path, test_types='["success"]')
-```
-
-```bash
-# Iterative debugging with checkpoints (via CLI)
-uv run hive run exports/{agent_name} --input '{"query": "test"}'
-```
-
-### Phase 3: Analysis
-
-```python
-# Debug a specific failed test
-debug_test(goal_id, test_name, agent_path)
-
-# Find failed sessions
-list_agent_sessions(agent_work_dir, status="failed", limit=5)
-
-# Inspect session state (excludes memory values)
-get_agent_session_state(agent_work_dir, session_id)
-
-# Inspect memory data
-get_agent_session_memory(agent_work_dir, session_id, key="research_results")
-
-# Runtime logs: L1 summaries
-query_runtime_logs(agent_work_dir, status="needs_attention")
-
-# Runtime logs: L2 per-node details
-query_runtime_log_details(agent_work_dir, run_id, needs_attention_only=True)
-
-# Runtime logs: L3 tool/LLM raw data
-query_runtime_log_raw(agent_work_dir, run_id, node_id="research")
-
-# Find clean checkpoints
-list_agent_checkpoints(agent_work_dir, session_id, is_clean="true")
-
-# Compare checkpoints (memory diff)
-compare_agent_checkpoints(agent_work_dir, session_id, cp_before, cp_after)
-```
-
-### Phase 5: Recovery
-
-```python
-# Inspect checkpoint before resuming
-get_agent_checkpoint(agent_work_dir, session_id, checkpoint_id)
-# Empty checkpoint_id = latest checkpoint
-```
-
-```bash
-# Resume from checkpoint via CLI (headless)
-uv run hive run exports/{agent_name} \
-  --resume-session {session_id} --checkpoint {checkpoint_id}
-```
-
---
-
-## Anti-Patterns
-
-| Don't | Do Instead |
-|-------|-----------|
-| Use `default_agent.run()` in tests | Use `runner.run()` with `auto_responder` fixtures (goes through AgentRuntime) |
-| Re-run entire agent when a late node fails | Resume from last clean checkpoint |
-| Treat `result.success` as goal achieved | Check `result.output` for actual criteria |
-| Access `result.output["key"]` directly | Use `result.output.get("key")` |
-| Fix random things hoping tests pass | Analyze L2/L3 logs to find root cause first |
-| Write 30+ tests | Write 8-15 focused tests |
-| Skip credential check | Use `/hive-credentials` before testing |
-| Confuse `exports/` with `~/.hive/agents/` | Code in `exports/`, runtime data in `~/.hive/` |
-| Use `run_tests` for iterative debugging | Use headless CLI with checkpoints for iterative debugging |
-| Use headless CLI for final regression | Use `run_tests` for automated regression |
-| Use `--tui` from Claude Code | Use headless `run` command — TUI hangs in non-interactive shells |
-| Test client-facing nodes from Claude Code | Use mock mode, or have the user run the agent in their terminal |
-| Run tests without reading goal first | Always understand the goal before writing tests |
-| Skip Phase 3 analysis and guess | Use session + log tools to identify root cause |
-
---
-
-## Example Walkthrough: Deep Research Agent
-
-A complete iteration showing the test loop for an agent with nodes: `intake → research → review → report`.
-
-### Phase 1: Generate tests
-
-```python
-# Read the goal
-Read(file_path="exports/deep_research_agent/agent.py")
-
-# Get success criteria test guidelines
-result = generate_success_tests(
-    goal_id="rigorous-interactive-research",
-    goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "target": ">=5"}, {"id": "citation-coverage", "target": "100%"}, {"id": "report-completeness", "target": "90%"}]}',
-    node_names="intake,research,review,report",
-    tool_names="web_search,web_scrape",
-    agent_path="exports/deep_research_agent"
-)
-
-# Write tests
-Write(
-    file_path=result["output_file"],
-    content=result["file_header"] + "\n\n" + test_code
-)
-```
-
-### Phase 2: First execution
-
-```python
-run_tests(
-    goal_id="rigorous-interactive-research",
-    agent_path="exports/deep_research_agent",
-    fail_fast=True
-)
-```
-
-Result: `test_success_source_diversity` fails — agent only found 2 sources instead of 5.
-
-### Phase 3: Analyze
-
-```python
-# Debug the failing test
-debug_test(
-    goal_id="rigorous-interactive-research",
-    test_name="test_success_source_diversity",
-    agent_path="exports/deep_research_agent"
-)
-# → ASSERTION_FAILURE: Expected >= 5 sources, got 2
-
-# Find the session
-list_agent_sessions(
-    agent_work_dir="~/.hive/agents/deep_research_agent",
-    status="completed",
-    limit=1
-)
-# → session_20260209_150000_abc12345
-
-# See what the research node produced
-get_agent_session_memory(
-    agent_work_dir="~/.hive/agents/deep_research_agent",
-    session_id="session_20260209_150000_abc12345",
-    key="research_results"
-)
-# → Only 2 web_search calls made, each returned 1 source
-
-# Check the LLM's behavior in the research node
-query_runtime_log_raw(
-    agent_work_dir="~/.hive/agents/deep_research_agent",
-    run_id="session_20260209_150000_abc12345",
-    node_id="research"
-)
-# → LLM called web_search only twice, then called set_output
-```
-
-Root cause: The research node's prompt doesn't tell the LLM to search for at least 5 diverse sources. It stops after the first couple of searches.
-
-### Phase 4: Fix the prompt
-
-```python
-Read(file_path="exports/deep_research_agent/nodes/__init__.py")
-
-Edit(
-    file_path="exports/deep_research_agent/nodes/__init__.py",
-    old_string='system_prompt="Search for information on the user\'s topic."',
-    new_string='system_prompt="Search for information on the user\'s topic. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries to ensure source diversity. Do not stop searching until you have at least 5 distinct sources."'
-)
-```
-
-### Phase 5: Resume from checkpoint
-
-For this example, the fix is to the `research` node. If we had run via CLI with checkpointing, we could resume from the checkpoint after `intake` to skip re-running intake:
-
-```bash
-# Check if clean checkpoint exists after intake (MCP tool call, not a shell command)
-list_agent_checkpoints(
-    agent_work_dir="~/.hive/agents/deep_research_agent",
-    session_id="session_20260209_150000_abc12345",
-    is_clean="true"
-)
-# → cp_node_complete_intake_150005
-
-# Resume from after intake, re-run research with fixed prompt
-uv run hive run exports/deep_research_agent \
-  --resume-session session_20260209_150000_abc12345 \
-  --checkpoint cp_node_complete_intake_150005
-```
-
-Or for this simple case (intake is fast), just re-run:
-
-```bash
-uv run hive run exports/deep_research_agent --input '{"topic": "test"}'
-```
-
-### Phase 6: Final verification
-
-```python
-run_tests(
-    goal_id="rigorous-interactive-research",
-    agent_path="exports/deep_research_agent"
-)
-# → All 12 tests pass
-```
-
---
-
-## Test File Structure
-
-```
-exports/{agent_name}/
-├── agent.py              ← Agent to test (goal, nodes, edges)
-├── nodes/__init__.py     ← Node implementations (prompts, config)
-├── config.py             ← Agent configuration
-├── mcp_servers.json      ← Tool server config
-└── tests/
-    ├── conftest.py           ← Shared fixtures + safe access helpers
-    ├── test_constraints.py   ← Constraint tests
-    ├── test_success_criteria.py  ← Success criteria tests
-    └── test_edge_cases.py    ← Edge case tests
-```
-
-## Integration with Other Skills
-
-| Scenario | From | To | Action |
-|----------|------|----|--------|
-| Agent built, ready to test | `/hive-create` | `/hive-test` | Generate tests, start loop |
-| Prompt fix needed | `/hive-test` Phase 4 | Direct edit | Edit `nodes/__init__.py`, resume |
-| Goal definition wrong | `/hive-test` Phase 4 | `/hive-create` | Update goal, may need rebuild |
-| Missing credentials | `/hive-test` Phase 3 | `/hive-credentials` | Set up credentials |
-| Complex runtime failure | `/hive-test` Phase 3 | `/hive-debugger` | Deep L1/L2/L3 analysis |
-| All tests pass | `/hive-test` Phase 6 | Done | Agent validated |
@@ -1,333 +0,0 @@
-# Example: Iterative Testing of a Research Agent
-
-This example walks through the full iterative test loop for a research agent that searches the web, reviews findings, and produces a cited report.
-
-## Agent Structure
-
-```
-exports/deep_research_agent/
-├── agent.py          # Goal + graph: intake → research → review → report
-├── nodes/__init__.py # Node definitions (system_prompt, input/output keys)
-├── config.py         # Model config
-├── mcp_servers.json  # Tools: web_search, web_scrape
-└── tests/            # Test files (we'll create these)
-```
-
-**Goal:** "Rigorous Interactive Research" — find 5+ diverse sources, cite every claim, produce a complete report.
-
---
-
-## Phase 1: Generate Tests
-
-### Read the goal
-
-```python
-Read(file_path="exports/deep_research_agent/agent.py")
-# Extract: goal_id="rigorous-interactive-research"
-# success_criteria: source-diversity (>=5), citation-coverage (100%), report-completeness (90%)
-# constraints: no-hallucination, source-attribution
-```
-
-### Get test guidelines
-
-```python
-result = generate_success_tests(
-    goal_id="rigorous-interactive-research",
-    goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "description": "Use multiple diverse sources", "target": ">=5"}, {"id": "citation-coverage", "description": "Every claim cites its source", "target": "100%"}, {"id": "report-completeness", "description": "Report answers the research questions", "target": "90%"}]}',
-    node_names="intake,research,review,report",
-    tool_names="web_search,web_scrape",
-    agent_path="exports/deep_research_agent"
-)
-```
-
-### Write tests
-
-```python
-Write(
-    file_path="exports/deep_research_agent/tests/test_success_criteria.py",
-    content=result["file_header"] + '''
-
-@pytest.mark.asyncio
-async def test_success_source_diversity(runner, auto_responder, mock_mode):
-    """At least 5 diverse sources are found."""
-    await auto_responder.start()
-    try:
-        result = await runner.run({"query": "impact of remote work on productivity"})
-    finally:
-        await auto_responder.stop()
-    assert result.success, f"Agent failed: {result.error}"
-    output = result.output or {}
-    sources = output.get("sources", [])
-    if isinstance(sources, list):
-        assert len(sources) >= 5, f"Expected >= 5 sources, got {len(sources)}"
-
-@pytest.mark.asyncio
-async def test_success_citation_coverage(runner, auto_responder, mock_mode):
-    """Every factual claim in the report cites its source."""
-    await auto_responder.start()
-    try:
-        result = await runner.run({"query": "climate change effects on agriculture"})
-    finally:
-        await auto_responder.stop()
-    assert result.success, f"Agent failed: {result.error}"
-    output = result.output or {}
-    report = output.get("report", "")
-    # Check that report contains numbered references
-    assert "[1]" in str(report) or "[source" in str(report).lower(), "Report lacks citations"
-
-@pytest.mark.asyncio
-async def test_success_report_completeness(runner, auto_responder, mock_mode):
-    """Report addresses the original research question."""
-    query = "pros and cons of nuclear energy"
-    await auto_responder.start()
-    try:
-        result = await runner.run({"query": query})
-    finally:
-        await auto_responder.stop()
-    assert result.success, f"Agent failed: {result.error}"
-    output = result.output or {}
-    report = output.get("report", "")
-    assert len(str(report)) > 200, f"Report too short: {len(str(report))} chars"
-
-@pytest.mark.asyncio
-async def test_empty_query_handling(runner, auto_responder, mock_mode):
-    """Agent handles empty input gracefully."""
-    await auto_responder.start()
-    try:
-        result = await runner.run({"query": ""})
-    finally:
-        await auto_responder.stop()
-    output = result.output or {}
-    assert not result.success or output.get("error"), "Should handle empty query"
-
-@pytest.mark.asyncio
-async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
-    """Feedback loop between review and research terminates."""
-    await auto_responder.start()
-    try:
-        result = await runner.run({"query": "quantum computing basics"})
-    finally:
-        await auto_responder.stop()
-    visits = result.node_visit_counts or {}
-    for node_id, count in visits.items():
-        assert count <= 5, f"Node {node_id} visited {count} times"
-'''
-)
-```
-
---
-
-## Phase 2: First Execution
-
-```python
-run_tests(
-    goal_id="rigorous-interactive-research",
-    agent_path="exports/deep_research_agent",
-    fail_fast=True
-)
-```
-
-**Result:**
-```json
-{
-  "overall_passed": false,
-  "summary": {"total": 5, "passed": 3, "failed": 2, "pass_rate": "60.0%"},
-  "failures": [
-    {"test_name": "test_success_source_diversity", "details": "AssertionError: Expected >= 5 sources, got 2"},
-    {"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
-  ]
-}
-```
-
---
-
-## Phase 3: Analyze (Iteration 1)
-
-### Debug the first failure
-
-```python
-debug_test(
-    goal_id="rigorous-interactive-research",
-    test_name="test_success_source_diversity",
-    agent_path="exports/deep_research_agent"
-)
-# Category: ASSERTION_FAILURE — Expected >= 5 sources, got 2
-```
-
-### Find the session and inspect memory
-
-```python
-list_agent_sessions(
-    agent_work_dir="~/.hive/agents/deep_research_agent",
-    status="completed",
-    limit=1
-)
-# → session_20260209_150000_abc12345
-
-get_agent_session_memory(
-    agent_work_dir="~/.hive/agents/deep_research_agent",
-    session_id="session_20260209_150000_abc12345",
-    key="research_results"
-)
-# → Only 2 sources found. LLM stopped searching after 2 queries.
-```
-
-### Check LLM behavior in the research node
-
-```python
-query_runtime_log_raw(
-    agent_work_dir="~/.hive/agents/deep_research_agent",
-    run_id="session_20260209_150000_abc12345",
-    node_id="research"
-)
-# → LLM called web_search twice, got results, immediately called set_output.
-# → Prompt doesn't instruct it to find at least 5 sources.
-```
-
-**Root cause:** The research node's system_prompt doesn't specify minimum source requirements.
-
---
-
-## Phase 4: Fix (Iteration 1)
-
-```python
-Read(file_path="exports/deep_research_agent/nodes/__init__.py")
-
-# Fix the research node prompt
-Edit(
-    file_path="exports/deep_research_agent/nodes/__init__.py",
-    old_string='system_prompt="Search for information on the user\'s topic using web search."',
-    new_string='system_prompt="Search for information on the user\'s topic using web search. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries with varied keywords. Do NOT call set_output until you have gathered at least 5 distinct sources from different domains."'
-)
-```
-
---
-
-## Phase 5: Recover & Resume (Iteration 1)
-
-The fix is to the `research` node. Since this was a `run_tests` execution (no checkpoints), we re-run from scratch:
-
-```python
-run_tests(
-    goal_id="rigorous-interactive-research",
-    agent_path="exports/deep_research_agent",
-    fail_fast=True
-)
-```
-
-**Result:**
-```json
-{
-  "overall_passed": false,
-  "summary": {"total": 5, "passed": 4, "failed": 1, "pass_rate": "80.0%"},
-  "failures": [
-    {"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
-  ]
-}
-```
-
-Source diversity now passes. Citation coverage still fails.
-
---
-
-## Phase 3: Analyze (Iteration 2)
-
-```python
-debug_test(
-    goal_id="rigorous-interactive-research",
-    test_name="test_success_citation_coverage",
-    agent_path="exports/deep_research_agent"
-)
-# Category: ASSERTION_FAILURE — Report lacks citations
-
-# Check what the report node produced
-list_agent_sessions(
-    agent_work_dir="~/.hive/agents/deep_research_agent",
-    status="completed",
-    limit=1
-)
-# → session_20260209_151500_def67890
-
-get_agent_session_memory(
-    agent_work_dir="~/.hive/agents/deep_research_agent",
-    session_id="session_20260209_151500_def67890",
-    key="report"
-)
-# → Report text exists but uses no numbered references.
-# → Sources are in memory but report node doesn't cite them.
-```
-
-**Root cause:** The report node's prompt doesn't instruct the LLM to include numbered citations.
-
---
-
-## Phase 4: Fix (Iteration 2)
-
-```python
-Edit(
-    file_path="exports/deep_research_agent/nodes/__init__.py",
-    old_string='system_prompt="Write a comprehensive report based on the research findings."',
-    new_string='system_prompt="Write a comprehensive report based on the research findings. You MUST include numbered citations [1], [2], etc. for every factual claim. At the end, include a References section listing all sources with their URLs. Every claim must be traceable to a specific source."'
-)
-```
-
---
-
-## Phase 5: Resume (Iteration 2)
-
-The fix is to the `report` node (the last node). To demonstrate checkpoint recovery, run via CLI:
-
-```bash
-# Run via CLI to get checkpoints
-uv run hive run exports/deep_research_agent --input '{"topic": "climate change effects"}'
-
-# After it runs, find the clean checkpoint before report (MCP tool call, not a shell command)
-list_agent_checkpoints(
-    agent_work_dir="~/.hive/agents/deep_research_agent",
-    session_id="session_20260209_152000_ghi34567",
-    is_clean="true"
-)
-# → cp_node_complete_review_152100 (after review, before report)
-
-# Resume — skips intake, research, review entirely
-uv run hive run exports/deep_research_agent \
-  --resume-session session_20260209_152000_ghi34567 \
-  --checkpoint cp_node_complete_review_152100
-```
-
-Only the `report` node re-runs with the fixed prompt, using research data from the checkpoint.
-
---
-
-## Phase 6: Final Verification
-
-```python
-run_tests(
-    goal_id="rigorous-interactive-research",
-    agent_path="exports/deep_research_agent"
-)
-```
-
-**Result:**
-```json
-{
-  "overall_passed": true,
-  "summary": {"total": 5, "passed": 5, "failed": 0, "pass_rate": "100.0%"}
-}
-```
-
-All tests pass.
-
---
-
-## Summary
-
-| Iteration | Failure | Root Cause | Fix | Recovery |
-|-----------|---------|------------|-----|----------|
-| 1 | Source diversity (2 < 5) | Research prompt too vague | Added "at least 5 sources" to prompt | Re-run (no checkpoints) |
-| 2 | No citations in report | Report prompt lacks citation instructions | Added citation requirements | Checkpoint resume (skipped 3 nodes) |
-
-**Key takeaways:**
- Phase 3 analysis (session memory + L3 logs) identified root causes without guessing
- Checkpoint recovery in iteration 2 saved time by skipping 3 expensive nodes
- Final `run_tests` confirms all scenarios pass end-to-end
@@ -1,526 +0,0 @@
---
-name: hive
-description: Complete workflow for building, implementing, and testing goal-driven agents. Orchestrates hive-* skills. Use when starting a new agent project, unsure which skill to use, or need end-to-end guidance.
-license: Apache-2.0
-metadata:
-  author: hive
-  version: "2.0"
-  type: workflow-orchestrator
-  orchestrates:
-    - hive-concepts
-    - hive-create
-    - hive-patterns
-    - hive-test
-    - hive-credentials
-    - hive-debugger
---
-
-# Agent Development Workflow
-
-**THIS IS AN EXECUTABLE WORKFLOW. DO NOT explore the codebase or read source files. ROUTE to the correct skill IMMEDIATELY.**
-
-When this skill is loaded, **ALWAYS use the AskUserQuestion tool** to present options:
-
-```
-Use AskUserQuestion with these options:
- "Build a new agent" → Then invoke /hive-create
- "Test an existing agent" → Then invoke /hive-test
- "Learn agent concepts" → Then invoke /hive-concepts
- "Optimize agent design" → Then invoke /hive-patterns
- "Set up credentials" → Then invoke /hive-credentials
- "Debug a failing agent" → Then invoke /hive-debugger
- "Other" (please describe what you want to achieve)
-```
-
-**DO NOT:** Read source files, explore the codebase, search for code, or do any investigation before routing. The sub-skills handle all of that.
-
---
-
-Complete Standard Operating Procedure (SOP) for building production-ready goal-driven agents.
-
-## Overview
-
-This workflow orchestrates specialized skills to take you from initial concept to production-ready agent:
-
-1. **Understand Concepts** → `/hive-concepts` (optional)
-2. **Build Structure** → `/hive-create`
-3. **Optimize Design** → `/hive-patterns` (optional)
-4. **Setup Credentials** → `/hive-credentials` (if agent uses tools requiring API keys)
-5. **Test & Validate** → `/hive-test`
-6. **Debug Issues** → `/hive-debugger` (if agent fails at runtime)
-
-## When to Use This Workflow
-
-Use this meta-skill when:
- Starting a new agent from scratch
- Unclear which skill to use first
- Need end-to-end guidance for agent development
- Want consistent, repeatable agent builds
-
-**Skip this workflow** if:
- You only need to test an existing agent → use `/hive-test` directly
- You know exactly which phase you're in → use specific skill directly
-
-## Quick Decision Tree
-
-```
-"Need to understand agent concepts" → hive-concepts
-"Build a new agent" → hive-create
-"Optimize my agent design" → hive-patterns
-"Need client-facing nodes or feedback loops" → hive-patterns
-"Set up API keys for my agent" → hive-credentials
-"Test my agent" → hive-test
-"My agent is failing/stuck/has errors" → hive-debugger
-"Not sure what I need" → Read phases below, then decide
-"Agent has structure but needs implementation" → See agent directory STATUS.md
-```
-
-## Phase 0: Understand Concepts (Optional)
-
-**Skill**: `/hive-concepts`
-**Input**: Questions about agent architecture
-
-### When to Use
-
- First time building an agent
- Need to understand node types, edges, goals
- Want to validate tool availability
- Learning about event loop architecture and client-facing nodes
-
-### What This Phase Provides
-
- Architecture overview (Python packages, not JSON)
- Core concepts (Goal, Node, Edge, Event Loop, Judges)
- Tool discovery and validation procedures
- Workflow overview
-
-**Skip this phase** if you already understand agent fundamentals.
-
-## Phase 1: Build Agent Structure
-
-**Skill**: `/hive-create`
-**Input**: User requirements ("Build an agent that...") or a template to start from
-
-### What This Phase Does
-
-Creates the complete agent architecture:
- Package structure (`exports/agent_name/`)
- Goal with success criteria and constraints
- Workflow graph (nodes and edges)
- Node specifications
- CLI interface
- Documentation
-
-### Process
-
-1. **Create package** - Directory structure with skeleton files
-2. **Define goal** - Success criteria and constraints written to agent.py
-3. **Design nodes** - Each node approved and written incrementally
-4. **Connect edges** - Workflow graph with conditional routing
-5. **Finalize** - Agent class, exports, and documentation
-
-### Outputs
-
- ✅ `exports/agent_name/` package created
- ✅ Goal defined in agent.py
- ✅ 3-5 success criteria defined
- ✅ 1-5 constraints defined
- ✅ 5-10 nodes specified in nodes/__init__.py
- ✅ 8-15 edges connecting workflow
- ✅ Validated structure (passes `PYTHONPATH=exports uv run python -m agent_name validate`)
- ✅ README.md with usage instructions
- ✅ CLI commands (info, validate, run, shell)
-
-### Success Criteria
-
-You're ready for Phase 2 when:
- Agent structure validates without errors
- All nodes and edges are defined
- CLI commands work (info, validate)
- You see: "Agent complete: exports/agent_name/"
-
-### Common Outputs
-
-The hive-create skill produces:
-```
-exports/agent_name/
-├── __init__.py          (package exports)
-├── __main__.py          (CLI interface)
-├── agent.py             (goal, graph, agent class)
-├── nodes/__init__.py    (node specifications)
-├── config.py            (configuration)
-├── implementations.py   (may be created for Python functions)
-└── README.md            (documentation)
-```
-
-### Next Steps
-
-**If structure complete and validated:**
-→ Check `exports/agent_name/STATUS.md` or `IMPLEMENTATION_GUIDE.md`
-→ These files explain implementation options
-→ You may need to add Python functions or MCP tools (not covered by current skills)
-
-**If you want to optimize design:**
-→ Proceed to Phase 1.5 (hive-patterns)
-
-**If ready to test:**
-→ Proceed to Phase 2
-
-## Phase 1.5: Optimize Design (Optional)
-
-**Skill**: `/hive-patterns`
-**Input**: Completed agent structure
-
-### When to Use
-
- Want to add client-facing blocking or feedback edges
- Need judge patterns for output validation
- Want fan-out/fan-in (parallel execution)
- Need error handling patterns
- Want best practices guidance
-
-### What This Phase Provides
-
- Client-facing interaction patterns
- Feedback edge routing with nullable output keys
- Judge patterns (implicit, SchemaJudge)
- Fan-out/fan-in parallel execution
- Context management and spillover patterns
- Anti-patterns to avoid
-
-**Skip this phase** if your agent design is straightforward.
-
-## Phase 2: Test & Validate
-
-**Skill**: `/hive-test`
-**Input**: Working agent from Phase 1
-
-### What This Phase Does
-
-Guides the creation and execution of a comprehensive test suite:
- Constraint tests
- Success criteria tests
- Edge case tests
- Integration tests
-
-### Process
-
-1. **Analyze agent** - Read goal, constraints, success criteria
-2. **Generate tests** - The calling agent writes pytest files in `exports/agent_name/tests/` using hive-test guidelines and templates
-3. **User approval** - Review and approve each test
-4. **Run evaluation** - Execute tests and collect results
-5. **Debug failures** - Identify and fix issues
-6. **Iterate** - Repeat until all tests pass
-
-### Outputs
-
- ✅ Test files in `exports/agent_name/tests/`
- ✅ Test report with pass/fail metrics
- ✅ Coverage of all success criteria
- ✅ Coverage of all constraints
- ✅ Edge case handling verified
-
-### Success Criteria
-
-You're done when:
- All tests pass
- All success criteria validated
- All constraints verified
- Agent handles edge cases
- Test coverage is comprehensive
-
-### Next Steps
-
-**Agent ready for:**
- Production deployment
- Integration into larger systems
- Documentation and handoff
- Continuous monitoring
-
-## Phase Transitions
-
-### From Phase 1 to Phase 2
-
-**Trigger signals:**
- "Agent complete: exports/..."
- Structure validation passes
- README indicates implementation complete
-
-**Before proceeding:**
- Verify agent can be imported: `from exports.agent_name import default_agent`
- Check if implementation is needed (see STATUS.md or IMPLEMENTATION_GUIDE.md)
- Confirm agent executes without import errors
-
-### Skipping Phases
-
-**When to skip Phase 1:**
- Agent structure already exists
- Only need to add tests
- Modifying existing agent
-
-**When to skip Phase 2:**
- Prototyping or exploring
- Agent not production-bound
- Manual testing sufficient
-
-## Common Patterns
-
-### Pattern 1: Complete New Build (Simple)
-
-```
-User: "Build an agent that monitors files"
-→ Use /hive-create
-→ Agent structure created
-→ Use /hive-test
-→ Tests created and passing
-→ Done: Production-ready agent
-```
-
-### Pattern 1b: Complete New Build (With Learning)
-
-```
-User: "Build an agent (first time)"
-→ Use /hive-concepts (understand concepts)
-→ Use /hive-create (build structure)
-→ Use /hive-patterns (optimize design)
-→ Use /hive-test (validate)
-→ Done: Production-ready agent
-```
-
-### Pattern 1c: Build from Template
-
-```
-User: "Build an agent based on the deep research template"
-→ Use /hive-create
-→ Select "From a template" path
-→ Pick template, name new agent
-→ Review/modify goal, nodes, graph
-→ Agent exported with customizations
-→ Use /hive-test
-→ Done: Customized agent
-```
-
-### Pattern 2: Test Existing Agent
-
-```
-User: "Test my agent at exports/my_agent"
-→ Skip Phase 1
-→ Use /hive-test directly
-→ Tests created
-→ Done: Validated agent
-```
-
-### Pattern 3: Iterative Development
-
-```
-User: "Build an agent"
-→ Use /hive-create (Phase 1)
-→ Implementation needed (see STATUS.md)
-→ [User implements functions]
-→ Use /hive-test (Phase 2)
-→ Tests reveal bugs
-→ [Fix bugs manually]
-→ Re-run tests
-→ Done: Working agent
-```
-
-### Pattern 4: Agent with Review Loops and HITL Checkpoints
-
-```
-User: "Build an agent with human review and feedback loops"
-→ Use /hive-concepts (learn event loop, client-facing nodes)
-→ Use /hive-create (build structure with feedback edges)
-→ Use /hive-patterns (implement client-facing + feedback patterns)
-→ Use /hive-test (validate review flows and edge routing)
-→ Done: Agent with HITL checkpoints and review loops
-```
-
-## Skill Dependencies
-
-```
-hive (meta-skill)
-    │
-    ├── hive-concepts (foundational)
-    │   ├── Architecture concepts (event loop, judges)
-    │   ├── Node types (event_loop, function)
-    │   ├── Edge routing and priority
-    │   ├── Tool discovery procedures
-    │   └── Workflow overview
-    │
-    ├── hive-create (procedural)
-    │   ├── Creates package structure
-    │   ├── Defines goal
-    │   ├── Adds nodes (event_loop, function)
-    │   ├── Connects edges with priority routing
-    │   ├── Finalizes agent class
-    │   └── Requires: hive-concepts
-    │
-    ├── hive-patterns (reference)
-    │   ├── Client-facing interaction patterns
-    │   ├── Feedback edges and review loops
-    │   ├── Judge patterns (implicit, SchemaJudge)
-    │   ├── Fan-out/fan-in parallel execution
-    │   └── Context management and anti-patterns
-    │
-    ├── hive-credentials (utility)
-    │   ├── Detects missing credentials
-    │   ├── Offers auth method choices (Aden OAuth, direct API key)
-    │   ├── Stores securely in ~/.hive/credentials
-    │   └── Validates with health checks
-    │
-    ├── hive-test (validation)
-    │   ├── Reads agent goal
-    │   ├── Generates tests
-    │   ├── Runs evaluation
-    │   └── Reports results
-    │
-    └── hive-debugger (troubleshooting)
-        ├── Monitors runtime logs (L1/L2/L3)
-        ├── Identifies retry loops, tool failures
-        ├── Categorizes issues (10 categories)
-        └── Provides fix recommendations
-```
-
-## Troubleshooting
-
-### "Agent structure won't validate"
-
- Check node IDs match between nodes/__init__.py and agent.py
- Verify all edges reference valid node IDs
- Ensure entry_node exists in nodes list
- Run: `PYTHONPATH=exports uv run python -m agent_name validate`
-
-### "Agent has structure but won't run"
-
- Check for STATUS.md or IMPLEMENTATION_GUIDE.md in agent directory
- Implementation may be needed (Python functions or MCP tools)
- This is expected - hive-create creates structure, not implementation
- See implementation guide for completion options
-
-### "Tests are failing"
-
- Review test output for specific failures
- Check agent goal and success criteria
- Verify constraints are met
- Use `/hive-test` to debug and iterate
- Fix agent code and re-run tests
-
-### "Agent is failing at runtime"
-
- Use `/hive-debugger` to analyze runtime logs
- The debugger identifies retry loops, tool failures, and stalled execution
- Get actionable fix recommendations with code changes
- Monitor the agent in real-time during TUI sessions
-
-### "Not sure which phase I'm in"
-
-Run these checks:
-
-```bash
-# Check if agent structure exists
-ls exports/my_agent/agent.py
-
-# Check if it validates
-PYTHONPATH=exports uv run python -m my_agent validate
-
-# Check if tests exist
-ls exports/my_agent/tests/
-
-# If structure exists and validates → Phase 2 (testing)
-# If structure doesn't exist → Phase 1 (building)
-# If tests exist but failing → Debug phase
-```
-
-## Best Practices
-
-### For Phase 1 (Building)
-
-1. **Start with clear requirements** - Know what the agent should do
-2. **Define success criteria early** - Measurable goals drive design
-3. **Keep nodes focused** - One responsibility per node
-4. **Use descriptive names** - Node IDs should explain purpose
-5. **Validate incrementally** - Check structure after each major addition
-
-### For Phase 2 (Testing)
-
-1. **Test constraints first** - Hard requirements must pass
-2. **Mock external dependencies** - Use mock mode for LLMs/APIs
-3. **Cover edge cases** - Test failures, not just success paths
-4. **Iterate quickly** - Fix one test at a time
-5. **Document test patterns** - Future tests follow same structure
-
-### General Workflow
-
-1. **Use version control** - Git commit after each phase
-2. **Document decisions** - Update README with changes
-3. **Keep iterations small** - Build → Test → Fix → Repeat
-4. **Preserve working states** - Tag successful iterations
-5. **Learn from failures** - Failed tests reveal design issues
-
-## Exit Criteria
-
-You're done with the workflow when:
-
-✅ Agent structure validates
-✅ All tests pass
-✅ Success criteria met
-✅ Constraints verified
-✅ Documentation complete
-✅ Agent ready for deployment
-
-## Additional Resources
-
- **hive-concepts**: See `.claude/skills/hive-concepts/SKILL.md`
- **hive-create**: See `.claude/skills/hive-create/SKILL.md`
- **hive-patterns**: See `.claude/skills/hive-patterns/SKILL.md`
- **hive-test**: See `.claude/skills/hive-test/SKILL.md`
- **Agent framework docs**: See `core/README.md`
- **Example agents**: See `exports/` directory
-
-## Summary
-
-This workflow provides a proven path from concept to production-ready agent:
-
-1. **Learn** with `/hive-concepts` → Understand fundamentals (optional)
-2. **Build** with `/hive-create` → Get validated structure
-3. **Optimize** with `/hive-patterns` → Apply best practices (optional)
-4. **Configure** with `/hive-credentials` → Set up API keys (if needed)
-5. **Test** with `/hive-test` → Get verified functionality
-6. **Debug** with `/hive-debugger` → Fix runtime issues (if needed)
-
-The workflow is **flexible** - skip phases as needed, iterate freely, and adapt to your specific requirements. The goal is **production-ready agents** built with **consistent, repeatable processes**.
-
-## Skill Selection Guide
-
-**Choose hive-concepts when:**
- First time building agents
- Need to understand event loop architecture
- Validating tool availability
- Learning about node types, edges, and judges
-
-**Choose hive-create when:**
- Actually building an agent
- Have clear requirements
- Ready to write code
- Want step-by-step guidance
- Want to start from an existing template and customize it
-
-**Choose hive-patterns when:**
- Agent structure complete
- Need client-facing nodes or feedback edges
- Implementing review loops or fan-out/fan-in
- Want judge patterns or context management
- Want best practices
-
-**Choose hive-test when:**
- Agent structure complete
- Ready to validate functionality
- Need comprehensive test coverage
- Testing feedback loops, output keys, or fan-out
-
-**Choose hive-debugger when:**
- Agent is failing or stuck at runtime
- Seeing retry loops or escalations
- Tool calls are failing
- Need to understand why a node isn't completing
- Want real-time monitoring of agent execution
@@ -1,199 +0,0 @@
-# Example: File Monitor Agent
-
-This example shows the complete /hive workflow in action for building a file monitoring agent.
-
-## Initial Request
-
-```
-User: "Build an agent that monitors ~/Downloads and copies new files to ~/Documents"
-```
-
-## Phase 1: Building (20 minutes)
-
-### Step 1: Create Structure
-
-Agent invokes `/hive-create` skill and:
-
-1. Creates `exports/file_monitor_agent/` package
-2. Writes skeleton files (__init__.py, __main__.py, agent.py, etc.)
-
-**Output**: Package structure visible immediately
-
-### Step 2: Define Goal
-
-```python
-goal = Goal(
-    id="file-monitor-copy",
-    name="Automated File Monitor & Copy",
-    success_criteria=[
-        # 100% detection rate
-        # 100% copy success
-        # 100% conflict resolution
-        # >99% uptime
-    ],
-    constraints=[
-        # Preserve originals
-        # Handle errors gracefully
-        # Track state
-        # Respect permissions
-    ]
-)
-```
-
-**Output**: Goal written to agent.py
-
-### Step 3: Design Nodes
-
-7 nodes approved and written incrementally:
-
-1. `initialize-state` - Set up tracking
-2. `list-downloads` - Scan directory
-3. `identify-new-files` - Find new files
-4. `check-for-new-files` - Router
-5. `copy-files` - Copy with conflict resolution
-6. `update-state` - Mark as processed
-7. `wait-interval` - Sleep between cycles
-
-**Output**: All nodes in nodes/__init__.py
-
-### Step 4: Connect Edges
-
-8 edges connecting the workflow loop:
-
-```
-initialize → list → identify → check
-                                ↓  ↓
-                              copy  wait
-                                ↓    ↑
-                              update ↓
-                                ↓    ↓
-                              wait → list (loop)
-```
-
-**Output**: Edges written to agent.py
-
-### Step 5: Finalize
-
-```bash
-$ PYTHONPATH=exports uv run python -m file_monitor_agent validate
-✓ Agent is valid
-
-$ PYTHONPATH=exports uv run python -m file_monitor_agent info
-Agent: File Monitor & Copy Agent
-Nodes: 7
-Edges: 8
-```
-
-**Phase 1 Complete**: Structure validated ✅
-
-### Status After Phase 1
-
-```
-exports/file_monitor_agent/
-├── __init__.py          ✅ (exports)
-├── __main__.py          ✅ (CLI)
-├── agent.py             ✅ (goal, graph, agent class)
-├── nodes/__init__.py    ✅ (7 nodes)
-├── config.py            ✅ (configuration)
-├── implementations.py   ✅ (Python functions)
-├── README.md            ✅ (documentation)
-├── IMPLEMENTATION_GUIDE.md ✅ (next steps)
-└── STATUS.md            ✅ (current state)
-```
-
-**Note**: Implementation gap exists - data flow needs connection (covered in STATUS.md)
-
-## Phase 2: Testing (25 minutes)
-
-### Step 1: Analyze Agent
-
-Agent invokes `/hive-test` skill and:
-
-1. Reads goal from `exports/file_monitor_agent/agent.py`
-2. Identifies 4 success criteria to test
-3. Identifies 4 constraints to verify
-4. Plans test coverage
-
-### Step 2: Generate Tests
-
-Creates test files:
-
-```
-exports/file_monitor_agent/tests/
-├── conftest.py              (fixtures)
-├── test_constraints.py      (4 constraint tests)
-├── test_success_criteria.py (4 success tests)
-└── test_edge_cases.py       (error handling)
-```
-
-Tests approved incrementally by user.
-
-### Step 3: Run Tests
-
-```bash
-$ PYTHONPATH=exports uv run pytest exports/file_monitor_agent/tests/
-
-test_constraints.py::test_preserves_originals     PASSED
-test_constraints.py::test_handles_errors          PASSED
-test_constraints.py::test_tracks_state            PASSED
-test_constraints.py::test_respects_permissions    PASSED
-
-test_success_criteria.py::test_detects_all_files  PASSED
-test_success_criteria.py::test_copies_all_files   PASSED
-test_success_criteria.py::test_resolves_conflicts PASSED
-test_success_criteria.py::test_continuous_run     PASSED
-
-test_edge_cases.py::test_empty_directory          PASSED
-test_edge_cases.py::test_permission_denied        PASSED
-test_edge_cases.py::test_disk_full                PASSED
-test_edge_cases.py::test_large_files              PASSED
-
-========================== 12 passed in 3.42s ==========================
-```
-
-**Phase 2 Complete**: All tests pass ✅
-
-## Final Output
-
-**Production-Ready Agent:**
-
-```bash
-# Run the agent
-./RUN_AGENT.sh
-
-# Or manually
-PYTHONPATH=exports uv run python -m file_monitor_agent run
-```
-
-**Capabilities:**
- Monitors ~/Downloads continuously
- Copies new files to ~/Documents
- Resolves conflicts with timestamps
- Handles errors gracefully
- Tracks processed files
- Runs as background service
-
-**Total Time**: ~45 minutes from concept to production
-
-## Key Learnings
-
-1. **Incremental building** - Files written immediately, visible throughout
-2. **Validation early** - Structure validated before moving to implementation
-3. **Test-driven** - Tests reveal real behavior
-4. **Documentation included** - README, STATUS, and guides auto-generated
-5. **Repeatable process** - Same workflow for any agent type
-
-## Variations
-
-**For simpler agents:**
- Fewer nodes (3-5 instead of 7)
- Simpler workflow (linear instead of looping)
- Faster build time (10-15 minutes)
-
-**For complex agents:**
- More nodes (10-15+)
- Multiple subgraphs
- Pause/resume points for human-in-the-loop
- Longer build time (45-60 minutes)
-
-The workflow scales to your needs!
@@ -1,7 +0,0 @@
-# Project-level Codex config for Hive.
-# Keep this file minimal: MCP connectivity + skill discovery.
-
-[mcp_servers.agent-builder]
-command = "uv"
-args = ["run", "--directory", "core", "-m", "framework.mcp.agent_builder_server"]
-cwd = "."
@@ -1,20 +0,0 @@
-{
-  "mcpServers": {
-    "agent-builder": {
-      "command": "python",
-      "args": ["-m", "framework.mcp.agent_builder_server"],
-      "cwd": "core",
-      "env": {
-        "PYTHONPATH": "../tools/src"
-      }
-    },
-    "tools": {
-      "command": "python",
-      "args": ["mcp_server.py", "--stdio"],
-      "cwd": "tools",
-      "env": {
-        "PYTHONPATH": "src"
-      }
-    }
-  }
-}
@@ -1 +0,0 @@
-../../.claude/skills/hive
@@ -1 +0,0 @@
-../../.claude/skills/hive-concepts
@@ -1 +0,0 @@
-../../.claude/skills/hive-create
@@ -1 +0,0 @@
-../../.claude/skills/hive-credentials
@@ -1 +0,0 @@
-../../.claude/skills/hive-patterns
@@ -1 +0,0 @@
-../../.claude/skills/hive-test
@@ -0,0 +1,89 @@
+name: Integration Bounty
+description: A bounty task for the integration contribution program
+title: "[Bounty]: "
+labels: []
+body:
+  - type: markdown
+    attributes:
+      value: |
+        ## Integration Bounty
+
+        This issue is part of the [Integration Bounty Program](../../docs/bounty-program/README.md).
+        **Claim this bounty** by commenting below — a maintainer will assign you within 24 hours.
+
+  - type: dropdown
+    id: bounty-type
+    attributes:
+      label: Bounty Type
+      options:
+        - "Test a Tool (20 pts)"
+        - "Write Docs (20 pts)"
+        - "Code Contribution (30 pts)"
+        - "New Integration (75 pts)"
+    validations:
+      required: true
+
+  - type: dropdown
+    id: difficulty
+    attributes:
+      label: Difficulty
+      options:
+        - Easy
+        - Medium
+        - Hard
+    validations:
+      required: true
+
+  - type: input
+    id: tool-name
+    attributes:
+      label: Tool Name
+      description: The integration this bounty targets (e.g., `airtable`, `salesforce`)
+      placeholder: e.g., airtable
+    validations:
+      required: true
+
+  - type: textarea
+    id: description
+    attributes:
+      label: Description
+      description: What needs to be done to complete this bounty.
+      placeholder: |
+        Describe the specific task, including:
+        - What the contributor needs to do
+        - Links to relevant files in the repo
+        - Any setup requirements (API keys, accounts, etc.)
+    validations:
+      required: true
+
+  - type: textarea
+    id: acceptance-criteria
+    attributes:
+      label: Acceptance Criteria
+      description: What "done" looks like. The PR or report must meet all criteria.
+      placeholder: |
+        - [ ] Criterion 1
+        - [ ] Criterion 2
+        - [ ] CI passes
+    validations:
+      required: true
+
+  - type: textarea
+    id: relevant-files
+    attributes:
+      label: Relevant Files
+      description: Links to tool directory, credential spec, health check file, etc.
+      placeholder: |
+        - Tool: `tools/src/aden_tools/tools/{tool_name}/`
+        - Credential spec: `tools/src/aden_tools/credentials/{category}.py`
+        - Health checks: `tools/src/aden_tools/credentials/health_check.py`
+
+  - type: textarea
+    id: resources
+    attributes:
+      label: Resources
+      description: Links to API docs, examples, or guides that will help the contributor.
+      placeholder: |
+        - [Building Tools Guide](../../tools/BUILDING_TOOLS.md)
+        - [Tool README Template](../../docs/bounty-program/templates/tool-readme-template.md)
+        - API docs: https://...
@@ -0,0 +1,37 @@
+# Awards points and notifies Discord when a bounty PR is merged.
+# (Documented as a comment: `description` is not a valid top-level key in a
+# GitHub Actions workflow file.)
+name: Bounty completed
+
+# NOTE(review): `pull_request` runs triggered from forks do not receive
+# repository secrets, so the Discord/Lurkr values below would be empty for
+# fork PRs — confirm how scripts/bounty-tracker.ts behaves in that case.
+on:
+  pull_request:
+    types: [closed]
+
+jobs:
+  bounty-notify:
+    # Only run for PRs that were actually merged (not just closed) and that
+    # carry at least one label containing "bounty:".
+    if: >
+      github.event.pull_request.merged == true &&
+      contains(join(github.event.pull_request.labels.*.name, ','), 'bounty:')
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    permissions:
+      contents: read
+      pull-requests: read
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: latest
+
+      - name: Award XP and notify Discord
+        run: bun run scripts/bounty-tracker.ts notify
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }}
+          GITHUB_REPOSITORY_NAME: ${{ github.event.repository.name }}
+          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_BOUNTY_WEBHOOK_URL }}
+          LURKR_API_KEY: ${{ secrets.LURKR_API_KEY }}
+          LURKR_GUILD_ID: ${{ secrets.LURKR_GUILD_ID }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
@@ -5,7 +5,7 @@ on:
    branches: [main]
  pull_request:
    branches: [main]
-
+    
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
@@ -24,6 +24,8 @@ jobs:

      - name: Install uv
        uses: astral-sh/setup-uv@v4
+        with:
+          enable-cache: true

      - name: Install dependencies
        run: uv sync --project core --group dev
@@ -54,10 +56,12 @@ jobs:

      - name: Install uv
        uses: astral-sh/setup-uv@v4
+        with:
+          enable-cache: true

      - name: Install dependencies and run tests
+        working-directory: core
        run: |
-          cd core
          uv sync
          uv run pytest tests/ -v

@@ -77,10 +81,12 @@ jobs:

      - name: Install uv
        uses: astral-sh/setup-uv@v4
+        with:
+          enable-cache: true

      - name: Install dependencies and run tests
+        working-directory: tools
        run: |
-          cd tools
          uv sync --extra dev
          uv run pytest tests/ -v

@@ -98,10 +104,12 @@ jobs:

      - name: Install uv
        uses: astral-sh/setup-uv@v4
-
+        with:
+          enable-cache: true
+            
      - name: Install dependencies
+        working-directory: core
        run: |
-          cd core
          uv sync

      - name: Validate exported agents
@@ -0,0 +1,126 @@
+# Auto-creates a PR to add a contributor to contributors.yml when a
+# `link-discord` issue is opened.
+# (Documented as a comment: `description` is not a valid top-level key in a
+# GitHub Actions workflow file.)
+name: Link Discord account
+
+on:
+  issues:
+    types: [opened]
+
+jobs:
+  link-discord:
+    # Only react to issues that were opened with the `link-discord` label.
+    if: contains(github.event.issue.labels.*.name, 'link-discord')
+    runs-on: ubuntu-latest
+    timeout-minutes: 2
+    # Write access is needed to push the branch, comment on/close the issue,
+    # and open the pull request.
+    permissions:
+      contents: write
+      issues: write
+      pull-requests: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Parse issue and update contributors.yml
+        uses: actions/github-script@v7
+        with:
+          script: |
+            // Reads the Discord ID (and optional display name) from the issue
+            // body, appends an entry to contributors.yml, and exports
+            // GITHUB_USERNAME / DISCORD_ID / ISSUE_NUMBER for the later steps.
+            // On an invalid ID or an already-listed user it comments, closes
+            // the issue, and returns without touching the file.
+            const fs = require('fs');
+
+            const issue = context.payload.issue;
+            const githubUsername = issue.user.login;
+
+            // The regexes below expect each form field as a "### <label>"
+            // heading followed by its value on the next non-blank line.
+            const body = issue.body || '';
+
+            // Extract Discord ID — look for the numeric value after the "Discord User ID" heading.
+            // The (?!\d) lookahead rejects over-long numbers instead of
+            // silently truncating them to their first 20 digits.
+            const discordMatch = body.match(/### Discord User ID\s*\n\s*(\d{17,20})(?!\d)/);
+            if (!discordMatch) {
+              await github.rest.issues.createComment({
+                ...context.repo,
+                issue_number: issue.number,
+                body: `Could not find a valid Discord ID in the issue body. Please make sure you entered a numeric ID (17-20 digits), not a username.\n\nExample: \`123456789012345678\``
+              });
+              await github.rest.issues.update({
+                ...context.repo,
+                issue_number: issue.number,
+                state: 'closed',
+                state_reason: 'not_planned'
+              });
+              return;
+            }
+            const discordId = discordMatch[1];
+
+            // Extract display name (optional)
+            const nameMatch = body.match(/### Display Name \(optional\)\s*\n\s*(.+)/);
+            const displayName = nameMatch ? nameMatch[1].trim() : '';
+
+            // Check if user already exists. Compare the complete `github:`
+            // value (case-insensitively) rather than doing a substring search,
+            // so that "bob" is not mistaken for an existing "bobby".
+            const yml = fs.readFileSync('contributors.yml', 'utf-8');
+            const wanted = githubUsername.toLowerCase();
+            const alreadyLinked = yml.split(/\r?\n/).some((line) => {
+              const m = line.match(/^\s*(?:-\s*)?github:\s*["']?([^"'\s#]+)/);
+              return m !== null && m[1].toLowerCase() === wanted;
+            });
+            if (alreadyLinked) {
+              await github.rest.issues.createComment({
+                ...context.repo,
+                issue_number: issue.number,
+                body: `@${githubUsername} is already in \`contributors.yml\`. If you need to update your Discord ID, please edit the file directly via PR.`
+              });
+              await github.rest.issues.update({
+                ...context.repo,
+                issue_number: issue.number,
+                state: 'closed',
+                state_reason: 'completed'
+              });
+              return;
+            }
+
+            // Append entry to contributors.yml. The display name is free-form
+            // user input, so it is written as a JSON string (which is also a
+            // valid YAML double-quoted scalar) to keep characters such as
+            // ":" or "#" from injecting keys or corrupting the file.
+            let entry = `  - github: ${githubUsername}\n    discord: "${discordId}"`;
+            if (displayName && displayName !== '_No response_') {
+              entry += `\n    name: ${JSON.stringify(displayName)}`;
+            }
+            entry += '\n';
+
+            const updated = yml.trimEnd() + '\n' + entry;
+            fs.writeFileSync('contributors.yml', updated);
+
+            // Set outputs for commit step
+            core.exportVariable('GITHUB_USERNAME', githubUsername);
+            core.exportVariable('DISCORD_ID', discordId);
+            core.exportVariable('ISSUE_NUMBER', issue.number.toString());
+
+      - name: Create PR
+        id: create-pr
+        run: |
+          # Check if there are changes. contributors.yml is untouched when the
+          # parse step returned early, in which case no PR is opened and the
+          # `created` output below is never set.
+          if git diff --quiet contributors.yml; then
+            echo "No changes to contributors.yml"
+            exit 0
+          fi
+
+          BRANCH="docs/link-discord-${GITHUB_USERNAME}"
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git checkout -b "$BRANCH"
+          git add contributors.yml
+          git commit -m "docs: link @${GITHUB_USERNAME} to Discord"
+          git push origin "$BRANCH"
+
+          gh pr create \
+            --title "docs: link @${GITHUB_USERNAME} to Discord" \
+            --body "Adds @${GITHUB_USERNAME} (Discord \`${DISCORD_ID}\`) to \`contributors.yml\` for bounty XP tracking.
+
+          Closes #${ISSUE_NUMBER}" \
+            --base main \
+            --head "$BRANCH" \
+            --label "link-discord"
+
+          # Tell the notify step that a PR really was opened.
+          echo "created=true" >> "$GITHUB_OUTPUT"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Notify on issue
+        # Only announce a PR when one was actually created. Without this guard
+        # the step would also run after an early exit, where ISSUE_NUMBER is
+        # unset and the comment call would be made with issue_number NaN.
+        if: steps.create-pr.outputs.created == 'true'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const username = process.env.GITHUB_USERNAME;
+            const issueNumber = parseInt(process.env.ISSUE_NUMBER);
+
+            await github.rest.issues.createComment({
+              ...context.repo,
+              issue_number: issueNumber,
+              body: `A PR has been created to link your account. A maintainer will merge it shortly — once merged, you'll receive XP and Discord pings when your bounty PRs are merged.`
+            });
@@ -0,0 +1,54 @@
+# Closes PRs that still have the `pr-requirements-warning` label
+# after contributors were warned in pr-requirements.yml.
+name: PR Requirements Enforcement
+on:
+  schedule:
+    - cron: "0 0 * * *"   # runs once a day at 00:00 UTC
+jobs:
+  enforce:
+    name: Close PRs still failing contribution requirements
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      issues: write
+    steps:
+      - name: Close PRs still failing requirements
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const { owner, repo } = context.repo;
+            const prs = await github.paginate(github.rest.pulls.list, {
+              owner,
+              repo,
+              state: "open",
+              per_page: 100
+            });
+            for (const pr of prs) {
+              // Skip draft PRs — author may still be actively working toward compliance
+              if (pr.draft) continue;
+              const labels = pr.labels.map(l => l.name);
+              if (!labels.includes("pr-requirements-warning")) continue;
+              const gracePeriod = 24 * 60 * 60 * 1000; // 24 hours in milliseconds
+              // Count from the last update (the warning comment bumps updated_at), not PR creation
+              const lastUpdated = new Date(pr.updated_at);
+              if (Date.now() - lastUpdated.getTime() < gracePeriod) {
+                console.log(`Skipping PR #${pr.number} — still within grace period`);
+                continue;
+              }
+              const prNumber = pr.number;
+              const prAuthor = pr.user.login;
+              await github.rest.issues.createComment({
+                owner,
+                repo,
+                issue_number: prNumber,
+                body: `Closing PR because the contribution requirements were not resolved within the 24-hour grace period.
+                If this was closed in error, feel free to reopen the PR after fixing the requirements.`
+              });
+              await github.rest.pulls.update({
+                owner,
+                repo,
+                pull_number: prNumber,
+                state: "closed"
+              });
+              console.log(`Closed PR #${prNumber} by ${prAuthor} (PR requirements were not met)`);
+            }
@@ -43,9 +43,10 @@ jobs:
            console.log(`  Found issue references: ${issueNumbers.length > 0 ? issueNumbers.join(', ') : 'none'}`);

            if (issueNumbers.length === 0) {
-              const message = `## PR Closed - Requirements Not Met
+              const message = `## PR Requirements Warning

-            This PR has been automatically closed because it doesn't meet the requirements.
+            This PR does not meet the contribution requirements.
+            If the issue is not fixed within ~24 hours, it may be automatically closed.

            **Missing:** No linked issue found.

@@ -67,14 +68,15 @@ jobs:

            **Why is this required?** See #472 for details.`;

-              const comments = await github.rest.issues.listComments({
+              const comments = await github.paginate(github.rest.issues.listComments, {
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
+                per_page: 100,
              });

-              const botComment = comments.data.find(
-                (c) => c.user.type === 'Bot' && c.body.includes('PR Closed - Requirements Not Met')
+              const botComment = comments.find(
+                (c) => c.user.type === 'Bot' && c.body.includes('PR Requirements Warning')
              );

              if (!botComment) {
@@ -86,11 +88,11 @@ jobs:
                });
              }

-              await github.rest.pulls.update({
+              await github.rest.issues.addLabels({
                owner: context.repo.owner,
                repo: context.repo.repo,
-                pull_number: prNumber,
-                state: 'closed',
+                issue_number: prNumber,
+                labels: ['pr-requirements-warning'],
              });

              core.setFailed('PR must reference an issue');
@@ -132,9 +134,10 @@ jobs:
                `#${i.number} (assignees: ${i.assignees.length > 0 ? i.assignees.join(', ') : 'none'})`
              ).join(', ');

-              const message = `## PR Closed - Requirements Not Met
+              const message = `## PR Requirements Warning

-            This PR has been automatically closed because it doesn't meet the requirements.
+            This PR does not meet the contribution requirements.
+            If the issue is not fixed within ~24 hours, it may be automatically closed.

            **PR Author:** @${prAuthor}
            **Found issues:** ${issueList}
@@ -157,14 +160,15 @@ jobs:

            **Why is this required?** See #472 for details.`;

-              const comments = await github.rest.issues.listComments({
+              const comments = await github.paginate(github.rest.issues.listComments, {
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
+                per_page: 100,
              });

-              const botComment = comments.data.find(
-                (c) => c.user.type === 'Bot' && c.body.includes('PR Closed - Requirements Not Met')
+              const botComment = comments.find(
+                (c) => c.user.type === 'Bot' && c.body.includes('PR Requirements Warning')
              );

              if (!botComment) {
@@ -176,14 +180,24 @@ jobs:
                });
              }

-              await github.rest.pulls.update({
+              await github.rest.issues.addLabels({
                owner: context.repo.owner,
                repo: context.repo.repo,
-                pull_number: prNumber,
-                state: 'closed',
+                issue_number: prNumber,
+                labels: ['pr-requirements-warning'],
              });

              core.setFailed('PR author must be assigned to the linked issue');
            } else {
              console.log(`PR requirements met! Issue #${issueWithAuthorAssigned} has ${prAuthor} as assignee.`);
-            }
+              try {
+                await github.rest.issues.removeLabel({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: prNumber,
+                  name: "pr-requirements-warning"
+                });
+              }catch (error){
+                //ignore if label doesn't exist
+              }
+            }
@@ -0,0 +1,40 @@
+# Posts the integration bounty leaderboard to Discord every Monday.
+name: Weekly bounty leaderboard
+
+on:
+  schedule:
+    # Every Monday at 9:00 UTC
+    - cron: "0 9 * * 1"
+  workflow_dispatch:
+    inputs:
+      since_date:
+        description: "Only count PRs merged after this date (YYYY-MM-DD). Leave empty for all-time."
+        required: false
+
+jobs:
+  leaderboard:
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    permissions:
+      contents: read
+      pull-requests: read
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: latest
+
+      - name: Post leaderboard to Discord
+        run: bun run scripts/bounty-tracker.ts leaderboard
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }}
+          GITHUB_REPOSITORY_NAME: ${{ github.event.repository.name }}
+          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_BOUNTY_WEBHOOK_URL }}
+          LURKR_API_KEY: ${{ secrets.LURKR_API_KEY }}
+          LURKR_GUILD_ID: ${{ secrets.LURKR_GUILD_ID }}
+          SINCE_DATE: ${{ github.event.inputs.since_date || '' }}
@@ -67,8 +67,6 @@ temp/

 exports/*

-.agent-builder-sessions/*
-
 .claude/settings.local.json
 .claude/skills/ship-it/

@@ -1,9 +1,3 @@
 {
-  "mcpServers": {
-    "agent-builder": {
-      "command": "uv",
-      "args": ["run", "-m", "framework.mcp.agent_builder_server"],
-      "cwd": "core"
-    }
-  }
+  "mcpServers": {}
 }
@@ -1,30 +0,0 @@
-{
-  "mcpServers": {
-    "agent-builder": {
-      "command": "uv",
-      "args": [
-        "run",
-        "python",
-        "-m",
-        "framework.mcp.agent_builder_server"
-      ],
-      "cwd": "core",
-      "env": {
-        "PYTHONPATH": "../tools/src"
-      }
-    },
-    "tools": {
-      "command": "uv",
-      "args": [
-        "run",
-        "python",
-        "mcp_server.py",
-        "--stdio"
-      ],
-      "cwd": "tools",
-      "env": {
-        "PYTHONPATH": "src"
-      }
-    }
-  }
-}
@@ -1 +0,0 @@
-../../.claude/skills/hive
@@ -1 +0,0 @@
-../../.claude/skills/hive-concepts
@@ -1 +0,0 @@
-../../.claude/skills/hive-create
@@ -1 +0,0 @@
-../../.claude/skills/hive-credentials
@@ -1 +0,0 @@
-../../.claude/skills/hive-debugger
@@ -1 +0,0 @@
-../../.claude/skills/hive-patterns
@@ -1 +0,0 @@
-../../.claude/skills/hive-test
@@ -1 +0,0 @@
-../../.claude/skills/triage-issue
@@ -1,7 +0,0 @@
-{
-  "recommendations": [
-    "charliermarsh.ruff",
-    "editorconfig.editorconfig",
-    "ms-python.python"
-  ]
-}
@@ -2,10 +2,6 @@

 Shared agent instructions for this workspace.

-## Deprecations
-
- **TUI is deprecated.** The terminal UI (`hive tui`) is no longer maintained. Use the browser-based interface (`hive open`) instead.
-
 ## Coding Agent Notes

 - 
@@ -65,6 +65,52 @@ You may submit PRs without prior assignment for:

 > **Tip:** Installing Claude Code skills is optional for running existing agents, but required if you plan to **build new agents**.

+## Troubleshooting Setup Issues
+
+If you encounter issues while setting up the development environment, the following steps may help:
+
+### `make: command not found`
+Install `make` using:
+
+```bash
+sudo apt install make
+```
+
+### `uv: command not found`
+
+Install `uv` using:
+
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+source ~/.bashrc
+```
+
+### `ruff: not found`
+
+If linting fails due to a missing `ruff` command, install it with:
+
+```bash
+uv tool install ruff
+```
+
+### WSL Path Recommendation
+
+When using WSL, clone the repository inside your Linux home directory (e.g., `~/hive`) instead of under `/mnt/c/...` to avoid performance and permission issues.
+
+
+---
+
+# ✅ Why This Is Good
+
+- Clear
+- Professional tone
+- No unnecessary explanation
+- Under micro-fix size
+- Based on real contributor experience
+- Won’t annoy maintainers
+
+---
+
+Now:
+
+```bash
+git checkout -b docs/setup-troubleshooting
+```
+
 ## Commit Convention

 We follow [Conventional Commits](https://www.conventionalcommits.org/):
@@ -0,0 +1,27 @@
+# Identity mapping: GitHub username -> Discord ID
+#
+# This file links GitHub accounts to Discord accounts for the
+# Integration Bounty Program. When a bounty PR is merged, the
+# GitHub Action uses this file to ping the contributor on Discord.
+#
+# HOW TO ADD YOURSELF:
+# Open a "Link Discord Account" issue:
+# https://github.com/aden-hive/hive/issues/new?template=link-discord.yml
+# A GitHub Action will automatically add your entry here.
+#
+# To find your Discord ID:
+# 1. Open Discord Settings > Advanced > Enable Developer Mode
+# 2. Right-click your name > Copy User ID
+#
+# Format:
+#   - github: your-github-username
+#     discord: "your-discord-id"  # quotes required (it's a number)
+#     name: Your Display Name      # optional
+
+contributors:
+  # - github: example-user
+  #   discord: "123456789012345678"
+  #   name: Example User
+  - github: TimothyZhang7
+    discord: "408460790061072384"
+    name: Timothy@Aden
@@ -1,5 +1,4 @@
 exports/
 docs/
-.agent-builder-sessions/
 .pytest_cache/
 **/__pycache__/
@@ -1,10 +1,5 @@
 {
  "mcpServers": {
-    "agent-builder": {
-      "command": "python",
-      "args": ["-m", "framework.mcp.agent_builder_server"],
-      "cwd": "core"
-    },
    "tools": {
      "command": "python",
      "args": ["-m", "aden_tools.mcp_server", "--stdio"],
@@ -1,17 +1,16 @@
-# MCP Server Guide - Agent Builder
+# MCP Server Guide - Agent Building Tools

-This guide covers the MCP (Model Context Protocol) server for building goal-driven agents.
+> **Note:** The standalone `agent-builder` MCP server (`framework.mcp.agent_builder_server`) has been replaced. Agent building is now done via the `coder-tools` server's `initialize_and_build_agent` tool, with underlying logic in `tools/coder_tools_server.py`.
+
+This guide covers the MCP tools available for building goal-driven agents.

 ## Setup

 ### Quick Setup

 ```bash
-# Using the setup script (recommended)
-python setup_mcp.py
-
-# Or using bash
-./setup_mcp.sh
+# Run the quickstart script (recommended)
+./quickstart.sh
 ```

 ### Manual Configuration
@@ -21,10 +20,10 @@ Add to your MCP client configuration (e.g., Claude Desktop):
 ```json
 {
  "mcpServers": {
-    "agent-builder": {
-      "command": "python",
-      "args": ["-m", "framework.mcp.agent_builder_server"],
-      "cwd": "/path/to/goal-agent"
+    "coder-tools": {
+      "command": "uv",
+      "args": ["run", "coder_tools_server.py", "--stdio"],
+      "cwd": "/path/to/hive/tools"
    }
  }
 }
@@ -17,66 +17,11 @@ Framework provides a runtime framework that captures **decisions**, not just act
 uv pip install -e .
 ```

-## MCP Server Setup
+## Agent Building

-The framework includes an MCP (Model Context Protocol) server for building agents. To set up the MCP server:
+Agent scaffolding is handled by the `coder-tools` MCP server (in `tools/coder_tools_server.py`), which provides the `initialize_and_build_agent` tool and related utilities. The package generation logic lives directly in `tools/coder_tools_server.py`.

-### Automated Setup
-
-**Using bash (Linux/macOS):**
-```bash
-./setup_mcp.sh
-```
-
-**Using Python (cross-platform):**
-```bash
-python setup_mcp.py
-```
-
-The setup script will:
-1. Install the framework package
-2. Install MCP dependencies (mcp, fastmcp)
-3. Create/verify `.mcp.json` configuration
-4. Test the MCP server module
-
-### Manual Setup
-
-If you prefer manual setup:
-
-```bash
-# Install framework
-uv pip install -e .
-
-# Install MCP dependencies
-uv pip install mcp fastmcp
-
-# Test the server
-uv run python -m framework.mcp.agent_builder_server
-```
-
-### Using with MCP Clients
-
-To use the agent builder with Claude Desktop or other MCP clients, add this to your MCP client configuration:
-
-```json
-{
-  "mcpServers": {
-    "agent-builder": {
-      "command": "python",
-      "args": ["-m", "framework.mcp.agent_builder_server"],
-      "cwd": "/path/to/hive/core"
-    }
-  }
-}
-```
-
-The MCP server provides tools for:
- Creating agent building sessions
- Defining goals with success criteria
- Adding nodes (event_loop only)
- Connecting nodes with edges
- Validating and exporting agent graphs
- Testing nodes and full agent graphs
+See the [Getting Started Guide](../docs/getting-started.md) for building agents.

 ## Quick Start

@@ -145,7 +90,7 @@ uv run python -m framework test-debug <agent_path> <test_name>
 uv run python -m framework test-list <agent_path>
 ```

-For detailed testing workflows, see the [hive-test skill](../.claude/skills/hive-test/SKILL.md).
+For detailed testing workflows, see [developer-guide.md](../docs/developer-guide.md).

 ### Analyzing Agent Behavior with Builder

@@ -95,81 +95,6 @@ async def example_3_config_file():
    (test_agent_path / "mcp_servers.json").unlink()


-async def example_4_custom_agent_with_mcp_tools():
-    """Example 4: Build custom agent that uses MCP tools"""
-    print("\n=== Example 4: Custom Agent with MCP Tools ===\n")
-
-    from framework.builder.workflow import GraphBuilder
-
-    # Create a workflow builder
-    builder = GraphBuilder()
-
-    # Define goal
-    builder.set_goal(
-        goal_id="web-researcher",
-        name="Web Research Agent",
-        description="Search the web and summarize findings",
-    )
-
-    # Add success criteria
-    builder.add_success_criterion(
-        "search-results", "Successfully retrieve at least 3 web search results"
-    )
-    builder.add_success_criterion("summary", "Provide a clear, concise summary of the findings")
-
-    # Add nodes that will use MCP tools
-    builder.add_node(
-        node_id="web-searcher",
-        name="Web Search",
-        description="Search the web for information",
-        node_type="event_loop",
-        system_prompt="Search for {query} and return the top results. Use the web_search tool.",
-        tools=["web_search"],  # This tool comes from tools MCP server
-        input_keys=["query"],
-        output_keys=["search_results"],
-    )
-
-    builder.add_node(
-        node_id="summarizer",
-        name="Summarize Results",
-        description="Summarize the search results",
-        node_type="event_loop",
-        system_prompt="Summarize the following search results in 2-3 sentences: {search_results}",
-        input_keys=["search_results"],
-        output_keys=["summary"],
-    )
-
-    # Connect nodes
-    builder.add_edge("web-searcher", "summarizer")
-
-    # Set entry point
-    builder.set_entry("web-searcher")
-    builder.set_terminal("summarizer")
-
-    # Export the agent
-    export_path = Path("exports/web-research-agent")
-    export_path.mkdir(parents=True, exist_ok=True)
-    builder.export(export_path)
-
-    # Load and register MCP server
-    runner = AgentRunner.load(export_path)
-    runner.register_mcp_server(
-        name="tools",
-        transport="stdio",
-        command="python",
-        args=["-m", "aden_tools.mcp_server", "--stdio"],
-        cwd="../tools",
-    )
-
-    # Run the agent
-    result = await runner.run({"query": "latest AI breakthroughs 2026"})
-
-    print(f"\nAgent completed with result:\n{result}")
-
-    # Cleanup
-    runner.cleanup()
-
-
 async def main():
    """Run all examples"""
    print("=" * 60)
@@ -22,7 +22,6 @@ The framework includes a Goal-Based Testing system (Goal → Agent → Eval):
 See `framework.testing` for details.
 """

-from framework.builder.query import BuilderQuery
 from framework.llm import AnthropicProvider, LLMProvider
 from framework.runner import AgentOrchestrator, AgentRunner
 from framework.runtime.core import Runtime
@@ -51,8 +50,6 @@ __all__ = [
    "Problem",
    # Runtime
    "Runtime",
-    # Builder
-    "BuilderQuery",
    # LLM
    "LLMProvider",
    "AnthropicProvider",
@@ -51,42 +51,6 @@ def cli():
    pass


-@cli.command()
-@click.option("--verbose", "-v", is_flag=True)
-@click.option("--debug", is_flag=True)
-def tui(verbose, debug):
-    """Launch TUI to test a credential interactively."""
-    setup_logging(verbose=verbose, debug=debug)
-
-    try:
-        from framework.tui.app import AdenTUI
-    except ImportError:
-        click.echo("TUI requires 'textual'. Install with: pip install textual")
-        sys.exit(1)
-
-    agent = CredentialTesterAgent()
-    account = pick_account(agent)
-    if account is None:
-        sys.exit(1)
-
-    agent.select_account(account)
-    provider = account.get("provider", "?")
-    alias = account.get("alias", "?")
-    click.echo(f"\nTesting {provider}/{alias}...\n")
-
-    async def run_tui():
-        agent._setup()
-        runtime = agent._agent_runtime
-        await runtime.start()
-        try:
-            app = AdenTUI(runtime)
-            await app.run_async()
-        finally:
-            await runtime.stop()
-
-    asyncio.run(run_tui())
-
-
@cli.command()
@click.option("--verbose", "-v", is_flag=True)
@click.option("--debug", is_flag=True)
@@ -406,7 +406,8 @@ nodes = [
        client_facing=True,
        max_node_visits=0,
        input_keys=[],
-        output_keys=[],
+        output_keys=["test_result"],
+        nullable_output_keys=["test_result"],
        tools=["get_account_info"],
        system_prompt="""\
 You are a credential tester. Your job is to help the user verify that their \
@@ -444,7 +445,7 @@ edges = []
 entry_node = "tester"
 entry_points = {"start": "tester"}
 pause_nodes = []
-terminal_nodes = []  # Forever-alive: loops until user exits
+terminal_nodes = ["tester"]  # Tester node can terminate

 conversation_mode = "continuous"
 identity_prompt = (
@@ -531,7 +532,7 @@ class CredentialTesterAgent:
            version="1.0.0",
            entry_node="tester",
            entry_points={"start": "tester"},
-            terminal_nodes=[],
+            terminal_nodes=["tester"],  # Tester node can terminate
            pause_nodes=[],
            nodes=[tester_node],
            edges=[],
@@ -51,7 +51,8 @@ The key is pre-injected into the session environment and tools read it automatic
        client_facing=True,
        max_node_visits=0,
        input_keys=[],
-        output_keys=[],
+        output_keys=["test_result"],
+        nullable_output_keys=["test_result"],
        tools=tools,
        system_prompt=f"""\
 You are a credential tester for the {account_label}: {provider}/{alias}{detail}
@@ -0,0 +1,151 @@
+"""Agent discovery — scan known directories and return categorised AgentEntry lists."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class AgentEntry:
+    """Lightweight agent metadata for the picker / API discover endpoint.
+
+    One entry is built per valid agent directory by ``discover_agents``.
+    """
+
+    # Directory of the agent on disk.
+    path: Path
+    # Display name taken from the agent's Python metadata or agent.json.
+    name: str
+    # Human-readable description from the same sources as ``name``.
+    description: str
+    # Source group label, e.g. "Your Agents", "Framework" or "Examples".
+    category: str
+    # Number of ``session_*`` directories found for this agent under ~/.hive.
+    session_count: int = 0
+    # Number of graph nodes reported by ``_extract_agent_stats``.
+    node_count: int = 0
+    # Number of distinct tool names reported by ``_extract_agent_stats``.
+    tool_count: int = 0
+    # Tags read from agent.json; empty when none are available.
+    tags: list[str] = field(default_factory=list)
+    # Most recent session ``updated_at`` value, or None if no session has one.
+    last_active: str | None = None
+
+
+def _get_last_active(agent_name: str) -> str | None:
+    """Return the most recent updated_at timestamp across all sessions."""
+    sessions_dir = Path.home() / ".hive" / "agents" / agent_name / "sessions"
+    if not sessions_dir.exists():
+        return None
+    latest: str | None = None
+    for session_dir in sessions_dir.iterdir():
+        if not session_dir.is_dir() or not session_dir.name.startswith("session_"):
+            continue
+        state_file = session_dir / "state.json"
+        if not state_file.exists():
+            continue
+        try:
+            data = json.loads(state_file.read_text(encoding="utf-8"))
+            ts = data.get("timestamps", {}).get("updated_at")
+            if ts and (latest is None or ts > latest):
+                latest = ts
+        except Exception:
+            continue
+    return latest
+
+
+def _count_sessions(agent_name: str) -> int:
+    """Count session directories under ~/.hive/agents/{agent_name}/sessions/."""
+    sessions_dir = Path.home() / ".hive" / "agents" / agent_name / "sessions"
+    if not sessions_dir.exists():
+        return 0
+    return sum(1 for d in sessions_dir.iterdir() if d.is_dir() and d.name.startswith("session_"))
+
+
+def _extract_agent_stats(agent_path: Path) -> tuple[int, int, list[str]]:
+    """Extract node count, tool count, and tags from an agent directory.
+
+    Prefers agent.py (AST-parsed) over agent.json for node/tool counts
+    since agent.json may be stale.  Tags are only available from agent.json.
+    """
+    import ast
+
+    node_count, tool_count, tags = 0, 0, []
+
+    agent_py = agent_path / "agent.py"
+    if agent_py.exists():
+        try:
+            tree = ast.parse(agent_py.read_text(encoding="utf-8"))
+            for node in ast.walk(tree):
+                if isinstance(node, ast.Assign):
+                    for target in node.targets:
+                        if isinstance(target, ast.Name) and target.id == "nodes":
+                            if isinstance(node.value, ast.List):
+                                node_count = len(node.value.elts)
+        except Exception:
+            pass
+
+    agent_json = agent_path / "agent.json"
+    if agent_json.exists():
+        try:
+            data = json.loads(agent_json.read_text(encoding="utf-8"))
+            json_nodes = data.get("nodes", [])
+            if node_count == 0:
+                node_count = len(json_nodes)
+            tools: set[str] = set()
+            for n in json_nodes:
+                tools.update(n.get("tools", []))
+            tool_count = len(tools)
+            tags = data.get("agent", {}).get("tags", [])
+        except Exception:
+            pass
+
+    return node_count, tool_count, tags
+
+
+def discover_agents() -> dict[str, list[AgentEntry]]:
+    """Discover agents from all known sources grouped by category."""
+    from framework.runner.cli import (
+        _extract_python_agent_metadata,
+        _get_framework_agents_dir,
+        _is_valid_agent_dir,
+    )
+
+    groups: dict[str, list[AgentEntry]] = {}
+    sources = [
+        ("Your Agents", Path("exports")),
+        ("Framework", _get_framework_agents_dir()),
+        ("Examples", Path("examples/templates")),
+    ]
+
+    for category, base_dir in sources:
+        if not base_dir.exists():
+            continue
+        entries: list[AgentEntry] = []
+        for path in sorted(base_dir.iterdir(), key=lambda p: p.name):
+            if not _is_valid_agent_dir(path):
+                continue
+
+            name, desc = _extract_python_agent_metadata(path)
+            config_fallback_name = path.name.replace("_", " ").title()
+            used_config = name != config_fallback_name
+
+            node_count, tool_count, tags = _extract_agent_stats(path)
+            if not used_config:
+                agent_json = path / "agent.json"
+                if agent_json.exists():
+                    try:
+                        data = json.loads(agent_json.read_text(encoding="utf-8"))
+                        meta = data.get("agent", {})
+                        name = meta.get("name", name)
+                        desc = meta.get("description", desc)
+                    except Exception:
+                        pass
+
+            entries.append(
+                AgentEntry(
+                    path=path,
+                    name=name,
+                    description=desc,
+                    category=category,
+                    session_count=_count_sessions(path.name),
+                    node_count=node_count,
+                    tool_count=tool_count,
+                    tags=tags,
+                    last_active=_get_last_active(path.name),
+                )
+            )
+        if entries:
+            groups[category] = entries
+
+    return groups
@@ -1,44 +0,0 @@
-"""
-Hive Coder — Native coding agent that builds Hive agent packages.
-
-Deeply understands the agent framework and produces complete Python packages
-with goals, nodes, edges, system prompts, MCP configuration, and tests
-from natural language specifications.
-"""
-
-from .agent import (
-    HiveCoderAgent,
-    conversation_mode,
-    default_agent,
-    edges,
-    entry_node,
-    entry_points,
-    goal,
-    identity_prompt,
-    loop_config,
-    nodes,
-    pause_nodes,
-    terminal_nodes,
-)
-from .config import AgentMetadata, RuntimeConfig, default_config, metadata
-
-__version__ = "1.0.0"
-
-__all__ = [
-    "HiveCoderAgent",
-    "default_agent",
-    "goal",
-    "nodes",
-    "edges",
-    "entry_node",
-    "entry_points",
-    "pause_nodes",
-    "terminal_nodes",
-    "conversation_mode",
-    "identity_prompt",
-    "loop_config",
-    "RuntimeConfig",
-    "AgentMetadata",
-    "default_config",
-    "metadata",
-]
@@ -1,223 +0,0 @@
-"""CLI entry point for Hive Coder agent."""
-
-import asyncio
-import json
-import logging
-import sys
-
-import click
-
-from .agent import HiveCoderAgent, default_agent
-
-
-def setup_logging(verbose=False, debug=False):
-    """Configure logging for execution visibility."""
-    if debug:
-        level, fmt = logging.DEBUG, "%(asctime)s %(name)s: %(message)s"
-    elif verbose:
-        level, fmt = logging.INFO, "%(message)s"
-    else:
-        level, fmt = logging.WARNING, "%(levelname)s: %(message)s"
-    logging.basicConfig(level=level, format=fmt, stream=sys.stderr)
-    logging.getLogger("framework").setLevel(level)
-
-
-@click.group()
-@click.version_option(version="1.0.0")
-def cli():
-    """Hive Coder — Build Hive agent packages from natural language."""
-    pass
-
-
-@cli.command()
-@click.option("--request", "-r", type=str, required=True, help="What agent to build")
-@click.option("--mock", is_flag=True, help="Run in mock mode")
-@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
-@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
-@click.option("--debug", is_flag=True, help="Show debug logging")
-def run(request, mock, quiet, verbose, debug):
-    """Execute agent building from a request."""
-    if not quiet:
-        setup_logging(verbose=verbose, debug=debug)
-
-    context = {"user_request": request}
-
-    result = asyncio.run(default_agent.run(context, mock_mode=mock))
-
-    output_data = {
-        "success": result.success,
-        "steps_executed": result.steps_executed,
-        "output": result.output,
-    }
-    if result.error:
-        output_data["error"] = result.error
-
-    click.echo(json.dumps(output_data, indent=2, default=str))
-    sys.exit(0 if result.success else 1)
-
-
-@cli.command()
-@click.option("--mock", is_flag=True, help="Run in mock mode")
-@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
-@click.option("--debug", is_flag=True, help="Show debug logging")
-def tui(mock, verbose, debug):
-    """Launch the TUI dashboard for interactive agent building."""
-    setup_logging(verbose=verbose, debug=debug)
-
-    try:
-        from framework.tui.app import AdenTUI
-    except ImportError:
-        click.echo("TUI requires the 'textual' package. Install with: pip install textual")
-        sys.exit(1)
-
-    from pathlib import Path
-
-    from framework.llm import LiteLLMProvider
-    from framework.runner.tool_registry import ToolRegistry
-    from framework.runtime.agent_runtime import create_agent_runtime
-    from framework.runtime.execution_stream import EntryPointSpec
-
-    async def run_with_tui():
-        agent = HiveCoderAgent()
-
-        agent._tool_registry = ToolRegistry()
-
-        storage_path = Path.home() / ".hive" / "agents" / "hive_coder"
-        storage_path.mkdir(parents=True, exist_ok=True)
-
-        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
-        if mcp_config_path.exists():
-            agent._tool_registry.load_mcp_config(mcp_config_path)
-
-        llm = None
-        if not mock:
-            llm = LiteLLMProvider(
-                model=agent.config.model,
-                api_key=agent.config.api_key,
-                api_base=agent.config.api_base,
-            )
-
-        tools = list(agent._tool_registry.get_tools().values())
-        tool_executor = agent._tool_registry.get_executor()
-        graph = agent._build_graph()
-
-        runtime = create_agent_runtime(
-            graph=graph,
-            goal=agent.goal,
-            storage_path=storage_path,
-            entry_points=[
-                EntryPointSpec(
-                    id="start",
-                    name="Build Agent",
-                    entry_node="coder",
-                    trigger_type="manual",
-                    isolation_level="isolated",
-                ),
-            ],
-            llm=llm,
-            tools=tools,
-            tool_executor=tool_executor,
-        )
-
-        await runtime.start()
-
-        try:
-            app = AdenTUI(runtime)
-            await app.run_async()
-        finally:
-            await runtime.stop()
-
-    asyncio.run(run_with_tui())
-
-
-@cli.command()
-@click.option("--json", "output_json", is_flag=True)
-def info(output_json):
-    """Show agent information."""
-    info_data = default_agent.info()
-    if output_json:
-        click.echo(json.dumps(info_data, indent=2))
-    else:
-        click.echo(f"Agent: {info_data['name']}")
-        click.echo(f"Version: {info_data['version']}")
-        click.echo(f"Description: {info_data['description']}")
-        click.echo(f"\nNodes: {', '.join(info_data['nodes'])}")
-        click.echo(f"Client-facing: {', '.join(info_data['client_facing_nodes'])}")
-        click.echo(f"Entry: {info_data['entry_node']}")
-        click.echo(f"Terminal: {', '.join(info_data['terminal_nodes']) or '(forever-alive)'}")
-
-
-@cli.command()
-def validate():
-    """Validate agent structure."""
-    validation = default_agent.validate()
-    if validation["valid"]:
-        click.echo("Agent is valid")
-        if validation["warnings"]:
-            for warning in validation["warnings"]:
-                click.echo(f"  WARNING: {warning}")
-    else:
-        click.echo("Agent has errors:")
-        for error in validation["errors"]:
-            click.echo(f"  ERROR: {error}")
-    sys.exit(0 if validation["valid"] else 1)
-
-
-@cli.command()
-@click.option("--verbose", "-v", is_flag=True)
-def shell(verbose):
-    """Interactive agent building session (CLI, no TUI)."""
-    asyncio.run(_interactive_shell(verbose))
-
-
-async def _interactive_shell(verbose=False):
-    """Async interactive shell."""
-    setup_logging(verbose=verbose)
-
-    click.echo("=== Hive Coder ===")
-    click.echo("Describe the agent you want to build (or 'quit' to exit):\n")
-
-    agent = HiveCoderAgent()
-    await agent.start()
-
-    try:
-        while True:
-            try:
-                request = await asyncio.get_event_loop().run_in_executor(None, input, "Build> ")
-                if request.lower() in ["quit", "exit", "q"]:
-                    click.echo("Goodbye!")
-                    break
-
-                if not request.strip():
-                    continue
-
-                click.echo("\nBuilding agent...\n")
-
-                result = await agent.trigger_and_wait("default", {"user_request": request})
-
-                if result is None:
-                    click.echo("\n[Execution timed out]\n")
-                    continue
-
-                if result.success:
-                    output = result.output or {}
-                    agent_name = output.get("agent_name", "unknown")
-                    validation = output.get("validation_result", "unknown")
-                    click.echo(f"\nAgent '{agent_name}' built. Validation: {validation}\n")
-                else:
-                    click.echo(f"\nBuild failed: {result.error}\n")
-
-            except KeyboardInterrupt:
-                click.echo("\nGoodbye!")
-                break
-            except Exception as e:
-                click.echo(f"Error: {e}", err=True)
-                import traceback
-
-                traceback.print_exc()
-    finally:
-        await agent.stop()
-
-
-if __name__ == "__main__":
-    cli()
@@ -1,357 +0,0 @@
-"""Agent graph construction for Hive Coder."""
-
-from pathlib import Path
-
-from framework.graph import Constraint, Goal, SuccessCriterion
-from framework.graph.checkpoint_config import CheckpointConfig
-from framework.graph.edge import GraphSpec
-from framework.graph.executor import ExecutionResult
-from framework.llm import LiteLLMProvider
-from framework.runner.tool_registry import ToolRegistry
-from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
-from framework.runtime.execution_stream import EntryPointSpec
-
-from .config import default_config, metadata
-from .nodes import coder_node, queen_node
-
-# ticket_receiver is no longer needed — the queen runs as an independent
-# GraphExecutor and receives escalation tickets via inject_event().
-# Keeping the import commented for reference:
-# from .ticket_receiver import TICKET_RECEIVER_ENTRY_POINT
-
-# Goal definition
-goal = Goal(
-    id="agent-builder",
-    name="Hive Agent Builder",
-    description=(
-        "Build complete, validated Hive agent packages from natural language "
-        "specifications. Produces production-ready Python packages with goals, "
-        "nodes, edges, system prompts, MCP configuration, and tests."
-    ),
-    success_criteria=[
-        SuccessCriterion(
-            id="valid-package",
-            description="Generated agent package passes structural validation",
-            metric="validation_pass",
-            target="true",
-            weight=0.30,
-        ),
-        SuccessCriterion(
-            id="complete-files",
-            description=(
-                "All required files generated: agent.py, config.py, "
-                "nodes/__init__.py, __init__.py, __main__.py, mcp_servers.json"
-            ),
-            metric="file_count",
-            target=">=6",
-            weight=0.25,
-        ),
-        SuccessCriterion(
-            id="user-satisfaction",
-            description="User reviews and approves the generated agent",
-            metric="user_approval",
-            target="true",
-            weight=0.25,
-        ),
-        SuccessCriterion(
-            id="framework-compliance",
-            description=(
-                "Generated code follows framework patterns: STEP 1/STEP 2 "
-                "for client-facing, correct imports, entry_points format"
-            ),
-            metric="pattern_compliance",
-            target="100%",
-            weight=0.20,
-        ),
-    ],
-    constraints=[
-        Constraint(
-            id="dynamic-tool-discovery",
-            description=(
-                "Always discover available tools dynamically via "
-                "list_agent_tools before referencing tools in agent designs"
-            ),
-            constraint_type="hard",
-            category="correctness",
-        ),
-        Constraint(
-            id="no-fabricated-tools",
-            description="Only reference tools that exist in hive-tools MCP",
-            constraint_type="hard",
-            category="correctness",
-        ),
-        Constraint(
-            id="valid-python",
-            description="All generated Python files must be syntactically correct",
-            constraint_type="hard",
-            category="correctness",
-        ),
-        Constraint(
-            id="self-verification",
-            description="Run validation after writing code; fix errors before presenting",
-            constraint_type="hard",
-            category="quality",
-        ),
-    ],
-)
-
-# Nodes: primary coder node only.  The queen runs as an independent
-# GraphExecutor with queen_node — not as part of this graph.
-nodes = [coder_node]
-
-# No edges needed — single forever-alive event_loop node
-edges = []
-
-# Graph configuration
-entry_node = "coder"
-entry_points = {"start": "coder"}
-pause_nodes = []
-terminal_nodes = []  # Forever-alive: loops until user exits
-
-# No async entry points needed — the queen is now an independent executor,
-# not a secondary graph receiving events via add_graph().
-async_entry_points = []
-
-# Module-level variables read by AgentRunner.load()
-conversation_mode = "continuous"
-identity_prompt = (
-    "You are Hive Coder, the best agent-building coding agent on the planet. "
-    "You deeply understand the Hive agent framework at the source code level "
-    "and produce production-ready agent packages from natural language. "
-    "You can dynamically discover available framework tools, inspect runtime "
-    "sessions and checkpoints from agents you build, and run their test suites. "
-    "You follow coding agent discipline: read before writing, verify "
-    "assumptions by reading actual code, adhere to project conventions, "
-    "self-verify with validation, and fix your own errors. You are concise, "
-    "direct, and technically rigorous. No emojis. No fluff."
-)
-loop_config = {
-    "max_iterations": 100,
-    "max_tool_calls_per_turn": 30,
-    "max_history_tokens": 32000,
-}
-
-
-# ---------------------------------------------------------------------------
-# Queen graph — runs as an independent persistent conversation in the TUI.
-# Loaded by _load_judge_and_queen() in app.py, NOT by AgentRunner.
-# ---------------------------------------------------------------------------
-
-queen_goal = Goal(
-    id="queen-manager",
-    name="Queen Manager",
-    description=(
-        "Manage the worker agent lifecycle and serve as the user's primary "
-        "interactive interface. Triage health escalations from the judge."
-    ),
-    success_criteria=[],
-    constraints=[],
-)
-
-queen_graph = GraphSpec(
-    id="queen-graph",
-    goal_id=queen_goal.id,
-    version="1.0.0",
-    entry_node="queen",
-    entry_points={"start": "queen"},
-    terminal_nodes=[],
-    pause_nodes=[],
-    nodes=[queen_node],
-    edges=[],
-    conversation_mode="continuous",
-    loop_config={
-        "max_iterations": 999_999,
-        "max_tool_calls_per_turn": 30,
-        "max_history_tokens": 32000,
-    },
-)
-
-
-class HiveCoderAgent:
-    """
-    Hive Coder — builds Hive agent packages from natural language.
-
-    Single-node architecture: the coder runs in a continuous while(true) loop.
-    The queen runs as an independent GraphExecutor (loaded by the TUI via
-    _load_judge_and_queen), not as part of this graph.
-    """
-
-    def __init__(self, config=None):
-        self.config = config or default_config
-        self.goal = goal
-        self.nodes = nodes
-        self.edges = edges
-        self.entry_node = entry_node
-        self.entry_points = entry_points
-        self.pause_nodes = pause_nodes
-        self.terminal_nodes = terminal_nodes
-        self.async_entry_points = async_entry_points
-        self._graph: GraphSpec | None = None
-        self._agent_runtime: AgentRuntime | None = None
-        self._tool_registry: ToolRegistry | None = None
-        self._storage_path: Path | None = None
-
-    def _build_graph(self) -> GraphSpec:
-        """Build the GraphSpec."""
-        return GraphSpec(
-            id="hive-coder-graph",
-            goal_id=self.goal.id,
-            version="1.0.0",
-            entry_node=self.entry_node,
-            entry_points=self.entry_points,
-            terminal_nodes=self.terminal_nodes,
-            pause_nodes=self.pause_nodes,
-            nodes=self.nodes,
-            edges=self.edges,
-            default_model=self.config.model,
-            max_tokens=self.config.max_tokens,
-            loop_config=loop_config,
-            conversation_mode=conversation_mode,
-            identity_prompt=identity_prompt,
-            async_entry_points=self.async_entry_points,
-        )
-
-    def _setup(self, mock_mode=False) -> None:
-        """Set up the agent runtime."""
-        self._storage_path = Path.home() / ".hive" / "agents" / "hive_coder"
-        self._storage_path.mkdir(parents=True, exist_ok=True)
-
-        self._tool_registry = ToolRegistry()
-
-        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
-        if mcp_config_path.exists():
-            self._tool_registry.load_mcp_config(mcp_config_path)
-
-        llm = None
-        if not mock_mode:
-            llm = LiteLLMProvider(
-                model=self.config.model,
-                api_key=self.config.api_key,
-                api_base=self.config.api_base,
-            )
-
-        tool_executor = self._tool_registry.get_executor()
-        tools = list(self._tool_registry.get_tools().values())
-
-        self._graph = self._build_graph()
-
-        checkpoint_config = CheckpointConfig(
-            enabled=True,
-            checkpoint_on_node_start=False,
-            checkpoint_on_node_complete=True,
-            checkpoint_max_age_days=7,
-            async_checkpoint=True,
-        )
-
-        entry_point_specs = [
-            EntryPointSpec(
-                id="default",
-                name="Default",
-                entry_node=self.entry_node,
-                trigger_type="manual",
-                isolation_level="shared",
-            ),
-        ]
-
-        self._agent_runtime = create_agent_runtime(
-            graph=self._graph,
-            goal=self.goal,
-            storage_path=self._storage_path,
-            entry_points=entry_point_specs,
-            llm=llm,
-            tools=tools,
-            tool_executor=tool_executor,
-            checkpoint_config=checkpoint_config,
-            graph_id="hive_coder",
-        )
-
-    async def start(self, mock_mode=False) -> None:
-        """Set up and start the agent runtime."""
-        if self._agent_runtime is None:
-            self._setup(mock_mode=mock_mode)
-        if not self._agent_runtime.is_running:
-            await self._agent_runtime.start()
-
-    async def stop(self) -> None:
-        """Stop the agent runtime and clean up."""
-        if self._agent_runtime and self._agent_runtime.is_running:
-            await self._agent_runtime.stop()
-        self._agent_runtime = None
-
-    async def trigger_and_wait(
-        self,
-        entry_point: str = "default",
-        input_data: dict | None = None,
-        timeout: float | None = None,
-        session_state: dict | None = None,
-    ) -> ExecutionResult | None:
-        """Execute the graph and wait for completion."""
-        if self._agent_runtime is None:
-            raise RuntimeError("Agent not started. Call start() first.")
-
-        return await self._agent_runtime.trigger_and_wait(
-            entry_point_id=entry_point,
-            input_data=input_data or {},
-            session_state=session_state,
-        )
-
-    async def run(self, context: dict, mock_mode=False, session_state=None) -> ExecutionResult:
-        """Run the agent (convenience method for single execution)."""
-        await self.start(mock_mode=mock_mode)
-        try:
-            result = await self.trigger_and_wait("default", context, session_state=session_state)
-            return result or ExecutionResult(success=False, error="Execution timeout")
-        finally:
-            await self.stop()
-
-    def info(self):
-        """Get agent information."""
-        return {
-            "name": metadata.name,
-            "version": metadata.version,
-            "description": metadata.description,
-            "goal": {
-                "name": self.goal.name,
-                "description": self.goal.description,
-            },
-            "nodes": [n.id for n in self.nodes],
-            "edges": [e.id for e in self.edges],
-            "entry_node": self.entry_node,
-            "entry_points": self.entry_points,
-            "pause_nodes": self.pause_nodes,
-            "terminal_nodes": self.terminal_nodes,
-            "client_facing_nodes": [n.id for n in self.nodes if n.client_facing],
-        }
-
-    def validate(self):
-        """Validate agent structure."""
-        errors = []
-        warnings = []
-
-        node_ids = {node.id for node in self.nodes}
-        for edge in self.edges:
-            if edge.source not in node_ids:
-                errors.append(f"Edge {edge.id}: source '{edge.source}' not found")
-            if edge.target not in node_ids:
-                errors.append(f"Edge {edge.id}: target '{edge.target}' not found")
-
-        if self.entry_node not in node_ids:
-            errors.append(f"Entry node '{self.entry_node}' not found")
-
-        for terminal in self.terminal_nodes:
-            if terminal not in node_ids:
-                errors.append(f"Terminal node '{terminal}' not found")
-
-        for ep_id, node_id in self.entry_points.items():
-            if node_id not in node_ids:
-                errors.append(f"Entry point '{ep_id}' references unknown node '{node_id}'")
-
-        return {
-            "valid": len(errors) == 0,
-            "errors": errors,
-            "warnings": warnings,
-        }
-
-
-# Create default instance
-default_agent = HiveCoderAgent()
@@ -1,968 +0,0 @@
-"""Node definitions for Hive Coder agent."""
-
-from pathlib import Path
-
-from framework.graph import NodeSpec
-
-# Load reference docs at import time so they're always in the system prompt.
-# No voluntary read_file() calls needed — the LLM gets everything upfront.
-_ref_dir = Path(__file__).parent.parent / "reference"
-_framework_guide = (_ref_dir / "framework_guide.md").read_text(encoding="utf-8")
-_file_templates = (_ref_dir / "file_templates.md").read_text(encoding="utf-8")
-_anti_patterns = (_ref_dir / "anti_patterns.md").read_text(encoding="utf-8")
-_gcu_guide_path = _ref_dir / "gcu_guide.md"
-_gcu_guide = _gcu_guide_path.read_text(encoding="utf-8") if _gcu_guide_path.exists() else ""
-
-
-def _is_gcu_enabled() -> bool:
-    try:
-        from framework.config import get_gcu_enabled
-
-        return get_gcu_enabled()
-    except Exception:
-        return False
-
-
-def _build_appendices() -> str:
-    parts = (
-        "\n\n# Appendix: Framework Reference\n\n"
-        + _framework_guide
-        + "\n\n# Appendix: File Templates\n\n"
-        + _file_templates
-        + "\n\n# Appendix: Anti-Patterns\n\n"
-        + _anti_patterns
-    )
-    if _is_gcu_enabled() and _gcu_guide:
-        parts += "\n\n# Appendix: GCU Browser Automation Guide\n\n" + _gcu_guide
-    return parts
-
-
-# Shared appendices — appended to every coding node's system prompt.
-_appendices = _build_appendices()
-
-# Tools available to both coder (worker) and queen.
-_SHARED_TOOLS = [
-    # File I/O
-    "read_file",
-    "write_file",
-    "edit_file",
-    "hashline_edit",
-    "list_directory",
-    "search_files",
-    "run_command",
-    "undo_changes",
-    # Meta-agent
-    "list_agent_tools",
-    "validate_agent_tools",
-    "list_agents",
-    "list_agent_sessions",
-    "list_agent_checkpoints",
-    "get_agent_checkpoint",
-    "run_agent_tests",
-]
-
-# Queen mode-specific tool sets.
-# Building mode: full coding + agent construction tools.
-_QUEEN_BUILDING_TOOLS = _SHARED_TOOLS + [
-    "load_built_agent",
-    "list_credentials",
-]
-
-# Staging mode: agent loaded but not yet running — inspect, configure, launch.
-_QUEEN_STAGING_TOOLS = [
-    # Read-only (inspect agent files, logs)
-    "read_file",
-    "list_directory",
-    "search_files",
-    "run_command",
-    # Agent inspection
-    "list_credentials",
-    "get_worker_status",
-    # Launch or go back
-    "run_agent_with_input",
-    "stop_worker_and_edit",
-]
-
-# Running mode: worker is executing — monitor and control.
-_QUEEN_RUNNING_TOOLS = [
-    # Read-only coding (for inspecting logs, files)
-    "read_file",
-    "list_directory",
-    "search_files",
-    "run_command",
-    # Credentials
-    "list_credentials",
-    # Worker lifecycle
-    "stop_worker",
-    "stop_worker_and_edit",
-    "get_worker_status",
-    "inject_worker_message",
-    # Monitoring
-    "get_worker_health_summary",
-    "notify_operator",
-]
-
-
-# ---------------------------------------------------------------------------
-# Shared agent-building knowledge: core mandates, tool docs, meta-agent
-# capabilities, and workflow phases 1-6.  Both the coder (worker) and
-# queen compose their system prompts from this block + role-specific
-# additions.
-# ---------------------------------------------------------------------------
-
-_agent_builder_knowledge = """\
-
-# Core Mandates
-
-- **Read before writing.** NEVER write code from assumptions. Read \
-reference agents and templates first. Read every file before editing.
-- **Conventions first.** Follow existing project patterns exactly. \
-Analyze imports, structure, and style in reference agents.
-- **Verify assumptions.** Never assume a class, import, or pattern \
-exists. Read actual source to confirm. Search if unsure.
-- **Discover tools dynamically.** NEVER reference tools from static \
-docs. Always run list_agent_tools() to see what actually exists.
-- **Professional objectivity.** If a use case is a poor fit for the \
-framework, say so. Technical accuracy over validation.
-- **Concise.** No emojis. No preambles. No postambles. Substance only.
-- **Self-verify.** After writing code, run validation and tests. Fix \
-errors yourself. Don't declare success until validation passes.
-
-# Tools
-
-## Paths (MANDATORY)
-**Always use RELATIVE paths**
-(e.g. `exports/agent_name/config.py`, `exports/agent_name/nodes/__init__.py`).
-**Never use absolute paths** like `/mnt/data/...` or `/workspace/...` — they fail.
-The project root is implicit.
-
-## File I/O
-- read_file(path, offset?, limit?, hashline?) — read with line numbers; \
-hashline=True for N:hhhh|content anchors (use with hashline_edit)
-- write_file(path, content) — create/overwrite, auto-mkdir
-- edit_file(path, old_text, new_text, replace_all?) — fuzzy-match edit
-- hashline_edit(path, edits, auto_cleanup?, encoding?) — anchor-based \
-editing using N:hhhh refs from read_file(hashline=True). Ops: set_line, \
-replace_lines, insert_after, insert_before, replace, append
-- list_directory(path, recursive?) — list contents
-- search_files(pattern, path?, include?, hashline?) — regex search; \
-hashline=True for anchors in results
-- run_command(command, cwd?, timeout?) — shell execution
-- undo_changes(path?) — restore from git snapshot
-
-## Meta-Agent
-- list_agent_tools(server_config_path?, output_schema?, group?) — discover \
-available tools grouped by category. output_schema: "simple" (default) or \
-"full" (includes input_schema). group: "all" (default) or a prefix like \
-"gmail". Call FIRST before designing.
-- validate_agent_tools(agent_path) — validate that all tools declared \
-in an agent's nodes actually exist. Call after building.
-- list_agents() — list all agent packages in exports/ with session counts
-- list_agent_sessions(agent_name, status?, limit?) — list sessions
-- list_agent_checkpoints(agent_name, session_id) — list checkpoints
-- get_agent_checkpoint(agent_name, session_id, checkpoint_id?) — load checkpoint
-- run_agent_tests(agent_name, test_types?, fail_fast?) — run pytest with parsing
-
-# Meta-Agent Capabilities
-
-You are not just a file writer. You have deep integration with the \
-Hive framework:
-
-## Tool Discovery (MANDATORY before designing)
-Before designing any agent, run list_agent_tools() to discover all \
-available tools. ONLY use tools from this list in your node definitions. \
-NEVER guess or fabricate tool names from memory.
-
-  list_agent_tools()                                    # names + descriptions
-  list_agent_tools(output_schema="full")                # include input_schema
-  list_agent_tools(group="gmail")                       # only gmail_* tools
-  list_agent_tools("exports/{agent_name}/mcp_servers.json")  # specific agent
-
-## Agent Awareness
-Run list_agents() to see what agents already exist. Read their code \
-for patterns:
-  read_file("exports/{name}/agent.py")
-  read_file("exports/{name}/nodes/__init__.py")
-
-## Post-Build Testing
-After writing agent code, validate structurally AND run tests:
-  run_command("python -c 'from {name} import default_agent; \\
-    print(default_agent.validate())'")
-  run_agent_tests("{name}")
-
-## Debugging Built Agents
-When a user says "my agent is failing" or "debug this agent":
-1. list_agent_sessions("{agent_name}") — find the session
-2. get_worker_status
-3. list_agent_checkpoints / get_agent_checkpoint — trace execution
-
-# Agent Building Workflow
-
-You operate in a continuous loop. The user describes what they want, \
-you build it. No rigid phases — use judgment. But the general flow is:
-
-## 1. Understand & Qualify (3-5 turns)
-
-This is ONE conversation, not two phases. Discovery and qualification \
-happen together. Surface problems as you find them, not in a batch.
-
-**Before your first response**, silently run list_agent_tools() and \
-consult the **Framework Reference** appendix. Know what's possible \
-before you speak.
-
-### How to respond to the user's first message
-
-**Listen like an architect.** While they talk, hear the structure:
-- **The actors**: Who are the people/systems involved?
-- **The trigger**: What kicks off the workflow?
-- **The core loop**: What's the main thing that happens repeatedly?
-- **The output**: What's the valuable thing produced?
-- **The pain**: What about today is broken, slow, or missing?
-
-| They say... | You're hearing... |
-|-------------|-------------------|
-| Nouns they repeat | Your entities |
-| Verbs they emphasize | Your core operations |
-| Frustrations they mention | Your design constraints |
-| Workarounds they describe | What the system must replace |
-
-**Use domain knowledge aggressively.** If they say "research agent," \
-you already know it involves search, summarization, source tracking, \
-iteration. Don't ask about each — use them as defaults and let their \
-specifics override. Merge your general knowledge with their specifics: \
-60-80% right before you ask a single question.
-
-### Play back a model WITH qualification baked in
-
-Don't separate "here's what I understood" from "here's what might be \
-a problem." Weave them together. Your playback should sound like:
-
-"Here's how I'm picturing this: [concrete proposed solution]. \
-The framework handles [X and Y] well for this. [One concern: Z tool \
-doesn't exist, so we'd use W instead / Z would need real-time which \
-isn't a fit, but we could do polling]. For MVP I'd focus on \
-[highest-value thing]. Before I start — [1-2 questions]."
-
-If there's a deal-breaker, lead with it: "Before I go further — \
-this needs [X] which the framework can't do because [Y]. We could \
-[workaround] or reconsider the approach. What do you think?"
-
-**Surface problems immediately. Don't save them for a formal review.**
-
-### Ask only what you CANNOT infer
-
-Every question must earn its place by preventing a costly wrong turn, \
-unlocking a shortcut, or surfacing a dealbreaker.
-
-Good questions: "Who's the primary user?", "Is this replacing \
-something or net new?", "Does this integrate with anything?"
-
-Bad questions (DON'T ask): "What should happen on error?", "Should \
-it have search?", "What tools should I use?" — these are your job.
-
-### Conversation flow
-
-| Turn | Who | What |
-|------|-----|------|
-| 1 | User | Describes what they need |
-| 2 | You | Play back model with concerns baked in. 1-2 questions max. |
-| 3 | User | Corrects, confirms, or adds detail |
-| 4 | You | Adjust model, confirm scope, move to design |
-
-### Anti-patterns
-
-| Don't | Do instead |
-|-------|------------|
-| Open with a list of questions | Open with what you understood |
-| Separate "assessment" dump | Weave concerns into your playback |
-| Good/Bad/Ugly formal section | Mention issues naturally in context |
-| Ask about every edge case | Smart defaults, flag in summary |
-| 10+ turn discovery | 3-5 turns, then start building |
-| Wait for certainty | Start at 80% confidence, iterate |
-| Ask what tech/tools to use | Decide, disclose, move on |
-
-## 3. Design
-
-Design the agent architecture:
-- Goal: id, name, description, 3-5 success criteria, 2-4 constraints
-- Nodes: **2-4 nodes MAXIMUM** (see rules below)
-- Edges: on_success for linear, conditional for routing
-- Lifecycle: ALWAYS forever-alive (`terminal_nodes=[]`) unless the user \
-explicitly requests a one-shot/batch agent. Forever-alive agents loop \
-continuously — the user exits by closing the TUI. This is the standard \
-pattern for all interactive agents.
-
-### Node Design Rules
-
-Each node boundary serializes outputs to shared memory \
-and DESTROYS all in-context information (tool results, reasoning, history). \
-Use as many nodes as the use case requires, but don't create nodes without \
-tools — merge them into nodes that do real work.
-
-**MERGE nodes when:**
-- Node has NO tools (pure LLM reasoning) → merge into predecessor/successor
-- Node sets only 1 trivial output → collapse into predecessor
-- Multiple consecutive autonomous nodes → combine into one rich node
-- A "report" or "summary" node → merge into the client-facing node
-- A "confirm" or "schedule" node that calls no external service → remove
-
-**SEPARATE nodes only when:**
-- Client-facing vs autonomous (different interaction models)
-- Fundamentally different tool sets
-- Fan-out parallelism (parallel branches MUST be separate)
-
-**Typical patterns (queen manages intake — NO client-facing intake node):**
-- 2 nodes: `process (autonomous) → review (client-facing) → process`
-- 1 node: `process (autonomous)` — simplest; queen handles all interaction
-- WRONG: 7 nodes where half have no tools and just do LLM reasoning
-- WRONG: Intake node that asks the user for requirements — the queen does intake
-
-Read reference agents before designing:
-  list_agents()
-  read_file("exports/deep_research_agent/agent.py")
-  read_file("exports/deep_research_agent/nodes/__init__.py")
-
-Present the design to the user. Lead with a large ASCII graph inside \
-a code block so it renders in monospace. Make it visually prominent — \
-use box-drawing characters and clear flow arrows:
-
-```
-┌─────────────────────────┐
-│  process (autonomous)    │
-│  in:  user_request       │
-│  tools: web_search,      │
-│         save_data        │
-└────────────┬────────────┘
-             │ on_success
-             ▼
-┌─────────────────────────┐
-│  review (client-facing)  │
-│  tools: set_output       │
-└────────────┬────────────┘
-             │ on_success
-             └──────► back to process
-```
-
-The queen owns intake: she gathers user requirements, then calls \
-`run_agent_with_input(task)` with a structured task description. \
-When building the agent, design the entry node's `input_keys` to \
-match what the queen will provide at run time. No client-facing \
-intake node in the worker.
-
-Follow the graph with a brief summary of each node's purpose. \
-Get user approval before implementing.
-
-## 4. Implement
-
-Consult the **File Templates** and **Anti-Patterns** appendices below.
-
-Write files in order:
-1. mkdir -p exports/{name}/nodes exports/{name}/tests
-2. config.py — RuntimeConfig + AgentMetadata
-3. nodes/__init__.py — NodeSpec definitions with system prompts
-4. agent.py — Goal, edges, graph, agent class
-5. __init__.py — package exports
-6. __main__.py — CLI with click
-7. mcp_servers.json — tool server config
-8. tests/ — fixtures
-
-### Critical Rules
-
-**Imports** (must match exactly — only import what you use):
-```python
-from framework.graph import (
-    NodeSpec, EdgeSpec, EdgeCondition,
-    Goal, SuccessCriterion, Constraint,
-)
-from framework.graph.edge import GraphSpec
-from framework.graph.executor import ExecutionResult
-from framework.graph.checkpoint_config import CheckpointConfig
-from framework.llm import LiteLLMProvider
-from framework.runner.tool_registry import ToolRegistry
-from framework.runtime.agent_runtime import (
-    AgentRuntime, create_agent_runtime,
-)
-from framework.runtime.execution_stream import EntryPointSpec
-```
-For agents with async entry points (timers, webhooks, events), also add:
-```python
-from framework.graph.edge import GraphSpec, AsyncEntryPointSpec
-from framework.runtime.agent_runtime import (
-    AgentRuntime, AgentRuntimeConfig, create_agent_runtime,
-)
-```
-NEVER `from core.framework...` — PYTHONPATH includes core/.
-
-**__init__.py MUST re-export ALL module-level variables** \
-(THIS IS THE #1 SOURCE OF AGENT LOAD FAILURES):
-The runner imports the package (__init__.py), NOT agent.py. It reads \
-goal, nodes, edges, entry_node, entry_points, pause_nodes, \
-terminal_nodes, conversation_mode, identity_prompt, loop_config via \
-getattr(). If ANY are missing from __init__.py, they silently default \
-to None or {} — causing "must define goal, nodes, edges" or "node X \
-is unreachable" errors. The __init__.py MUST import and re-export \
-ALL of these from .agent:
-```python
-from .agent import (
-    MyAgent, default_agent, goal, nodes, edges,
-    entry_node, entry_points, pause_nodes, terminal_nodes,
-    conversation_mode, identity_prompt, loop_config,
-)
-```
-
-**entry_points**: `{"start": "first-node-id"}`
-The first node should be an autonomous processing node (NOT a \
-client-facing intake). For agents with multiple entry points, \
-add them: `{"start": "process", "reminder": "check"}`
-
-**conversation_mode** — ONLY two valid values:
-- `"continuous"` — recommended for interactive agents (context carries \
-across node transitions)
-- Omit entirely — for isolated per-node conversations
-NEVER use: "client_facing", "interactive", "adaptive", or any other \
-value. These DO NOT EXIST.
-
-**loop_config** — ONLY three valid keys:
-```python
-loop_config = {
-    "max_iterations": 100,
-    "max_tool_calls_per_turn": 30,
-    "max_history_tokens": 32000,
-}
-```
-NEVER add: "strategy", "mode", "timeout", or other keys.
-
-**mcp_servers.json**:
-```json
-{
-  "hive-tools": {
-    "transport": "stdio",
-    "command": "uv",
-    "args": ["run", "python", "mcp_server.py", "--stdio"],
-    "cwd": "../../tools"
-  }
-}
-```
-NO "mcpServers" wrapper. cwd "../../tools". command "uv".
-
-**Storage**: `Path.home() / ".hive" / "agents" / "{name}"`
-
-**Client-facing system prompts** (review/approval nodes only, NOT intake) \
-— STEP 1/STEP 2 pattern:
-```
-STEP 1 — Present to user (text only, NO tool calls):
-[instructions]
-
-STEP 2 — After user responds, call set_output:
-[set_output calls]
-```
-The queen manages intake. Workers should NOT have a client-facing node \
-that asks for requirements. Use client_facing=True only for review or \
-approval checkpoints mid-execution.
-
-**Autonomous system prompts** — set_output in SEPARATE turn.
-
-**Tools** — NEVER fabricate tool names. Common hallucinations: \
-csv_read, csv_write, csv_append, file_upload, database_query. \
-If list_agent_tools() shows these don't exist, use alternatives \
-(e.g. save_data/load_data for data persistence).
-
-**Node rules**:
-- **NO intake nodes.** The queen owns intake. She defines the entry \
-node's input_keys at build time and fills them via \
-`run_agent_with_input(task)` at run time.
-- Don't create nodes without tools — merge them into a node that does real work.
-- A node with 0 tools is NOT a real node — merge it.
-- node_type "event_loop" for all regular graph nodes. Use "gcu" ONLY for
-  browser automation subagents (see GCU appendix). GCU nodes MUST be in a
-  parent node's sub_agents list, NEVER connected via edges, and NEVER used
-  as entry/terminal nodes.
-- max_node_visits default is 0 (unbounded) — correct for forever-alive. \
-Only set >0 in one-shot agents with bounded feedback loops.
-- Feedback inputs: nullable_output_keys
-- terminal_nodes=[] for forever-alive (the default)
-- Every node MUST have at least one outgoing edge (no dead ends)
-- Agents are forever-alive unless user explicitly asks for one-shot
-
-**Agent class**: CamelCase name, default_agent at module level. \
-Constructor takes `config=None`. Follow the exact pattern in \
-file_templates.md — do NOT invent constructor params like \
-`llm_provider` or `tool_registry`.
-
-**Module-level variables** (read by AgentRunner.load()):
-goal, nodes, edges, entry_node, entry_points, pause_nodes,
-terminal_nodes, conversation_mode, identity_prompt, loop_config
-
-For agents with async triggers, also export:
-async_entry_points, runtime_config
-
-**Async entry points** (timers, webhooks, events):
-When an agent needs scheduled tasks, webhook reactions, or event-driven \
-triggers, use `AsyncEntryPointSpec` (from framework.graph.edge) and \
-`AgentRuntimeConfig` (from framework.runtime.agent_runtime):
- Timer (cron): `trigger_type="timer"`, \
-`trigger_config={"cron": "0 9 * * *"}` — standard 5-field cron expression \
-(e.g. `"0 9 * * MON-FRI"` weekdays 9am, `"*/30 * * * *"` every 30 min)
- Timer (interval): `trigger_type="timer"`, \
-`trigger_config={"interval_minutes": 20, "run_immediately": False}`
- Event (for webhooks): `trigger_type="event"`, \
-`trigger_config={"event_types": ["webhook_received"]}`
- `isolation_level="shared"` so async runs can read primary session memory
- `runtime_config = AgentRuntimeConfig(webhook_routes=[...])` for HTTP webhooks
- Reference: `exports/gmail_inbox_guardian/agent.py`
- Full docs: see **Framework Reference** appendix (Async Entry Points section)
-
-## 5. Verify
-
-Run FOUR validation steps after writing. All must pass:
-
-**Step A — Class validation** (checks graph structure):
-```
-run_command("python -c 'from {name} import default_agent; \\
-  print(default_agent.validate())'")
-```
-
-**Step B — Runner load test** (checks package export contract — \
-THIS IS THE SAME PATH THE TUI USES):
-```
-run_command("python -c 'from framework.runner.runner import \\
-  AgentRunner; r = AgentRunner.load(\"exports/{name}\"); \\
-  print(\"AgentRunner.load: OK\")'")
-```
-This catches missing __init__.py exports, bad conversation_mode, \
-invalid loop_config, and unreachable nodes. If Step A passes but \
-Step B fails, the problem is in __init__.py exports.
-
-**Step C — Tool validation** (checks that declared tools actually exist \
-in the agent's MCP servers — catches hallucinated tool names):
-```
-validate_agent_tools("exports/{name}")
-```
-If any tools are missing: fix the node definitions to use only tools \
-that exist. Run list_agent_tools() to see what's available.
-
-**Step D — Run tests:**
-```
-run_agent_tests("{name}")
-```
-
-If anything fails: read error, fix with edit_file, re-validate. Up to 3x.
-
-**CRITICAL: Testing forever-alive agents**
-Most agents use `terminal_nodes=[]` (forever-alive). This means \
-`runner.run()` NEVER returns — it hangs forever waiting for a \
-terminal node that doesn't exist. Agent tests MUST be structural:
- Validate graph, node specs, edges, tools, prompts
- Check goal/constraints/success criteria definitions
- Test `AgentRunner.load()` succeeds (structural, no API key needed)
- NEVER call `runner.run()` or `trigger_and_wait()` in tests for \
-forever-alive agents — they will hang and time out.
-When you restructure an agent (change nodes/edges), always update \
-the tests to match. Stale tests referencing old node names will fail.
-
-## 6. Present
-
-Show the user what you built: agent name, goal summary, graph (same \
-ASCII style as Design), files created, validation status. Offer to \
-revise or build another.
-"""
-
-
-# ---------------------------------------------------------------------------
-# Coder-specific: set_output after presentation + standalone phase 7
-# ---------------------------------------------------------------------------
-
-_coder_completion = """
-After user confirms satisfaction:
-  set_output("agent_name", "the_agent_name")
-  set_output("validation_result", "valid")
-
-If building another agent, just start the loop again — no need to \
-set_output until the user is done.
-
-## 7. Live Test (optional)
-
-After the user approves, offer to load and run the agent in-session.
-
-If running with a queen (server/frontend):
-```
-load_built_agent("exports/{name}")  # loads as the session worker
-```
-The frontend updates automatically — the user sees the agent's graph, \
-the tab renames, and you can delegate via start_worker(task).
-
-If running standalone (TUI):
-```
-load_agent("exports/{name}")   # registers as secondary graph
-start_agent("{name}")           # triggers default entry point
-```
-"""
-
-
-# ---------------------------------------------------------------------------
-# Queen-specific: extra tool docs, behavior, phase 7, style
-# ---------------------------------------------------------------------------
-
-_queen_tools_docs = """
-
-## Operating Modes
-
-You operate in one of three modes. Your available tools change based on the \
-mode. The system notifies you when a mode change occurs.
-
-### BUILDING mode (default)
-You have full coding tools for building and modifying agents:
- File I/O: read_file, write_file, edit_file, list_directory, search_files, \
-run_command, undo_changes
- Meta-agent: list_agent_tools, validate_agent_tools, \
-list_agents, list_agent_sessions, \
-list_agent_checkpoints, get_agent_checkpoint, run_agent_tests
- load_built_agent(agent_path) — Load the agent and switch to STAGING mode
- list_credentials(credential_id?) — List authorized credentials
-
-When you finish building an agent, call load_built_agent(path) to stage it.
-
-### STAGING mode (agent loaded, not yet running)
-The agent is loaded and ready to run. You can inspect it and launch it:
- Read-only: read_file, list_directory, search_files, run_command
- list_credentials(credential_id?) — Verify credentials are configured
- get_worker_status() — Check the loaded worker
- run_agent_with_input(task) — Start the worker and switch to RUNNING mode
- stop_worker_and_edit() — Go back to BUILDING mode
-
-In STAGING mode you do NOT have write tools. If you need to modify the agent, \
-call stop_worker_and_edit() to go back to BUILDING mode.
-
-### RUNNING mode (worker is executing)
-The worker is running. You have monitoring and lifecycle tools:
- Read-only: read_file, list_directory, search_files, run_command
- get_worker_status() — Check worker status (idle, running, waiting)
- inject_worker_message(content) — Send a message to the running worker
- get_worker_health_summary() — Read the latest health data
- notify_operator(ticket_id, analysis, urgency) — Alert the user (use sparingly)
- stop_worker() — Stop the worker and return to STAGING mode, then ask the user what to do next
- stop_worker_and_edit() — Stop the worker and switch back to BUILDING mode
-
-In RUNNING mode you do NOT have write tools or agent construction tools. \
-If you need to modify the agent, call stop_worker_and_edit() to switch back \
-to BUILDING mode. To stop the worker and ask the user what to do next, call \
-stop_worker() to return to STAGING mode.
-
-### Mode transitions
- load_built_agent(path) → switches to STAGING mode
- run_agent_with_input(task) → starts worker, switches to RUNNING mode
- stop_worker() → stops worker, switches to STAGING mode (ask user: re-run or edit?)
- stop_worker_and_edit() → stops worker (if running), switches to BUILDING mode
-"""
-
-_queen_behavior = """
-# Behavior
-
-## CRITICAL RULE — ask_user tool
-
-Every response that ends with a question, a prompt, or expects user \
-input MUST finish with a call to ask_user(prompt, options). This is \
-NON-NEGOTIABLE. The system CANNOT detect that you are waiting for \
-input unless you call ask_user. You MUST call ask_user as the LAST \
-action in your response.
-
-NEVER end a response with a question in text without calling ask_user. \
-NEVER rely on the user seeing your text and replying — call ask_user.
-
-Always provide 2-4 short options that cover the most likely answers. \
-The user can always type a custom response.
-
-Examples:
- ask_user("What do you need?",
-  ["Build a new agent", "Run the loaded worker", "Help with code"])
- ask_user("Which pattern?",
-  ["Simple 2-node", "Rich with feedback", "Custom"])
- ask_user("Ready to proceed?",
-  ["Yes, go ahead", "Let me change something"])
-
-## Greeting and identity
-
-When the user greets you or asks what you can do, respond concisely \
-(under 10 lines). DO NOT list internal processes. Focus on:
-1. Direct capabilities: coding, agent building & debugging.
-2. What the loaded worker does (one sentence from Worker Profile). \
-If no worker is loaded, say so.
-3. THEN call ask_user to prompt them — do NOT just write text.
-
-## Direct coding
-You can do any coding task directly — reading files, writing code, running \
-commands, building agents, debugging. For quick tasks, do them yourself.
-
-## Worker delegation
-The worker is a specialized agent (see Worker Profile at the end of this \
-prompt). It can ONLY do what its goal and tools allow.
-
-**Decision rule — read the Worker Profile first:**
- The user's request directly matches the worker's goal → use \
-run_agent_with_input(task) (if in staging) or load then run (if in building)
- Anything else → do it yourself. Do NOT reframe user requests into \
-subtasks to justify delegation.
- Building, modifying, or configuring agents is ALWAYS your job. Never \
-delegate agent construction to the worker, even as a "research" subtask.
-
-## When the user says "run", "execute", or "start" (without specifics)
-
-The loaded worker is described in the Worker Profile below. You MUST \
-ask the user what task or input they want using ask_user — do NOT \
-invent a task, do NOT call list_agents() or list directories. \
-The worker is already loaded. Just ask for the specific input the \
-worker needs (e.g., a research topic, a target domain, a job description). \
-NEVER call run_agent_with_input until the user has provided their input.
-
-If NO worker is loaded, say so and offer to build one.
-
-## When in staging mode (agent loaded, not running):
- Tell the user the agent is loaded and ready.
- For tasks matching the worker's goal: ALWAYS ask the user for their \
-specific input BEFORE calling run_agent_with_input(task). NEVER make up \
-or assume what the user wants. Use ask_user to collect the task details \
-(e.g., topic, target, requirements). Once you have the user's answer, \
-compose a structured task description from their input and call \
-run_agent_with_input(task). The worker has no intake node — it receives \
-your task and starts processing.
- If the user wants to modify the agent, call stop_worker_and_edit().
-
-## When idle (worker not running):
- Greet the user. Mention what the worker can do in one sentence.
- For tasks matching the worker's goal, use run_agent_with_input(task) \
-(if in staging) or load the agent first (if in building).
- For everything else, do it directly.
-
-## When the user clicks Run (external event notification)
-When you receive an event that the user clicked Run:
- If the worker started successfully, briefly acknowledge it — do NOT \
-repeat the full status. The user can see the graph is running.
- If the worker failed to start (credential or structural error), \
-explain the problem clearly and help fix it. For credential errors, \
-guide the user to set up the missing credentials. For structural \
-issues, offer to fix the agent graph directly.
-
-## When worker is running — GO SILENT
-
-Once you call start_worker(), your job is DONE. Do NOT call ask_user, \
-do NOT call get_worker_status(), do NOT emit any text beyond the single \
-confirmation sentence described below. Then stop. \
-The worker owns the conversation now — it has its own client-facing \
-nodes that talk to the user directly.
-
-**After start_worker, your ENTIRE response should be ONE short \
-confirmation sentence with NO tool calls.** Example: \
-"Started the vulnerability assessment." — that's it. No ask_user, \
-no get_worker_status, no follow-up questions.
-
-You only wake up again when:
- The user explicitly addresses you (not answering a worker question)
- A worker question is forwarded to you for relay
- An escalation ticket arrives from the judge
- The worker finishes
-
-If the user explicitly asks about progress, call get_worker_status() \
-ONCE and report. Do NOT poll or check proactively.
-
-For escalation tickets: low/transient → acknowledge silently. \
-High/critical → notify the user with a brief analysis.
-
-## When the worker asks the user a question:
- The user's answer is routed to you with context: \
-[Worker asked: "...", Options: ...] User answered: "...".
- If the user is answering the worker's question normally, relay it \
-using inject_worker_message(answer_text). Then go silent again.
- If the user is rejecting the approach, asking to stop, or giving \
-you an instruction, handle it yourself — do NOT relay.
-
-## Showing or describing the loaded worker
-
-When the user asks to "show the graph", "describe the agent", or \
-"re-generate the graph", read the Worker Profile and present the \
-worker's current architecture as an ASCII diagram. Use the processing \
-stages, tools, and edges from the loaded worker. Do NOT enter the \
-agent building workflow — you are describing what already exists, not \
-building something new.
-
-## Modifying the loaded worker
-
-When the user asks to change, modify, or update the loaded worker \
-(e.g., "change the report node", "add a node", "delete node X"):
-
-1. Call stop_worker_and_edit() — this stops the worker and gives you \
-coding tools (switches to BUILDING mode).
-2. Use the **Path** from the Worker Profile to locate the agent files.
-3. Read the relevant files (nodes/__init__.py, agent.py, etc.).
-4. Make the requested changes using edit_file / write_file.
-5. Run validation (default_agent.validate(), AgentRunner.load(), \
-validate_agent_tools()).
-6. **Reload the modified worker**: call load_built_agent("{path}") \
-so the changes take effect immediately (switches to STAGING mode). \
-Then call run_agent_with_input(task) to restart execution.
-
-Do NOT skip step 6 — without reloading, the user will still be \
-interacting with the old version.
-"""
-
-_queen_phase_7 = """
-## 7. Load into Session
-
-After building and verifying, load the agent into the current session:
-  load_built_agent("exports/{name}")
-This switches to STAGING mode — the user sees the agent's graph and \
-the tab name updates. Then call run_agent_with_input(task) to start it. \
-Do NOT tell the user to run `python -m {name} run` — load and run it here.
-"""
-
-_queen_style = """
-# Style
-
- Concise. No fluff. Direct. No emojis.
- **One phase per response.** Stop after each phase and get user \
-confirmation before moving on. Never combine understand + design + \
-implement in one response.
- When starting the worker, describe what you told it in one sentence.
- When an escalation arrives, lead with severity and recommended action.
-"""
-
-
-# ---------------------------------------------------------------------------
-# Node definitions
-# ---------------------------------------------------------------------------
-
-# Single node — like opencode's while(true) loop.
-# One continuous context handles the entire workflow:
-# discover → design → implement → verify → present → iterate.
-coder_node = NodeSpec(
-    id="coder",
-    name="Hive Coder",
-    description=(
-        "Autonomous coding agent that builds Hive agent packages. "
-        "Handles the full lifecycle: understanding user intent, "
-        "designing architecture, writing code, validating, and "
-        "iterating on feedback — all in one continuous conversation."
-    ),
-    node_type="event_loop",
-    client_facing=True,
-    max_node_visits=0,
-    input_keys=["user_request"],
-    output_keys=["agent_name", "validation_result"],
-    success_criteria=(
-        "A complete, validated Hive agent package exists at "
-        "exports/{agent_name}/ and passes structural validation."
-    ),
-    tools=_SHARED_TOOLS
-    + [
-        # Graph lifecycle tools (multi-graph sessions)
-        "load_agent",
-        "unload_agent",
-        "start_agent",
-        "restart_agent",
-        "get_user_presence",
-    ],
-    system_prompt=(
-        "You are Hive Coder, the best agent-building coding agent. You build "
-        "production-ready Hive agent packages from natural language.\n"
-        + _agent_builder_knowledge
-        + _coder_completion
-        + _appendices
-    ),
-)
-
-
-ticket_triage_node = NodeSpec(
-    id="ticket_triage",
-    name="Ticket Triage",
-    description=(
-        "Queen's triage node. Receives an EscalationTicket from the Health Judge "
-        "via event-driven entry point and decides: dismiss or notify the operator."
-    ),
-    node_type="event_loop",
-    client_facing=True,  # Operator can chat with queen once connected (Ctrl+Q)
-    max_node_visits=0,
-    input_keys=["ticket"],
-    output_keys=["intervention_decision"],
-    nullable_output_keys=["intervention_decision"],
-    success_criteria=(
-        "A clear intervention decision: either dismissed with documented reasoning, "
-        "or operator notified via notify_operator with specific analysis."
-    ),
-    tools=["notify_operator"],
-    system_prompt="""\
-You are the Queen (Hive Coder). The Worker Health Judge has escalated a worker \
-issue to you. The ticket is in your memory under key "ticket". Read it carefully.
-
-## Dismiss criteria — do NOT call notify_operator:
- severity is "low" AND steps_since_last_accept < 8
- Cause is clearly a transient issue (single API timeout, brief stall that \
-  self-resolved based on the evidence)
- Evidence shows the agent is making real progress despite bad verdicts
-
-## Intervene criteria — call notify_operator:
- severity is "high" or "critical"
- steps_since_last_accept >= 10 with no sign of recovery
- stall_minutes > 4 (worker definitively stuck)
- Evidence shows a doom loop (same error, same tool, no progress)
- Cause suggests a logic bug, missing configuration, or unrecoverable state
-
-## When intervening:
-Call notify_operator with:
-  ticket_id: <ticket["ticket_id"]>
-  analysis: "<2-3 sentences: what is wrong, why it matters, suggested action>"
-  urgency: "<low|medium|high|critical>"
-
-## After deciding:
-set_output("intervention_decision", "dismissed: <reason>" or "escalated: <summary>")
-
-Be conservative but not passive. You are the last quality gate before the human \
-is disturbed. One unnecessary alert is less costly than alert fatigue — but \
-genuine stuck agents must be caught.
-""",
-)
-
-ALL_QUEEN_TRIAGE_TOOLS = ["notify_operator"]
-
-
-queen_node = NodeSpec(
-    id="queen",
-    name="Queen",
-    description=(
-        "User's primary interactive interface with full coding capability. "
-        "Can build agents directly or delegate to the worker. Manages the "
-        "worker agent lifecycle and triages health escalations from the judge."
-    ),
-    node_type="event_loop",
-    client_facing=True,
-    max_node_visits=0,
-    input_keys=["greeting"],
-    output_keys=[],
-    nullable_output_keys=[],
-    success_criteria=(
-        "User's intent is understood, coding tasks are completed correctly, "
-        "and the worker is managed effectively when delegated to."
-    ),
-    tools=sorted(set(_QUEEN_BUILDING_TOOLS + _QUEEN_STAGING_TOOLS + _QUEEN_RUNNING_TOOLS)),
-    system_prompt=(
-        "You are the Queen — the user's primary interface. You are a coding agent "
-        "with the same capabilities as the Hive Coder worker, PLUS the ability to "
-        "manage the worker's lifecycle.\n"
-        + _agent_builder_knowledge
-        + _queen_tools_docs
-        + _queen_behavior
-        + _queen_phase_7
-        + _queen_style
-        + _appendices
-    ),
-)
-
-ALL_QUEEN_TOOLS = sorted(set(_QUEEN_BUILDING_TOOLS + _QUEEN_STAGING_TOOLS + _QUEEN_RUNNING_TOOLS))
-
-__all__ = [
-    "coder_node",
-    "ticket_triage_node",
-    "queen_node",
-    "ALL_QUEEN_TRIAGE_TOOLS",
-    "ALL_QUEEN_TOOLS",
-    "_QUEEN_BUILDING_TOOLS",
-    "_QUEEN_STAGING_TOOLS",
-    "_QUEEN_RUNNING_TOOLS",
-]
@@ -1,113 +0,0 @@
-# Common Mistakes When Building Hive Agents
-
-## Critical Errors
-
-1. **Using tools that don't exist** — Always verify tools are available in the hive-tools MCP server before assigning them to nodes. Never guess tool names.
-
-2. **Wrong entry_points format** — MUST be `{"start": "first-node-id"}`. NOT a set, NOT `{node_id: [keys]}`.
-
-3. **Wrong mcp_servers.json format** — Flat dict (no `"mcpServers"` wrapper). `cwd` must be `"../../tools"`. `command` must be `"uv"` with args `["run", "python", ...]`.
-
-4. **Missing STEP 1/STEP 2 in client-facing prompts** — Without explicit phases, the LLM calls set_output before the user responds. Always use the pattern.
-
-5. **Forgetting nullable_output_keys** — When a node receives inputs from multiple edges and some inputs only arrive on certain edges (e.g., feedback), mark those as nullable. Without this, the executor blocks waiting for a value that will never arrive.
-
-6. **Creating dead-end nodes in forever-alive graphs** — Every node must have at least one outgoing edge. A node with no outgoing edges ends the execution, breaking the loop.
-
-7. **Setting max_node_visits to a non-zero value in forever-alive agents** — The framework default is `max_node_visits=0` (unbounded). Setting it to any positive value (e.g., 1) means the node stops executing after that many visits, silently breaking the forever-alive loop. Only set `max_node_visits > 0` in one-shot agents with feedback loops that need bounded retries.
-
-7. **Missing module-level exports in `__init__.py`** — The runner loads agents via `importlib.import_module(package_name)`, which imports `__init__.py`. It then reads `goal`, `nodes`, `edges`, `entry_node`, `entry_points`, `pause_nodes`, `terminal_nodes`, `conversation_mode`, `identity_prompt`, `loop_config` via `getattr()`. If ANY of these are missing from `__init__.py`, they default to `None` or `{}` — causing "must define goal, nodes, edges" errors or "node X is unreachable" validation failures. **ALL module-level variables from agent.py must be re-exported in `__init__.py`.**
-
-## Value Errors
-
-8. **Invalid `conversation_mode` value** — Only two valid values: `"continuous"` (recommended for interactive agents) or omit entirely (for isolated per-node conversations). Values like `"client_facing"`, `"interactive"`, `"adaptive"` do NOT exist and will cause runtime errors.
-
-9. **Invalid `loop_config` keys** — Only three valid keys: `max_iterations` (int), `max_tool_calls_per_turn` (int), `max_history_tokens` (int). Keys like `"strategy"`, `"mode"`, `"timeout"` are NOT valid and are silently ignored or cause errors.
-
-10. **Fabricating tools that don't exist** — Never guess tool names. Always verify via `list_agent_tools()` before designing and `validate_agent_tools()` after building. Common hallucinations: `csv_read`, `csv_write`, `csv_append`, `file_upload`, `database_query`, `bulk_fetch_emails`. If a required tool doesn't exist, redesign the agent to use tools that DO exist (e.g., `save_data`/`load_data` for data persistence).
-
-## Design Errors
-
-11. **Too many thin nodes** — Hard limit: **2-4 nodes** for most agents. Each node boundary serializes outputs to shared memory and loses all in-context information (tool results, intermediate reasoning, conversation history). A node with 0 tools that just does LLM reasoning is NOT a real node — merge it into its predecessor or successor.
-
-**Merge when:**
- Node has NO tools — pure LLM reasoning belongs in the node that produces or consumes its data
- Node sets only 1 trivial output (e.g., `set_output("done", "true")`) — collapse into predecessor
- Multiple consecutive autonomous nodes with same/similar tools — combine into one
- A "report" or "summary" node that just presents analysis — merge into the client-facing node
- A "schedule" or "confirm" node that doesn't actually schedule anything — remove entirely
-
-**Keep separate when:**
- Client-facing vs autonomous — different interaction models require separate nodes
- Fundamentally different tool sets (e.g., web search vs file I/O)
- Fan-out parallelism — parallel branches MUST be separate nodes
-
-**Bad example** (7 nodes — WAY too many):
-```
-profile_setup → daily_intake → update_tracker → analyze_progress → generate_plan → schedule_reminders → report
-```
-`analyze_progress` has no tools. `schedule_reminders` just sets one boolean. `report` just presents analysis. `update_tracker` and `generate_plan` are sequential autonomous work.
-
-**Good example** (2 nodes):
-```
-process (autonomous: track + analyze + plan) → review (client-facing) → process (loop back)
-```
-The queen handles intake (gathering requirements from the user) and passes the task via `run_agent_with_input(task)`. One autonomous node handles ALL backend work (CSV update, analysis, plan generation) with tools and context preserved. One client-facing node handles review/approval when needed.
-
-12. **Adding framework gating for LLM behavior** — Don't add output rollback, premature rejection, or interaction protocol injection. Fix with better prompts or custom judges.
-
-13. **Not using continuous conversation mode** — Interactive agents should use `conversation_mode="continuous"`. Without it, each node starts with blank context.
-
-14. **Adding terminal nodes by default** — ALL agents should use `terminal_nodes=[]` (forever-alive) unless the user explicitly requests a one-shot/batch agent. Forever-alive is the standard pattern. Every node must have at least one outgoing edge. Dead-end nodes break the loop.
-
-15. **Calling set_output in same turn as tool calls** — Instruct the LLM to call set_output in a SEPARATE turn from real tool calls.
-
-## File Template Errors
-
-16. **Wrong import paths** — Use `from framework.graph import ...`, NOT `from core.framework.graph import ...`. The PYTHONPATH includes `core/`.
-
-17. **Missing storage path** — Agent class must set `self._storage_path = Path.home() / ".hive" / "agents" / "agent_name"`.
-
-18. **Missing mcp_servers.json** — Without this, the agent has no tools at runtime.
-
-19. **Bare `python` command in mcp_servers.json** — Use `"command": "uv"` with args `["run", "python", ...]`.
-
-## Testing Errors
-
-20. **Using `runner.run()` on forever-alive agents** — `runner.run()` calls `trigger_and_wait()` which blocks until the graph reaches a terminal node. Forever-alive agents have `terminal_nodes=[]`, so **`runner.run()` hangs forever**. This is the #1 cause of stuck test suites.
-
-**For forever-alive agents, write structural tests instead:**
- Validate graph structure (nodes, edges, entry points)
- Verify node specs (tools, prompts, client-facing flag)
- Check goal/constraints/success criteria definitions
- Test that `AgentRunner.load()` succeeds (structural, no API key needed)
-
-**What NOT to do:**
-```python
-# WRONG — hangs forever on forever-alive agents
-result = await runner.run({"topic": "quantum computing"})
-```
-
-**Correct pattern for structure tests:**
-```python
-def test_research_has_web_tools(self):
-    assert "web_search" in research_node.tools
-
-def test_research_routes_back_to_interact(self):
-    edges_to_interact = [e for e in edges if e.source == "research" and e.target == "interact"]
-    assert edges_to_interact
-```
-
-21. **Stale tests after agent restructuring** — When you change an agent's node count or names (e.g., 4 nodes → 2 nodes), the tests MUST be updated too. Tests referencing old node names (e.g., `"review"`, `"report"`) will fail or hang. Always check that test assertions match the current `nodes/__init__.py`.
-
-22. **Running full integration tests without API keys** — Structural tests (validate, import) work without keys. Full integration tests need `ANTHROPIC_API_KEY`. Use `pytest.skip()` in the runner fixture when `_setup()` fails due to missing credentials.
-
-23. **Forgetting sys.path setup in conftest.py** — Tests need `exports/` and `core/` on sys.path.
-
-24. **Not using auto_responder for client-facing nodes** — Tests with client-facing nodes hang without an auto-responder that injects input. But note: even WITH auto_responder, forever-alive agents still hang because the graph never terminates. Auto-responder only helps for agents with terminal nodes.
-
-25. **Manually wiring browser tools on event_loop nodes** — If the agent needs browser automation, use `node_type="gcu"` which auto-includes all browser tools and prepends best-practices guidance. Do NOT manually list browser tool names on event_loop nodes — they may not exist in the MCP server or may be incomplete. See the GCU Guide appendix.
-
-26. **Using GCU nodes as regular graph nodes** — GCU nodes (`node_type="gcu"`) are exclusively subagents. They must ONLY appear in a parent node's `sub_agents=["gcu-node-id"]` list and be invoked via `delegate_to_sub_agent()`. They must NEVER be connected via edges, used as entry nodes, or used as terminal nodes. If a GCU node appears as an edge source or target, the graph will fail pre-load validation.
-
-27. **Adding a client-facing intake node to worker agents** — The queen owns intake. She defines the entry node's `input_keys` at build time and fills them via `run_agent_with_input(task)` at run time. Worker agents should start with an autonomous processing node, NOT a client-facing intake node that asks the user for requirements. Client-facing nodes in workers are for mid-execution review/approval only.
@@ -0,0 +1,21 @@
+"""
+Queen — Native agent builder for the Hive framework.
+
+Deeply understands the agent framework and produces complete Python packages
+with goals, nodes, edges, system prompts, MCP configuration, and tests
+from natural language specifications.
+"""
+
+from .agent import queen_goal, queen_graph
+from .config import AgentMetadata, RuntimeConfig, default_config, metadata
+
+__version__ = "1.0.0"
+
+__all__ = [
+    "queen_goal",
+    "queen_graph",
+    "RuntimeConfig",
+    "AgentMetadata",
+    "default_config",
+    "metadata",
+]
@@ -0,0 +1,40 @@
+"""Queen graph definition."""
+
+from framework.graph import Goal
+from framework.graph.edge import GraphSpec
+
+from .nodes import queen_node
+
+# ---------------------------------------------------------------------------
+# Queen graph — the primary persistent conversation.
+# Loaded by queen_orchestrator.create_queen(), NOT by AgentRunner.
+# ---------------------------------------------------------------------------
+
+queen_goal = Goal(
+    id="queen-manager",
+    name="Queen Manager",
+    description=(
+        "Manage the worker agent lifecycle and serve as the user's primary "
+        "interactive interface. Triage health escalations from the judge."
+    ),
+    success_criteria=[],
+    constraints=[],
+)
+
+queen_graph = GraphSpec(
+    id="queen-graph",
+    goal_id=queen_goal.id,
+    version="1.0.0",
+    entry_node="queen",
+    entry_points={"start": "queen"},
+    terminal_nodes=[],
+    pause_nodes=[],
+    nodes=[queen_node],
+    edges=[],
+    conversation_mode="continuous",
+    loop_config={
+        "max_iterations": 999_999,
+        "max_tool_calls_per_turn": 30,
+        "max_history_tokens": 32000,
+    },
+)
@@ -1,4 +1,4 @@
-"""Runtime configuration for Hive Coder agent."""
+"""Runtime configuration for Queen agent."""

 import json
 from dataclasses import dataclass, field
@@ -34,7 +34,7 @@ default_config = RuntimeConfig()

@dataclass
 class AgentMetadata:
-    name: str = "Hive Coder"
+    name: str = "Queen"
    version: str = "1.0.0"
    description: str = (
        "Native coding agent that builds production-ready Hive agent packages "
@@ -43,7 +43,7 @@ class AgentMetadata:
        "MCP configuration, and tests."
    )
    intro_message: str = (
-        "I'm Hive Coder — I build Hive agents. Describe what kind of agent "
+        "I'm Queen — I build Hive agents. Describe what kind of agent "
        "you want to create and I'll design, implement, and validate it for you."
    )

@@ -0,0 +1,80 @@
+"""Queen thinking hook — HR persona classifier.
+
+Fires once when the queen enters building mode at session start.
+Makes a single non-streaming LLM call (acting as an HR Director) to select
+the best-fit expert persona for the user's request, then returns a persona
+prefix string that replaces the queen's default "Solution Architect" identity.
+
+This is designed to activate the model's latent domain expertise — a CFO
+persona on a financial question, a Lawyer on a legal question, etc.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from framework.llm.provider import LLMProvider
+
+logger = logging.getLogger(__name__)
+
+# System prompt for the persona classifier call in select_expert_persona().
+# It requests a bare JSON object with "role" and "persona" keys, which are
+# the two fields that function reads from the parsed reply.
+_HR_SYSTEM_PROMPT = """\
+You are an expert HR Director and talent consultant at a world-class firm.
+A new request has arrived and you must identify which professional's expertise
+would produce the highest-quality response.
+
+Reply with ONLY a valid JSON object — no markdown, no prose, no explanation:
+{"role": "<job title>", "persona": "<2-3 sentence first-person identity statement>"}
+
+Rules:
+- Choose from any real professional role: CFO, CEO, CTO, Lawyer, Data Scientist,
+  Product Manager, Security Engineer, DevOps Engineer, Software Architect,
+  HR Director, Marketing Director, Business Analyst, UX Designer,
+  Financial Analyst, Operations Director, Legal Counsel, etc.
+- The persona statement must be written in first person ("I am..." or "I have...").
+- Select the role whose domain knowledge most directly applies to solving the request.
+- If the request is clearly about coding or building software systems, pick Software Architect.
+- "Queen" is your internal alias — do not include it in the persona.
+"""
+
+
+async def select_expert_persona(user_message: str, llm: LLMProvider) -> str:
+    """Ask the HR classifier for a persona and build the prefix string.
+
+    Issues one non-streaming ``acomplete()`` call on the session LLM with the
+    HR-director system prompt, parses the JSON reply, and formats it. Every
+    failure path yields ``""`` so the queen keeps its default
+    "Solution Architect" identity.
+
+    Args:
+        user_message: The user's opening message for the session.
+        llm: The session LLM provider.
+
+    Returns:
+        A persona prefix like "You are a CFO. I am a CFO with 20 years..."
+        or "" when the message is blank, the reply lacks a role or persona,
+        or anything raises.
+    """
+    if user_message.strip() == "":
+        return ""
+
+    try:
+        reply = await llm.acomplete(
+            messages=[{"role": "user", "content": user_message}],
+            system=_HR_SYSTEM_PROMPT,
+            max_tokens=1024,
+            json_mode=True,
+        )
+        raw = reply.content.strip()
+        fields = json.loads(raw)
+        role = fields.get("role", "").strip()
+        persona = fields.get("persona", "").strip()
+        if role and persona:
+            logger.info("Thinking hook: selected persona — %s", role)
+            return f"You are a {role}. {persona}"
+        logger.warning("Thinking hook: empty role/persona in response: %r", raw)
+        return ""
+    except Exception:
+        # Broad on purpose: any failure must degrade to the default identity.
+        logger.warning("Thinking hook: persona classification failed", exc_info=True)
+        return ""
@@ -0,0 +1,371 @@
+"""Queen global cross-session memory.
+
+Three-tier memory architecture:
+  ~/.hive/queen/MEMORY.md                            — semantic (who, what, why)
+  ~/.hive/queen/memories/MEMORY-YYYY-MM-DD.md        — episodic (daily journals)
+  ~/.hive/queen/session/{id}/data/adapt.md           — working (session-scoped)
+
+Semantic and episodic files are injected at queen session start.
+
+Semantic memory (MEMORY.md) is updated automatically at session end via
+consolidate_queen_memory() — the queen never rewrites this herself.
+
+Episodic memory (MEMORY-date.md) can be written by the queen during a session
+via the write_to_diary tool; consolidate_queen_memory() then rewrites the
+day's file as a single merged, deduplicated diary each time it runs.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import traceback
+from datetime import date, datetime
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def _queen_dir() -> Path:
+    """Return the queen's root data directory, ``~/.hive/queen``."""
+    return Path.home().joinpath(".hive", "queen")
+
+
+def semantic_memory_path() -> Path:
+    """Return the location of the semantic memory file (MEMORY.md)."""
+    return _queen_dir().joinpath("MEMORY.md")
+
+
+def episodic_memory_path(d: date | None = None) -> Path:
+    """Return the episodic diary path for day *d* (today when omitted)."""
+    day = date.today() if not d else d
+    return _queen_dir().joinpath("memories", f"MEMORY-{day:%Y-%m-%d}.md")
+
+
+def read_semantic_memory() -> str:
+    """Return the stripped text of MEMORY.md, or "" if the file is absent."""
+    memory_file = semantic_memory_path()
+    if not memory_file.exists():
+        return ""
+    return memory_file.read_text(encoding="utf-8").strip()
+
+
+def read_episodic_memory(d: date | None = None) -> str:
+    """Return the stripped diary text for day *d* (today when omitted).
+
+    Yields "" when no diary file exists for that day.
+    """
+    diary_file = episodic_memory_path(d)
+    if not diary_file.exists():
+        return ""
+    return diary_file.read_text(encoding="utf-8").strip()
+
+
+def format_for_injection() -> str:
+    """Format cross-session memory for system prompt injection.
+
+    Combines the semantic memory (MEMORY.md) with today's episodic diary,
+    separated by a horizontal rule and wrapped in begin/end marker lines.
+
+    Returns:
+        The wrapped memory block, or "" if no meaningful content exists yet
+        (e.g. first session with only the seed template and no diary).
+    """
+    semantic = read_semantic_memory()
+    episodic = read_episodic_memory()
+
+    # Suppress injection if semantic is still just the seed template
+    if semantic and semantic.startswith("# My Understanding of the User\n\n*No sessions"):
+        semantic = ""
+
+    parts: list[str] = []
+    if semantic:
+        parts.append(semantic)
+    if episodic:
+        # Build the date by hand: the "%-d" strftime directive is a glibc
+        # extension and is rejected on Windows.
+        today = date.today()
+        today_str = f"{today:%B} {today.day}, {today.year}"
+        parts.append(f"## Today — {today_str}\n\n{episodic}")
+
+    if not parts:
+        return ""
+
+    body = "\n\n---\n\n".join(parts)
+    return "--- Your Cross-Session Memory ---\n\n" + body + "\n\n--- End Cross-Session Memory ---"
+
+
+# Blank MEMORY.md skeleton written by seed_if_missing(). format_for_injection()
+# recognises its opening lines and skips injecting a still-unfilled template.
+_SEED_TEMPLATE = """\
+# My Understanding of the User
+
+*No sessions recorded yet.*
+
+## Who They Are
+
+## What They're Trying to Achieve
+
+## What's Working
+
+## What I've Learned
+"""
+
+
+def append_episodic_entry(content: str) -> None:
+    """Append a timestamped prose entry to today's episodic memory file.
+
+    Creates the file (with a date heading) if it doesn't exist yet.
+    Used both by the queen's diary tool and by the consolidation hook.
+
+    Args:
+        content: Entry text; surrounding whitespace is stripped before writing.
+    """
+    # Take one clock snapshot so the file name, the date heading and the
+    # time stamp cannot disagree when the call straddles midnight.
+    now = datetime.now()
+    ep_path = episodic_memory_path(now.date())
+    ep_path.parent.mkdir(parents=True, exist_ok=True)
+    # "%-d" is a glibc-only strftime directive (rejected on Windows), so the
+    # unpadded day number is formatted directly.
+    today_str = f"{now:%B} {now.day}, {now.year}"
+    timestamp = now.strftime("%H:%M")
+    if not ep_path.exists():
+        header = f"# {today_str}\n\n"
+        block = f"{header}### {timestamp}\n\n{content.strip()}\n"
+    else:
+        block = f"\n\n### {timestamp}\n\n{content.strip()}\n"
+    with ep_path.open("a", encoding="utf-8") as f:
+        f.write(block)
+
+
+def seed_if_missing() -> None:
+    """Write the blank seed template to MEMORY.md unless the file exists."""
+    memory_file = semantic_memory_path()
+    if not memory_file.exists():
+        memory_file.parent.mkdir(parents=True, exist_ok=True)
+        memory_file.write_text(_SEED_TEMPLATE, encoding="utf-8")
+
+
+# ---------------------------------------------------------------------------
+# Consolidation prompt
+# ---------------------------------------------------------------------------
+
+# System prompt for the MEMORY.md rewrite call in consolidate_queen_memory().
+# The model's reply is written to the semantic memory file verbatim (after
+# stripping), which is why the prompt demands raw markdown only.
+_SEMANTIC_SYSTEM = """\
+You maintain the persistent cross-session memory of an AI assistant called the Queen.
+Review the session notes and rewrite MEMORY.md — the Queen's durable understanding of the
+person she works with across all sessions.
+
+Write entirely in the Queen's voice — first person, reflective, honest.
+Not a log of events, but genuine understanding of who this person is over time.
+
+Rules:
+- Update and synthesise: incorporate new understanding, update facts that have changed, remove
+  details that are stale, superseded, or no longer say anything meaningful about the person.
+- Keep it as structured markdown with named sections about the PERSON, not about today.
+- Do NOT include diary sections, daily logs, or session summaries. Those belong elsewhere.
+  MEMORY.md is about who they are, what they want, what works — not what happened today.
+- Reference dates only when noting a lasting milestone (e.g. "since March 8th they prefer X").
+- If the session had no meaningful new information about the person,
+  return the existing text unchanged.
+- Do not add fictional details. Only reflect what is evidenced in the notes.
+- Stay concise. Prune rather than accumulate. A lean, accurate file is more useful than a
+  dense one. If something was true once but has been resolved or superseded, remove it.
+- Output only the raw markdown content of MEMORY.md. No preamble, no code fences.
+"""
+
+# System prompt for the diary rewrite call in consolidate_queen_memory().
+# The reply replaces today's episodic file; the caller adds the date heading
+# itself, which is why the prompt forbids headings in the output.
+_DIARY_SYSTEM = """\
+You maintain the daily episodic diary of an AI assistant called the Queen.
+You receive: (1) today's existing diary so far, and (2) notes from the latest session.
+
+Rewrite the complete diary for today as a single unified narrative —
+first person, reflective, honest.
+Merge and deduplicate: if the same story (e.g. a research agent stalling) recurred several times,
+describe it once with appropriate weight rather than retelling it. Weave in new developments from
+the session notes. Preserve important milestones, emotional texture, and session path references.
+
+If today's diary is empty, write the initial entry based on the session notes alone.
+
+Output only the full diary prose — no date heading, no timestamp headers,
+no preamble, no code fences.
+"""
+
+
+def read_session_context(session_dir: Path, max_messages: int = 80) -> str:
+    """Extract a readable transcript from conversation parts + adapt.md.
+
+    Reads the session's adapt.md (working memory) and the last
+    ``max_messages`` conversation part files (``*.json``, in sorted filename
+    order). Tool results are omitted — only user and assistant turns (with
+    tool-call names noted) are included, each truncated to 600 characters.
+    Part files that cannot be read or parsed are skipped.
+
+    Args:
+        session_dir: Session directory holding ``data/adapt.md`` and
+            ``conversations/parts/``.
+        max_messages: Maximum number of trailing part files to read. Zero or
+            a negative value includes no conversation parts.
+
+    Returns:
+        Markdown with a working-notes section and/or a conversation section,
+        or "" when neither has any content.
+    """
+    parts: list[str] = []
+
+    # Working notes
+    adapt_path = session_dir / "data" / "adapt.md"
+    if adapt_path.exists():
+        text = adapt_path.read_text(encoding="utf-8").strip()
+        if text:
+            parts.append(f"## Session Working Notes (adapt.md)\n\n{text}")
+
+    # Conversation transcript
+    parts_dir = session_dir / "conversations" / "parts"
+    if parts_dir.exists() and max_messages > 0:
+        # Guarded above: a slice of [-0:] would select every file, not none.
+        part_files = sorted(parts_dir.glob("*.json"))[-max_messages:]
+        lines: list[str] = []
+        for pf in part_files:
+            try:
+                data = json.loads(pf.read_text(encoding="utf-8"))
+                role = data.get("role", "")
+                # A JSON null content (usual for tool-call-only assistant
+                # turns) must count as empty, not become the text "None".
+                content = str(data.get("content") or "").strip()
+                tool_calls = data.get("tool_calls") or []
+                if role == "tool":
+                    continue  # skip verbose tool results
+                if role == "assistant" and tool_calls and not content:
+                    names = [tc.get("function", {}).get("name", "?") for tc in tool_calls]
+                    lines.append(f"[queen calls: {', '.join(names)}]")
+                elif content:
+                    label = "user" if role == "user" else "queen"
+                    lines.append(f"[{label}]: {content[:600]}")
+            except Exception:
+                # Best effort: one malformed part must not lose the transcript.
+                continue
+        if lines:
+            parts.append("## Conversation\n\n" + "\n".join(lines))
+
+    return "\n\n".join(parts)
+
+
+# ---------------------------------------------------------------------------
+# Context compaction (binary-split LLM summarisation)
+# ---------------------------------------------------------------------------
+
+# If the raw session context exceeds this many characters, compact it first
+# before sending to the consolidation LLM. ~200 k chars ≈ 50 k tokens.
+_CTX_COMPACT_CHAR_LIMIT = 200_000
+# Recursion cap for _compact_context(); at this depth the text is returned
+# as-is even if it is still over the limit.
+_CTX_COMPACT_MAX_DEPTH = 8
+
+# System prompt for the per-segment summarisation calls in _compact_context().
+_COMPACT_SYSTEM = (
+    "Summarise this conversation segment. Preserve: user goals, key decisions, "
+    "what was built or changed, emotional tone, and important outcomes. "
+    "Write concisely in third person past tense. Omit routine tool invocations "
+    "unless the result matters."
+)
+
+
+async def _compact_context(text: str, llm: object, *, _depth: int = 0) -> str:
+    """Shrink *text* by summarising its two halves with the LLM.
+
+    The text is cut near its midpoint (on a line boundary when one exists),
+    both halves are summarised concurrently, and the joined summaries are
+    compacted again if they are still over the character limit. Text that
+    already fits, or a call at the maximum depth, is returned unchanged.
+
+    Mirrors the recursive binary-splitting strategy used by the main agent
+    compaction pipeline (EventLoopNode._llm_compact).
+    """
+    fits = len(text) <= _CTX_COMPACT_CHAR_LIMIT
+    if fits or _depth >= _CTX_COMPACT_MAX_DEPTH:
+        return text
+
+    # Prefer to cut just after the last newline before the midpoint so a
+    # message is not split in two; fall back to the raw midpoint.
+    midpoint = len(text) // 2
+    newline_at = text.rfind("\n", 0, midpoint)
+    cut = midpoint if newline_at < 0 else newline_at + 1
+
+    async def _shrink(segment: str) -> str:
+        try:
+            reply = await llm.acomplete(
+                messages=[{"role": "user", "content": segment}],
+                system=_COMPACT_SYSTEM,
+                max_tokens=2048,
+            )
+            return reply.content.strip()
+        except Exception:
+            logger.warning(
+                "queen_memory: context compaction LLM call failed (depth=%d), truncating",
+                _depth,
+            )
+            # Fallback when the call fails: keep only the head of the segment.
+            return segment[: _CTX_COMPACT_CHAR_LIMIT // 4]
+
+    first, second = await asyncio.gather(_shrink(text[:cut]), _shrink(text[cut:]))
+    merged = first + "\n\n" + second
+    if len(merged) <= _CTX_COMPACT_CHAR_LIMIT:
+        return merged
+    return await _compact_context(merged, llm, _depth=_depth + 1)
+
+
+async def consolidate_queen_memory(
+    session_id: str,
+    session_dir: Path,
+    llm: object,
+) -> None:
+    """Update MEMORY.md and rewrite today's diary from the current session.
+
+    Reads conversation parts and adapt.md from session_dir, compacts the
+    transcript first if it is very large, then makes two concurrent LLM
+    calls: one producing the new MEMORY.md and one producing today's merged
+    diary. Each non-empty reply overwrites its file. Called periodically in
+    the background and once at session end. Failures are logged (and recorded
+    in ``consolidation_error.txt``) but never raised, so they never block
+    teardown.
+
+    Args:
+        session_id: The session ID (used for the adapt.md path reference).
+        session_dir: Path to the session directory (~/.hive/queen/session/{id}).
+        llm: LLMProvider instance (must support acomplete()).
+    """
+    try:
+        session_context = read_session_context(session_dir)
+        if not session_context:
+            logger.debug("queen_memory: no session context, skipping consolidation")
+            return
+
+        logger.info("queen_memory: consolidating memory for session %s ...", session_id)
+
+        # If the transcript is very large, compact it with recursive binary LLM
+        # summarisation before sending to the consolidation model.
+        if len(session_context) > _CTX_COMPACT_CHAR_LIMIT:
+            logger.info(
+                "queen_memory: session context is %d chars — compacting first",
+                len(session_context),
+            )
+            session_context = await _compact_context(session_context, llm)
+            logger.info("queen_memory: compacted to %d chars", len(session_context))
+
+        # Pin the day once: the diary that is read, the heading, and the file
+        # that is written must all refer to the same date even if the LLM
+        # calls below run past midnight.
+        today = date.today()
+        existing_semantic = read_semantic_memory()
+        today_journal = read_episodic_memory(today)
+        # "%-d" is a glibc-only strftime directive (rejected on Windows), so
+        # the unpadded day number is formatted directly.
+        today_str = f"{today:%B} {today.day}, {today.year}"
+        adapt_path = session_dir / "data" / "adapt.md"
+
+        user_msg = (
+            f"## Existing Semantic Memory (MEMORY.md)\n\n"
+            f"{existing_semantic or '(none yet)'}\n\n"
+            f"## Today's Diary So Far ({today_str})\n\n"
+            f"{today_journal or '(none yet)'}\n\n"
+            f"{session_context}\n\n"
+            f"## Session Reference\n\n"
+            f"Session ID: {session_id}\n"
+            f"Session path: {adapt_path}\n"
+        )
+
+        logger.debug(
+            "queen_memory: calling LLM (%d chars of context, ~%d tokens est.)",
+            len(user_msg),
+            len(user_msg) // 4,
+        )
+
+        from framework.agents.queen.config import default_config
+
+        semantic_resp, diary_resp = await asyncio.gather(
+            llm.acomplete(
+                messages=[{"role": "user", "content": user_msg}],
+                system=_SEMANTIC_SYSTEM,
+                max_tokens=default_config.max_tokens,
+            ),
+            llm.acomplete(
+                messages=[{"role": "user", "content": user_msg}],
+                system=_DIARY_SYSTEM,
+                max_tokens=default_config.max_tokens,
+            ),
+        )
+
+        new_semantic = semantic_resp.content.strip()
+        diary_entry = diary_resp.content.strip()
+
+        if new_semantic:
+            path = semantic_memory_path()
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.write_text(new_semantic, encoding="utf-8")
+            logger.info("queen_memory: semantic memory updated (%d chars)", len(new_semantic))
+
+        if diary_entry:
+            # Rewrite today's episodic file in-place — the LLM has merged and
+            # deduplicated the full day's content, so we replace rather than append.
+            ep_path = episodic_memory_path(today)
+            ep_path.parent.mkdir(parents=True, exist_ok=True)
+            heading = f"# {today_str}"
+            ep_path.write_text(f"{heading}\n\n{diary_entry}\n", encoding="utf-8")
+            logger.info(
+                "queen_memory: episodic diary rewritten for %s (%d chars)",
+                today_str,
+                len(diary_entry),
+            )
+
+    except Exception:
+        tb = traceback.format_exc()
+        logger.exception("queen_memory: consolidation failed")
+        # Write to file so the cause is findable regardless of log verbosity.
+        error_path = _queen_dir() / "consolidation_error.txt"
+        try:
+            error_path.parent.mkdir(parents=True, exist_ok=True)
+            error_path.write_text(
+                f"session: {session_id}\ntime: {datetime.now().isoformat()}\n\n{tb}",
+                encoding="utf-8",
+            )
+        except Exception:
+            # Still best effort (never raise during teardown), but leave a trace.
+            logger.debug(
+                "queen_memory: could not write %s", error_path, exc_info=True
+            )
@@ -0,0 +1,33 @@
+# Common Mistakes When Building Hive Agents
+
+## Critical Errors
+1. **Using tools that don't exist** — Always verify tools via `list_agent_tools()` before designing. Common hallucinations: `csv_read`, `csv_write`, `file_upload`, `database_query`, `bulk_fetch_emails`.
+2. **Wrong mcp_servers.json format** — Flat dict (no `"mcpServers"` wrapper). `cwd` must be `"../../tools"`. `command` must be `"uv"` with args `["run", "python", ...]`.
+3. **Missing module-level exports in `__init__.py`** — The runner reads `goal`, `nodes`, `edges`, `entry_node`, `entry_points`, `terminal_nodes`, `conversation_mode`, `identity_prompt`, `loop_config` via `getattr()`. ALL module-level variables from agent.py must be re-exported in `__init__.py`.
+
+## Value Errors
+4. **Fabricating tools** — Always verify via `list_agent_tools()` before designing and `validate_agent_package()` after building.
+
+## Design Errors
+5. **Adding framework gating for LLM behavior** — Don't add output rollback or premature rejection. Fix with better prompts or custom judges.
+6. **Calling set_output in same turn as tool calls** — Call set_output in a SEPARATE turn.
+
+## File Template Errors
+7. **Wrong import paths** — Use `from framework.graph import ...`, NOT `from core.framework.graph import ...`.
+8. **Missing storage path** — Agent class must set `self._storage_path = Path.home() / ".hive" / "agents" / "agent_name"`.
+9. **Missing mcp_servers.json** — Without this, the agent has no tools at runtime.
+10. **Bare `python` command** — Use `"command": "uv"` with args `["run", "python", ...]`.
+
+## Testing Errors
+11. **Using `runner.run()` on forever-alive agents** — `runner.run()` hangs forever because forever-alive agents have no terminal node. Write structural tests instead: validate graph structure, verify node specs, test `AgentRunner.load()` succeeds (no API key needed).
+12. **Stale tests after restructuring** — When changing nodes/edges, update tests to match. Tests referencing old node names will fail.
+13. **Running integration tests without API keys** — Use `pytest.skip()` when credentials are missing.
+14. **Forgetting sys.path setup in conftest.py** — Tests need `exports/` and `core/` on sys.path.
+
+## GCU Errors
+15. **Manually wiring browser tools on event_loop nodes** — Use `node_type="gcu"` which auto-includes browser tools. Do NOT manually list browser tool names.
+16. **Using GCU nodes as regular graph nodes** — GCU nodes are subagents only. They must ONLY appear in `sub_agents=["gcu-node-id"]` and be invoked via `delegate_to_sub_agent()`. Never connect via edges or use as entry/terminal nodes.
+
+## Worker Agent Errors
+17. **Adding client-facing intake node to workers** — The queen owns intake. Workers should start with an autonomous processing node. Client-facing nodes in workers are for mid-execution review/approval only.
+18. **Putting `escalate` or `set_output` in NodeSpec `tools=[]`** — These are synthetic framework tools, auto-injected at runtime. Only list MCP tools from `list_agent_tools()`.
@@ -84,35 +84,36 @@ Work in phases:
    tools=["web_search", "web_scrape", "save_data", "load_data", "list_data_files"],
 )

-# Node 3: Review (client-facing)
-review_node = NodeSpec(
-    id="review",
-    name="Review",
-    description="Present results for user approval",
+# Node 2: Handoff (autonomous)
+handoff_node = NodeSpec(
+    id="handoff",
+    name="Handoff",
+    description="Prepare worker results for queen review",
    node_type="event_loop",
-    client_facing=True,
+    client_facing=False,
    max_node_visits=0,
    input_keys=["results", "user_request"],
-    output_keys=["next_action", "feedback"],
-    nullable_output_keys=["feedback"],
-    success_criteria="User has reviewed and decided next steps.",
+    output_keys=["next_action", "feedback", "worker_summary"],
+    nullable_output_keys=["feedback", "worker_summary"],
+    success_criteria="Results are packaged for queen decision-making.",
    system_prompt="""\
-Present the results to the user.
+Do NOT talk to the user directly. The queen is the only user interface.

-**STEP 1 — Present (text only, NO tool calls):**
-1. Summary of work done
-2. Key results
-3. Ask: satisfied, or want changes?
+If blocked by tool failures, missing credentials, or unclear constraints, call:
+- escalate(reason, context)
+Then set:
+- set_output("next_action", "escalated")
+- set_output("feedback", "what help is needed")

-**STEP 2 — After user responds, call set_output:**
- set_output("next_action", "done")        — if satisfied
- set_output("next_action", "revise")      — if changes needed
- set_output("feedback", "what to change") — only if revising
+Otherwise summarize findings for queen and set:
+- set_output("worker_summary", "short summary for queen")
+- set_output("next_action", "done") or set_output("next_action", "revise")
+- set_output("feedback", "what to revise") only when revising
 """,
    tools=[],
 )

-__all__ = ["process_node", "review_node"]
+__all__ = ["process_node", "handoff_node"]
 ```

 ## agent.py
@@ -132,7 +133,7 @@ from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
 from framework.runtime.execution_stream import EntryPointSpec

 from .config import default_config, metadata
-from .nodes import process_node, review_node
+from .nodes import process_node, handoff_node

 # Goal definition
 goal = Goal(
@@ -149,18 +150,22 @@ goal = Goal(
 )

 # Node list
-nodes = [process_node, review_node]
+nodes = [process_node, handoff_node]

 # Edge definitions
 edges = [
-    EdgeSpec(id="process-to-review", source="process", target="review",
+    EdgeSpec(id="process-to-handoff", source="process", target="handoff",
             condition=EdgeCondition.ON_SUCCESS, priority=1),
    # Feedback loop — revise results
-    EdgeSpec(id="review-to-process", source="review", target="process",
+    EdgeSpec(id="handoff-to-process", source="handoff", target="process",
             condition=EdgeCondition.CONDITIONAL,
             condition_expr="str(next_action).lower() == 'revise'", priority=2),
-    # Loop back for next task (queen sends new input)
-    EdgeSpec(id="review-done", source="review", target="process",
+    # Escalation loop — queen injects guidance and worker retries
+    EdgeSpec(id="handoff-escalated", source="handoff", target="process",
+             condition=EdgeCondition.CONDITIONAL,
+             condition_expr="str(next_action).lower() == 'escalated'", priority=3),
+    # Loop back for next task after queen decision
+    EdgeSpec(id="handoff-done", source="handoff", target="process",
             condition=EdgeCondition.CONDITIONAL,
             condition_expr="str(next_action).lower() == 'done'", priority=1),
 ]
@@ -267,16 +272,60 @@ class MyAgent:
        }

    def validate(self):
+        """Validate graph wiring and entry-point contract."""
        errors, warnings = [], []
        node_ids = {n.id for n in self.nodes}
        for e in self.edges:
-            if e.source not in node_ids: errors.append(f"Edge {e.id}: source '{e.source}' not found")
-            if e.target not in node_ids: errors.append(f"Edge {e.id}: target '{e.target}' not found")
-        if self.entry_node not in node_ids: errors.append(f"Entry node '{self.entry_node}' not found")
+            if e.source not in node_ids:
+                errors.append(f"Edge {e.id}: source '{e.source}' not found")
+            if e.target not in node_ids:
+                errors.append(f"Edge {e.id}: target '{e.target}' not found")
+        if self.entry_node not in node_ids:
+            errors.append(f"Entry node '{self.entry_node}' not found")
        for t in self.terminal_nodes:
-            if t not in node_ids: errors.append(f"Terminal node '{t}' not found")
-        for ep_id, nid in self.entry_points.items():
-            if nid not in node_ids: errors.append(f"Entry point '{ep_id}' references unknown node '{nid}'")
+            if t not in node_ids:
+                errors.append(f"Terminal node '{t}' not found")
+
+        if not isinstance(self.entry_points, dict):
+            errors.append(
+                "Invalid entry_points: expected dict[str, str] like "
+                "{'start': '<entry-node-id>'}. "
+                f"Got {type(self.entry_points).__name__}. "
+                "Fix agent.py: set entry_points = {'start': '<entry-node-id>'}."
+            )
+        else:
+            if "start" not in self.entry_points:
+                errors.append(
+                    "entry_points must include 'start' mapped to entry_node. "
+                    "Example: {'start': '<entry-node-id>'}."
+                )
+            else:
+                start_node = self.entry_points.get("start")
+                if start_node != self.entry_node:
+                    errors.append(
+                        f"entry_points['start'] points to '{start_node}' "
+                        f"but entry_node is '{self.entry_node}'. Keep these aligned."
+                    )
+
+            for ep_id, nid in self.entry_points.items():
+                if not isinstance(ep_id, str):
+                    errors.append(
+                        f"Invalid entry_points key {ep_id!r} "
+                        f"({type(ep_id).__name__}). Entry point names must be strings."
+                    )
+                    continue
+                if not isinstance(nid, str):
+                    errors.append(
+                        f"Invalid entry_points['{ep_id}']={nid!r} "
+                        f"({type(nid).__name__}). Node ids must be strings."
+                    )
+                    continue
+                if nid not in node_ids:
+                    errors.append(
+                        f"Entry point '{ep_id}' references unknown node '{nid}'. "
+                        f"Known nodes: {sorted(node_ids)}"
+                    )
+
        return {"valid": len(errors) == 0, "errors": errors, "warnings": warnings}


@@ -510,6 +559,9 @@ if __name__ == "__main__":

 ## mcp_servers.json

+> **Auto-generated.** `initialize_and_build_agent` creates this file with hive-tools
+> as the default. Only edit manually to add additional MCP servers.
+
 ```json
 {
  "hive-tools": {
@@ -26,7 +26,7 @@ module-level variables via `getattr()`:
 | `edges` | YES | `None` | **FATAL** — same error |
 | `entry_node` | no | `nodes[0].id` | Probably wrong node |
 | `entry_points` | no | `{}` | **Nodes unreachable** — validation fails |
-| `terminal_nodes` | no | `[]` | OK for forever-alive |
+| `terminal_nodes` | **YES** | `[]` | **FATAL** — graph must have at least one terminal node |
 | `pause_nodes` | no | `[]` | OK |
 | `conversation_mode` | no | not passed | Isolated mode (no context carryover) |
 | `identity_prompt` | no | not passed | No agent-level identity |
@@ -108,7 +108,7 @@ This prevents premature set_output before user interaction.

 ### Fewer, Richer Nodes (CRITICAL)

-**Hard limit: 2-4 nodes for most agents.** Never exceed 5 unless the user
+**Hard limit: 3-6 nodes for most agents.** Never exceed 6 unless the user
 explicitly requests a complex multi-phase pipeline.

 Each node boundary serializes outputs to shared memory and **destroys** all
@@ -165,8 +165,9 @@ review_node = NodeSpec(
 )
 ```

-### Forever-Alive Pattern
-`terminal_nodes=[]` — every node has outgoing edges, graph loops until user exits.
+### Continuous Loop Pattern
+Mark the primary event_loop node as terminal: `terminal_nodes=["process"]`.
+The node has `output_keys` and can complete when the agent finishes its work.
 Use `conversation_mode="continuous"` to preserve context across transitions.

 ### set_output
@@ -192,16 +193,16 @@ condition_expr examples:

 | Pattern | terminal_nodes | When |
 |---------|---------------|------|
-| **Forever-alive** | `[]` | **DEFAULT for all agents** |
-| Linear | `["last-node"]` | Only if user explicitly requests one-shot/batch |
+| **Continuous loop** | `["node-with-output-keys"]` | **DEFAULT for all agents** |
+| Linear | `["last-node"]` | One-shot/batch agents |

-**Forever-alive is the default.** Always use `terminal_nodes=[]`.
-The framework default for `max_node_visits` is 0 (unbounded), so
-nodes work correctly in forever-alive loops without explicit override.
-Only set `max_node_visits > 0` in one-shot agents with feedback loops.
-Every node must have at least one outgoing edge — no dead ends. The
-user exits by closing the TUI. Only use terminal nodes if the user
-explicitly asks for a batch/one-shot agent that runs once and exits.
+**Every graph must have at least one terminal node.** Terminal nodes
+define where execution ends. For interactive agents that loop continuously,
+mark the primary event_loop node as terminal (it has `output_keys` and can
+complete at any point). The framework default for `max_node_visits` is 0
+(unbounded), so nodes work correctly in continuous loops without explicit
+override. Only set `max_node_visits > 0` in one-shot agents with feedback loops.
+Every node must have at least one outgoing edge — no dead ends.

 ## Continuous Conversation Mode

@@ -258,177 +259,57 @@ Judge is the SOLE acceptance mechanism — no ad-hoc framework gating.

 ## Async Entry Points (Webhooks, Timers, Events)

-For agents that need to react to external events (incoming emails, scheduled
-tasks, API calls), use `AsyncEntryPointSpec` and optionally `AgentRuntimeConfig`.
-
-### Imports
-```python
-from framework.graph.edge import GraphSpec, AsyncEntryPointSpec
-from framework.runtime.agent_runtime import AgentRuntime, AgentRuntimeConfig, create_agent_runtime
-```
-Note: `AsyncEntryPointSpec` is in `framework.graph.edge` (the graph/declarative layer).
-`AgentRuntimeConfig` is in `framework.runtime.agent_runtime` (the runtime layer).
-
-### AsyncEntryPointSpec Fields
-
-| Field | Type | Default | Description |
-|-------|------|---------|-------------|
-| id | str | required | Unique identifier |
-| name | str | required | Human-readable name |
-| entry_node | str | required | Node ID to start execution from |
-| trigger_type | str | `"manual"` | `webhook`, `api`, `timer`, `event`, `manual` |
-| trigger_config | dict | `{}` | Trigger-specific config (see below) |
-| isolation_level | str | `"shared"` | `isolated`, `shared`, `synchronized` |
-| priority | int | `0` | Execution priority (higher = more priority) |
-| max_concurrent | int | `10` | Max concurrent executions |
-
-### Trigger Types
-
-**timer** — Fires on a schedule. Two modes: cron expressions or fixed interval.
-
-Cron (preferred for precise scheduling):
-```python
-AsyncEntryPointSpec(
-    id="daily-digest",
-    name="Daily Digest",
-    entry_node="check-node",
-    trigger_type="timer",
-    trigger_config={"cron": "0 9 * * *"},  # daily at 9am
-    isolation_level="shared",
-    max_concurrent=1,
-)
-```
- `cron` (str) — standard cron expression (5 fields: min hour dom month dow)
- Examples: `"0 9 * * *"` (daily 9am), `"0 9 * * MON-FRI"` (weekdays 9am), `"*/30 * * * *"` (every 30 min)
-
-Fixed interval (simpler, for polling-style tasks):
-```python
-AsyncEntryPointSpec(
-    id="scheduled-check",
-    name="Scheduled Check",
-    entry_node="check-node",
-    trigger_type="timer",
-    trigger_config={"interval_minutes": 20, "run_immediately": False},
-    isolation_level="shared",
-    max_concurrent=1,
-)
-```
- `interval_minutes` (float) — how often to fire
- `run_immediately` (bool, default False) — fire once on startup
-
-**event** — Subscribes to EventBus (e.g., webhook events):
-```python
-AsyncEntryPointSpec(
-    id="email-event",
-    name="Email Event Handler",
-    entry_node="process-emails",
-    trigger_type="event",
-    trigger_config={"event_types": ["webhook_received"]},
-    isolation_level="shared",
-    max_concurrent=10,
-)
-```
- `event_types` (list[str]) — EventType values to subscribe to
- `filter_stream` (str, optional) — only receive from this stream
- `filter_node` (str, optional) — only receive from this node
-
-**webhook** — HTTP endpoint (requires AgentRuntimeConfig):
-The webhook server publishes `WEBHOOK_RECEIVED` events on the EventBus.
-An `event` trigger type with `event_types: ["webhook_received"]` subscribes
-to those events. The flow is:
-```
-HTTP POST /webhooks/gmail → WebhookServer → EventBus (WEBHOOK_RECEIVED)
-  → event entry point → triggers graph execution from entry_node
-```
-
-**manual** — Triggered programmatically via `runtime.trigger()`.
-
-### Isolation Levels
-
-| Level | Meaning |
-|-------|---------|
-| `isolated` | Private state per execution |
-| `shared` | Eventual consistency — async executions can read primary session memory |
-| `synchronized` | Shared with write locks (use when ordering matters) |
-
-For most async patterns, use `shared` — the async execution reads the primary
-session's memory (e.g., user-configured rules) and runs its own workflow.
-
-### AgentRuntimeConfig (for webhook servers)
+For agents that react to external events, use `AsyncEntryPointSpec`:

 ```python
+from framework.graph.edge import AsyncEntryPointSpec
 from framework.runtime.agent_runtime import AgentRuntimeConfig

+# Timer trigger (cron or interval)
+async_entry_points = [
+    AsyncEntryPointSpec(
+        id="daily-check",
+        name="Daily Check",
+        entry_node="process",
+        trigger_type="timer",
+        trigger_config={"cron": "0 9 * * *"},  # daily at 9am
+        isolation_level="shared",
+    )
+]
+
+# Webhook server (optional)
 runtime_config = AgentRuntimeConfig(
    webhook_host="127.0.0.1",
    webhook_port=8080,
-    webhook_routes=[
-        {
-            "source_id": "gmail",
-            "path": "/webhooks/gmail",
-            "methods": ["POST"],
-            "secret": None,  # Optional HMAC-SHA256 secret
-        },
-    ],
-)
-```
-`runtime_config` is a module-level variable read by `AgentRunner.load()`.
-The runner passes it to `create_agent_runtime()`. On `runtime.start()`,
-if webhook_routes is non-empty, an embedded HTTP server starts.
-
-### Session Sharing
-
-Timer and event triggers automatically call `_get_primary_session_state()`
-before execution. This finds the active user-facing session and provides
-its memory to the async execution, filtered to only the async entry node's
-`input_keys`. This means the async flow can read user-configured values
-(like rules, preferences) without needing separate configuration.
-
-### Module-Level Variables
-
-Agents with async entry points must export two additional variables:
-```python
-# In agent.py:
-async_entry_points = [AsyncEntryPointSpec(...), ...]
-runtime_config = AgentRuntimeConfig(...)  # Only if using webhooks
-```
-
-Both must be re-exported from `__init__.py`:
-```python
-from .agent import (
-    ..., async_entry_points, runtime_config,
+    webhook_routes=[{"source_id": "gmail", "path": "/webhooks/gmail", "methods": ["POST"]}],
 )
 ```

-### Reference Agent
+### Key Fields
+- `trigger_type`: `"timer"`, `"event"`, `"webhook"`, `"manual"`
+- `trigger_config`: `{"cron": "0 9 * * *"}` or `{"interval_minutes": 20}`
+- `isolation_level`: `"shared"` (recommended), `"isolated"`, `"synchronized"`
+- `event_types`: Key inside `trigger_config` for event triggers, e.g., `{"event_types": ["webhook_received"]}`

-See `exports/gmail_inbox_guardian/agent.py` for a complete example with:
- Primary client-facing node (user configures rules)
- Timer-based scheduled inbox checks (every 20 min)
- Webhook-triggered email event handling
- Shared isolation for memory access across streams
+### Exports Required
+`async_entry_points` and `runtime_config` (the latter only when using webhooks) must be exported from `__init__.py`.

-## Framework Capabilities
-
-**Works well:** Multi-turn conversations, HITL review, tool orchestration, structured outputs, parallel execution, context management, error recovery, session persistence.
-
-**Limitations:** LLM latency (2-10s/turn), context window limits (~128K), cost per run, rate limits, node boundaries lose context.
-
-**Not designed for:** Sub-second responses, millions of items, real-time streaming, guaranteed determinism, offline/air-gapped.
+See `exports/gmail_inbox_guardian/agent.py` for a complete example.

 ## Tool Discovery

-Do NOT rely on a static tool list — it will be outdated. Always use
-`list_agent_tools()` to discover available tools, grouped by category.
+Do NOT rely on a static tool list — it will be outdated. Always call
+`list_agent_tools()` with NO arguments first to see ALL available tools.
+Only use `group=` or `output_schema=` as follow-up calls after seeing the
+full list.

 ```
-list_agent_tools()                            # names + descriptions, all groups
-list_agent_tools(output_schema="full")        # include input_schema
-list_agent_tools(group="gmail")               # only gmail_* tools
+list_agent_tools()                            # ALWAYS call this first
+list_agent_tools(group="gmail", output_schema="full")  # then drill into a category
 list_agent_tools("exports/my_agent/mcp_servers.json")  # specific agent's tools
 ```

-After building, validate tools exist: `validate_agent_tools("exports/{name}")`
+After building, run `validate_agent_package("{name}")` to check everything at once.

 Common tool categories (verify via list_agent_tools):
 - **Web**: search, scrape, PDF
@@ -0,0 +1,63 @@
+# Queen Memory — File System Structure
+
+```
+~/.hive/
+├── queen/
+│   ├── MEMORY.md                          ← Semantic memory
+│   ├── memories/
+│   │   ├── MEMORY-2026-03-09.md           ← Episodic memory (today)
+│   │   ├── MEMORY-2026-03-08.md
+│   │   └── ...
+│   └── session/
+│       └── {session_id}/                  ← One dir per session (or resumed-from session)
+│           ├── conversations/
+│           │   ├── parts/
+│           │   │   ├── 00001.json         ← One file per message (role, content, tool_calls)
+│           │   │   ├── 00002.json
+│           │   │   └── ...
+│           │   └── spillover/
+│           │       ├── conversation_1.md  ← Compacted old conversation segments
+│           │       ├── conversation_2.md
+│           │       └── ...
+│           └── data/
+│               ├── adapt.md              ← Working memory (session-scoped)
+│               ├── web_search_1.txt      ← Spillover: large tool results
+│               ├── web_search_2.txt
+│               └── ...
+```
+
+---
+
+## The three memory tiers
+
+| File | Tier | Written by | Read at |
+|---|---|---|---|
+| `MEMORY.md` | Semantic | Consolidation LLM (auto, post-session) | Session start (injected into system prompt) |
+| `memories/MEMORY-YYYY-MM-DD.md` | Episodic | Queen via `write_to_diary` tool + consolidation LLM | Session start (today's file injected) |
+| `data/adapt.md` | Working | Queen via `update_session_notes` tool | Every turn (inlined in system prompt) |
+
+---
+
+## Session directory naming
+
+The session directory is named after the **`queen_resume_from`** value when a cold restore resumes
+an existing session; otherwise it uses the new **`session_id`**. This means resumed sessions
+accumulate all messages in the original directory rather than fragmenting across multiple folders.
+
+---
+
+## Consolidation
+
+`consolidate_queen_memory()` runs every **5 minutes** in the background and once more at session
+end. It reads:
+
+1. `conversations/parts/*.json` — full message history (user + assistant turns; tool results skipped)
+2. `data/adapt.md` — current working notes
+
+It then makes two LLM writes:
+
+- Rewrites `MEMORY.md` in place (semantic memory — queen never touches this herself)
+- Appends a timestamped prose entry to today's `memories/MEMORY-YYYY-MM-DD.md`
+
+If the combined transcript exceeds ~200K characters, it is recursively binary-compacted via the
+LLM before being sent to the consolidation model (mirrors `EventLoopNode._llm_compact`).
@@ -1,4 +1,4 @@
-"""Test fixtures for Hive Coder agent."""
+"""Test fixtures for Queen agent."""

 import sys
 from pathlib import Path
@@ -1,21 +0,0 @@
-"""Builder interface for analyzing and building agents."""
-
-from framework.builder.query import BuilderQuery
-from framework.builder.workflow import (
-    BuildPhase,
-    BuildSession,
-    GraphBuilder,
-    TestCase,
-    TestResult,
-    ValidationResult,
-)
-
-__all__ = [
-    "BuilderQuery",
-    "GraphBuilder",
-    "BuildSession",
-    "BuildPhase",
-    "ValidationResult",
-    "TestCase",
-    "TestResult",
-]
@@ -1,501 +0,0 @@
-"""
-Builder Query Interface - How I (Builder) analyze agent runs.
-
-This is designed around the questions I need to answer:
-1. What happened? (summaries, narratives)
-2. Why did it fail? (failure analysis, decision traces)
-3. What patterns emerge? (across runs, across nodes)
-4. What should we change? (suggestions)
-"""
-
-from collections import defaultdict
-from pathlib import Path
-from typing import Any
-
-from framework.schemas.decision import Decision
-from framework.schemas.run import Run, RunStatus, RunSummary
-from framework.storage.backend import FileStorage
-
-
-class FailureAnalysis:
-    """Structured analysis of why a run failed."""
-
-    def __init__(
-        self,
-        run_id: str,
-        failure_point: str,
-        root_cause: str,
-        decision_chain: list[str],
-        problems: list[str],
-        suggestions: list[str],
-    ):
-        self.run_id = run_id
-        self.failure_point = failure_point
-        self.root_cause = root_cause
-        self.decision_chain = decision_chain
-        self.problems = problems
-        self.suggestions = suggestions
-
-    def to_dict(self) -> dict[str, Any]:
-        return {
-            "run_id": self.run_id,
-            "failure_point": self.failure_point,
-            "root_cause": self.root_cause,
-            "decision_chain": self.decision_chain,
-            "problems": self.problems,
-            "suggestions": self.suggestions,
-        }
-
-    def __str__(self) -> str:
-        lines = [
-            f"=== Failure Analysis for {self.run_id} ===",
-            "",
-            f"Failure Point: {self.failure_point}",
-            f"Root Cause: {self.root_cause}",
-            "",
-            "Decision Chain Leading to Failure:",
-        ]
-        for i, dec in enumerate(self.decision_chain, 1):
-            lines.append(f"  {i}. {dec}")
-
-        if self.problems:
-            lines.append("")
-            lines.append("Reported Problems:")
-            for prob in self.problems:
-                lines.append(f"  - {prob}")
-
-        if self.suggestions:
-            lines.append("")
-            lines.append("Suggestions:")
-            for sug in self.suggestions:
-                lines.append(f"  → {sug}")
-
-        return "\n".join(lines)
-
-
-class PatternAnalysis:
-    """Patterns detected across multiple runs."""
-
-    def __init__(
-        self,
-        goal_id: str,
-        run_count: int,
-        success_rate: float,
-        common_failures: list[tuple[str, int]],
-        problematic_nodes: list[tuple[str, float]],
-        decision_patterns: dict[str, Any],
-    ):
-        self.goal_id = goal_id
-        self.run_count = run_count
-        self.success_rate = success_rate
-        self.common_failures = common_failures
-        self.problematic_nodes = problematic_nodes
-        self.decision_patterns = decision_patterns
-
-    def to_dict(self) -> dict[str, Any]:
-        return {
-            "goal_id": self.goal_id,
-            "run_count": self.run_count,
-            "success_rate": self.success_rate,
-            "common_failures": self.common_failures,
-            "problematic_nodes": self.problematic_nodes,
-            "decision_patterns": self.decision_patterns,
-        }
-
-    def __str__(self) -> str:
-        lines = [
-            f"=== Pattern Analysis for Goal {self.goal_id} ===",
-            "",
-            f"Runs Analyzed: {self.run_count}",
-            f"Success Rate: {self.success_rate:.1%}",
-        ]
-
-        if self.common_failures:
-            lines.append("")
-            lines.append("Common Failures:")
-            for failure, count in self.common_failures:
-                lines.append(f"  - {failure} ({count} occurrences)")
-
-        if self.problematic_nodes:
-            lines.append("")
-            lines.append("Problematic Nodes (failure rate):")
-            for node, rate in self.problematic_nodes:
-                lines.append(f"  - {node}: {rate:.1%} failure rate")
-
-        return "\n".join(lines)
-
-
-class BuilderQuery:
-    """
-    The interface I (Builder) use to understand what agents are doing.
-
-    This is optimized for the questions I need to answer when analyzing
-    agent behavior and deciding what to improve.
-    """
-
-    def __init__(self, storage_path: str | Path):
-        self.storage = FileStorage(storage_path)
-
-    # === WHAT HAPPENED? ===
-
-    def get_run_summary(self, run_id: str) -> RunSummary | None:
-        """Get a quick summary of a run."""
-        return self.storage.load_summary(run_id)
-
-    def get_full_run(self, run_id: str) -> Run | None:
-        """Get the complete run with all decisions."""
-        return self.storage.load_run(run_id)
-
-    def list_runs_for_goal(self, goal_id: str) -> list[RunSummary]:
-        """Get summaries of all runs for a goal."""
-        run_ids = self.storage.get_runs_by_goal(goal_id)
-        summaries = []
-        for run_id in run_ids:
-            summary = self.storage.load_summary(run_id)
-            if summary:
-                summaries.append(summary)
-        return summaries
-
-    def get_recent_failures(self, limit: int = 10) -> list[RunSummary]:
-        """Get recent failed runs."""
-        run_ids = self.storage.get_runs_by_status(RunStatus.FAILED)
-        summaries = []
-        for run_id in run_ids[:limit]:
-            summary = self.storage.load_summary(run_id)
-            if summary:
-                summaries.append(summary)
-        return summaries
-
-    # === WHY DID IT FAIL? ===
-
-    def analyze_failure(self, run_id: str) -> FailureAnalysis | None:
-        """
-        Deep analysis of why a run failed.
-
-        This is my primary tool for understanding what went wrong.
-        """
-        run = self.storage.load_run(run_id)
-        if run is None or run.status != RunStatus.FAILED:
-            return None
-
-        # Find the first failed decision
-        failed_decisions = [d for d in run.decisions if not d.was_successful]
-        if not failed_decisions:
-            failure_point = "Unknown - no decision marked as failed"
-            root_cause = "Run failed but all decisions succeeded (external cause?)"
-        else:
-            first_failure = failed_decisions[0]
-            failure_point = first_failure.summary_for_builder()
-            root_cause = first_failure.outcome.error if first_failure.outcome else "Unknown"
-
-        # Build the decision chain leading to failure
-        decision_chain = []
-        for d in run.decisions:
-            decision_chain.append(d.summary_for_builder())
-            if not d.was_successful:
-                break
-
-        # Extract problems
-        problems = [f"[{p.severity}] {p.description}" for p in run.problems]
-
-        # Generate suggestions based on the failure
-        suggestions = self._generate_suggestions(run, failed_decisions)
-
-        return FailureAnalysis(
-            run_id=run_id,
-            failure_point=failure_point,
-            root_cause=root_cause,
-            decision_chain=decision_chain,
-            problems=problems,
-            suggestions=suggestions,
-        )
-
-    def get_decision_trace(self, run_id: str) -> list[str]:
-        """Get a readable trace of all decisions in a run."""
-        run = self.storage.load_run(run_id)
-        if run is None:
-            return []
-        return [d.summary_for_builder() for d in run.decisions]
-
-    # === WHAT PATTERNS EMERGE? ===
-
-    def find_patterns(self, goal_id: str) -> PatternAnalysis | None:
-        """
-        Find patterns across runs for a goal.
-
-        This helps me understand systemic issues vs one-off failures.
-        """
-        run_ids = self.storage.get_runs_by_goal(goal_id)
-        if not run_ids:
-            return None
-
-        runs = []
-        for run_id in run_ids:
-            run = self.storage.load_run(run_id)
-            if run:
-                runs.append(run)
-
-        if not runs:
-            return None
-
-        # Calculate success rate
-        completed = [r for r in runs if r.status == RunStatus.COMPLETED]
-        success_rate = len(completed) / len(runs) if runs else 0.0
-
-        # Find common failures
-        failure_counts: dict[str, int] = defaultdict(int)
-        for run in runs:
-            for decision in run.decisions:
-                if not decision.was_successful and decision.outcome:
-                    error = decision.outcome.error or "Unknown error"
-                    failure_counts[error] += 1
-
-        common_failures = sorted(failure_counts.items(), key=lambda x: x[1], reverse=True)[:5]
-
-        # Find problematic nodes
-        node_stats: dict[str, dict[str, int]] = defaultdict(lambda: {"total": 0, "failed": 0})
-        for run in runs:
-            for decision in run.decisions:
-                node_stats[decision.node_id]["total"] += 1
-                if not decision.was_successful:
-                    node_stats[decision.node_id]["failed"] += 1
-
-        problematic_nodes = []
-        for node_id, stats in node_stats.items():
-            if stats["total"] > 0:
-                failure_rate = stats["failed"] / stats["total"]
-                if failure_rate > 0.1:  # More than 10% failure rate
-                    problematic_nodes.append((node_id, failure_rate))
-
-        problematic_nodes.sort(key=lambda x: x[1], reverse=True)
-
-        # Decision patterns
-        decision_patterns = self._analyze_decision_patterns(runs)
-
-        return PatternAnalysis(
-            goal_id=goal_id,
-            run_count=len(runs),
-            success_rate=success_rate,
-            common_failures=common_failures,
-            problematic_nodes=problematic_nodes,
-            decision_patterns=decision_patterns,
-        )
-
-    def compare_runs(self, run_id_1: str, run_id_2: str) -> dict[str, Any]:
-        """Compare two runs to understand what differed."""
-        run1 = self.storage.load_run(run_id_1)
-        run2 = self.storage.load_run(run_id_2)
-
-        if run1 is None or run2 is None:
-            return {"error": "One or both runs not found"}
-
-        return {
-            "run_1": {
-                "id": run1.id,
-                "status": run1.status.value,
-                "decisions": len(run1.decisions),
-                "success_rate": run1.metrics.success_rate,
-            },
-            "run_2": {
-                "id": run2.id,
-                "status": run2.status.value,
-                "decisions": len(run2.decisions),
-                "success_rate": run2.metrics.success_rate,
-            },
-            "differences": self._find_differences(run1, run2),
-        }
-
-    # === WHAT SHOULD WE CHANGE? ===
-
-    def suggest_improvements(self, goal_id: str) -> list[dict[str, Any]]:
-        """
-        Generate improvement suggestions based on run analysis.
-
-        This is what I use to propose changes to the human engineer.
-        """
-        patterns = self.find_patterns(goal_id)
-        if patterns is None:
-            return []
-
-        suggestions = []
-
-        # Suggestion: Fix problematic nodes
-        for node_id, failure_rate in patterns.problematic_nodes:
-            suggestions.append(
-                {
-                    "type": "node_improvement",
-                    "target": node_id,
-                    "reason": f"Node has {failure_rate:.1%} failure rate",
-                    "recommendation": (
-                        f"Review and improve node '{node_id}' - "
-                        "high failure rate suggests prompt or tool issues"
-                    ),
-                    "priority": "high" if failure_rate > 0.3 else "medium",
-                }
-            )
-
-        # Suggestion: Address common failures
-        for failure, count in patterns.common_failures:
-            if count >= 2:
-                suggestions.append(
-                    {
-                        "type": "error_handling",
-                        "target": failure,
-                        "reason": f"Error occurred {count} times",
-                        "recommendation": f"Add handling for: {failure}",
-                        "priority": "high" if count >= 5 else "medium",
-                    }
-                )
-
-        # Suggestion: Overall success rate
-        if patterns.success_rate < 0.8:
-            suggestions.append(
-                {
-                    "type": "architecture",
-                    "target": goal_id,
-                    "reason": f"Goal success rate is only {patterns.success_rate:.1%}",
-                    "recommendation": (
-                        "Consider restructuring the agent graph or improving goal definition"
-                    ),
-                    "priority": "high",
-                }
-            )
-
-        return suggestions
-
-    def get_node_performance(self, node_id: str) -> dict[str, Any]:
-        """Get performance metrics for a specific node across all runs."""
-        run_ids = self.storage.get_runs_by_node(node_id)
-
-        total_decisions = 0
-        successful_decisions = 0
-        total_latency = 0
-        total_tokens = 0
-        decision_types: dict[str, int] = defaultdict(int)
-
-        for run_id in run_ids:
-            run = self.storage.load_run(run_id)
-            if run:
-                for decision in run.decisions:
-                    if decision.node_id == node_id:
-                        total_decisions += 1
-                        if decision.was_successful:
-                            successful_decisions += 1
-                        if decision.outcome:
-                            total_latency += decision.outcome.latency_ms
-                            total_tokens += decision.outcome.tokens_used
-                        decision_types[decision.decision_type.value] += 1
-
-        return {
-            "node_id": node_id,
-            "total_decisions": total_decisions,
-            "success_rate": successful_decisions / total_decisions if total_decisions > 0 else 0,
-            "avg_latency_ms": total_latency / total_decisions if total_decisions > 0 else 0,
-            "total_tokens": total_tokens,
-            "decision_type_distribution": dict(decision_types),
-        }
-
-    # === PRIVATE HELPERS ===
-
-    def _generate_suggestions(
-        self,
-        run: Run,
-        failed_decisions: list[Decision],
-    ) -> list[str]:
-        """Generate suggestions based on failure analysis."""
-        suggestions = []
-
-        for decision in failed_decisions:
-            # Check if there were alternatives
-            if len(decision.options) > 1:
-                chosen = decision.chosen_option
-                alternatives = [o for o in decision.options if o.id != decision.chosen_option_id]
-                if alternatives:
-                    alt_desc = alternatives[0].description
-                    chosen_desc = chosen.description if chosen else "unknown"
-                    suggestions.append(
-                        f"Consider alternative: '{alt_desc}' instead of '{chosen_desc}'"
-                    )
-
-            # Check for missing context
-            if not decision.input_context:
-                suggestions.append(
-                    f"Decision '{decision.intent}' had no input context - "
-                    "ensure relevant data is passed"
-                )
-
-            # Check for constraint issues
-            if decision.active_constraints:
-                constraints = ", ".join(decision.active_constraints)
-                suggestions.append(f"Review constraints: {constraints} - may be too restrictive")
-
-        # Check for reported problems with suggestions
-        for problem in run.problems:
-            if problem.suggested_fix:
-                suggestions.append(problem.suggested_fix)
-
-        return suggestions
-
-    def _analyze_decision_patterns(self, runs: list[Run]) -> dict[str, Any]:
-        """Analyze decision patterns across runs."""
-        type_counts: dict[str, int] = defaultdict(int)
-        option_counts: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
-
-        for run in runs:
-            for decision in run.decisions:
-                type_counts[decision.decision_type.value] += 1
-
-                # Track which options are chosen for similar intents
-                intent_key = decision.intent[:50]  # Truncate for grouping
-                if decision.chosen_option:
-                    option_counts[intent_key][decision.chosen_option.description] += 1
-
-        # Find most common choices per intent
-        common_choices = {}
-        for intent, choices in option_counts.items():
-            if choices:
-                most_common = max(choices.items(), key=lambda x: x[1])
-                common_choices[intent] = {
-                    "choice": most_common[0],
-                    "count": most_common[1],
-                    "alternatives": len(choices) - 1,
-                }
-
-        return {
-            "decision_type_distribution": dict(type_counts),
-            "common_choices": common_choices,
-        }
-
-    def _find_differences(self, run1: Run, run2: Run) -> list[str]:
-        """Find key differences between two runs."""
-        differences = []
-
-        # Status difference
-        if run1.status != run2.status:
-            differences.append(f"Status: {run1.status.value} vs {run2.status.value}")
-
-        # Decision count difference
-        if len(run1.decisions) != len(run2.decisions):
-            differences.append(f"Decision count: {len(run1.decisions)} vs {len(run2.decisions)}")
-
-        # Find first divergence point
-        for i, (d1, d2) in enumerate(zip(run1.decisions, run2.decisions, strict=False)):
-            if d1.chosen_option_id != d2.chosen_option_id:
-                differences.append(
-                    f"Diverged at decision {i}: "
-                    f"chose '{d1.chosen_option_id}' vs '{d2.chosen_option_id}'"
-                )
-                break
-
-        # Node differences
-        nodes1 = set(run1.metrics.nodes_executed)
-        nodes2 = set(run2.metrics.nodes_executed)
-        if nodes1 != nodes2:
-            only_1 = nodes1 - nodes2
-            only_2 = nodes2 - nodes1
-            if only_1:
-                differences.append(f"Nodes only in run 1: {only_1}")
-            if only_2:
-                differences.append(f"Nodes only in run 2: {only_2}")
-
-        return differences
@@ -1,832 +0,0 @@
-"""
-GraphBuilder Workflow - Enforced incremental building with HITL approval.
-
-The build process:
-1. Define Goal → APPROVE
-2. Add Node → VALIDATE → TEST → APPROVE
-3. Add Edge → VALIDATE → TEST → APPROVE
-4. Repeat until graph is complete
-5. Final integration test → APPROVE
-6. Export
-
-Each step requires validation and human approval before proceeding.
-You cannot skip steps or bypass validation.
-"""
-
-from collections.abc import Callable
-from datetime import datetime
-from enum import StrEnum
-from pathlib import Path
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
-from framework.graph.goal import Goal
-from framework.graph.node import NodeSpec
-
-
-class BuildPhase(StrEnum):
-    """Current phase of the build process."""
-
-    INIT = "init"  # Just started
-    GOAL_DRAFT = "goal_draft"  # Drafting goal
-    GOAL_APPROVED = "goal_approved"  # Goal approved
-    ADDING_NODES = "adding_nodes"  # Adding nodes
-    ADDING_EDGES = "adding_edges"  # Adding edges
-    TESTING = "testing"  # Running tests
-    APPROVED = "approved"  # Fully approved
-    EXPORTED = "exported"  # Exported to file
-
-
-class ValidationResult(BaseModel):
-    """Result of a validation check."""
-
-    valid: bool
-    errors: list[str] = Field(default_factory=list)
-    warnings: list[str] = Field(default_factory=list)
-    suggestions: list[str] = Field(default_factory=list)
-
-
-class TestCase(BaseModel):
-    """A test case for validating agent behavior."""
-
-    id: str
-    description: str
-    input: dict[str, Any]
-    expected_output: Any = None  # None means just check it doesn't error
-    expected_contains: str | None = None
-
-
-class TestResult(BaseModel):
-    """Result of running a test case."""
-
-    test_id: str
-    passed: bool
-    actual_output: Any = None
-    error: str | None = None
-    execution_path: list[str] = Field(default_factory=list)
-
-
-class BuildSession(BaseModel):
-    """
-    Persistent build session state.
-
-    Saved after each approved step so you can resume later.
-    """
-
-    id: str
-    name: str
-    phase: BuildPhase = BuildPhase.INIT
-    created_at: datetime = Field(default_factory=datetime.now)
-    updated_at: datetime = Field(default_factory=datetime.now)
-
-    # The artifacts being built
-    goal: Goal | None = None
-    nodes: list[NodeSpec] = Field(default_factory=list)
-    edges: list[EdgeSpec] = Field(default_factory=list)
-
-    # Test cases
-    test_cases: list[TestCase] = Field(default_factory=list)
-    test_results: list[TestResult] = Field(default_factory=list)
-
-    # Approval history
-    approvals: list[dict[str, Any]] = Field(default_factory=list)
-
-    # Tools (stored as dicts for serialization)
-    tools: list[dict[str, Any]] = Field(default_factory=list)
-
-    model_config = {"extra": "allow"}
-
-
-class GraphBuilder:
-    """
-    Enforced incremental graph building with HITL approval.
-
-    Usage:
-        builder = GraphBuilder("my-agent")
-
-        # Step 1: Define and approve goal
-        builder.set_goal(goal)
-        builder.validate()  # Must pass
-        builder.approve("Goal looks good")  # Human approval required
-
-        # Step 2: Add nodes one by one
-        builder.add_node(node_spec)
-        builder.validate()  # Must pass
-        builder.test(test_case)  # Must pass
-        builder.approve("Node works")
-
-        # Step 3: Add edges
-        builder.add_edge(edge_spec)
-        builder.validate()
-        builder.approve("Edge correct")
-
-        # Step 4: Final approval
-        builder.run_all_tests()
-        builder.final_approve("Ready for production")
-
-        # Step 5: Export
-        graph = builder.export()
-    """
-
-    def __init__(
-        self,
-        name: str,
-        storage_path: Path | str | None = None,
-        session_id: str | None = None,
-    ):
-        self.storage_path = Path(storage_path) if storage_path else Path.home() / ".core" / "builds"
-        self.storage_path.mkdir(parents=True, exist_ok=True)
-
-        if session_id:
-            self.session = self._load_session(session_id)
-        else:
-            self.session = BuildSession(
-                id=f"build_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-                name=name,
-            )
-
-        self._pending_validation: ValidationResult | None = None
-
-    # =========================================================================
-    # PHASE 1: GOAL
-    # =========================================================================
-
-    def set_goal(self, goal: Goal) -> ValidationResult:
-        """
-        Set the goal for this agent.
-
-        Returns validation result. Must call approve() after validation passes.
-        """
-        self._require_phase([BuildPhase.INIT, BuildPhase.GOAL_DRAFT])
-
-        self.session.goal = goal
-        self.session.phase = BuildPhase.GOAL_DRAFT
-
-        validation = self._validate_goal(goal)
-        self._pending_validation = validation
-        self._save_session()
-
-        return validation
-
-    def _validate_goal(self, goal: Goal) -> ValidationResult:
-        """Validate a goal definition."""
-        errors = []
-        warnings = []
-        suggestions = []
-
-        if not goal.id:
-            errors.append("Goal must have an id")
-        if not goal.name:
-            errors.append("Goal must have a name")
-        if not goal.description:
-            errors.append("Goal must have a description")
-
-        if not goal.success_criteria:
-            errors.append("Goal must have at least one success criterion")
-        else:
-            for sc in goal.success_criteria:
-                if not sc.description:
-                    errors.append(f"Success criterion '{sc.id}' needs a description")
-
-        if not goal.constraints:
-            warnings.append("Consider adding constraints to define boundaries")
-
-        if not goal.required_capabilities:
-            suggestions.append("Specify required_capabilities (e.g., ['llm', 'tools'])")
-
-        return ValidationResult(
-            valid=len(errors) == 0,
-            errors=errors,
-            warnings=warnings,
-            suggestions=suggestions,
-        )
-
-    # =========================================================================
-    # PHASE 2: NODES
-    # =========================================================================
-
-    def add_node(self, node: NodeSpec) -> ValidationResult:
-        """
-        Add a node to the graph.
-
-        Returns validation result. Must call approve() after validation passes.
-        """
-        self._require_phase([BuildPhase.GOAL_APPROVED, BuildPhase.ADDING_NODES])
-
-        # Check for duplicate
-        if any(n.id == node.id for n in self.session.nodes):
-            return ValidationResult(
-                valid=False,
-                errors=[f"Node with id '{node.id}' already exists"],
-            )
-
-        self.session.nodes.append(node)
-        self.session.phase = BuildPhase.ADDING_NODES
-
-        validation = self._validate_node(node)
-        self._pending_validation = validation
-        self._save_session()
-
-        return validation
-
-    def _validate_node(self, node: NodeSpec) -> ValidationResult:
-        """Validate a node definition."""
-        errors = []
-        warnings = []
-        suggestions = []
-
-        if not node.id:
-            errors.append("Node must have an id")
-        if not node.name:
-            errors.append("Node must have a name")
-        if not node.description:
-            warnings.append(f"Node '{node.id}' should have a description")
-
-        # Type-specific validation
-        if node.node_type == "event_loop":
-            if node.tools and not node.system_prompt:
-                warnings.append(f"Event loop node '{node.id}' should have a system_prompt")
-
-        if node.node_type == "router":
-            if not node.routes:
-                errors.append(f"Router node '{node.id}' must specify routes")
-
-        # Check input/output keys
-        if not node.input_keys:
-            suggestions.append(f"Consider specifying input_keys for '{node.id}'")
-        if not node.output_keys:
-            suggestions.append(f"Consider specifying output_keys for '{node.id}'")
-
-        return ValidationResult(
-            valid=len(errors) == 0,
-            errors=errors,
-            warnings=warnings,
-            suggestions=suggestions,
-        )
-
-    def update_node(self, node_id: str, **updates) -> ValidationResult:
-        """Update an existing node."""
-        self._require_phase([BuildPhase.ADDING_NODES])
-
-        for i, node in enumerate(self.session.nodes):
-            if node.id == node_id:
-                node_dict = node.model_dump()
-                node_dict.update(updates)
-                updated_node = NodeSpec(**node_dict)
-                self.session.nodes[i] = updated_node
-
-                validation = self._validate_node(updated_node)
-                self._pending_validation = validation
-                self._save_session()
-                return validation
-
-        return ValidationResult(valid=False, errors=[f"Node '{node_id}' not found"])
-
-    def remove_node(self, node_id: str) -> ValidationResult:
-        """Remove a node (only if no edges reference it)."""
-        self._require_phase([BuildPhase.ADDING_NODES])
-
-        # Check for edge references
-        for edge in self.session.edges:
-            if edge.source == node_id or edge.target == node_id:
-                return ValidationResult(
-                    valid=False,
-                    errors=[f"Cannot remove node '{node_id}': referenced by edge '{edge.id}'"],
-                )
-
-        self.session.nodes = [n for n in self.session.nodes if n.id != node_id]
-        self._save_session()
-
-        return ValidationResult(valid=True)
-
-    # =========================================================================
-    # PHASE 3: EDGES
-    # =========================================================================
-
-    def add_edge(self, edge: EdgeSpec) -> ValidationResult:
-        """
-        Add an edge to the graph.
-
-        Returns validation result. Must call approve() after validation passes.
-        """
-        self._require_phase([BuildPhase.ADDING_NODES, BuildPhase.ADDING_EDGES])
-
-        # Check for duplicate
-        if any(e.id == edge.id for e in self.session.edges):
-            return ValidationResult(
-                valid=False,
-                errors=[f"Edge with id '{edge.id}' already exists"],
-            )
-
-        self.session.edges.append(edge)
-        self.session.phase = BuildPhase.ADDING_EDGES
-
-        validation = self._validate_edge(edge)
-        self._pending_validation = validation
-        self._save_session()
-
-        return validation
-
-    def _validate_edge(self, edge: EdgeSpec) -> ValidationResult:
-        """Validate an edge definition."""
-        errors = []
-        warnings = []
-
-        if not edge.id:
-            errors.append("Edge must have an id")
-
-        # Check source exists
-        if not any(n.id == edge.source for n in self.session.nodes):
-            errors.append(f"Edge source '{edge.source}' not found in nodes")
-
-        # Check target exists
-        if not any(n.id == edge.target for n in self.session.nodes):
-            errors.append(f"Edge target '{edge.target}' not found in nodes")
-
-        # Warn about conditional edges without expressions
-        if edge.condition == EdgeCondition.CONDITIONAL and not edge.condition_expr:
-            warnings.append(f"Conditional edge '{edge.id}' has no condition_expr")
-
-        return ValidationResult(
-            valid=len(errors) == 0,
-            errors=errors,
-            warnings=warnings,
-        )
-
-    # =========================================================================
-    # VALIDATION & TESTING
-    # =========================================================================
-
-    def validate(self) -> ValidationResult:
-        """Validate the entire current graph state."""
-        errors = []
-        warnings = []
-
-        # Must have a goal
-        if not self.session.goal:
-            errors.append("No goal defined")
-            return ValidationResult(valid=False, errors=errors)
-
-        # Must have at least one node
-        if not self.session.nodes:
-            errors.append("No nodes defined")
-
-        # Check for entry node
-        entry_candidates = []
-        for node in self.session.nodes:
-            # A node is an entry candidate if no edges point to it
-            if not any(e.target == node.id for e in self.session.edges):
-                entry_candidates.append(node.id)
-
-        if len(entry_candidates) == 0 and self.session.nodes:
-            errors.append("No entry node found (all nodes have incoming edges)")
-        elif len(entry_candidates) > 1:
-            warnings.append(f"Multiple entry candidates: {entry_candidates}. Specify one.")
-
-        # Check for terminal nodes
-        terminal_candidates = []
-        for node in self.session.nodes:
-            if not any(e.source == node.id for e in self.session.edges):
-                terminal_candidates.append(node.id)
-
-        if not terminal_candidates and self.session.nodes:
-            warnings.append("No terminal nodes found (all nodes have outgoing edges)")
-
-        # Check reachability from ALL entry candidates (not just the first one).
-        # Agents with async entry points have multiple nodes with no incoming
-        # edges (e.g., a primary entry node and an event-driven entry node).
-        if entry_candidates and self.session.nodes:
-            reachable = set()
-            for candidate in entry_candidates:
-                reachable |= self._compute_reachable(candidate)
-            unreachable = [n.id for n in self.session.nodes if n.id not in reachable]
-            if unreachable:
-                errors.append(f"Unreachable nodes: {unreachable}")
-
-        validation = ValidationResult(
-            valid=len(errors) == 0,
-            errors=errors,
-            warnings=warnings,
-        )
-        self._pending_validation = validation
-        return validation
-
-    def _compute_reachable(self, start: str) -> set[str]:
-        """Compute all nodes reachable from start."""
-        reachable = set()
-        to_visit = [start]
-
-        while to_visit:
-            current = to_visit.pop()
-            if current in reachable:
-                continue
-            reachable.add(current)
-
-            for edge in self.session.edges:
-                if edge.source == current:
-                    to_visit.append(edge.target)
-
-            # Also follow router routes
-            for node in self.session.nodes:
-                if node.id == current and node.routes:
-                    for target in node.routes.values():
-                        to_visit.append(target)
-
-        return reachable
-
-    def add_test(self, test: TestCase) -> None:
-        """Add a test case."""
-        self.session.test_cases.append(test)
-        self._save_session()
-
-    async def run_test_async(
-        self,
-        test: TestCase,
-        executor_factory: Callable,
-    ) -> TestResult:
-        """
-        Run a single test case asynchronously.
-
-        This method is safe to call from async contexts (Jupyter, FastAPI, etc.).
-        executor_factory should return a configured GraphExecutor.
-        """
-        self._require_phase([BuildPhase.ADDING_NODES, BuildPhase.ADDING_EDGES, BuildPhase.TESTING])
-        self.session.phase = BuildPhase.TESTING
-
-        try:
-            # Build temporary graph for testing
-            graph = self._build_graph()
-            executor = executor_factory()
-
-            # Run the test
-            result = await executor.execute(
-                graph=graph,
-                goal=self.session.goal,
-                input_data=test.input,
-            )
-
-            # Check result
-            passed = result.success
-            if test.expected_output is not None:
-                passed = passed and (result.output.get("result") == test.expected_output)
-            if test.expected_contains:
-                output_str = str(result.output)
-                passed = passed and (test.expected_contains in output_str)
-
-            test_result = TestResult(
-                test_id=test.id,
-                passed=passed,
-                actual_output=result.output,
-                execution_path=result.path,
-            )
-
-        except Exception as e:
-            test_result = TestResult(
-                test_id=test.id,
-                passed=False,
-                error=str(e),
-            )
-
-        self.session.test_results.append(test_result)
-        self._save_session()
-
-        return test_result
-
-    def run_test(
-        self,
-        test: TestCase,
-        executor_factory: Callable,
-    ) -> TestResult:
-        """
-        Run a single test case.
-
-        This is a synchronous wrapper around run_test_async().
-        If called from an async context (Jupyter, FastAPI, etc.), use run_test_async() instead.
-
-        executor_factory should return a configured GraphExecutor.
-        """
-        import asyncio
-
-        # Check if an event loop is already running
-        # get_running_loop() returns a loop if one exists, or raises RuntimeError if none exists
-        try:
-            asyncio.get_running_loop()
-        except RuntimeError:
-            # No event loop running - safe to use asyncio.run()
-            return asyncio.run(self.run_test_async(test, executor_factory))
-
-        # Event loop is running - cannot use asyncio.run()
-        raise RuntimeError(
-            "Cannot call run_test() from an async context. "
-            "An event loop is already running. "
-            "Please use 'await builder.run_test_async(test, executor_factory)' instead."
-        )
-
-    def run_all_tests(self, executor_factory: Callable) -> list[TestResult]:
-        """Run all test cases."""
-        results = []
-        for test in self.session.test_cases:
-            result = self.run_test(test, executor_factory)
-            results.append(result)
-        return results
-
-    # =========================================================================
-    # APPROVAL
-    # =========================================================================
-
-    def approve(self, comment: str) -> bool:
-        """
-        Approve the current pending change.
-
-        Must have a passing validation to approve.
-        Returns True if approved, False if validation failed.
-        """
-        if self._pending_validation is None:
-            raise RuntimeError("Nothing to approve. Run validation first.")
-
-        if not self._pending_validation.valid:
-            return False
-
-        self.session.approvals.append(
-            {
-                "phase": self.session.phase.value,
-                "comment": comment,
-                "timestamp": datetime.now().isoformat(),
-                "validation": self._pending_validation.model_dump(),
-            }
-        )
-
-        # Advance phase if appropriate
-        if self.session.phase == BuildPhase.GOAL_DRAFT:
-            self.session.phase = BuildPhase.GOAL_APPROVED
-
-        self._pending_validation = None
-        self._save_session()
-
-        return True
-
-    def final_approve(self, comment: str) -> bool:
-        """
-        Final approval for the complete graph.
-
-        Requires all tests to pass.
-        """
-        # Run final validation
-        validation = self.validate()
-        if not validation.valid:
-            self._pending_validation = validation
-            return False
-
-        # Check test results
-        if self.session.test_cases:
-            failed_tests = [t for t in self.session.test_results if not t.passed]
-            if failed_tests:
-                self._pending_validation = ValidationResult(
-                    valid=False,
-                    errors=[f"Failed tests: {[t.test_id for t in failed_tests]}"],
-                )
-                return False
-
-        self.session.phase = BuildPhase.APPROVED
-        self.session.approvals.append(
-            {
-                "phase": "final",
-                "comment": comment,
-                "timestamp": datetime.now().isoformat(),
-            }
-        )
-
-        self._save_session()
-        return True
-
-    # =========================================================================
-    # EXPORT
-    # =========================================================================
-
-    def export(self) -> GraphSpec:
-        """
-        Export the approved graph.
-
-        Requires final approval.
-        """
-        self._require_phase([BuildPhase.APPROVED])
-
-        graph = self._build_graph()
-
-        self.session.phase = BuildPhase.EXPORTED
-        self._save_session()
-
-        return graph
-
-    def _build_graph(self) -> GraphSpec:
-        """Build a GraphSpec from current session."""
-        # Determine entry node
-        entry_node = None
-        for node in self.session.nodes:
-            if not any(e.target == node.id for e in self.session.edges):
-                entry_node = node.id
-                break
-
-        # Determine terminal nodes
-        terminal_nodes = []
-        for node in self.session.nodes:
-            if not any(e.source == node.id for e in self.session.edges):
-                terminal_nodes.append(node.id)
-
-        # Collect all memory keys
-        memory_keys = set()
-        for node in self.session.nodes:
-            memory_keys.update(node.input_keys)
-            memory_keys.update(node.output_keys)
-
-        return GraphSpec(
-            id=f"{self.session.name}-graph",
-            goal_id=self.session.goal.id if self.session.goal else "",
-            entry_node=entry_node or "",
-            terminal_nodes=terminal_nodes,
-            nodes=self.session.nodes,
-            edges=self.session.edges,
-            memory_keys=list(memory_keys),
-        )
-
-    def export_to_file(self, path: Path | str) -> None:
-        """Export the graph to a Python file."""
-        self._require_phase([BuildPhase.APPROVED, BuildPhase.EXPORTED])
-
-        graph = self._build_graph()
-
-        # Generate Python code
-        code = self._generate_code(graph)
-
-        Path(path).write_text(code, encoding="utf-8")
-        self.session.phase = BuildPhase.EXPORTED
-        self._save_session()
-
-    def _generate_code(self, graph: GraphSpec) -> str:
-        """Generate Python code for the graph."""
-        lines = [
-            '"""',
-            f"Generated agent: {self.session.name}",
-            f"Generated at: {datetime.now().isoformat()}",
-            '"""',
-            "",
-            "from framework.graph import (",
-            "    Goal, SuccessCriterion, Constraint,",
-            "    NodeSpec, EdgeSpec, EdgeCondition,",
-            ")",
-            "from framework.graph.edge import GraphSpec",
-            "from framework.graph.goal import GoalStatus",
-            "",
-            "",
-            "# Goal",
-        ]
-
-        if self.session.goal:
-            goal_json = self.session.goal.model_dump_json(indent=4)
-            lines.append("GOAL = Goal.model_validate_json('''")
-            lines.append(goal_json)
-            lines.append("''')")
-        else:
-            lines.append("GOAL = None")
-
-        lines.extend(
-            [
-                "",
-                "",
-                "# Nodes",
-                "NODES = [",
-            ]
-        )
-
-        for node in self.session.nodes:
-            node_json = node.model_dump_json(indent=4)
-            lines.append("    NodeSpec.model_validate_json('''")
-            lines.append(node_json)
-            lines.append("    '''),")
-
-        lines.extend(
-            [
-                "]",
-                "",
-                "",
-                "# Edges",
-                "EDGES = [",
-            ]
-        )
-
-        for edge in self.session.edges:
-            edge_json = edge.model_dump_json(indent=4)
-            lines.append("    EdgeSpec.model_validate_json('''")
-            lines.append(edge_json)
-            lines.append("    '''),")
-
-        lines.extend(
-            [
-                "]",
-                "",
-                "",
-                "# Graph",
-            ]
-        )
-
-        graph_json = graph.model_dump_json(indent=4)
-        lines.append("GRAPH = GraphSpec.model_validate_json('''")
-        lines.append(graph_json)
-        lines.append("''')")
-
-        return "\n".join(lines)
-
-    # =========================================================================
-    # SESSION MANAGEMENT
-    # =========================================================================
-
-    def _require_phase(self, allowed: list[BuildPhase]) -> None:
-        """Ensure we're in an allowed phase."""
-        if self.session.phase not in allowed:
-            raise RuntimeError(
-                f"Cannot perform this action in phase '{self.session.phase.value}'. "
-                f"Allowed phases: {[p.value for p in allowed]}"
-            )
-
-    def _save_session(self) -> None:
-        """Save session to disk."""
-        self.session.updated_at = datetime.now()
-        path = self.storage_path / f"{self.session.id}.json"
-        path.write_text(self.session.model_dump_json(indent=2), encoding="utf-8")
-
-    def _load_session(self, session_id: str) -> BuildSession:
-        """Load session from disk."""
-        path = self.storage_path / f"{session_id}.json"
-        if not path.exists():
-            raise FileNotFoundError(f"Session not found: {session_id}")
-        return BuildSession.model_validate_json(path.read_text(encoding="utf-8"))
-
-    @classmethod
-    def list_sessions(cls, storage_path: Path | str | None = None) -> list[str]:
-        """List all saved sessions."""
-        path = Path(storage_path) if storage_path else Path.home() / ".core" / "builds"
-        if not path.exists():
-            return []
-        return [f.stem for f in path.glob("*.json")]
-
-    # =========================================================================
-    # STATUS
-    # =========================================================================
-
-    def status(self) -> dict[str, Any]:
-        """Get current build status."""
-        return {
-            "session_id": self.session.id,
-            "name": self.session.name,
-            "phase": self.session.phase.value,
-            "goal": self.session.goal.name if self.session.goal else None,
-            "nodes": len(self.session.nodes),
-            "edges": len(self.session.edges),
-            "tests": len(self.session.test_cases),
-            "tests_passed": sum(1 for t in self.session.test_results if t.passed),
-            "approvals": len(self.session.approvals),
-            "pending_validation": self._pending_validation.model_dump()
-            if self._pending_validation
-            else None,
-        }
-
-    def show(self) -> str:
-        """Show current graph as text."""
-        lines = [
-            f"=== Build: {self.session.name} ===",
-            f"Phase: {self.session.phase.value}",
-            "",
-        ]
-
-        if self.session.goal:
-            lines.extend(
-                [
-                    f"Goal: {self.session.goal.name}",
-                    f"  {self.session.goal.description}",
-                    "",
-                ]
-            )
-
-        if self.session.nodes:
-            lines.append("Nodes:")
-            for node in self.session.nodes:
-                lines.append(f"  [{node.id}] {node.name} ({node.node_type})")
-            lines.append("")
-
-        if self.session.edges:
-            lines.append("Edges:")
-            for edge in self.session.edges:
-                lines.append(f"  {edge.source} --{edge.condition.value}--> {edge.target}")
-            lines.append("")
-
-        if self._pending_validation:
-            lines.append("Pending Validation:")
-            lines.append(f"  Valid: {self._pending_validation.valid}")
-            for err in self._pending_validation.errors:
-                lines.append(f"  ERROR: {err}")
-            for warn in self._pending_validation.warnings:
-                lines.append(f"  WARN: {warn}")
-
-        return "\n".join(lines)
@@ -6,6 +6,7 @@ helper functions.
 """

 import json
+import logging
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -18,6 +19,7 @@ from framework.graph.edge import DEFAULT_MAX_TOKENS
 # ---------------------------------------------------------------------------

 HIVE_CONFIG_FILE = Path.home() / ".hive" / "configuration.json"
+logger = logging.getLogger(__name__)


 def get_hive_config() -> dict[str, Any]:
@@ -27,7 +29,12 @@ def get_hive_config() -> dict[str, Any]:
    try:
        with open(HIVE_CONFIG_FILE, encoding="utf-8-sig") as f:
            return json.load(f)
-    except (json.JSONDecodeError, OSError):
+    except (json.JSONDecodeError, OSError) as e:
+        logger.warning(
+            "Failed to load Hive config %s: %s",
+            HIVE_CONFIG_FILE,
+            e,
+        )
        return {}


@@ -6,7 +6,7 @@ This module provides secure credential storage with:
 - Template-based usage: {{cred.key}} patterns for injection
 - Bipartisan model: Store stores values, tools define usage
 - Provider system: Extensible lifecycle management (refresh, validate)
-- Multiple backends: Encrypted files, env vars, HashiCorp Vault
+- Multiple backends: Encrypted files, env vars

 Quick Start:
    from core.framework.credentials import CredentialStore, CredentialObject
@@ -38,8 +38,6 @@ For Aden server sync:
        AdenSyncProvider,
    )

-For Vault integration:
-    from core.framework.credentials.vault import HashiCorpVaultStorage
 """

 from .key_storage import (
--- a/Show More
+++ b/Show More