Merge pull request #5769 from aden-hive/queen-mode-separation
Release / Create Release (push) Waiting to run

Queen mode separation: building, staging, and running modes
This commit is contained in:
RichardTang-Aden
2026-03-03 21:31:23 -08:00
committed by GitHub
32 changed files with 2928 additions and 843 deletions
+1 -1
View File
@@ -69,7 +69,7 @@ goal = Goal(
id="dynamic-tool-discovery", id="dynamic-tool-discovery",
description=( description=(
"Always discover available tools dynamically via " "Always discover available tools dynamically via "
"discover_mcp_tools before referencing tools in agent designs" "list_agent_tools before referencing tools in agent designs"
), ),
constraint_type="hard", constraint_type="hard",
category="correctness", category="correctness",
+229 -128
View File
@@ -52,7 +52,6 @@ _SHARED_TOOLS = [
"undo_changes", "undo_changes",
# Meta-agent # Meta-agent
"list_agent_tools", "list_agent_tools",
"discover_mcp_tools",
"validate_agent_tools", "validate_agent_tools",
"list_agents", "list_agents",
"list_agent_sessions", "list_agent_sessions",
@@ -63,6 +62,47 @@ _SHARED_TOOLS = [
"run_agent_tests", "run_agent_tests",
] ]
# Queen mode-specific tool sets.
# Building mode: full coding + agent construction tools.
_QUEEN_BUILDING_TOOLS = _SHARED_TOOLS + [
"load_built_agent",
"list_credentials",
]
# Staging mode: agent loaded but not yet running — inspect, configure, launch.
_QUEEN_STAGING_TOOLS = [
# Read-only (inspect agent files, logs)
"read_file",
"list_directory",
"search_files",
"run_command",
# Agent inspection
"list_credentials",
"get_worker_status",
# Launch or go back
"run_agent_with_input",
"stop_worker_and_edit",
]
# Running mode: worker is executing — monitor and control.
_QUEEN_RUNNING_TOOLS = [
# Read-only coding (for inspecting logs, files)
"read_file",
"list_directory",
"search_files",
"run_command",
# Credentials
"list_credentials",
# Worker lifecycle
"stop_worker",
"stop_worker_and_edit",
"get_worker_status",
"inject_worker_message",
# Monitoring
"get_worker_health_summary",
"notify_operator",
]
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Shared agent-building knowledge: core mandates, tool docs, meta-agent # Shared agent-building knowledge: core mandates, tool docs, meta-agent
@@ -101,10 +141,10 @@ errors yourself. Don't declare success until validation passes.
- undo_changes(path?) restore from git snapshot - undo_changes(path?) restore from git snapshot
## Meta-Agent ## Meta-Agent
- list_agent_tools(server_config_path?) list all tool names available \ - list_agent_tools(server_config_path?, output_schema?, group?) discover \
for agent building, grouped by category. Call this FIRST before designing. available tools grouped by category. output_schema: "simple" (default) or \
- discover_mcp_tools(server_config_path?) connect to MCP servers \ "full" (includes input_schema). group: "all" (default) or a prefix like \
and list all available tools with full schemas. Use for parameter details. "gmail". Call FIRST before designing.
- validate_agent_tools(agent_path) validate that all tools declared \ - validate_agent_tools(agent_path) validate that all tools declared \
in an agent's nodes actually exist. Call after building. in an agent's nodes actually exist. Call after building.
- list_agents() list all agent packages in exports/ with session counts - list_agents() list all agent packages in exports/ with session counts
@@ -121,15 +161,14 @@ You are not just a file writer. You have deep integration with the \
Hive framework: Hive framework:
## Tool Discovery (MANDATORY before designing) ## Tool Discovery (MANDATORY before designing)
Before designing any agent, run list_agent_tools() to get all \ Before designing any agent, run list_agent_tools() to discover all \
available tool names. ONLY use tools from this list in your node \ available tools. ONLY use tools from this list in your node definitions. \
definitions. NEVER guess or fabricate tool names from memory. NEVER guess or fabricate tool names from memory.
For full parameter schemas when you need details: list_agent_tools() # names + descriptions
discover_mcp_tools() list_agent_tools(output_schema="full") # include input_schema
list_agent_tools(group="gmail") # only gmail_* tools
To check a specific agent's configured tools: list_agent_tools("exports/{agent_name}/mcp_servers.json") # specific agent
list_agent_tools("exports/{agent_name}/mcp_servers.json")
## Agent Awareness ## Agent Awareness
Run list_agents() to see what agents already exist. Read their code \ Run list_agents() to see what agents already exist. Read their code \
@@ -246,11 +285,12 @@ explicitly requests a one-shot/batch agent. Forever-alive agents loop \
continuously the user exits by closing the TUI. This is the standard \ continuously the user exits by closing the TUI. This is the standard \
pattern for all interactive agents. pattern for all interactive agents.
### Node Count Rules (HARD LIMITS) ### Node Design Rules
**2-4 nodes** for all agents. Never exceed 4 unless the user explicitly \ Each node boundary serializes outputs to shared memory \
requests more. Each node boundary serializes outputs to shared memory \ and DESTROYS all in-context information (tool results, reasoning, history). \
and DESTROYS all in-context information (tool results, reasoning, history). Use as many nodes as the use case requires, but don't create nodes without \
tools merge them into nodes that do real work.
**MERGE nodes when:** **MERGE nodes when:**
- Node has NO tools (pure LLM reasoning) merge into predecessor/successor - Node has NO tools (pure LLM reasoning) merge into predecessor/successor
@@ -264,10 +304,11 @@ and DESTROYS all in-context information (tool results, reasoning, history).
- Fundamentally different tool sets - Fundamentally different tool sets
- Fan-out parallelism (parallel branches MUST be separate) - Fan-out parallelism (parallel branches MUST be separate)
**Typical patterns:** **Typical patterns (queen manages intake NO client-facing intake node):**
- 2 nodes: `interact (client-facing) process (autonomous) interact` - 2 nodes: `process (autonomous) review (client-facing) process`
- 3 nodes: `intake (CF) process (auto) review (CF) intake` - 1 node: `process (autonomous)` simplest; queen handles all interaction
- WRONG: 7 nodes where half have no tools and just do LLM reasoning - WRONG: 7 nodes where half have no tools and just do LLM reasoning
- WRONG: Intake node that asks the user for requirements the queen does intake
Read reference agents before designing: Read reference agents before designing:
list_agents() list_agents()
@@ -280,20 +321,27 @@ use box-drawing characters and clear flow arrows:
``` ```
intake (client-facing)
tools: set_output
on_success
process (autonomous) process (autonomous)
in: user_request
tools: web_search, tools: web_search,
save_data save_data
on_success on_success
back to intake
review (client-facing)
tools: set_output
on_success
back to process
``` ```
The queen owns intake: she gathers user requirements, then calls \
`run_agent_with_input(task)` with a structured task description. \
When building the agent, design the entry node's `input_keys` to \
match what the queen will provide at run time. No client-facing \
intake node in the worker.
Follow the graph with a brief summary of each node's purpose. \ Follow the graph with a brief summary of each node's purpose. \
Get user approval before implementing. Get user approval before implementing.
@@ -356,8 +404,9 @@ from .agent import (
``` ```
**entry_points**: `{"start": "first-node-id"}` **entry_points**: `{"start": "first-node-id"}`
For agents with multiple entry points (e.g. a reminder trigger), \ The first node should be an autonomous processing node (NOT a \
add them: `{"start": "intake", "reminder": "reminder"}` client-facing intake). For agents with multiple entry points, \
add them: `{"start": "process", "reminder": "check"}`
**conversation_mode** ONLY two valid values: **conversation_mode** ONLY two valid values:
- `"continuous"` recommended for interactive agents (context carries \ - `"continuous"` recommended for interactive agents (context carries \
@@ -391,7 +440,8 @@ NO "mcpServers" wrapper. cwd "../../tools". command "uv".
**Storage**: `Path.home() / ".hive" / "agents" / "{name}"` **Storage**: `Path.home() / ".hive" / "agents" / "{name}"`
**Client-facing system prompts** STEP 1/STEP 2 pattern: **Client-facing system prompts** (review/approval nodes only, NOT intake) \
STEP 1/STEP 2 pattern:
``` ```
STEP 1 Present to user (text only, NO tool calls): STEP 1 Present to user (text only, NO tool calls):
[instructions] [instructions]
@@ -399,6 +449,9 @@ STEP 1 — Present to user (text only, NO tool calls):
STEP 2 After user responds, call set_output: STEP 2 After user responds, call set_output:
[set_output calls] [set_output calls]
``` ```
The queen manages intake. Workers should NOT have a client-facing node \
that asks for requirements. Use client_facing=True only for review or \
approval checkpoints mid-execution.
**Autonomous system prompts** set_output in SEPARATE turn. **Autonomous system prompts** set_output in SEPARATE turn.
@@ -408,7 +461,10 @@ If list_agent_tools() shows these don't exist, use alternatives \
(e.g. save_data/load_data for data persistence). (e.g. save_data/load_data for data persistence).
**Node rules**: **Node rules**:
- **2-4 nodes MAX.** Never exceed 4. Merge thin nodes aggressively. - **NO intake nodes.** The queen owns intake. She defines the entry \
node's input_keys at build time and fills them via \
`run_agent_with_input(task)` at run time.
- Don't abuse nodes without tools — merge them into a node that does work.
- A node with 0 tools is NOT a real node merge it. - A node with 0 tools is NOT a real node merge it.
- node_type "event_loop" for all regular graph nodes. Use "gcu" ONLY for - node_type "event_loop" for all regular graph nodes. Use "gcu" ONLY for
browser automation subagents (see GCU appendix). GCU nodes MUST be in a browser automation subagents (see GCU appendix). GCU nodes MUST be in a
@@ -542,50 +598,89 @@ start_agent("{name}") # triggers default entry point
_queen_tools_docs = """ _queen_tools_docs = """
## Worker Lifecycle ## Operating Modes
- start_worker(task) Start the worker with a task description. The \
worker runs autonomously until it finishes or asks the user a question.
- stop_worker() Cancel the worker's current execution.
- get_worker_status() Check if the worker is idle, running, or waiting \
for user input. Returns execution details.
- inject_worker_message(content) Send a message to the running worker. \
Use this to relay user instructions or concerns.
## Monitoring You operate in one of three modes. Your available tools change based on the \
- get_worker_health_summary() Read the latest health data from the judge. mode. The system notifies you when a mode change occurs.
- notify_operator(ticket_id, analysis, urgency) Alert the user about a \
critical issue. Use sparingly.
## Agent Loading ### BUILDING mode (default)
- load_built_agent(agent_path) Load a newly built agent as the worker in \ You have full coding tools for building and modifying agents:
this session. If a worker is already loaded, it is automatically unloaded \ - File I/O: read_file, write_file, edit_file, list_directory, search_files, \
first. Call after building and validating an agent to make it available \ run_command, undo_changes
immediately. - Meta-agent: list_agent_tools, validate_agent_tools, \
list_agents, list_agent_sessions, get_agent_session_state, get_agent_session_memory, \
list_agent_checkpoints, get_agent_checkpoint, run_agent_tests
- load_built_agent(agent_path) Load the agent and switch to STAGING mode
- list_credentials(credential_id?) List authorized credentials
## Credentials When you finish building an agent, call load_built_agent(path) to stage it.
- list_credentials(credential_id?) List all authorized credentials in the \
local store. Returns IDs, aliases, status, and identity metadata (never \ ### STAGING mode (agent loaded, not yet running)
secrets). Optionally filter by credential_id. The agent is loaded and ready to run. You can inspect it and launch it:
- Read-only: read_file, list_directory, search_files, run_command
- list_credentials(credential_id?) Verify credentials are configured
- get_worker_status() Check the loaded worker
- run_agent_with_input(task) Start the worker and switch to RUNNING mode
- stop_worker_and_edit() Go back to BUILDING mode
In STAGING mode you do NOT have write tools. If you need to modify the agent, \
call stop_worker_and_edit() to go back to BUILDING mode.
### RUNNING mode (worker is executing)
The worker is running. You have monitoring and lifecycle tools:
- Read-only: read_file, list_directory, search_files, run_command
- get_worker_status() Check worker status (idle, running, waiting)
- inject_worker_message(content) Send a message to the running worker
- get_worker_health_summary() Read the latest health data
- notify_operator(ticket_id, analysis, urgency) Alert the user (use sparingly)
- stop_worker() Stop the worker and return to STAGING mode, then ask the user what to do next
- stop_worker_and_edit() Stop the worker and switch back to BUILDING mode
In RUNNING mode you do NOT have write tools or agent construction tools. \
If you need to modify the agent, call stop_worker_and_edit() to switch back \
to BUILDING mode. To stop the worker and ask the user what to do next, call \
stop_worker() to return to STAGING mode.
### Mode transitions
- load_built_agent(path) switches to STAGING mode
- run_agent_with_input(task) starts worker, switches to RUNNING mode
- stop_worker() stops worker, switches to STAGING mode (ask user: re-run or edit?)
- stop_worker_and_edit() stops worker (if running), switches to BUILDING mode
""" """
_queen_behavior = """ _queen_behavior = """
# Behavior # Behavior
## CRITICAL RULE — ask_user tool
Every response that ends with a question, a prompt, or expects user \
input MUST finish with a call to ask_user(prompt, options). This is \
NON-NEGOTIABLE. The system CANNOT detect that you are waiting for \
input unless you call ask_user. You MUST call ask_user as the LAST \
action in your response.
NEVER end a response with a question in text without calling ask_user. \
NEVER rely on the user seeing your text and replying call ask_user.
Always provide 2-4 short options that cover the most likely answers. \
The user can always type a custom response.
Examples:
- ask_user("What do you need?",
["Build a new agent", "Run the loaded worker", "Help with code"])
- ask_user("Which pattern?",
["Simple 2-node", "Rich with feedback", "Custom"])
- ask_user("Ready to proceed?",
["Yes, go ahead", "Let me change something"])
## Greeting and identity ## Greeting and identity
When the user greets you ("hi", "hello") or asks what you can do / \ When the user greets you or asks what you can do, respond concisely \
what you are, respond concisely. DO NOT list internal processes \ (under 10 lines). DO NOT list internal processes. Focus on:
(validation steps, AgentRunner.load, tool discovery). Focus on \ 1. Direct capabilities: coding, agent building & debugging.
user-facing capabilities: 2. What the loaded worker does (one sentence from Worker Profile). \
If no worker is loaded, say so.
1. Direct capabilities: file operations, shell commands, coding, \ 3. THEN call ask_user to prompt them do NOT just write text.
agent building & debugging.
2. Delegation: describe what the loaded worker does in one sentence \
(read the Worker Profile at the end of this prompt). If no worker \
is loaded, say so.
3. End with a short prompt: "What do you need?"
Keep it under 10 lines. No bullet-point dumps of every tool you have.
## Direct coding ## Direct coding
You can do any coding task directly reading files, writing code, running \ You can do any coding task directly reading files, writing code, running \
@@ -596,7 +691,8 @@ The worker is a specialized agent (see Worker Profile at the end of this \
prompt). It can ONLY do what its goal and tools allow. prompt). It can ONLY do what its goal and tools allow.
**Decision rule read the Worker Profile first:** **Decision rule read the Worker Profile first:**
- The user's request directly matches the worker's goal start_worker(task) - The user's request directly matches the worker's goal use \
run_agent_with_input(task) (if in staging) or load then run (if in building)
- Anything else do it yourself. Do NOT reframe user requests into \ - Anything else do it yourself. Do NOT reframe user requests into \
subtasks to justify delegation. subtasks to justify delegation.
- Building, modifying, or configuring agents is ALWAYS your job. Never \ - Building, modifying, or configuring agents is ALWAYS your job. Never \
@@ -604,16 +700,30 @@ delegate agent construction to the worker, even as a "research" subtask.
## When the user says "run", "execute", or "start" (without specifics) ## When the user says "run", "execute", or "start" (without specifics)
The loaded worker is described in the Worker Profile below. Ask what \ The loaded worker is described in the Worker Profile below. You MUST \
task or topic they want do NOT call list_agents() or list directories. \ ask the user what task or input they want using ask_user do NOT \
The worker is already loaded. Just ask for the input the worker needs \ invent a task, do NOT call list_agents() or list directories. \
(e.g., a research topic, a target domain, a job description). The worker is already loaded. Just ask for the specific input the \
worker needs (e.g., a research topic, a target domain, a job description). \
NEVER call run_agent_with_input until the user has provided their input.
If NO worker is loaded, say so and offer to build one. If NO worker is loaded, say so and offer to build one.
## When in staging mode (agent loaded, not running):
- Tell the user the agent is loaded and ready.
- For tasks matching the worker's goal: ALWAYS ask the user for their \
specific input BEFORE calling run_agent_with_input(task). NEVER make up \
or assume what the user wants. Use ask_user to collect the task details \
(e.g., topic, target, requirements). Once you have the user's answer, \
compose a structured task description from their input and call \
run_agent_with_input(task). The worker has no intake node it receives \
your task and starts processing.
- If the user wants to modify the agent, call stop_worker_and_edit().
## When idle (worker not running): ## When idle (worker not running):
- Greet the user. Mention what the worker can do in one sentence. - Greet the user. Mention what the worker can do in one sentence.
- For tasks matching the worker's goal, call start_worker(task). - For tasks matching the worker's goal, use run_agent_with_input(task) \
(if in staging) or load the agent first (if in building).
- For everything else, do it directly. - For everything else, do it directly.
## When the user clicks Run (external event notification) ## When the user clicks Run (external event notification)
@@ -625,24 +735,37 @@ explain the problem clearly and help fix it. For credential errors, \
guide the user to set up the missing credentials. For structural \ guide the user to set up the missing credentials. For structural \
issues, offer to fix the agent graph directly. issues, offer to fix the agent graph directly.
## When worker is running: ## When worker is running — GO SILENT
- If the user asks about progress, call get_worker_status() ONCE and \
report the result. Do NOT poll in a loop.
- NEVER call get_worker_status() repeatedly without user input in between. \
The worker will surface results through client-facing nodes. You do not \
need to monitor it. One check per user request is enough.
- If the user has a concern or instruction for the worker, call \
inject_worker_message(content) to relay it.
- You can still do coding tasks directly while the worker runs.
- If an escalation ticket arrives from the judge, assess severity:
- Low/transient: acknowledge silently, do not disturb the user.
- High/critical: notify the user with a brief analysis and suggested action.
- After starting the worker or checking its status, WAIT for the user's \
next message. Do not take autonomous actions unless the user asks.
## When worker asks user a question: Once you call start_worker(), your job is DONE. Do NOT call ask_user, \
- The system will route the user's response directly to the worker. \ do NOT call get_worker_status(), do NOT emit any text. Just stop. \
You do not need to relay it. The user will come back to you after responding. The worker owns the conversation now it has its own client-facing \
nodes that talk to the user directly.
**After start_worker, your ENTIRE response should be ONE short \
confirmation sentence with NO tool calls.** Example: \
"Started the vulnerability assessment." that's it. No ask_user, \
no get_worker_status, no follow-up questions.
You only wake up again when:
- The user explicitly addresses you (not answering a worker question)
- A worker question is forwarded to you for relay
- An escalation ticket arrives from the judge
- The worker finishes
If the user explicitly asks about progress, call get_worker_status() \
ONCE and report. Do NOT poll or check proactively.
For escalation tickets: low/transient acknowledge silently. \
High/critical notify the user with a brief analysis.
## When the worker asks the user a question:
- The user's answer is routed to you with context: \
[Worker asked: "...", Options: ...] User answered: "...".
- If the user is answering the worker's question normally, relay it \
using inject_worker_message(answer_text). Then go silent again.
- If the user is rejecting the approach, asking to stop, or giving \
you an instruction, handle it yourself do NOT relay.
## Showing or describing the loaded worker ## Showing or describing the loaded worker
@@ -658,16 +781,18 @@ building something new.
When the user asks to change, modify, or update the loaded worker \ When the user asks to change, modify, or update the loaded worker \
(e.g., "change the report node", "add a node", "delete node X"): (e.g., "change the report node", "add a node", "delete node X"):
1. Use the **Path** from the Worker Profile to locate the agent files. 1. Call stop_worker_and_edit() this stops the worker and gives you \
2. Read the relevant files (nodes/__init__.py, agent.py, etc.). coding tools (switches to BUILDING mode).
3. Make the requested changes using edit_file / write_file. 2. Use the **Path** from the Worker Profile to locate the agent files.
4. Run validation (default_agent.validate(), AgentRunner.load(), \ 3. Read the relevant files (nodes/__init__.py, agent.py, etc.).
4. Make the requested changes using edit_file / write_file.
5. Run validation (default_agent.validate(), AgentRunner.load(), \
validate_agent_tools()). validate_agent_tools()).
5. **Reload the modified worker**: call load_built_agent("{path}") \ 6. **Reload the modified worker**: call load_built_agent("{path}") \
so the changes take effect immediately. If a worker is already loaded, \ so the changes take effect immediately (switches to STAGING mode). \
stop it first, then reload. Then call run_agent_with_input(task) to restart execution.
Do NOT skip step 5 without reloading, the user will still be \ Do NOT skip step 6 without reloading, the user will still be \
interacting with the old version. interacting with the old version.
""" """
@@ -676,9 +801,9 @@ _queen_phase_7 = """
After building and verifying, load the agent into the current session: After building and verifying, load the agent into the current session:
load_built_agent("exports/{name}") load_built_agent("exports/{name}")
This makes the agent available immediately the user sees its graph, \ This switches to STAGING mode the user sees the agent's graph and \
the tab name updates, and you can delegate to it via start_worker(). \ the tab name updates. Then call run_agent_with_input(task) to start it. \
Do NOT tell the user to run `python -m {name} run` load it here. Do NOT tell the user to run `python -m {name} run` load and run it here.
""" """
_queen_style = """ _queen_style = """
@@ -808,21 +933,7 @@ queen_node = NodeSpec(
"User's intent is understood, coding tasks are completed correctly, " "User's intent is understood, coding tasks are completed correctly, "
"and the worker is managed effectively when delegated to." "and the worker is managed effectively when delegated to."
), ),
tools=_SHARED_TOOLS tools=sorted(set(_QUEEN_BUILDING_TOOLS + _QUEEN_STAGING_TOOLS + _QUEEN_RUNNING_TOOLS)),
+ [
# Worker lifecycle
"start_worker",
"stop_worker",
"get_worker_status",
"inject_worker_message",
# Monitoring
"get_worker_health_summary",
"notify_operator",
# Agent loading
"load_built_agent",
# Credentials
"list_credentials",
],
system_prompt=( system_prompt=(
"You are the Queen — the user's primary interface. You are a coding agent " "You are the Queen — the user's primary interface. You are a coding agent "
"with the same capabilities as the Hive Coder worker, PLUS the ability to " "with the same capabilities as the Hive Coder worker, PLUS the ability to "
@@ -836,20 +947,7 @@ queen_node = NodeSpec(
), ),
) )
ALL_QUEEN_TOOLS = _SHARED_TOOLS + [ ALL_QUEEN_TOOLS = sorted(set(_QUEEN_BUILDING_TOOLS + _QUEEN_STAGING_TOOLS + _QUEEN_RUNNING_TOOLS))
# Worker lifecycle
"start_worker",
"stop_worker",
"get_worker_status",
"inject_worker_message",
# Monitoring
"get_worker_health_summary",
"notify_operator",
# Agent loading
"load_built_agent",
# Credentials
"list_credentials",
]
__all__ = [ __all__ = [
"coder_node", "coder_node",
@@ -857,4 +955,7 @@ __all__ = [
"queen_node", "queen_node",
"ALL_QUEEN_TRIAGE_TOOLS", "ALL_QUEEN_TRIAGE_TOOLS",
"ALL_QUEEN_TOOLS", "ALL_QUEEN_TOOLS",
"_QUEEN_BUILDING_TOOLS",
"_QUEEN_STAGING_TOOLS",
"_QUEEN_RUNNING_TOOLS",
] ]
@@ -48,11 +48,11 @@ profile_setup → daily_intake → update_tracker → analyze_progress → gener
``` ```
`analyze_progress` has no tools. `schedule_reminders` just sets one boolean. `report` just presents analysis. `update_tracker` and `generate_plan` are sequential autonomous work. `analyze_progress` has no tools. `schedule_reminders` just sets one boolean. `report` just presents analysis. `update_tracker` and `generate_plan` are sequential autonomous work.
**Good example** (3 nodes): **Good example** (2 nodes):
``` ```
intake (client-facing) → process (autonomous: track + analyze + plan) → intake (loop back) process (autonomous: track + analyze + plan) → review (client-facing) → process (loop back)
``` ```
One client-facing node handles ALL user interaction (setup, logging, reports). One autonomous node handles ALL backend work (CSV update, analysis, plan generation) with tools and context preserved. The queen handles intake (gathering requirements from the user) and passes the task via `run_agent_with_input(task)`. One autonomous node handles ALL backend work (CSV update, analysis, plan generation) with tools and context preserved. One client-facing node handles review/approval when needed.
12. **Adding framework gating for LLM behavior** — Don't add output rollback, premature rejection, or interaction protocol injection. Fix with better prompts or custom judges. 12. **Adding framework gating for LLM behavior** — Don't add output rollback, premature rejection, or interaction protocol injection. Fix with better prompts or custom judges.
@@ -109,3 +109,5 @@ def test_research_routes_back_to_interact(self):
25. **Manually wiring browser tools on event_loop nodes** — If the agent needs browser automation, use `node_type="gcu"` which auto-includes all browser tools and prepends best-practices guidance. Do NOT manually list browser tool names on event_loop nodes — they may not exist in the MCP server or may be incomplete. See the GCU Guide appendix. 25. **Manually wiring browser tools on event_loop nodes** — If the agent needs browser automation, use `node_type="gcu"` which auto-includes all browser tools and prepends best-practices guidance. Do NOT manually list browser tool names on event_loop nodes — they may not exist in the MCP server or may be incomplete. See the GCU Guide appendix.
26. **Using GCU nodes as regular graph nodes** — GCU nodes (`node_type="gcu"`) are exclusively subagents. They must ONLY appear in a parent node's `sub_agents=["gcu-node-id"]` list and be invoked via `delegate_to_sub_agent()`. They must NEVER be connected via edges, used as entry nodes, or used as terminal nodes. If a GCU node appears as an edge source or target, the graph will fail pre-load validation. 26. **Using GCU nodes as regular graph nodes** — GCU nodes (`node_type="gcu"`) are exclusively subagents. They must ONLY appear in a parent node's `sub_agents=["gcu-node-id"]` list and be invoked via `delegate_to_sub_agent()`. They must NEVER be connected via edges, used as entry nodes, or used as terminal nodes. If a GCU node appears as an edge source or target, the graph will fail pre-load validation.
27. **Adding a client-facing intake node to worker agents** — The queen owns intake. She defines the entry node's `input_keys` at build time and fills them via `run_agent_with_input(task)` at run time. Worker agents should start with an autonomous processing node, NOT a client-facing intake node that asks the user for requirements. Client-facing nodes in workers are for mid-execution review/approval only.
@@ -57,51 +57,28 @@ metadata = AgentMetadata()
from framework.graph import NodeSpec from framework.graph import NodeSpec
# Node 1: Intake (client-facing) # Node 1: Process (autonomous entry node)
intake_node = NodeSpec( # The queen handles intake and passes structured input via
id="intake", # run_agent_with_input(task). NO client-facing intake node.
name="Intake", # The queen defines input_keys at build time and fills them at run time.
description="Gather requirements from the user", process_node = NodeSpec(
id="process",
name="Process",
description="Execute the task using available tools",
node_type="event_loop", node_type="event_loop",
client_facing=True,
max_node_visits=0, # Unlimited for forever-alive max_node_visits=0, # Unlimited for forever-alive
input_keys=["topic"], input_keys=["user_request", "feedback"],
output_keys=["brief"],
success_criteria="The brief is specific and actionable.",
system_prompt="""\
You are an intake specialist.
**STEP 1 — Read and respond (text only, NO tool calls):**
1. Read the topic provided
2. If vague, ask 1-2 clarifying questions
3. If clear, confirm your understanding
**STEP 2 — After the user confirms, call set_output:**
- set_output("brief", "Clear description of what to do")
""",
tools=[],
)
# Node 2: Worker (autonomous)
worker_node = NodeSpec(
id="worker",
name="Worker",
description="Do the main work",
node_type="event_loop",
max_node_visits=0,
input_keys=["brief", "feedback"],
output_keys=["results"], output_keys=["results"],
nullable_output_keys=["feedback"], # Only on feedback edge nullable_output_keys=["feedback"], # Only on feedback edge
success_criteria="Results are complete and accurate.", success_criteria="Results are complete and accurate.",
system_prompt="""\ system_prompt="""\
You are a worker agent. Given a brief, do the work. You are a processing agent. Your task is in memory under "user_request". \
If "feedback" is present, this is a revision — address the feedback.
If feedback is provided, this is a follow-up — address the feedback.
Work in phases: Work in phases:
1. Use tools to gather/process data 1. Use tools to gather/process data
2. Analyze results 2. Analyze results
3. Call set_output for each key in a SEPARATE turn: 3. Call set_output in a SEPARATE turn:
- set_output("results", "structured results") - set_output("results", "structured results")
""", """,
tools=["web_search", "web_scrape", "save_data", "load_data", "list_data_files"], tools=["web_search", "web_scrape", "save_data", "load_data", "list_data_files"],
@@ -115,7 +92,7 @@ review_node = NodeSpec(
node_type="event_loop", node_type="event_loop",
client_facing=True, client_facing=True,
max_node_visits=0, max_node_visits=0,
input_keys=["results", "brief"], input_keys=["results", "user_request"],
output_keys=["next_action", "feedback"], output_keys=["next_action", "feedback"],
nullable_output_keys=["feedback"], nullable_output_keys=["feedback"],
success_criteria="User has reviewed and decided next steps.", success_criteria="User has reviewed and decided next steps.",
@@ -128,14 +105,14 @@ Present the results to the user.
3. Ask: satisfied, or want changes? 3. Ask: satisfied, or want changes?
**STEP 2 — After user responds, call set_output:** **STEP 2 — After user responds, call set_output:**
- set_output("next_action", "new_topic") — if starting fresh - set_output("next_action", "done") — if satisfied
- set_output("next_action", "revise") — if changes needed - set_output("next_action", "revise") — if changes needed
- set_output("feedback", "what to change") — only if revising - set_output("feedback", "what to change") — only if revising
""", """,
tools=[], tools=[],
) )
__all__ = ["intake_node", "worker_node", "review_node"] __all__ = ["process_node", "review_node"]
``` ```
## agent.py ## agent.py
@@ -155,7 +132,7 @@ from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
from framework.runtime.execution_stream import EntryPointSpec from framework.runtime.execution_stream import EntryPointSpec
from .config import default_config, metadata from .config import default_config, metadata
from .nodes import intake_node, worker_node, review_node from .nodes import process_node, review_node
# Goal definition # Goal definition
goal = Goal( goal = Goal(
@@ -172,27 +149,26 @@ goal = Goal(
) )
# Node list # Node list
nodes = [intake_node, worker_node, review_node] nodes = [process_node, review_node]
# Edge definitions # Edge definitions
edges = [ edges = [
EdgeSpec(id="intake-to-worker", source="intake", target="worker", EdgeSpec(id="process-to-review", source="process", target="review",
condition=EdgeCondition.ON_SUCCESS, priority=1), condition=EdgeCondition.ON_SUCCESS, priority=1),
EdgeSpec(id="worker-to-review", source="worker", target="review", # Feedback loop — revise results
condition=EdgeCondition.ON_SUCCESS, priority=1), EdgeSpec(id="review-to-process", source="review", target="process",
# Feedback loop
EdgeSpec(id="review-to-worker", source="review", target="worker",
condition=EdgeCondition.CONDITIONAL, condition=EdgeCondition.CONDITIONAL,
condition_expr="str(next_action).lower() == 'revise'", priority=2), condition_expr="str(next_action).lower() == 'revise'", priority=2),
# Loop back for new topic # Loop back for next task (queen sends new input)
EdgeSpec(id="review-to-intake", source="review", target="intake", EdgeSpec(id="review-done", source="review", target="process",
condition=EdgeCondition.CONDITIONAL, condition=EdgeCondition.CONDITIONAL,
condition_expr="str(next_action).lower() == 'new_topic'", priority=1), condition_expr="str(next_action).lower() == 'done'", priority=1),
] ]
# Graph configuration # Graph configuration — entry is the autonomous process node
entry_node = "intake" # The queen handles intake and passes the task via run_agent_with_input(task)
entry_points = {"start": "intake"} entry_node = "process"
entry_points = {"start": "process"}
pause_nodes = [] pause_nodes = []
terminal_nodes = [] # Forever-alive terminal_nodes = [] # Forever-alive
@@ -208,7 +184,7 @@ class MyAgent:
self.goal = goal self.goal = goal
self.nodes = nodes self.nodes = nodes
self.edges = edges self.edges = edges
self.entry_node = entry_node self.entry_node = entry_node # "process" — autonomous entry
self.entry_points = entry_points self.entry_points = entry_points
self.pause_nodes = pause_nodes self.pause_nodes = pause_nodes
self.terminal_nodes = terminal_nodes self.terminal_nodes = terminal_nodes
@@ -498,7 +474,7 @@ def tui():
llm = LiteLLMProvider(model=agent.config.model, api_key=agent.config.api_key, api_base=agent.config.api_base) llm = LiteLLMProvider(model=agent.config.model, api_key=agent.config.api_key, api_base=agent.config.api_base)
runtime = create_agent_runtime( runtime = create_agent_runtime(
graph=agent._build_graph(), goal=agent.goal, storage_path=storage, graph=agent._build_graph(), goal=agent.goal, storage_path=storage,
entry_points=[EntryPointSpec(id="start", name="Start", entry_node="intake", trigger_type="manual", isolation_level="isolated")], entry_points=[EntryPointSpec(id="start", name="Start", entry_node="process", trigger_type="manual", isolation_level="isolated")],
llm=llm, tools=list(agent._tool_registry.get_tools().values()), tool_executor=agent._tool_registry.get_executor()) llm=llm, tools=list(agent._tool_registry.get_tools().values()), tool_executor=agent._tool_registry.get_executor())
await runtime.start() await runtime.start()
try: try:
@@ -131,13 +131,19 @@ downstream node only sees the serialized summary string.
- A "report" node that presents analysis → merge into the client-facing node - A "report" node that presents analysis → merge into the client-facing node
- A "confirm" or "schedule" node that doesn't call any external service → remove - A "confirm" or "schedule" node that doesn't call any external service → remove
**Typical agent structure (3 nodes):** **Typical agent structure (2 nodes):**
``` ```
intake (client-facing) ←→ process (autonomous) ←→ review (client-facing) process (autonomous) ←→ review (client-facing)
``` ```
Or for simpler agents, just 2 nodes: The queen owns intake — she gathers requirements from the user, then
passes structured input via `run_agent_with_input(task)`. When building
the agent, design the entry node's `input_keys` to match what the queen
will provide at run time. Worker agents should NOT have a client-facing
intake node. Client-facing nodes are for mid-execution review/approval only.
For simpler agents, just 1 autonomous node:
``` ```
interact (client-facing) → process (autonomous) → interact (loop) process (autonomous) — loops back to itself
``` ```
### nullable_output_keys ### nullable_output_keys
@@ -397,7 +403,7 @@ from .agent import (
### Reference Agent ### Reference Agent
See `exports/gmail_inbox_guardian/agent.py` for a complete example with: See `exports/gmail_inbox_guardian/agent.py` for a complete example with:
- Primary client-facing intake node (user configures rules) - Primary client-facing node (user configures rules)
- Timer-based scheduled inbox checks (every 20 min) - Timer-based scheduled inbox checks (every 20 min)
- Webhook-triggered email event handling - Webhook-triggered email event handling
- Shared isolation for memory access across streams - Shared isolation for memory access across streams
@@ -413,13 +419,13 @@ See `exports/gmail_inbox_guardian/agent.py` for a complete example with:
## Tool Discovery ## Tool Discovery
Do NOT rely on a static tool list — it will be outdated. Always use Do NOT rely on a static tool list — it will be outdated. Always use
`list_agent_tools()` to get available tool names grouped by category. `list_agent_tools()` to discover available tools, grouped by category.
For full schemas with parameter details, use `discover_mcp_tools()`.
``` ```
list_agent_tools() # all available tools list_agent_tools() # names + descriptions, all groups
list_agent_tools("exports/my_agent/mcp_servers.json") # specific agent list_agent_tools(output_schema="full") # include input_schema
discover_mcp_tools() # full schemas with params list_agent_tools(group="gmail") # only gmail_* tools
list_agent_tools("exports/my_agent/mcp_servers.json") # specific agent's tools
``` ```
After building, validate tools exist: `validate_agent_tools("exports/{name}")` After building, validate tools exist: `validate_agent_tools("exports/{name}")`
@@ -21,7 +21,7 @@ Do NOT use GCU for:
- Same underlying `EventLoopNode` class — no new imports needed - Same underlying `EventLoopNode` class — no new imports needed
- `tools=[]` is correct — tools are auto-populated at runtime - `tools=[]` is correct — tools are auto-populated at runtime
## GCU Architecture Pattern ## GCU Architecture Pattern
GCU nodes are **subagents** — invoked via `delegate_to_sub_agent()`, not connected via edges. GCU nodes are **subagents** — invoked via `delegate_to_sub_agent()`, not connected via edges.
+204 -39
View File
@@ -152,6 +152,72 @@ def _compact_tool_calls(tool_calls: list[dict[str, Any]]) -> list[dict[str, Any]
return compact return compact
def extract_tool_call_history(messages: list[Message], max_entries: int = 30) -> str:
    """Render a compact summary of tool activity found in *messages*.

    Used in compaction summaries so the LLM is reminded of work it has
    already done: which tools it called (with representative inputs),
    which files it saved, which outputs it set, and which errors it hit
    (so it does not retry them).

    Args:
        messages: Conversation messages to scan (assistant + tool roles).
        max_entries: Cap on the number of distinct tool names listed.

    Returns:
        A newline-separated text block, empty string if nothing to report.
    """
    calls_by_tool: dict[str, list[str]] = {}
    saved_files: list[str] = []
    output_keys: list[str] = []
    error_previews: list[str] = []

    # Tools whose primary argument is worth echoing back verbatim; all
    # other tools get an empty input summary.
    _INPUT_FIELD = {
        "web_search": "query",
        "web_scrape": "url",
        "load_data": "filename",
        "save_data": "filename",
    }

    for message in messages:
        if message.role == "assistant" and message.tool_calls:
            for call in message.tool_calls:
                fn = call.get("function", {})
                tool_name = fn.get("name", "unknown")
                try:
                    parsed = json.loads(fn.get("arguments", "{}"))
                except (json.JSONDecodeError, TypeError):
                    parsed = {}
                field = _INPUT_FIELD.get(tool_name)
                brief = parsed.get(field, "") if field else ""
                calls_by_tool.setdefault(tool_name, []).append(brief)
                if tool_name == "save_data" and parsed.get("filename"):
                    saved_files.append(parsed["filename"])
                if tool_name == "set_output" and parsed.get("key"):
                    output_keys.append(parsed["key"])
        if message.role == "tool" and message.is_error:
            error_previews.append(message.content[:120].replace("\n", " "))

    sections: list[str] = []

    if calls_by_tool:
        rendered: list[str] = []
        for tool_name, briefs in list(calls_by_tool.items())[:max_entries]:
            count = len(briefs)
            shown = [b for b in briefs if b]
            if shown:
                bullets = "\n".join(f" - {s[:120]}" for s in shown[:8])
                rendered.append(f" {tool_name} ({count}x):\n" + bullets)
            else:
                rendered.append(f" {tool_name} ({count}x)")
        sections.append("TOOLS ALREADY CALLED:\n" + "\n".join(rendered))

    if saved_files:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        sections.append("FILES SAVED: " + ", ".join(dict.fromkeys(saved_files)))

    if output_keys:
        sections.append("OUTPUTS SET: " + ", ".join(dict.fromkeys(output_keys)))

    if error_previews:
        sections.append(
            "ERRORS (do NOT retry these):\n"
            + "\n".join(f" - {e}" for e in error_previews[:10])
        )

    return "\n\n".join(sections)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# ConversationStore protocol (Phase 2) # ConversationStore protocol (Phase 2)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -373,9 +439,36 @@ class NodeConversation:
def _repair_orphaned_tool_calls( def _repair_orphaned_tool_calls(
msgs: list[dict[str, Any]], msgs: list[dict[str, Any]],
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
"""Ensure every tool_call has a matching tool-result message.""" """Ensure tool_call / tool_result pairs are consistent.
1. **Orphaned tool results** (tool_result with no preceding tool_use)
are dropped. This happens when compaction removes an assistant
message but leaves its tool-result messages behind.
2. **Orphaned tool calls** (tool_use with no following tool_result)
get a synthetic error result appended. This happens when a loop
is cancelled mid-tool-execution.
"""
# Pass 1: collect all tool_call IDs from assistant messages so we
# can identify orphaned tool-result messages.
all_tool_call_ids: set[str] = set()
for m in msgs:
if m.get("role") == "assistant":
for tc in m.get("tool_calls") or []:
tc_id = tc.get("id")
if tc_id:
all_tool_call_ids.add(tc_id)
# Pass 2: build repaired list — drop orphaned tool results, patch
# missing tool results.
repaired: list[dict[str, Any]] = [] repaired: list[dict[str, Any]] = []
for i, m in enumerate(msgs): for i, m in enumerate(msgs):
# Drop tool-result messages whose tool_call_id has no matching
# tool_use in any assistant message (orphaned by compaction).
if m.get("role") == "tool":
tid = m.get("tool_call_id")
if tid and tid not in all_tool_call_ids:
continue # skip orphaned result
repaired.append(m) repaired.append(m)
tool_calls = m.get("tool_calls") tool_calls = m.get("tool_calls")
if m.get("role") != "assistant" or not tool_calls: if m.get("role") != "assistant" or not tool_calls:
@@ -653,6 +746,7 @@ class NodeConversation:
spillover_dir: str, spillover_dir: str,
keep_recent: int = 4, keep_recent: int = 4,
phase_graduated: bool = False, phase_graduated: bool = False,
aggressive: bool = False,
) -> None: ) -> None:
"""Structure-preserving compaction: save freeform text to file, keep tool messages. """Structure-preserving compaction: save freeform text to file, keep tool messages.
@@ -662,6 +756,11 @@ class NodeConversation:
after pruning. Only freeform text exchanges (user messages, after pruning. Only freeform text exchanges (user messages,
text-only assistant messages) are saved to a file and removed. text-only assistant messages) are saved to a file and removed.
When *aggressive* is True, non-essential tool call pairs are also
collapsed into a compact summary instead of being kept individually.
Only ``set_output`` calls and error results are preserved; all other
old tool pairs are replaced by a tool-call history summary.
The result: the agent retains exact knowledge of what tools it called, The result: the agent retains exact knowledge of what tools it called,
where each result is stored, and can load the conversation text if where each result is stored, and can load the conversation text if
needed. No LLM summary call. No heuristics. Nothing lost. needed. No LLM summary call. No heuristics. Nothing lost.
@@ -693,35 +792,91 @@ class NodeConversation:
# Classify old messages: structural (keep) vs freeform (save to file) # Classify old messages: structural (keep) vs freeform (save to file)
kept_structural: list[Message] = [] kept_structural: list[Message] = []
freeform_lines: list[str] = [] freeform_lines: list[str] = []
collapsed_msgs: list[Message] = []
for msg in old_messages: if aggressive:
if msg.role == "tool": # Aggressive: only keep set_output tool pairs and error results.
# Tool results — already pruned to ~30 tokens (file reference). # Everything else is collapsed into a tool-call history summary.
# Keep in conversation. # We need to track tool_call IDs to pair assistant messages with
kept_structural.append(msg) # their tool results.
elif msg.role == "assistant" and msg.tool_calls: protected_tc_ids: set[str] = set()
# Assistant message with tool_calls — keep the tool_calls collapsible_tc_ids: set[str] = set()
# with truncated arguments, clear the freeform text content.
compact_tcs = _compact_tool_calls(msg.tool_calls) # First pass: classify assistant messages
kept_structural.append( for msg in old_messages:
Message( if msg.role != "assistant" or not msg.tool_calls:
seq=msg.seq, continue
role=msg.role, has_protected = any(
content="", tc.get("function", {}).get("name") == "set_output" for tc in msg.tool_calls
tool_calls=compact_tcs,
is_error=msg.is_error,
phase_id=msg.phase_id,
is_transition_marker=msg.is_transition_marker,
)
) )
else: tc_ids = {tc.get("id", "") for tc in msg.tool_calls}
# Freeform text (user messages, text-only assistant messages) if has_protected:
# — save to file and remove from conversation. protected_tc_ids |= tc_ids
role_label = msg.role else:
text = msg.content collapsible_tc_ids |= tc_ids
if len(text) > 2000:
text = text[:2000] + "" # Second pass: classify all messages
freeform_lines.append(f"[{role_label}] (seq={msg.seq}): {text}") for msg in old_messages:
if msg.role == "tool":
tc_id = msg.tool_use_id or ""
if tc_id in protected_tc_ids:
kept_structural.append(msg)
elif msg.is_error:
# Error results are always protected
kept_structural.append(msg)
# Protect the parent assistant message too
protected_tc_ids.add(tc_id)
else:
collapsed_msgs.append(msg)
elif msg.role == "assistant" and msg.tool_calls:
tc_ids = {tc.get("id", "") for tc in msg.tool_calls}
if tc_ids & protected_tc_ids:
# Has at least one protected tool call — keep entire msg
compact_tcs = _compact_tool_calls(msg.tool_calls)
kept_structural.append(
Message(
seq=msg.seq,
role=msg.role,
content="",
tool_calls=compact_tcs,
is_error=msg.is_error,
phase_id=msg.phase_id,
is_transition_marker=msg.is_transition_marker,
)
)
else:
collapsed_msgs.append(msg)
else:
# Freeform text — save to file
role_label = msg.role
text = msg.content
if len(text) > 2000:
text = text[:2000] + ""
freeform_lines.append(f"[{role_label}] (seq={msg.seq}): {text}")
else:
# Standard mode: keep all tool call pairs as structural
for msg in old_messages:
if msg.role == "tool":
kept_structural.append(msg)
elif msg.role == "assistant" and msg.tool_calls:
compact_tcs = _compact_tool_calls(msg.tool_calls)
kept_structural.append(
Message(
seq=msg.seq,
role=msg.role,
content="",
tool_calls=compact_tcs,
is_error=msg.is_error,
phase_id=msg.phase_id,
is_transition_marker=msg.is_transition_marker,
)
)
else:
role_label = msg.role
text = msg.content
if len(text) > 2000:
text = text[:2000] + ""
freeform_lines.append(f"[{role_label}] (seq={msg.seq}): {text}")
# Write freeform text to a numbered conversation file # Write freeform text to a numbered conversation file
spill_path = Path(spillover_dir) spill_path = Path(spillover_dir)
@@ -741,13 +896,25 @@ class NodeConversation:
conv_filename = "" conv_filename = ""
# Build reference message # Build reference message
ref_parts: list[str] = []
if conv_filename: if conv_filename:
ref_content = ( ref_parts.append(
f"[Previous conversation saved to '{conv_filename}'. " f"[Previous conversation saved to '{conv_filename}'. "
f"Use load_data('{conv_filename}') to review if needed.]" f"Use load_data('{conv_filename}') to review if needed.]"
) )
else: elif not collapsed_msgs:
ref_content = "[Previous freeform messages compacted.]" ref_parts.append("[Previous freeform messages compacted.]")
# Aggressive: add collapsed tool-call history to the reference
if collapsed_msgs:
tool_history = extract_tool_call_history(collapsed_msgs)
if tool_history:
ref_parts.append(tool_history)
elif not ref_parts:
ref_parts.append("[Previous tool calls compacted.]")
ref_content = "\n\n".join(ref_parts)
# Use a seq just before the first kept message # Use a seq just before the first kept message
recent_messages = list(self._messages[split:]) recent_messages = list(self._messages[split:])
if kept_structural: if kept_structural:
@@ -760,15 +927,13 @@ class NodeConversation:
ref_msg = Message(seq=ref_seq, role="user", content=ref_content) ref_msg = Message(seq=ref_seq, role="user", content=ref_content)
# Persist: delete old messages from store, write reference + kept structural # Persist: delete old messages from store, write reference + kept structural.
# In aggressive mode, collapsed messages may be interspersed with kept
# messages, so we delete everything before the recent boundary and
# rewrite only what we want to keep.
if self._store: if self._store:
first_kept_seq = ( recent_boundary = recent_messages[0].seq if recent_messages else self._next_seq
kept_structural[0].seq await self._store.delete_parts_before(recent_boundary)
if kept_structural
else (recent_messages[0].seq if recent_messages else self._next_seq)
)
# Delete everything before the first structural message we're keeping
await self._store.delete_parts_before(first_kept_seq)
# Write the reference message # Write the reference message
await self._store.write_part(ref_msg.seq, ref_msg.to_storage_dict()) await self._store.write_part(ref_msg.seq, ref_msg.to_storage_dict())
# Write kept structural messages (they may have been modified) # Write kept structural messages (they may have been modified)
File diff suppressed because it is too large Load Diff
+167 -21
View File
@@ -138,6 +138,7 @@ class GraphExecutor:
accounts_prompt: str = "", accounts_prompt: str = "",
accounts_data: list[dict] | None = None, accounts_data: list[dict] | None = None,
tool_provider_map: dict[str, str] | None = None, tool_provider_map: dict[str, str] | None = None,
dynamic_tools_provider: Callable | None = None,
): ):
""" """
Initialize the executor. Initialize the executor.
@@ -160,6 +161,8 @@ class GraphExecutor:
accounts_prompt: Connected accounts block for system prompt injection accounts_prompt: Connected accounts block for system prompt injection
accounts_data: Raw account data for per-node prompt generation accounts_data: Raw account data for per-node prompt generation
tool_provider_map: Tool name to provider name mapping for account routing tool_provider_map: Tool name to provider name mapping for account routing
dynamic_tools_provider: Optional callback returning current
tool list (for mode switching)
""" """
self.runtime = runtime self.runtime = runtime
self.llm = llm self.llm = llm
@@ -178,6 +181,7 @@ class GraphExecutor:
self.accounts_prompt = accounts_prompt self.accounts_prompt = accounts_prompt
self.accounts_data = accounts_data self.accounts_data = accounts_data
self.tool_provider_map = tool_provider_map self.tool_provider_map = tool_provider_map
self.dynamic_tools_provider = dynamic_tools_provider
# Initialize output cleaner # Initialize output cleaner
self.cleansing_config = cleansing_config or CleansingConfig() self.cleansing_config = cleansing_config or CleansingConfig()
@@ -286,6 +290,125 @@ class GraphExecutor:
return errors return errors
# Max chars of formatted messages before proactively splitting for LLM.
_PHASE_LLM_CHAR_LIMIT = 240_000
_PHASE_LLM_MAX_DEPTH = 10
    async def _phase_llm_compact(
        self,
        conversation: Any,
        next_spec: NodeSpec,
        messages: list,
        _depth: int = 0,
    ) -> str:
        """Summarise *messages* for phase-boundary compaction.

        Uses the same recursive binary-search splitting as EventLoopNode:
        if the formatted transcript exceeds ``_PHASE_LLM_CHAR_LIMIT``, or
        the LLM rejects the prompt as too large, the message list is halved
        and each half summarised recursively via
        ``_phase_llm_compact_split``.

        Args:
            conversation: Conversation being compacted. Only read here for
                its token budget via the private ``_max_history_tokens``
                attribute (defaults to 32000 if absent) — NOTE(review):
                private-attr access, confirm this stays stable.
            next_spec: Spec of the phase being entered; its name and
                description steer what the summary must preserve.
            messages: Messages to summarise.
            _depth: Internal recursion depth counter.

        Returns:
            Summary text. At the top level (depth 0) a tool-call history
            section is appended unless the summary already contains one.

        Raises:
            RuntimeError: If recursion exceeds ``_PHASE_LLM_MAX_DEPTH``.
        """
        # Local imports — presumably to avoid an import cycle with the
        # conversation / event-loop modules (confirm against module graph).
        from framework.graph.conversation import extract_tool_call_history
        from framework.graph.event_loop_node import _is_context_too_large_error

        if _depth > self._PHASE_LLM_MAX_DEPTH:
            raise RuntimeError("Phase LLM compaction recursion limit")

        # Format messages into a readable transcript: tool results are
        # truncated to 500 chars, assistant tool-call turns show the tool
        # names plus up to 200 chars of text, everything else verbatim.
        lines: list[str] = []
        for m in messages:
            if m.role == "tool":
                c = m.content[:500] + ("..." if len(m.content) > 500 else "")
                lines.append(f"[tool result]: {c}")
            elif m.role == "assistant" and m.tool_calls:
                names = [tc.get("function", {}).get("name", "?") for tc in m.tool_calls]
                lines.append(
                    f"[assistant (calls: {', '.join(names)})]: "
                    f"{m.content[:200] if m.content else ''}"
                )
            else:
                lines.append(f"[{m.role}]: {m.content}")
        formatted = "\n\n".join(lines)

        # Proactive split: don't even attempt an LLM call we know is too big.
        if len(formatted) > self._PHASE_LLM_CHAR_LIMIT and len(messages) > 1:
            summary = await self._phase_llm_compact_split(
                conversation,
                next_spec,
                messages,
                _depth,
            )
        else:
            # Target the summary at roughly half the conversation's token
            # budget (~4 chars per token heuristic for the char figure).
            max_tokens = getattr(conversation, "_max_history_tokens", 32000)
            target_tokens = max_tokens // 2
            target_chars = target_tokens * 4
            prompt = (
                "You are compacting an AI agent's conversation history "
                "at a phase boundary.\n\n"
                f"NEXT PHASE: {next_spec.name}\n"
            )
            if next_spec.description:
                prompt += f"NEXT PHASE PURPOSE: {next_spec.description}\n"
            prompt += (
                f"\nCONVERSATION MESSAGES:\n{formatted}\n\n"
                "INSTRUCTIONS:\n"
                f"Write a summary of approximately {target_chars} characters "
                f"(~{target_tokens} tokens).\n"
                "Preserve user-stated rules, constraints, and preferences "
                "verbatim. Preserve key decisions and results from earlier "
                "phases. Preserve context needed for the next phase.\n"
            )
            summary_budget = max(1024, max_tokens // 2)
            try:
                response = await self._llm.acomplete(
                    messages=[{"role": "user", "content": prompt}],
                    system=(
                        "You are a conversation compactor. Write a detailed "
                        "summary preserving context for the next phase."
                    ),
                    max_tokens=summary_budget,
                )
                summary = response.content
            except Exception as e:
                # Reactive split: the provider told us the context was too
                # large after all. Any other failure propagates unchanged.
                if _is_context_too_large_error(e) and len(messages) > 1:
                    summary = await self._phase_llm_compact_split(
                        conversation,
                        next_spec,
                        messages,
                        _depth,
                    )
                else:
                    raise
        # Append tool history at top level only (recursive halves would
        # otherwise duplicate it in each sub-summary).
        if _depth == 0:
            tool_history = extract_tool_call_history(messages)
            if tool_history and "TOOLS ALREADY CALLED" not in summary:
                summary += "\n\n" + tool_history
        return summary
async def _phase_llm_compact_split(
self,
conversation: Any,
next_spec: NodeSpec,
messages: list,
_depth: int,
) -> str:
"""Split messages in half and summarise each half."""
mid = max(1, len(messages) // 2)
s1 = await self._phase_llm_compact(
conversation,
next_spec,
messages[:mid],
_depth + 1,
)
s2 = await self._phase_llm_compact(
conversation,
next_spec,
messages[mid:],
_depth + 1,
)
return s1 + "\n\n" + s2
async def execute( async def execute(
self, self,
graph: GraphSpec, graph: GraphSpec,
@@ -1291,9 +1414,7 @@ class GraphExecutor:
# Set current phase for phase-aware compaction # Set current phase for phase-aware compaction
continuous_conversation.set_current_phase(next_spec.id) continuous_conversation.set_current_phase(next_spec.id)
# Opportunistic compaction at transition: # Phase-boundary compaction (same flow as EventLoopNode._compact)
# 1. Prune old tool results (free, no LLM call)
# 2. If still over 80%, do a phase-graduated compact
if continuous_conversation.usage_ratio() > 0.5: if continuous_conversation.usage_ratio() > 0.5:
await continuous_conversation.prune_old_tool_results( await continuous_conversation.prune_old_tool_results(
protect_tokens=2000, protect_tokens=2000,
@@ -1307,38 +1428,62 @@ class GraphExecutor:
_data_dir = ( _data_dir = (
str(self._storage_path / "data") if self._storage_path else None str(self._storage_path / "data") if self._storage_path else None
) )
# Step 1: Structural compaction (>=80%)
if _data_dir: if _data_dir:
_pre = continuous_conversation.usage_ratio()
await continuous_conversation.compact_preserving_structure( await continuous_conversation.compact_preserving_structure(
spillover_dir=_data_dir, spillover_dir=_data_dir,
keep_recent=4, keep_recent=4,
phase_graduated=True, phase_graduated=True,
) )
# Circuit breaker: if still over budget, fall back if continuous_conversation.usage_ratio() >= 0.9 * _pre:
_post_ratio = continuous_conversation.usage_ratio() await continuous_conversation.compact_preserving_structure(
if _post_ratio >= 0.9 * _phase_ratio: spillover_dir=_data_dir,
self.logger.warning(
" Structure-preserving compaction ineffective "
"(%.0f%% -> %.0f%%), falling back to summary",
_phase_ratio * 100,
_post_ratio * 100,
)
summary = (
f"Summary of earlier phases (before {next_spec.name}). "
"See transition markers for phase details."
)
await continuous_conversation.compact(
summary,
keep_recent=4, keep_recent=4,
phase_graduated=True, phase_graduated=True,
aggressive=True,
) )
else:
# Step 2: LLM compaction (>95%)
if (
continuous_conversation.usage_ratio() > 0.95
and self._llm is not None
):
self.logger.info(
" LLM phase-boundary compaction (%.0f%% usage)",
continuous_conversation.usage_ratio() * 100,
)
try:
_llm_summary = await self._phase_llm_compact(
continuous_conversation,
next_spec,
list(continuous_conversation.messages),
)
await continuous_conversation.compact(
_llm_summary,
keep_recent=2,
phase_graduated=True,
)
except Exception as e:
self.logger.warning(
" Phase LLM compaction failed: %s",
e,
)
# Step 3: Emergency (only if still over budget)
if continuous_conversation.needs_compaction():
self.logger.warning(
" Emergency phase compaction (%.0f%%)",
continuous_conversation.usage_ratio() * 100,
)
summary = ( summary = (
f"Summary of earlier phases (before {next_spec.name}). " f"Summary of earlier phases "
f"(before {next_spec.name}). "
"See transition markers for phase details." "See transition markers for phase details."
) )
await continuous_conversation.compact( await continuous_conversation.compact(
summary, summary,
keep_recent=4, keep_recent=1,
phase_graduated=True, phase_graduated=True,
) )
@@ -1651,6 +1796,7 @@ class GraphExecutor:
node_registry=node_registry or {}, node_registry=node_registry or {},
all_tools=list(self.tools), # Full catalog for subagent tool resolution all_tools=list(self.tools), # Full catalog for subagent tool resolution
shared_node_registry=self.node_registry, # For subagent escalation routing shared_node_registry=self.node_registry, # For subagent escalation routing
dynamic_tools_provider=self.dynamic_tools_provider,
) )
VALID_NODE_TYPES = { VALID_NODE_TYPES = {
+5
View File
@@ -544,6 +544,11 @@ class NodeContext:
# the inject_input() routing chain can find. # the inject_input() routing chain can find.
shared_node_registry: dict[str, Any] = field(default_factory=dict) shared_node_registry: dict[str, Any] = field(default_factory=dict)
# Dynamic tool provider — when set, EventLoopNode rebuilds the tool
# list from this callback at the start of each iteration. Used by
# the queen to switch between building-mode and running-mode tools.
dynamic_tools_provider: Any = None # Callable[[], list[Tool]] | None
@dataclass @dataclass
class NodeResult: class NodeResult:
+14 -2
View File
@@ -137,6 +137,9 @@ class EventType(StrEnum):
WORKER_LOADED = "worker_loaded" WORKER_LOADED = "worker_loaded"
CREDENTIALS_REQUIRED = "credentials_required" CREDENTIALS_REQUIRED = "credentials_required"
# Queen mode changes (building ↔ running)
QUEEN_MODE_CHANGED = "queen_mode_changed"
# Subagent reports (one-way progress updates from sub-agents) # Subagent reports (one-way progress updates from sub-agents)
SUBAGENT_REPORT = "subagent_report" SUBAGENT_REPORT = "subagent_report"
@@ -715,15 +718,24 @@ class EventBus:
node_id: str, node_id: str,
prompt: str = "", prompt: str = "",
execution_id: str | None = None, execution_id: str | None = None,
options: list[str] | None = None,
) -> None: ) -> None:
"""Emit client input requested event (client_facing=True nodes).""" """Emit client input requested event (client_facing=True nodes).
Args:
options: Optional predefined choices for the user (1-3 items).
The frontend appends an "Other" free-text option automatically.
"""
data: dict[str, Any] = {"prompt": prompt}
if options:
data["options"] = options
await self.publish( await self.publish(
AgentEvent( AgentEvent(
type=EventType.CLIENT_INPUT_REQUESTED, type=EventType.CLIENT_INPUT_REQUESTED,
stream_id=stream_id, stream_id=stream_id,
node_id=node_id, node_id=node_id,
execution_id=execution_id, execution_id=execution_id,
data={"prompt": prompt}, data=data,
) )
) )
+9 -2
View File
@@ -511,9 +511,11 @@ class ExecutionStream:
logger.debug(f"Queued execution {execution_id} for stream {self.stream_id}") logger.debug(f"Queued execution {execution_id} for stream {self.stream_id}")
return execution_id return execution_id
# Errors that indicate a fundamental configuration or environment problem. # Errors that indicate resurrection won't help — the same error will recur.
# Resurrecting after these is pointless — the same error will recur. # Includes both configuration/environment errors and deterministic node
# failures where the conversation/state hasn't changed.
_FATAL_ERROR_PATTERNS: tuple[str, ...] = ( _FATAL_ERROR_PATTERNS: tuple[str, ...] = (
# Configuration / environment
"credential", "credential",
"authentication", "authentication",
"unauthorized", "unauthorized",
@@ -525,6 +527,11 @@ class ExecutionStream:
"permission denied", "permission denied",
"invalid api", "invalid api",
"configuration error", "configuration error",
# Deterministic node failures — resurrecting at the same node with
# the same conversation produces the same result.
"node stalled",
"ghost empty stream",
"max iterations",
) )
@classmethod @classmethod
+25
View File
@@ -38,6 +38,7 @@ DEFAULT_EVENT_TYPES = [
EventType.WORKER_LOADED, EventType.WORKER_LOADED,
EventType.CREDENTIALS_REQUIRED, EventType.CREDENTIALS_REQUIRED,
EventType.SUBAGENT_REPORT, EventType.SUBAGENT_REPORT,
EventType.QUEEN_MODE_CHANGED,
] ]
# Keepalive interval in seconds # Keepalive interval in seconds
@@ -91,6 +92,7 @@ async def handle_events(request: web.Request) -> web.StreamResponse:
"node_loop_started", "node_loop_started",
"credentials_required", "credentials_required",
"worker_loaded", "worker_loaded",
"queen_mode_changed",
} }
client_disconnected = asyncio.Event() client_disconnected = asyncio.Event()
@@ -130,6 +132,29 @@ async def handle_events(request: web.Request) -> web.StreamResponse:
"SSE connected: session='%s', sub_id='%s', types=%d", session.id, sub_id, len(event_types) "SSE connected: session='%s', sub_id='%s', types=%d", session.id, sub_id, len(event_types)
) )
# Replay buffered events that were published before this SSE connected.
# The EventBus keeps a history ring-buffer; we replay the subset that
# produces visible chat messages so the frontend never misses early
# queen output. Lifecycle events are NOT replayed to avoid duplicate
# state transitions (turn counter increments, etc.).
_REPLAY_TYPES = {
EventType.CLIENT_OUTPUT_DELTA.value,
EventType.EXECUTION_STARTED.value,
EventType.CLIENT_INPUT_REQUESTED.value,
}
event_type_values = {et.value for et in event_types}
replay_types = _REPLAY_TYPES & event_type_values
replayed = 0
for past_event in event_bus._event_history:
if past_event.type.value in replay_types:
try:
queue.put_nowait(past_event.to_dict())
replayed += 1
except asyncio.QueueFull:
break
if replayed:
logger.info("SSE replayed %d buffered events for session='%s'", replayed, session.id)
event_count = 0 event_count = 0
close_reason = "unknown" close_reason = "unknown"
try: try:
+50
View File
@@ -64,6 +64,16 @@ async def handle_trigger(request: web.Request) -> web.Response:
session_state=session_state, session_state=session_state,
) )
# Cancel queen's in-progress LLM turn so it picks up the mode change cleanly
if session.queen_executor:
node = session.queen_executor.node_registry.get("queen")
if node and hasattr(node, "cancel_current_turn"):
node.cancel_current_turn()
# Switch queen to running mode (mirrors run_agent_with_input tool behavior)
if session.mode_state is not None:
await session.mode_state.switch_to_running(source="frontend")
return web.json_response({"execution_id": execution_id}) return web.json_response({"execution_id": execution_id})
@@ -124,6 +134,35 @@ async def handle_chat(request: web.Request) -> web.Response:
return web.json_response({"error": "Queen not available"}, status=503) return web.json_response({"error": "Queen not available"}, status=503)
async def handle_queen_context(request: web.Request) -> web.Response:
"""POST /api/sessions/{session_id}/queen-context — queue context for the queen.
Unlike /chat, this does NOT trigger an LLM response. The message is
queued in the queen's injection queue and will be drained on her next
natural iteration (prefixed with [External event]:).
Body: {"message": "..."}
"""
session, err = resolve_session(request)
if err:
return err
body = await request.json()
message = body.get("message", "")
if not message:
return web.json_response({"error": "message is required"}, status=400)
queen_executor = session.queen_executor
if queen_executor is not None:
node = queen_executor.node_registry.get("queen")
if node is not None and hasattr(node, "inject_event"):
await node.inject_event(message, is_client_input=False)
return web.json_response({"status": "queued", "delivered": True})
return web.json_response({"error": "Queen not available"}, status=503)
async def handle_worker_input(request: web.Request) -> web.Response: async def handle_worker_input(request: web.Request) -> web.Response:
"""POST /api/sessions/{session_id}/worker-input — send input to waiting worker node. """POST /api/sessions/{session_id}/worker-input — send input to waiting worker node.
@@ -282,6 +321,16 @@ async def handle_stop(request: web.Request) -> web.Response:
cancelled = await stream.cancel_execution(execution_id) cancelled = await stream.cancel_execution(execution_id)
if cancelled: if cancelled:
# Cancel queen's in-progress LLM turn
if session.queen_executor:
node = session.queen_executor.node_registry.get("queen")
if node and hasattr(node, "cancel_current_turn"):
node.cancel_current_turn()
# Switch to staging (agent still loaded, ready to re-run)
if session.mode_state is not None:
await session.mode_state.switch_to_staging(source="frontend")
return web.json_response( return web.json_response(
{ {
"stopped": True, "stopped": True,
@@ -365,6 +414,7 @@ def register_routes(app: web.Application) -> None:
app.router.add_post("/api/sessions/{session_id}/trigger", handle_trigger) app.router.add_post("/api/sessions/{session_id}/trigger", handle_trigger)
app.router.add_post("/api/sessions/{session_id}/inject", handle_inject) app.router.add_post("/api/sessions/{session_id}/inject", handle_inject)
app.router.add_post("/api/sessions/{session_id}/chat", handle_chat) app.router.add_post("/api/sessions/{session_id}/chat", handle_chat)
app.router.add_post("/api/sessions/{session_id}/queen-context", handle_queen_context)
app.router.add_post("/api/sessions/{session_id}/worker-input", handle_worker_input) app.router.add_post("/api/sessions/{session_id}/worker-input", handle_worker_input)
app.router.add_post("/api/sessions/{session_id}/pause", handle_stop) app.router.add_post("/api/sessions/{session_id}/pause", handle_stop)
app.router.add_post("/api/sessions/{session_id}/resume", handle_resume) app.router.add_post("/api/sessions/{session_id}/resume", handle_resume)
+2
View File
@@ -48,6 +48,7 @@ def _get_manager(request: web.Request) -> SessionManager:
def _session_to_live_dict(session) -> dict: def _session_to_live_dict(session) -> dict:
"""Serialize a live Session to the session-primary JSON shape.""" """Serialize a live Session to the session-primary JSON shape."""
info = session.worker_info info = session.worker_info
mode_state = getattr(session, "mode_state", None)
return { return {
"session_id": session.id, "session_id": session.id,
"worker_id": session.worker_id, "worker_id": session.worker_id,
@@ -60,6 +61,7 @@ def _session_to_live_dict(session) -> dict:
"loaded_at": session.loaded_at, "loaded_at": session.loaded_at,
"uptime_seconds": round(time.time() - session.loaded_at, 1), "uptime_seconds": round(time.time() - session.loaded_at, 1),
"intro_message": getattr(session.runner, "intro_message", "") or "", "intro_message": getattr(session.runner, "intro_message", "") or "",
"queen_mode": mode_state.mode if mode_state else "building",
} }
+68 -5
View File
@@ -40,6 +40,8 @@ class Session:
runner: Any | None = None # AgentRunner runner: Any | None = None # AgentRunner
worker_runtime: Any | None = None # AgentRuntime worker_runtime: Any | None = None # AgentRuntime
worker_info: Any | None = None # AgentInfo worker_info: Any | None = None # AgentInfo
# Queen mode state (building/staging/running)
mode_state: Any = None # QueenModeState
# Judge (active when worker is loaded) # Judge (active when worker is loaded)
judge_task: asyncio.Task | None = None judge_task: asyncio.Task | None = None
escalation_sub: str | None = None escalation_sub: str | None = None
@@ -425,16 +427,26 @@ class SessionManager:
except Exception: except Exception:
logger.warning("Queen: MCP config failed to load", exc_info=True) logger.warning("Queen: MCP config failed to load", exc_info=True)
# Mode state for building/running mode switching
from framework.tools.queen_lifecycle_tools import (
QueenModeState,
register_queen_lifecycle_tools,
)
# Start in staging when the caller provided an agent, building otherwise.
initial_mode = "staging" if worker_identity else "building"
mode_state = QueenModeState(mode=initial_mode, event_bus=session.event_bus)
session.mode_state = mode_state
# Always register lifecycle tools — they check session.worker_runtime # Always register lifecycle tools — they check session.worker_runtime
# at call time, so they work even if no worker is loaded yet. # at call time, so they work even if no worker is loaded yet.
from framework.tools.queen_lifecycle_tools import register_queen_lifecycle_tools
register_queen_lifecycle_tools( register_queen_lifecycle_tools(
queen_registry, queen_registry,
session=session, session=session,
session_id=session.id, session_id=session.id,
session_manager=self, session_manager=self,
manager_session_id=session.id, manager_session_id=session.id,
mode_state=mode_state,
) )
# Monitoring tools need concrete worker paths — only register when present # Monitoring tools need concrete worker paths — only register when present
@@ -452,6 +464,32 @@ class SessionManager:
queen_tools = list(queen_registry.get_tools().values()) queen_tools = list(queen_registry.get_tools().values())
queen_tool_executor = queen_registry.get_executor() queen_tool_executor = queen_registry.get_executor()
# Partition tools into mode-specific sets
from framework.agents.hive_coder.nodes import (
_QUEEN_BUILDING_TOOLS,
_QUEEN_RUNNING_TOOLS,
_QUEEN_STAGING_TOOLS,
)
building_names = set(_QUEEN_BUILDING_TOOLS)
staging_names = set(_QUEEN_STAGING_TOOLS)
running_names = set(_QUEEN_RUNNING_TOOLS)
registered_names = {t.name for t in queen_tools}
missing_building = building_names - registered_names
if missing_building:
logger.warning(
"Queen: %d/%d building tools NOT registered: %s",
len(missing_building),
len(building_names),
sorted(missing_building),
)
logger.info("Queen: registered tools: %s", sorted(registered_names))
mode_state.building_tools = [t for t in queen_tools if t.name in building_names]
mode_state.staging_tools = [t for t in queen_tools if t.name in staging_names]
mode_state.running_tools = [t for t in queen_tools if t.name in running_names]
# Build queen graph with adjusted prompt + tools # Build queen graph with adjusted prompt + tools
_orig_node = _queen_graph.nodes[0] _orig_node = _queen_graph.nodes[0]
base_prompt = _orig_node.system_prompt or "" base_prompt = _orig_node.system_prompt or ""
@@ -493,12 +531,37 @@ class SessionManager:
storage_path=queen_dir, storage_path=queen_dir,
loop_config=queen_graph.loop_config, loop_config=queen_graph.loop_config,
execution_id=session.id, execution_id=session.id,
dynamic_tools_provider=mode_state.get_current_tools,
) )
session.queen_executor = executor session.queen_executor = executor
# Wire inject_notification so mode switches notify the queen LLM
async def _inject_mode_notification(content: str) -> None:
node = executor.node_registry.get("queen")
if node is not None and hasattr(node, "inject_event"):
await node.inject_event(content)
mode_state.inject_notification = _inject_mode_notification
# Auto-switch to staging when worker execution finishes naturally
from framework.runtime.event_bus import EventType as _ET
async def _on_worker_done(event):
if event.stream_id == "queen":
return
if mode_state.mode == "running":
await mode_state.switch_to_staging(source="auto")
session.event_bus.subscribe(
event_types=[_ET.EXECUTION_COMPLETED, _ET.EXECUTION_FAILED],
handler=_on_worker_done,
)
logger.info( logger.info(
"Queen starting with %d tools: %s", "Queen starting in %s mode with %d tools: %s",
len(queen_tools), mode_state.mode,
[t.name for t in queen_tools], len(mode_state.get_current_tools()),
[t.name for t in mode_state.get_current_tools()],
) )
result = await executor.execute( result = await executor.execute(
graph=queen_graph, graph=queen_graph,
+329 -3
View File
@@ -36,7 +36,7 @@ from __future__ import annotations
import asyncio import asyncio
import json import json
import logging import logging
from dataclasses import dataclass from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any from typing import TYPE_CHECKING, Any
@@ -66,6 +66,125 @@ class WorkerSessionAdapter:
worker_path: Path | None = None worker_path: Path | None = None
@dataclass
class QueenModeState:
"""Mutable state container for queen operating mode.
Three modes: building staging running.
Shared between the dynamic_tools_provider callback and tool handlers
that trigger mode transitions.
"""
mode: str = "building" # "building", "staging", or "running"
building_tools: list = field(default_factory=list) # list[Tool]
staging_tools: list = field(default_factory=list) # list[Tool]
running_tools: list = field(default_factory=list) # list[Tool]
inject_notification: Any = None # async (str) -> None
event_bus: Any = None # EventBus — for emitting QUEEN_MODE_CHANGED events
def get_current_tools(self) -> list:
"""Return tools for the current mode."""
if self.mode == "running":
return list(self.running_tools)
if self.mode == "staging":
return list(self.staging_tools)
return list(self.building_tools)
async def _emit_mode_event(self) -> None:
"""Publish a QUEEN_MODE_CHANGED event so the frontend updates the tag."""
if self.event_bus is not None:
await self.event_bus.publish(
AgentEvent(
type=EventType.QUEEN_MODE_CHANGED,
stream_id="queen",
data={"mode": self.mode},
)
)
async def switch_to_running(self, source: str = "tool") -> None:
"""Switch to running mode and notify the queen.
Args:
source: Who triggered the switch "tool" (queen LLM),
"frontend" (user clicked Run), or "auto" (system).
"""
if self.mode == "running":
return
self.mode = "running"
tool_names = [t.name for t in self.running_tools]
logger.info("Queen mode → running (source=%s, tools: %s)", source, tool_names)
await self._emit_mode_event()
if self.inject_notification:
if source == "frontend":
msg = (
"[MODE CHANGE] The user clicked Run in the UI. Switched to RUNNING mode. "
"Worker is now executing. You have monitoring/lifecycle tools: "
+ ", ".join(tool_names)
+ "."
)
else:
msg = (
"[MODE CHANGE] Switched to RUNNING mode. "
"Worker is executing. You now have monitoring/lifecycle tools: "
+ ", ".join(tool_names)
+ "."
)
await self.inject_notification(msg)
async def switch_to_staging(self, source: str = "tool") -> None:
"""Switch to staging mode and notify the queen.
Args:
source: Who triggered the switch "tool", "frontend", or "auto".
"""
if self.mode == "staging":
return
self.mode = "staging"
tool_names = [t.name for t in self.staging_tools]
logger.info("Queen mode → staging (source=%s, tools: %s)", source, tool_names)
await self._emit_mode_event()
if self.inject_notification:
if source == "frontend":
msg = (
"[MODE CHANGE] The user stopped the worker from the UI. "
"Switched to STAGING mode. Agent is still loaded. "
"Available tools: " + ", ".join(tool_names) + "."
)
elif source == "auto":
msg = (
"[MODE CHANGE] Worker execution completed. Switched to STAGING mode. "
"Agent is still loaded. Call run_agent_with_input(task) to run again. "
"Available tools: " + ", ".join(tool_names) + "."
)
else:
msg = (
"[MODE CHANGE] Switched to STAGING mode. "
"Agent loaded and ready. Call run_agent_with_input(task) to start, "
"or stop_worker_and_edit() to go back to building. "
"Available tools: " + ", ".join(tool_names) + "."
)
await self.inject_notification(msg)
async def switch_to_building(self, source: str = "tool") -> None:
"""Switch to building mode and notify the queen.
Args:
source: Who triggered the switch "tool", "frontend", or "auto".
"""
if self.mode == "building":
return
self.mode = "building"
tool_names = [t.name for t in self.building_tools]
logger.info("Queen mode → building (source=%s, tools: %s)", source, tool_names)
await self._emit_mode_event()
if self.inject_notification:
await self.inject_notification(
"[MODE CHANGE] Switched to BUILDING mode. "
"Lifecycle tools removed. Full coding tools restored. "
"Call load_built_agent(path) when ready to stage."
)
def build_worker_profile(runtime: AgentRuntime, agent_path: Path | str | None = None) -> str: def build_worker_profile(runtime: AgentRuntime, agent_path: Path | str | None = None) -> str:
"""Build a worker capability profile from its graph/goal definition. """Build a worker capability profile from its graph/goal definition.
@@ -120,6 +239,8 @@ def register_queen_lifecycle_tools(
# Server context — enables load_built_agent tool # Server context — enables load_built_agent tool
session_manager: Any = None, session_manager: Any = None,
manager_session_id: str | None = None, manager_session_id: str | None = None,
# Mode switching
mode_state: QueenModeState | None = None,
) -> int: ) -> int:
"""Register queen lifecycle tools. """Register queen lifecycle tools.
@@ -136,6 +257,9 @@ def register_queen_lifecycle_tools(
for ``load_built_agent`` to hot-load a worker. for ``load_built_agent`` to hot-load a worker.
manager_session_id: (Server only) The session's ID in the manager, manager_session_id: (Server only) The session's ID in the manager,
used with ``session_manager.load_worker()``. used with ``session_manager.load_worker()``.
mode_state: (Optional) Mutable mode state for building/running
mode switching. When provided, load_built_agent switches to
running mode and stop_worker_and_edit switches to building mode.
Returns the number of tools registered. Returns the number of tools registered.
""" """
@@ -343,6 +467,75 @@ def register_queen_lifecycle_tools(
registry.register("stop_worker", _stop_tool, lambda inputs: stop_worker()) registry.register("stop_worker", _stop_tool, lambda inputs: stop_worker())
tools_registered += 1 tools_registered += 1
# --- stop_worker_and_edit -------------------------------------------------
async def stop_worker_and_edit() -> str:
"""Stop the worker and switch to building mode for editing the agent."""
stop_result = await stop_worker()
# Switch to building mode
if mode_state is not None:
await mode_state.switch_to_building()
result = json.loads(stop_result)
result["mode"] = "building"
result["message"] = (
"Worker stopped. You are now in building mode. "
"Use your coding tools to modify the agent, then call "
"load_built_agent(path) to stage it again."
)
return json.dumps(result)
_stop_edit_tool = Tool(
name="stop_worker_and_edit",
description=(
"Stop the running worker and switch to building mode. "
"Use this when you need to modify the agent's code, nodes, or configuration. "
"After editing, call load_built_agent(path) to reload and run."
),
parameters={"type": "object", "properties": {}},
)
registry.register(
"stop_worker_and_edit", _stop_edit_tool, lambda inputs: stop_worker_and_edit()
)
tools_registered += 1
# --- stop_worker (Running → Staging) -------------------------------------
async def stop_worker_to_staging() -> str:
"""Stop the running worker and switch to staging mode.
After stopping, ask the user whether they want to:
1. Re-run the agent with new input call run_agent_with_input(task)
2. Edit the agent code call stop_worker_and_edit() to go to building mode
"""
stop_result = await stop_worker()
# Switch to staging mode
if mode_state is not None:
await mode_state.switch_to_staging()
result = json.loads(stop_result)
result["mode"] = "staging"
result["message"] = (
"Worker stopped. You are now in staging mode. "
"Ask the user: would they like to re-run with new input, "
"or edit the agent code?"
)
return json.dumps(result)
_stop_worker_tool = Tool(
name="stop_worker",
description=(
"Stop the running worker and switch to staging mode. "
"After stopping, ask the user whether they want to re-run "
"with new input or edit the agent code."
),
parameters={"type": "object", "properties": {}},
)
registry.register("stop_worker", _stop_worker_tool, lambda inputs: stop_worker_to_staging())
tools_registered += 1
# --- get_worker_status ---------------------------------------------------- # --- get_worker_status ----------------------------------------------------
def _get_event_bus(): def _get_event_bus():
@@ -648,7 +841,7 @@ def register_queen_lifecycle_tools(
injectable = stream.get_injectable_nodes() injectable = stream.get_injectable_nodes()
if injectable: if injectable:
target_node_id = injectable[0]["node_id"] target_node_id = injectable[0]["node_id"]
ok = await stream.inject_input(target_node_id, content) ok = await stream.inject_input(target_node_id, content, is_client_input=True)
if ok: if ok:
return json.dumps( return json.dumps(
{ {
@@ -818,11 +1011,24 @@ def register_queen_lifecycle_tools(
str(resolved_path), str(resolved_path),
) )
info = updated_session.worker_info info = updated_session.worker_info
# Switch to staging mode after successful load
if mode_state is not None:
await mode_state.switch_to_staging()
worker_name = info.name if info else updated_session.worker_id
return json.dumps( return json.dumps(
{ {
"status": "loaded", "status": "loaded",
"mode": "staging",
"message": (
f"Successfully loaded '{worker_name}'. "
"You are now in STAGING mode. "
"Call run_agent_with_input(task) to start the worker, "
"or stop_worker_and_edit() to go back to building."
),
"worker_id": updated_session.worker_id, "worker_id": updated_session.worker_id,
"worker_name": info.name if info else updated_session.worker_id, "worker_name": worker_name,
"goal": info.goal_name if info else "", "goal": info.goal_name if info else "",
"node_count": info.node_count if info else 0, "node_count": info.node_count if info else 0,
} }
@@ -857,5 +1063,125 @@ def register_queen_lifecycle_tools(
) )
tools_registered += 1 tools_registered += 1
# --- run_agent_with_input ------------------------------------------------
async def run_agent_with_input(task: str) -> str:
"""Run the loaded worker agent with the given task input.
Performs preflight checks (credentials, MCP resync), triggers the
worker's default entry point, and switches to running mode.
"""
runtime = _get_runtime()
if runtime is None:
return json.dumps({"error": "No worker loaded in this session."})
try:
# Pre-flight: validate credentials and resync MCP servers.
loop = asyncio.get_running_loop()
async def _preflight():
cred_error: CredentialError | None = None
try:
await loop.run_in_executor(
None,
lambda: validate_credentials(
runtime.graph.nodes,
interactive=False,
skip=False,
),
)
except CredentialError as e:
cred_error = e
runner = getattr(session, "runner", None)
if runner:
try:
await loop.run_in_executor(
None,
lambda: runner._tool_registry.resync_mcp_servers_if_needed(),
)
except Exception as e:
logger.warning("MCP resync failed: %s", e)
if cred_error is not None:
raise cred_error
try:
await asyncio.wait_for(_preflight(), timeout=_START_PREFLIGHT_TIMEOUT)
except TimeoutError:
logger.warning(
"run_agent_with_input preflight timed out after %ds — proceeding",
_START_PREFLIGHT_TIMEOUT,
)
except CredentialError:
raise # handled below
# Resume timers in case they were paused by a previous stop
runtime.resume_timers()
# Get session state from any prior execution for memory continuity
session_state = runtime._get_primary_session_state("default") or {}
if session_id:
session_state["resume_session_id"] = session_id
exec_id = await runtime.trigger(
entry_point_id="default",
input_data={"user_request": task},
session_state=session_state,
)
# Switch to running mode
if mode_state is not None:
await mode_state.switch_to_running()
return json.dumps(
{
"status": "started",
"mode": "running",
"execution_id": exec_id,
"task": task,
}
)
except CredentialError as e:
error_payload = credential_errors_to_json(e)
error_payload["agent_path"] = str(getattr(session, "worker_path", "") or "")
bus = getattr(session, "event_bus", None)
if bus is not None:
await bus.publish(
AgentEvent(
type=EventType.CREDENTIALS_REQUIRED,
stream_id="queen",
data=error_payload,
)
)
return json.dumps(error_payload)
except Exception as e:
return json.dumps({"error": f"Failed to start worker: {e}"})
_run_input_tool = Tool(
name="run_agent_with_input",
description=(
"Run the loaded worker agent with the given task. Validates credentials, "
"triggers the worker's default entry point, and switches to running mode. "
"Use this after loading an agent (staging mode) to start execution."
),
parameters={
"type": "object",
"properties": {
"task": {
"type": "string",
"description": "The task or input for the worker agent to execute",
},
},
"required": ["task"],
},
)
registry.register(
"run_agent_with_input", _run_input_tool, lambda inputs: run_agent_with_input(**inputs)
)
tools_registered += 1
logger.info("Registered %d queen lifecycle tools", tools_registered) logger.info("Registered %d queen lifecycle tools", tools_registered)
return tools_registered return tools_registered
+36 -4
View File
@@ -475,7 +475,10 @@ class AdenTUI(App):
from framework.graph.executor import GraphExecutor from framework.graph.executor import GraphExecutor
from framework.runner.tool_registry import ToolRegistry from framework.runner.tool_registry import ToolRegistry
from framework.runtime.core import Runtime from framework.runtime.core import Runtime
from framework.tools.queen_lifecycle_tools import register_queen_lifecycle_tools from framework.tools.queen_lifecycle_tools import (
QueenModeState,
register_queen_lifecycle_tools,
)
from framework.tools.worker_monitoring_tools import register_worker_monitoring_tools from framework.tools.worker_monitoring_tools import register_worker_monitoring_tools
log = logging.getLogger("tui.queen") log = logging.getLogger("tui.queen")
@@ -536,12 +539,16 @@ class AdenTUI(App):
except Exception: except Exception:
log.warning("Queen: MCP config failed to load", exc_info=True) log.warning("Queen: MCP config failed to load", exc_info=True)
# Worker is already loaded in TUI path → start in staging mode.
mode_state = QueenModeState(mode="staging", event_bus=event_bus)
register_queen_lifecycle_tools( register_queen_lifecycle_tools(
queen_registry, queen_registry,
worker_runtime=self.runtime, worker_runtime=self.runtime,
event_bus=event_bus, event_bus=event_bus,
storage_path=storage_path, storage_path=storage_path,
session_id=session_id, session_id=session_id,
mode_state=mode_state,
) )
register_worker_monitoring_tools( register_worker_monitoring_tools(
queen_registry, queen_registry,
@@ -553,6 +560,20 @@ class AdenTUI(App):
queen_tools = list(queen_registry.get_tools().values()) queen_tools = list(queen_registry.get_tools().values())
queen_tool_executor = queen_registry.get_executor() queen_tool_executor = queen_registry.get_executor()
# Partition tools into mode-specific sets
from framework.agents.hive_coder.nodes import (
_QUEEN_BUILDING_TOOLS,
_QUEEN_RUNNING_TOOLS,
_QUEEN_STAGING_TOOLS,
)
building_names = set(_QUEEN_BUILDING_TOOLS)
staging_names = set(_QUEEN_STAGING_TOOLS)
running_names = set(_QUEEN_RUNNING_TOOLS)
mode_state.building_tools = [t for t in queen_tools if t.name in building_names]
mode_state.staging_tools = [t for t in queen_tools if t.name in staging_names]
mode_state.running_tools = [t for t in queen_tools if t.name in running_names]
# Build worker profile for queen's system prompt. # Build worker profile for queen's system prompt.
from framework.tools.queen_lifecycle_tools import build_worker_profile from framework.tools.queen_lifecycle_tools import build_worker_profile
@@ -593,12 +614,23 @@ class AdenTUI(App):
stream_id="queen", stream_id="queen",
storage_path=queen_dir, storage_path=queen_dir,
loop_config=queen_graph.loop_config, loop_config=queen_graph.loop_config,
dynamic_tools_provider=mode_state.get_current_tools,
) )
self._queen_executor = executor self._queen_executor = executor
# Wire inject_notification so mode switches notify the queen LLM
async def _inject_mode_notification(content: str) -> None:
node = executor.node_registry.get("queen")
if node is not None and hasattr(node, "inject_event"):
await node.inject_event(content)
mode_state.inject_notification = _inject_mode_notification
log.info( log.info(
"Queen starting with %d tools: %s", "Queen starting in %s mode with %d tools: %s",
len(queen_tools), mode_state.mode,
[t.name for t in queen_tools], len(mode_state.get_current_tools()),
[t.name for t in mode_state.get_current_tools()],
) )
# The queen's event_loop node runs forever (continuous mode). # The queen's event_loop node runs forever (continuous mode).
# It blocks on _await_user_input() after each LLM turn, # It blocks on _await_user_input() after each LLM turn,
+4
View File
@@ -37,6 +37,10 @@ export const executionApi = {
chat: (sessionId: string, message: string) => chat: (sessionId: string, message: string) =>
api.post<ChatResult>(`/sessions/${sessionId}/chat`, { message }), api.post<ChatResult>(`/sessions/${sessionId}/chat`, { message }),
/** Queue context for the queen without triggering an LLM response. */
queenContext: (sessionId: string, message: string) =>
api.post<ChatResult>(`/sessions/${sessionId}/queen-context`, { message }),
workerInput: (sessionId: string, message: string) => workerInput: (sessionId: string, message: string) =>
api.post<ChatResult>(`/sessions/${sessionId}/worker-input`, { message }), api.post<ChatResult>(`/sessions/${sessionId}/worker-input`, { message }),
+3
View File
@@ -12,6 +12,8 @@ export interface LiveSession {
loaded_at: number; loaded_at: number;
uptime_seconds: number; uptime_seconds: number;
intro_message?: string; intro_message?: string;
/** Queen operating mode — "building", "staging", or "running" */
queen_mode?: "building" | "staging" | "running";
/** Present in 409 conflict responses when worker is still loading */ /** Present in 409 conflict responses when worker is still loading */
loading?: boolean; loading?: boolean;
} }
@@ -271,6 +273,7 @@ export type EventTypeName =
| "escalation_requested" | "escalation_requested"
| "worker_loaded" | "worker_loaded"
| "credentials_required" | "credentials_required"
| "queen_mode_changed"
| "subagent_report"; | "subagent_report";
export interface AgentEvent { export interface AgentEvent {
+3 -2
View File
@@ -31,6 +31,7 @@ interface AgentGraphProps {
version?: string; version?: string;
runState?: RunState; runState?: RunState;
building?: boolean; building?: boolean;
queenMode?: "building" | "staging" | "running";
} }
// --- Extracted RunButton so hover state survives parent re-renders --- // --- Extracted RunButton so hover state survives parent re-renders ---
@@ -145,7 +146,7 @@ function truncateLabel(label: string, availablePx: number, fontSize: number): st
return label.slice(0, Math.max(maxChars - 1, 1)) + "\u2026"; return label.slice(0, Math.max(maxChars - 1, 1)) + "\u2026";
} }
export default function AgentGraph({ nodes, title: _title, onNodeClick, onRun, onPause, version, runState: externalRunState, building }: AgentGraphProps) { export default function AgentGraph({ nodes, title: _title, onNodeClick, onRun, onPause, version, runState: externalRunState, building, queenMode }: AgentGraphProps) {
const [localRunState, setLocalRunState] = useState<RunState>("idle"); const [localRunState, setLocalRunState] = useState<RunState>("idle");
const runState = externalRunState ?? localRunState; const runState = externalRunState ?? localRunState;
const runBtnRef = useRef<HTMLButtonElement>(null); const runBtnRef = useRef<HTMLButtonElement>(null);
@@ -277,7 +278,7 @@ export default function AgentGraph({ nodes, title: _title, onNodeClick, onRun, o
</span> </span>
)} )}
</div> </div>
<RunButton runState={runState} disabled={nodes.length === 0} onRun={handleRun} onPause={onPause ?? (() => {})} btnRef={runBtnRef} /> <RunButton runState={runState} disabled={nodes.length === 0 || queenMode === "building"} onRun={handleRun} onPause={onPause ?? (() => {})} btnRef={runBtnRef} />
</div> </div>
<div className="flex-1 flex items-center justify-center px-5"> <div className="flex-1 flex items-center justify-center px-5">
{building ? ( {building ? (
+128 -142
View File
@@ -1,6 +1,7 @@
import { memo, useState, useRef, useEffect } from "react"; import { memo, useState, useRef, useEffect } from "react";
import { Send, Square, Crown, Cpu, Check, Loader2, Reply } from "lucide-react"; import { Send, Square, Crown, Cpu, Check, Loader2 } from "lucide-react";
import MarkdownContent from "@/components/MarkdownContent"; import MarkdownContent from "@/components/MarkdownContent";
import QuestionWidget from "@/components/QuestionWidget";
export interface ChatMessage { export interface ChatMessage {
id: string; id: string;
@@ -20,15 +21,25 @@ interface ChatPanelProps {
messages: ChatMessage[]; messages: ChatMessage[];
onSend: (message: string, thread: string) => void; onSend: (message: string, thread: string) => void;
isWaiting?: boolean; isWaiting?: boolean;
/** When true a worker is thinking (not yet streaming) */
isWorkerWaiting?: boolean;
/** When true the queen is busy (typing or streaming) — shows the stop button */
isBusy?: boolean;
activeThread: string; activeThread: string;
/** When true, the worker is waiting for user input — shows inline reply box */
workerAwaitingInput?: boolean;
/** When true, the input is disabled (e.g. during loading) */ /** When true, the input is disabled (e.g. during loading) */
disabled?: boolean; disabled?: boolean;
/** Called when user clicks the stop button to cancel the queen's current turn */ /** Called when user clicks the stop button to cancel the queen's current turn */
onCancel?: () => void; onCancel?: () => void;
/** Called when user submits a reply to the worker's input request */ /** Pending question from ask_user — replaces textarea when present */
onWorkerReply?: (message: string) => void; pendingQuestion?: string | null;
/** Options for the pending question */
pendingOptions?: string[] | null;
/** Called when user submits an answer to the pending question */
onQuestionSubmit?: (answer: string, isOther: boolean) => void;
/** Called when user dismisses the pending question without answering */
onQuestionDismiss?: () => void;
/** Queen operating mode — shown as a tag on queen messages */
queenMode?: "building" | "staging" | "running";
} }
const queenColor = "hsl(45,95%,58%)"; const queenColor = "hsl(45,95%,58%)";
@@ -133,76 +144,7 @@ function ToolActivityRow({ content }: { content: string }) {
); );
} }
/** Inline reply box that appears below a worker's input request in the chat thread. */ const MessageBubble = memo(function MessageBubble({ msg, queenMode }: { msg: ChatMessage; queenMode?: "building" | "staging" | "running" }) {
function WorkerInputReply({ onSubmit, disabled }: { onSubmit: (text: string) => void; disabled?: boolean }) {
const [value, setValue] = useState("");
const [sent, setSent] = useState(false);
const inputRef = useRef<HTMLTextAreaElement>(null);
useEffect(() => {
if (!disabled && !sent) inputRef.current?.focus();
}, [disabled, sent]);
const handleSubmit = (e: React.FormEvent) => {
e.preventDefault();
if (!value.trim() || sent) return;
onSubmit(value.trim());
setSent(true);
};
if (sent) {
return (
<div className="ml-10 flex items-center gap-1.5 text-[11px] text-muted-foreground py-1">
<Check className="w-3 h-3 text-emerald-500" />
<span>Response sent</span>
</div>
);
}
return (
<form onSubmit={handleSubmit} className="ml-10 mt-1">
<div
className="flex items-center gap-2 rounded-xl px-3 py-2 border transition-colors"
style={{
backgroundColor: `${workerColor}08`,
borderColor: `${workerColor}30`,
}}
>
<Reply className="w-3.5 h-3.5 flex-shrink-0" style={{ color: workerColor }} />
<textarea
ref={inputRef}
rows={1}
value={value}
onChange={(e) => {
setValue(e.target.value);
const ta = e.target;
ta.style.height = "auto";
ta.style.height = `${Math.min(ta.scrollHeight, 120)}px`;
}}
onKeyDown={(e) => {
if (e.key === "Enter" && !e.shiftKey) {
e.preventDefault();
handleSubmit(e);
}
}}
placeholder="Reply to worker..."
disabled={disabled}
className="flex-1 bg-transparent text-sm text-foreground outline-none placeholder:text-muted-foreground disabled:opacity-50 resize-none overflow-y-auto"
/>
<button
type="submit"
disabled={!value.trim() || disabled}
className="p-1.5 rounded-lg transition-opacity disabled:opacity-30 hover:opacity-90"
style={{ backgroundColor: workerColor, color: "white" }}
>
<Send className="w-3.5 h-3.5" />
</button>
</div>
</form>
);
}
const MessageBubble = memo(function MessageBubble({ msg }: { msg: ChatMessage }) {
const isUser = msg.type === "user"; const isUser = msg.type === "user";
const isQueen = msg.role === "queen"; const isQueen = msg.role === "queen";
const color = getColor(msg.agent, msg.role); const color = getColor(msg.agent, msg.role);
@@ -257,7 +199,13 @@ const MessageBubble = memo(function MessageBubble({ msg }: { msg: ChatMessage })
isQueen ? "bg-primary/15 text-primary" : "bg-muted text-muted-foreground" isQueen ? "bg-primary/15 text-primary" : "bg-muted text-muted-foreground"
}`} }`}
> >
{isQueen ? "Queen" : "Worker"} {isQueen
? queenMode === "running"
? "running mode"
: queenMode === "staging"
? "staging mode"
: "building mode"
: "Worker"}
</span> </span>
</div> </div>
<div <div
@@ -270,12 +218,14 @@ const MessageBubble = memo(function MessageBubble({ msg }: { msg: ChatMessage })
</div> </div>
</div> </div>
); );
}, (prev, next) => prev.msg.id === next.msg.id && prev.msg.content === next.msg.content); }, (prev, next) => prev.msg.id === next.msg.id && prev.msg.content === next.msg.content && prev.queenMode === next.queenMode);
export default function ChatPanel({ messages, onSend, isWaiting, activeThread, workerAwaitingInput, disabled, onCancel, onWorkerReply }: ChatPanelProps) { export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting, isBusy, activeThread, disabled, onCancel, pendingQuestion, pendingOptions, onQuestionSubmit, onQuestionDismiss, queenMode }: ChatPanelProps) {
const [input, setInput] = useState(""); const [input, setInput] = useState("");
const [readMap, setReadMap] = useState<Record<string, number>>({}); const [readMap, setReadMap] = useState<Record<string, number>>({});
const bottomRef = useRef<HTMLDivElement>(null); const bottomRef = useRef<HTMLDivElement>(null);
const scrollRef = useRef<HTMLDivElement>(null);
const stickToBottom = useRef(true);
const textareaRef = useRef<HTMLTextAreaElement>(null); const textareaRef = useRef<HTMLTextAreaElement>(null);
const threadMessages = messages.filter((m) => { const threadMessages = messages.filter((m) => {
@@ -292,10 +242,24 @@ export default function ChatPanel({ messages, onSend, isWaiting, activeThread, w
// Suppress unused var // Suppress unused var
void readMap; void readMap;
const lastMsg = threadMessages[threadMessages.length - 1]; // Autoscroll: only when user is already near the bottom
const handleScroll = () => {
const el = scrollRef.current;
if (!el) return;
const distFromBottom = el.scrollHeight - el.scrollTop - el.clientHeight;
stickToBottom.current = distFromBottom < 80;
};
useEffect(() => { useEffect(() => {
bottomRef.current?.scrollIntoView({ behavior: "smooth" }); if (stickToBottom.current) {
}, [threadMessages.length, lastMsg?.content, workerAwaitingInput]); bottomRef.current?.scrollIntoView({ behavior: "smooth" });
}
}, [threadMessages, pendingQuestion, isWaiting, isWorkerWaiting]);
// Always start pinned to bottom when switching threads
useEffect(() => {
stickToBottom.current = true;
}, [activeThread]);
const handleSubmit = (e: React.FormEvent) => { const handleSubmit = (e: React.FormEvent) => {
e.preventDefault(); e.preventDefault();
@@ -305,17 +269,6 @@ export default function ChatPanel({ messages, onSend, isWaiting, activeThread, w
if (textareaRef.current) textareaRef.current.style.height = "auto"; if (textareaRef.current) textareaRef.current.style.height = "auto";
}; };
// Find the last worker message to attach the inline reply box below.
// For explicit ask_user, this will be the worker_input_request message.
// For auto-block, this will be the last client_output_delta streamed message.
const lastWorkerMsgIdx = workerAwaitingInput
? threadMessages.reduce(
(last, m, i) =>
m.role === "worker" && m.type !== "tool_status" && m.type !== "system" ? i : last,
-1,
)
: -1;
return ( return (
<div className="flex flex-col h-full min-w-0"> <div className="flex flex-col h-full min-w-0">
{/* Compact sub-header */} {/* Compact sub-header */}
@@ -324,20 +277,44 @@ export default function ChatPanel({ messages, onSend, isWaiting, activeThread, w
</div> </div>
{/* Messages */} {/* Messages */}
<div className="flex-1 overflow-auto px-5 py-4 space-y-3"> <div ref={scrollRef} onScroll={handleScroll} className="flex-1 overflow-auto px-5 py-4 space-y-3">
{threadMessages.map((msg, idx) => ( {threadMessages.map((msg) => (
<div key={msg.id}> <div key={msg.id}>
<MessageBubble msg={msg} /> <MessageBubble msg={msg} queenMode={queenMode} />
{idx === lastWorkerMsgIdx && onWorkerReply && (
<WorkerInputReply onSubmit={onWorkerReply} />
)}
</div> </div>
))} ))}
{isWaiting && ( {isWaiting && (
<div className="flex gap-3"> <div className="flex gap-3">
<div className="w-7 h-7 rounded-xl bg-muted flex items-center justify-center"> <div
<Cpu className="w-3.5 h-3.5 text-muted-foreground" /> className="flex-shrink-0 w-9 h-9 rounded-xl flex items-center justify-center"
style={{
backgroundColor: `${queenColor}18`,
border: `1.5px solid ${queenColor}35`,
boxShadow: `0 0 12px ${queenColor}20`,
}}
>
<Crown className="w-4 h-4" style={{ color: queenColor }} />
</div>
<div className="border border-primary/20 bg-primary/5 rounded-2xl rounded-tl-md px-4 py-3">
<div className="flex gap-1.5">
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "0ms" }} />
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "150ms" }} />
<span className="w-1.5 h-1.5 rounded-full bg-muted-foreground animate-bounce" style={{ animationDelay: "300ms" }} />
</div>
</div>
</div>
)}
{isWorkerWaiting && !isWaiting && (
<div className="flex gap-3">
<div
className="flex-shrink-0 w-7 h-7 rounded-xl flex items-center justify-center"
style={{
backgroundColor: `${workerColor}18`,
border: `1.5px solid ${workerColor}35`,
}}
>
<Cpu className="w-3.5 h-3.5" style={{ color: workerColor }} />
</div> </div>
<div className="bg-muted/60 rounded-2xl rounded-tl-md px-4 py-3"> <div className="bg-muted/60 rounded-2xl rounded-tl-md px-4 py-3">
<div className="flex gap-1.5"> <div className="flex gap-1.5">
@@ -351,48 +328,57 @@ export default function ChatPanel({ messages, onSend, isWaiting, activeThread, w
<div ref={bottomRef} /> <div ref={bottomRef} />
</div> </div>
{/* Input — always connected to Queen */} {/* Input area — question widget replaces textarea when a question is pending */}
<form onSubmit={handleSubmit} className="p-4 border-t border-border"> {pendingQuestion && pendingOptions && onQuestionSubmit ? (
<div className="flex items-center gap-3 bg-muted/40 rounded-xl px-4 py-2.5 border border-border focus-within:border-primary/40 transition-colors"> <QuestionWidget
<textarea question={pendingQuestion}
ref={textareaRef} options={pendingOptions}
rows={1} onSubmit={onQuestionSubmit}
value={input} onDismiss={onQuestionDismiss}
onChange={(e) => { />
setInput(e.target.value); ) : (
const ta = e.target; <form onSubmit={handleSubmit} className="p-4">
ta.style.height = "auto"; <div className="flex items-center gap-3 bg-muted/40 rounded-xl px-4 py-2.5 border border-border focus-within:border-primary/40 transition-colors">
ta.style.height = `${Math.min(ta.scrollHeight, 160)}px`; <textarea
}} ref={textareaRef}
onKeyDown={(e) => { rows={1}
if (e.key === "Enter" && !e.shiftKey) { value={input}
e.preventDefault(); onChange={(e) => {
handleSubmit(e); setInput(e.target.value);
} const ta = e.target;
}} ta.style.height = "auto";
placeholder={disabled ? "Connecting to agent..." : "Message Queen Bee..."} ta.style.height = `${Math.min(ta.scrollHeight, 160)}px`;
disabled={disabled} }}
className="flex-1 bg-transparent text-sm text-foreground outline-none placeholder:text-muted-foreground disabled:opacity-50 disabled:cursor-not-allowed resize-none overflow-y-auto" onKeyDown={(e) => {
/> if (e.key === "Enter" && !e.shiftKey) {
{isWaiting && onCancel ? ( e.preventDefault();
<button handleSubmit(e);
type="button" }
onClick={onCancel} }}
className="p-2 rounded-lg bg-destructive text-destructive-foreground hover:opacity-90 transition-opacity" placeholder={disabled ? "Connecting to agent..." : "Message Queen Bee..."}
> disabled={disabled}
<Square className="w-4 h-4" /> className="flex-1 bg-transparent text-sm text-foreground outline-none placeholder:text-muted-foreground disabled:opacity-50 disabled:cursor-not-allowed resize-none overflow-y-auto"
</button> />
) : ( {isBusy && onCancel ? (
<button <button
type="submit" type="button"
disabled={!input.trim() || disabled} onClick={onCancel}
className="p-2 rounded-lg bg-primary text-primary-foreground disabled:opacity-30 hover:opacity-90 transition-opacity" className="p-2 rounded-lg bg-amber-500/15 text-amber-400 border border-amber-500/40 hover:bg-amber-500/25 transition-colors"
> >
<Send className="w-4 h-4" /> <Square className="w-4 h-4" />
</button> </button>
)} ) : (
</div> <button
</form> type="submit"
disabled={!input.trim() || disabled}
className="p-2 rounded-lg bg-primary text-primary-foreground disabled:opacity-30 hover:opacity-90 transition-opacity"
>
<Send className="w-4 h-4" />
</button>
)}
</div>
</form>
)}
</div> </div>
); );
} }
@@ -0,0 +1,142 @@
import { useState, useRef, useEffect, useCallback } from "react";
import { Send, MessageCircleQuestion, X } from "lucide-react";
/**
 * Props for QuestionWidget — an inline multiple-choice prompt that replaces
 * the chat textarea while a structured `ask_user` question is pending.
 */
export interface QuestionWidgetProps {
  /** The question text shown to the user */
  question: string;
  /** 1-3 predefined options. The UI appends an "Other" free-text option. */
  options: string[];
  /** Called with the selected option label or custom text, and whether "Other" was chosen */
  onSubmit: (answer: string, isOther: boolean) => void;
  /** Called when user dismisses the question without answering */
  onDismiss?: () => void;
}
/**
 * Inline multiple-choice question widget.
 *
 * Renders the pending question, its predefined options, an "Other" free-text
 * option, and a submit button. Supports keyboard shortcuts: digit keys 1..N+1
 * select an option, Enter submits. Once submitted (or while `submitted` is
 * true) it renders nothing and ignores further keystrokes.
 */
export default function QuestionWidget({ question, options, onSubmit, onDismiss }: QuestionWidgetProps) {
  const [selected, setSelected] = useState<number | null>(null);
  const [customText, setCustomText] = useState("");
  const [submitted, setSubmitted] = useState(false);
  const inputRef = useRef<HTMLInputElement>(null);
  const containerRef = useRef<HTMLDivElement>(null);

  // "Other" is always the last option index (appended after the predefined options).
  const otherIndex = options.length;
  const isOtherSelected = selected === otherIndex;

  // Focus the free-text input as soon as "Other" becomes the selection.
  useEffect(() => {
    if (isOtherSelected) {
      inputRef.current?.focus();
    }
  }, [isOtherSelected]);

  // A predefined option is submittable as-is; "Other" additionally requires non-blank text.
  const canSubmit = selected !== null && (!isOtherSelected || customText.trim().length > 0);

  const handleSubmit = useCallback(() => {
    if (!canSubmit || submitted) return;
    setSubmitted(true);
    if (isOtherSelected) {
      onSubmit(customText.trim(), true);
    } else {
      // `selected` is non-null here — guaranteed by canSubmit — hence the assertion.
      onSubmit(options[selected!], false);
    }
  }, [canSubmit, submitted, isOtherSelected, customText, options, selected, onSubmit]);

  // Keyboard: Enter to submit, number keys to select.
  // The listener lives on `window`, so it must NOT hijack keystrokes aimed at
  // editable elements elsewhere on the page (previously, typing "1" or Enter
  // into any other input/textarea would change the selection or be swallowed
  // by preventDefault). Only the widget's own "Other" input takes part in
  // Enter-to-submit; digit selection is skipped while typing in it so numbers
  // can appear in the custom answer.
  useEffect(() => {
    const handleKeyDown = (e: KeyboardEvent) => {
      if (submitted) return;
      const target = e.target as HTMLElement | null;
      const inOwnInput = target === inputRef.current;
      // Bug fix: ignore keys typed into foreign inputs/textareas/contenteditables.
      const inForeignEditable =
        !inOwnInput &&
        !!target &&
        (target.tagName === "INPUT" ||
          target.tagName === "TEXTAREA" ||
          target.isContentEditable);
      if (inForeignEditable) return;
      if (e.key === "Enter" && !e.shiftKey) {
        e.preventDefault();
        handleSubmit();
        return;
      }
      // Number keys 1..(options.length + 1) select options; the last index is "Other".
      if (!inOwnInput) {
        const num = parseInt(e.key, 10);
        if (num >= 1 && num <= options.length + 1) {
          e.preventDefault();
          setSelected(num - 1);
        }
      }
    };
    window.addEventListener("keydown", handleKeyDown);
    return () => window.removeEventListener("keydown", handleKeyDown);
  }, [handleSubmit, submitted, options.length]);

  if (submitted) return null;

  return (
    <div ref={containerRef} className="p-4">
      <div className="bg-card border border-border rounded-xl shadow-sm overflow-hidden">
        {/* Header / Question */}
        <div className="px-5 pt-4 pb-3 flex items-start gap-3">
          <div className="w-7 h-7 rounded-lg bg-primary/10 border border-primary/20 flex items-center justify-center flex-shrink-0 mt-0.5">
            <MessageCircleQuestion className="w-3.5 h-3.5 text-primary" />
          </div>
          <p className="text-sm font-medium text-foreground leading-relaxed flex-1">{question}</p>
          {onDismiss && (
            <button
              onClick={onDismiss}
              className="p-1 rounded-md text-muted-foreground hover:text-foreground hover:bg-muted/60 transition-colors flex-shrink-0"
            >
              <X className="w-4 h-4" />
            </button>
          )}
        </div>
        {/* Options */}
        <div className="px-5 pb-3 space-y-1.5">
          {options.map((option, idx) => (
            <button
              key={idx}
              onClick={() => setSelected(idx)}
              className={`w-full text-left px-4 py-2.5 rounded-lg border text-sm transition-colors ${
                selected === idx
                  ? "border-primary bg-primary/10 text-foreground"
                  : "border-border/60 bg-muted/20 text-foreground hover:border-primary/40 hover:bg-muted/40"
              }`}
            >
              <span className="text-xs text-muted-foreground mr-2">{idx + 1}.</span>
              {option}
            </button>
          ))}
          {/* "Other" — inline text input that auto-selects on focus */}
          <input
            ref={inputRef}
            type="text"
            value={customText}
            onFocus={() => setSelected(otherIndex)}
            onChange={(e) => {
              setSelected(otherIndex);
              setCustomText(e.target.value);
            }}
            placeholder="Type a custom response..."
            className={`w-full px-4 py-2.5 rounded-lg border border-dashed text-sm transition-colors bg-transparent placeholder:text-muted-foreground focus:outline-none ${
              isOtherSelected
                ? "border-primary bg-primary/10 text-foreground"
                : "border-border text-muted-foreground hover:border-primary/40"
            }`}
          />
        </div>
        {/* Submit */}
        <div className="px-5 pb-4">
          <button
            onClick={handleSubmit}
            disabled={!canSubmit}
            className="w-full flex items-center justify-center gap-2 py-2.5 rounded-lg text-sm font-medium bg-primary text-primary-foreground hover:bg-primary/90 disabled:opacity-30 disabled:cursor-not-allowed transition-colors"
          >
            <Send className="w-3.5 h-3.5" />
            Submit
          </button>
        </div>
      </div>
    </div>
  );
}
+9
View File
@@ -167,3 +167,12 @@
.animate-in.slide-in-from-right { .animate-in.slide-in-from-right {
animation: slide-in-from-right 0.2s ease-out; animation: slide-in-from-right 0.2s ease-out;
} }
/* Slide-up entrance for the question widget: rises 16px while fading in. */
@keyframes slide-in-from-bottom {
  0% {
    opacity: 0;
    transform: translateY(16px);
  }
  100% {
    opacity: 1;
    transform: translateY(0);
  }
}
.animate-in.slide-in-from-bottom {
  animation: slide-in-from-bottom 0.25s ease-out;
}
+215 -27
View File
@@ -8,6 +8,7 @@ import TopBar from "@/components/TopBar";
import { TAB_STORAGE_KEY, loadPersistedTabs, savePersistedTabs, type PersistedTabState } from "@/lib/tab-persistence"; import { TAB_STORAGE_KEY, loadPersistedTabs, savePersistedTabs, type PersistedTabState } from "@/lib/tab-persistence";
import NodeDetailPanel from "@/components/NodeDetailPanel"; import NodeDetailPanel from "@/components/NodeDetailPanel";
import CredentialsModal, { type Credential, createFreshCredentials, cloneCredentials, allRequiredCredentialsMet, clearCredentialCache } from "@/components/CredentialsModal"; import CredentialsModal, { type Credential, createFreshCredentials, cloneCredentials, allRequiredCredentialsMet, clearCredentialCache } from "@/components/CredentialsModal";
import { agentsApi } from "@/api/agents"; import { agentsApi } from "@/api/agents";
import { executionApi } from "@/api/execution"; import { executionApi } from "@/api/execution";
import { graphsApi } from "@/api/graphs"; import { graphsApi } from "@/api/graphs";
@@ -240,6 +241,8 @@ interface AgentBackendState {
/** The message ID of the current worker input request (for inline reply box) */ /** The message ID of the current worker input request (for inline reply box) */
workerInputMessageId: string | null; workerInputMessageId: string | null;
queenBuilding: boolean; queenBuilding: boolean;
/** Queen operating mode — "building" (coding), "staging" (loaded), or "running" (executing) */
queenMode: "building" | "staging" | "running";
workerRunState: "idle" | "deploying" | "running"; workerRunState: "idle" | "deploying" | "running";
currentExecutionId: string | null; currentExecutionId: string | null;
nodeLogs: Record<string, string[]>; nodeLogs: Record<string, string[]>;
@@ -247,8 +250,18 @@ interface AgentBackendState {
subagentReports: { subagent_id: string; message: string; data?: Record<string, unknown>; timestamp: string }[]; subagentReports: { subagent_id: string; message: string; data?: Record<string, unknown>; timestamp: string }[];
isTyping: boolean; isTyping: boolean;
isStreaming: boolean; isStreaming: boolean;
/** True only when the queen's LLM is actively processing (not worker) */
queenIsTyping: boolean;
/** True only when a worker's LLM is actively processing (not queen) */
workerIsTyping: boolean;
llmSnapshots: Record<string, string>; llmSnapshots: Record<string, string>;
activeToolCalls: Record<string, { name: string; done: boolean; streamId: string }>; activeToolCalls: Record<string, { name: string; done: boolean; streamId: string }>;
/** Structured question text from ask_user with options */
pendingQuestion: string | null;
/** Predefined choices from ask_user (1-3 items); UI appends "Other" */
pendingOptions: string[] | null;
/** Whether the pending question came from queen or worker */
pendingQuestionSource: "queen" | "worker" | null;
} }
function defaultAgentState(): AgentBackendState { function defaultAgentState(): AgentBackendState {
@@ -264,6 +277,7 @@ function defaultAgentState(): AgentBackendState {
awaitingInput: false, awaitingInput: false,
workerInputMessageId: null, workerInputMessageId: null,
queenBuilding: false, queenBuilding: false,
queenMode: "building",
workerRunState: "idle", workerRunState: "idle",
currentExecutionId: null, currentExecutionId: null,
nodeLogs: {}, nodeLogs: {},
@@ -271,8 +285,13 @@ function defaultAgentState(): AgentBackendState {
subagentReports: [], subagentReports: [],
isTyping: false, isTyping: false,
isStreaming: false, isStreaming: false,
queenIsTyping: false,
workerIsTyping: false,
llmSnapshots: {}, llmSnapshots: {},
activeToolCalls: {}, activeToolCalls: {},
pendingQuestion: null,
pendingOptions: null,
pendingQuestionSource: null,
}; };
} }
@@ -352,8 +371,14 @@ export default function Workspace() {
if (persisted) { if (persisted) {
const restored = { ...persisted.activeSessionByAgent }; const restored = { ...persisted.activeSessionByAgent };
const urlSessions = sessionsByAgent[initialAgent]; const urlSessions = sessionsByAgent[initialAgent];
if (urlSessions?.length && !restored[initialAgent]) { if (urlSessions?.length) {
restored[initialAgent] = urlSessions[0].id; // When a prompt was submitted from home, activate the newly created
// session (last in array) instead of the previously active one.
if (initialPrompt && hasExplicitAgent) {
restored[initialAgent] = urlSessions[urlSessions.length - 1].id;
} else if (!restored[initialAgent]) {
restored[initialAgent] = urlSessions[0].id;
}
} }
return restored; return restored;
} }
@@ -632,7 +657,11 @@ export default function Workspace() {
const result = await sessionsApi.get(existingSessionId); const result = await sessionsApi.get(existingSessionId);
if (result.loading) continue; if (result.loading) continue;
return result as LiveSession; return result as LiveSession;
} catch { } catch (pollErr) {
// 404 = agent failed to load and was cleaned up — stop immediately
if (pollErr instanceof ApiError && pollErr.status === 404) {
throw new Error("Agent failed to load");
}
if (i === maxAttempts - 1) throw loadErr; if (i === maxAttempts - 1) throw loadErr;
} }
} }
@@ -648,7 +677,13 @@ export default function Workspace() {
// failed, the throw inside the catch exits the outer try block. // failed, the throw inside the catch exits the outer try block.
const session = liveSession!; const session = liveSession!;
const displayName = formatAgentDisplayName(session.worker_name || agentType); const displayName = formatAgentDisplayName(session.worker_name || agentType);
updateAgentState(agentType, { sessionId: session.session_id, displayName }); const initialMode = session.queen_mode || (session.has_worker ? "staging" : "building");
updateAgentState(agentType, {
sessionId: session.session_id,
displayName,
queenMode: initialMode,
queenBuilding: initialMode === "building",
});
// Update the session label // Update the session label
setSessionsByAgent((prev) => { setSessionsByAgent((prev) => {
@@ -921,7 +956,7 @@ export default function Workspace() {
} catch { } catch {
// Best-effort — queen may have already finished // Best-effort — queen may have already finished
} }
updateAgentState(activeWorker, { isTyping: false, isStreaming: false }); updateAgentState(activeWorker, { isTyping: false, isStreaming: false, queenIsTyping: false, workerIsTyping: false });
}, [agentStates, activeWorker, updateAgentState]); }, [agentStates, activeWorker, updateAgentState]);
// --- Node log helper (writes into agentStates) --- // --- Node log helper (writes into agentStates) ---
@@ -1004,7 +1039,7 @@ export default function Workspace() {
case "execution_started": case "execution_started":
if (isQueen) { if (isQueen) {
turnCounterRef.current[turnKey] = currentTurn + 1; turnCounterRef.current[turnKey] = currentTurn + 1;
updateAgentState(agentType, { isTyping: true }); updateAgentState(agentType, { isTyping: true, queenIsTyping: true });
} else { } else {
// Warn if prior LLM snapshots are being dropped (edge case: execution_completed never arrived) // Warn if prior LLM snapshots are being dropped (edge case: execution_completed never arrived)
const priorSnapshots = agentStates[agentType]?.llmSnapshots || {}; const priorSnapshots = agentStates[agentType]?.llmSnapshots || {};
@@ -1015,6 +1050,7 @@ export default function Workspace() {
updateAgentState(agentType, { updateAgentState(agentType, {
isTyping: true, isTyping: true,
isStreaming: false, isStreaming: false,
workerIsTyping: true,
awaitingInput: false, awaitingInput: false,
workerRunState: "running", workerRunState: "running",
currentExecutionId: event.execution_id || agentStates[agentType]?.currentExecutionId || null, currentExecutionId: event.execution_id || agentStates[agentType]?.currentExecutionId || null,
@@ -1022,6 +1058,9 @@ export default function Workspace() {
subagentReports: [], subagentReports: [],
llmSnapshots: {}, llmSnapshots: {},
activeToolCalls: {}, activeToolCalls: {},
pendingQuestion: null,
pendingOptions: null,
pendingQuestionSource: null,
}); });
markAllNodesAs(agentType, ["running", "looping", "complete", "error"], "pending"); markAllNodesAs(agentType, ["running", "looping", "complete", "error"], "pending");
} }
@@ -1029,7 +1068,7 @@ export default function Workspace() {
case "execution_completed": case "execution_completed":
if (isQueen) { if (isQueen) {
updateAgentState(agentType, { isTyping: false }); updateAgentState(agentType, { isTyping: false, queenIsTyping: false });
} else { } else {
// Flush any remaining LLM snapshots before clearing state // Flush any remaining LLM snapshots before clearing state
const completedSnapshots = agentStates[agentType]?.llmSnapshots || {}; const completedSnapshots = agentStates[agentType]?.llmSnapshots || {};
@@ -1041,11 +1080,15 @@ export default function Workspace() {
updateAgentState(agentType, { updateAgentState(agentType, {
isTyping: false, isTyping: false,
isStreaming: false, isStreaming: false,
workerIsTyping: false,
awaitingInput: false, awaitingInput: false,
workerInputMessageId: null, workerInputMessageId: null,
workerRunState: "idle", workerRunState: "idle",
currentExecutionId: null, currentExecutionId: null,
llmSnapshots: {}, llmSnapshots: {},
pendingQuestion: null,
pendingOptions: null,
pendingQuestionSource: null,
}); });
markAllNodesAs(agentType, ["running", "looping"], "complete"); markAllNodesAs(agentType, ["running", "looping"], "complete");
@@ -1070,7 +1113,7 @@ export default function Workspace() {
// Mark streaming when LLM text is actively arriving // Mark streaming when LLM text is actively arriving
if (event.type === "llm_text_delta" || event.type === "client_output_delta") { if (event.type === "llm_text_delta" || event.type === "client_output_delta") {
updateAgentState(agentType, { isStreaming: true }); updateAgentState(agentType, { isStreaming: true, ...(isQueen ? {} : { workerIsTyping: false }) });
} }
if (event.type === "llm_text_delta" && !isQueen && event.node_id) { if (event.type === "llm_text_delta" && !isQueen && event.node_id) {
@@ -1092,8 +1135,41 @@ export default function Workspace() {
if (event.type === "client_input_requested") { if (event.type === "client_input_requested") {
console.log('[CLIENT_INPUT_REQ] stream_id:', streamId, 'isQueen:', isQueen, 'node_id:', event.node_id, 'prompt:', (event.data?.prompt as string)?.slice(0, 80), 'agentType:', agentType); console.log('[CLIENT_INPUT_REQ] stream_id:', streamId, 'isQueen:', isQueen, 'node_id:', event.node_id, 'prompt:', (event.data?.prompt as string)?.slice(0, 80), 'agentType:', agentType);
const rawOptions = event.data?.options;
const options = Array.isArray(rawOptions) ? (rawOptions as string[]) : null;
if (isQueen) { if (isQueen) {
updateAgentState(agentType, { awaitingInput: true, isTyping: false, isStreaming: false, queenBuilding: false }); const prompt = (event.data?.prompt as string) || "";
const isAutoBlock = !prompt && !options;
// Queen auto-block (empty prompt, no options) should not
// overwrite a pending worker question — the worker's
// QuestionWidget must stay visible. Use the updater form
// to read the latest state and avoid stale-closure races
// when worker and queen events arrive in the same batch.
setAgentStates(prev => {
const cur = prev[agentType] || defaultAgentState();
const workerQuestionActive = cur.pendingQuestionSource === "worker";
if (isAutoBlock && workerQuestionActive) {
return { ...prev, [agentType]: {
...cur,
awaitingInput: true,
isTyping: false,
isStreaming: false,
queenIsTyping: false,
queenBuilding: false,
}};
}
return { ...prev, [agentType]: {
...cur,
awaitingInput: true,
isTyping: false,
isStreaming: false,
queenIsTyping: false,
queenBuilding: false,
pendingQuestion: prompt || null,
pendingOptions: options,
pendingQuestionSource: "queen",
}};
});
} else { } else {
// Worker input request. // Worker input request.
// If the prompt is non-empty (explicit ask_user), create a visible // If the prompt is non-empty (explicit ask_user), create a visible
@@ -1121,18 +1197,22 @@ export default function Workspace() {
awaitingInput: true, awaitingInput: true,
isTyping: false, isTyping: false,
isStreaming: false, isStreaming: false,
queenIsTyping: false,
pendingQuestion: prompt || null,
pendingOptions: options,
pendingQuestionSource: options ? "worker" : null,
}); });
} }
} }
if (event.type === "execution_paused") { if (event.type === "execution_paused") {
updateAgentState(agentType, { isTyping: false, isStreaming: false, awaitingInput: false, workerInputMessageId: null }); updateAgentState(agentType, { isTyping: false, isStreaming: false, queenIsTyping: false, workerIsTyping: false, awaitingInput: false, workerInputMessageId: null, pendingQuestion: null, pendingOptions: null, pendingQuestionSource: null });
if (!isQueen) { if (!isQueen) {
updateAgentState(agentType, { workerRunState: "idle", currentExecutionId: null }); updateAgentState(agentType, { workerRunState: "idle", currentExecutionId: null });
markAllNodesAs(agentType, ["running", "looping"], "pending"); markAllNodesAs(agentType, ["running", "looping"], "pending");
} }
} }
if (event.type === "execution_failed") { if (event.type === "execution_failed") {
updateAgentState(agentType, { isTyping: false, isStreaming: false, awaitingInput: false, workerInputMessageId: null }); updateAgentState(agentType, { isTyping: false, isStreaming: false, queenIsTyping: false, workerIsTyping: false, awaitingInput: false, workerInputMessageId: null, pendingQuestion: null, pendingOptions: null, pendingQuestionSource: null });
if (!isQueen) { if (!isQueen) {
updateAgentState(agentType, { workerRunState: "idle", currentExecutionId: null }); updateAgentState(agentType, { workerRunState: "idle", currentExecutionId: null });
if (event.node_id) { if (event.node_id) {
@@ -1164,7 +1244,11 @@ export default function Workspace() {
case "node_loop_iteration": case "node_loop_iteration":
turnCounterRef.current[turnKey] = currentTurn + 1; turnCounterRef.current[turnKey] = currentTurn + 1;
updateAgentState(agentType, { isStreaming: false, activeToolCalls: {}, awaitingInput: false }); if (isQueen) {
updateAgentState(agentType, { isStreaming: false, activeToolCalls: {}, awaitingInput: false, pendingQuestion: null, pendingOptions: null, pendingQuestionSource: null });
} else {
updateAgentState(agentType, { isStreaming: false, workerIsTyping: true, activeToolCalls: {}, awaitingInput: false, pendingQuestion: null, pendingOptions: null, pendingQuestionSource: null });
}
if (!isQueen && event.node_id) { if (!isQueen && event.node_id) {
const pendingText = agentStates[agentType]?.llmSnapshots[event.node_id]; const pendingText = agentStates[agentType]?.llmSnapshots[event.node_id];
if (pendingText?.trim()) { if (pendingText?.trim()) {
@@ -1212,13 +1296,7 @@ export default function Workspace() {
case "tool_call_started": { case "tool_call_started": {
console.log('[TOOL_PILL] tool_call_started received:', { isQueen, nodeId: event.node_id, streamId: event.stream_id, agentType, executionId: event.execution_id, toolName: event.data?.tool_name }); console.log('[TOOL_PILL] tool_call_started received:', { isQueen, nodeId: event.node_id, streamId: event.stream_id, agentType, executionId: event.execution_id, toolName: event.data?.tool_name });
// Detect queen building: when the queen starts writing/editing files, she's building an agent // queenBuilding is now driven by queen_mode_changed events
if (isQueen) {
const tn = (event.data?.tool_name as string) || "";
if (tn === "write_file" || tn === "edit_file") {
updateAgentState(agentType, { queenBuilding: true });
}
}
if (event.node_id) { if (event.node_id) {
if (!isQueen) { if (!isQueen) {
@@ -1453,6 +1531,19 @@ export default function Workspace() {
break; break;
} }
case "queen_mode_changed": {
const rawMode = event.data?.mode as string;
const newMode: "building" | "staging" | "running" =
rawMode === "running" ? "running" : rawMode === "staging" ? "staging" : "building";
updateAgentState(agentType, {
queenMode: newMode,
queenBuilding: newMode === "building",
// Sync workerRunState so the RunButton reflects the mode
workerRunState: newMode === "running" ? "running" : "idle",
});
break;
}
case "worker_loaded": { case "worker_loaded": {
const workerName = event.data?.worker_name as string | undefined; const workerName = event.data?.worker_name as string | undefined;
const agentPathFromEvent = event.data?.agent_path as string | undefined; const agentPathFromEvent = event.data?.agent_path as string | undefined;
@@ -1561,6 +1652,11 @@ export default function Workspace() {
return; return;
} }
// If queen has a pending question widget, dismiss it when user types directly
if (agentStates[activeWorker]?.pendingQuestionSource === "queen") {
updateAgentState(activeWorker, { pendingQuestion: null, pendingOptions: null, pendingQuestionSource: null });
}
const userMsg: ChatMessage = { const userMsg: ChatMessage = {
id: makeId(), agent: "You", agentColor: "", id: makeId(), agent: "You", agentColor: "",
content: text, timestamp: "", type: "user", thread, createdAt: Date.now(), content: text, timestamp: "", type: "user", thread, createdAt: Date.now(),
@@ -1571,7 +1667,7 @@ export default function Workspace() {
s.id === activeSession.id ? { ...s, messages: [...s.messages, userMsg] } : s s.id === activeSession.id ? { ...s, messages: [...s.messages, userMsg] } : s
), ),
})); }));
updateAgentState(activeWorker, { isTyping: true }); updateAgentState(activeWorker, { isTyping: true, queenIsTyping: true });
if (state?.sessionId && state?.ready) { if (state?.sessionId && state?.ready) {
executionApi.chat(state.sessionId, text).catch((err: unknown) => { executionApi.chat(state.sessionId, text).catch((err: unknown) => {
@@ -1587,7 +1683,7 @@ export default function Workspace() {
s.id === activeSession.id ? { ...s, messages: [...s.messages, errorChatMsg] } : s s.id === activeSession.id ? { ...s, messages: [...s.messages, errorChatMsg] } : s
), ),
})); }));
updateAgentState(activeWorker, { isTyping: false, isStreaming: false }); updateAgentState(activeWorker, { isTyping: false, isStreaming: false, queenIsTyping: false });
}); });
} else { } else {
const errorMsg: ChatMessage = { const errorMsg: ChatMessage = {
@@ -1624,7 +1720,7 @@ export default function Workspace() {
})); }));
// Clear awaiting state optimistically // Clear awaiting state optimistically
updateAgentState(activeWorker, { awaitingInput: false, workerInputMessageId: null, isTyping: true }); updateAgentState(activeWorker, { awaitingInput: false, workerInputMessageId: null, isTyping: true, pendingQuestion: null, pendingOptions: null, pendingQuestionSource: null });
executionApi.workerInput(state.sessionId, text).catch((err: unknown) => { executionApi.workerInput(state.sessionId, text).catch((err: unknown) => {
const errMsg = err instanceof Error ? err.message : String(err); const errMsg = err instanceof Error ? err.message : String(err);
@@ -1643,6 +1739,90 @@ export default function Workspace() {
}); });
}, [activeWorker, activeSession, agentStates, updateAgentState]); }, [activeWorker, activeSession, agentStates, updateAgentState]);
// --- handleWorkerQuestionAnswer: route predefined answers direct to worker, "Other" through queen ---
// `answer` is the user's reply to a worker-originated question widget; `isOther`
// is true when the user chose the free-text "Other" path instead of a listed option.
const handleWorkerQuestionAnswer = useCallback((answer: string, isOther: boolean) => {
  if (!activeSession) return;
  const state = agentStates[activeWorker];
  // Capture question/options before they are cleared from UI state below.
  const question = state?.pendingQuestion || "";
  const opts = state?.pendingOptions;
  if (isOther) {
    // "Other" free-text → route through queen for evaluation
    updateAgentState(activeWorker, { pendingQuestion: null, pendingOptions: null, pendingQuestionSource: null });
    if (question && opts && state?.sessionId && state?.ready) {
      // Wrap the raw answer with the original question/options so the queen
      // has full context for evaluating the free-text reply.
      const formatted = `[Worker asked: "${question}" | Options: ${opts.join(", ")}]\nUser answered: "${answer}"`;
      // Optimistically append the user's message to the active session thread.
      const userMsg: ChatMessage = {
        id: makeId(), agent: "You", agentColor: "",
        content: answer, timestamp: "", type: "user", thread: activeWorker, createdAt: Date.now(),
      };
      setSessionsByAgent(prev => ({
        ...prev,
        [activeWorker]: prev[activeWorker].map(s =>
          s.id === activeSession.id ? { ...s, messages: [...s.messages, userMsg] } : s
        ),
      }));
      updateAgentState(activeWorker, { isTyping: true, queenIsTyping: true });
      executionApi.chat(state.sessionId, formatted).catch((err: unknown) => {
        // Surface the send failure as a system message and reset typing flags.
        const errMsg = err instanceof Error ? err.message : String(err);
        const errorChatMsg: ChatMessage = {
          id: makeId(), agent: "System", agentColor: "",
          content: `Failed to send message: ${errMsg}`,
          timestamp: "", type: "system", thread: activeWorker, createdAt: Date.now(),
        };
        setSessionsByAgent(prev => ({
          ...prev,
          [activeWorker]: prev[activeWorker].map(s =>
            s.id === activeSession.id ? { ...s, messages: [...s.messages, errorChatMsg] } : s
          ),
        }));
        updateAgentState(activeWorker, { isTyping: false, isStreaming: false, queenIsTyping: false });
      });
    } else {
      // No question context or session not ready — fall back to a normal send.
      handleSend(answer, activeWorker);
    }
  } else {
    // Predefined option → send directly to worker
    handleWorkerReply(answer);
    // Queue context for queen (fire-and-forget, no LLM response triggered)
    if (question && state?.sessionId && state?.ready) {
      const notification = `[Worker asked: "${question}" | User selected: "${answer}"]`;
      executionApi.queenContext(state.sessionId, notification).catch(() => {});
    }
  }
}, [activeWorker, activeSession, agentStates, handleWorkerReply, handleSend, updateAgentState, setSessionsByAgent]);
// --- handleQueenQuestionAnswer: deliver the answer to a queen-originated question ---
// No context wrapping is needed: the queen posed the question herself, so the
// raw answer goes straight through the normal /chat path via handleSend.
const handleQueenQuestionAnswer = useCallback((answer: string, _isOther: boolean) => {
  // Tear down the question widget first, then forward the reply as a regular send.
  const clearedQuestion = { pendingQuestion: null, pendingOptions: null, pendingQuestionSource: null };
  updateAgentState(activeWorker, clearedQuestion);
  handleSend(answer, activeWorker);
}, [activeWorker, handleSend, updateAgentState]);
// --- handleQuestionDismiss: user closed the question widget without answering ---
// Sends a synthetic dismissal message so the node blocked on the question can resume.
const handleQuestionDismiss = useCallback(() => {
  const current = agentStates[activeWorker];
  if (!current?.sessionId) return;
  const askedBy = current.pendingQuestionSource;
  const dismissedQuestion = current.pendingQuestion || "";
  // Optimistically tear down the widget before notifying the backend.
  updateAgentState(activeWorker, {
    pendingQuestion: null,
    pendingOptions: null,
    pendingQuestionSource: null,
    awaitingInput: false,
  });
  // Route the dismiss signal to whichever side asked the question (errors ignored).
  const dismissMsg = `[User dismissed the question: "${dismissedQuestion}"]`;
  const deliver = askedBy === "worker"
    ? executionApi.workerInput(current.sessionId, dismissMsg)
    : executionApi.chat(current.sessionId, dismissMsg);
  deliver.catch(() => {});
}, [agentStates, activeWorker, updateAgentState]);
const handleLoadAgent = useCallback(async (agentPath: string) => { const handleLoadAgent = useCallback(async (agentPath: string) => {
const state = agentStates[activeWorker]; const state = agentStates[activeWorker];
if (!state?.sessionId) return; if (!state?.sessionId) return;
@@ -1795,6 +1975,7 @@ export default function Workspace() {
onPause={handlePause} onPause={handlePause}
runState={activeAgentState?.workerRunState ?? "idle"} runState={activeAgentState?.workerRunState ?? "idle"}
building={activeAgentState?.queenBuilding ?? false} building={activeAgentState?.queenBuilding ?? false}
queenMode={activeAgentState?.queenMode ?? "building"}
/> />
</div> </div>
</div> </div>
@@ -1856,16 +2037,23 @@ export default function Workspace() {
messages={activeSession.messages} messages={activeSession.messages}
onSend={handleSend} onSend={handleSend}
onCancel={handleCancelQueen} onCancel={handleCancelQueen}
onWorkerReply={handleWorkerReply}
activeThread={activeWorker} activeThread={activeWorker}
isWaiting={(activeAgentState?.isTyping && !activeAgentState?.isStreaming) ?? false} isWaiting={(activeAgentState?.queenIsTyping && !activeAgentState?.isStreaming) ?? false}
workerAwaitingInput={ isWorkerWaiting={(activeAgentState?.workerIsTyping && !activeAgentState?.isStreaming) ?? false}
(activeAgentState?.awaitingInput && activeAgentState?.workerRunState === "running") ?? false isBusy={activeAgentState?.queenIsTyping ?? false}
}
disabled={ disabled={
(activeAgentState?.loading ?? true) || (activeAgentState?.loading ?? true) ||
!(activeAgentState?.queenReady) !(activeAgentState?.queenReady)
} }
queenMode={activeAgentState?.queenMode ?? "building"}
pendingQuestion={activeAgentState?.awaitingInput ? activeAgentState.pendingQuestion : null}
pendingOptions={activeAgentState?.awaitingInput ? activeAgentState.pendingOptions : null}
onQuestionSubmit={
activeAgentState?.pendingQuestionSource === "queen"
? handleQueenQuestionAnswer
: handleWorkerQuestionAnswer
}
onQuestionDismiss={handleQuestionDismiss}
/> />
)} )}
</div> </div>
+69 -3
View File
@@ -578,7 +578,11 @@ class TestClientFacingBlocking:
"""signal_shutdown should unblock a waiting client_facing node.""" """signal_shutdown should unblock a waiting client_facing node."""
llm = MockStreamingLLM( llm = MockStreamingLLM(
scenarios=[ scenarios=[
tool_call_scenario("ask_user", {"question": "Waiting..."}, tool_use_id="ask_1"), tool_call_scenario(
"ask_user",
{"question": "Waiting...", "options": ["Continue", "Stop"]},
tool_use_id="ask_1",
),
] ]
) )
bus = EventBus() bus = EventBus()
@@ -600,7 +604,11 @@ class TestClientFacingBlocking:
"""CLIENT_INPUT_REQUESTED should be published when ask_user blocks.""" """CLIENT_INPUT_REQUESTED should be published when ask_user blocks."""
llm = MockStreamingLLM( llm = MockStreamingLLM(
scenarios=[ scenarios=[
tool_call_scenario("ask_user", {"question": "Hello!"}, tool_use_id="ask_1"), tool_call_scenario(
"ask_user",
{"question": "Hello!", "options": ["Yes", "No"]},
tool_use_id="ask_1",
),
] ]
) )
bus = EventBus() bus = EventBus()
@@ -796,7 +804,7 @@ class TestClientFacingExpectingWork:
async def user_then_shutdown(): async def user_then_shutdown():
await asyncio.sleep(0.05) await asyncio.sleep(0.05)
await node.inject_event("furwise.app") await node.inject_event("furwise.app", is_client_input=True)
# Node should auto-block on "Monitoring..." text. # Node should auto-block on "Monitoring..." text.
# Give it time to reach the block, then shutdown. # Give it time to reach the block, then shutdown.
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
@@ -2027,3 +2035,61 @@ class TestExecutionId:
node_spec=node_spec, memory=SharedMemory(), goal=goal, input_data={} node_spec=node_spec, memory=SharedMemory(), goal=goal, input_data={}
) )
assert ctx.execution_id == "" assert ctx.execution_id == ""
# ---------------------------------------------------------------------------
# Subagent memory snapshot includes accumulator outputs
# ---------------------------------------------------------------------------
class TestSubagentAccumulatorMemory:
    """Verify that subagent memory construction merges accumulator outputs
    and includes the subagent's input_keys in read permissions."""

    def test_accumulator_values_merged_into_parent_data(self):
        """Keys from OutputAccumulator should appear in subagent memory."""
        # Mirror what _execute_subagent does internally: the parent's shared
        # memory holds user_request but NOT tweet_content.
        parent_memory = SharedMemory()
        parent_memory.write("user_request", "post a joke")

        # tweet_content lives only in the accumulator (set via set_output
        # before delegation).
        accumulator = OutputAccumulator(values={"tweet_content": "Hello world!"})

        # The fix: accumulator outputs are merged in, with parent values
        # winning on any key collision.
        merged = {**accumulator.to_dict(), **parent_memory.read_all()}

        # Build the subagent's memory from the merged view.
        child_memory = SharedMemory()
        for k, v in merged.items():
            child_memory.write(k, v, validate=False)

        input_keys = ["tweet_content"]
        allowed = set(merged) | set(input_keys)
        scoped = child_memory.with_permissions(read_keys=list(allowed), write_keys=[])

        # Both reads would have raised PermissionError before the fix.
        assert scoped.read("tweet_content") == "Hello world!"
        assert scoped.read("user_request") == "post a joke"

    def test_input_keys_allowed_even_if_not_in_data(self):
        """Subagent input_keys should be in read permissions even if the
        key doesn't exist in memory (returns None instead of PermissionError)."""
        parent_memory = SharedMemory()
        parent_memory.write("user_request", "hi")
        snapshot = parent_memory.read_all()

        child_memory = SharedMemory()
        for k, v in snapshot.items():
            child_memory.write(k, v, validate=False)

        # "tweet_content" is declared as an input key but absent from the data.
        allowed = set(snapshot) | {"tweet_content"}
        scoped = child_memory.with_permissions(read_keys=list(allowed), write_keys=[])

        # A missing-but-permitted key reads as None rather than raising.
        assert scoped.read("tweet_content") is None
        assert scoped.read("user_request") == "hi"
+599 -1
View File
@@ -2,11 +2,12 @@
from __future__ import annotations from __future__ import annotations
import json
from typing import Any from typing import Any
import pytest import pytest
from framework.graph.conversation import Message, NodeConversation from framework.graph.conversation import Message, NodeConversation, extract_tool_call_history
from framework.storage.conversation_store import FileConversationStore from framework.storage.conversation_store import FileConversationStore
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -930,3 +931,600 @@ class TestConversationIntegration:
assert restored.next_seq == 4 assert restored.next_seq == 4
assert restored.messages[0].content == "new msg" assert restored.messages[0].content == "new msg"
assert restored.messages[0].seq == 2 assert restored.messages[0].seq == 2
# ---------------------------------------------------------------------------
# Helpers for aggressive compaction tests
# ---------------------------------------------------------------------------
def _make_tool_call(call_id: str, name: str, args: dict) -> dict:
return {
"id": call_id,
"type": "function",
"function": {"name": name, "arguments": json.dumps(args)},
}
async def _build_tool_heavy_conversation(
    store: MockConversationStore | None = None,
) -> NodeConversation:
    """Build a conversation with many tool call pairs.

    Layout: user msg, then 5x (assistant with append_data tool_call + tool result),
    then 1x (assistant with set_output tool_call + tool result), then user msg + assistant msg.

    When *store* is given, every manually-appended message is also persisted
    via ``write_part`` so the store mirrors the in-memory state.
    """
    conv = NodeConversation(store=store)
    await conv.add_user_message("Process the data")  # seq 0

    # Five append_data pairs — the "non-essential" tool calls that aggressive
    # compaction is expected to collapse.
    for i in range(5):
        args = {"filename": "output.html", "content": "x" * 500}
        tc = [_make_tool_call(f"call_{i}", "append_data", args)]
        # Append directly to the private message list (bypassing the public
        # API) so seq numbers and tool_call wiring are controlled exactly.
        conv._messages.append(
            Message(
                seq=conv._next_seq,
                role="assistant",
                content=f"Appending part {i}",
                tool_calls=tc,
            )
        )
        if store:
            await store.write_part(conv._next_seq, conv._messages[-1].to_storage_dict())
        conv._next_seq += 1
        conv._messages.append(
            Message(
                seq=conv._next_seq,
                role="tool",
                content='{"success": true}',
                tool_use_id=f"call_{i}",
            )
        )
        if store:
            await store.write_part(conv._next_seq, conv._messages[-1].to_storage_dict())
        conv._next_seq += 1

    # set_output call — must be protected
    so_tc = [_make_tool_call("call_so", "set_output", {"key": "result", "value": "done"})]
    conv._messages.append(
        Message(seq=conv._next_seq, role="assistant", content="Setting output", tool_calls=so_tc)
    )
    if store:
        await store.write_part(conv._next_seq, conv._messages[-1].to_storage_dict())
    conv._next_seq += 1
    conv._messages.append(
        Message(
            seq=conv._next_seq,
            role="tool",
            content="Output 'result' set successfully.",
            tool_use_id="call_so",
        )
    )
    if store:
        await store.write_part(conv._next_seq, conv._messages[-1].to_storage_dict())
    conv._next_seq += 1

    # Recent messages
    await conv.add_user_message("Continue")
    await conv.add_assistant_message("Working on it")
    return conv
# ---------------------------------------------------------------------------
# Tests: aggressive structural compaction
# ---------------------------------------------------------------------------
class TestAggressiveStructuralCompaction:
    """Behavior of compact_preserving_structure in aggressive vs standard mode."""

    @pytest.mark.asyncio
    async def test_aggressive_collapses_tool_pairs(self, tmp_path):
        """Aggressive mode should collapse non-essential tool pairs into a summary."""
        conv = await _build_tool_heavy_conversation()
        await conv.compact_preserving_structure(
            spillover_dir=str(tmp_path),
            keep_recent=2,
            aggressive=True,
        )
        # Collapsed: the 5 append_data pairs (10 msgs) plus 1 freeform user msg.
        # Kept: reference msg + protected set_output pair + 2 recent = 5.
        assert conv.message_count == 5
        ref = conv.messages[0]
        assert ref.role == "user"  # the injected reference message
        assert "TOOLS ALREADY CALLED" in ref.content
        assert "append_data (5x)" in ref.content
        # The set_output pair survives intact.
        kept_assistant = conv.messages[1]
        assert kept_assistant.role == "assistant"
        assert kept_assistant.tool_calls is not None
        assert kept_assistant.tool_calls[0]["function"]["name"] == "set_output"
        assert conv.messages[2].role == "tool"
        # Recent tail untouched.
        assert conv.messages[3].content == "Continue"
        assert conv.messages[4].content == "Working on it"

    @pytest.mark.asyncio
    async def test_aggressive_preserves_set_output(self, tmp_path):
        """set_output tool calls are always protected in aggressive mode."""
        conv = await _build_tool_heavy_conversation()
        await conv.compact_preserving_structure(
            spillover_dir=str(tmp_path),
            keep_recent=2,
            aggressive=True,
        )
        # Collect every tool name still present after compaction.
        remaining_names = [
            tc["function"]["name"]
            for msg in conv.messages
            if msg.tool_calls
            for tc in msg.tool_calls
        ]
        assert "set_output" in remaining_names
        # The append_data calls were collapsed away.
        assert "append_data" not in remaining_names

    @pytest.mark.asyncio
    async def test_aggressive_preserves_errors(self, tmp_path):
        """Error tool results are always protected in aggressive mode."""
        conv = NodeConversation()
        await conv.add_user_message("Start")

        def _append(**msg_kwargs):
            # Raw append with an explicit seq, mirroring the builder helper.
            conv._messages.append(Message(seq=conv._next_seq, **msg_kwargs))
            conv._next_seq += 1

        # A successful tool pair (collapsible).
        _append(
            role="assistant",
            content="",
            tool_calls=[_make_tool_call("call_ok", "web_search", {"query": "test"})],
        )
        _append(role="tool", content="results", tool_use_id="call_ok")
        # A failing tool pair (must be preserved).
        _append(
            role="assistant",
            content="",
            tool_calls=[_make_tool_call("call_err", "web_scrape", {"url": "http://broken.com"})],
        )
        _append(
            role="tool",
            content="Connection timeout",
            tool_use_id="call_err",
            is_error=True,
        )

        await conv.add_user_message("Next")
        await conv.add_assistant_message("OK")

        await conv.compact_preserving_structure(
            spillover_dir=str(tmp_path),
            keep_recent=2,
            aggressive=True,
        )
        errors = [m for m in conv.messages if m.role == "tool" and m.is_error]
        assert len(errors) == 1
        assert errors[0].content == "Connection timeout"

    @pytest.mark.asyncio
    async def test_standard_mode_keeps_all_tool_pairs(self, tmp_path):
        """Non-aggressive mode should keep all tool pairs (existing behavior)."""
        conv = await _build_tool_heavy_conversation()
        await conv.compact_preserving_structure(
            spillover_dir=str(tmp_path),
            keep_recent=2,
            aggressive=False,
        )
        # All 6 tool pairs (12 msgs) remain structural; only the freeform user
        # msg is removed: ref + 12 + 2 recent = 15.
        assert conv.message_count == 15

    @pytest.mark.asyncio
    async def test_two_pass_sequence(self, tmp_path):
        """Standard pass then aggressive pass produces valid result."""
        conv = await _build_tool_heavy_conversation()
        spillover = str(tmp_path)

        # First pass: standard compaction keeps every structural pair.
        await conv.compact_preserving_structure(spillover_dir=spillover, keep_recent=2)
        count_after_standard = conv.message_count
        assert count_after_standard == 15  # all structural kept

        # Second pass: aggressive compaction shrinks it further.
        await conv.compact_preserving_structure(
            spillover_dir=spillover, keep_recent=2, aggressive=True
        )
        count_after_aggressive = conv.message_count
        assert count_after_aggressive < count_after_standard
        # ref + set_output pair + 2 recent = 5
        assert count_after_aggressive == 5

    @pytest.mark.asyncio
    async def test_aggressive_persists_correctly(self, tmp_path):
        """Aggressive compaction correctly updates the store."""
        store = MockConversationStore()
        conv = await _build_tool_heavy_conversation(store=store)
        await conv.compact_preserving_structure(
            spillover_dir=str(tmp_path),
            keep_recent=2,
            aggressive=True,
        )
        # The in-memory view and the persisted parts must agree afterwards.
        persisted = await store.read_parts()
        assert len(persisted) == conv.message_count
class TestExtractToolCallHistory:
    """extract_tool_call_history summarizes calls, saved files, and errors."""

    def test_basic_extraction(self):
        # Two successful tool pairs: a search and a file save.
        history = [
            Message(
                seq=0,
                role="assistant",
                content="",
                tool_calls=[_make_tool_call("c1", "web_search", {"query": "python async"})],
            ),
            Message(seq=1, role="tool", content="results", tool_use_id="c1"),
            Message(
                seq=2,
                role="assistant",
                content="",
                tool_calls=[
                    _make_tool_call(
                        "c2", "save_data", {"filename": "output.txt", "content": "data"}
                    )
                ],
            ),
            Message(seq=3, role="tool", content="saved", tool_use_id="c2"),
        ]
        summary = extract_tool_call_history(history)
        # Per-tool call counts and saved filenames are both reported.
        assert "web_search (1x)" in summary
        assert "save_data (1x)" in summary
        assert "FILES SAVED: output.txt" in summary

    def test_errors_included(self):
        failing = Message(
            seq=0,
            role="tool",
            content="Connection refused",
            is_error=True,
            tool_use_id="c1",
        )
        summary = extract_tool_call_history([failing])
        assert "ERRORS" in summary
        assert "Connection refused" in summary

    def test_empty_messages(self):
        # No messages → empty summary string.
        assert extract_tool_call_history([]) == ""
# ---------------------------------------------------------------------------
# Tests for _is_context_too_large_error
# ---------------------------------------------------------------------------
class TestIsContextTooLargeError:
    """Heuristic classification of context-window-overflow errors."""

    def test_context_window_class_name(self):
        from framework.graph.event_loop_node import _is_context_too_large_error

        # Matching on the exception class name alone is sufficient.
        class ContextWindowExceededError(Exception):
            pass

        assert _is_context_too_large_error(ContextWindowExceededError("x"))

    def test_openai_context_length(self):
        from framework.graph.event_loop_node import _is_context_too_large_error

        # OpenAI-style "maximum context length" message.
        exc = RuntimeError("This model's maximum context length is 128000 tokens")
        assert _is_context_too_large_error(exc)

    def test_anthropic_too_long(self):
        from framework.graph.event_loop_node import _is_context_too_large_error

        # Anthropic-style "prompt is too long" message.
        exc = RuntimeError("prompt is too long: 150000 tokens > 100000")
        assert _is_context_too_large_error(exc)

    def test_generic_exceeds_limit(self):
        from framework.graph.event_loop_node import _is_context_too_large_error

        exc = ValueError("Request exceeds token limit")
        assert _is_context_too_large_error(exc)

    def test_unrelated_error(self):
        from framework.graph.event_loop_node import _is_context_too_large_error

        # Ordinary failures must not be classified as context overflow.
        for unrelated in (ValueError("connection refused"), RuntimeError("timeout")):
            assert not _is_context_too_large_error(unrelated)
# ---------------------------------------------------------------------------
# Tests for _format_messages_for_summary
# ---------------------------------------------------------------------------
class TestFormatMessagesForSummary:
    """Rendering rules of EventLoopNode._format_messages_for_summary."""

    def test_user_assistant_messages(self):
        from framework.graph.event_loop_node import EventLoopNode

        rendered = EventLoopNode._format_messages_for_summary([
            Message(seq=0, role="user", content="Hello world"),
            Message(seq=1, role="assistant", content="Hi there"),
        ])
        assert "[user]: Hello world" in rendered
        assert "[assistant]: Hi there" in rendered

    def test_tool_result_truncated(self):
        from framework.graph.event_loop_node import EventLoopNode

        long_result = Message(seq=0, role="tool", content="x" * 1000, tool_use_id="c1")
        rendered = EventLoopNode._format_messages_for_summary([long_result])
        assert "[tool result]:" in rendered
        assert "..." in rendered
        # Body is clipped to 500 chars plus the ellipsis marker.
        assert len(rendered) < 600

    def test_assistant_with_tool_calls(self):
        from framework.graph.event_loop_node import EventLoopNode

        calling = Message(
            seq=0,
            role="assistant",
            content="Searching",
            tool_calls=[_make_tool_call("c1", "web_search", {"query": "test"})],
        )
        rendered = EventLoopNode._format_messages_for_summary([calling])
        assert "web_search" in rendered
        assert "[assistant (calls:" in rendered
# ---------------------------------------------------------------------------
# Tests for _llm_compact (recursive binary-search)
# ---------------------------------------------------------------------------
class TestLlmCompact:
    """Test the recursive LLM compaction with mock LLM."""

    def _make_node(self):
        """Create a minimal EventLoopNode for testing."""
        from framework.graph.event_loop_node import EventLoopNode, LoopConfig

        config = LoopConfig(max_history_tokens=32000)
        # __new__ bypasses __init__ (which would need a full runtime); only
        # the attributes _llm_compact touches are wired below.
        node = EventLoopNode.__new__(EventLoopNode)
        node._config = config
        node._event_bus = None
        node._judge = None
        node._approval_callback = None
        node._tool_executor = None
        node._adaptive_learner = None
        # Set class-level constants (already on class, but explicit)
        return node

    def _make_ctx(self, llm_responses=None, llm_error=None):
        """Create a mock NodeContext with controllable LLM.

        llm_error: exception raised by every acomplete call.
        llm_responses: list of response texts returned in order.
        Neither given: a single canned summary response.
        """
        from unittest.mock import AsyncMock, MagicMock
        from framework.graph.node import NodeSpec

        spec = NodeSpec(
            id="test",
            name="Test Node",
            description="A test node",
            node_type="event_loop",
            input_keys=[],
            output_keys=["result"],
        )
        ctx = MagicMock()
        ctx.node_spec = spec
        ctx.node_id = "test"
        ctx.stream_id = "test"
        ctx.continuous_mode = False
        ctx.runtime_logger = None
        mock_llm = AsyncMock()
        if llm_error:
            mock_llm.acomplete.side_effect = llm_error
        elif llm_responses:
            # side_effect with a list yields the responses one per call.
            responses = []
            for text in llm_responses:
                resp = MagicMock()
                resp.content = text
                responses.append(resp)
            mock_llm.acomplete.side_effect = responses
        else:
            resp = MagicMock()
            resp.content = "Summary of conversation."
            mock_llm.acomplete.return_value = resp
        ctx.llm = mock_llm
        return ctx

    @pytest.mark.asyncio
    async def test_single_call_success(self):
        node = self._make_node()
        ctx = self._make_ctx()
        msgs = [
            Message(seq=0, role="user", content="Do something"),
            Message(seq=1, role="assistant", content="Done"),
        ]
        result = await node._llm_compact(ctx, msgs, None)
        assert "Summary of conversation." in result
        # Small input → exactly one LLM call, no splitting.
        ctx.llm.acomplete.assert_called_once()

    @pytest.mark.asyncio
    async def test_context_too_large_triggers_split(self):
        """When LLM raises context error, should split and retry."""
        from unittest.mock import MagicMock

        node = self._make_node()
        call_count = 0

        async def mock_acomplete(**kwargs):
            nonlocal call_count
            call_count += 1
            # First call with full messages → fail
            # Subsequent calls with smaller chunks → succeed
            if call_count == 1:
                raise RuntimeError("This model's maximum context length is 128000 tokens")
            resp = MagicMock()
            resp.content = f"Summary part {call_count}"
            return resp

        ctx = self._make_ctx()
        ctx.llm.acomplete = mock_acomplete
        msgs = [Message(seq=i, role="user", content=f"Message {i}") for i in range(10)]
        result = await node._llm_compact(ctx, msgs, None)
        # Should have split and produced two summaries
        assert "Summary part" in result
        assert call_count >= 3  # 1 failure + 2 successful halves

    @pytest.mark.asyncio
    async def test_non_context_error_propagates(self):
        """Non-context errors should propagate, not trigger splitting."""
        node = self._make_node()
        ctx = self._make_ctx(llm_error=ValueError("API key invalid"))
        msgs = [
            Message(seq=0, role="user", content="Hello"),
            Message(seq=1, role="assistant", content="Hi"),
        ]
        with pytest.raises(ValueError, match="API key invalid"):
            await node._llm_compact(ctx, msgs, None)

    @pytest.mark.asyncio
    async def test_proactive_split_for_large_input(self):
        """Messages exceeding char limit should be split proactively."""
        node = self._make_node()
        # Lower the limit for testing (instance attribute shadows the class constant).
        node._LLM_COMPACT_CHAR_LIMIT = 100
        ctx = self._make_ctx(
            llm_responses=["Part 1 summary", "Part 2 summary"],
        )
        msgs = [
            Message(seq=0, role="user", content="x" * 80),
            Message(seq=1, role="user", content="y" * 80),
        ]
        result = await node._llm_compact(ctx, msgs, None)
        assert "Part 1 summary" in result
        assert "Part 2 summary" in result
        # LLM should have been called twice (no failure, proactive split)
        assert ctx.llm.acomplete.call_count == 2

    @pytest.mark.asyncio
    async def test_tool_history_appended_at_top_level(self):
        """Tool history should only be appended at depth 0."""
        node = self._make_node()
        ctx = self._make_ctx()
        tc = [_make_tool_call("c1", "web_search", {"query": "test"})]
        msgs = [
            Message(seq=0, role="assistant", content="", tool_calls=tc),
            Message(seq=1, role="tool", content="results", tool_use_id="c1"),
        ]
        result = await node._llm_compact(ctx, msgs, None)
        assert "TOOLS ALREADY CALLED" in result
        assert "web_search" in result
# ---------------------------------------------------------------------------
# Orphaned tool result repair
# ---------------------------------------------------------------------------
class TestRepairOrphanedToolCalls:
    """Test _repair_orphaned_tool_calls handles both directions."""

    def test_orphaned_tool_result_dropped(self):
        """Tool result with no matching tool_use should be dropped."""
        history = [
            # A tool result with no preceding assistant tool_use.
            {"role": "tool", "tool_call_id": "orphan_1", "content": "stale result"},
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "hi"},
        ]
        fixed = NodeConversation._repair_orphaned_tool_calls(history)
        # The stale result vanishes; everything else survives in order.
        assert [m["role"] for m in fixed] == ["user", "assistant"]

    def test_valid_tool_pair_preserved(self):
        """Tool result with matching tool_use should be kept."""
        history = [
            {"role": "user", "content": "search"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [{"id": "tc_1", "function": {"name": "search", "arguments": "{}"}}],
            },
            {"role": "tool", "tool_call_id": "tc_1", "content": "results"},
        ]
        fixed = NodeConversation._repair_orphaned_tool_calls(history)
        assert len(fixed) == 3
        assert fixed[2]["tool_call_id"] == "tc_1"

    def test_orphaned_tool_use_gets_stub(self):
        """Tool use with no following tool result gets a synthetic error stub."""
        history = [
            {"role": "user", "content": "search"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [{"id": "tc_1", "function": {"name": "search", "arguments": "{}"}}],
            },
            # tc_1 has no result before the next user turn.
            {"role": "user", "content": "what happened?"},
        ]
        fixed = NodeConversation._repair_orphaned_tool_calls(history)
        # A synthetic tool result is inserted between assistant and user.
        assert len(fixed) == 4
        stub = fixed[2]
        assert stub["role"] == "tool"
        assert stub["tool_call_id"] == "tc_1"
        assert "interrupted" in stub["content"].lower()

    def test_mixed_orphans(self):
        """Both orphaned results and orphaned calls handled together."""
        history = [
            # Orphaned result (no matching tool_use).
            {"role": "tool", "tool_call_id": "gone_1", "content": "old result"},
            {"role": "user", "content": "try again"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [{"id": "tc_2", "function": {"name": "fetch", "arguments": "{}"}}],
            },
            # Missing result for tc_2.
            {"role": "user", "content": "done?"},
        ]
        fixed = NodeConversation._repair_orphaned_tool_calls(history)
        # gone_1's result is dropped and tc_2 receives a stub.
        assert [m["role"] for m in fixed] == ["user", "assistant", "tool", "user"]
        assert fixed[2]["tool_call_id"] == "tc_2"
@@ -8,7 +8,7 @@ from framework.graph.executor import ExecutionResult
from framework.graph.checkpoint_config import CheckpointConfig from framework.graph.checkpoint_config import CheckpointConfig
from framework.llm import LiteLLMProvider from framework.llm import LiteLLMProvider
from framework.runner.tool_registry import ToolRegistry from framework.runner.tool_registry import ToolRegistry
from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime from framework.runtime.agent_runtime import create_agent_runtime
from framework.runtime.execution_stream import EntryPointSpec from framework.runtime.execution_stream import EntryPointSpec
from .config import default_config, metadata from .config import default_config, metadata
@@ -90,7 +90,7 @@ edges = [
source="confirm-draft", source="confirm-draft",
target="intake", target="intake",
condition=EdgeCondition.CONDITIONAL, condition=EdgeCondition.CONDITIONAL,
condition_expr="batch_complete == True and send_started == True and send_count >= 1 and sent_message_ids is not None and len(sent_message_ids) >= 1", condition_expr="batch_complete == True",
priority=1, priority=1,
), ),
] ]
@@ -251,9 +251,7 @@ class EmailReplyAgent:
errors.append(f"Terminal node '{t}' not found") errors.append(f"Terminal node '{t}' not found")
for ep_id, nid in self.entry_points.items(): for ep_id, nid in self.entry_points.items():
if nid not in node_ids: if nid not in node_ids:
errors.append( errors.append(f"Entry point '{ep_id}' references unknown node '{nid}'")
f"Entry point '{ep_id}' references unknown node '{nid}'"
)
return {"valid": len(errors) == 0, "errors": errors, "warnings": warnings} return {"valid": len(errors) == 0, "errors": errors, "warnings": warnings}
@@ -36,7 +36,9 @@ default_config = RuntimeConfig()
class AgentMetadata: class AgentMetadata:
name: str = "Email Reply Agent" name: str = "Email Reply Agent"
version: str = "1.0.0" version: str = "1.0.0"
description: str = "Filter unreplied emails, confirm recipients, send personalized replies." description: str = (
"Filter unreplied emails, confirm recipients, send personalized replies."
)
intro_message: str = "Tell me which emails you want to reply to (e.g., 'emails from @company.com in the last week')." intro_message: str = "Tell me which emails you want to reply to (e.g., 'emails from @company.com in the last week')."
@@ -83,8 +83,8 @@ confirm_draft_node = NodeSpec(
client_facing=True, client_facing=True,
max_node_visits=0, max_node_visits=0,
input_keys=["email_list", "filter_criteria"], input_keys=["email_list", "filter_criteria"],
output_keys=["batch_complete", "restart", "send_started", "send_count", "sent_message_ids", "send_failures"], output_keys=["batch_complete", "restart"],
nullable_output_keys=["batch_complete", "restart", "send_started", "send_count", "sent_message_ids", "send_failures"], nullable_output_keys=["batch_complete", "restart"],
success_criteria="User confirmed recipients and personalized replies sent for each.", success_criteria="User confirmed recipients and personalized replies sent for each.",
system_prompt="""\ system_prompt="""\
You are a Gmail reply assistant. Present emails for confirmation, then send personalized replies. You are a Gmail reply assistant. Present emails for confirmation, then send personalized replies.
@@ -99,22 +99,14 @@ You are a Gmail reply assistant. Present emails for confirmation, then send pers
**STEP 2 Handle user response:** **STEP 2 Handle user response:**
If user CONFIRMS (says yes, go ahead, sounds good, etc.): If user CONFIRMS (says yes, go ahead, sounds good, etc.):
1. Immediately call set_output("send_started", True) before any send tools. For EACH email in email_list:
2. For EACH email in email_list, call gmail_reply_email with: 1. Read the subject and snippet
2. Use tone_guidance from filter_criteria + any user-specified preferences
3. Call gmail_reply_email with:
- message_id: the email's message_id - message_id: the email's message_id
- html: personalized 2-4 sentence reply based on email context, using tone_guidance from filter_criteria and any new user preferences. - html: personalized 2-4 sentence reply based on email context
3. Track send results during this run: (The tool automatically handles recipient, subject, and threading)
- send_count: number of successful gmail_reply_email calls 4. After all replies sent, call: set_output("batch_complete", True)
- sent_message_ids: list of message_ids successfully replied to
- send_failures: list of {"message_id": "...", "error": "..."} for failed sends
4. REQUIRED completion gate:
- You MUST NOT set batch_complete=True unless send_started is True AND send_count >= 1 AND sent_message_ids is non-empty.
- If no sends succeeded, do NOT set batch_complete=True. Instead explain what failed and ask user whether to retry or restart.
5. After successful sends, call set_output in a separate turn:
- set_output("send_count", <int>)
- set_output("sent_message_ids", <list>)
- set_output("send_failures", <list>)
- set_output("batch_complete", True)
If user wants to CHANGE LOGIC/FILTER (says change filter, different criteria, not these emails, wrong emails, etc.): If user wants to CHANGE LOGIC/FILTER (says change filter, different criteria, not these emails, wrong emails, etc.):
1. Acknowledge their request 1. Acknowledge their request
@@ -1,7 +1,5 @@
"""Structural tests for Email Reply Agent.""" """Structural tests for Email Reply Agent."""
import pytest
class TestAgentStructure: class TestAgentStructure:
"""Test agent graph structure.""" """Test agent graph structure."""
+45 -113
View File
@@ -247,121 +247,34 @@ def undo_changes(path: str = "") -> str:
@mcp.tool() @mcp.tool()
def discover_mcp_tools(server_config_path: str = "") -> str: def list_agent_tools(
"""Discover available MCP tools by connecting to servers defined in a config file. server_config_path: str = "",
output_schema: str = "simple",
group: str = "all",
) -> str:
"""Discover tools available for agent building, grouped by category.
Connects to each MCP server, lists all tools with full schemas, then Connects to each MCP server, lists tools, then disconnects. Use this
disconnects. Use this to see what tools are available before designing BEFORE designing an agent to know exactly which tools exist. Only use
an agent never rely on static documentation. tools from this list in node definitions never guess or fabricate.
Args:
server_config_path: Path to mcp_servers.json (relative to project root).
Default: the hive-tools server config at tools/mcp_servers.json.
Can also point to any agent's mcp_servers.json.
Returns:
JSON listing of all tools with names, descriptions, and input schemas
"""
# Resolve config path
if not server_config_path:
# Default: look for the main hive-tools mcp_servers.json
candidates = [
os.path.join(PROJECT_ROOT, "tools", "mcp_servers.json"),
os.path.join(PROJECT_ROOT, "mcp_servers.json"),
]
config_path = None
for c in candidates:
if os.path.isfile(c):
config_path = c
break
if not config_path:
return "Error: No mcp_servers.json found. Provide server_config_path."
else:
config_path = _resolve_path(server_config_path)
if not os.path.isfile(config_path):
return f"Error: Config file not found: {server_config_path}"
try:
with open(config_path, encoding="utf-8") as f:
servers_config = json.load(f)
except (json.JSONDecodeError, OSError) as e:
return f"Error reading config: {e}"
# Import MCPClient (deferred — needs PYTHONPATH to include core/)
try:
from framework.runner.mcp_client import MCPClient, MCPServerConfig
except ImportError:
return "Error: Cannot import MCPClient. Ensure PYTHONPATH includes the core/ directory."
all_tools = []
errors = []
config_dir = os.path.dirname(config_path)
for server_name, server_conf in servers_config.items():
# Resolve cwd relative to config file location
cwd = server_conf.get("cwd", "")
if cwd and not os.path.isabs(cwd):
cwd = os.path.abspath(os.path.join(config_dir, cwd))
try:
config = MCPServerConfig(
name=server_name,
transport=server_conf.get("transport", "stdio"),
command=server_conf.get("command"),
args=server_conf.get("args", []),
env=server_conf.get("env", {}),
cwd=cwd or None,
url=server_conf.get("url"),
headers=server_conf.get("headers", {}),
)
client = MCPClient(config)
client.connect()
tools = client.list_tools()
for tool in tools:
all_tools.append(
{
"server": server_name,
"name": tool.name,
"description": tool.description,
"input_schema": tool.input_schema,
}
)
client.disconnect()
except Exception as e:
errors.append({"server": server_name, "error": str(e)})
result = {
"tools": all_tools,
"total": len(all_tools),
"servers_queried": len(servers_config),
}
if errors:
result["errors"] = errors
return json.dumps(result, indent=2, default=str)
# ── Meta-agent: Agent tool catalog ────────────────────────────────────────
@mcp.tool()
def list_agent_tools(server_config_path: str = "") -> str:
"""List all tools available for agent building from the hive-tools MCP server.
Returns tool names grouped by category. Use this BEFORE designing an agent
to know exactly which tools exist. Only use tools from this list in node
definitions never guess or fabricate tool names.
Args: Args:
server_config_path: Path to mcp_servers.json. Default: tools/mcp_servers.json server_config_path: Path to mcp_servers.json. Default: tools/mcp_servers.json
(the standard hive-tools server). Can also point to an agent's config (the standard hive-tools server). Can also point to an agent's config
to see what tools that specific agent has access to. to see what tools that specific agent has access to.
output_schema: "simple" (default) returns name and description per tool.
"full" also includes server and input_schema.
group: "all" (default) returns every category. A prefix like "gmail"
returns only that group's tools.
Returns: Returns:
JSON with tool names grouped by prefix (e.g. gmail_*, slack_*, etc.) JSON with tools grouped by prefix (e.g. gmail_*, slack_*).
""" """
if output_schema not in ("simple", "full"):
return json.dumps(
{"error": f"Invalid output_schema: {output_schema!r}. Use 'simple' or 'full'."}
)
# Resolve config path # Resolve config path
if not server_config_path: if not server_config_path:
candidates = [ candidates = [
@@ -413,27 +326,46 @@ def list_agent_tools(server_config_path: str = "") -> str:
client = MCPClient(config) client = MCPClient(config)
client.connect() client.connect()
for tool in client.list_tools(): for tool in client.list_tools():
all_tools.append({"name": tool.name, "description": tool.description}) all_tools.append(
{
"server": server_name,
"name": tool.name,
"description": tool.description,
"input_schema": tool.input_schema,
}
)
client.disconnect() client.disconnect()
except Exception as e: except Exception as e:
errors.append({"server": server_name, "error": str(e)}) errors.append({"server": server_name, "error": str(e)})
# Group by prefix (e.g., gmail_, slack_, stripe_) # Group by prefix (e.g., gmail_, slack_, stripe_)
groups: dict[str, list[str]] = {} groups: dict[str, list[dict]] = {}
for t in sorted(all_tools, key=lambda x: x["name"]): for t in sorted(all_tools, key=lambda x: x["name"]):
parts = t["name"].split("_", 1) parts = t["name"].split("_", 1)
prefix = parts[0] if len(parts) > 1 else "general" prefix = parts[0] if len(parts) > 1 else "general"
groups.setdefault(prefix, []).append(t["name"]) groups.setdefault(prefix, []).append(t)
# Filter to a specific group
if group != "all":
groups = {group: groups[group]} if group in groups else {}
# Apply output schema
if output_schema == "simple":
groups = {
prefix: [{"name": t["name"], "description": t["description"]} for t in tools]
for prefix, tools in groups.items()
}
all_names = sorted(t["name"] for tools in groups.values() for t in tools)
result: dict = { result: dict = {
"total": len(all_tools), "total": len(all_names),
"tools_by_category": groups, "tools_by_category": groups,
"all_tool_names": sorted(t["name"] for t in all_tools), "all_tool_names": all_names,
} }
if errors: if errors:
result["errors"] = errors result["errors"] = errors
return json.dumps(result, indent=2) return json.dumps(result, indent=2, default=str)
# ── Meta-agent: Agent tool validation ───────────────────────────────────── # ── Meta-agent: Agent tool validation ─────────────────────────────────────
@@ -564,7 +496,7 @@ def validate_agent_tools(agent_path: str) -> str:
result["missing_tools"] = missing_by_node result["missing_tools"] = missing_by_node
result["message"] = ( result["message"] = (
f"FAIL: {sum(len(v) for v in missing_by_node.values())} tool(s) declared " f"FAIL: {sum(len(v) for v in missing_by_node.values())} tool(s) declared "
f"in nodes do not exist. Run discover_mcp_tools() to see available tools " f"in nodes do not exist. Run list_agent_tools() to see available tools "
f"and fix the node definitions." f"and fix the node definitions."
) )
else: else: