feat: active streams and waiting nodes

Merge branch 'main' into feature/concurrent-judge-runtime
feat: event bus logging
2026-02-24 09:03:21 -08:00 · 2026-02-24 07:43:22 -08:00 · 2026-02-24 07:43:05 -08:00 · 2026-02-24 17:06:11 +08:00 · 2026-02-24 10:05:48 +01:00 · 2026-02-24 09:50:17 +01:00
444 changed files with 77026 additions and 12407 deletions
@@ -0,0 +1,9 @@
+{
+  "mcpServers": {
+    "agent-builder": {
+      "command": "uv",
+      "args": ["run", "--directory", "core", "-m", "framework.mcp.agent_builder_server"],
+      "disabled": false
+    }
+  }
+}
@@ -0,0 +1 @@
+../../.claude/skills/hive
@@ -0,0 +1 @@
+../../.claude/skills/hive-concepts
@@ -0,0 +1 @@
+../../.claude/skills/hive-create
@@ -0,0 +1 @@
+../../.claude/skills/hive-credentials
@@ -0,0 +1 @@
+../../.claude/skills/hive-patterns
@@ -0,0 +1 @@
+../../.claude/skills/hive-test
@@ -0,0 +1,5 @@
+---
+description: hive-concepts
+---
+
+use hive-concepts skill
@@ -0,0 +1,5 @@
+---
+description: hive-create
+---
+
+use hive-create skill
@@ -0,0 +1,5 @@
+---
+description: hive-credentials
+---
+
+use hive-credentials skill
@@ -0,0 +1,5 @@
+---
+description: hive-patterns
+---
+
+use hive-patterns skill
@@ -0,0 +1,5 @@
+---
+description: hive-test
+---
+
+use hive-test skill
@@ -0,0 +1,5 @@
+---
+description: hive
+---
+
+use hive skill
@@ -0,0 +1 @@
+../../.claude/skills/hive
@@ -0,0 +1 @@
+../../.claude/skills/hive-concepts
@@ -0,0 +1 @@
+../../.claude/skills/hive-create
@@ -0,0 +1 @@
+../../.claude/skills/hive-credentials
@@ -0,0 +1 @@
+../../.claude/skills/hive-patterns
@@ -0,0 +1 @@
+../../.claude/skills/hive-test
@@ -0,0 +1,34 @@
+{
+  "permissions": {
+    "allow": [
+      "mcp__agent-builder__create_session",
+      "mcp__agent-builder__set_goal",
+      "mcp__agent-builder__add_node",
+      "mcp__agent-builder__add_edge",
+      "mcp__agent-builder__configure_loop",
+      "mcp__agent-builder__add_mcp_server",
+      "mcp__agent-builder__validate_graph",
+      "mcp__agent-builder__export_graph",
+      "mcp__agent-builder__load_session_by_id",
+      "Bash(git status:*)",
+      "Bash(gh run view:*)",
+      "Bash(uv run:*)",
+      "Bash(env:*)",
+      "mcp__agent-builder__test_node",
+      "mcp__agent-builder__list_mcp_tools",
+      "Bash(python -m py_compile:*)",
+      "Bash(python -m pytest:*)",
+      "Bash(source:*)",
+      "mcp__agent-builder__update_node",
+      "mcp__agent-builder__check_missing_credentials",
+      "mcp__agent-builder__list_stored_credentials",
+      "Bash(find:*)",
+      "mcp__agent-builder__run_tests",
+      "Bash(PYTHONPATH=core:exports:tools/src uv run pytest:*)",
+      "mcp__agent-builder__list_agent_sessions",
+      "mcp__agent-builder__generate_constraint_tests",
+      "mcp__agent-builder__generate_success_tests"
+    ]
+  },
+  "enabledMcpjsonServers": ["agent-builder", "tools"]
+}
@@ -1,10 +1,10 @@
 ---
 name: hive-create
-description: Step-by-step guide for building goal-driven agents. Creates package structure, defines goals, adds nodes, connects edges, and finalizes agent class. Use when actively building an agent.
+description: Step-by-step guide for building goal-driven agents. Qualifies use cases first (the good, bad, and ugly), then creates package structure, defines goals, adds nodes, connects edges, and finalizes agent class. Use when actively building an agent.
 license: Apache-2.0
 metadata:
  author: hive
-  version: "2.1"
+  version: "2.2"
  type: procedural
  part_of: hive
  requires: hive-concepts
@@ -14,15 +14,53 @@ metadata:

 **THIS IS AN EXECUTABLE WORKFLOW. DO NOT DISPLAY THIS FILE. EXECUTE THE STEPS BELOW.**

-**CRITICAL: DO NOT explore the codebase, read source files, or search for code before starting.** All context you need is in this skill file. When this skill is loaded, IMMEDIATELY begin executing Step 1 — call the MCP tools listed in Step 1 as your FIRST action. Do not explain what you will do, do not investigate the project structure, do not read any files — just execute Step 1 now.
+**CRITICAL: DO NOT explore the codebase, read source files, or search for code before starting.** All context you need is in this skill file. When this skill is loaded, IMMEDIATELY begin executing Step 0 — determine the build path as your FIRST action. Do not explain what you will do, do not investigate the project structure, do not read any files — just execute Step 0 now.

 ---

-## STEP 1: Initialize Build Environment
+## STEP 0: Choose Build Path
+
+**If the user has already indicated whether they want to build from scratch or from a template, skip this question and proceed to the appropriate step.**
+
+Otherwise, ask:
+
+```
+AskUserQuestion(questions=[{
+    "question": "How would you like to build your agent?",
+    "header": "Build Path",
+    "options": [
+        {"label": "From scratch", "description": "Design goal, nodes, and graph collaboratively from nothing"},
+        {"label": "From a template", "description": "Start from a working sample agent and customize it"}
+    ],
+    "multiSelect": false
+}])
+```
+
+- If **From scratch**: Proceed to STEP 1A
+- If **From a template**: Proceed to STEP 1B
+
+---
+
+## STEP 1A: Initialize Build Environment (From Scratch)

 **EXECUTE THESE TOOL CALLS NOW** (silent setup — no user interaction needed):

-1. Register the hive-tools MCP server:
+1. Check for existing sessions:
+
+```
+mcp__agent-builder__list_sessions()
+```
+
+- If a session with this agent name already exists, load it with `mcp__agent-builder__load_session_by_id(session_id="...")` and skip to item 3 of this list (registering the hive-tools MCP server).
+- If no matching session exists, proceed to step 2.
+
+2. Create a build session (replace AGENT_NAME with the user's requested agent name in snake_case):
+
+```
+mcp__agent-builder__create_session(name="AGENT_NAME")
+```
+
+3. Register the hive-tools MCP server:

 ```
 mcp__agent-builder__add_mcp_server(
@@ -35,45 +73,368 @@ mcp__agent-builder__add_mcp_server(
 )
 ```

-2. Create a build session (replace AGENT_NAME with the user's requested agent name in snake_case):
-
-```
-mcp__agent-builder__create_session(name="AGENT_NAME")
-```
-
-3. Discover available tools:
+4. Discover available tools:

 ```
 mcp__agent-builder__list_mcp_tools()
 ```

-4. Create the package directory:
+5. Create the package directory:

 ```bash
 mkdir -p exports/AGENT_NAME/nodes
 ```

-**Save the tool list for step 3** — you will need it for node design in STEP 3.
+**Save the tool list for STEP 4** — you will need it for node design.

 **THEN immediately proceed to STEP 2** (do NOT display setup results to the user — just move on).

 ---

+## STEP 1B: Initialize Build Environment (From Template)
+
+**EXECUTE THESE STEPS NOW:**
+
+### 1B.1: Discover available templates
+
+List the template directories and read each template's `agent.json` to get its name and description:
+
+```bash
+ls examples/templates/
+```
+
+For each directory found, read `examples/templates/TEMPLATE_DIR/agent.json` with the Read tool and extract:
+- `agent.name` — the template's display name
+- `agent.description` — what the template does
+
+### 1B.2: Present templates to user
+
+Show the user a table of available templates:
+
+> **Available Templates:**
+>
+> | # | Template | Description |
+> |---|----------|-------------|
+> | 1 | [name from agent.json] | [description from agent.json] |
+> | 2 | ... | ... |
+
+Then ask the user to pick a template and provide a name for their new agent:
+
+```
+AskUserQuestion(questions=[{
+    "question": "Which template would you like to start from?",
+    "header": "Template",
+    "options": [
+        {"label": "[template 1 name]", "description": "[template 1 description]"},
+        {"label": "[template 2 name]", "description": "[template 2 description]"},
+        ...
+    ],
+    "multiSelect": false
+}, {
+    "question": "What should the new agent be named? (snake_case)",
+    "header": "Agent Name",
+    "options": [
+        {"label": "Use template name", "description": "Keep the original template name as-is"},
+        {"label": "Custom name", "description": "I'll provide a new snake_case name"}
+    ],
+    "multiSelect": false
+}])
+```
+
+### 1B.3: Copy template to exports
+
+```bash
+cp -r examples/templates/TEMPLATE_DIR exports/NEW_AGENT_NAME
+```
+
+### 1B.4: Create session and register MCP (same logic as STEP 1A)
+
+First, check for existing sessions:
+
+```
+mcp__agent-builder__list_sessions()
+```
+
+- If a session with this agent name already exists, load it with `mcp__agent-builder__load_session_by_id(session_id="...")` and skip to `list_mcp_tools`.
+- If no matching session exists, create one:
+
+```
+mcp__agent-builder__create_session(name="NEW_AGENT_NAME")
+```
+
+Then register MCP and discover tools:
+
+```
+mcp__agent-builder__add_mcp_server(
+    name="hive-tools",
+    transport="stdio",
+    command="uv",
+    args='["run", "python", "mcp_server.py", "--stdio"]',
+    cwd="tools",
+    description="Hive tools MCP server"
+)
+```
+
+```
+mcp__agent-builder__list_mcp_tools()
+```
+
+### 1B.5: Load template into builder session
+
+Import the entire agent definition in one call:
+
+```
+mcp__agent-builder__import_from_export(agent_json_path="exports/NEW_AGENT_NAME/agent.json")
+```
+
+This reads the agent.json and populates the builder session with the goal, all nodes, and all edges.
+
+**THEN immediately proceed to STEP 2.**
+
+---
+
 ## STEP 2: Define Goal Together with User
+**A responsible engineer doesn't jump into building. First, understand the problem and be transparent about what the framework can and cannot do.**
+
+**If starting from a template**, the goal is already loaded in the builder session. Present the existing goal to the user using the format below and ask for approval. Skip the collaborative drafting questions — go straight to presenting and asking "Do you approve this goal, or would you like to modify it?"
+
+**If the user has NOT already described what they want to build**, start by asking what kind of agent they have in mind:
+
+```
+AskUserQuestion(questions=[{
+    "question": "What kind of agent do you want to build? Select an option below, or choose 'Other' to describe your own.",
+    "header": "Agent type",
+    "options": [
+        {"label": "Data collection", "description": "Gathers information from the web, analyzes it, and produces a report or sends outreach (e.g. market research, news digest, email campaigns, competitive analysis)"},
+        {"label": "Workflow automation", "description": "Automates a multi-step business process end-to-end (e.g. lead qualification, content publishing pipeline, data entry)"},
+        {"label": "Personal assistant", "description": "Handles recurring tasks or monitors for events and acts on them (e.g. daily briefings, meeting prep, file organization)"}
+    ],
+    "multiSelect": false
+}])
+```
+
+Use the user's selection (or their custom description if they chose "Other") as context when shaping the goal below. If the user already described what they want before this step, skip the question and proceed directly.

 **DO NOT propose a complete goal on your own.** Instead, collaborate with the user to define it.

-**START by asking the user to help shape the goal:**
+### 2a: Fast Discovery (3-8 Turns)

-> I've set up the build environment and discovered [N] available tools. Let's define the goal for your agent together.
->
-> To get started, can you help me understand:
->
-> 1. **What should this agent accomplish?** (the core purpose)
-> 2. **How will we know it succeeded?** (what does "done" look like)
-> 3. **Are there any hard constraints?** (things it must never do, quality bars, etc.)
+**The core principle**: Discovery should feel like progress, not paperwork. The stakeholder should walk away feeling like you understood them faster than anyone else would have.

-**WAIT for the user to respond.** Use their input to draft:
+**Communication style**: Be concise. Say less. Mean more. Impatient stakeholders don't want a wall of text — they want to know you get it. Every sentence you say should either move the conversation forward or prove you understood something. If it does neither, cut it.
+
+**Ask Question Rules: Respect Their Time.** Every question must earn its place by:
+1. **Preventing a costly wrong turn** — you're about to build the wrong thing
+2. **Unlocking a shortcut** — their answer lets you simplify the design
+3. **Surfacing a dealbreaker** — there's a constraint that changes everything
+4. **Providing options** — offer options with your questions where possible, but always let the user type something beyond the options.
+
+If a question doesn't do one of these, don't ask it. Make an assumption, state it, and move on.
+
+---
+
+#### 2a.1: Let Them Talk, But Listen Like an Architect
+
+When the stakeholder describes what they want, don't just hear the words — listen for the architecture underneath. While they talk, mentally construct:
+
+- **The actors**: Who are the people/systems involved?
+- **The trigger**: What kicks off the workflow?
+- **The core loop**: What's the main thing that happens repeatedly?
+- **The output**: What's the valuable thing produced at the end?
+- **The pain**: What about today's situation is broken, slow, or missing?
+
+You are extracting a **domain model** from natural language in real time. Most stakeholders won't give you this structure explicitly — they'll give you a story. Your job is to hear the structure inside the story.
+
+| They say... | You're hearing... |
+|-------------|-------------------|
+| Nouns they repeat | Your entities |
+| Verbs they emphasize | Your core operations |
+| Frustrations they mention | Your design constraints |
+| Workarounds they describe | What the system must replace |
+| People they name | Your user types |
+
+---
+
+#### 2a.2: Use Domain Knowledge to Fill In the Blanks
+
+You have broad knowledge of how systems work. Use it aggressively.
+
+If they say "I need a research agent," you already know it probably involves: search, summarization, source tracking, and iteration. Don't ask about each — use them as your starting mental model and let their specifics override your defaults.
+
+If they say "I need to monitor files and alert me," you know this probably involves: watch patterns, triggers, notifications, and state tracking.
+
+**The key move**: Take your general knowledge of the domain and merge it with the specifics they've given you. The result is a draft understanding that's 60-80% right before you've asked a single question. Your questions close the remaining 20-40%.
+
+---
+
+#### 2a.3: Play Back a Proposed Model (Not a List of Questions)
+
+After listening, present a **concrete picture** of what you think they need. Make it specific enough that they can spot what's wrong.
+
+**Pattern: "Here's what I heard — tell me where I'm off"**
+
+> "OK here's how I'm picturing this: [User type] needs to [core action]. Right now they're [current painful workflow]. What you want is [proposed solution that replaces the pain].
+>
+> The way I'd structure this: [key entities] connected by [key relationships], with the main flow being [trigger → steps → outcome].
+>
+> For the MVP, I'd focus on [the one thing that delivers the most value] and hold off on [things that can wait].
+>
+> Before I start — [1-2 specific questions you genuinely can't infer]."
+
+Why this works:
+- **Proves you were listening** — they don't feel like they have to repeat themselves
+- **Shows competence** — you're already thinking in systems
+- **Fast to correct** — "no, it's more like X" takes 10 seconds vs. answering 15 questions
+- **Creates momentum** — heading toward building, not more talking
+
+---
+
+#### 2a.4: Ask Only What You Cannot Infer
+
+Your questions should be **narrow, specific, and consequential**. Never ask what you could answer yourself.
+
+**Good questions** (high-stakes, can't infer):
+- "Who's the primary user — you or your end customers?"
+- "Is this replacing a spreadsheet, or is there literally nothing today?"
+- "Does this need to integrate with anything, or standalone?"
+- "Is there existing data to migrate, or starting fresh?"
+
+**Bad questions** (low-stakes, inferable):
+- "What should happen if there's an error?" *(handle gracefully, obviously)*
+- "Should it have search?" *(if there's a list, yes)*
+- "How should we handle permissions?" *(follow standard patterns)*
+- "What tools should I use?" *(your call, not theirs)*
+
+---
+
+#### Conversation Flow (3-5 Turns)
+
+| Turn | Who | What |
+|------|-----|------|
+| 1 | User | Describes what they need |
+| 2 | Agent | Plays back understanding as a proposed model. Asks 1-2 critical questions max. |
+| 3 | User | Corrects, confirms, or adds detail |
+| 4 | Agent | Adjusts model, confirms MVP scope, states assumptions, declares starting point |
+| *(5)* | — | *(Only if Turn 3 revealed something that fundamentally changes the approach)* |
+
+**AFTER the conversation, IMMEDIATELY proceed to 2b. DO NOT skip to building.**
+
+---
+
+#### Anti-Patterns
+
+| Don't | Do Instead |
+|-------|------------|
+| Open with a list of questions | Open with what you understood from their request |
+| "What are your requirements?" | "Here's what I think you need — am I right?" |
+| Ask about every edge case | Handle with smart defaults, flag in summary |
+| 10+ turn discovery conversation | 3-8 turns. Start building, iterate with real software. |
+| Being lazy and not understanding what the user wants to achieve | Understand the "what" and the "why" |
+| Ask for permission to start | State your plan and start |
+| Wait for certainty | Start at 80% confidence, iterate the rest |
+| Ask what tech/tools to use | That's your job. Decide, disclose, move on. |
+
+---
+
+
+
+### 2b: Capability Assessment
+
+**After the user responds, analyze the fit.** Present this assessment honestly:
+
+> **Framework Fit Assessment**
+>
+> Based on what you've described, here's my honest assessment of how well this framework fits your use case:
+>
+> **What Works Well (The Good):**
+> - [List 2-4 things the framework handles well for this use case]
+> - Examples: multi-turn conversations, human-in-the-loop review, tool orchestration, structured outputs
+>
+> **Limitations to Be Aware Of (The Bad):**
+> - [List 2-3 limitations that apply but are workable]
+> - Examples: LLM latency means not suitable for sub-second responses, context window limits for very large documents, cost per run for heavy tool usage
+>
+> **Potential Deal-Breakers (The Ugly):**
+> - [List any significant challenges or missing capabilities — be honest]
+> - Examples: no tool available for X, would require custom MCP server, framework not designed for Y
+
+**Be specific.** Reference the actual tools discovered in Step 1. If the user needs `send_email` but it's not available, say so. If they need real-time streaming from a database, explain that's not how the framework works.
+
+### 2c: Gap Analysis
+
+**Identify specific gaps** between what the user wants and what you can deliver:
+
+| Requirement | Framework Support | Gap/Workaround |
+|-------------|-------------------|----------------|
+| [User need] | [✅ Supported / ⚠️ Partial / ❌ Not supported] | [How to handle or why it's a problem] |
+
+**Examples of gaps to identify:**
+- Missing tools (user needs X, but only Y and Z are available)
+- Scope issues (user wants to process 10,000 items, but LLM rate limits apply)
+- Interaction mismatches (user wants CLI-only, but agent is designed for TUI)
+- Data flow issues (user needs to persist state across runs, but sessions are isolated)
+- Latency requirements (user needs instant responses, but LLM calls take seconds)
+
+### 2d: Recommendation
+
+**Give a clear recommendation:**
+
+> **My Recommendation:**
+>
+> [One of these three:]
+>
+> **✅ PROCEED** — This is a good fit. The framework handles your core needs well. [List any minor caveats.]
+>
+> **⚠️ PROCEED WITH SCOPE ADJUSTMENT** — This can work, but we should adjust: [specific changes]. Without these adjustments, you'll hit [specific problems].
+>
+> **🛑 RECONSIDER** — This framework may not be the right tool for this job because [specific reasons]. Consider instead: [alternatives — simpler script, different framework, custom solution].
+
+### 2e: Get Explicit Acknowledgment
+
+**CALL AskUserQuestion:**
+
+```
+AskUserQuestion(questions=[{
+    "question": "Based on this assessment, how would you like to proceed?",
+    "header": "Proceed",
+    "options": [
+        {"label": "Proceed as described", "description": "I understand the limitations, let's build it"},
+        {"label": "Adjust scope", "description": "Let's modify the requirements to fit better"},
+        {"label": "More questions", "description": "I have questions about the assessment"},
+        {"label": "Reconsider", "description": "Maybe this isn't the right approach"}
+    ],
+    "multiSelect": false
+}])
+```
+
+**WAIT for user response.**
+
+- If **Proceed**: Move to STEP 3
+- If **Adjust scope**: Discuss what to change, update your notes, re-assess if needed
+- If **More questions**: Answer them honestly, then ask again
+- If **Reconsider**: Discuss alternatives. If they decide to proceed anyway, that's their informed choice
+
+---
+
+## STEP 3: Define Goal Together with User
+
+**Now that the use case is qualified, collaborate on the goal definition.**
+
+**START by synthesizing what you learned:**
+
+> Based on our discussion, here's my understanding of the goal:
+>
+> **Core purpose:** [what you understood from 2a]
+> **Success looks like:** [what you inferred]
+> **Key constraints:** [what you inferred]
+>
+> Let me refine this with you:
+>
+> 1. **What should this agent accomplish?** (confirm or correct my understanding)
+> 2. **How will we know it succeeded?** (what specific outcomes matter)
+> 3. **Are there any hard constraints?** (things it must never do, quality bars)
+
+**WAIT for the user to respond.** Use their input (and the agent type they selected) to draft:

 - Goal ID (kebab-case)
 - Goal name
@@ -115,12 +476,14 @@ AskUserQuestion(questions=[{

 **WAIT for user response.**

-- If **Approve**: Call `mcp__agent-builder__set_goal(...)` with the goal details, then proceed to STEP 3
+- If **Approve**: Call `mcp__agent-builder__set_goal(...)` with the goal details, then proceed to STEP 4
 - If **Modify**: Ask what they want to change, update the draft, ask again

 ---

-## STEP 3: Design Conceptual Nodes
+## STEP 4: Design Conceptual Nodes
+
+**If starting from a template**, the nodes are already loaded in the builder session. Present the existing nodes using the table format below and ask for approval. Skip the design phase.

 **BEFORE designing nodes**, review the available tools from Step 1. Nodes can ONLY use tools that exist.

@@ -129,7 +492,7 @@ AskUserQuestion(questions=[{
 - node_id (kebab-case)
 - name
 - description
-- node_type: `"event_loop"` (recommended for all LLM work) or `"function"` (deterministic, no LLM)
+- node_type: `"event_loop"` (the only valid type; use `client_facing: True` for HITL)
 - input_keys (what data this node receives)
 - output_keys (what data this node produces)
 - tools (ONLY tools that exist from Step 1 — empty list if no tools needed)
@@ -173,12 +536,14 @@ AskUserQuestion(questions=[{

 **WAIT for user response.**

-- If **Approve**: Proceed to STEP 4
+- If **Approve**: Proceed to STEP 5
 - If **Modify**: Ask what they want to change, update design, ask again

 ---

-## STEP 4: Design Full Graph and Review
+## STEP 5: Design Full Graph and Review
+
+**If starting from a template**, the edges are already loaded in the builder session. Render the existing graph as ASCII art and present it to the user for approval. Skip the edge design phase.

 **DETERMINE the edges** connecting the approved nodes. For each edge:

@@ -188,6 +553,26 @@ AskUserQuestion(questions=[{
 - condition_expr (Python expression, only if conditional)
 - priority (positive = forward, negative = feedback/loop-back)

+**DETERMINE the graph lifecycle.** Not every agent needs a terminal node:
+
+| Pattern | `terminal_nodes` | When to Use |
+|---------|-------------------|-------------|
+| **Linear (finish)** | `["last-node"]` | Agent completes a task and exits (batch processing, one-shot generation) |
+| **Forever-alive (loop)** | `[]` (empty) | Agent stays alive for continuous interaction (research assistant, personal assistant, monitoring) |
+
+**Forever-alive pattern:** The deep_research_agent example uses `terminal_nodes=[]`. Every leaf node has edges that loop back to earlier nodes, creating a perpetual session. The agent only stops when the user explicitly exits. This is the preferred pattern for interactive, multi-turn agents.
+
+**Key design rules for forever-alive graphs:**
+- Every node must have at least one outgoing edge (no dead ends)
+- Client-facing nodes block for user input — these are the natural "pause points"
+- The user controls when to stop, not the graph
+- Sessions accumulate memory across loops — plan for conversation compaction
+- Use `conversation_mode="continuous"` to preserve conversation history across node transitions
+- `max_iterations` should be set high (e.g., 100) since the agent is designed to run indefinitely
+- The agent will NOT enter a "completed" execution state — this is intentional, not a bug
+
+**Ask the user** which lifecycle pattern fits their agent. Default to forever-alive for interactive agents, linear for batch/one-shot tasks.
+
 **RENDER the complete graph as ASCII art.** Make it large and clear — the user needs to see and understand the full workflow at a glance.

 **IMPORTANT: Make the ASCII art BIG and READABLE.** Use a box-and-arrow style with generous spacing. Do NOT make it tiny or compressed. Example format:
@@ -288,16 +673,38 @@ AskUserQuestion(questions=[{

 **WAIT for user response.**

-- If **Approve**: Proceed to STEP 5
+- If **Approve**: Proceed to STEP 6
 - If **Modify**: Ask what they want to change, update the graph, re-render, ask again

 ---

-## STEP 5: Build the Agent
+## STEP 6: Build the Agent

 **NOW — and only now — write the actual code.** The user has approved the goal, nodes, and graph.

-### 5a: Register nodes and edges with MCP
+### 6a: Register nodes and edges with MCP
+**If starting from a template**, the copied files will be overwritten with the approved design. You MUST replace every occurrence of the old template name with the new agent name. Here is the complete checklist — miss NONE of these:
+
+| File | What to rename |
+|------|---------------|
+| `config.py` | `AgentMetadata.name` — the display name shown in TUI agent selection |
+| `config.py` | `AgentMetadata.description` — agent description |
+| `config.py` | `AgentMetadata.intro_message` — greeting shown to user when TUI loads |
+| `agent.py` | Module docstring (line 1) |
+| `agent.py` | `class OldNameAgent:` → `class NewNameAgent:` |
+| `agent.py` | `GraphSpec(id="old-name-graph")` → `GraphSpec(id="new-name-graph")` — shown in TUI status bar |
+| `agent.py` | Storage path: `Path.home() / ".hive" / "agents" / "old_name"` → `"new_name"` |
+| `__main__.py` | Module docstring (line 1) |
+| `__main__.py` | `from .agent import ... OldNameAgent` → `NewNameAgent` |
+| `__main__.py` | CLI help string in `def cli()` docstring |
+| `__main__.py` | All `OldNameAgent()` instantiations |
+| `__main__.py` | Storage path (duplicated from agent.py) |
+| `__main__.py` | Shell banner string (e.g. `"=== Old Name Agent ==="`) |
+| `__init__.py` | Package docstring |
+| `__init__.py` | `from .agent import OldNameAgent` import |
+| `__init__.py` | `__all__` list entry |
+
+**If starting from a template and no modifications were made in Steps 2-5**, the nodes and edges are already registered. Skip to validation (`mcp__agent-builder__validate_graph()`). If modifications were made, re-register the changed nodes/edges (the MCP tools handle duplicates by overwriting).

 **FOR EACH approved node**, call:

@@ -337,9 +744,9 @@ mcp__agent-builder__validate_graph()
 ```

 - If invalid: Fix the issues and re-validate
-- If valid: Continue to 5b
+- If valid: Continue to 6b

-### 5b: Write Python package files
+### 6b: Write Python package files

 **EXPORT the graph data:**

@@ -349,7 +756,7 @@ mcp__agent-builder__export_graph()

 **THEN write the Python package files** using the exported data. Create these files in `exports/AGENT_NAME/`:

-1. `config.py` - Runtime configuration with model settings
+1. `config.py` - Runtime configuration with model settings and `AgentMetadata` (including `intro_message` — the greeting shown when TUI loads)
 2. `nodes/__init__.py` - All NodeSpec definitions
 3. `agent.py` - Goal, edges, graph config, and agent class
 4. `__init__.py` - Package exports
@@ -399,7 +806,7 @@ mcp__agent-builder__export_graph()

 ---

-## STEP 6: Verify and Test
+## STEP 7: Verify and Test

 **RUN validation:**

@@ -445,8 +852,7 @@ cd /home/timothy/oss/hive && PYTHONPATH=exports uv run python -m AGENT_NAME vali

 | Type         | tools param             | Use when                                |
 | ------------ | ----------------------- | --------------------------------------- |
-| `event_loop` | `'["tool1"]'` or `'[]'` | LLM-powered work with or without tools  |
-| `function`   | N/A                     | Deterministic Python operations, no LLM |
+| `event_loop` | `'["tool1"]'` or `'[]'` | All agent work (with or without tools, HITL via client_facing) |

 ---

@@ -507,7 +913,7 @@ EventLoopNodes are **auto-created** by `GraphExecutor` at runtime. Both direct `
 from framework.graph.executor import GraphExecutor
 from framework.runtime.core import Runtime

-storage_path = Path.home() / ".hive" / "my_agent"
+storage_path = Path.home() / ".hive" / "agents" / "my_agent"
 storage_path.mkdir(parents=True, exist_ok=True)
 runtime = Runtime(storage_path)

@@ -525,16 +931,113 @@ result = await executor.execute(graph=graph, goal=goal, input_data=input_data)

 ---

+## REFERENCE: Graph Lifecycle & Conversation Memory
+
+### Terminal vs Forever-Alive Graphs
+
+Agents have two lifecycle patterns:
+
+**Linear (terminal) graphs** have `terminal_nodes=["last-node"]`. Execution ends when the terminal node completes. The session enters a "completed" state. Use for batch processing, one-shot generation, and fire-and-forget tasks.
+
+**Forever-alive graphs** have `terminal_nodes=[]` (empty). Every node has at least one outgoing edge — the graph loops indefinitely. The session **never enters a "completed" state** — this is intentional. The agent stays alive until the user explicitly exits. Use for interactive assistants, research tools, and any agent where the user drives the conversation.
+
+The deep_research_agent example demonstrates this: `report` loops back to either `research` (dig deeper) or `intake` (new topic). The agent is a persistent, interactive assistant.
+
+### Continuous Conversation Mode
+
+When `conversation_mode="continuous"` is set on the GraphSpec, the framework preserves a **single conversation thread** across all node transitions:
+
+**What the framework does automatically:**
+- **Inherits conversation**: Same message history carries forward to the next node
+- **Composes layered system prompts**: Identity (agent-level) + Narrative (auto-generated state summary) + Focus (per-node instructions)
+- **Inserts transition markers**: At each node boundary, a "State of the World" message showing completed phases, current memory, and available data files
+- **Accumulates tools**: Once a tool becomes available, it stays available in subsequent nodes
+- **Compacts opportunistically**: At phase transitions, old tool results are pruned to stay within token budget
+
+**What this means for agent builders:**
+- Nodes don't need to re-explain context — the conversation carries it forward
+- Output keys from earlier nodes are available in memory for edge conditions and later nodes
+- For forever-alive agents, conversation memory persists across the entire session lifetime
+- Plan for compaction: very long sessions will have older tool results summarized automatically
+
+**When to use continuous mode:**
+- Interactive agents with client-facing nodes (always)
+- Multi-phase workflows where context matters across phases
+- Forever-alive agents that loop indefinitely
+
+**When NOT to use continuous mode:**
+- Embarrassingly parallel fan-out nodes (each branch should be independent)
+- Stateless utility agents that process items independently
+
+---
+
+## REFERENCE: Framework Capabilities for Qualification
+
+Use this reference during STEP 2 to give accurate, honest assessments.
+
+### What the Framework Does Well (The Good)
+
+| Capability | Description |
+|------------|-------------|
+| Multi-turn conversations | Client-facing nodes stream to users and block for input |
+| Human-in-the-loop review | Approval checkpoints with feedback loops back to earlier nodes |
+| Tool orchestration | LLM can call multiple tools, framework handles execution |
+| Structured outputs | `set_output` produces validated, typed outputs |
+| Parallel execution | Fan-out/fan-in for concurrent node execution |
+| Context management | Automatic compaction and spillover for large data |
+| Error recovery | Retry logic, judges, and feedback edges for self-correction |
+| Session persistence | State saved to disk, resumable sessions |
+
+### Framework Limitations (The Bad)
+
+| Limitation | Impact | Workaround |
+|------------|--------|------------|
+| LLM latency | 2-10+ seconds per turn | Not suitable for real-time/low-latency needs |
+| Context window limits | ~128K tokens max | Use data tools for spillover, design for chunking |
+| Cost per run | LLM API calls cost money | Budget planning, caching where possible |
+| Rate limits | API throttling on heavy usage | Backoff, queue management |
+| Node boundaries lose context | Outputs must be serialized | Prefer fewer, richer nodes |
+| Single-threaded within node | One LLM call at a time per node | Use fan-out for parallelism |
+
+### Not Designed For (The Ugly)
+
+| Use Case | Why It's Problematic | Alternative |
+|----------|---------------------|-------------|
+| Persistent background daemons (no user) | Forever-alive graphs need a user at client-facing nodes; there is no autonomous background polling without a user present | External scheduler triggering agent runs |
+| Sub-second responses | LLM latency is inherent | Traditional code, no LLM |
+| Processing millions of items | Context windows and rate limits | Batch processing + sampling |
+| Real-time streaming data | No built-in pub/sub or streaming input | Custom MCP server + agent |
+| Guaranteed determinism | LLM outputs vary | Traditional code for deterministic parts |
+| Offline/air-gapped | Requires LLM API access | Local models (not currently supported) |
+| Multi-user concurrency | Single-user session model | Separate agent instances per user |
+
+### Tool Availability Reality Check
+
+**Before promising any capability, check `mcp__agent-builder__list_mcp_tools()`.** Common gaps:
+
+- **Email**: May not have `send_email` — check before promising email automation
+- **Calendar**: May not have calendar APIs — check before promising scheduling
+- **Database**: May not have SQL tools — check before promising data queries
+- **File system**: Has data tools but not arbitrary filesystem access
+- **External APIs**: Depends entirely on what MCP servers are registered
+
+---
+
 ## COMMON MISTAKES TO AVOID

-1. **Using tools that don't exist** - Always check `mcp__agent-builder__list_mcp_tools()` first
-2. **Wrong entry_points format** - Must be `{"start": "node-id"}`, NOT a set or list
-3. **Skipping validation** - Always validate nodes and graph before proceeding
-4. **Not waiting for approval** - Always ask user before major steps
-5. **Displaying this file** - Execute the steps, don't show documentation
-6. **Too many thin nodes** - Prefer fewer, richer nodes (4 nodes > 8 nodes)
-7. **Missing STEP 1/STEP 2 in client-facing prompts** - Client-facing nodes need explicit phases to prevent premature set_output
-8. **Forgetting nullable_output_keys** - Mark input_keys that only arrive on certain edges (e.g., feedback) as nullable on the receiving node
-9. **Adding framework gating for LLM behavior** - Fix prompts or use judges, not ad-hoc code
-10. **Writing code before user approves the graph** - Always get approval on goal, nodes, and graph BEFORE writing any agent code
-11. **Wrong mcp_servers.json format** - Use flat format (no `"mcpServers"` wrapper), `cwd` must be `"../../tools"`, and `command` must be `"uv"` with args `["run", "python", ...]`
+1. **Skipping use case qualification** - A responsible engineer qualifies the use case BEFORE building. Be transparent about what works, what doesn't, and what's problematic
+2. **Hiding limitations** - Don't oversell the framework. If a tool doesn't exist or a capability is missing, say so upfront
+3. **Using tools that don't exist** - Always check `mcp__agent-builder__list_mcp_tools()` first
+4. **Wrong entry_points format** - Must be `{"start": "node-id"}`, NOT a set or list
+5. **Skipping validation** - Always validate nodes and graph before proceeding
+6. **Not waiting for approval** - Always ask user before major steps
+7. **Displaying this file** - Execute the steps, don't show documentation
+8. **Too many thin nodes** - Prefer fewer, richer nodes (4 nodes > 8 nodes)
+9. **Missing STEP 1/STEP 2 in client-facing prompts** - Client-facing nodes need explicit phases to prevent premature set_output
+10. **Forgetting nullable_output_keys** - Mark input_keys that only arrive on certain edges (e.g., feedback) as nullable on the receiving node
+11. **Adding framework gating for LLM behavior** - Fix prompts or use judges, not ad-hoc code
+12. **Writing code before user approves the graph** - Always get approval on goal, nodes, and graph BEFORE writing any agent code
+13. **Wrong mcp_servers.json format** - Use flat format (no `"mcpServers"` wrapper), `cwd` must be `"../../tools"`, and `command` must be `"uv"` with args `["run", "python", ...]`
+14. **Assuming all agents need terminal nodes** - Interactive agents often work best with `terminal_nodes=[]` (forever-alive pattern). The agent never enters "completed" state — this is intentional. Only batch/one-shot agents need terminal nodes
+15. **Creating dead-end nodes in forever-alive graphs** - Every node must have at least one outgoing edge. A node with no outgoing edges will cause execution to end unexpectedly, breaking the forever-alive loop
+16. **Not using continuous conversation mode for interactive agents** - Multi-phase interactive agents should use `conversation_mode="continuous"` to preserve context across node transitions. Without it, each node starts with a blank conversation and loses all prior context
@@ -90,7 +90,7 @@ def tui(mock, verbose, debug):
        agent._event_bus = EventBus()
        agent._tool_registry = ToolRegistry()

-        storage_path = Path.home() / ".hive" / "deep_research_agent"
+        storage_path = Path.home() / ".hive" / "agents" / "deep_research_agent"
        storage_path.mkdir(parents=True, exist_ok=True)

        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
@@ -1,12 +1,15 @@
 """Agent graph construction for Deep Research Agent."""

+from pathlib import Path
+
 from framework.graph import EdgeSpec, EdgeCondition, Goal, SuccessCriterion, Constraint
 from framework.graph.edge import GraphSpec
-from framework.graph.executor import ExecutionResult, GraphExecutor
-from framework.runtime.event_bus import EventBus
-from framework.runtime.core import Runtime
+from framework.graph.executor import ExecutionResult
+from framework.graph.checkpoint_config import CheckpointConfig
 from framework.llm import LiteLLMProvider
 from framework.runner.tool_registry import ToolRegistry
+from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
+from framework.runtime.execution_stream import EntryPointSpec

 from .config import default_config, metadata
 from .nodes import (
@@ -120,13 +123,31 @@ edges = [
        condition_expr="needs_more_research == False",
        priority=2,
    ),
+    # report -> research (user wants deeper research on current topic)
+    EdgeSpec(
+        id="report-to-research",
+        source="report",
+        target="research",
+        condition=EdgeCondition.CONDITIONAL,
+        condition_expr="str(next_action).lower() == 'more_research'",
+        priority=2,
+    ),
+    # report -> intake (user wants a new topic — default when not more_research)
+    EdgeSpec(
+        id="report-to-intake",
+        source="report",
+        target="intake",
+        condition=EdgeCondition.CONDITIONAL,
+        condition_expr="str(next_action).lower() != 'more_research'",
+        priority=1,
+    ),
 ]

 # Graph configuration
 entry_node = "intake"
 entry_points = {"start": "intake"}
 pause_nodes = []
-terminal_nodes = ["report"]
+terminal_nodes = []


 class DeepResearchAgent:
@@ -136,6 +157,12 @@ class DeepResearchAgent:
    Flow: intake -> research -> review -> report
                      ^           |
                      +-- feedback loop (if user wants more)
+
+    Uses AgentRuntime for proper session management:
+    - Session-scoped storage (sessions/{session_id}/)
+    - Checkpointing for resume capability
+    - Runtime logging
+    - Data folder for save_data/load_data
    """

    def __init__(self, config=None):
@@ -147,10 +174,10 @@ class DeepResearchAgent:
        self.entry_points = entry_points
        self.pause_nodes = pause_nodes
        self.terminal_nodes = terminal_nodes
-        self._executor: GraphExecutor | None = None
        self._graph: GraphSpec | None = None
-        self._event_bus: EventBus | None = None
+        self._agent_runtime: AgentRuntime | None = None
        self._tool_registry: ToolRegistry | None = None
+        self._storage_path: Path | None = None

    def _build_graph(self) -> GraphSpec:
        """Build the GraphSpec."""
@@ -171,16 +198,20 @@ class DeepResearchAgent:
                "max_tool_calls_per_turn": 20,
                "max_history_tokens": 32000,
            },
+            conversation_mode="continuous",
+            identity_prompt=(
+                "You are a rigorous research agent. You search for information "
+                "from diverse, authoritative sources, analyze findings critically, "
+                "and produce well-cited reports. You never fabricate information — "
+                "every claim must trace back to a source you actually retrieved."
+            ),
        )

-    def _setup(self, mock_mode=False) -> GraphExecutor:
-        """Set up the executor with all components."""
-        from pathlib import Path
+    def _setup(self, mock_mode=False) -> None:
+        """Set up the agent runtime with sessions, checkpoints, and logging."""
+        self._storage_path = Path.home() / ".hive" / "agents" / "deep_research_agent"
+        self._storage_path.mkdir(parents=True, exist_ok=True)

-        storage_path = Path.home() / ".hive" / "deep_research_agent"
-        storage_path.mkdir(parents=True, exist_ok=True)
-
-        self._event_bus = EventBus()
        self._tool_registry = ToolRegistry()

        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
@@ -199,47 +230,63 @@ class DeepResearchAgent:
        tools = list(self._tool_registry.get_tools().values())

        self._graph = self._build_graph()
-        runtime = Runtime(storage_path)

-        self._executor = GraphExecutor(
-            runtime=runtime,
+        checkpoint_config = CheckpointConfig(
+            enabled=True,
+            checkpoint_on_node_start=False,
+            checkpoint_on_node_complete=True,
+            checkpoint_max_age_days=7,
+            async_checkpoint=True,
+        )
+
+        entry_point_specs = [
+            EntryPointSpec(
+                id="default",
+                name="Default",
+                entry_node=self.entry_node,
+                trigger_type="manual",
+                isolation_level="shared",
+            )
+        ]
+
+        self._agent_runtime = create_agent_runtime(
+            graph=self._graph,
+            goal=self.goal,
+            storage_path=self._storage_path,
+            entry_points=entry_point_specs,
            llm=llm,
            tools=tools,
            tool_executor=tool_executor,
-            event_bus=self._event_bus,
-            storage_path=storage_path,
-            loop_config=self._graph.loop_config,
+            checkpoint_config=checkpoint_config,
        )

-        return self._executor
-
    async def start(self, mock_mode=False) -> None:
-        """Set up the agent (initialize executor and tools)."""
-        if self._executor is None:
+        """Set up and start the agent runtime."""
+        if self._agent_runtime is None:
            self._setup(mock_mode=mock_mode)
+        if not self._agent_runtime.is_running:
+            await self._agent_runtime.start()

    async def stop(self) -> None:
-        """Clean up resources."""
-        self._executor = None
-        self._event_bus = None
+        """Stop the agent runtime and clean up."""
+        if self._agent_runtime and self._agent_runtime.is_running:
+            await self._agent_runtime.stop()
+        self._agent_runtime = None

    async def trigger_and_wait(
        self,
-        entry_point: str,
-        input_data: dict,
+        entry_point: str = "default",
+        input_data: dict | None = None,
        timeout: float | None = None,
        session_state: dict | None = None,
    ) -> ExecutionResult | None:
        """Execute the graph and wait for completion."""
-        if self._executor is None:
+        if self._agent_runtime is None:
            raise RuntimeError("Agent not started. Call start() first.")
-        if self._graph is None:
-            raise RuntimeError("Graph not built. Call start() first.")

-        return await self._executor.execute(
-            graph=self._graph,
-            goal=self.goal,
-            input_data=input_data,
+        return await self._agent_runtime.trigger_and_wait(
+            entry_point_id=entry_point,
+            input_data=input_data or {},
            session_state=session_state,
        )

@@ -250,7 +297,7 @@ class DeepResearchAgent:
        await self.start(mock_mode=mock_mode)
        try:
            result = await self.trigger_and_wait(
-                "start", context, session_state=session_state
+                "default", context, session_state=session_state
            )
            return result or ExecutionResult(success=False, error="Execution timeout")
        finally:
@@ -1,33 +1,8 @@
 """Runtime configuration."""

-import json
-from dataclasses import dataclass, field
-from pathlib import Path
-
-
-def _load_preferred_model() -> str:
-    """Load preferred model from ~/.hive/configuration.json."""
-    config_path = Path.home() / ".hive" / "configuration.json"
-    if config_path.exists():
-        try:
-            with open(config_path) as f:
-                config = json.load(f)
-            llm = config.get("llm", {})
-            if llm.get("provider") and llm.get("model"):
-                return f"{llm['provider']}/{llm['model']}"
-        except Exception:
-            pass
-    return "anthropic/claude-sonnet-4-20250514"
-
-
-@dataclass
-class RuntimeConfig:
-    model: str = field(default_factory=_load_preferred_model)
-    temperature: float = 0.7
-    max_tokens: int = 40000
-    api_key: str | None = None
-    api_base: str | None = None
+from dataclasses import dataclass

+from framework.config import RuntimeConfig

 default_config = RuntimeConfig()

@@ -41,6 +16,11 @@ class AgentMetadata:
        "multi-source search, quality evaluation, and synthesis - with TUI conversation "
        "at key checkpoints for user guidance and feedback."
    )
+    intro_message: str = (
+        "Hi! I'm your deep research assistant. Tell me a topic and I'll investigate it "
+        "thoroughly — searching multiple sources, evaluating quality, and synthesizing "
+        "a comprehensive report. What would you like me to research?"
+    )


 metadata = AgentMetadata()
@@ -10,8 +10,13 @@ intake_node = NodeSpec(
    description="Discuss the research topic with the user, clarify scope, and confirm direction",
    node_type="event_loop",
    client_facing=True,
+    max_node_visits=0,
    input_keys=["topic"],
    output_keys=["research_brief"],
+    success_criteria=(
+        "The research brief is specific and actionable: it states the topic, "
+        "the key questions to answer, the desired scope, and depth."
+    ),
    system_prompt="""\
 You are a research intake specialist. The user wants to research a topic.
 Have a brief conversation to clarify what they need.
@@ -38,10 +43,14 @@ research_node = NodeSpec(
    name="Research",
    description="Search the web, fetch source content, and compile findings",
    node_type="event_loop",
-    max_node_visits=3,
+    max_node_visits=0,
    input_keys=["research_brief", "feedback"],
    output_keys=["findings", "sources", "gaps"],
    nullable_output_keys=["feedback"],
+    success_criteria=(
+        "Findings reference at least 3 distinct sources with URLs. "
+        "Key claims are substantiated by fetched content, not generated."
+    ),
    system_prompt="""\
 You are a research agent. Given a research brief, find and analyze sources.

@@ -56,18 +65,26 @@ Work in phases:
   and any contradictions between sources.

 Important:
- Work in batches of 3-4 tool calls at a time to manage context
+- Work in batches of 3-4 tool calls at a time — never more than 10 per turn
 - After each batch, assess whether you have enough material
 - Prefer quality over quantity — 5 good sources beat 15 thin ones
 - Track which URL each finding comes from (you'll need citations later)
+- Call set_output for each key in a SEPARATE turn (not in the same turn as other tool calls)

-When done, use set_output:
+When done, use set_output (one key at a time, separate turns):
 - set_output("findings", "Structured summary: key findings with source URLs for each claim. \
 Include themes, contradictions, and confidence levels.")
 - set_output("sources", [{"url": "...", "title": "...", "summary": "..."}])
 - set_output("gaps", "What aspects of the research brief are NOT well-covered yet, if any.")
 """,
-    tools=["web_search", "web_scrape", "load_data", "save_data", "list_data_files"],
+    tools=[
+        "web_search",
+        "web_scrape",
+        "load_data",
+        "save_data",
+        "append_data",
+        "list_data_files",
+    ],
 )

 # Node 3: Review (client-facing)
@@ -78,9 +95,13 @@ review_node = NodeSpec(
    description="Present findings to user and decide whether to research more or write the report",
    node_type="event_loop",
    client_facing=True,
-    max_node_visits=3,
+    max_node_visits=0,
    input_keys=["findings", "sources", "gaps", "research_brief"],
    output_keys=["needs_more_research", "feedback"],
+    success_criteria=(
+        "The user has been presented with findings and has explicitly indicated "
+        "whether they want more research or are ready for the report."
+    ),
    system_prompt="""\
 Present the research findings to the user clearly and concisely.

@@ -109,49 +130,70 @@ report_node = NodeSpec(
    description="Write a cited HTML report from the findings and present it to the user",
    node_type="event_loop",
    client_facing=True,
+    max_node_visits=0,
    input_keys=["findings", "sources", "research_brief"],
-    output_keys=["delivery_status"],
+    output_keys=["delivery_status", "next_action"],
+    success_criteria=(
+        "An HTML report has been saved, the file link has been presented to the user, "
+        "and the user has indicated what they want to do next."
+    ),
    system_prompt="""\
-Write a comprehensive research report as an HTML file and present it to the user.
+Write a research report as an HTML file and present it to the user.

-**STEP 1 — Write the HTML report (tool calls, NO text to user yet):**
+IMPORTANT: save_data requires TWO separate arguments: filename and data.
+Call it like: save_data(filename="report.html", data="<html>...</html>")
+Do NOT use _raw, do NOT nest arguments inside a JSON string.

-1. Compose a complete, self-contained HTML document with embedded CSS styling.
-   Use a clean, readable design: max-width container, pleasant typography,
-   numbered citation links, a table of contents, and a references section.
+**STEP 1 — Write and save the HTML report (tool calls, NO text to user yet):**

-   Report structure inside the HTML:
-   - Title & date
-   - Executive Summary (2-3 paragraphs)
-   - Table of Contents
-   - Findings (organized by theme, with [n] citation links)
-   - Analysis (synthesis, implications, areas of debate)
-   - Conclusion (key takeaways, confidence assessment)
-   - References (numbered list with clickable URLs)
+Build a clean HTML document. Keep the HTML concise — aim for clarity over length.
+Use minimal embedded CSS (a few lines of style, not a full framework).

-   Requirements:
-   - Every factual claim must cite its source with [n] notation
-   - Be objective — present multiple viewpoints where sources disagree
-   - Distinguish well-supported conclusions from speculation
-   - Answer the original research questions from the brief
+Report structure:
+- Title & date
+- Executive Summary (2-3 paragraphs)
+- Key Findings (organized by theme, with [n] citation links)
+- Analysis (synthesis, implications)
+- Conclusion (key takeaways)
+- References (numbered list with clickable URLs)

-2. Save the HTML file:
-   save_data(filename="report.html", data=<your_html>)
+Requirements:
+- Every factual claim must cite its source with [n] notation
+- Be objective — present multiple viewpoints where sources disagree
+- Answer the original research questions from the brief

-3. Get the clickable link:
-   serve_file_to_user(filename="report.html", label="Research Report")
+Save the HTML:
+  save_data(filename="report.html", data="<html>...</html>")
+
+Then get the clickable link:
+  serve_file_to_user(filename="report.html", label="Research Report")
+
+If save_data fails, simplify and shorten the HTML, then retry.

 **STEP 2 — Present the link to the user (text only, NO tool calls):**

 Tell the user the report is ready and include the file:// URI from
 serve_file_to_user so they can click it to open. Give a brief summary
-of what the report covers. Ask if they have questions.
+of what the report covers. Ask if they have questions or want to continue.

 **STEP 3 — After the user responds:**
- Answer follow-up questions from the research material
- When the user is satisfied: set_output("delivery_status", "completed")
+- Answer any follow-up questions from the research material
+- When the user is ready to move on, ask what they'd like to do next:
+  - Research a new topic?
+  - Dig deeper into the current topic?
+- Then call set_output:
+  - set_output("delivery_status", "completed")
+  - set_output("next_action", "new_topic")       — if they want a new topic
+  - set_output("next_action", "more_research")   — if they want deeper research
 """,
-    tools=["save_data", "serve_file_to_user", "load_data", "list_data_files"],
+    tools=[
+        "save_data",
+        "append_data",
+        "edit_data",
+        "serve_file_to_user",
+        "load_data",
+        "list_data_files",
+    ],
 )

 __all__ = [
@@ -141,6 +141,12 @@ for f in ~/.zshrc ~/.bashrc ~/.profile; do [ -f "$f" ] && grep -q 'HIVE_CREDENTI
 - **In shell config but NOT in current session** — run `source ~/.zshrc` (or `~/.bashrc`) first, then proceed
 - **Not set anywhere** — `EncryptedFileStorage` will auto-generate one. After storing, tell the user to persist it: `export HIVE_CREDENTIAL_KEY="{generated_key}"` in their shell profile

+> **⚠️ IMPORTANT: After adding `HIVE_CREDENTIAL_KEY` to the user's shell config, always display:**
+> ```
+> ⚠️  Environment variables were added to your shell config.
+>     Open a NEW TERMINAL for them to take effect outside this session.
+> ```
+
 #### Option 1: Aden Platform (OAuth)

 This is the recommended flow for supported integrations (HubSpot, etc.).
@@ -202,6 +208,12 @@ if success:
    print(f"Run: {source_cmd}")
 ```

+> **⚠️ IMPORTANT: After adding `ADEN_API_KEY` to the user's shell config, always display:**
+> ```
+> ⚠️  Environment variables were added to your shell config.
+>     Open a NEW TERMINAL for them to take effect outside this session.
+> ```
+
 Also save to `~/.hive/configuration.json` for the framework:

 ```python
@@ -460,9 +472,14 @@ result: HealthCheckResult = check_credential_health("hubspot", token_value)
 The local encrypted store requires `HIVE_CREDENTIAL_KEY` to encrypt/decrypt credentials.

 - If the user doesn't have one, `EncryptedFileStorage` will auto-generate one and log it
- The user MUST persist this key (e.g., in `~/.bashrc` or a secrets manager)
+- The user MUST persist this key (e.g., in `~/.bashrc`/`~/.zshrc` or a secrets manager)
 - Without this key, stored credentials cannot be decrypted
- This is the ONLY secret that should live in `~/.bashrc` or environment config
+
+**Shell config rule:** Only TWO keys belong in shell config (`~/.zshrc`/`~/.bashrc`):
+- `HIVE_CREDENTIAL_KEY` — encryption key for the credential store
+- `ADEN_API_KEY` — Aden platform auth key (needed before the store can sync)
+
+All other API keys (Brave, Google, HubSpot, etc.) must go in the encrypted store only. **Never offer to add them to shell config.**

 If `HIVE_CREDENTIAL_KEY` is not set:

@@ -475,6 +492,7 @@ If `HIVE_CREDENTIAL_KEY` is not set:
 - **NEVER** log, print, or echo credential values in tool output
 - **NEVER** store credentials in plaintext files, git-tracked files, or agent configs
 - **NEVER** hardcode credentials in source code
+- **NEVER** offer to save API keys to shell config (`~/.zshrc`/`~/.bashrc`) — the **only** keys that belong in shell config are `HIVE_CREDENTIAL_KEY` and `ADEN_API_KEY`. All other credentials (Brave, Google, HubSpot, GitHub, Resend, etc.) go in the encrypted store only.
 - **ALWAYS** use `SecretStr` from Pydantic when handling credential values in Python
 - **ALWAYS** use the local encrypted store (`~/.hive/credentials`) for persistence
 - **ALWAYS** run health checks before storing credentials (when possible)
@@ -490,7 +508,7 @@ All credential specs are defined in `tools/src/aden_tools/credentials/`:
 | `llm.py`          | LLM Providers | `anthropic`                                   | No             |
 | `search.py`       | Search Tools  | `brave_search`, `google_search`, `google_cse` | No             |
 | `email.py`        | Email         | `resend`                                      | No             |
-| `integrations.py` | Integrations  | `github`, `hubspot`                           | No / Yes       |
+| `integrations.py` | Integrations  | `github`, `hubspot`, `google_calendar_oauth`  | No / Yes       |

 **Note:** Additional LLM providers (Cerebras, Groq, OpenAI) are handled by LiteLLM via environment
 variables (`CEREBRAS_API_KEY`, `GROQ_API_KEY`, `OPENAI_API_KEY`) but are not yet in CREDENTIAL_SPECS.
@@ -601,18 +619,22 @@ All credentials are now configured:
 │                      ✅ CREDENTIALS CONFIGURED                              │
 ├─────────────────────────────────────────────────────────────────────────────┤
 │                                                                             │
+│     OPEN A NEW TERMINAL before running commands below.                      │
+│     Environment variables were saved to your shell config but               │
+│     only take effect in new terminal sessions.                              │
+│                                                                             │
 │  NEXT STEPS:                                                                │
 │                                                                             │
 │  1. RUN YOUR AGENT:                                                         │
 │                                                                             │
-│     PYTHONPATH=core:exports python -m research-agent tui                    │
+│     hive tui                                                                │
 │                                                                             │
 │  2. IF YOU ENCOUNTER ISSUES, USE THE DEBUGGER:                              │
 │                                                                             │
 │     /hive-debugger                                                          │
 │                                                                             │
 │     The debugger analyzes runtime logs, identifies retry loops, tool        │
-│     failures, stalled execution, and provides actionable fix suggestions.  │
+│     failures, stalled execution, and provides actionable fix suggestions.   │
 │                                                                             │
 └─────────────────────────────────────────────────────────────────────────────┘
 ```
@@ -26,6 +26,17 @@ Use `/hive-debugger` when:

 This skill works alongside agents running in TUI mode and provides supervisor-level insights into execution behavior.

+### Forever-Alive Agent Awareness
+
+Some agents use `terminal_nodes=[]` (the "forever-alive" pattern), meaning they loop indefinitely and never enter a "completed" execution state. For these agents:
+- Sessions with status "in_progress" or "paused" are **normal**, not failures
+- High step counts, long durations, and many node visits are expected behavior
+- The agent stops only when the user explicitly exits — there is no graph-driven completion
+- Debug focus should be on **quality of individual node visits and iterations**, not whether the session reached a terminal state
+- Conversation memory accumulates across loops — watch for context overflow and stale data issues
+
+**How to identify forever-alive agents:** Check `agent.py` or `agent.json` for `terminal_nodes=[]` (empty list). If empty, the agent is forever-alive.
+
 ---

 ## Prerequisites
@@ -34,7 +45,7 @@ Before using this skill, ensure:
 1. You have an exported agent in `exports/{agent_name}/`
 2. The agent has been run at least once (logs exist)
 3. Runtime logging is enabled (default in Hive framework)
-4. You have access to the agent's working directory at `~/.hive/{agent_name}/`
+4. You have access to the agent's working directory at `~/.hive/agents/{agent_name}/`

 ---

@@ -47,11 +58,11 @@ Before using this skill, ensure:
 **What to do:**

 1. **Ask the developer which agent needs debugging:**
-   - Get agent name (e.g., "twitter_outreach", "deep_research_agent")
+   - Get agent name (e.g., "deep_research_agent")
   - Confirm the agent exists in `exports/{agent_name}/`

 2. **Determine agent working directory:**
-   - Calculate: `~/.hive/{agent_name}/`
+   - Calculate: `~/.hive/agents/{agent_name}/`
   - Verify this directory exists and contains session logs

 3. **Read agent configuration:**
@@ -66,7 +77,7 @@ Before using this skill, ensure:

 4. **Store context for the debugging session:**
   - agent_name
-   - agent_work_dir (e.g., `/home/user/.hive/twitter_outreach`)
+   - agent_work_dir (e.g., `/home/user/.hive/agents/deep_research_agent`)
   - goal_id
   - success_criteria
   - constraints
@@ -74,19 +85,19 @@ Before using this skill, ensure:

 **Example:**
 ```
-Developer: "My twitter_outreach agent keeps failing"
+Developer: "My deep_research_agent keeps failing"

-You: "I'll help debug the twitter_outreach agent. Let me gather context..."
+You: "I'll help debug deep_research_agent. Let me gather context..."

-[Read exports/twitter_outreach/agent.json]
+[Read exports/deep_research_agent/agent.json]

 Context gathered:
- Agent: twitter_outreach
- Goal: twitter-outreach-multi-loop
- Working Directory: /home/user/.hive/twitter_outreach
- Success Criteria: ["Successfully send 5 personalized outreach messages"]
- Constraints: ["Must verify handle exists", "Must personalize message"]
- Nodes: ["intake-collector", "profile-analyzer", "message-composer", "outreach-sender"]
+- Agent: deep_research_agent
+- Goal: deep-research
+- Working Directory: /home/user/.hive/agents/deep_research_agent
+- Success Criteria: ["Produce a comprehensive research report with cited sources"]
+- Constraints: ["Must cite all sources", "Must cover multiple perspectives"]
+- Nodes: ["intake", "research", "analysis", "report-writer"]
 ```

 ---
@@ -142,6 +153,7 @@ Store the selected mode for the session.
   - Check `attention_summary.categories` for issue types
   - Note the `run_id` of problematic sessions
   - Check `status` field: "degraded", "failure", "in_progress"
+   - **For forever-alive agents:** Sessions with status "in_progress" or "paused" are normal — these agents never reach "completed". Only flag sessions with `needs_attention: true` or actual error indicators (tool failures, retry loops, missing outputs). High step counts alone do not indicate a problem.

 3. **Attention flag triggers to understand:**
   From runtime_logger.py, runs are flagged when:
@@ -199,13 +211,20 @@ Which run would you like to investigate?
   | **Tool Errors** | `tool_error_count > 0`, `attention_reasons` contains "tool_failures" | Tool calls failed (API errors, timeouts, auth issues) |
   | **Retry Loops** | `retry_count > 3`, `verdict_counts.RETRY > 5` | Judge repeatedly rejecting outputs |
   | **Guard Failures** | `guard_reject_count > 0` | Output validation failed (wrong types, missing keys) |
-   | **Stalled Execution** | `total_steps > 20`, `verdict_counts.CONTINUE > 10` | EventLoopNode not making progress |
+   | **Stalled Execution** | `total_steps > 20`, `verdict_counts.CONTINUE > 10` | EventLoopNode not making progress. **Caveat:** Forever-alive agents may legitimately have high step counts — check if agent is blocked at a client-facing node (normal) vs genuinely stuck in a loop |
   | **High Latency** | `latency_ms > 60000`, `avg_step_latency > 5000` | Slow tool calls or LLM responses |
   | **Client-Facing Issues** | `client_input_requested` but no `user_input_received` | Premature set_output before user input |
   | **Edge Routing Errors** | `exit_status == "no_valid_edge"`, `attention_reasons` contains "routing_issue" | No edges match current state |
   | **Memory/Context Issues** | `tokens_used > 100000`, `context_overflow_count > 0` | Conversation history too long |
   | **Constraint Violations** | Compare output against goal constraints | Agent violated goal-level rules |

+   **Forever-Alive Agent Caveat:** If the agent uses `terminal_nodes=[]`, sessions will never reach "completed" status. This is by design. When debugging these agents, focus on:
+   - Whether individual node visits succeed (not whether the graph "finishes")
+   - Quality of each loop iteration — are outputs improving or degrading across loops?
+   - Whether client-facing nodes are correctly blocking for user input
+   - Memory accumulation issues: stale data from previous loops, context overflow across many iterations
+   - Conversation compaction behavior: is the conversation growing unbounded?
+
 3. **Analyze each flagged node:**
   - Node ID and name
   - Exit status
@@ -224,7 +243,7 @@ Which run would you like to investigate?
 ```
 Diagnosis for session_20260206_115718_e22339c5:

-Problem Node: intake-collector
+Problem Node: research
 ├─ Exit Status: escalate
 ├─ Retry Count: 5 (HIGH)
 ├─ Verdict Counts: {RETRY: 5, ESCALATE: 1}
@@ -232,7 +251,7 @@ Problem Node: intake-collector
 ├─ Total Steps: 8
 └─ Categories: Missing Outputs + Retry Loops

-Root Issue: The intake-collector node is stuck in a retry loop because it's not setting required outputs.
+Root Issue: The research node is stuck in a retry loop because it's not setting required outputs.
 ```

 ---
@@ -293,25 +312,25 @@ Root Issue: The intake-collector node is stuck in a retry loop because it's not

 **Example Output:**
 ```
-Root Cause Analysis for intake-collector:
+Root Cause Analysis for research:

 Step-by-step breakdown:

 Step 3:
- Tool Call: web_search(query="@RomuloNevesOf")
- Result: Found Twitter profile information
+- Tool Call: web_search(query="latest AI regulations 2026")
+- Result: Found relevant articles and sources
 - Verdict: RETRY
- Feedback: "Missing required output 'twitter_handles'. You found the handle but didn't call set_output."
+- Feedback: "Missing required output 'research_findings'. You found sources but didn't call set_output."

 Step 4:
- Tool Call: web_search(query="@RomuloNevesOf twitter")
- Result: Found additional Twitter information
+- Tool Call: web_search(query="AI regulation policy 2026")
+- Result: Found additional policy information
 - Verdict: RETRY
- Feedback: "Still missing 'twitter_handles'. Use set_output to save your findings."
+- Feedback: "Still missing 'research_findings'. Use set_output to save your findings."

 Steps 5-7: Similar pattern continues...

-ROOT CAUSE: The node is successfully finding Twitter handles via web_search, but the LLM is not calling set_output to save the results. It keeps searching for more information instead of completing the task.
+ROOT CAUSE: The node is successfully finding research sources via web_search, but the LLM is not calling set_output to save the results. It keeps searching for more information instead of completing the task.
 ```

 ---
@@ -495,11 +514,114 @@ max_node_visits=3  # Prevent getting stuck
 - Confirm it calls set_output eventually
 ```

+#### Template 6: Checkpoint Recovery (Post-Fix Resume)
+
+```markdown
+## Recovery Strategy: Resume from Last Clean Checkpoint
+
+**Situation:** You've fixed the issue, but the failed session is stuck mid-execution
+
+**Solution:** Resume execution from a checkpoint before the failure
+
+### Option A: Auto-Resume from Latest Checkpoint (Recommended)
+
+Use CLI arguments to auto-resume when launching TUI:
+
+```bash
+PYTHONPATH=core:exports python -m {agent_name} --tui \
+    --resume-session {session_id}
+```
+
+This will:
+- Load session state from `state.json`
+- Continue from where it paused/failed
+- Apply your fixes immediately
+
+### Option B: Resume from Specific Checkpoint (Time-Travel)
+
+If you need to go back to an earlier point:
+
+```bash
+PYTHONPATH=core:exports python -m {agent_name} --tui \
+    --resume-session {session_id} \
+    --checkpoint {checkpoint_id}
+```
+
+Example:
+```bash
+PYTHONPATH=core:exports python -m deep_research_agent --tui \
+    --resume-session session_20260208_143022_abc12345 \
+    --checkpoint cp_node_complete_intake_143030
+```
+
+### Option C: Use TUI Commands
+
+Alternatively, launch TUI normally and use commands:
+
+```bash
+# Launch TUI
+PYTHONPATH=core:exports python -m {agent_name} --tui
+
+# In TUI, use commands:
+/resume {session_id}                    # Resume from session state
+/recover {session_id} {checkpoint_id}   # Recover from specific checkpoint
+```
+
+### When to Use Each Option:
+
+**Use `/resume` (or --resume-session) when:**
+- You fixed credentials and want to retry
+- Agent paused and you want to continue
+- Agent failed and you want to retry from last state
+
+**Use `/recover` (or --resume-session + --checkpoint) when:**
+- You need to go back to an earlier checkpoint
+- You want to try a different path from a specific point
+- Debugging requires time-travel to earlier state
+
+### Find Available Checkpoints:
+
+Use MCP tools to programmatically find and inspect checkpoints:
+
+```
+# List all sessions to find the failed one
+list_agent_sessions(agent_work_dir="~/.hive/agents/{agent_name}", status="failed")
+
+# Inspect session state
+get_agent_session_state(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}")
+
+# Find clean checkpoints to resume from
+list_agent_checkpoints(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}", is_clean="true")
+
+# Compare checkpoints to understand what changed
+compare_agent_checkpoints(
+    agent_work_dir="~/.hive/agents/{agent_name}",
+    session_id="{session_id}",
+    checkpoint_id_before="cp_node_complete_intake_143030",
+    checkpoint_id_after="cp_node_complete_research_143115"
+)
+
+# Inspect memory at a specific checkpoint
+get_agent_checkpoint(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}", checkpoint_id="cp_node_complete_intake_143030")
+```
+
+Or in TUI:
+```bash
+/sessions {session_id}
+```
+
+**Verification:**
+- Use `--resume-session` to test your fix immediately
+- No need to re-run from the beginning
+- Session continues with your code changes applied
+```
+
 **Selecting the right template:**
 - Match the issue category from Stage 4
 - Customize with specific details from Stage 5
 - Include actual error messages and code snippets
 - Provide file paths and line numbers when possible
+- **Always include recovery commands** (Template 6) after providing fix recommendations

 ---

@@ -532,7 +654,7 @@ max_node_visits=3  # Prevent getting stuck
   **Check if issue is resolved:**
   ```
   query_runtime_logs(
-       agent_work_dir="~/.hive/{agent_name}",
+       agent_work_dir="~/.hive/agents/{agent_name}",
       status="needs_attention",
       limit=5
   )
@@ -542,7 +664,7 @@ max_node_visits=3  # Prevent getting stuck
   **Verify specific node behavior:**
   ```
   query_runtime_log_details(
-       agent_work_dir="~/.hive/{agent_name}",
+       agent_work_dir="~/.hive/agents/{agent_name}",
       run_id="{new_run_id}",
       node_id="{fixed_node_id}"
   )
@@ -568,7 +690,7 @@ max_node_visits=3  # Prevent getting stuck

 **Example interaction:**
 ```
-Developer: "I applied the fix to intake-collector. How do I verify it works?"
+Developer: "I applied the fix to the research node. How do I verify it works?"

 You: "Great! Let's verify the fix with these steps:

@@ -580,11 +702,11 @@ You: "Great! Let's verify the fix with these steps:
   [Use query_runtime_logs to check for attention flags]

 3. Verify the specific node:
-   [Use query_runtime_log_details for intake-collector]
+   [Use query_runtime_log_details for research]

 Expected results:
 - No 'needs_attention' flags
- intake-collector shows exit_status='success'
+- research shows exit_status='success'
 - retry_count should be 0

 Let me know when you've run it and I'll help check the logs!"
@@ -602,7 +724,7 @@ Let me know when you've run it and I'll help check the logs!"
 - **Example:**
  ```
  query_runtime_logs(
-      agent_work_dir="/home/user/.hive/twitter_outreach",
+      agent_work_dir="/home/user/.hive/agents/deep_research_agent",
      status="needs_attention",
      limit=20
  )
@@ -614,7 +736,7 @@ Let me know when you've run it and I'll help check the logs!"
 - **Example:**
  ```
  query_runtime_log_details(
-      agent_work_dir="/home/user/.hive/twitter_outreach",
+      agent_work_dir="/home/user/.hive/agents/deep_research_agent",
      run_id="session_20260206_115718_e22339c5",
      needs_attention_only=True
  )
@@ -626,9 +748,83 @@ Let me know when you've run it and I'll help check the logs!"
 - **Example:**
  ```
  query_runtime_log_raw(
-      agent_work_dir="/home/user/.hive/twitter_outreach",
+      agent_work_dir="/home/user/.hive/agents/deep_research_agent",
      run_id="session_20260206_115718_e22339c5",
-      node_id="intake-collector"
+      node_id="research"
+  )
+  ```
+
+### Session & Checkpoint Tools
+
+**list_agent_sessions** - Browse sessions with filtering
+- **When to use:** Finding resumable sessions, identifying failed sessions, Stage 3 triage
+- **Returns:** Session list with status, timestamps, is_resumable, current_node, quality
+- **Example:**
+  ```
+  list_agent_sessions(
+      agent_work_dir="/home/user/.hive/agents/deep_research_agent",
+      status="failed",
+      limit=10
+  )
+  ```
+
+**get_agent_session_state** - Load full session state (excludes memory values)
+- **When to use:** Inspecting session progress, checking is_resumable, examining path
+- **Returns:** Full state with memory_keys/memory_size instead of memory values
+- **Example:**
+  ```
+  get_agent_session_state(
+      agent_work_dir="/home/user/.hive/agents/deep_research_agent",
+      session_id="session_20260208_143022_abc12345"
+  )
+  ```
+
+**get_agent_session_memory** - Get memory contents from a session
+- **When to use:** Stage 5 root cause analysis, inspecting produced data
+- **Returns:** All memory keys+values, or a single key's value
+- **Example:**
+  ```
+  get_agent_session_memory(
+      agent_work_dir="/home/user/.hive/agents/deep_research_agent",
+      session_id="session_20260208_143022_abc12345",
+      key="research_findings"
+  )
+  ```
+
+**list_agent_checkpoints** - List checkpoints for a session
+- **When to use:** Stage 6 recovery, finding clean checkpoints to resume from
+- **Returns:** Checkpoint summaries with type, node, clean status
+- **Example:**
+  ```
+  list_agent_checkpoints(
+      agent_work_dir="/home/user/.hive/agents/deep_research_agent",
+      session_id="session_20260208_143022_abc12345",
+      is_clean="true"
+  )
+  ```
+
+**get_agent_checkpoint** - Load a specific checkpoint with full state
+- **When to use:** Inspecting exact state at a checkpoint, comparing to current state
+- **Returns:** Full checkpoint: memory snapshot, execution path, metrics
+- **Example:**
+  ```
+  get_agent_checkpoint(
+      agent_work_dir="/home/user/.hive/agents/deep_research_agent",
+      session_id="session_20260208_143022_abc12345",
+      checkpoint_id="cp_node_complete_intake_143030"
+  )
+  ```
+
+**compare_agent_checkpoints** - Diff memory between two checkpoints
+- **When to use:** Understanding data flow, finding where state diverged
+- **Returns:** Memory diff (added/removed/changed keys) + execution path diff
+- **Example:**
+  ```
+  compare_agent_checkpoints(
+      agent_work_dir="/home/user/.hive/agents/deep_research_agent",
+      session_id="session_20260208_143022_abc12345",
+      checkpoint_id_before="cp_node_complete_intake_143030",
+      checkpoint_id_after="cp_node_complete_research_143115"
  )
  ```

@@ -654,27 +850,37 @@ Loop every 10 seconds:
  2. If found: Alert and drill into L2
 ```

+**Pattern 4: Session State + Checkpoint Recovery**
+```
+1. list_agent_sessions: Find failed/paused sessions
+2. get_agent_session_state: Check is_resumable, see execution path
+3. get_agent_session_memory: Inspect what data was produced
+4. list_agent_checkpoints: Find clean checkpoints before failure
+5. compare_agent_checkpoints: Understand what changed between checkpoints
+6. Recommend resume command with specific checkpoint
+```
+
 ---

 ## Complete Example Walkthrough

-### Scenario: twitter_outreach Agent Failing
+### Scenario: deep_research_agent Failing

-**Developer says:** "My twitter_outreach agent keeps failing. It's not finding Twitter handles correctly."
+**Developer says:** "My deep_research_agent keeps failing. It's not saving research findings correctly."

 #### Stage 1: Setup
 ```
-You: "I'll help debug the twitter_outreach agent. Let me gather context..."
+You: "I'll help debug deep_research_agent. Let me gather context..."

-[Read exports/twitter_outreach/agent.json]
+[Read exports/deep_research_agent/agent.json]

 Context:
- Agent: twitter_outreach
- Goal: twitter-outreach-multi-loop
- Working Dir: ~/.hive/twitter_outreach
+- Agent: deep_research_agent
+- Goal: deep-research
+- Working Dir: ~/.hive/agents/deep_research_agent
 - Success Criteria: ["Successfully send 5 personalized outreach messages"]
 - Constraints: ["Must verify handle exists", "Must personalize message"]
- Nodes: intake-collector, profile-analyzer, message-composer, outreach-sender
+- Nodes: intake, research, analysis, report-writer
 ```

 #### Stage 2: Mode Selection
@@ -706,31 +912,31 @@ You: "Analyzing node-level details..."
 [Call query_runtime_log_details for the session]

 Results:
- Problem Node: intake-collector
+- Problem Node: research
 - Exit Status: escalate
 - Retry Count: 5
 - Verdict Counts: {RETRY: 5, ESCALATE: 1}
 - Attention Reasons: ["high_retry_count", "missing_outputs"]
 - Total Steps: 8

-Diagnosis: intake-collector has retry loop + missing outputs issue
+Diagnosis: the research node has a retry loop + missing outputs issue
 Category: Missing Outputs + Retry Loops
 ```

 #### Stage 5: Root Cause (L3)
 ```
-You: "Drilling into detailed logs for intake-collector..."
+You: "Drilling into detailed logs for research..."

-[Call query_runtime_log_raw for node intake-collector]
+[Call query_runtime_log_raw for node research]

 Evidence:
-Step 3: web_search("@RomuloNevesOf") → Found profile
+Step 3: web_search("latest AI regulations 2026") → Found articles
  Verdict: RETRY
-  Feedback: "Missing required output 'twitter_handles'. You found the handle but didn't call set_output."
+  Feedback: "Missing required output 'research_findings'. You found sources but didn't call set_output."

-Step 4: web_search("@RomuloNevesOf twitter") → Found more info
+Step 4: web_search("AI regulation policy 2026") → Found more info
  Verdict: RETRY
-  Feedback: "Still missing 'twitter_handles'. Use set_output to save your findings."
+  Feedback: "Still missing 'research_findings'. Use set_output to save your findings."

 Steps 5-7: Same pattern repeats...

@@ -741,37 +947,37 @@ ROOT CAUSE: Node successfully finds handles but LLM doesn't call set_output. It
 ```
 You: "Here's the fix for this issue:

-## Issue: intake-collector Not Setting Outputs
+## Issue: research Node Not Setting Outputs

-**Root Cause:** Node successfully finds Twitter handles but doesn't call set_output
+**Root Cause:** Node successfully finds research sources but doesn't call set_output

 **Category:** Missing Outputs

-**File to edit:** exports/twitter_outreach/nodes/intake_collector.py
+**File to edit:** exports/deep_research_agent/nodes/research.py

 **Changes:**
 Update the system_prompt to emphasize the output requirement:

 ```python
 system_prompt = """
-You are the Intake Collector for the Twitter outreach agent.
+You are the Research node for the deep research agent.

 Your task:
-1. Extract Twitter handle from user input (URL or @mention)
-2. Verify it's a valid Twitter handle format
-3. **CRITICAL: Call set_output(twitter_handles=[handle]) immediately after finding valid handle**
+1. Search for relevant sources on the research topic
+2. Extract key findings from each source
+3. **CRITICAL: Call set_output(research_findings=[...]) immediately after gathering sufficient findings**

-IMPORTANT: You MUST call set_output even if you found the handle on the first try.
-Do not continue searching once you have a valid handle - set output and finish.
+IMPORTANT: You MUST call set_output even if you found findings on the first search.
+Do not continue searching indefinitely - set output and finish once you have enough data.

 Output format:
 {
-  "twitter_handles": ["@handle1", "@handle2", ...]
+  "research_findings": [{"source": "...", "summary": "...", "url": "..."}, ...]
 }
 """
 ```

-**Why this works:** The explicit instruction to call set_output immediately prevents the LLM from continuing to search after finding the handle.
+**Why this works:** The explicit instruction to call set_output immediately prevents the LLM from continuing to search after finding sufficient sources.
 ```

 #### Stage 7: Verification
@@ -795,13 +1001,13 @@ You: "Checking now..."
 [Call query_runtime_log_details for the new run]

 Results:
- Node: intake-collector
+- Node: research
 - Exit Status: success ✓
 - Retry Count: 0 ✓
 - Total Steps: 2 ✓
 - No attention flags ✓

-"Perfect! The fix worked. The intake-collector now:
+"Perfect! The fix worked. The research node now:
 - Completes successfully on first try
 - No retries needed
 - Calls set_output properly
@@ -828,18 +1034,21 @@ Your agent should now work correctly!"
 3. **Don't ignore edge conditions** - Missing edges cause routing failures
 4. **Don't overlook judge configuration** - Mismatched expectations cause retry loops
 5. **Don't forget nullable_output_keys** - Optional inputs need explicit marking
+6. **Don't diagnose "in_progress" as a failure for forever-alive agents** - Agents with `terminal_nodes=[]` are designed to never enter "completed" state. This is intentional. Focus on quality of individual node visits, not session completion status
+7. **Don't ignore conversation memory issues in long-running sessions** - In continuous conversation mode, history grows across node transitions and loop iterations. Watch for context overflow (tokens_used > 100K), stale data from previous loops affecting edge conditions, and compaction failures that cause the LLM to lose important context
+8. **Don't confuse "waiting for user" with "stalled"** - Client-facing nodes in forever-alive agents block for user input by design. A session paused at a client-facing node is working correctly, not stalled

 ---

 ## Storage Locations Reference

 **New unified storage (default):**
- Logs: `~/.hive/{agent_name}/sessions/session_YYYYMMDD_HHMMSS_{uuid}/logs/`
- State: `~/.hive/{agent_name}/sessions/{session_id}/state.json`
- Conversations: `~/.hive/{agent_name}/sessions/{session_id}/conversations/`
+- Logs: `~/.hive/agents/{agent_name}/sessions/session_YYYYMMDD_HHMMSS_{uuid}/logs/`
+- State: `~/.hive/agents/{agent_name}/sessions/{session_id}/state.json`
+- Conversations: `~/.hive/agents/{agent_name}/sessions/{session_id}/conversations/`

 **Old storage (deprecated, still supported):**
- Logs: `~/.hive/{agent_name}/runtime_logs/runs/{run_id}/`
+- Logs: `~/.hive/agents/{agent_name}/runtime_logs/runs/{run_id}/`

 The MCP tools automatically check both locations.

@@ -1,351 +1,333 @@
-# Example: Testing a YouTube Research Agent
+# Example: Iterative Testing of a Research Agent

-This example walks through testing a YouTube research agent that finds relevant videos based on a topic.
+This example walks through the full iterative test loop for a research agent that searches the web, reviews findings, and produces a cited report.

-## Prerequisites
+## Agent Structure

- Agent built with hive-create skill at `exports/youtube-research/`
- Goal defined with success criteria and constraints
-
-## Step 1: Load the Goal
-
-First, load the goal that was defined during the Goal stage:
-
-```json
-{
-    "id": "youtube-research",
-    "name": "YouTube Research Agent",
-    "description": "Find relevant YouTube videos on a given topic",
-    "success_criteria": [
-        {
-            "id": "find_videos",
-            "description": "Find 3-5 relevant videos",
-            "metric": "video_count",
-            "target": "3-5",
-            "weight": 1.0
-        },
-        {
-            "id": "relevance",
-            "description": "Videos must be relevant to the topic",
-            "metric": "relevance_score",
-            "target": ">0.8",
-            "weight": 0.8
-        }
-    ],
-    "constraints": [
-        {
-            "id": "api_limits",
-            "description": "Must not exceed YouTube API rate limits",
-            "constraint_type": "hard",
-            "category": "technical"
-        },
-        {
-            "id": "content_safety",
-            "description": "Must filter out inappropriate content",
-            "constraint_type": "hard",
-            "category": "safety"
-        }
-    ]
-}
+```
+exports/deep_research_agent/
+├── agent.py          # Goal + graph: intake → research → review → report
+├── nodes/__init__.py # Node definitions (system_prompt, input/output keys)
+├── config.py         # Model config
+├── mcp_servers.json  # Tools: web_search, web_scrape
+└── tests/            # Test files (we'll create these)
 ```

-## Step 2: Get Constraint Test Guidelines
+**Goal:** "Rigorous Interactive Research" — find 5+ diverse sources, cite every claim, produce a complete report.

-During the Goal stage (or early Eval), get test guidelines for constraints:
+---
+
+## Phase 1: Generate Tests
+
+### Read the goal

 ```python
-result = generate_constraint_tests(
-    goal_id="youtube-research",
-    goal_json='<goal JSON above>',
-    agent_path="exports/youtube-research"
-)
+Read(file_path="exports/deep_research_agent/agent.py")
+# Extract: goal_id="rigorous-interactive-research"
+# success_criteria: source-diversity (>=5), citation-coverage (100%), report-completeness (90%)
+# constraints: no-hallucination, source-attribution
 ```

-**The result contains guidelines (not generated tests):**
- `output_file`: Where to write tests
- `file_header`: Imports and fixtures to use
- `test_template`: Format for test functions
- `constraints_formatted`: The constraints to test
- `test_guidelines`: Rules for writing tests
-
-## Step 3: Write Constraint Tests
-
-Using the guidelines, write tests directly with the Write tool:
-
-```python
-# Write constraint tests using the provided file_header and guidelines
-Write(
-    file_path="exports/youtube-research/tests/test_constraints.py",
-    content='''
-"""Constraint tests for youtube-research agent."""
-
-import os
-import pytest
-from exports.youtube_research import default_agent
-
-
-pytestmark = pytest.mark.skipif(
-    not os.environ.get("ANTHROPIC_API_KEY") and not os.environ.get("MOCK_MODE"),
-    reason="API key required for real testing."
-)
-
-
-@pytest.mark.asyncio
-async def test_constraint_api_limits_respected():
-    """Verify API rate limits are not exceeded."""
-    import time
-    mock_mode = bool(os.environ.get("MOCK_MODE"))
-
-    for i in range(10):
-        result = await default_agent.run({"topic": f"test_{i}"}, mock_mode=mock_mode)
-        time.sleep(0.1)
-
-    # Should complete without rate limit errors
-    assert "rate limit" not in str(result).lower()
-
-
-@pytest.mark.asyncio
-async def test_constraint_content_safety_filter():
-    """Verify inappropriate content is filtered."""
-    mock_mode = bool(os.environ.get("MOCK_MODE"))
-    result = await default_agent.run({"topic": "general topic"}, mock_mode=mock_mode)
-
-    for video in result.videos:
-        assert video.safe_for_work is True
-        assert video.age_restricted is False
-'''
-)
-```
-
-## Step 4: Get Success Criteria Test Guidelines
-
-After the agent is built, get success criteria test guidelines:
+### Get test guidelines

 ```python
 result = generate_success_tests(
-    goal_id="youtube-research",
-    goal_json='<goal JSON>',
-    node_names="search_node,filter_node,rank_node,format_node",
-    tool_names="youtube_search,video_details,channel_info",
-    agent_path="exports/youtube-research"
+    goal_id="rigorous-interactive-research",
+    goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "description": "Use multiple diverse sources", "target": ">=5"}, {"id": "citation-coverage", "description": "Every claim cites its source", "target": "100%"}, {"id": "report-completeness", "description": "Report answers the research questions", "target": "90%"}]}',
+    node_names="intake,research,review,report",
+    tool_names="web_search,web_scrape",
+    agent_path="exports/deep_research_agent"
 )
 ```

-## Step 5: Write Success Criteria Tests
-
-Using the guidelines, write success criteria tests:
+### Write tests

 ```python
 Write(
-    file_path="exports/youtube-research/tests/test_success_criteria.py",
-    content='''
-"""Success criteria tests for youtube-research agent."""
-
-import os
-import pytest
-from exports.youtube_research import default_agent
-
-
-pytestmark = pytest.mark.skipif(
-    not os.environ.get("ANTHROPIC_API_KEY") and not os.environ.get("MOCK_MODE"),
-    reason="API key required for real testing."
-)
-
+    file_path="exports/deep_research_agent/tests/test_success_criteria.py",
+    content=result["file_header"] + '''

@pytest.mark.asyncio
-async def test_find_videos_happy_path():
-    """Test finding videos for a common topic."""
-    mock_mode = bool(os.environ.get("MOCK_MODE"))
-    result = await default_agent.run({"topic": "machine learning"}, mock_mode=mock_mode)
-
-    assert result.success
-    assert 3 <= len(result.videos) <= 5
-    assert all(v.title for v in result.videos)
-    assert all(v.video_id for v in result.videos)
-
+async def test_success_source_diversity(runner, auto_responder, mock_mode):
+    """At least 5 diverse sources are found."""
+    await auto_responder.start()
+    try:
+        result = await runner.run({"query": "impact of remote work on productivity"})
+    finally:
+        await auto_responder.stop()
+    assert result.success, f"Agent failed: {result.error}"
+    output = result.output or {}
+    sources = output.get("sources", [])
+    if isinstance(sources, list):
+        assert len(sources) >= 5, f"Expected >= 5 sources, got {len(sources)}"

@pytest.mark.asyncio
-async def test_find_videos_minimum_boundary():
-    """Test at minimum threshold (3 videos)."""
-    mock_mode = bool(os.environ.get("MOCK_MODE"))
-    result = await default_agent.run({"topic": "niche topic xyz"}, mock_mode=mock_mode)
-
-    assert len(result.videos) >= 3
-
+async def test_success_citation_coverage(runner, auto_responder, mock_mode):
+    """Every factual claim in the report cites its source."""
+    await auto_responder.start()
+    try:
+        result = await runner.run({"query": "climate change effects on agriculture"})
+    finally:
+        await auto_responder.stop()
+    assert result.success, f"Agent failed: {result.error}"
+    output = result.output or {}
+    report = output.get("report", "")
+    # Check that report contains numbered references
+    assert "[1]" in str(report) or "[source" in str(report).lower(), "Report lacks citations"

@pytest.mark.asyncio
-async def test_relevance_score_threshold():
-    """Test relevance scoring meets threshold."""
-    mock_mode = bool(os.environ.get("MOCK_MODE"))
-    result = await default_agent.run({"topic": "python programming"}, mock_mode=mock_mode)
-
-    for video in result.videos:
-        assert video.relevance_score > 0.8
-
+async def test_success_report_completeness(runner, auto_responder, mock_mode):
+    """Report addresses the original research question."""
+    query = "pros and cons of nuclear energy"
+    await auto_responder.start()
+    try:
+        result = await runner.run({"query": query})
+    finally:
+        await auto_responder.stop()
+    assert result.success, f"Agent failed: {result.error}"
+    output = result.output or {}
+    report = output.get("report", "")
+    assert len(str(report)) > 200, f"Report too short: {len(str(report))} chars"

@pytest.mark.asyncio
-async def test_find_videos_no_results_graceful():
-    """Test graceful handling of no results."""
-    mock_mode = bool(os.environ.get("MOCK_MODE"))
-    result = await default_agent.run({"topic": "xyznonexistent123"}, mock_mode=mock_mode)
+async def test_empty_query_handling(runner, auto_responder, mock_mode):
+    """Agent handles empty input gracefully."""
+    await auto_responder.start()
+    try:
+        result = await runner.run({"query": ""})
+    finally:
+        await auto_responder.stop()
+    output = result.output or {}
+    assert not result.success or output.get("error"), "Should handle empty query"

-    # Should not crash, return empty or message
-    assert result.videos == [] or result.message
+@pytest.mark.asyncio
+async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
+    """Feedback loop between review and research terminates."""
+    await auto_responder.start()
+    try:
+        result = await runner.run({"query": "quantum computing basics"})
+    finally:
+        await auto_responder.stop()
+    visits = result.node_visit_counts or {}
+    for node_id, count in visits.items():
+        assert count <= 5, f"Node {node_id} visited {count} times"
 '''
 )
 ```

-## Step 6: Run All Tests
+---

-Execute all tests:
+## Phase 2: First Execution

 ```python
-result = run_tests(
-    goal_id="youtube-research",
-    agent_path="exports/youtube-research",
-    test_types='["all"]',
-    parallel=4
+run_tests(
+    goal_id="rigorous-interactive-research",
+    agent_path="exports/deep_research_agent",
+    fail_fast=True
 )
 ```

-**Results:**
-
+**Result:**
 ```json
 {
-    "goal_id": "youtube-research",
-    "overall_passed": false,
-    "summary": {
-        "total": 6,
-        "passed": 5,
-        "failed": 1,
-        "pass_rate": "83.3%"
-    },
-    "duration_ms": 4521,
-    "results": [
-        {"test_id": "test_constraint_api_001", "passed": true, "duration_ms": 1234},
-        {"test_id": "test_constraint_content_001", "passed": true, "duration_ms": 456},
-        {"test_id": "test_success_001", "passed": true, "duration_ms": 789},
-        {"test_id": "test_success_002", "passed": true, "duration_ms": 654},
-        {"test_id": "test_success_003", "passed": true, "duration_ms": 543},
-        {"test_id": "test_success_004", "passed": false, "duration_ms": 845,
-         "error_category": "IMPLEMENTATION_ERROR",
-         "error_message": "TypeError: 'NoneType' object has no attribute 'videos'"}
-    ]
+  "overall_passed": false,
+  "summary": {"total": 5, "passed": 3, "failed": 2, "pass_rate": "60.0%"},
+  "failures": [
+    {"test_name": "test_success_source_diversity", "details": "AssertionError: Expected >= 5 sources, got 2"},
+    {"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
+  ]
 }
 ```

-## Step 7: Debug the Failed Test
+---
+
+## Phase 3: Analyze (Iteration 1)
+
+### Debug the first failure

 ```python
-result = debug_test(
-    goal_id="youtube-research",
-    test_name="test_find_videos_no_results_graceful",
-    agent_path="exports/youtube-research"
+debug_test(
+    goal_id="rigorous-interactive-research",
+    test_name="test_success_source_diversity",
+    agent_path="exports/deep_research_agent"
+)
+# Category: ASSERTION_FAILURE — Expected >= 5 sources, got 2
+```
+
+### Find the session and inspect memory
+
+```python
+list_agent_sessions(
+    agent_work_dir="~/.hive/agents/deep_research_agent",
+    status="completed",
+    limit=1
+)
+# → session_20260209_150000_abc12345
+
+get_agent_session_memory(
+    agent_work_dir="~/.hive/agents/deep_research_agent",
+    session_id="session_20260209_150000_abc12345",
+    key="research_results"
+)
+# → Only 2 sources found. LLM stopped searching after 2 queries.
+```
+
+### Check LLM behavior in the research node
+
+```python
+query_runtime_log_raw(
+    agent_work_dir="~/.hive/agents/deep_research_agent",
+    run_id="session_20260209_150000_abc12345",
+    node_id="research"
+)
+# → LLM called web_search twice, got results, immediately called set_output.
+# → Prompt doesn't instruct it to find at least 5 sources.
+```
+
+**Root cause:** The research node's system_prompt doesn't specify minimum source requirements.
+
+---
+
+## Phase 4: Fix (Iteration 1)
+
+```python
+Read(file_path="exports/deep_research_agent/nodes/__init__.py")
+
+# Fix the research node prompt
+Edit(
+    file_path="exports/deep_research_agent/nodes/__init__.py",
+    old_string='system_prompt="Search for information on the user\'s topic using web search."',
+    new_string='system_prompt="Search for information on the user\'s topic using web search. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries with varied keywords. Do NOT call set_output until you have gathered at least 5 distinct sources from different domains."'
 )
 ```

-**Debug Output:**
+---

+## Phase 5: Recover & Resume (Iteration 1)
+
+The fix is to the `research` node. Since this was a `run_tests` execution (no checkpoints), we re-run from scratch:
+
+```python
+run_tests(
+    goal_id="rigorous-interactive-research",
+    agent_path="exports/deep_research_agent",
+    fail_fast=True
+)
+```
+
+**Result:**
 ```json
 {
-    "test_id": "test_success_004",
-    "test_name": "test_find_videos_no_results_graceful",
-    "input": {"topic": "xyznonexistent123"},
-    "expected": "Empty list or message",
-    "actual": {"error": "TypeError: 'NoneType' object has no attribute 'videos'"},
-    "passed": false,
-    "error_message": "TypeError: 'NoneType' object has no attribute 'videos'",
-    "error_category": "IMPLEMENTATION_ERROR",
-    "stack_trace": "Traceback (most recent call last):\n  File \"filter_node.py\", line 42\n    for video in result.videos:\nTypeError: 'NoneType' object has no attribute 'videos'",
-    "logs": [
-        {"timestamp": "2026-01-20T10:00:01", "node": "search_node", "level": "INFO", "msg": "Searching for: xyznonexistent123"},
-        {"timestamp": "2026-01-20T10:00:02", "node": "search_node", "level": "WARNING", "msg": "No results found"},
-        {"timestamp": "2026-01-20T10:00:02", "node": "filter_node", "level": "ERROR", "msg": "NoneType error"}
-    ],
-    "runtime_data": {
-        "execution_path": ["start", "search_node", "filter_node"],
-        "node_outputs": {
-            "search_node": null
-        }
-    },
-    "suggested_fix": "Add null check in filter_node before accessing .videos attribute",
-    "iteration_guidance": {
-        "stage": "Agent",
-        "action": "Fix the code in nodes/edges",
-        "restart_required": false,
-        "description": "The goal is correct, but filter_node doesn't handle null results from search_node."
-    }
+  "overall_passed": false,
+  "summary": {"total": 5, "passed": 4, "failed": 1, "pass_rate": "80.0%"},
+  "failures": [
+    {"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
+  ]
 }
 ```

-## Step 8: Iterate Based on Category
+Source diversity now passes. Citation coverage still fails.

-Since this is an **IMPLEMENTATION_ERROR**, we:
+---

-1. **Don't restart** the Goal → Agent → Eval flow
-2. **Fix the agent** using hive-create skill:
-   - Modify `filter_node` to handle null results
-3. **Re-run Eval** (tests only)
-
-### Fix in hive-create:
+## Phase 3: Analyze (Iteration 2)

 ```python
-# Update the filter_node to handle null
-add_node(
-    node_id="filter_node",
-    name="Filter Node",
-    description="Filter and rank videos",
-    node_type="function",
-    input_keys=["search_results"],
-    output_keys=["filtered_videos"],
-    system_prompt="""
-    Filter videos by relevance.
-    IMPORTANT: Handle case where search_results is None or empty.
-    Return empty list if no results.
-    """
+debug_test(
+    goal_id="rigorous-interactive-research",
+    test_name="test_success_citation_coverage",
+    agent_path="exports/deep_research_agent"
+)
+# Category: ASSERTION_FAILURE — Report lacks citations
+
+# Check what the report node produced
+list_agent_sessions(
+    agent_work_dir="~/.hive/agents/deep_research_agent",
+    status="completed",
+    limit=1
+)
+# → session_20260209_151500_def67890
+
+get_agent_session_memory(
+    agent_work_dir="~/.hive/agents/deep_research_agent",
+    session_id="session_20260209_151500_def67890",
+    key="report"
+)
+# → Report text exists but uses no numbered references.
+# → Sources are in memory but report node doesn't cite them.
+```
+
+**Root cause:** The report node's prompt doesn't instruct the LLM to include numbered citations.
+
+---
+
+## Phase 4: Fix (Iteration 2)
+
+```python
+Edit(
+    file_path="exports/deep_research_agent/nodes/__init__.py",
+    old_string='system_prompt="Write a comprehensive report based on the research findings."',
+    new_string='system_prompt="Write a comprehensive report based on the research findings. You MUST include numbered citations [1], [2], etc. for every factual claim. At the end, include a References section listing all sources with their URLs. Every claim must be traceable to a specific source."'
 )
 ```

-### Re-export and re-test:
+---
+
+## Phase 5: Resume (Iteration 2)
+
+The fix is to the `report` node (the last node). To demonstrate checkpoint recovery, run via CLI:
+
+```bash
+# Run via CLI to get checkpoints
+uv run hive run exports/deep_research_agent --input '{"query": "climate change effects"}'
+
+# After it runs, find the clean checkpoint before report (this is an MCP tool call, not a shell command)
+list_agent_checkpoints(
+    agent_work_dir="~/.hive/agents/deep_research_agent",
+    session_id="session_20260209_152000_ghi34567",
+    is_clean="true"
+)
+# → cp_node_complete_review_152100 (after review, before report)
+
+# Resume — skips intake, research, review entirely
+uv run hive run exports/deep_research_agent \
+  --resume-session session_20260209_152000_ghi34567 \
+  --checkpoint cp_node_complete_review_152100
+```
+
+Only the `report` node re-runs with the fixed prompt, using research data from the checkpoint.
+
+---
+
+## Phase 6: Final Verification

 ```python
-# Re-export the fixed agent
-export_graph(path="exports/youtube-research")
-
-# Re-run tests
-result = run_tests(
-    goal_id="youtube-research",
-    agent_path="exports/youtube-research",
-    test_types='["all"]'
+run_tests(
+    goal_id="rigorous-interactive-research",
+    agent_path="exports/deep_research_agent"
 )
 ```

-**Updated Results:**
-
+**Result:**
 ```json
 {
-    "goal_id": "youtube-research",
-    "overall_passed": true,
-    "summary": {
-        "total": 6,
-        "passed": 6,
-        "failed": 0,
-        "pass_rate": "100.0%"
-    }
+  "overall_passed": true,
+  "summary": {"total": 5, "passed": 5, "failed": 0, "pass_rate": "100.0%"}
 }
 ```

+All tests pass.
+
+---
+
 ## Summary

-1. **Got guidelines** for constraint tests during Goal stage
-2. **Wrote** constraint tests using Write tool
-3. **Got guidelines** for success criteria tests during Eval stage
-4. **Wrote** success criteria tests using Write tool
-5. **Ran** tests in parallel
-6. **Debugged** the one failure
-7. **Categorized** as IMPLEMENTATION_ERROR
-8. **Fixed** the agent (not the goal)
-9. **Re-ran** Eval only (didn't restart full flow)
-10. **Passed** all tests
+| Iteration | Failure | Root Cause | Fix | Recovery |
+|-----------|---------|------------|-----|----------|
+| 1 | Source diversity (2 < 5) | Research prompt too vague | Added "at least 5 sources" to prompt | Re-run (no checkpoints) |
+| 2 | No citations in report | Report prompt lacks citation instructions | Added citation requirements | Checkpoint resume (skipped 3 nodes) |

-The agent is now validated and ready for production use.
+**Key takeaways:**
+- Phase 3 analysis (session memory + L3 logs) identified root causes without guessing
+- Checkpoint recovery in iteration 2 saved time by skipping 3 expensive nodes
+- Final `run_tests` confirms all scenarios pass end-to-end
@@ -19,14 +19,18 @@ metadata:

 **THIS IS AN EXECUTABLE WORKFLOW. DO NOT explore the codebase or read source files. ROUTE to the correct skill IMMEDIATELY.**

-When this skill is loaded, determine what the user needs and invoke the appropriate skill NOW:
- **User wants to build an agent** → Invoke `/hive-create` immediately
- **User wants to test an agent** → Invoke `/hive-test` immediately
- **User wants to learn concepts** → Invoke `/hive-concepts` immediately
- **User wants patterns/optimization** → Invoke `/hive-patterns` immediately
- **User wants to set up credentials** → Invoke `/hive-credentials` immediately
- **User has a failing/broken agent** → Invoke `/hive-debugger` immediately
- **Unclear what user needs** → Ask the user (do NOT explore the codebase to figure it out)
+When this skill is loaded, **ALWAYS use the AskUserQuestion tool** to present options:
+
+```
+Use AskUserQuestion with these options:
+- "Build a new agent" → Then invoke /hive-create
+- "Test an existing agent" → Then invoke /hive-test
+- "Learn agent concepts" → Then invoke /hive-concepts
+- "Optimize agent design" → Then invoke /hive-patterns
+- "Set up credentials" → Then invoke /hive-credentials
+- "Debug a failing agent" → Then invoke /hive-debugger
+- "Other" (please describe what you want to achieve)
+```

 **DO NOT:** Read source files, explore the codebase, search for code, or do any investigation before routing. The sub-skills handle all of that.

@@ -73,7 +77,6 @@ Use this meta-skill when:

 ## Phase 0: Understand Concepts (Optional)

-**Duration**: 5-10 minutes
 **Skill**: `/hive-concepts`
 **Input**: Questions about agent architecture

@@ -95,9 +98,8 @@ Use this meta-skill when:

 ## Phase 1: Build Agent Structure

-**Duration**: 15-30 minutes
 **Skill**: `/hive-create`
-**Input**: User requirements ("Build an agent that...")
+**Input**: User requirements ("Build an agent that...") or a template to start from

 ### What This Phase Does

@@ -166,7 +168,6 @@ exports/agent_name/

 ## Phase 1.5: Optimize Design (Optional)

-**Duration**: 10-15 minutes
 **Skill**: `/hive-patterns`
 **Input**: Completed agent structure

@@ -191,22 +192,21 @@ exports/agent_name/

 ## Phase 2: Test & Validate

-**Duration**: 20-40 minutes
 **Skill**: `/hive-test`
 **Input**: Working agent from Phase 1

 ### What This Phase Does

-Creates comprehensive test suite:
- Constraint tests (verify hard requirements)
- Success criteria tests (measure goal achievement)
- Edge case tests (handle failures gracefully)
- Integration tests (end-to-end workflows)
+Guides the creation and execution of a comprehensive test suite:
+- Constraint tests
+- Success criteria tests
+- Edge case tests
+- Integration tests

 ### Process

 1. **Analyze agent** - Read goal, constraints, success criteria
-2. **Generate tests** - Create pytest files in `exports/agent_name/tests/`
+2. **Generate tests** - The calling agent writes pytest files in `exports/agent_name/tests/` using hive-test guidelines and templates
 3. **User approval** - Review and approve each test
 4. **Run evaluation** - Execute tests and collect results
 5. **Debug failures** - Identify and fix issues
@@ -287,6 +287,19 @@ User: "Build an agent (first time)"
 → Done: Production-ready agent
 ```

+### Pattern 1c: Build from Template
+
+```
+User: "Build an agent based on the deep research template"
+→ Use /hive-create
+→ Select "From a template" path
+→ Pick template, name new agent
+→ Review/modify goal, nodes, graph
+→ Agent exported with customizations
+→ Use /hive-test
+→ Done: Customized agent
+```
+
 ### Pattern 2: Test Existing Agent

 ```
@@ -490,6 +503,7 @@ The workflow is **flexible** - skip phases as needed, iterate freely, and adapt
 - Have clear requirements
 - Ready to write code
 - Want step-by-step guidance
+- Want to start from an existing template and customize it

 **Choose hive-patterns when:**
 - Agent structure complete
@@ -0,0 +1,7 @@
+# Project-level Codex config for Hive.
+# Keep this file minimal: MCP connectivity + skill discovery.
+
+[mcp_servers.agent-builder]
+command = "uv"
+args = ["run", "--directory", "core", "-m", "framework.mcp.agent_builder_server"]
+cwd = "."
@@ -74,4 +74,6 @@ exports/*

 docs/github-issues/*
 core/tests/*dumps/*
-screenshots/*
+
+screenshots/*
+
@@ -4,11 +4,6 @@
      "command": "uv",
      "args": ["run", "-m", "framework.mcp.agent_builder_server"],
      "cwd": "core"
-    },
-    "tools": {
-      "command": "uv",
-      "args": ["run", "mcp_server.py", "--stdio"],
-      "cwd": "tools"
    }
  }
 }
@@ -0,0 +1,30 @@
+{
+  "mcpServers": {
+    "agent-builder": {
+      "command": "uv",
+      "args": [
+        "run",
+        "python",
+        "-m",
+        "framework.mcp.agent_builder_server"
+      ],
+      "cwd": "core",
+      "env": {
+        "PYTHONPATH": "../tools/src"
+      }
+    },
+    "tools": {
+      "command": "uv",
+      "args": [
+        "run",
+        "python",
+        "mcp_server.py",
+        "--stdio"
+      ],
+      "cwd": "tools",
+      "env": {
+        "PYTHONPATH": "src"
+      }
+    }
+  }
+}
@@ -0,0 +1 @@
+../../.claude/skills/hive
@@ -0,0 +1 @@
+../../.claude/skills/hive-concepts
@@ -0,0 +1 @@
+../../.claude/skills/hive-create
@@ -0,0 +1 @@
+../../.claude/skills/hive-credentials
@@ -0,0 +1 @@
+../../.claude/skills/hive-debugger
@@ -0,0 +1 @@
+../../.claude/skills/hive-patterns
@@ -0,0 +1 @@
+../../.claude/skills/hive-test
@@ -0,0 +1 @@
+../../.claude/skills/triage-issue
@@ -1,41 +1,207 @@
-# Changelog
+# Release Notes

-All notable changes to this project will be documented in this file.
+**Release Date:** February 18, 2026
+**Tag:** v0.5.1

-The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
-and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## The Hive Gets a Brain

-## [Unreleased]
+v0.5.1 is our most ambitious release yet. Hive agents can now **build other agents** -- the new Hive Coder meta-agent writes, tests, and fixes agent packages from natural language. The runtime grows multi-graph support so one session can orchestrate multiple agents simultaneously. The TUI gets a complete overhaul with an in-app agent picker, live streaming, and seamless escalation to the Coder. And we're now provider-agnostic: Claude Code subscriptions, OpenAI-compatible endpoints, and any LiteLLM-supported model work out of the box.

-### Added
- Initial project structure
- React frontend (honeycomb) with Vite and TypeScript
- Node.js backend (hive) with Express and TypeScript
- Docker Compose configuration for local development
- Configuration system via `config.yaml`
- GitHub Actions CI/CD workflows
- Comprehensive documentation
+---

-### Changed
- N/A
+## Highlights

-### Deprecated
- N/A
+### Hive Coder -- The Agent That Builds Agents

-### Removed
- N/A
+A native meta-agent that lives inside the framework at `core/framework/agents/hive_coder/`. Give it a natural-language specification and it produces a complete agent package -- goal definition, node prompts, edge routing, MCP tool wiring, tests, and all boilerplate files.

+```bash
+# Launch the Coder directly
+hive code

-### Fixed
- tools: Fixed web_scrape tool attempting to parse non-HTML content (PDF, JSON) as HTML (#487)
+# Or escalate from any running agent (TUI)
+Ctrl+E  # or /coder in chat
+```

-### Security
- N/A
+The Coder ships with:

-## [0.1.0] - 2025-01-13
+- **Reference documentation** -- anti-patterns, construction guide, and design patterns baked into its system prompt
+- **Guardian watchdog** -- an event-driven monitor that catches agent failures and triggers automatic remediation
+- **Coder Tools MCP server** -- file I/O, fuzzy-match editing, git snapshots, and sandboxed shell execution (`tools/coder_tools_server.py`)
+- **Test generation** -- structural tests for forever-alive agents, written so the tests don't hang on `runner.run()`

-### Added
- Initial release
+### Multi-Graph Agent Runtime

-[Unreleased]: https://github.com/adenhq/hive/compare/v0.1.0...HEAD
-[0.1.0]: https://github.com/adenhq/hive/releases/tag/v0.1.0
+`AgentRuntime` now supports loading, managing, and switching between multiple agent graphs within a single session. Six new lifecycle tools give agents (and the TUI) full control:
+
+```python
+# Load a second agent into the runtime
+await runtime.add_graph("exports/deep_research_agent")
+
+# Tools available to agents:
+# load_agent, unload_agent, start_agent, restart_agent, list_agents, get_user_presence
+```
+
+The Hive Coder uses multi-graph internally -- when you escalate from a worker agent, the Coder loads as a separate graph while the worker stays alive in the background.
+
+### TUI Revamp
+
+The Terminal UI gets a ground-up rebuild with five major additions:
+
+- **Agent Picker** (Ctrl+A) -- tabbed modal screen for browsing Your Agents, Framework agents, and Examples with metadata badges (node count, tool count, session count, tags)
+- **Runtime-optional startup** -- TUI launches without a pre-loaded agent, showing the picker on first open
+- **Live streaming pane** -- dedicated RichLog widget shows LLM tokens as they arrive, replacing the old one-token-per-line display
+- **PDF attachments** -- `/attach` and `/detach` commands with native OS file dialog (macOS, Linux, Windows)
+- **Multi-graph commands** -- `/graphs`, `/graph <id>`, `/load <path>`, `/unload <id>` for managing agent graphs in-session
+
+### Provider-Agnostic LLM Support
+
+Hive is no longer Anthropic-only. v0.5.1 adds first-class support for:
+
+- **Claude Code subscriptions** -- `use_claude_code_subscription: true` in `~/.hive/configuration.json` reads OAuth tokens from `~/.claude/.credentials.json` with automatic refresh
+- **OpenAI-compatible endpoints** -- `api_base` config routes traffic through any compatible API (Azure OpenAI, vLLM, Ollama, etc.)
+- **Any LiteLLM model** -- `RuntimeConfig` now passes `api_key`, `api_base`, and `extra_kwargs` through to LiteLLM
+
+The quickstart script auto-detects Claude Code subscriptions and ZAI Code installations.
+
+---
+
+## What's New
+
+### Architecture & Runtime
+
+- **Hive Coder meta-agent** -- Natural-language agent builder with reference docs, guardian watchdog, and `hive code` CLI command. (@TimothyZhang7)
+- **Multi-graph agent sessions** -- `add_graph`/`remove_graph` on AgentRuntime with 6 lifecycle tools (`load_agent`, `unload_agent`, `start_agent`, `restart_agent`, `list_agents`, `get_user_presence`). (@TimothyZhang7)
+- **Claude Code subscription support** -- OAuth token refresh via `use_claude_code_subscription` config, auto-detection in quickstart, LiteLLM header patching. (@TimothyZhang7)
+- **OpenAI-compatible endpoint support** -- `api_base` and `extra_kwargs` in `RuntimeConfig` for any OpenAI-compatible API. (@TimothyZhang7)
+- **Remove deprecated node types** -- Delete `FlexibleGraphExecutor`, `WorkerNode`, `HybridJudge`, `CodeSandbox`, `Plan`, `FunctionNode`, `LLMNode`, `RouterNode`. Deprecated types (`llm_tool_use`, `llm_generate`, `function`, `router`, `human_input`) now raise `RuntimeError` with migration guidance. (@TimothyZhang7)
+- **Interactive credential setup** -- Guided `CredentialSetupSession` with health checks and encrypted storage, accessible via `hive setup-credentials` or automatic prompting on credential errors. (@RichardTang-Aden)
+- **Pre-start confirmation prompt** -- Interactive prompt before agent execution allowing credential updates or abort. (@RichardTang-Aden)
+- **Event bus multi-graph support** -- `graph_id` on events, `filter_graph` on subscriptions, `ESCALATION_REQUESTED` event type, `exclude_own_graph` filter. (@TimothyZhang7)
+
+### TUI Improvements
+
+- **In-app agent picker** (Ctrl+A) -- Tabbed modal for browsing agents with metadata badges (nodes, tools, sessions, tags). (@TimothyZhang7)
+- **Runtime-optional TUI startup** -- Launches without a pre-loaded agent, shows agent picker on startup. (@TimothyZhang7)
+- **Hive Coder escalation** (Ctrl+E) -- Escalate to Hive Coder and return; also available via `/coder` and `/back` chat commands. (@TimothyZhang7)
+- **PDF attachment support** -- `/attach` and `/detach` commands with native OS file dialog. (@TimothyZhang7)
+- **Streaming output pane** -- Dedicated RichLog widget for live LLM token streaming. (@TimothyZhang7)
+- **Multi-graph TUI commands** -- `/graphs`, `/graph <id>`, `/load <path>`, `/unload <id>`. (@TimothyZhang7)
+- **Agent Guardian watchdog** -- Event-driven monitor that catches secondary agent failures and triggers automatic remediation, with `--no-guardian` CLI flag. (@TimothyZhang7)
+
+### New Tool Integrations
+
+| Tool                   | Description                                                                                                                                                            | Contributor        |
+| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
+| **Discord**            | 4 MCP tools (`discord_list_guilds`, `discord_list_channels`, `discord_send_message`, `discord_get_messages`) with rate-limit retry and channel filtering               | @mishrapravin114   |
+| **Exa Search API**     | 4 AI-powered search tools (`exa_search`, `exa_find_similar`, `exa_get_contents`, `exa_answer`) with neural/keyword search, domain filters, and citation-backed answers | @JeetKaria06       |
+| **Razorpay**           | 6 payment processing tools for payments, invoices, payment links, and refunds with HTTP Basic Auth                                                                     | @shivamshahi07     |
+| **Google Docs**        | Document creation, reading, and editing with OAuth credential support                                                                                                  | @haliaeetusvocifer |
+| **Gmail enhancements** | Expanded mail operations for inbox management                                                                                                                          | @bryanadenhq       |
+
+### Infrastructure
+
+- **Default node type → `event_loop`** -- `NodeSpec.node_type` defaults to `"event_loop"` instead of `"llm_tool_use"`. (@TimothyZhang7)
+- **Default `max_node_visits` → 0 (unlimited)** -- Nodes default to unlimited visits, reducing friction for feedback loops and forever-alive agents. (@TimothyZhang7)
+- **Remove `function` field from NodeSpec** -- Follows deprecation of `FunctionNode`. (@TimothyZhang7)
+- **LiteLLM OAuth patch** -- Correct header construction for OAuth tokens (remove `x-api-key` when Bearer token is present). (@TimothyZhang7)
+- **Orchestrator config centralization** -- Reads `api_key`, `api_base`, `extra_kwargs` from centralized `~/.hive/configuration.json`. (@TimothyZhang7)
+- **System prompt datetime injection** -- All system prompts now include current date/time for time-aware agent behavior. (@TimothyZhang7)
+- **Utils module exports** -- Proper `__init__.py` exports for the utils module. (@Siddharth2624)
+- **Increased default max_tokens** -- Opus 4.6 defaults to 32768, Sonnet 4.5 to 16384 (up from 8192). (@TimothyZhang7)
+
+---
+
+## Bug Fixes
+
+- Flush WIP accumulator outputs on cancel/failure so edge conditions see correct values on resume
+- Stall detection state preserved across resume (no more resets on checkpoint restore)
+- Skip client-facing blocking for event-triggered executions (timer/webhook)
+- Executor retry override scoped to actual EventLoopNode instances only
+- Add `_awaiting_input` flag to EventLoopNode to prevent input injection race conditions
+- Fix TUI streaming display (tokens no longer appear one-per-line)
+- Fix `_return_from_escalation` crash when ChatRepl widgets not yet mounted
+- Fix tools registration problems for Google Docs credentials (@RichardTang-Aden)
+- Fix email agent version conflicts (@RichardTang-Aden)
+- Fix coder tool timeouts (120s for tests, 300s cap for commands)
+
+## Documentation
+
+- Clarify installation and prevent root pip install misuse (@paarths-collab)
+
+---
+
+## Agent Updates
+
+- **Email Inbox Management** -- Consolidate `gmail_inbox_guardian` and `inbox_management` into a single unified agent with updated prompts and config. (@RichardTang-Aden, @bryanadenhq)
+- **Job Hunter** -- Updated node prompts, config, and agent metadata; added PDF resume selection. (@bryanadenhq)
+- **Deep Research Agent** -- Revised node implementations with updated prompts and output handling.
+- **Tech News Reporter** -- Revised node prompts for improved output quality.
+- **Vulnerability Assessment** -- Expanded prompts with more detailed assessment instructions. (@bryanadenhq)
+
+---
+
+## Breaking Changes
+
+- **Deprecated node types raise `RuntimeError`** -- `llm_tool_use`, `llm_generate`, `function`, `router`, `human_input` now fail instead of warning. Migrate to `event_loop`.
+- **`NodeSpec.node_type` defaults to `"event_loop"`** (was `"llm_tool_use"`)
+- **`NodeSpec.max_node_visits` defaults to `0` / unlimited** (was `1`)
+- **`NodeSpec.function` field removed** -- `FunctionNode` is deleted; use event_loop nodes with tools instead.
+
+---
+
+## Community Contributors
+
+A huge thank you to everyone who contributed to this release:
+
+- **Richard Tang** (@RichardTang-Aden) -- Interactive credential setup, pre-start confirmation, email agent consolidation, tool registration fixes, lint and formatting
+- **Pravin Mishra** (@mishrapravin114) -- Discord integration with 4 MCP tools
+- **Jeet Karia** (@JeetKaria06) -- Exa Search API integration with 4 AI-powered search tools
+- **Shivam Shahi** (@shivamshahi07) -- Razorpay payment processing integration
+- **Siddharth Varshney** (@Siddharth2624) -- Utils module exports
+- **@haliaeetusvocifer** -- Google Docs integration with OAuth support
+- **Bryan** (@bryanadenhq) -- PDF selection, inbox agent fixes, Job Hunter and Vulnerability Assessment updates
+- **@paarths-collab** -- Documentation improvements
+
+---
+
+## Upgrading
+
+```bash
+git pull origin main
+uv sync
+```
+
+### Migration Guide
+
+If your agents use deprecated node types, update them:
+
+```python
+# Before (v0.5.0) -- these now raise RuntimeError
+NodeSpec(node_type="llm_tool_use", ...)
+NodeSpec(node_type="function", function=my_func, ...)
+
+# After (v0.5.1) -- use event_loop for everything
+NodeSpec(node_type="event_loop", ...)  # or just omit node_type (it's the default now)
+```
+
+If your agents set `max_node_visits=1` explicitly, they'll still work. The only change is the _default_ -- new agents without an explicit value now get unlimited visits.
+
+To try the new Hive Coder:
+
+```bash
+# Launch Coder directly
+hive code
+
+# Or from TUI -- press Ctrl+E to escalate
+hive tui
+```
+
+---
+
+## What's Next
+
+- **Agent-to-agent communication** -- one agent's output triggers another agent's entry point
+- **Cost visibility** -- detailed runtime log of LLM costs per node and per session
+- **Persistent webhook subscriptions** -- survive agent restarts without re-registering
+- **Remote agent deployment** -- run agents as long-lived services with HTTP APIs
@@ -1,10 +1,10 @@
 # Contributing to Aden Agent Framework

-Thank you for your interest in contributing to the Aden Agent Framework! This document provides guidelines and information for contributors. We’re especially looking for help building tools, integrations([check #2805](https://github.com/adenhq/hive/issues/2805)), and example agents for the framework. If you’re interested in extending its functionality, this is the perfect place to start. 
+Thank you for your interest in contributing to the Aden Agent Framework! This document provides guidelines and information for contributors. We’re especially looking for help building tools, integrations ([check #2805](https://github.com/adenhq/hive/issues/2805)), and example agents for the framework. If you’re interested in extending its functionality, this is the perfect place to start. 

 ## Code of Conduct

-By participating in this project, you agree to abide by our [Code of Conduct](CODE_OF_CONDUCT.md).
+By participating in this project, you agree to abide by our [Code of Conduct](docs/CODE_OF_CONDUCT.md).

 ## Issue Assignment Policy

@@ -49,8 +49,8 @@ You may submit PRs without prior assignment for:
   make check    # Lint and format checks (ruff check + ruff format --check on core/ and tools/)
   make test     # Core tests (cd core && pytest tests/ -v)
   ```
-6. Commit your changes following our commit conventions
-7. Push to your fork and submit a Pull Request
+8. Commit your changes following our commit conventions
+9. Push to your fork and submit a Pull Request

 ## Development Setup

@@ -99,8 +99,7 @@ docs(readme): update installation instructions
 2. Update documentation if needed
 3. Add tests for new functionality
 4. Ensure `make check` and `make test` pass
-5. Update the CHANGELOG.md if applicable
-6. Request review from maintainers
+5. Request review from maintainers

 ### PR Title Format

@@ -127,6 +126,8 @@ feat(component): add new feature description
 - Use meaningful variable and function names
 - Keep functions focused and small

+For linting and formatting (Ruff, pre-commit hooks), see [Linting & Formatting Setup](docs/contributing-lint-setup.md).
+
 ## Testing

 > **Note:** When testing agents in `exports/`, always set PYTHONPATH:
@@ -145,6 +146,9 @@ make test
 # Or run tests directly
 cd core && pytest tests/ -v

+# Run tools package tests (when contributing to tools/)
+cd tools && uv run pytest tests/ -v
+
 # Run tests for a specific agent
 PYTHONPATH=exports uv run python -m agent_name test
 ```
@@ -159,4 +163,4 @@ By submitting a Pull Request, you agree that your contributions will be licensed

 Feel free to open an issue for questions or join our [Discord community](https://discord.com/invite/MXE49hrKDk).

-Thank you for contributing!
+Thank you for contributing!
@@ -1,5 +1,5 @@
 <p align="center">
-  <img width="100%" alt="Hive Banner" src="https://storage.googleapis.com/aden-prod-assets/website/aden-title-card.png" />
+  <img width="100%" alt="Hive Banner" src="https://github.com/user-attachments/assets/a027429b-5d3c-4d34-88e4-0feaeaabbab3" />
 </p>

 <p align="center">
@@ -13,16 +13,19 @@
  <a href="docs/i18n/ko.md">한국어</a>
 </p>

-[![Apache 2.0 License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/adenhq/hive/blob/main/LICENSE)
-[![Y Combinator](https://img.shields.io/badge/Y%20Combinator-Aden-orange)](https://www.ycombinator.com/companies/aden)
-[![Discord](https://img.shields.io/discord/1172610340073242735?logo=discord&labelColor=%235462eb&logoColor=%23f5f5f5&color=%235462eb)](https://discord.com/invite/MXE49hrKDk)
-[![Twitter Follow](https://img.shields.io/twitter/follow/teamaden?logo=X&color=%23f5f5f5)](https://x.com/aden_hq)
-[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/teamaden/)
+<p align="center">
+  <a href="https://github.com/adenhq/hive/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" alt="Apache 2.0 License" /></a>
+  <a href="https://www.ycombinator.com/companies/aden"><img src="https://img.shields.io/badge/Y%20Combinator-Aden-orange" alt="Y Combinator" /></a>
+  <a href="https://discord.com/invite/MXE49hrKDk"><img src="https://img.shields.io/discord/1172610340073242735?logo=discord&labelColor=%235462eb&logoColor=%23f5f5f5&color=%235462eb" alt="Discord" /></a>
+  <a href="https://x.com/aden_hq"><img src="https://img.shields.io/twitter/follow/teamaden?logo=X&color=%23f5f5f5" alt="Twitter Follow" /></a>
+  <a href="https://www.linkedin.com/company/teamaden/"><img src="https://custom-icon-badges.demolab.com/badge/LinkedIn-0A66C2?logo=linkedin-white&logoColor=fff" alt="LinkedIn" /></a>
+  <img src="https://img.shields.io/badge/MCP-102_Tools-00ADD8?style=flat-square" alt="MCP" />
+</p>

 <p align="center">
  <img src="https://img.shields.io/badge/AI_Agents-Self--Improving-brightgreen?style=flat-square" alt="AI Agents" />
  <img src="https://img.shields.io/badge/Multi--Agent-Systems-blue?style=flat-square" alt="Multi-Agent" />
-  <img src="https://img.shields.io/badge/Goal--Driven-Development-purple?style=flat-square" alt="Goal-Driven" />
+  <img src="https://img.shields.io/badge/Headless-Development-purple?style=flat-square" alt="Headless" />
  <img src="https://img.shields.io/badge/Human--in--the--Loop-orange?style=flat-square" alt="HITL" />
  <img src="https://img.shields.io/badge/Production--Ready-red?style=flat-square" alt="Production" />
 </p>
@@ -30,15 +33,16 @@
  <img src="https://img.shields.io/badge/OpenAI-supported-412991?style=flat-square&logo=openai" alt="OpenAI" />
  <img src="https://img.shields.io/badge/Anthropic-supported-d4a574?style=flat-square" alt="Anthropic" />
  <img src="https://img.shields.io/badge/Google_Gemini-supported-4285F4?style=flat-square&logo=google" alt="Gemini" />
-  <img src="https://img.shields.io/badge/MCP-19_Tools-00ADD8?style=flat-square" alt="MCP" />
 </p>

 ## Overview

-Build reliable, self-improving AI agents without hardcoding workflows. Define your goal through conversation with a coding agent, and the framework generates a node graph with dynamically created connection code. When things break, the framework captures failure data, evolves the agent through the coding agent, and redeploys. Built-in human-in-the-loop nodes, credential management, and real-time monitoring give you control without sacrificing adaptability.
+Build autonomous, reliable, self-improving AI agents without hardcoding workflows. Define your goal through conversation with a coding agent, and the framework generates a node graph with dynamically created connection code. When things break, the framework captures failure data, evolves the agent through the coding agent, and redeploys. Built-in human-in-the-loop nodes, credential management, and real-time monitoring give you control without sacrificing adaptability.

 Visit [adenhq.com](https://adenhq.com) for complete documentation, examples, and guides.

+https://github.com/user-attachments/assets/846c0cc7-ffd6-47fa-b4b7-495494857a55
+
 ## Who Is Hive For?

 Hive is designed for developers and teams who want to build **production-grade AI agents** without manually wiring complex workflows.
@@ -58,44 +62,36 @@ Hive may not be the best fit if you’re only experimenting with simple agent ch
 Use Hive when you need:

 - Long-running, autonomous agents
- Multi-agent coordination
+- Strong guardrails, process, and controls
 - Continuous improvement based on failures
- Strong monitoring, safety, and budget controls
+- Multi-agent coordination
 - A framework that evolves with your goals

-## What is Aden
-
-<p align="center">
-  <img width="100%" alt="Aden Architecture" src="docs/assets/aden-architecture-diagram.jpg" />
-</p>
-
-Aden is a platform for building, deploying, operating, and adapting AI agents:
-
- **Build** - A Coding Agent generates specialized Worker Agents (Sales, Marketing, Ops) from natural language goals
- **Deploy** - Headless deployment with CI/CD integration and full API lifecycle management
- **Operate** - Real-time monitoring, observability, and runtime guardrails keep agents reliable
- **Adapt** - Continuous evaluation, supervision, and adaptation ensure agents improve over time
- **Infra** - Shared memory, LLM integrations, tools, and skills power every agent
-
 ## Quick Links

 - **[Documentation](https://docs.adenhq.com/)** - Complete guides and API reference
 - **[Self-Hosting Guide](https://docs.adenhq.com/getting-started/quickstart)** - Deploy Hive on your infrastructure
 - **[Changelog](https://github.com/adenhq/hive/releases)** - Latest updates and releases
-<!-- - **[Roadmap](https://adenhq.com/roadmap)** - Upcoming features and plans -->
+- **[Roadmap](docs/roadmap.md)** - Upcoming features and plans
 - **[Report Issues](https://github.com/adenhq/hive/issues)** - Bug reports and feature requests
+- **[Contributing](CONTRIBUTING.md)** - How to contribute and submit PRs

 ## Quick Start

-## Prerequisites
+### Prerequisites

 - Python 3.11+ for agent development
- Claude Code or Cursor for utilizing agent skills
+- Claude Code, Codex CLI, or Cursor for utilizing agent skills

 > **Note for Windows Users:** It is strongly recommended to use **WSL (Windows Subsystem for Linux)** or **Git Bash** to run this framework. Some core automation scripts may not execute correctly in standard Command Prompt or PowerShell.

 ### Installation

+> **Note**
+> Hive uses a `uv` workspace layout and is not installed with `pip install`.
+> Running `pip install -e .` from the repository root will create a placeholder package and Hive will not function correctly.
+> Please use the quickstart script below to set up the environment.
+
 ```bash
 # Clone the repository
 git clone https://github.com/adenhq/hive.git
@@ -111,7 +107,7 @@ This sets up:
 - **aden_tools** - MCP tools for agent capabilities (in `tools/.venv`)
 - **credential store** - Encrypted API key storage (`~/.hive/credentials`)
 - **LLM provider** - Interactive default model configuration
- All required Python dependencies
+- All required Python dependencies, installed with `uv`

 ### Build Your First Agent

@@ -129,16 +125,44 @@ hive tui
 hive run exports/your_agent_name --input '{"key": "value"}'
 ```

+## Coding Agent Support
+
+### Codex CLI
+
+Hive includes native support for [OpenAI Codex CLI](https://github.com/openai/codex) (v0.101.0+).
+
+1. **Config:** `.codex/config.toml` with `agent-builder` MCP server (tracked in git)
+2. **Skills:** `.agents/skills/` symlinks to Hive skills (tracked in git)
+3. **Launch:** Run `codex` in the repo root, then type `use hive`
+
+Example:
+
+```
+codex> use hive
+```
+
+### Opencode
+
+Hive includes native support for [Opencode](https://github.com/opencode-ai/opencode).
+
+1. **Setup:** Run the quickstart script.
+2. **Launch:** Open Opencode in the project root.
+3. **Activate:** Type `/hive` in the chat to switch to the Hive Agent.
+4. **Verify:** Ask the agent _"List your tools"_ to confirm the connection.
+
+The agent has access to all Hive skills and can scaffold agents, add tools, and debug workflows directly from the chat.
+
 **[📖 Complete Setup Guide](docs/environment-setup.md)** - Detailed instructions for agent development

-### Cursor IDE Support
+### Antigravity IDE Support

-Skills are also available in Cursor. To enable:
+Skills and MCP servers are also available in [Antigravity IDE](https://antigravity.google/) (Google's AI-powered IDE). **Easiest:** open a terminal in the Hive repo root and run the setup script (keep the leading `./` — the script lives inside the repo):

-1. Open Command Palette (`Cmd+Shift+P` / `Ctrl+Shift+P`)
-2. Run `MCP: Enable` to enable MCP servers
-3. Restart Cursor to load the MCP servers from `.cursor/mcp.json`
-4. Type `/` in Agent chat and search for skills (e.g., `/hive-create`)
+```bash
+./scripts/setup-antigravity-mcp.sh
+```
+
+**Important:** Always restart/refresh Antigravity IDE after running the setup script—MCP servers only load on startup. After restart, the **agent-builder** and **tools** MCP servers should connect. Skills are under `.agent/skills/` (symlinks to `.claude/skills/`). See [docs/antigravity-setup.md](docs/antigravity-setup.md) for manual setup and troubleshooting.

 ## Features

@@ -152,9 +176,18 @@ Skills are also available in Cursor. To enable:
 - **Cost & Budget Control** - Set spending limits, throttles, and automatic model degradation policies
 - **Production-Ready** - Self-hostable, built for scale and reliability

+## Integration
+
+<a href="https://github.com/adenhq/hive/tree/main/tools/src/aden_tools/tools"><img width="100%" alt="Integration" src="https://github.com/user-attachments/assets/a1573f93-cf02-4bb8-b3d5-b305b05b1e51" /></a>
+
+Hive is built to be model-agnostic and system-agnostic.
+
+- **LLM flexibility** - Hive Framework is designed to support various types of LLMs, including hosted and local models through LiteLLM-compatible providers.
+- **Business system connectivity** - Hive Framework is designed to connect to all kinds of business systems as tools, such as CRM, support, messaging, data, file, and internal APIs via MCP.
+
 ## Why Aden

-Hive focuses on generating agents that run real business processes rather than generic agents. Instead of requiring you to manually design workflows, define agent interactions, and handle failures reactively, Hive flips the paradigm: **you describe [outcomes](docs/key_concepts/goals_outcome.md), and the system builds itself**—delivering an outcome-driven, [adaptive](docs/key_concepts/evolution.md) experience with an easy-to-use set of tools and integrations.
+Hive focuses on generating agents that run real business processes rather than generic agents. Instead of requiring you to manually design workflows, define agent interactions, and handle failures reactively, Hive flips the paradigm: **you describe outcomes, and the system builds itself**—delivering an outcome-driven, adaptive experience with an easy-to-use set of tools and integrations.

 ```mermaid
 flowchart LR
@@ -188,9 +221,9 @@ flowchart LR
    style V6 fill:#fff,stroke:#ed8c00,stroke-width:1px,color:#cc5d00
 ```

-### The Aden Advantage
+### The Hive Advantage

-| Traditional Frameworks     | Aden                                   |
+| Traditional Frameworks     | Hive                                   |
 | -------------------------- | -------------------------------------- |
 | Hardcode agent workflows   | Describe goals in natural language     |
 | Manual graph definition    | Auto-generated agent graphs            |
@@ -239,104 +272,129 @@ See [environment-setup.md](docs/environment-setup.md) for complete setup instruc
 - [Configuration Guide](docs/configuration.md) - All configuration options
 - [Architecture Overview](docs/architecture/README.md) - System design and structure

-### Key Concepts
-
- [Goals & Outcome-Driven Development](docs/key_concepts/goals_outcome.md) - Why Hive is outcome-driven and how goals define success
- [The Agent Graph](docs/key_concepts/graph.md) - Nodes, edges, shared memory, and how agents execute
- [The Worker Agent](docs/key_concepts/worker_agent.md) - Sessions, iterations, headless execution, and the runtime
- [Evolution](docs/key_concepts/evolution.md) - How agents improve across generations through failure data
-
 ## Roadmap

 Aden Hive Agent Framework aims to help developers build outcome-oriented, self-adaptive agents. See [roadmap.md](docs/roadmap.md) for details.

 ```mermaid
-flowchart TD
-subgraph Foundation
-    direction LR
-    subgraph arch["Architecture"]
-        a1["Node-Based Architecture"]:::done
-        a2["Python SDK"]:::done
-        a3["LLM Integration"]:::done
-        a4["Communication Protocol"]:::done
-    end
-    subgraph ca["Coding Agent"]
-        b1["Goal Creation Session"]:::done
-        b2["Worker Agent Creation"]
-        b3["MCP Tools"]:::done
-    end
-    subgraph wa["Worker Agent"]
-        c1["Human-in-the-Loop"]:::done
-        c2["Callback Handlers"]:::done
-        c3["Intervention Points"]:::done
-        c4["Streaming Interface"]
-    end
-    subgraph cred["Credentials"]
-        d1["Setup Process"]:::done
-        d2["Pluggable Sources"]:::done
-        d3["Enterprise Secrets"]
-        d4["Integration Tools"]:::done
-    end
-    subgraph tools["Tools"]
-        e1["File Use"]:::done
-        e2["Memory STM/LTM"]:::done
-        e3["Web Search/Scraper"]:::done
-        e4["CSV/PDF"]:::done
-        e5["Excel/Email"]
-    end
-    subgraph core["Core"]
-        f1["Eval System"]
-        f2["Pydantic Validation"]:::done
-        f3["Documentation"]:::done
-        f4["Adaptiveness"]
-        f5["Sample Agents"]
-    end
-end
+flowchart TB
+    %% Main Entity
+    User([User])

-subgraph Expansion
-    direction LR
-    subgraph intel["Intelligence"]
-        g1["Guardrails"]
-        g2["Streaming Mode"]
-        g3["Image Generation"]
-        g4["Semantic Search"]
+    %% =========================================
+    %% EXTERNAL EVENT SOURCES
+    %% =========================================
+    subgraph ExtEventSource [External Event Source]
+        E_Sch["Schedulers"]
+        E_WH["Webhook"]
+        E_SSE["SSE"]
    end
-    subgraph mem["Memory Iteration"]
-        h1["Message Model & Sessions"]
-        h2["Storage Migration"]
-        h3["Context Building"]
-        h4["Proactive Compaction"]
-        h5["Token Tracking"]
-    end
-    subgraph evt["Event System"]
-        i1["Event Bus for Nodes"]
-    end
-    subgraph cas["Coding Agent Support"]
-        j1["Claude Code"]
-        j2["Cursor"]
-        j3["Opencode"]
-        j4["Antigravity"]
-    end
-    subgraph plat["Platform"]
-        k1["JavaScript/TypeScript SDK"]
-        k2["Custom Tool Integrator"]
-        k3["Windows Support"]
-    end
-    subgraph dep["Deployment"]
-        l1["Self-Hosted"]
-        l2["Cloud Services"]
-        l3["CI/CD Pipeline"]
-    end
-    subgraph tmpl["Templates"]
-        m1["Sales Agent"]
-        m2["Marketing Agent"]
-        m3["Analytics Agent"]
-        m4["Training Agent"]
-        m5["Smart Form Agent"]
-    end
-end

-classDef done fill:#9e9e9e,color:#fff,stroke:#757575
+    %% =========================================
+    %% SYSTEM NODES
+    %% =========================================
+    subgraph WorkerBees [Worker Bees]
+        WB_C["Conversation"]
+        WB_SP["System prompt"]
+
+        subgraph Graph [Graph]
+            direction TB
+            N1["Node"] --> N2["Node"] --> N3["Node"]
+            N1 -.-> AN["Active Node"]
+            N2 -.-> AN
+            N3 -.-> AN
+
+            %% Nested Event Loop Node
+            subgraph EventLoopNode [Event Loop Node]
+                ELN_L["listener"]
+                ELN_SP["System Prompt<br/>(Task)"]
+                ELN_EL["Event loop"]
+                ELN_C["Conversation"]
+            end
+        end
+    end
+
+    subgraph JudgeNode [Judge]
+        J_C["Criteria"]
+        J_P["Principles"]
+        J_EL["Event loop"] <--> J_S["Scheduler"]
+    end
+
+    subgraph QueenBee [Queen Bee]
+        QB_SP["System prompt"]
+        QB_EL["Event loop"]
+        QB_C["Conversation"]
+    end
+
+    subgraph Infra [Infra]
+        SA["Sub Agent"]
+        TR["Tool Registry"]
+        WTM["Write-through Conversation Memory<br/>(Logs/RAM/Hard drive)"]
+        SM["Shared Memory<br/>(State/Hard drive)"]
+        EB["Event Bus<br/>(RAM)"]
+        CS["Credential Store<br/>(Hard drive/Cloud)"]
+    end
+
+    subgraph PC [PC]
+        B["Browser"]
+        CB["Codebase<br/>v 0.0.x ... v n.n.n"]
+    end
+
+    %% =========================================
+    %% CONNECTIONS & DATA FLOW
+    %% =========================================
+
+    %% External Event Routing
+    E_Sch --> ELN_L
+    E_WH --> ELN_L
+    E_SSE --> ELN_L
+    ELN_L -->|"triggers"| ELN_EL
+
+    %% User Interactions
+    User -->|"Talk"| WB_C
+    User -->|"Talk"| QB_C
+    User -->|"Read/Write Access"| CS
+
+    %% Inter-System Logic
+    ELN_C <-->|"Mirror"| WB_C
+    WB_C -->|"Focus"| AN
+
+    WorkerBees -->|"Inquire"| JudgeNode
+    JudgeNode -->|"Approve"| WorkerBees
+
+    %% Judge Alignments
+    J_C <-.->|"aligns"| WB_SP
+    J_P <-.->|"aligns"| QB_SP
+
+    %% Escalate path
+    J_EL -->|"Report (Escalate)"| QB_EL
+
+    %% Pub/Sub Logic
+    AN -->|"publish"| EB
+    EB -->|"subscribe"| QB_C
+
+    %% Infra and Process Spawning
+    ELN_EL -->|"Spawn"| SA
+    SA -->|"Inform"| ELN_EL
+    SA -->|"Starts"| B
+    B -->|"Report"| ELN_EL
+    TR -->|"Assigned"| ELN_EL
+    CB -->|"Modify Worker Bee"| WB_C
+
+    %% =========================================
+    %% SHARED MEMORY & LOGS ACCESS
+    %% =========================================
+
+    %% Worker Bees Access (link to node inside Graph subgraph)
+    AN <-->|"Read/Write"| WTM
+    AN <-->|"Read/Write"| SM
+
+    %% Queen Bee Access
+    QB_C <-->|"Read/Write"| WTM
+    QB_EL <-->|"Read/Write"| SM
+
+    %% Credentials Access
+    CS -->|"Read Access"| QB_C
 ```

 ## Contributing
@@ -376,10 +434,6 @@ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENS

 ## Frequently Asked Questions (FAQ)

-**Q: Does Hive depend on LangChain or other agent frameworks?**
-
-No. Hive is built from the ground up with no dependencies on LangChain, CrewAI, or other agent frameworks. The framework is designed to be lean and flexible, generating agent graphs dynamically rather than relying on predefined components.
-
 **Q: What LLM providers does Hive support?**

 Hive supports 100+ LLM providers through LiteLLM integration, including OpenAI (GPT-4, GPT-4o), Anthropic (Claude models), Google Gemini, DeepSeek, Mistral, Groq, and many more. Simply set the appropriate API key environment variable and specify the model name.
@@ -390,20 +444,12 @@ Yes! Hive supports local models through LiteLLM. Simply use the model name forma

 **Q: What makes Hive different from other agent frameworks?**

-Hive generates your entire agent system from natural language [goals](docs/key_concepts/goals_outcome.md) using a coding agent—you don't hardcode workflows or manually define graphs. When agents fail, the framework automatically captures failure data, [evolves the agent graph](docs/key_concepts/evolution.md), and redeploys. This self-improving loop is unique to Aden.
+Hive generates your entire agent system from natural language goals using a coding agent—you don't hardcode workflows or manually define graphs. When agents fail, the framework automatically captures failure data, [evolves the agent graph](docs/key_concepts/evolution.md), and redeploys. This self-improving loop is unique to Aden.

 **Q: Is Hive open-source?**

 Yes, Hive is fully open-source under the Apache License 2.0. We actively encourage community contributions and collaboration.

-**Q: Does Hive collect data from users?**
-
-Hive collects telemetry data for monitoring and observability purposes, including token usage, latency metrics, and cost tracking. Content capture (prompts and responses) is configurable and stored with team-scoped data isolation. All data stays within your infrastructure when self-hosted.
-
-**Q: What deployment options does Hive support?**
-
-Hive supports self-hosted deployments via Python packages. See the [Environment Setup Guide](docs/environment-setup.md) for installation instructions. Cloud deployment options and Kubernetes-ready configurations are on the roadmap.
-
 **Q: Can Hive handle complex, production-scale use cases?**

 Yes. Hive is explicitly designed for production environments with features like automatic failure recovery, real-time observability, cost controls, and horizontal scaling support. The framework handles both simple automations and complex multi-agent workflows.
@@ -412,15 +458,11 @@ Yes. Hive is explicitly designed for production environments with features like

 Yes, Hive fully supports [human-in-the-loop](docs/key_concepts/graph.md#human-in-the-loop) workflows through intervention nodes that pause execution for human input. These include configurable timeouts and escalation policies, allowing seamless collaboration between human experts and AI agents.

-**Q: What monitoring and debugging tools does Hive provide?**
-
-Hive includes comprehensive observability features: real-time WebSocket streaming for live agent execution monitoring, TimescaleDB-powered analytics for cost and performance metrics, health check endpoints for Kubernetes integration, and MCP tools for agent execution, including file operations, web search, data processing, and more.
-
 **Q: What programming languages does Hive support?**

 The Hive framework is built in Python. A JavaScript/TypeScript SDK is on the roadmap.

-**Q: Can Aden agents interact with external tools and APIs?**
+**Q: Can Hive agents interact with external tools and APIs?**

 Yes. Aden's SDK-wrapped nodes provide built-in tool access, and the framework supports flexible tool ecosystems. Agents can integrate with external APIs, databases, and services through the node architecture.

@@ -438,16 +480,12 @@ Contributions are welcome! Fork the repository, create your feature branch, impl

 **Q: When will my team start seeing results from Aden's adaptive agents?**

-Aden's [adaptation loop](docs/key_concepts/evolution.md) begins working from the first execution. When an agent fails, the framework captures the failure data, helping developers evolve the agent graph through the coding agent. How quickly this translates to measurable results depends on the complexity of your use case, the quality of your [goal definitions](docs/key_concepts/goals_outcome.md), and the volume of executions generating feedback.
+Aden's adaptation loop begins working from the first execution. When an agent fails, the framework captures the failure data, helping developers evolve the agent graph through the coding agent. How quickly this translates to measurable results depends on the complexity of your use case, the quality of your goal definitions, and the volume of executions generating feedback.

 **Q: How does Hive compare to other agent frameworks?**

 Hive focuses on generating agents that run real business processes, rather than generic agents. This vision emphasizes outcome-driven design, adaptability, and an easy-to-use set of tools and integrations.

-**Q: Does Aden offer enterprise support?**
-
-For enterprise inquiries, contact the Aden team through [adenhq.com](https://adenhq.com) or join our [Discord community](https://discord.com/invite/MXE49hrKDk) for support and discussions.
-
 ---

 <p align="center">
@@ -1,4 +1,5 @@
 exports/
 docs/
+.agent-builder-sessions/
 .pytest_cache/
 **/__pycache__/
@@ -82,7 +82,7 @@ Register an MCP server as a tool source for your agent.
    "example_tool"
  ],
  "total_mcp_servers": 1,
-  "note": "MCP server 'tools' registered with 6 tools. These tools can now be used in llm_tool_use nodes."
+  "note": "MCP server 'tools' registered with 6 tools. These tools can now be used in event_loop nodes."
 }
 ```

@@ -149,7 +149,7 @@ List tools available from registered MCP servers.
    ]
  },
  "total_tools": 6,
-  "note": "Use these tool names in the 'tools' parameter when adding llm_tool_use nodes"
+  "note": "Use these tool names in the 'tools' parameter when adding event_loop nodes"
 }
 ```

@@ -246,7 +246,7 @@ Here's a complete workflow for building an agent with MCP tools:
    "node_id": "web-searcher",
    "name": "Web Search",
    "description": "Search the web for information",
-    "node_type": "llm_tool_use",
+    "node_type": "event_loop",
    "input_keys": "[\"query\"]",
    "output_keys": "[\"search_results\"]",
    "system_prompt": "Search for {query} using the web_search tool",
@@ -119,7 +119,7 @@ builder = WorkflowBuilder()
 builder.add_node(
    node_id="researcher",
    name="Web Researcher",
-    node_type="llm_tool_use",
+    node_type="event_loop",
    system_prompt="Research the topic using web_search",
    tools=["web_search"],  # Tool from tools MCP server
    input_keys=["topic"],
@@ -137,7 +137,7 @@ Tools from MCP servers can be referenced in your agent.json just like built-in t
    {
      "id": "searcher",
      "name": "Web Searcher",
-      "node_type": "llm_tool_use",
+      "node_type": "event_loop",
      "system_prompt": "Search for information about {topic}",
      "tools": ["web_search", "web_scrape"],
      "input_keys": ["topic"],
@@ -103,31 +103,20 @@ Add a processing node to the agent graph.
 - `node_id` (string, required): Unique node identifier
 - `name` (string, required): Human-readable name
 - `description` (string, required): What this node does
- `node_type` (string, required): One of: `llm_generate`, `llm_tool_use`, `router`, `function`
+- `node_type` (string, required): Must be `event_loop` (the only valid type)
 - `input_keys` (string, required): JSON array of input variable names
 - `output_keys` (string, required): JSON array of output variable names
- `system_prompt` (string, optional): System prompt for LLM nodes
- `tools` (string, optional): JSON array of tool names for tool_use nodes
- `routes` (string, optional): JSON object of route mappings for router nodes
+- `system_prompt` (string, optional): System prompt for the LLM
+- `tools` (string, optional): JSON array of tool names
+- `client_facing` (boolean, optional): Set to true for human-in-the-loop interaction

-**Node Types:**
+**Node Type:**

-1. **llm_generate**: Uses LLM to generate output from inputs
-   - Requires: `system_prompt`
-   - Tools: Not used
-
-2. **llm_tool_use**: Uses LLM with tools to accomplish tasks
-   - Requires: `system_prompt`, `tools`
-   - Tools: Array of tool names (e.g., `["web_search", "web_fetch"]`)
-
-3. **router**: LLM-powered routing to different paths
-   - Requires: `system_prompt`, `routes`
-   - Routes: Object mapping route names to target node IDs
-   - Example: `{"pass": "success_node", "fail": "retry_node"}`
-
-4. **function**: Executes a pre-defined function
-   - System prompt describes the function behavior
-   - No LLM calls, pure computation
+**event_loop**: LLM-powered node with self-correction loop
+- Requires: `system_prompt`
+- Optional: `tools` (array of tool names, e.g., `["web_search", "web_fetch"]`)
+- Optional: `client_facing` (set to true for HITL / user interaction)
+- Supports: iterative refinement, judge-based evaluation, tool use, streaming

 **Example:**
 ```json
@@ -135,7 +124,7 @@ Add a processing node to the agent graph.
  "node_id": "search_sources",
  "name": "Search Sources",
  "description": "Searches for relevant sources on the topic",
-  "node_type": "llm_tool_use",
+  "node_type": "event_loop",
  "input_keys": "[\"topic\", \"search_queries\"]",
  "output_keys": "[\"sources\", \"source_count\"]",
  "system_prompt": "Search for sources using the provided queries...",
@@ -198,7 +187,7 @@ Export the validated graph as an agent specification.

 **What it does:**
 1. Validates the graph
-2. Auto-generates missing edges from router routes
+2. Validates edge connectivity
 3. Writes files to disk:
   - `exports/{agent-name}/agent.json` - Full agent specification
   - `exports/{agent-name}/README.md` - Auto-generated documentation
@@ -252,47 +241,6 @@ Test the complete agent graph with sample inputs.

 ---

-### Evaluation Rules
-
-#### `add_evaluation_rule`
-Add a rule for the HybridJudge to evaluate node outputs.
-
-**Parameters:**
- `rule_id` (string, required): Unique rule identifier
- `description` (string, required): What this rule checks
- `condition` (string, required): Python expression to evaluate
- `action` (string, required): Action to take: `accept`, `retry`, `escalate`
- `priority` (integer, optional): Rule priority (default: 0)
- `feedback_template` (string, optional): Feedback message template
-
-**Condition Examples:**
- `'result.get("success") == True'` - Check for success flag
- `'result.get("error_type") == "timeout"'` - Check error type
- `'len(result.get("data", [])) > 0'` - Check for non-empty data
-
-**Example:**
-```json
-{
-  "rule_id": "timeout_retry",
-  "description": "Retry on timeout errors",
-  "condition": "result.get('error_type') == 'timeout'",
-  "action": "retry",
-  "priority": 10,
-  "feedback_template": "Timeout occurred, retrying..."
-}
-```
-
-#### `list_evaluation_rules`
-List all configured evaluation rules.
-
-#### `remove_evaluation_rule`
-Remove an evaluation rule.
-
-**Parameters:**
- `rule_id` (string, required): Rule to remove
-
---
-
 ## Example Workflow

 Here's a complete workflow for building a research agent:
@@ -320,7 +268,7 @@ add_node(
    node_id="planner",
    name="Research Planner",
    description="Creates research strategy",
-    node_type="llm_generate",
+    node_type="event_loop",
    input_keys='["topic"]',
    output_keys='["strategy", "queries"]',
    system_prompt="Analyze topic and create research plan..."
@@ -330,7 +278,7 @@ add_node(
    node_id="searcher",
    name="Search Sources",
    description="Find relevant sources",
-    node_type="llm_tool_use",
+    node_type="event_loop",
    input_keys='["queries"]',
    output_keys='["sources"]',
    system_prompt="Search for sources...",
@@ -359,10 +307,9 @@ The exported agent will be saved to `exports/research-agent/`.

 1. **Start with the goal**: Define clear success criteria before building nodes
 2. **Test nodes individually**: Use `test_node` to verify each node works
-3. **Use router nodes for branching**: Don't create edges manually for routers - define routes and they'll be auto-generated
-4. **Add evaluation rules**: Help the judge evaluate outputs deterministically
-5. **Validate early, validate often**: Run `validate_graph` after adding nodes/edges
-6. **Check exports**: Review the generated README.md to verify your agent structure
+3. **Use conditional edges for branching**: Define `condition_expr` on edges for decision points
+4. **Validate early, validate often**: Run `validate_graph` after adding nodes/edges
+5. **Check exports**: Review the generated README.md to verify your agent structure

 ---

@@ -73,7 +73,7 @@ To use the agent builder with Claude Desktop or other MCP clients, add this to y
 The MCP server provides tools for:
 - Creating agent building sessions
 - Defining goals with success criteria
- Adding nodes (llm_generate, llm_tool_use, router, function)
+- Adding nodes (event_loop only)
 - Connecting nodes with edges
 - Validating and exporting agent graphs
 - Testing nodes and full agent graphs
@@ -68,7 +68,7 @@ from framework.graph.event_loop_node import (  # noqa: E402
 )
 from framework.graph.executor import GraphExecutor  # noqa: E402
 from framework.graph.goal import Goal  # noqa: E402
-from framework.graph.node import NodeSpec  # noqa: E402
+from framework.graph.node import NodeContext, NodeProtocol, NodeResult, NodeSpec  # noqa: E402
 from framework.llm.litellm import LiteLLMProvider  # noqa: E402
 from framework.runner.tool_registry import ToolRegistry  # noqa: E402
 from framework.runtime.core import Runtime  # noqa: E402
@@ -654,7 +654,7 @@ NODE_SPECS = {
        id="sender",
        name="Sender",
        description="Send approved campaign emails",
-        node_type="function",
+        node_type="event_loop",
        input_keys=["approved_emails"],
        output_keys=["send_results"],
    ),
@@ -823,11 +823,20 @@ def _send_email_via_resend(
        return {"error": f"Network error: {e}"}


+class SenderNode(NodeProtocol):
+    """Adapter that exposes ``send_emails`` as a graph node.
+
+    Reads ``approved_emails`` from the node input, delegates to
+    ``send_emails``, and publishes the returned string as ``send_results``
+    both in shared memory and in the node output.
+    """
+
+    async def execute(self, ctx: NodeContext) -> NodeResult:
+        payload = ctx.input_data.get("approved_emails", "")
+        send_results = send_emails(approved_emails=payload)
+        ctx.memory.write("send_results", send_results)
+        # The string returned by send_emails is passed through unchanged and
+        # the node always reports success.
+        return NodeResult(success=True, output={"send_results": send_results})
+
+
 def send_emails(approved_emails: str = "") -> str:
    """Send approved campaign emails via Resend, or log if unconfigured.

-    Called by FunctionNode which unpacks input_keys as kwargs.
-    Returns a JSON string (FunctionNode wraps it in NodeResult).
+    Returns a JSON string.
    """
    approved = approved_emails
    if not approved:
@@ -1780,7 +1789,7 @@ async def _run_pipeline(websocket, initial_message: str):
    )
    for nid, impl in nodes.items():
        executor.register_node(nid, impl)
-    executor.register_function("sender", send_emails)
+    executor.register_node("sender", SenderNode())

    # --- Event forwarding: bus → WebSocket ---

@@ -4,8 +4,8 @@ Minimal Manual Agent Example
 This example demonstrates how to build and run an agent programmatically
 without using the Claude Code CLI or external LLM APIs.

-It uses 'function' nodes to define logic in pure Python, making it perfect
-for understanding the core runtime loop:
+It uses custom NodeProtocol implementations to define logic in pure Python,
+making it perfect for understanding the core runtime loop:
 Setup -> Graph definition -> Execution -> Result

 Run with:
@@ -16,22 +16,33 @@ import asyncio

 from framework.graph import EdgeCondition, EdgeSpec, Goal, GraphSpec, NodeSpec
 from framework.graph.executor import GraphExecutor
+from framework.graph.node import NodeContext, NodeProtocol, NodeResult
 from framework.runtime.core import Runtime


-# 1. Define Node Logic (Pure Python Functions)
-def greet(name: str) -> str:
+# 1. Define Node Logic (Custom NodeProtocol implementations)
+class GreeterNode(NodeProtocol):
    """Generate a simple greeting."""
-    return f"Hello, {name}!"
+
+    async def execute(self, ctx: NodeContext) -> NodeResult:
+        name = ctx.input_data.get("name", "World")
+        greeting = f"Hello, {name}!"
+        ctx.memory.write("greeting", greeting)
+        return NodeResult(success=True, output={"greeting": greeting})


-def uppercase(greeting: str) -> str:
+class UppercaserNode(NodeProtocol):
    """Convert text to uppercase."""
-    return greeting.upper()
+
+    async def execute(self, ctx: NodeContext) -> NodeResult:
+        greeting = ctx.input_data.get("greeting") or ctx.memory.read("greeting") or ""
+        result = greeting.upper()
+        ctx.memory.write("final_greeting", result)
+        return NodeResult(success=True, output={"final_greeting": result})


 async def main():
-    print("🚀 Setting up Manual Agent...")
+    print("Setting up Manual Agent...")

    # 2. Define the Goal
    # Every agent needs a goal with success criteria
@@ -55,8 +66,7 @@ async def main():
        id="greeter",
        name="Greeter",
        description="Generates a simple greeting",
-        node_type="function",
-        function="greet",  # Matches the registered function name
+        node_type="event_loop",
        input_keys=["name"],
        output_keys=["greeting"],
    )
@@ -65,8 +75,7 @@ async def main():
        id="uppercaser",
        name="Uppercaser",
        description="Converts greeting to uppercase",
-        node_type="function",
-        function="uppercase",
+        node_type="event_loop",
        input_keys=["greeting"],
        output_keys=["final_greeting"],
    )
@@ -98,23 +107,23 @@ async def main():
    runtime = Runtime(storage_path=Path("./agent_logs"))
    executor = GraphExecutor(runtime=runtime)

-    # 7. Register Function Implementations
-    # Connect string names in NodeSpecs to actual Python functions
-    executor.register_function("greeter", greet)
-    executor.register_function("uppercaser", uppercase)
+    # 7. Register Node Implementations
+    # Connect node IDs in the graph to actual Python implementations
+    executor.register_node("greeter", GreeterNode())
+    executor.register_node("uppercaser", UppercaserNode())

    # 8. Execute Agent
-    print("▶ Executing agent with input: name='Alice'...")
+    print("Executing agent with input: name='Alice'...")

    result = await executor.execute(graph=graph, goal=goal, input_data={"name": "Alice"})

    # 9. Verify Results
    if result.success:
-        print("\n✅ Success!")
+        print("\nSuccess!")
        print(f"Path taken: {' -> '.join(result.path)}")
        print(f"Final output: {result.output.get('final_greeting')}")
    else:
-        print(f"\n❌ Failed: {result.error}")
+        print(f"\nFailed: {result.error}")


 if __name__ == "__main__":
@@ -122,7 +122,7 @@ async def example_4_custom_agent_with_mcp_tools():
        node_id="web-searcher",
        name="Web Search",
        description="Search the web for information",
-        node_type="llm_tool_use",
+        node_type="event_loop",
        system_prompt="Search for {query} and return the top results. Use the web_search tool.",
        tools=["web_search"],  # This tool comes from tools MCP server
        input_keys=["query"],
@@ -133,7 +133,7 @@ async def example_4_custom_agent_with_mcp_tools():
        node_id="summarizer",
        name="Summarize Results",
        description="Summarize the search results",
-        node_type="llm_generate",
+        node_type="event_loop",
        system_prompt="Summarize the following search results in 2-3 sentences: {search_results}",
        input_keys=["search_results"],
        output_keys=["summary"],
@@ -0,0 +1,13 @@
+"""Framework-provided agents."""
+
+from pathlib import Path
+
+FRAMEWORK_AGENTS_DIR = Path(__file__).parent
+
+
+def list_framework_agents() -> list[Path]:
+    """Return framework agent directories, ordered by directory name.
+
+    A child of ``FRAMEWORK_AGENTS_DIR`` counts as an agent when it is a
+    directory that contains an ``agent.py`` file.
+    """
+    agent_dirs = [
+        child
+        for child in FRAMEWORK_AGENTS_DIR.iterdir()
+        if child.is_dir() and (child / "agent.py").exists()
+    ]
+    agent_dirs.sort(key=lambda child: child.name)
+    return agent_dirs
@@ -0,0 +1,55 @@
+"""
+Credential Tester — verify credentials (Aden OAuth + local API keys) via live API calls.
+
+Interactive agent that lists all testable accounts, lets the user pick one,
+loads the provider's tools, and runs a chat session to test the credential.
+"""
+
+from .agent import (
+    CredentialTesterAgent,
+    _list_aden_accounts,
+    _list_env_fallback_accounts,
+    _list_local_accounts,
+    configure_for_account,
+    conversation_mode,
+    edges,
+    entry_node,
+    entry_points,
+    get_tools_for_provider,
+    goal,
+    identity_prompt,
+    list_connected_accounts,
+    loop_config,
+    nodes,
+    pause_nodes,
+    requires_account_selection,
+    skip_credential_validation,
+    terminal_nodes,
+)
+from .config import default_config
+
+__version__ = "1.0.0"
+
+__all__ = [
+    "CredentialTesterAgent",
+    "configure_for_account",
+    "conversation_mode",
+    "default_config",
+    "edges",
+    "entry_node",
+    "entry_points",
+    "get_tools_for_provider",
+    "goal",
+    "identity_prompt",
+    "list_connected_accounts",
+    "loop_config",
+    "nodes",
+    "pause_nodes",
+    "requires_account_selection",
+    "skip_credential_validation",
+    "terminal_nodes",
+    # Internal list helpers (exposed for testing)
+    "_list_aden_accounts",
+    "_list_local_accounts",
+    "_list_env_fallback_accounts",
+]
@@ -0,0 +1,148 @@
+"""CLI entry point for Credential Tester agent."""
+
+import asyncio
+import logging
+import sys
+
+import click
+
+from .agent import CredentialTesterAgent
+
+
+def setup_logging(verbose=False, debug=False):
+    """Configure root logging on stderr.
+
+    ``debug`` wins over ``verbose``; with neither flag set only warnings
+    and above are emitted.
+    """
+    if debug:
+        settings = {"level": logging.DEBUG, "format": "%(asctime)s %(name)s: %(message)s"}
+    elif verbose:
+        settings = {"level": logging.INFO, "format": "%(message)s"}
+    else:
+        settings = {"level": logging.WARNING, "format": "%(levelname)s: %(message)s"}
+    logging.basicConfig(stream=sys.stderr, **settings)
+
+
+def pick_account(agent: CredentialTesterAgent) -> dict | None:
+    """Prompt the user to choose one of the agent's accounts.
+
+    Prints a numbered list of accounts and keeps prompting until a number
+    within range is entered. Returns the chosen account dict, or None when
+    the agent reports no accounts.
+    """
+    accounts = agent.list_accounts()
+    if not accounts:
+        click.echo("No connected accounts found.")
+        click.echo("Set ADEN_API_KEY and connect accounts at https://app.adenhq.com")
+        return None
+
+    click.echo("\nConnected accounts:\n")
+    for number, entry in enumerate(accounts, start=1):
+        label = f"{entry.get('provider', '?')}/{entry.get('alias', '?')}"
+        # Only identity fields with a truthy value are displayed.
+        shown = ", ".join(f"{k}: {v}" for k, v in entry.get("identity", {}).items() if v)
+        suffix = f"  ({shown})" if shown else ""
+        click.echo(f"  {number}. {label}{suffix}")
+
+    click.echo()
+    total = len(accounts)
+    while True:
+        choice = click.prompt("Pick an account to test", type=int, default=1)
+        if choice < 1 or choice > total:
+            click.echo(f"Invalid choice. Enter 1-{total}.")
+            continue
+        return accounts[choice - 1]
+
+
+@click.group()
+@click.version_option(version="1.0.0")
+def cli():
+    """Credential Tester — verify synced credentials via live API calls."""
+
+
+@cli.command()
+@click.option("--verbose", "-v", is_flag=True)
+@click.option("--debug", is_flag=True)
+def tui(verbose, debug):
+    """Launch TUI to test a credential interactively."""
+    setup_logging(verbose=verbose, debug=debug)
+
+    # Imported here so a failed TUI import is reported with an install hint.
+    try:
+        from framework.tui.app import AdenTUI
+    except ImportError:
+        click.echo("TUI requires 'textual'. Install with: pip install textual")
+        sys.exit(1)
+
+    agent = CredentialTesterAgent()
+    selected = pick_account(agent)
+    if selected is None:
+        sys.exit(1)
+
+    agent.select_account(selected)
+    click.echo(f"\nTesting {selected.get('provider', '?')}/{selected.get('alias', '?')}...\n")
+
+    async def _session():
+        # Build the runtime, then keep it running for the lifetime of the TUI.
+        agent._setup()
+        runtime = agent._agent_runtime
+        await runtime.start()
+        try:
+            await AdenTUI(runtime).run_async()
+        finally:
+            # Stop the runtime even if the TUI exits with an error.
+            await runtime.stop()
+
+    asyncio.run(_session())
+
+
+@cli.command()
+@click.option("--verbose", "-v", is_flag=True)
+@click.option("--debug", is_flag=True)
+def shell(verbose, debug):
+    """Interactive CLI session to test a credential."""
+    setup_logging(debug=debug, verbose=verbose)
+    session = _interactive_shell(verbose)
+    asyncio.run(session)
+
+
+async def _interactive_shell(verbose=False):
+    """Pick an account, run one agent session from the "start" entry point, then stop.
+
+    ``verbose`` is accepted but not read inside this function.
+    """
+    agent = CredentialTesterAgent()
+    selected = pick_account(agent)
+    if selected is None:
+        return
+
+    agent.select_account(selected)
+    click.echo(f"\nTesting {selected.get('provider', '?')}/{selected.get('alias', '?')}")
+    click.echo("Type your requests or 'quit' to exit.\n")
+
+    await agent.start()
+
+    try:
+        outcome = await agent._agent_runtime.trigger_and_wait(
+            entry_point_id="start",
+            input_data={},
+        )
+        if outcome:
+            status = "success" if outcome.success else outcome.error
+            click.echo(f"\nSession ended: {status}")
+    except KeyboardInterrupt:
+        click.echo("\nGoodbye!")
+    finally:
+        # Always shut the agent down, including after Ctrl+C.
+        await agent.stop()
+
+
+@cli.command(name="list")
+def list_accounts():
+    """List all connected accounts."""
+    accounts = CredentialTesterAgent().list_accounts()
+    if not accounts:
+        click.echo("No connected accounts found.")
+        return
+
+    click.echo("\nConnected accounts:\n")
+    for entry in accounts:
+        label = f"{entry.get('provider', '?')}/{entry.get('alias', '?')}"
+        # Only identity fields with a truthy value are displayed.
+        shown = ", ".join(f"{k}: {v}" for k, v in entry.get("identity", {}).items() if v)
+        suffix = f"  ({shown})" if shown else ""
+        click.echo(f"  {label}{suffix}")
+
+
+if __name__ == "__main__":
+    cli()
@@ -0,0 +1,621 @@
+"""Credential Tester agent — verify credentials via live API calls.
+
+Supports both Aden OAuth2-synced accounts AND locally-stored API key accounts.
+Aden accounts use account="alias" routing; local accounts inject the key into
+the session environment so tools read it without an account= parameter.
+
+When loaded via AgentRunner.load() (TUI picker, ``hive run``), the module-level
+``nodes`` / ``edges`` variables provide a static graph.  The TUI detects
+``requires_account_selection`` and shows an account picker *before* starting
+the agent.  ``configure_for_account()`` then scopes the node's tools to the
+selected provider.
+
+When used directly (``CredentialTesterAgent``), the graph is built dynamically
+after the user picks an account programmatically.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from framework.graph import Goal, NodeSpec, SuccessCriterion
+from framework.graph.checkpoint_config import CheckpointConfig
+from framework.graph.edge import GraphSpec
+from framework.graph.executor import ExecutionResult
+from framework.llm import LiteLLMProvider
+from framework.runner.tool_registry import ToolRegistry
+from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
+from framework.runtime.execution_stream import EntryPointSpec
+
+from .config import default_config
+from .nodes import build_tester_node
+
+if TYPE_CHECKING:
+    from framework.runner import AgentRunner
+
+# ---------------------------------------------------------------------------
+# Goal
+# ---------------------------------------------------------------------------
+
+goal = Goal(
+    id="credential-tester",
+    name="Credential Tester",
+    description="Verify that a credential can make real API calls.",
+    success_criteria=[
+        SuccessCriterion(
+            id="api-call-success",
+            description="At least one API call succeeds using the credential",
+            metric="api_call_success",
+            target="true",
+            weight=1.0,
+        ),
+    ],
+    constraints=[],
+)
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def get_tools_for_provider(provider_name: str) -> list[str]:
+    """Return the sorted, de-duplicated tool names tied to ``provider_name``.
+
+    A credential spec contributes its tools when either its ``credential_id``
+    (e.g. "google" → Gmail tools) or its ``credential_group``
+    (e.g. "google_custom_search" → all google search tools) equals
+    ``provider_name``.
+    """
+    from aden_tools.credentials import CREDENTIAL_SPECS
+
+    matched = {
+        tool
+        for spec in CREDENTIAL_SPECS.values()
+        if provider_name in (spec.credential_id, spec.credential_group)
+        for tool in spec.tools
+    }
+    return sorted(matched)
+
+
+def _list_aden_accounts() -> list[dict]:
+    """Return active Aden integrations as account dicts.
+
+    Gives an empty list when ``ADEN_API_KEY`` is unset or empty, and also when
+    anything fails while importing or calling the Aden client (best-effort).
+    """
+    import os
+
+    if not os.environ.get("ADEN_API_KEY"):
+        return []
+
+    try:
+        from framework.credentials.aden.client import AdenClientConfig, AdenCredentialClient
+
+        base_url = os.environ.get("ADEN_API_URL", "https://api.adenhq.com")
+        client = AdenCredentialClient(AdenClientConfig(base_url=base_url))
+        try:
+            integrations = client.list_integrations()
+        finally:
+            # Release the client whether or not the listing succeeded.
+            client.close()
+
+        active: list[dict] = []
+        for item in integrations:
+            if item.status != "active":
+                continue
+            active.append(
+                {
+                    "provider": item.provider,
+                    "alias": item.alias,
+                    "identity": {"email": item.email} if item.email else {},
+                    "integration_id": item.integration_id,
+                    "source": "aden",
+                }
+            )
+        return active
+    except Exception:
+        return []
+
+
+def _list_local_accounts() -> list[dict]:
+    """Return account dicts for every entry in the default LocalCredentialRegistry.
+
+    Any failure, including a failed import of the registry, yields an empty list.
+    """
+    try:
+        from framework.credentials.local.registry import LocalCredentialRegistry
+
+        registry = LocalCredentialRegistry.default()
+        return [entry.to_account_dict() for entry in registry.list_accounts()]
+    except Exception:
+        return []
+
+
+def _list_env_fallback_accounts() -> list[dict]:
+    """Expose configured-but-unregistered credentials as testable accounts.
+
+    A credential qualifies when its env var is set, or when the encrypted
+    store holds an entry under the credential's own name (the old flat
+    format, e.g. ``brave_search`` with no alias). This covers users who have
+    a working key but have not yet run ``save_account()``.
+    Each result uses alias="default" and status="unknown". Grouped
+    credentials produce a single entry, and only when every member of the
+    group is configured.
+    """
+    import os
+
+    from aden_tools.credentials import CREDENTIAL_SPECS
+
+    # IDs present in the encrypted store (includes old flat entries like "brave_search")
+    try:
+        from framework.credentials.storage import EncryptedFileStorage
+
+        encrypted_ids: set[str] = set(EncryptedFileStorage().list_all())
+    except Exception:
+        encrypted_ids = set()
+
+    def _is_configured(cred_name: str, spec) -> bool:
+        # Configured means: env var set to a non-empty value, or an entry with
+        # this exact name in the encrypted store (old flat format, no slash —
+        # new entries have {x}/{y}).
+        return bool(os.environ.get(spec.env_var)) or cred_name in encrypted_ids
+
+    handled_groups: set[str] = set()
+    fallbacks: list[dict] = []
+
+    for cred_name, spec in CREDENTIAL_SPECS.items():
+        if not (spec.direct_api_key_supported and spec.tools):
+            continue
+
+        group = spec.credential_group
+        if not group:
+            if not _is_configured(cred_name, spec):
+                continue
+            provider = cred_name
+        else:
+            if group in handled_groups:
+                continue
+            members = [(n, s) for n, s in CREDENTIAL_SPECS.items() if s.credential_group == group]
+            if not all(_is_configured(n, s) for n, s in members):
+                continue
+            handled_groups.add(group)
+            provider = group
+
+        fallbacks.append(
+            {
+                "provider": provider,
+                "alias": "default",
+                "identity": {},
+                "integration_id": None,
+                "source": "local",
+                "status": "unknown",
+            }
+        )
+
+    return fallbacks
+
+
+def list_connected_accounts() -> list[dict]:
+    """Return every testable account: Aden first, then named local, then env-var fallbacks.
+
+    An env-var fallback is dropped when a named local account already exists
+    for the same provider.
+    """
+    aden_accounts = _list_aden_accounts()
+    local_accounts = _list_local_accounts()
+
+    named_providers = {acct["provider"] for acct in local_accounts}
+    fallbacks = []
+    for candidate in _list_env_fallback_accounts():
+        if candidate["provider"] in named_providers:
+            continue
+        fallbacks.append(candidate)
+
+    return aden_accounts + local_accounts + fallbacks
+
+
+# ---------------------------------------------------------------------------
+# Module-level hooks (read by AgentRunner.load / TUI)
+# ---------------------------------------------------------------------------
+
+skip_credential_validation = True
+"""Don't validate credentials at load time — we don't know which provider yet."""
+
+requires_account_selection = True
+"""Signal TUI to show account picker before starting the agent."""
+
+
+def configure_for_account(runner: AgentRunner, account: dict) -> None:
+    """Point the tester node at the tools of the chosen account's provider.
+
+    Aden accounts (the default when ``source`` is missing) additionally get
+    the ``get_account_info`` tool and an ``account=`` routing prompt. Any
+    other source is treated as a local key: it is activated in the session
+    environment first, then the node is configured without account routing.
+    """
+    provider = account["provider"]
+    alias = account.get("alias", "unknown")
+    identity = account.get("identity", {})
+    tools = get_tools_for_provider(provider)
+
+    if account.get("source", "aden") != "aden":
+        status = account.get("status", "unknown")
+        _activate_local_account(provider, alias)
+        _configure_local_node(runner, provider, alias, identity, tools, status)
+        return
+
+    tools.append("get_account_info")
+    email = identity.get("email", "")
+    suffix = f" (email: {email})" if email else ""
+    _configure_aden_node(runner, provider, alias, suffix, tools)
+
+
+def _activate_local_account(credential_id: str, alias: str) -> None:
+    """Inject a named local account's key into the session environment.
+
+    For every credential spec matching ``credential_id`` (by group, by
+    credential id, or by spec name), the spec's env var is filled from the
+    first source that yields a key:
+
+    1. Env var already set — skip injection (nothing to do)
+    2. Named account in LocalCredentialRegistry (new format: {credential_id}/{alias})
+    3. Old flat credential in EncryptedFileStorage (id == credential_id, no alias)
+
+    Best-effort: a failure is logged at debug level and otherwise ignored;
+    env vars injected before the failure stay set.
+    """
+    import logging
+    import os
+
+    from aden_tools.credentials import CREDENTIAL_SPECS
+
+    # Collect specs for this credential (handles grouped credentials too)
+    group_specs = [
+        (cred_name, spec)
+        for cred_name, spec in CREDENTIAL_SPECS.items()
+        if spec.credential_group == credential_id
+        or spec.credential_id == credential_id
+        or cred_name == credential_id
+    ]
+    # Several matching specs may share one env var; handle each env var once.
+    seen_env_vars: set[str] = set()
+
+    try:
+        from framework.credentials.local.registry import LocalCredentialRegistry
+        from framework.credentials.storage import EncryptedFileStorage
+
+        registry = LocalCredentialRegistry.default()
+        flat_storage = EncryptedFileStorage()
+
+        for _cred_name, spec in group_specs:
+            if spec.env_var in seen_env_vars:
+                continue
+            seen_env_vars.add(spec.env_var)
+
+            # If env var is already set, nothing to do for this one
+            if os.environ.get(spec.env_var):
+                continue
+
+            # Grouped credentials whose env var mentions "cse" are looked up
+            # under "cse_id"; everything else uses "api_key".
+            key_name = "api_key"
+            if spec.credential_group and "cse" in spec.env_var.lower():
+                key_name = "cse_id"
+
+            # 1. Try the named account in the registry (new format). The same
+            #    lookup serves the "default" alias.
+            key: str | None = registry.get_key(credential_id, alias, key_name)
+
+            # 2. Fall back to old flat encrypted entry (id == credential_id, no alias)
+            if key is None:
+                flat_cred = flat_storage.load(credential_id)
+                if flat_cred is not None:
+                    key = flat_cred.get_key(key_name) or flat_cred.get_default_key()
+
+            if key:
+                os.environ[spec.env_var] = key
+    except Exception:
+        # Keep activation best-effort, but leave a trace for troubleshooting.
+        logging.getLogger(__name__).debug(
+            "Could not activate local account %s/%s", credential_id, alias, exc_info=True
+        )
+
+
+def _configure_aden_node(
+    runner: AgentRunner,
+    provider: str,
+    alias: str,
+    detail: str,
+    tools: list[str],
+) -> None:
+    """Configure the "tester" node and intro message for an Aden account.
+
+    Sets the node's tools to the sorted, de-duplicated ``tools`` and installs
+    a system prompt that tells the model to pass ``account="<alias>"`` on
+    every tool call. ``detail`` is appended verbatim to the account label.
+    The intro message is set even when no "tester" node exists.
+    """
+    # De-duplicate once so the node and the intro message agree on the count.
+    unique_tools = sorted(set(tools))
+
+    for node in runner.graph.nodes:
+        if node.id == "tester":
+            node.tools = unique_tools
+            node.system_prompt = f"""\
+You are a credential tester for the account: {provider}/{alias}{detail}
+
+# Instructions
+
+1. Suggest a simple read-only API call to verify the credential works \
+(e.g. list messages, list channels, list contacts).
+2. Execute the call when the user agrees.
+3. Report the result: success (with sample data) or failure (with error).
+4. Let the user request additional API calls to further test the credential.
+
+# Account routing
+
+IMPORTANT: Always pass `account="{alias}"` when calling any tool. \
+This routes the API call to the correct credential. Never use the email \
+or any other identifier — always use the alias exactly as shown.
+
+# Rules
+
+- Start with read-only operations (list, get) before write operations.
+- Always confirm with the user before performing write operations.
+- If a call fails, report the exact error — this helps diagnose credential issues.
+- Be concise. No emojis.
+"""
+            break
+
+    runner.intro_message = (
+        f"Testing {provider}/{alias}{detail} — "
+        f"{len(unique_tools)} tools loaded. "
+        "I'll suggest a read-only API call to verify the credential works."
+    )
+
+
+def _configure_local_node(
+    runner: AgentRunner,
+    provider: str,
+    alias: str,
+    identity: dict,
+    tools: list[str],
+    status: str,
+) -> None:
+    """Configure the "tester" node and intro message for a local API key.
+
+    Sets the node's tools to the sorted, de-duplicated ``tools`` and installs
+    a system prompt that tells the model not to pass an ``account``
+    parameter. Truthy ``identity`` fields are shown next to the account
+    label, and a status of "unknown" adds a "[key not yet validated]" note
+    to the prompt. The intro message is set even when no "tester" node exists.
+    """
+    identity_parts = [f"{k}: {v}" for k, v in identity.items() if v]
+    detail = f" ({', '.join(identity_parts)})" if identity_parts else ""
+    status_note = " [key not yet validated]" if status == "unknown" else ""
+    # De-duplicate once so the node and the intro message agree on the count.
+    unique_tools = sorted(set(tools))
+
+    for node in runner.graph.nodes:
+        if node.id == "tester":
+            node.tools = unique_tools
+            node.system_prompt = f"""\
+You are a credential tester for the local API key: {provider}/{alias}{detail}{status_note}
+
+# Instructions
+
+1. Suggest a simple test call to verify the credential works \
+(e.g. search for "test", list items, get profile info).
+2. Execute the call when the user agrees.
+3. Report the result: success (with sample data) or failure (with error).
+4. Let the user request additional API calls to further test the credential.
+
+# Rules
+
+- Do NOT pass an `account` parameter — this credential is injected \
+directly into the session environment and tools read it automatically.
+- Start with read-only operations before write operations.
+- Always confirm with the user before performing write operations.
+- If a call fails, report the exact error — this helps diagnose credential issues.
+- Be concise. No emojis.
+"""
+            break
+
+    runner.intro_message = (
+        f"Testing {provider}/{alias}{detail} — "
+        f"{len(unique_tools)} tools loaded. "
+        "I'll suggest a test API call to verify the credential works."
+    )
+
+
+# ---------------------------------------------------------------------------
+# Module-level graph variables (read by AgentRunner.load)
+# ---------------------------------------------------------------------------
+
+# Static single-node graph: one client-facing event-loop node and no edges.
+# Only ``get_account_info`` is listed in ``tools`` here; the prompt has the
+# model list the connected accounts first and then ask which one to test.
+nodes = [
+    NodeSpec(
+        id="tester",
+        name="Credential Tester",
+        description=(
+            "Interactive credential testing — lets the user pick an account "
+            "and verify it via API calls."
+        ),
+        node_type="event_loop",
+        client_facing=True,
+        # NOTE(review): 0 presumably means "no visit limit" — confirm against NodeSpec.
+        max_node_visits=0,
+        input_keys=[],
+        output_keys=[],
+        tools=["get_account_info"],
+        system_prompt="""\
+You are a credential tester. Your job is to help the user verify that their \
+connected accounts and API keys can make real API calls.
+
+# Startup
+
+1. Call ``get_account_info`` to list the user's connected accounts.
+2. Present the list and ask the user which account to test.
+3. Once they pick one, note the account's **alias** (e.g. "Timothy", "work-slack").
+4. Suggest a simple read-only API call to verify the credential works \
+(e.g. list messages, list channels, list contacts).
+5. Execute the call when the user agrees.
+6. Report the result: success (with sample data) or failure (with error).
+7. Let the user request additional API calls to further test the credential.
+
+# Account routing (Aden accounts only)
+
+IMPORTANT: For Aden-synced accounts, always pass the account's **alias** as the \
+``account`` parameter when calling any tool. For local API key accounts, do NOT \
+pass an account parameter — they are pre-injected into the session.
+
+# Rules
+
+- Start with read-only operations (list, get) before write operations.
+- Always confirm with the user before performing write operations.
+- If a call fails, report the exact error — this helps diagnose credential issues.
+- Be concise. No emojis.
+""",
+    ),
+]
+
+# A single node needs no transitions.
+edges = []
+
+# Both the entry node and the "start" entry point refer to the NodeSpec id above.
+entry_node = "tester"
+entry_points = {"start": "tester"}
+pause_nodes = []
+terminal_nodes = []  # Forever-alive: loops until user exits
+
+conversation_mode = "continuous"
+identity_prompt = (
+    "You are a credential tester that verifies connected accounts and API keys "
+    "can make real API calls."
+)
+# Limits for the event loop. The same three values are repeated in
+# CredentialTesterAgent._build_graph below.
+# NOTE(review): the exact meaning of each key is defined by the framework — confirm there.
+loop_config = {
+    "max_iterations": 50,
+    "max_tool_calls_per_turn": 10,
+    "max_history_tokens": 32000,
+}
+
+# ---------------------------------------------------------------------------
+# Programmatic agent class (used by __main__.py CLI)
+# ---------------------------------------------------------------------------
+
+
+class CredentialTesterAgent:
+    """Interactive agent that tests a specific credential via API calls.
+
+    An account must be selected before the runtime is built, because the
+    graph, its prompt and its tool list are all derived from that account.
+
+    Usage:
+        agent = CredentialTesterAgent()
+        accounts = agent.list_accounts()
+        agent.select_account(accounts[0])
+        await agent.start()
+        await agent.stop()
+    """
+
+    def __init__(self, config=None):
+        """Create a tester with no account selected and no runtime built.
+
+        Args:
+            config: Runtime configuration; ``default_config`` is used when
+                this is omitted or falsy.
+        """
+        self.config = config or default_config
+        self._selected_account: dict | None = None
+        self._agent_runtime: AgentRuntime | None = None
+        self._tool_registry: ToolRegistry | None = None
+        self._storage_path: Path | None = None
+
+    def list_accounts(self) -> list[dict]:
+        """List all testable accounts (Aden + local named + env-var fallbacks)."""
+        return list_connected_accounts()
+
+    def select_account(self, account: dict) -> None:
+        """Select an account to test.
+
+        Args:
+            account: Account dict from list_accounts() with
+                     provider, alias, identity, source keys.
+        """
+        self._selected_account = account
+
+    @property
+    def selected_provider(self) -> str:
+        """Provider name of the selected account.
+
+        Raises:
+            RuntimeError: If no account has been selected.
+            KeyError: If the selected account dict has no ``provider`` key.
+        """
+        if self._selected_account is None:
+            raise RuntimeError("No account selected. Call select_account() first.")
+        return self._selected_account["provider"]
+
+    @property
+    def selected_alias(self) -> str:
+        """Alias of the selected account, or ``"unknown"`` when it has none.
+
+        Raises:
+            RuntimeError: If no account has been selected.
+        """
+        if self._selected_account is None:
+            raise RuntimeError("No account selected. Call select_account() first.")
+        return self._selected_account.get("alias", "unknown")
+
+    def _build_graph(self) -> GraphSpec:
+        """Build the single-node graph for the selected account.
+
+        Local accounts are activated via ``_activate_local_account``; Aden
+        accounts additionally get the ``get_account_info`` tool.
+
+        Raises:
+            RuntimeError: If no account has been selected.
+        """
+        provider = self.selected_provider
+        alias = self.selected_alias
+        source = self._selected_account.get("source", "aden")
+        identity = self._selected_account.get("identity", {})
+        # Copy before modifying: the list handed back by get_tools_for_provider()
+        # may be shared, and appending to it in place would leak into later calls.
+        tools = list(get_tools_for_provider(provider))
+
+        if source == "local":
+            _activate_local_account(provider, alias)
+        elif source == "aden" and "get_account_info" not in tools:
+            tools.append("get_account_info")
+
+        tester_node = build_tester_node(
+            provider=provider,
+            alias=alias,
+            tools=tools,
+            identity=identity,
+            source=source,
+        )
+
+        return GraphSpec(
+            id="credential-tester-graph",
+            goal_id=goal.id,
+            version="1.0.0",
+            entry_node="tester",
+            entry_points={"start": "tester"},
+            terminal_nodes=[],
+            pause_nodes=[],
+            nodes=[tester_node],
+            edges=[],
+            default_model=self.config.model,
+            max_tokens=self.config.max_tokens,
+            # Reuse the module-level limits so the two definitions cannot drift;
+            # pass a copy so the graph never aliases the module-level dict.
+            loop_config=dict(loop_config),
+            conversation_mode="continuous",
+            identity_prompt=(
+                f"You are testing the {provider}/{alias} credential. "
+                "Help the user verify it works by making real API calls."
+            ),
+        )
+
+    def _setup(self) -> None:
+        """Create the storage directory, tool registry, LLM and agent runtime.
+
+        Raises:
+            RuntimeError: If no account has been selected.
+        """
+        if self._selected_account is None:
+            raise RuntimeError("No account selected. Call select_account() first.")
+
+        self._storage_path = Path.home() / ".hive" / "agents" / "credential_tester"
+        self._storage_path.mkdir(parents=True, exist_ok=True)
+
+        self._tool_registry = ToolRegistry()
+
+        # The MCP config sits next to this module; it is optional.
+        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
+        if mcp_config_path.exists():
+            self._tool_registry.load_mcp_config(mcp_config_path)
+
+        # Configs without an ``extra_kwargs`` attribute (or with None) pass nothing extra.
+        extra_kwargs = getattr(self.config, "extra_kwargs", {}) or {}
+        llm = LiteLLMProvider(
+            model=self.config.model,
+            api_key=self.config.api_key,
+            api_base=self.config.api_base,
+            **extra_kwargs,
+        )
+
+        tool_executor = self._tool_registry.get_executor()
+        tools = list(self._tool_registry.get_tools().values())
+
+        graph = self._build_graph()
+
+        self._agent_runtime = create_agent_runtime(
+            graph=graph,
+            goal=goal,
+            storage_path=self._storage_path,
+            entry_points=[
+                EntryPointSpec(
+                    id="start",
+                    name="Test Credential",
+                    entry_node="tester",
+                    trigger_type="manual",
+                    isolation_level="isolated",
+                ),
+            ],
+            llm=llm,
+            tools=tools,
+            tool_executor=tool_executor,
+            checkpoint_config=CheckpointConfig(enabled=False),
+            graph_id="credential_tester",
+        )
+
+    async def start(self) -> None:
+        """Set up (on first use) and start the agent runtime.
+
+        Raises:
+            RuntimeError: If no account has been selected.
+        """
+        if self._agent_runtime is None:
+            self._setup()
+        if not self._agent_runtime.is_running:
+            await self._agent_runtime.start()
+
+    async def stop(self) -> None:
+        """Stop the agent runtime if it is running and discard it.
+
+        The runtime reference is always cleared, so the next ``start()``
+        rebuilds it from the currently selected account.
+        """
+        if self._agent_runtime and self._agent_runtime.is_running:
+            await self._agent_runtime.stop()
+        self._agent_runtime = None
+
+    async def run(self) -> ExecutionResult:
+        """Run the agent once (start, trigger the "start" entry point, stop).
+
+        Returns:
+            The runtime's result, or a failed ``ExecutionResult`` with the
+            error "Execution timeout" when the runtime returns a falsy result.
+        """
+        try:
+            # start() is inside the try so a failure part-way through start-up
+            # still goes through stop() and leaves the object reusable.
+            await self.start()
+            result = await self._agent_runtime.trigger_and_wait(
+                entry_point_id="start",
+                input_data={},
+            )
+            return result or ExecutionResult(success=False, error="Execution timeout")
+        finally:
+            await self.stop()
@@ -0,0 +1,19 @@
+"""Runtime configuration for Credential Tester agent."""
+
+from dataclasses import dataclass
+
+from framework.config import RuntimeConfig
+
+
+@dataclass
+class AgentMetadata:
+    """Static, human-readable metadata describing the Credential Tester agent."""
+
+    # Display name of the agent.
+    name: str = "Credential Tester"
+    # Version string for this agent package.
+    version: str = "1.0.0"
+    # Short summary of what the agent does.
+    description: str = (
+        "Test connected accounts by making real API calls. "
+        "Pick an account, verify credentials work, and explore available tools."
+    )
+
+
+# Shared module-level instances.
+metadata = AgentMetadata()
+# RuntimeConfig with only the temperature overridden; every other field keeps
+# whatever default RuntimeConfig defines.
+default_config = RuntimeConfig(temperature=0.3)
@@ -0,0 +1,9 @@
+{
+  "hive-tools": {
+    "transport": "stdio",
+    "command": "uv",
+    "args": ["run", "python", "mcp_server.py", "--stdio"],
+    "cwd": "../../../../tools",
+    "description": "Hive tools MCP server with provider-specific tools"
+  }
+}
@@ -0,0 +1,84 @@
+"""Node definitions for Credential Tester agent."""
+
+from framework.graph import NodeSpec
+
+
+def build_tester_node(
+    provider: str,
+    alias: str,
+    tools: list[str],
+    identity: dict[str, str],
+    source: str = "aden",
+) -> NodeSpec:
+    """Create the account-specific ``tester`` node.
+
+    The node's description and system prompt are rendered from the arguments,
+    so a fresh node is built for each selected account.
+
+    Args:
+        provider: Provider / credential name (e.g. "google", "brave_search").
+        alias: User-set alias (e.g. "Timothy", "work").
+        tools: Names of the tools the node may call; also listed in the prompt.
+        identity: Identity fields (email, workspace, etc.); entries with falsy
+            values are left out of the prompt header.
+        source: "aden" selects the alias-routing instructions; any other value
+            is treated as a local API key credential.
+
+    Returns:
+        A client-facing ``event_loop`` NodeSpec with id ``"tester"``.
+    """
+    uses_aden_routing = source == "aden"
+
+    # Render only the populated identity fields, e.g. " (email: a@b.c)".
+    populated_fields = []
+    for field_name, field_value in identity.items():
+        if field_value:
+            populated_fields.append(f"{field_name}: {field_value}")
+    detail = ""
+    if populated_fields:
+        detail = " (" + ", ".join(populated_fields) + ")"
+
+    # Pick the label and the routing instructions together: both depend only
+    # on whether the credential is routed through Aden.
+    if not uses_aden_routing:
+        account_label = "local API key"
+        routing_section = """\
+# Credential routing
+
+This is a local API key credential — do NOT pass an `account` parameter. \
+The key is pre-injected into the session environment and tools read it automatically.
+"""
+    else:
+        account_label = "account"
+        routing_section = f"""\
+# Account routing
+
+IMPORTANT: Always pass `account="{alias}"` when calling any tool. \
+This routes the API call to the correct credential. Never use the email \
+or any other identifier — always use the alias exactly as shown.
+"""
+
+    node_description = (
+        f"Interactive testing node for {provider}/{alias}. "
+        f"Has access to all {provider} tools to verify the credential works."
+    )
+
+    # chr(10) is a newline; it is used because a backslash cannot appear
+    # inside an f-string expression on older Python versions.
+    prompt = f"""\
+You are a credential tester for the {account_label}: {provider}/{alias}{detail}
+
+Your job is to help the user verify that this credential works by making \
+real API calls using the available tools.
+
+{routing_section}
+# Instructions
+
+1. Start by greeting the user and confirming which account you're testing.
+2. Suggest a simple, safe, read-only API call to verify the credential works \
+(e.g. list messages, list channels, list contacts, search for "test").
+3. Execute the call when the user agrees.
+4. Report the result clearly: success (with sample data) or failure (with error).
+5. Let the user request additional API calls to further test the credential.
+
+# Available tools
+
+You have access to {len(tools)} tools for {provider}:
+{chr(10).join(f"- {t}" for t in tools)}
+
+# Rules
+
+- Start with read-only operations (list, get) before write operations (create, update, delete).
+- Always confirm with the user before performing write operations.
+- If a call fails, report the exact error — this helps diagnose credential issues.
+- Be concise. No emojis.
+"""
+
+    return NodeSpec(
+        id="tester",
+        name="Credential Tester",
+        description=node_description,
+        node_type="event_loop",
+        client_facing=True,
+        max_node_visits=0,
+        input_keys=[],
+        output_keys=[],
+        tools=tools,
+        system_prompt=prompt,
+    )
@@ -0,0 +1,44 @@
+"""
+Hive Coder — Native coding agent that builds Hive agent packages.
+
+Deeply understands the agent framework and produces complete Python packages
+with goals, nodes, edges, system prompts, MCP configuration, and tests
+from natural language specifications.
+"""
+
+from .agent import (
+    HiveCoderAgent,
+    conversation_mode,
+    default_agent,
+    edges,
+    entry_node,
+    entry_points,
+    goal,
+    identity_prompt,
+    loop_config,
+    nodes,
+    pause_nodes,
+    terminal_nodes,
+)
+from .config import AgentMetadata, RuntimeConfig, default_config, metadata
+
+# Package version; the CLI's click.version_option uses the same string.
+__version__ = "1.0.0"
+
+# Public API: the agent class and its default instance, the module-level graph
+# variables re-exported from .agent, and the configuration objects from .config.
+__all__ = [
+    "HiveCoderAgent",
+    "default_agent",
+    "goal",
+    "nodes",
+    "edges",
+    "entry_node",
+    "entry_points",
+    "pause_nodes",
+    "terminal_nodes",
+    "conversation_mode",
+    "identity_prompt",
+    "loop_config",
+    "RuntimeConfig",
+    "AgentMetadata",
+    "default_config",
+    "metadata",
+]
@@ -0,0 +1,223 @@
+"""CLI entry point for Hive Coder agent."""
+
+import asyncio
+import json
+import logging
+import sys
+
+import click
+
+from .agent import HiveCoderAgent, default_agent
+
+
+def setup_logging(verbose=False, debug=False):
+    """Configure root logging on stderr and set the "framework" logger level.
+
+    Args:
+        verbose: Log at INFO with a bare-message format.
+        debug: Log at DEBUG with timestamps and logger names; wins over
+            ``verbose`` when both are set.
+
+    With neither flag set, only WARNING and above are shown.
+    """
+    log_level, log_format = (
+        (logging.DEBUG, "%(asctime)s %(name)s: %(message)s")
+        if debug
+        else (logging.INFO, "%(message)s")
+        if verbose
+        else (logging.WARNING, "%(levelname)s: %(message)s")
+    )
+    logging.basicConfig(stream=sys.stderr, format=log_format, level=log_level)
+    framework_logger = logging.getLogger("framework")
+    framework_logger.setLevel(log_level)
+
+
+@click.group()
+@click.version_option(version="1.0.0")
+def cli():
+    """Hive Coder — Build Hive agent packages from natural language."""
+    # Group container only: the subcommands below attach via @cli.command().
+    # The docstring is a complete body, so no explicit statement is needed.
+
+
+@cli.command()
+@click.option("--request", "-r", type=str, required=True, help="What agent to build")
+@click.option("--mock", is_flag=True, help="Run in mock mode")
+@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
+@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
+@click.option("--debug", is_flag=True, help="Show debug logging")
+def run(request, mock, quiet, verbose, debug):
+    """Execute agent building from a request."""
+    # --quiet skips logging setup entirely so only the result JSON is printed.
+    if not quiet:
+        setup_logging(debug=debug, verbose=verbose)
+
+    # One-shot execution: default_agent.run() starts and stops the runtime itself.
+    outcome = asyncio.run(
+        default_agent.run({"user_request": request}, mock_mode=mock),
+    )
+
+    report = {
+        "success": outcome.success,
+        "steps_executed": outcome.steps_executed,
+        "output": outcome.output,
+    }
+    # The "error" key is only present when there is something to report.
+    if outcome.error:
+        report["error"] = outcome.error
+
+    # default=str keeps non-JSON-serialisable output values from raising.
+    rendered = json.dumps(report, indent=2, default=str)
+    click.echo(rendered)
+
+    exit_code = 0 if outcome.success else 1
+    sys.exit(exit_code)
+
+
+@cli.command()
+@click.option("--mock", is_flag=True, help="Run in mock mode")
+@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
+@click.option("--debug", is_flag=True, help="Show debug logging")
+def tui(mock, verbose, debug):
+    """Launch the TUI dashboard for interactive agent building."""
+    # The docstring above doubles as the click help text, so implementation
+    # notes are kept in comments.
+    setup_logging(verbose=verbose, debug=debug)
+
+    # Imported lazily: a missing TUI dependency only breaks this subcommand.
+    try:
+        from framework.tui.app import AdenTUI
+    except ImportError:
+        click.echo("TUI requires the 'textual' package. Install with: pip install textual")
+        sys.exit(1)
+
+    from pathlib import Path
+
+    from framework.llm import LiteLLMProvider
+    from framework.runner.tool_registry import ToolRegistry
+    from framework.runtime.agent_runtime import create_agent_runtime
+    from framework.runtime.execution_stream import EntryPointSpec
+
+    async def run_with_tui():
+        # This builds the runtime by hand instead of calling agent._setup():
+        # no checkpoint_config or graph_id is passed, and the entry point is
+        # registered as "start" rather than "default".
+        agent = HiveCoderAgent()
+
+        agent._tool_registry = ToolRegistry()
+
+        storage_path = Path.home() / ".hive" / "agents" / "hive_coder"
+        storage_path.mkdir(parents=True, exist_ok=True)
+
+        # The MCP config sits next to this module; it is optional.
+        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
+        if mcp_config_path.exists():
+            agent._tool_registry.load_mcp_config(mcp_config_path)
+
+        # In mock mode no LLM provider is created and None is passed through.
+        llm = None
+        if not mock:
+            llm = LiteLLMProvider(
+                model=agent.config.model,
+                api_key=agent.config.api_key,
+                api_base=agent.config.api_base,
+            )
+
+        # Tools are read only after the MCP config has been loaded.
+        tools = list(agent._tool_registry.get_tools().values())
+        tool_executor = agent._tool_registry.get_executor()
+        graph = agent._build_graph()
+
+        runtime = create_agent_runtime(
+            graph=graph,
+            goal=agent.goal,
+            storage_path=storage_path,
+            entry_points=[
+                EntryPointSpec(
+                    id="start",
+                    name="Build Agent",
+                    entry_node="coder",
+                    trigger_type="manual",
+                    isolation_level="isolated",
+                ),
+            ],
+            llm=llm,
+            tools=tools,
+            tool_executor=tool_executor,
+        )
+
+        await runtime.start()
+
+        # Stop the runtime even if the TUI exits with an exception.
+        try:
+            app = AdenTUI(runtime)
+            await app.run_async()
+        finally:
+            await runtime.stop()
+
+    asyncio.run(run_with_tui())
+
+
+@cli.command()
+@click.option("--json", "output_json", is_flag=True)
+def info(output_json):
+    """Show agent information."""
+    details = default_agent.info()
+
+    # Machine-readable form: dump the whole info dict and stop.
+    if output_json:
+        click.echo(json.dumps(details, indent=2))
+        return
+
+    # Human-readable summary, one field per line.
+    click.echo(f"Agent: {details['name']}")
+    click.echo(f"Version: {details['version']}")
+    click.echo(f"Description: {details['description']}")
+    click.echo(f"\nNodes: {', '.join(details['nodes'])}")
+    click.echo(f"Client-facing: {', '.join(details['client_facing_nodes'])}")
+    click.echo(f"Entry: {details['entry_node']}")
+    # An empty terminal list means the agent never finishes on its own.
+    click.echo(f"Terminal: {', '.join(details['terminal_nodes']) or '(forever-alive)'}")
+
+
+@cli.command()
+def validate():
+    """Validate agent structure."""
+    report = default_agent.validate()
+
+    # Failure path first: list every error and exit non-zero.
+    if not report["valid"]:
+        click.echo("Agent has errors:")
+        for problem in report["errors"]:
+            click.echo(f"  ERROR: {problem}")
+        sys.exit(1)
+
+    click.echo("Agent is valid")
+    # Warnings do not affect the exit status; a falsy value prints nothing.
+    for note in report["warnings"] or ():
+        click.echo(f"  WARNING: {note}")
+    sys.exit(0)
+
+
+@cli.command()
+@click.option("--verbose", "-v", is_flag=True)
+def shell(verbose):
+    """Interactive agent building session (CLI, no TUI)."""
+    # Synchronous click entry point; the prompt loop itself is the coroutine
+    # _interactive_shell, run here on a fresh event loop.
+    asyncio.run(_interactive_shell(verbose))
+
+
+async def _interactive_shell(verbose=False):
+    """Run the prompt loop: read a request, build an agent, print the outcome.
+
+    The loop ends on "quit", "exit" or "q" (case-insensitive, surrounding
+    whitespace ignored), on Ctrl-C, or when stdin reaches end-of-file.
+    Any other exception is printed with a traceback and the loop continues.
+    The agent is always stopped on the way out.
+
+    Args:
+        verbose: Passed to ``setup_logging`` to enable INFO-level output.
+    """
+    setup_logging(verbose=verbose)
+
+    click.echo("=== Hive Coder ===")
+    click.echo("Describe the agent you want to build (or 'quit' to exit):\n")
+
+    agent = HiveCoderAgent()
+    await agent.start()
+
+    # We are inside a coroutine, so the running loop is the one to use.
+    loop = asyncio.get_running_loop()
+
+    try:
+        while True:
+            try:
+                # input() blocks, so run it on the default executor.
+                request = await loop.run_in_executor(None, input, "Build> ")
+                command = request.strip()
+                if command.lower() in ("quit", "exit", "q"):
+                    click.echo("Goodbye!")
+                    break
+
+                if not command:
+                    continue
+
+                click.echo("\nBuilding agent...\n")
+
+                result = await agent.trigger_and_wait("default", {"user_request": request})
+
+                if result is None:
+                    click.echo("\n[Execution timed out]\n")
+                    continue
+
+                if result.success:
+                    output = result.output or {}
+                    agent_name = output.get("agent_name", "unknown")
+                    validation = output.get("validation_result", "unknown")
+                    click.echo(f"\nAgent '{agent_name}' built. Validation: {validation}\n")
+                else:
+                    click.echo(f"\nBuild failed: {result.error}\n")
+
+            except (KeyboardInterrupt, EOFError):
+                # EOFError means stdin is closed (Ctrl-D or piped input ran out).
+                # Treat it as a quit: retrying input() would fail again at once
+                # and spin forever in the generic handler below.
+                click.echo("\nGoodbye!")
+                break
+            except Exception as e:
+                click.echo(f"Error: {e}", err=True)
+                import traceback
+
+                traceback.print_exc()
+    finally:
+        await agent.stop()
+
+
+if __name__ == "__main__":
+    cli()
@@ -0,0 +1,356 @@
+"""Agent graph construction for Hive Coder."""
+
+from pathlib import Path
+
+from framework.graph import Constraint, Goal, SuccessCriterion
+from framework.graph.checkpoint_config import CheckpointConfig
+from framework.graph.edge import GraphSpec
+from framework.graph.executor import ExecutionResult
+from framework.llm import LiteLLMProvider
+from framework.runner.tool_registry import ToolRegistry
+from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
+from framework.runtime.execution_stream import EntryPointSpec
+
+from .config import default_config, metadata
+from .nodes import coder_node, queen_node
+# ticket_receiver is no longer needed — the queen runs as an independent
+# GraphExecutor and receives escalation tickets via inject_event().
+# Keeping the import commented for reference:
+# from .ticket_receiver import TICKET_RECEIVER_ENTRY_POINT
+
+# Goal definition: what a finished Hive Coder run is measured against.
+# The four success-criterion weights below sum to 1.0, and all four
+# constraints are declared "hard".
+goal = Goal(
+    id="agent-builder",
+    name="Hive Agent Builder",
+    description=(
+        "Build complete, validated Hive agent packages from natural language "
+        "specifications. Produces production-ready Python packages with goals, "
+        "nodes, edges, system prompts, MCP configuration, and tests."
+    ),
+    success_criteria=[
+        SuccessCriterion(
+            id="valid-package",
+            description="Generated agent package passes structural validation",
+            metric="validation_pass",
+            target="true",
+            weight=0.30,
+        ),
+        SuccessCriterion(
+            id="complete-files",
+            description=(
+                "All required files generated: agent.py, config.py, "
+                "nodes/__init__.py, __init__.py, __main__.py, mcp_servers.json"
+            ),
+            metric="file_count",
+            # Six files are named in the description above.
+            target=">=6",
+            weight=0.25,
+        ),
+        SuccessCriterion(
+            id="user-satisfaction",
+            description="User reviews and approves the generated agent",
+            metric="user_approval",
+            target="true",
+            weight=0.25,
+        ),
+        SuccessCriterion(
+            id="framework-compliance",
+            description=(
+                "Generated code follows framework patterns: STEP 1/STEP 2 "
+                "for client-facing, correct imports, entry_points format"
+            ),
+            metric="pattern_compliance",
+            target="100%",
+            weight=0.20,
+        ),
+    ],
+    constraints=[
+        Constraint(
+            id="dynamic-tool-discovery",
+            description=(
+                "Always discover available tools dynamically via "
+                "discover_mcp_tools before referencing tools in agent designs"
+            ),
+            constraint_type="hard",
+            category="correctness",
+        ),
+        Constraint(
+            id="no-fabricated-tools",
+            description="Only reference tools that exist in hive-tools MCP",
+            constraint_type="hard",
+            category="correctness",
+        ),
+        Constraint(
+            id="valid-python",
+            description="All generated Python files must be syntactically correct",
+            constraint_type="hard",
+            category="correctness",
+        ),
+        Constraint(
+            id="self-verification",
+            description="Run validation after writing code; fix errors before presenting",
+            constraint_type="hard",
+            category="quality",
+        ),
+    ],
+)
+
+# Nodes: primary coder node only.  The queen runs as an independent
+# GraphExecutor with queen_node — not as part of this graph.
+nodes = [coder_node]
+
+# No edges needed — single forever-alive event_loop node
+edges = []
+
+# Graph configuration
+entry_node = "coder"
+# NOTE(review): this mapping uses the id "start", while HiveCoderAgent._setup
+# registers its EntryPointSpec under the id "default" — confirm both are intended.
+entry_points = {"start": "coder"}
+pause_nodes = []
+terminal_nodes = []  # Forever-alive: loops until user exits
+
+# No async entry points needed — the queen is now an independent executor,
+# not a secondary graph receiving events via add_graph().
+async_entry_points = []
+
+# Module-level variables read by AgentRunner.load()
+conversation_mode = "continuous"
+identity_prompt = (
+    "You are Hive Coder, the best agent-building coding agent on the planet. "
+    "You deeply understand the Hive agent framework at the source code level "
+    "and produce production-ready agent packages from natural language. "
+    "You can dynamically discover available framework tools, inspect runtime "
+    "sessions and checkpoints from agents you build, and run their test suites. "
+    "You follow coding agent discipline: read before writing, verify "
+    "assumptions by reading actual code, adhere to project conventions, "
+    "self-verify with validation, and fix your own errors. You are concise, "
+    "direct, and technically rigorous. No emojis. No fluff."
+)
+# Event-loop limits, passed unchanged to GraphSpec by HiveCoderAgent._build_graph.
+# NOTE(review): the exact meaning of each key is defined by the framework — confirm there.
+loop_config = {
+    "max_iterations": 100,
+    "max_tool_calls_per_turn": 20,
+    "max_history_tokens": 32000,
+}
+
+
+# ---------------------------------------------------------------------------
+# Queen graph — runs as an independent persistent conversation in the TUI.
+# Loaded by _load_judge_and_queen() in app.py, NOT by AgentRunner.
+# ---------------------------------------------------------------------------
+
+# Goal for the queen: it declares no success criteria and no constraints.
+queen_goal = Goal(
+    id="queen-manager",
+    name="Queen Manager",
+    description=(
+        "Manage the worker agent lifecycle and serve as the user's primary "
+        "interactive interface. Triage health escalations from the judge."
+    ),
+    success_criteria=[],
+    constraints=[],
+)
+
+# Single-node graph around queen_node, mirroring the coder graph's shape:
+# no edges, no terminal or pause nodes, continuous conversation mode.
+# Unlike HiveCoderAgent._build_graph, no default_model, max_tokens or
+# identity_prompt is passed here.
+queen_graph = GraphSpec(
+    id="queen-graph",
+    goal_id=queen_goal.id,
+    version="1.0.0",
+    entry_node="queen",
+    entry_points={"start": "queen"},
+    terminal_nodes=[],
+    pause_nodes=[],
+    nodes=[queen_node],
+    edges=[],
+    conversation_mode="continuous",
+    # Higher iteration cap and lower per-turn tool-call cap than the coder's loop_config.
+    loop_config={
+        "max_iterations": 200,
+        "max_tool_calls_per_turn": 10,
+        "max_history_tokens": 32000,
+    },
+)
+
+
+class HiveCoderAgent:
+    """
+    Hive Coder — builds Hive agent packages from natural language.
+
+    Single-node architecture: the coder runs in a continuous while(true) loop.
+    The queen runs as an independent GraphExecutor (loaded by the TUI via
+    _load_judge_and_queen), not as part of this graph.
+
+    Typical use is either ``await agent.run(context)`` for a single execution,
+    or ``start()`` / ``trigger_and_wait()`` / ``stop()`` for a longer session.
+    """
+
+    def __init__(self, config=None):
+        """Bind the module-level graph definition; the runtime is built lazily.
+
+        Args:
+            config: Runtime configuration; ``default_config`` is used when
+                this is omitted or falsy.
+        """
+        self.config = config or default_config
+        self.goal = goal
+        self.nodes = nodes
+        self.edges = edges
+        self.entry_node = entry_node
+        self.entry_points = entry_points
+        self.pause_nodes = pause_nodes
+        self.terminal_nodes = terminal_nodes
+        self.async_entry_points = async_entry_points
+        self._graph: GraphSpec | None = None
+        self._agent_runtime: AgentRuntime | None = None
+        self._tool_registry: ToolRegistry | None = None
+        self._storage_path: Path | None = None
+
+    def _build_graph(self) -> GraphSpec:
+        """Build the GraphSpec from this instance's graph attributes and config."""
+        return GraphSpec(
+            id="hive-coder-graph",
+            goal_id=self.goal.id,
+            version="1.0.0",
+            entry_node=self.entry_node,
+            entry_points=self.entry_points,
+            terminal_nodes=self.terminal_nodes,
+            pause_nodes=self.pause_nodes,
+            nodes=self.nodes,
+            edges=self.edges,
+            default_model=self.config.model,
+            max_tokens=self.config.max_tokens,
+            loop_config=loop_config,
+            conversation_mode=conversation_mode,
+            identity_prompt=identity_prompt,
+            async_entry_points=self.async_entry_points,
+        )
+
+    def _setup(self, mock_mode=False) -> None:
+        """Set up the agent runtime.
+
+        Creates the storage directory, loads the optional MCP config, builds
+        the graph and creates the runtime with checkpointing enabled.
+
+        Args:
+            mock_mode: When true, no LLM provider is created and ``None`` is
+                passed to the runtime instead.
+        """
+        self._storage_path = Path.home() / ".hive" / "agents" / "hive_coder"
+        self._storage_path.mkdir(parents=True, exist_ok=True)
+
+        self._tool_registry = ToolRegistry()
+
+        # The MCP config sits next to this module; it is optional.
+        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
+        if mcp_config_path.exists():
+            self._tool_registry.load_mcp_config(mcp_config_path)
+
+        llm = None
+        if not mock_mode:
+            llm = LiteLLMProvider(
+                model=self.config.model,
+                api_key=self.config.api_key,
+                api_base=self.config.api_base,
+            )
+
+        tool_executor = self._tool_registry.get_executor()
+        tools = list(self._tool_registry.get_tools().values())
+
+        self._graph = self._build_graph()
+
+        checkpoint_config = CheckpointConfig(
+            enabled=True,
+            checkpoint_on_node_start=False,
+            checkpoint_on_node_complete=True,
+            checkpoint_max_age_days=7,
+            async_checkpoint=True,
+        )
+
+        # Registered under the id "default": that is the id trigger_and_wait()
+        # and run() use.
+        entry_point_specs = [
+            EntryPointSpec(
+                id="default",
+                name="Default",
+                entry_node=self.entry_node,
+                trigger_type="manual",
+                isolation_level="shared",
+            ),
+        ]
+
+        self._agent_runtime = create_agent_runtime(
+            graph=self._graph,
+            goal=self.goal,
+            storage_path=self._storage_path,
+            entry_points=entry_point_specs,
+            llm=llm,
+            tools=tools,
+            tool_executor=tool_executor,
+            checkpoint_config=checkpoint_config,
+            graph_id="hive_coder",
+        )
+
+    async def start(self, mock_mode=False) -> None:
+        """Set up (on first use) and start the agent runtime.
+
+        Args:
+            mock_mode: Only used when the runtime has not been built yet.
+        """
+        if self._agent_runtime is None:
+            self._setup(mock_mode=mock_mode)
+        if not self._agent_runtime.is_running:
+            await self._agent_runtime.start()
+
+    async def stop(self) -> None:
+        """Stop the agent runtime and clean up.
+
+        The runtime reference is always cleared, so the next ``start()``
+        builds a fresh runtime.
+        """
+        if self._agent_runtime and self._agent_runtime.is_running:
+            await self._agent_runtime.stop()
+        self._agent_runtime = None
+
+    async def trigger_and_wait(
+        self,
+        entry_point: str = "default",
+        input_data: dict | None = None,
+        timeout: float | None = None,
+        session_state: dict | None = None,
+    ) -> ExecutionResult | None:
+        """Execute the graph and wait for completion.
+
+        Args:
+            entry_point: Id of the entry point to trigger.
+            input_data: Input for the execution; ``None`` becomes ``{}``.
+            timeout: Optional wait limit, forwarded to the runtime when given.
+            session_state: Optional session state forwarded to the runtime.
+
+        Returns:
+            Whatever the runtime returns. Callers in this package treat
+            ``None`` as a timeout.
+
+        Raises:
+            RuntimeError: If the agent has not been started.
+        """
+        if self._agent_runtime is None:
+            raise RuntimeError("Agent not started. Call start() first.")
+
+        # Previously ``timeout`` was accepted but silently dropped. It is now
+        # forwarded, and only when supplied, so the default call is unchanged.
+        # NOTE(review): assumes AgentRuntime.trigger_and_wait accepts a
+        # ``timeout`` keyword — confirm against the runtime.
+        optional_args = {} if timeout is None else {"timeout": timeout}
+
+        return await self._agent_runtime.trigger_and_wait(
+            entry_point_id=entry_point,
+            input_data=input_data or {},
+            session_state=session_state,
+            **optional_args,
+        )
+
+    async def run(self, context: dict, mock_mode=False, session_state=None) -> ExecutionResult:
+        """Run the agent (convenience method for single execution).
+
+        Args:
+            context: Input data for the "default" entry point.
+            mock_mode: Passed to ``start()``.
+            session_state: Passed to ``trigger_and_wait()``.
+
+        Returns:
+            The execution result, or a failed ``ExecutionResult`` with the
+            error "Execution timeout" when the runtime returns a falsy result.
+        """
+        try:
+            # start() is inside the try so a failure part-way through start-up
+            # still goes through stop() and leaves the object reusable.
+            await self.start(mock_mode=mock_mode)
+            result = await self.trigger_and_wait("default", context, session_state=session_state)
+            return result or ExecutionResult(success=False, error="Execution timeout")
+        finally:
+            await self.stop()
+
+    def info(self):
+        """Get agent information.
+
+        Returns:
+            A dict with the package metadata, the goal's name and description,
+            node and edge ids, and the entry/pause/terminal configuration.
+        """
+        return {
+            "name": metadata.name,
+            "version": metadata.version,
+            "description": metadata.description,
+            "goal": {
+                "name": self.goal.name,
+                "description": self.goal.description,
+            },
+            "nodes": [n.id for n in self.nodes],
+            "edges": [e.id for e in self.edges],
+            "entry_node": self.entry_node,
+            "entry_points": self.entry_points,
+            "pause_nodes": self.pause_nodes,
+            "terminal_nodes": self.terminal_nodes,
+            "client_facing_nodes": [n.id for n in self.nodes if n.client_facing],
+        }
+
+    def validate(self):
+        """Validate agent structure.
+
+        Checks that every edge endpoint, the entry node, every terminal node
+        and every entry-point target names a node in ``self.nodes``.
+
+        Returns:
+            A dict with ``valid`` (bool), ``errors`` (list of str) and
+            ``warnings`` (list of str; currently always empty).
+        """
+        errors = []
+        warnings = []
+
+        node_ids = {node.id for node in self.nodes}
+        for edge in self.edges:
+            if edge.source not in node_ids:
+                errors.append(f"Edge {edge.id}: source '{edge.source}' not found")
+            if edge.target not in node_ids:
+                errors.append(f"Edge {edge.id}: target '{edge.target}' not found")
+
+        if self.entry_node not in node_ids:
+            errors.append(f"Entry node '{self.entry_node}' not found")
+
+        for terminal in self.terminal_nodes:
+            if terminal not in node_ids:
+                errors.append(f"Terminal node '{terminal}' not found")
+
+        for ep_id, node_id in self.entry_points.items():
+            if node_id not in node_ids:
+                errors.append(f"Entry point '{ep_id}' references unknown node '{node_id}'")
+
+        return {
+            "valid": len(errors) == 0,
+            "errors": errors,
+            "warnings": warnings,
+        }
+
+
+# Default instance created at import time and used by the CLI commands.
+# The constructor only stores references; the runtime is built on start().
+default_agent = HiveCoderAgent()
@@ -1,4 +1,4 @@
-"""Runtime configuration."""
+"""Runtime configuration for Hive Coder agent."""

 import json
 from dataclasses import dataclass, field
@@ -34,11 +34,17 @@ default_config = RuntimeConfig()

@dataclass
 class AgentMetadata:
-    name: str = "Twitter Outreach Agent"
+    name: str = "Hive Coder"
    version: str = "1.0.0"
    description: str = (
-        "Reads a target's Twitter/X profile, crafts a personalized outreach email "
-        "referencing their specific activity, and sends it after user approval."
+        "Native coding agent that builds production-ready Hive agent packages "
+        "from natural language specifications. Deeply understands the agent framework "
+        "and produces complete Python packages with goals, nodes, edges, system prompts, "
+        "MCP configuration, and tests."
+    )
+    intro_message: str = (
+        "I'm Hive Coder — I build Hive agents. Describe what kind of agent "
+        "you want to create and I'll design, implement, and validate it for you."
    )


@@ -0,0 +1,9 @@
+{
+  "coder-tools": {
+    "transport": "stdio",
+    "command": "uv",
+    "args": ["run", "python", "coder_tools_server.py", "--stdio"],
+    "cwd": "../../../../tools",
+    "description": "Unsandboxed file system tools for code generation and validation"
+  }
+}
@@ -0,0 +1,679 @@
+"""Node definitions for Hive Coder agent."""
+
+from framework.graph import NodeSpec
+
+# Single node — like opencode's while(true) loop.
+# One continuous context handles the entire workflow:
+# discover → design → implement → verify → present → iterate.
+# Almost all of this definition is the system prompt; the keyword arguments
+# around it only declare the node's keys, visit limit and tool names.
+coder_node = NodeSpec(
+    id="coder",
+    name="Hive Coder",
+    description=(
+        "Autonomous coding agent that builds Hive agent packages. "
+        "Handles the full lifecycle: understanding user intent, "
+        "designing architecture, writing code, validating, and "
+        "iterating on feedback — all in one continuous conversation."
+    ),
+    node_type="event_loop",
+    client_facing=True,
+    # 0 = unbounded visits (the prompt's "Node rules" describe 0 as unbounded).
+    max_node_visits=0,
+    input_keys=["user_request"],
+    # Both keys are written with set_output in step "6. Present" of the prompt.
+    output_keys=["agent_name", "validation_result"],
+    success_criteria=(
+        "A complete, validated Hive agent package exists at "
+        "exports/{agent_name}/ and passes structural validation."
+    ),
+    system_prompt="""\
+You are Hive Coder, the best agent-building coding agent. You build \
+production-ready Hive agent packages from natural language.
+
+# Core Mandates
+
+- **Read before writing.** NEVER write code from assumptions. Read \
+reference agents and templates first. Read every file before editing.
+- **Conventions first.** Follow existing project patterns exactly. \
+Analyze imports, structure, and style in reference agents.
+- **Verify assumptions.** Never assume a class, import, or pattern \
+exists. Read actual source to confirm. Search if unsure.
+- **Discover tools dynamically.** NEVER reference tools from static \
+docs. Always run discover_mcp_tools() to see what actually exists.
+- **Professional objectivity.** If a use case is a poor fit for the \
+framework, say so. Technical accuracy over validation.
+- **Concise.** No emojis. No preambles. No postambles. Substance only.
+- **Self-verify.** After writing code, run validation and tests. Fix \
+errors yourself. Don't declare success until validation passes.
+
+# Tools
+
+## File I/O
+- read_file(path, offset?, limit?) — read with line numbers
+- write_file(path, content) — create/overwrite, auto-mkdir
+- edit_file(path, old_text, new_text, replace_all?) — fuzzy-match edit
+- list_directory(path, recursive?) — list contents
+- search_files(pattern, path?, include?) — regex search
+- run_command(command, cwd?, timeout?) — shell execution
+- undo_changes(path?) — restore from git snapshot
+
+## Meta-Agent
+- discover_mcp_tools(server_config_path?) — connect to MCP servers \
+and list all available tools with full schemas. Default: hive-tools.
+- list_agents() — list all agent packages in exports/ with session counts
+- list_agent_sessions(agent_name, status?, limit?) — list sessions
+- get_agent_session_state(agent_name, session_id) — full session state
+- get_agent_session_memory(agent_name, session_id, key?) — memory data
+- list_agent_checkpoints(agent_name, session_id) — list checkpoints
+- get_agent_checkpoint(agent_name, session_id, checkpoint_id?) — load checkpoint
+- run_agent_tests(agent_name, test_types?, fail_fast?) — run pytest with parsing
+
+# Meta-Agent Capabilities
+
+You are not just a file writer. You have deep integration with the \
+Hive framework:
+
+## Tool Discovery (MANDATORY before designing)
+Before designing any agent, run discover_mcp_tools() to see what \
+tools are actually available from the hive-tools MCP server. This \
+returns full schemas with parameter names, types, and descriptions. \
+NEVER guess tool names or parameters from memory. The tool catalog \
+is the ground truth.
+
+To check a specific agent's tools:
+  discover_mcp_tools("exports/{agent_name}/mcp_servers.json")
+
+## Agent Awareness
+Run list_agents() to see what agents already exist. Read their code \
+for patterns:
+  read_file("exports/{name}/agent.py")
+  read_file("exports/{name}/nodes/__init__.py")
+
+## Post-Build Testing
+After writing agent code, validate structurally AND run tests:
+  run_command("python -c 'from {name} import default_agent; \\
+    print(default_agent.validate())'")
+  run_agent_tests("{name}")
+
+## Debugging Built Agents
+When a user says "my agent is failing" or "debug this agent":
+1. list_agent_sessions("{agent_name}") — find the session
+2. get_agent_session_state("{agent_name}", "{session_id}") — see status
+3. get_agent_session_memory("{agent_name}", "{session_id}") — inspect data
+4. list_agent_checkpoints / get_agent_checkpoint — trace execution
+
+# Workflow
+
+You operate in a continuous loop. The user describes what they want, \
+you build it. No rigid phases — use judgment. But the general flow is:
+
+## 1. Understand
+
+When the user describes what they want to build, hear the structure:
+- The actors, the trigger, the core loop, the output, the pain.
+
+Play back a model: "Here's what I'm picturing: [concrete picture]. \
+Before I start — [1-2 questions you can't infer]."
+
+Ask only what you CANNOT infer. Fill blanks with domain knowledge.
+
+## 2. Qualify
+
+Assess framework fit honestly. Run discover_mcp_tools() to check \
+what tools exist. Read the framework guide:
+  read_file("core/framework/agents/hive_coder/reference/framework_guide.md")
+
+Consider:
+- What works well (multi-turn, HITL, tool orchestration)
+- Limitations (LLM latency, context limits, cost)
+- Deal-breakers (missing tools, wrong paradigm)
+
+Give a clear recommendation: proceed, adjust scope, or reconsider.
+
+## 3. Design
+
+Design the agent architecture:
+- Goal: id, name, description, 3-5 success criteria, 2-4 constraints
+- Nodes: **2-4 nodes MAXIMUM** (see rules below)
+- Edges: on_success for linear, conditional for routing
+- Lifecycle: ALWAYS forever-alive (`terminal_nodes=[]`) unless the user \
+explicitly requests a one-shot/batch agent. Forever-alive agents loop \
+continuously — the user exits by closing the TUI. This is the standard \
+pattern for all interactive agents.
+
+### Node Count Rules (HARD LIMITS)
+
+**2-4 nodes** for all agents. Never exceed 4 unless the user explicitly \
+requests more. Each node boundary serializes outputs to shared memory \
+and DESTROYS all in-context information (tool results, reasoning, history).
+
+**MERGE nodes when:**
+- Node has NO tools (pure LLM reasoning) → merge into predecessor/successor
+- Node sets only 1 trivial output → collapse into predecessor
+- Multiple consecutive autonomous nodes → combine into one rich node
+- A "report" or "summary" node → merge into the client-facing node
+- A "confirm" or "schedule" node that calls no external service → remove
+
+**SEPARATE nodes only when:**
+- Client-facing vs autonomous (different interaction models)
+- Fundamentally different tool sets
+- Fan-out parallelism (parallel branches MUST be separate)
+
+**Typical patterns:**
+- 2 nodes: `interact (client-facing) → process (autonomous) → interact`
+- 3 nodes: `intake (CF) → process (auto) → review (CF) → intake`
+- WRONG: 7 nodes where half have no tools and just do LLM reasoning
+
+Read reference agents before designing:
+  list_agents()
+  read_file("exports/deep_research_agent/agent.py")
+  read_file("exports/deep_research_agent/nodes/__init__.py")
+
+Present the design with ASCII art graph. Get user approval.
+
+## 4. Implement
+
+Read templates before writing code:
+  read_file("core/framework/agents/hive_coder/reference/file_templates.md")
+  read_file("core/framework/agents/hive_coder/reference/anti_patterns.md")
+
+Write files in order:
+1. mkdir -p exports/{name}/nodes exports/{name}/tests
+2. config.py — RuntimeConfig + AgentMetadata
+3. nodes/__init__.py — NodeSpec definitions with system prompts
+4. agent.py — Goal, edges, graph, agent class
+5. __init__.py — package exports
+6. __main__.py — CLI with click
+7. mcp_servers.json — tool server config
+8. tests/ — fixtures
+
+### Critical Rules
+
+**Imports** (must match exactly — only import what you use):
+```python
+from framework.graph import (
+    NodeSpec, EdgeSpec, EdgeCondition,
+    Goal, SuccessCriterion, Constraint,
+)
+from framework.graph.edge import GraphSpec
+from framework.graph.executor import ExecutionResult
+from framework.graph.checkpoint_config import CheckpointConfig
+from framework.llm import LiteLLMProvider
+from framework.runner.tool_registry import ToolRegistry
+from framework.runtime.agent_runtime import (
+    AgentRuntime, create_agent_runtime,
+)
+from framework.runtime.execution_stream import EntryPointSpec
+```
+For agents with async entry points (timers, webhooks, events), also add:
+```python
+from framework.graph.edge import GraphSpec, AsyncEntryPointSpec
+from framework.runtime.agent_runtime import (
+    AgentRuntime, AgentRuntimeConfig, create_agent_runtime,
+)
+```
+NEVER `from core.framework...` — PYTHONPATH includes core/.
+
+**__init__.py MUST re-export ALL module-level variables** \
+(THIS IS THE #1 SOURCE OF AGENT LOAD FAILURES):
+The runner imports the package (__init__.py), NOT agent.py. It reads \
+goal, nodes, edges, entry_node, entry_points, pause_nodes, \
+terminal_nodes, conversation_mode, identity_prompt, loop_config via \
+getattr(). If ANY are missing from __init__.py, they silently default \
+to None or {} — causing "must define goal, nodes, edges" or "node X \
+is unreachable" errors. The __init__.py MUST import and re-export \
+ALL of these from .agent:
+```python
+from .agent import (
+    MyAgent, default_agent, goal, nodes, edges,
+    entry_node, entry_points, pause_nodes, terminal_nodes,
+    conversation_mode, identity_prompt, loop_config,
+)
+```
+
+**entry_points**: `{"start": "first-node-id"}`
+For agents with multiple entry points (e.g. a reminder trigger), \
+add them: `{"start": "intake", "reminder": "reminder"}`
+
+**conversation_mode** — ONLY two valid values:
+- `"continuous"` — recommended for interactive agents (context carries \
+across node transitions)
+- Omit entirely — for isolated per-node conversations
+NEVER use: "client_facing", "interactive", "adaptive", or any other \
+value. These DO NOT EXIST.
+
+**loop_config** — ONLY three valid keys:
+```python
+loop_config = {
+    "max_iterations": 100,
+    "max_tool_calls_per_turn": 20,
+    "max_history_tokens": 32000,
+}
+```
+NEVER add: "strategy", "mode", "timeout", or other keys.
+
+**mcp_servers.json**:
+```json
+{
+  "hive-tools": {
+    "transport": "stdio",
+    "command": "uv",
+    "args": ["run", "python", "mcp_server.py", "--stdio"],
+    "cwd": "../../tools"
+  }
+}
+```
+NO "mcpServers" wrapper. cwd "../../tools". command "uv".
+
+**Storage**: `Path.home() / ".hive" / "agents" / "{name}"`
+
+**Client-facing system prompts** — STEP 1/STEP 2 pattern:
+```
+STEP 1 — Present to user (text only, NO tool calls):
+[instructions]
+
+STEP 2 — After user responds, call set_output:
+[set_output calls]
+```
+
+**Autonomous system prompts** — set_output in SEPARATE turn.
+
+**Tools** — NEVER fabricate tool names. Common hallucinations: \
+csv_read, csv_write, csv_append, file_upload, database_query. \
+If discover_mcp_tools() shows these don't exist, use alternatives \
+(e.g. save_data/load_data for data persistence).
+
+**Node rules**:
+- **2-4 nodes MAX.** Never exceed 4. Merge thin nodes aggressively.
+- A node with 0 tools is NOT a real node — merge it.
+- node_type always "event_loop"
+- max_node_visits default is 0 (unbounded) — correct for forever-alive. \
+Only set >0 in one-shot agents with bounded feedback loops.
+- Feedback inputs: nullable_output_keys
+- terminal_nodes=[] for forever-alive (the default)
+- Every node MUST have at least one outgoing edge (no dead ends)
+- Agents are forever-alive unless user explicitly asks for one-shot
+
+**Agent class**: CamelCase name, default_agent at module level. \
+Constructor takes `config=None`. Follow the exact pattern in \
+file_templates.md — do NOT invent constructor params like \
+`llm_provider` or `tool_registry`.
+
+**Module-level variables** (read by AgentRunner.load()):
+goal, nodes, edges, entry_node, entry_points, pause_nodes,
+terminal_nodes, conversation_mode, identity_prompt, loop_config
+
+For agents with async triggers, also export:
+async_entry_points, runtime_config
+
+**Async entry points** (timers, webhooks, events):
+When an agent needs scheduled tasks, webhook reactions, or event-driven \
+triggers, use `AsyncEntryPointSpec` (from framework.graph.edge) and \
+`AgentRuntimeConfig` (from framework.runtime.agent_runtime):
+- Timer (cron): `trigger_type="timer"`, \
+`trigger_config={"cron": "0 9 * * *"}` — standard 5-field cron expression \
+(e.g. `"0 9 * * MON-FRI"` weekdays 9am, `"*/30 * * * *"` every 30 min)
+- Timer (interval): `trigger_type="timer"`, \
+`trigger_config={"interval_minutes": 20, "run_immediately": False}`
+- Event (for webhooks): `trigger_type="event"`, \
+`trigger_config={"event_types": ["webhook_received"]}`
+- `isolation_level="shared"` so async runs can read primary session memory
+- `runtime_config = AgentRuntimeConfig(webhook_routes=[...])` for HTTP webhooks
+- Reference: `exports/gmail_inbox_guardian/agent.py`
+- Full docs: `core/framework/agents/hive_coder/reference/framework_guide.md` \
+(Async Entry Points section)
+
+## 5. Verify
+
+Run THREE validation steps after writing. All must pass:
+
+**Step A — Class validation** (checks graph structure):
+```
+run_command("python -c 'from {name} import default_agent; \\
+  print(default_agent.validate())'")
+```
+
+**Step B — Runner load test** (checks package export contract — \
+THIS IS THE SAME PATH THE TUI USES):
+```
+run_command("python -c 'from framework.runner.runner import \\
+  AgentRunner; r = AgentRunner.load(\"exports/{name}\"); \\
+  print(\"AgentRunner.load: OK\")'")
+```
+This catches missing __init__.py exports, bad conversation_mode, \
+invalid loop_config, and unreachable nodes. If Step A passes but \
+Step B fails, the problem is in __init__.py exports.
+
+**Step C — Run tests:**
+```
+run_agent_tests("{name}")
+```
+
+If anything fails: read error, fix with edit_file, re-validate. Up to 3x.
+
+**CRITICAL: Testing forever-alive agents**
+Most agents use `terminal_nodes=[]` (forever-alive). This means \
+`runner.run()` NEVER returns — it hangs forever waiting for a \
+terminal node that doesn't exist. Agent tests MUST be structural:
+- Validate graph, node specs, edges, tools, prompts
+- Check goal/constraints/success criteria definitions
+- Test `AgentRunner.load()` + `_setup()` (skip if no API key)
+- NEVER call `runner.run()` or `trigger_and_wait()` in tests for \
+forever-alive agents — they will hang and time out.
+When you restructure an agent (change nodes/edges), always update \
+the tests to match. Stale tests referencing old node names will fail.
+
+## 6. Present
+
+Show the user what you built: agent name, goal summary, graph ASCII \
+art, files created, validation status. Offer to revise or build another.
+
+After user confirms satisfaction:
+  set_output("agent_name", "the_agent_name")
+  set_output("validation_result", "valid")
+
+If building another agent, just start the loop again — no need to \
+set_output until the user is done.
+
+## 7. Live Test (optional)
+
+After the user approves, offer to load and run the agent in-session. \
+This runs it alongside you.
+
+```
+load_agent("exports/{name}")   # registers as secondary graph
+start_agent("{name}")           # triggers default entry point
+```
+
+You can also:
+- `list_agents()` — see all loaded graphs and status
+- `restart_agent("{name}")` then `load_agent` — pick up code changes
+- `unload_agent("{name}")` — remove it from the session
+- `get_user_presence()` — check if user is around
+
+The agent runs in a shared session: it can read memory you've set and \
+its outputs are visible to you.
+""",
+    # Tool names granted to this node, grouped as in the prompt: File I/O,
+    # Meta-Agent, then the graph lifecycle tools used in step 7 (Live Test).
+    tools=[
+        "read_file",
+        "write_file",
+        "edit_file",
+        "list_directory",
+        "search_files",
+        "run_command",
+        "undo_changes",
+        # Meta-agent tools
+        "discover_mcp_tools",
+        "list_agents",
+        "list_agent_sessions",
+        "get_agent_session_state",
+        "get_agent_session_memory",
+        "list_agent_checkpoints",
+        "get_agent_checkpoint",
+        "run_agent_tests",
+        # Graph lifecycle tools (multi-graph sessions)
+        "load_agent",
+        "unload_agent",
+        "start_agent",
+        "restart_agent",
+        "get_user_presence",
+    ],
+)
+
+
+ticket_triage_node = NodeSpec(
+    id="ticket_triage",
+    name="Ticket Triage",
+    description=(
+        "Queen's triage node. Receives an EscalationTicket from the Health Judge "
+        "via event-driven entry point and decides: dismiss or notify the operator."
+    ),
+    node_type="event_loop",
+    client_facing=True,    # Operator can chat with queen once connected (Ctrl+Q)
+    max_node_visits=0,
+    input_keys=["ticket"],
+    output_keys=["intervention_decision"],
+    nullable_output_keys=["intervention_decision"],
+    success_criteria=(
+        "A clear intervention decision: either dismissed with documented reasoning, "
+        "or operator notified via notify_operator with specific analysis."
+    ),
+    tools=["notify_operator"],
+    system_prompt="""\
+You are the Queen (Hive Coder). The Worker Health Judge has escalated a worker \
+issue to you. The ticket is in your memory under key "ticket". Read it carefully.
+
+## Dismiss criteria — do NOT call notify_operator:
+- severity is "low" AND steps_since_last_accept < 8
+- Cause is clearly a transient issue (single API timeout, brief stall that \
+  self-resolved based on the evidence)
+- Evidence shows the agent is making real progress despite bad verdicts
+
+## Intervene criteria — call notify_operator:
+- severity is "high" or "critical"
+- steps_since_last_accept >= 10 with no sign of recovery
+- stall_minutes > 4 (worker definitively stuck)
+- Evidence shows a doom loop (same error, same tool, no progress)
+- Cause suggests a logic bug, missing configuration, or unrecoverable state
+
+## When intervening:
+Call notify_operator with:
+  ticket_id: <ticket["ticket_id"]>
+  analysis: "<2-3 sentences: what is wrong, why it matters, suggested action>"
+  urgency: "<low|medium|high|critical>"
+
+## After deciding:
+set_output("intervention_decision", "dismissed: <reason>" or "escalated: <summary>")
+
+Be conservative but not passive. You are the last quality gate before the human \
+is disturbed. One unnecessary alert is less costly than alert fatigue — but \
+genuine stuck agents must be caught.
+""",
+)
+
+ALL_QUEEN_TRIAGE_TOOLS = ["notify_operator"]
+
+
+queen_node = NodeSpec(
+    id="queen",
+    name="Queen",
+    description=(
+        "User's primary interactive interface with full coding capability. "
+        "Can build agents directly or delegate to the worker. Manages the "
+        "worker agent lifecycle and triages health escalations from the judge."
+    ),
+    node_type="event_loop",
+    client_facing=True,
+    max_node_visits=0,
+    input_keys=["greeting"],
+    output_keys=[],
+    nullable_output_keys=[],
+    success_criteria=(
+        "User's intent is understood, coding tasks are completed correctly, "
+        "and the worker is managed effectively when delegated to."
+    ),
+    tools=[
+        # File I/O (from coder-tools MCP)
+        "read_file",
+        "write_file",
+        "edit_file",
+        "list_directory",
+        "search_files",
+        "run_command",
+        "undo_changes",
+        # Meta-agent (from coder-tools MCP)
+        "discover_mcp_tools",
+        "list_agents",
+        "list_agent_sessions",
+        "get_agent_session_state",
+        "get_agent_session_memory",
+        "list_agent_checkpoints",
+        "get_agent_checkpoint",
+        "run_agent_tests",
+        # Worker lifecycle
+        "start_worker",
+        "stop_worker",
+        "get_worker_status",
+        "inject_worker_message",
+        # Monitoring
+        "get_worker_health_summary",
+        "notify_operator",
+    ],
+    system_prompt="""\
+You are the Queen — the user's primary interface. You are a coding agent \
+with the same capabilities as the Hive Coder worker, PLUS the ability to \
+manage the worker's lifecycle.
+
+# Core Mandates
+
+- **Read before writing.** NEVER write code from assumptions. Read \
+reference agents and templates first. Read every file before editing.
+- **Conventions first.** Follow existing project patterns exactly. \
+Analyze imports, structure, and style in reference agents.
+- **Verify assumptions.** Never assume a class, import, or pattern \
+exists. Read actual source to confirm. Search if unsure.
+- **Discover tools dynamically.** NEVER reference tools from static \
+docs. Always run discover_mcp_tools() to see what actually exists.
+- **Self-verify.** After writing code, run validation and tests. Fix \
+errors yourself. Don't declare success until validation passes.
+- **Concise.** No emojis. No preambles. No postambles. Substance only.
+
+# Tools
+
+## File I/O
+- read_file(path, offset?, limit?) — read with line numbers
+- write_file(path, content) — create/overwrite, auto-mkdir
+- edit_file(path, old_text, new_text, replace_all?) — fuzzy-match edit
+- list_directory(path, recursive?) — list contents
+- search_files(pattern, path?, include?) — regex search
+- run_command(command, cwd?, timeout?) — shell execution
+- undo_changes(path?) — restore from git snapshot
+
+## Meta-Agent
+- discover_mcp_tools(server_config_path?) — connect to MCP servers \
+and list all available tools with full schemas. Default: hive-tools.
+- list_agents() — list all agent packages in exports/ with session counts
+- list_agent_sessions(agent_name, status?, limit?) — list sessions
+- get_agent_session_state(agent_name, session_id) — full session state
+- get_agent_session_memory(agent_name, session_id, key?) — memory data
+- list_agent_checkpoints(agent_name, session_id) — list checkpoints
+- get_agent_checkpoint(agent_name, session_id, checkpoint_id?) — checkpoint
+- run_agent_tests(agent_name, test_types?, fail_fast?) — run pytest
+
+## Worker Lifecycle
+- start_worker(task) — Start the worker with a task description. The \
+worker runs autonomously until it finishes or asks the user a question.
+- stop_worker() — Cancel the worker's current execution.
+- get_worker_status() — Check if the worker is idle, running, or waiting \
+for user input. Returns execution details.
+- inject_worker_message(content) — Send a message to the running worker. \
+Use this to relay user instructions or concerns.
+
+## Monitoring
+- get_worker_health_summary() — Read the latest health data from the judge.
+- notify_operator(ticket_id, analysis, urgency) — Alert the user about a \
+critical issue. Use sparingly.
+
+# Behavior
+
+## Direct coding
+You can do any coding task directly — reading files, writing code, running \
+commands, building agents, debugging. You have the same tools as the worker. \
+For quick tasks (reading code, small edits, debugging), do them yourself.
+
+## Worker delegation
+For large, autonomous tasks (building a full agent, running a long pipeline), \
+delegate to the worker via start_worker(task). The worker runs in the \
+background while you remain available to the user.
+
+## When idle (worker not running):
+- Greet the user. Ask what they want to build or do.
+- For quick tasks, do them directly.
+- For large tasks, call start_worker(task) with a clear task description. \
+Summarize what you told the worker.
+
+## When worker is running:
+- If the user asks about progress, call get_worker_status().
+- If the user has a concern or instruction for the worker, call \
+inject_worker_message(content) to relay it.
+- You can still do coding tasks directly while the worker runs.
+- If an escalation ticket arrives from the judge, assess severity:
+  - Low/transient: acknowledge silently, do not disturb the user.
+  - High/critical: notify the user with a brief analysis and suggested action.
+
+## When worker asks user a question:
+- The system will route the user's response directly to the worker. \
+You do not need to relay it. The user will come back to you after responding.
+
+# Agent Building Workflow
+
+When building Hive agent packages, follow this workflow:
+
+## 1. Understand & Qualify
+Hear what the user wants. Run discover_mcp_tools() to check tool availability. \
+Read the framework guide:
+  read_file("core/framework/agents/hive_coder/reference/framework_guide.md")
+
+## 2. Design
+Design the agent: Goal, 2-4 nodes MAX, edges. Read reference agents:
+  list_agents()
+  read_file("exports/deep_research_agent/nodes/__init__.py")
+
+Present design with ASCII art. Get user approval.
+
+## 3. Implement
+Read templates before writing:
+  read_file("core/framework/agents/hive_coder/reference/file_templates.md")
+
+Write files: config.py, nodes/__init__.py, agent.py, __init__.py, \
+__main__.py, mcp_servers.json, tests/.
+
+## 4. Verify
+Run THREE validation steps:
+  run_command("python -c 'from {name} import default_agent; print(default_agent.validate())'")
+  run_command("python -c 'from framework.runner.runner import AgentRunner; \\
+    r = AgentRunner.load(\"exports/{name}\"); print(\"OK\")'")
+  run_agent_tests("{name}")
+
+# Style
+
+- Concise. No fluff. Direct.
+- No emojis.
+- When starting the worker, describe what you told it in one sentence.
+- When relaying status, be specific.
+- When an escalation arrives, lead with severity and recommended action.
+""",
+)
+
+ALL_QUEEN_TOOLS = [
+    # File I/O (from coder-tools MCP)
+    "read_file",
+    "write_file",
+    "edit_file",
+    "list_directory",
+    "search_files",
+    "run_command",
+    "undo_changes",
+    # Meta-agent (from coder-tools MCP)
+    "discover_mcp_tools",
+    "list_agents",
+    "list_agent_sessions",
+    "get_agent_session_state",
+    "get_agent_session_memory",
+    "list_agent_checkpoints",
+    "get_agent_checkpoint",
+    "run_agent_tests",
+    # Worker lifecycle
+    "start_worker",
+    "stop_worker",
+    "get_worker_status",
+    "inject_worker_message",
+    # Monitoring
+    "get_worker_health_summary",
+    "notify_operator",
+]
+
+# Names exported by `from <this module> import *`: the three node specs and
+# the two tool-name lists.
+__all__ = [
+    "coder_node",
+    "ticket_triage_node",
+    "queen_node",
+    "ALL_QUEEN_TRIAGE_TOOLS",
+    "ALL_QUEEN_TOOLS",
+]
@@ -0,0 +1,107 @@
+# Common Mistakes When Building Hive Agents
+
+## Critical Errors
+
+1. **Using tools that don't exist** — Always verify tools are available in the hive-tools MCP server before assigning them to nodes. Never guess tool names.
+
+2. **Wrong entry_points format** — MUST be `{"start": "first-node-id"}`. NOT a set, NOT `{node_id: [keys]}`.
+
+3. **Wrong mcp_servers.json format** — Flat dict (no `"mcpServers"` wrapper). `cwd` must be `"../../tools"`. `command` must be `"uv"` with args `["run", "python", ...]`.
+
+4. **Missing STEP 1/STEP 2 in client-facing prompts** — Without explicit phases, the LLM calls set_output before the user responds. Always use the pattern.
+
+5. **Forgetting nullable_output_keys** — When a node receives inputs from multiple edges and some inputs only arrive on certain edges (e.g., feedback), mark those as nullable. Without this, the executor blocks waiting for a value that will never arrive.
+
+6. **Creating dead-end nodes in forever-alive graphs** — Every node must have at least one outgoing edge. A node with no outgoing edges ends the execution, breaking the loop.
+
+7. **Setting max_node_visits to a non-zero value in forever-alive agents** — The framework default is `max_node_visits=0` (unbounded). Setting it to any positive value (e.g., 1) means the node stops executing after that many visits, silently breaking the forever-alive loop. Only set `max_node_visits > 0` in one-shot agents with feedback loops that need bounded retries.
+
+7. **Missing module-level exports in `__init__.py`** — The runner loads agents via `importlib.import_module(package_name)`, which imports `__init__.py`. It then reads `goal`, `nodes`, `edges`, `entry_node`, `entry_points`, `pause_nodes`, `terminal_nodes`, `conversation_mode`, `identity_prompt`, `loop_config` via `getattr()`. If ANY of these are missing from `__init__.py`, they default to `None` or `{}` — causing "must define goal, nodes, edges" errors or "node X is unreachable" validation failures. **ALL module-level variables from agent.py must be re-exported in `__init__.py`.**
+
+## Value Errors
+
+8. **Invalid `conversation_mode` value** — Only two valid values: `"continuous"` (recommended for interactive agents) or omit entirely (for isolated per-node conversations). Values like `"client_facing"`, `"interactive"`, `"adaptive"` do NOT exist and will cause runtime errors.
+
+9. **Invalid `loop_config` keys** — Only three valid keys: `max_iterations` (int), `max_tool_calls_per_turn` (int), `max_history_tokens` (int). Keys like `"strategy"`, `"mode"`, `"timeout"` are NOT valid and are silently ignored or cause errors.
+
+10. **Fabricating tools that don't exist** — Never guess tool names. Always verify via `discover_mcp_tools()`. Common hallucinations: `csv_read`, `csv_write`, `csv_append`, `file_upload`, `database_query`. If a required tool doesn't exist, redesign the agent to use tools that DO exist (e.g., `save_data`/`load_data` for data persistence).
+
+## Design Errors
+
+11. **Too many thin nodes** — Hard limit: **2-4 nodes** for most agents. Each node boundary serializes outputs to shared memory and loses all in-context information (tool results, intermediate reasoning, conversation history). A node with 0 tools that just does LLM reasoning is NOT a real node — merge it into its predecessor or successor.
+
+**Merge when:**
+- Node has NO tools — pure LLM reasoning belongs in the node that produces or consumes its data
+- Node sets only 1 trivial output (e.g., `set_output("done", "true")`) — collapse into predecessor
+- Multiple consecutive autonomous nodes with same/similar tools — combine into one
+- A "report" or "summary" node that just presents analysis — merge into the client-facing node
+- A "schedule" or "confirm" node that doesn't actually schedule anything — remove entirely
+
+**Keep separate when:**
+- Client-facing vs autonomous — different interaction models require separate nodes
+- Fundamentally different tool sets (e.g., web search vs file I/O)
+- Fan-out parallelism — parallel branches MUST be separate nodes
+
+**Bad example** (7 nodes — WAY too many):
+```
+profile_setup → daily_intake → update_tracker → analyze_progress → generate_plan → schedule_reminders → report
+```
+`analyze_progress` has no tools. `schedule_reminders` just sets one boolean. `report` just presents analysis. `update_tracker` and `generate_plan` are sequential autonomous work.
+
+**Good example** (2 nodes):
+```
+intake (client-facing) → process (autonomous: track + analyze + plan) → intake (loop back)
+```
+One client-facing node handles ALL user interaction (setup, logging, reports). One autonomous node handles ALL backend work (CSV update, analysis, plan generation) with tools and context preserved.
+
+12. **Adding framework gating for LLM behavior** — Don't add output rollback, premature rejection, or interaction protocol injection. Fix with better prompts or custom judges.
+
+13. **Not using continuous conversation mode** — Interactive agents should use `conversation_mode="continuous"`. Without it, each node starts with blank context.
+
+14. **Adding terminal nodes by default** — ALL agents should use `terminal_nodes=[]` (forever-alive) unless the user explicitly requests a one-shot/batch agent. Forever-alive is the standard pattern. Every node must have at least one outgoing edge. Dead-end nodes break the loop.
+
+15. **Calling set_output in same turn as tool calls** — Instruct the LLM to call set_output in a SEPARATE turn from real tool calls.
+
+## File Template Errors
+
+16. **Wrong import paths** — Use `from framework.graph import ...`, NOT `from core.framework.graph import ...`. The PYTHONPATH includes `core/`.
+
+17. **Missing storage path** — Agent class must set `self._storage_path = Path.home() / ".hive" / "agents" / "agent_name"`.
+
+18. **Missing mcp_servers.json** — Without this, the agent has no tools at runtime.
+
+19. **Bare `python` command in mcp_servers.json** — Use `"command": "uv"` with args `["run", "python", ...]`.
+
+## Testing Errors
+
+20. **Using `runner.run()` on forever-alive agents** — `runner.run()` calls `trigger_and_wait()` which blocks until the graph reaches a terminal node. Forever-alive agents have `terminal_nodes=[]`, so **`runner.run()` hangs forever**. This is the #1 cause of stuck test suites.
+
+**For forever-alive agents, write structural tests instead:**
+- Validate graph structure (nodes, edges, entry points)
+- Verify node specs (tools, prompts, client-facing flag)
+- Check goal/constraints/success criteria definitions
+- Test that `AgentRunner.load()` + `_setup()` succeeds (skip if no API key)
+
+**What NOT to do:**
+```python
+# WRONG — hangs forever on forever-alive agents
+result = await runner.run({"topic": "quantum computing"})
+```
+
+**Correct pattern for structure tests:**
+```python
+def test_research_has_web_tools(self):
+    assert "web_search" in research_node.tools
+
+def test_research_routes_back_to_interact(self):
+    edges_to_interact = [e for e in edges if e.source == "research" and e.target == "interact"]
+    assert edges_to_interact
+```
+
+21. **Stale tests after agent restructuring** — When you change an agent's node count or names (e.g., 4 nodes → 2 nodes), the tests MUST be updated too. Tests referencing old node names (e.g., `"review"`, `"report"`) will fail or hang. Always check that test assertions match the current `nodes/__init__.py`.
+
+22. **Running full integration tests without API keys** — Structural tests (validate, import) work without keys. Full integration tests need `ANTHROPIC_API_KEY`. Use `pytest.skip()` in the runner fixture when `_setup()` fails due to missing credentials.
+
+23. **Forgetting sys.path setup in conftest.py** — Tests need `exports/` and `core/` on sys.path.
+
+24. **Not using auto_responder for client-facing nodes** — Tests with client-facing nodes hang without an auto-responder that injects input. But note: even WITH auto_responder, forever-alive agents still hang because the graph never terminates. Auto-responder only helps for agents with terminal nodes.
@@ -0,0 +1,597 @@
+# Agent File Templates
+
+Complete code templates for each file in a Hive agent package.
+
+## config.py
+
+```python
+"""Runtime configuration."""
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+def _load_preferred_model() -> str:
+    """Load preferred model from ~/.hive/configuration.json."""
+    config_path = Path.home() / ".hive" / "configuration.json"
+    if config_path.exists():
+        try:
+            with open(config_path) as f:
+                config = json.load(f)
+            llm = config.get("llm", {})
+            if llm.get("provider") and llm.get("model"):
+                return f"{llm['provider']}/{llm['model']}"
+        except Exception:
+            pass
+    return "anthropic/claude-sonnet-4-20250514"
+
+
+@dataclass
+class RuntimeConfig:
+    model: str = field(default_factory=_load_preferred_model)
+    temperature: float = 0.7
+    max_tokens: int = 40000
+    api_key: str | None = None
+    api_base: str | None = None
+
+
+default_config = RuntimeConfig()
+
+
+@dataclass
+class AgentMetadata:
+    name: str = "My Agent Name"
+    version: str = "1.0.0"
+    description: str = "What this agent does."
+    intro_message: str = "Welcome! What would you like me to do?"
+
+
+metadata = AgentMetadata()
+```
+
+## nodes/__init__.py
+
+```python
+"""Node definitions for My Agent."""
+
+from framework.graph import NodeSpec
+
+# Node 1: Intake (client-facing)
+intake_node = NodeSpec(
+    id="intake",
+    name="Intake",
+    description="Gather requirements from the user",
+    node_type="event_loop",
+    client_facing=True,
+    max_node_visits=0,  # Unlimited for forever-alive
+    input_keys=["topic"],
+    output_keys=["brief"],
+    success_criteria="The brief is specific and actionable.",
+    system_prompt="""\
+You are an intake specialist.
+
+**STEP 1 — Read and respond (text only, NO tool calls):**
+1. Read the topic provided
+2. If vague, ask 1-2 clarifying questions
+3. If clear, confirm your understanding
+
+**STEP 2 — After the user confirms, call set_output:**
+- set_output("brief", "Clear description of what to do")
+""",
+    tools=[],
+)
+
+# Node 2: Worker (autonomous)
+worker_node = NodeSpec(
+    id="worker",
+    name="Worker",
+    description="Do the main work",
+    node_type="event_loop",
+    max_node_visits=0,
+    input_keys=["brief", "feedback"],
+    output_keys=["results"],
+    nullable_output_keys=["feedback"],  # Only on feedback edge
+    success_criteria="Results are complete and accurate.",
+    system_prompt="""\
+You are a worker agent. Given a brief, do the work.
+
+If feedback is provided, this is a follow-up — address the feedback.
+
+Work in phases:
+1. Use tools to gather/process data
+2. Analyze results
+3. Call set_output for each key in a SEPARATE turn:
+   - set_output("results", "structured results")
+""",
+    tools=["web_search", "web_scrape", "save_data", "load_data", "list_data_files"],
+)
+
+# Node 3: Review (client-facing)
+review_node = NodeSpec(
+    id="review",
+    name="Review",
+    description="Present results for user approval",
+    node_type="event_loop",
+    client_facing=True,
+    max_node_visits=0,
+    input_keys=["results", "brief"],
+    output_keys=["next_action", "feedback"],
+    nullable_output_keys=["feedback"],
+    success_criteria="User has reviewed and decided next steps.",
+    system_prompt="""\
+Present the results to the user.
+
+**STEP 1 — Present (text only, NO tool calls):**
+1. Summary of work done
+2. Key results
+3. Ask: satisfied, or want changes?
+
+**STEP 2 — After user responds, call set_output:**
+- set_output("next_action", "new_topic")   — if starting fresh
+- set_output("next_action", "revise")      — if changes needed
+- set_output("feedback", "what to change") — only if revising
+""",
+    tools=[],
+)
+
+__all__ = ["intake_node", "worker_node", "review_node"]
+```
+
+## agent.py
+
+```python
+"""Agent graph construction for My Agent."""
+
+from pathlib import Path
+
+from framework.graph import EdgeSpec, EdgeCondition, Goal, SuccessCriterion, Constraint
+from framework.graph.edge import GraphSpec
+from framework.graph.executor import ExecutionResult
+from framework.graph.checkpoint_config import CheckpointConfig
+from framework.llm import LiteLLMProvider
+from framework.runner.tool_registry import ToolRegistry
+from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
+from framework.runtime.execution_stream import EntryPointSpec
+
+from .config import default_config, metadata
+from .nodes import intake_node, worker_node, review_node
+
+# Goal definition
+goal = Goal(
+    id="my-agent-goal",
+    name="My Agent Goal",
+    description="What this agent achieves.",
+    success_criteria=[
+        SuccessCriterion(id="sc-1", description="...", metric="...", target="...", weight=0.5),
+        SuccessCriterion(id="sc-2", description="...", metric="...", target="...", weight=0.5),
+    ],
+    constraints=[
+        Constraint(id="c-1", description="...", constraint_type="hard", category="quality"),
+    ],
+)
+
+# Node list
+nodes = [intake_node, worker_node, review_node]
+
+# Edge definitions
+edges = [
+    EdgeSpec(id="intake-to-worker", source="intake", target="worker",
+             condition=EdgeCondition.ON_SUCCESS, priority=1),
+    EdgeSpec(id="worker-to-review", source="worker", target="review",
+             condition=EdgeCondition.ON_SUCCESS, priority=1),
+    # Feedback loop
+    EdgeSpec(id="review-to-worker", source="review", target="worker",
+             condition=EdgeCondition.CONDITIONAL,
+             condition_expr="str(next_action).lower() == 'revise'", priority=2),
+    # Loop back for new topic
+    EdgeSpec(id="review-to-intake", source="review", target="intake",
+             condition=EdgeCondition.CONDITIONAL,
+             condition_expr="str(next_action).lower() == 'new_topic'", priority=1),
+]
+
+# Graph configuration
+entry_node = "intake"
+entry_points = {"start": "intake"}
+pause_nodes = []
+terminal_nodes = []  # Forever-alive
+
+# Module-level vars read by AgentRunner.load()
+conversation_mode = "continuous"
+identity_prompt = "You are a helpful agent."
+loop_config = {"max_iterations": 100, "max_tool_calls_per_turn": 20, "max_history_tokens": 32000}
+
+
+class MyAgent:
+    def __init__(self, config=None):
+        self.config = config or default_config
+        self.goal = goal
+        self.nodes = nodes
+        self.edges = edges
+        self.entry_node = entry_node
+        self.entry_points = entry_points
+        self.pause_nodes = pause_nodes
+        self.terminal_nodes = terminal_nodes
+        self._graph = None
+        self._agent_runtime = None
+        self._tool_registry = None
+        self._storage_path = None
+
+    def _build_graph(self):
+        return GraphSpec(
+            id="my-agent-graph",
+            goal_id=self.goal.id,
+            version="1.0.0",
+            entry_node=self.entry_node,
+            entry_points=self.entry_points,
+            terminal_nodes=self.terminal_nodes,
+            pause_nodes=self.pause_nodes,
+            nodes=self.nodes,
+            edges=self.edges,
+            default_model=self.config.model,
+            max_tokens=self.config.max_tokens,
+            loop_config=loop_config,
+            conversation_mode=conversation_mode,
+            identity_prompt=identity_prompt,
+        )
+
+    def _setup(self, mock_mode=False):
+        self._storage_path = Path.home() / ".hive" / "agents" / "my_agent"
+        self._storage_path.mkdir(parents=True, exist_ok=True)
+        self._tool_registry = ToolRegistry()
+        mcp_config = Path(__file__).parent / "mcp_servers.json"
+        if mcp_config.exists():
+            self._tool_registry.load_mcp_config(mcp_config)
+        llm = None
+        if not mock_mode:
+            llm = LiteLLMProvider(model=self.config.model, api_key=self.config.api_key, api_base=self.config.api_base)
+        tools = list(self._tool_registry.get_tools().values())
+        tool_executor = self._tool_registry.get_executor()
+        self._graph = self._build_graph()
+        self._agent_runtime = create_agent_runtime(
+            graph=self._graph, goal=self.goal, storage_path=self._storage_path,
+            entry_points=[EntryPointSpec(id="default", name="Default", entry_node=self.entry_node,
+                                         trigger_type="manual", isolation_level="shared")],
+            llm=llm, tools=tools, tool_executor=tool_executor,
+            checkpoint_config=CheckpointConfig(enabled=True, checkpoint_on_node_complete=True,
+                                                checkpoint_max_age_days=7, async_checkpoint=True),
+        )
+
+    async def start(self, mock_mode=False):
+        if self._agent_runtime is None:
+            self._setup(mock_mode=mock_mode)
+        if not self._agent_runtime.is_running:
+            await self._agent_runtime.start()
+
+    async def stop(self):
+        if self._agent_runtime and self._agent_runtime.is_running:
+            await self._agent_runtime.stop()
+        self._agent_runtime = None
+
+    async def trigger_and_wait(self, entry_point="default", input_data=None, timeout=None, session_state=None):
+        if self._agent_runtime is None:
+            raise RuntimeError("Agent not started. Call start() first.")
+        return await self._agent_runtime.trigger_and_wait(
+            entry_point_id=entry_point, input_data=input_data or {}, session_state=session_state)
+
+    async def run(self, context, mock_mode=False, session_state=None):
+        await self.start(mock_mode=mock_mode)
+        try:
+            result = await self.trigger_and_wait("default", context, session_state=session_state)
+            return result or ExecutionResult(success=False, error="Execution timeout")
+        finally:
+            await self.stop()
+
+    def info(self):
+        return {
+            "name": metadata.name, "version": metadata.version, "description": metadata.description,
+            "goal": {"name": self.goal.name, "description": self.goal.description},
+            "nodes": [n.id for n in self.nodes], "edges": [e.id for e in self.edges],
+            "entry_node": self.entry_node, "entry_points": self.entry_points,
+            "terminal_nodes": self.terminal_nodes,
+            "client_facing_nodes": [n.id for n in self.nodes if n.client_facing],
+        }
+
+    def validate(self):
+        errors, warnings = [], []
+        node_ids = {n.id for n in self.nodes}
+        for e in self.edges:
+            if e.source not in node_ids: errors.append(f"Edge {e.id}: source '{e.source}' not found")
+            if e.target not in node_ids: errors.append(f"Edge {e.id}: target '{e.target}' not found")
+        if self.entry_node not in node_ids: errors.append(f"Entry node '{self.entry_node}' not found")
+        for t in self.terminal_nodes:
+            if t not in node_ids: errors.append(f"Terminal node '{t}' not found")
+        for ep_id, nid in self.entry_points.items():
+            if nid not in node_ids: errors.append(f"Entry point '{ep_id}' references unknown node '{nid}'")
+        return {"valid": len(errors) == 0, "errors": errors, "warnings": warnings}
+
+
+default_agent = MyAgent()
+```
+
+## agent.py — Async Entry Points Variant
+
+When an agent needs timers, webhooks, or event-driven triggers, add
+`async_entry_points` and optionally `runtime_config` as module-level variables.
+These are IN ADDITION to the standard variables above.
+
+```python
+# Additional imports for async entry points
+from framework.graph.edge import GraphSpec, AsyncEntryPointSpec
+from framework.runtime.agent_runtime import (
+    AgentRuntime, AgentRuntimeConfig, create_agent_runtime,
+)
+
+# ... (goal, nodes, edges, entry_node, entry_points, etc. as above) ...
+
+# Async entry points — event-driven triggers
+async_entry_points = [
+    # Timer with cron: daily at 9am
+    AsyncEntryPointSpec(
+        id="daily-check",
+        name="Daily Check",
+        entry_node="process-node",
+        trigger_type="timer",
+        trigger_config={"cron": "0 9 * * *"},
+        isolation_level="shared",
+        max_concurrent=1,
+    ),
+    # Timer with fixed interval: every 20 minutes
+    AsyncEntryPointSpec(
+        id="scheduled-check",
+        name="Scheduled Check",
+        entry_node="process-node",
+        trigger_type="timer",
+        trigger_config={"interval_minutes": 20, "run_immediately": False},
+        isolation_level="shared",
+        max_concurrent=1,
+    ),
+    # Event: reacts to webhook events
+    AsyncEntryPointSpec(
+        id="webhook-event",
+        name="Webhook Event Handler",
+        entry_node="process-node",
+        trigger_type="event",
+        trigger_config={"event_types": ["webhook_received"]},
+        isolation_level="shared",
+        max_concurrent=10,
+    ),
+]
+
+# Webhook server config (only needed if using webhooks)
+runtime_config = AgentRuntimeConfig(
+    webhook_host="127.0.0.1",
+    webhook_port=8080,
+    webhook_routes=[
+        {
+            "source_id": "my-source",
+            "path": "/webhooks/my-source",
+            "methods": ["POST"],
+        },
+    ],
+)
+```
+
+**Key rules for async entry points:**
+- `async_entry_points` is a list of `AsyncEntryPointSpec` (NOT `EntryPointSpec`)
+- `runtime_config` is `AgentRuntimeConfig` (NOT `RuntimeConfig` from config.py)
+- Valid trigger_types: `timer`, `event`, `webhook`, `manual`, `api`
+- Valid isolation_levels: `isolated`, `shared`, `synchronized`
+- Timer trigger_config (cron): `{"cron": "0 9 * * *"}` — standard 5-field cron expression
+- Timer trigger_config (interval): `{"interval_minutes": float, "run_immediately": bool}`
+- Event trigger_config: `{"event_types": ["webhook_received"], "filter_stream": "...", "filter_node": "..."}`
+- Use `isolation_level="shared"` for async entry points that need to read
+  the primary session's memory (e.g., user-configured rules)
+- The `_build_graph()` method passes `async_entry_points` to GraphSpec
+- Reference: `exports/gmail_inbox_guardian/agent.py`
+
+## __init__.py
+
+**CRITICAL:** The runner imports the package (`__init__.py`) and reads ALL module-level
+variables via `getattr()`. Every variable defined in `agent.py` that the runner needs
+MUST be re-exported here. Missing exports cause silent failures (variables default to
+`None` or `{}`), leading to "must define goal, nodes, edges" errors or graph validation
+failures like "node X is unreachable".
+
+```python
+"""My Agent — description."""
+
+from .agent import (
+    MyAgent,
+    default_agent,
+    goal,
+    nodes,
+    edges,
+    entry_node,
+    entry_points,
+    pause_nodes,
+    terminal_nodes,
+    conversation_mode,
+    identity_prompt,
+    loop_config,
+)
+from .config import default_config, metadata
+
+__all__ = [
+    "MyAgent",
+    "default_agent",
+    "goal",
+    "nodes",
+    "edges",
+    "entry_node",
+    "entry_points",
+    "pause_nodes",
+    "terminal_nodes",
+    "conversation_mode",
+    "identity_prompt",
+    "loop_config",
+    "default_config",
+    "metadata",
+]
+```
+
+**If the agent uses async entry points**, also import and export:
+```python
+from .agent import (
+    ...,
+    async_entry_points,
+    runtime_config,  # Only if using webhooks
+)
+
+__all__ = [
+    ...,
+    "async_entry_points",
+    "runtime_config",
+]
+```
+
+## __main__.py
+
+```python
+"""CLI entry point for My Agent."""
+
+import asyncio, json, logging, sys
+import click
+from .agent import default_agent, MyAgent
+
+
+def setup_logging(verbose=False, debug=False):
+    if debug: level, fmt = logging.DEBUG, "%(asctime)s %(name)s: %(message)s"
+    elif verbose: level, fmt = logging.INFO, "%(message)s"
+    else: level, fmt = logging.WARNING, "%(levelname)s: %(message)s"
+    logging.basicConfig(level=level, format=fmt, stream=sys.stderr)
+
+
+@click.group()
+@click.version_option(version="1.0.0")
+def cli():
+    """My Agent — description."""
+    pass
+
+
+@cli.command()
+@click.option("--topic", "-t", required=True)
+@click.option("--mock", is_flag=True)
+@click.option("--verbose", "-v", is_flag=True)
+def run(topic, mock, verbose):
+    """Execute the agent."""
+    setup_logging(verbose=verbose)
+    result = asyncio.run(default_agent.run({"topic": topic}, mock_mode=mock))
+    click.echo(json.dumps({"success": result.success, "output": result.output}, indent=2, default=str))
+    sys.exit(0 if result.success else 1)
+
+
+@cli.command()
+@click.option("--mock", is_flag=True)
+def tui(mock):
+    """Launch TUI dashboard."""
+    from pathlib import Path
+    from framework.tui.app import AdenTUI
+    from framework.llm import LiteLLMProvider
+    from framework.runner.tool_registry import ToolRegistry
+    from framework.runtime.agent_runtime import create_agent_runtime
+    from framework.runtime.execution_stream import EntryPointSpec
+
+    async def run_tui():
+        agent = MyAgent()
+        agent._tool_registry = ToolRegistry()
+        storage = Path.home() / ".hive" / "agents" / "my_agent"
+        storage.mkdir(parents=True, exist_ok=True)
+        mcp_cfg = Path(__file__).parent / "mcp_servers.json"
+        if mcp_cfg.exists(): agent._tool_registry.load_mcp_config(mcp_cfg)
+        llm = None if mock else LiteLLMProvider(model=agent.config.model, api_key=agent.config.api_key, api_base=agent.config.api_base)
+        runtime = create_agent_runtime(
+            graph=agent._build_graph(), goal=agent.goal, storage_path=storage,
+            entry_points=[EntryPointSpec(id="start", name="Start", entry_node="intake", trigger_type="manual", isolation_level="isolated")],
+            llm=llm, tools=list(agent._tool_registry.get_tools().values()), tool_executor=agent._tool_registry.get_executor())
+        await runtime.start()
+        try:
+            app = AdenTUI(runtime)
+            await app.run_async()
+        finally:
+            await runtime.stop()
+    asyncio.run(run_tui())
+
+
+@cli.command()
+def info():
+    """Show agent info."""
+    data = default_agent.info()
+    click.echo(f"Agent: {data['name']}\nVersion: {data['version']}\nDescription: {data['description']}")
+    click.echo(f"Nodes: {', '.join(data['nodes'])}\nClient-facing: {', '.join(data['client_facing_nodes'])}")
+
+
+@cli.command()
+def validate():
+    """Validate agent structure."""
+    v = default_agent.validate()
+    if v["valid"]: click.echo("Agent is valid")
+    else:
+        click.echo("Errors:")
+        for e in v["errors"]: click.echo(f"  {e}")
+    sys.exit(0 if v["valid"] else 1)
+
+
+if __name__ == "__main__":
+    cli()
+```
+
+## mcp_servers.json
+
+```json
+{
+  "hive-tools": {
+    "transport": "stdio",
+    "command": "uv",
+    "args": ["run", "python", "mcp_server.py", "--stdio"],
+    "cwd": "../../tools",
+    "description": "Hive tools MCP server"
+  }
+}
+```
+
+**CRITICAL FORMAT RULES:**
+- NO `"mcpServers"` wrapper (flat dict, not nested)
+- `cwd` MUST be `"../../tools"` (relative from `exports/AGENT_NAME/` to `tools/`)
+- `command` MUST be `"uv"` with `"args": ["run", "python", ...]` (NOT bare `"python"`)
+
+## tests/conftest.py
+
+```python
+"""Test fixtures."""
+
+import sys
+from pathlib import Path
+
+import pytest
+import pytest_asyncio
+
+_repo_root = Path(__file__).resolve().parents[3]
+for _p in ["exports", "core"]:
+    _path = str(_repo_root / _p)
+    if _path not in sys.path:
+        sys.path.insert(0, _path)
+
+AGENT_PATH = str(Path(__file__).resolve().parents[1])
+
+
+@pytest.fixture(scope="session")
+def mock_mode():
+    return True
+
+
+@pytest_asyncio.fixture(scope="session")
+async def runner(tmp_path_factory, mock_mode):
+    from framework.runner.runner import AgentRunner
+    storage = tmp_path_factory.mktemp("agent_storage")
+    r = AgentRunner.load(AGENT_PATH, mock_mode=mock_mode, storage_path=storage)
+    r._setup()
+    yield r
+    await r.cleanup_async()
+```
+
+## entry_points Format
+
+MUST be: `{"start": "first-node-id"}`
+NOT: `{"first-node-id": ["input_keys"]}` (WRONG)
+NOT: `{"first-node-id"}` (WRONG — this is a set)
@@ -0,0 +1,433 @@
+# Hive Agent Framework — Condensed Reference
+
+## Architecture
+
+Agents are Python packages in `exports/`:
+```
+exports/my_agent/
+├── __init__.py          # MUST re-export ALL module-level vars from agent.py
+├── __main__.py          # CLI (run, tui, info, validate, shell)
+├── agent.py             # Graph construction (goal, edges, agent class)
+├── config.py            # Runtime config
+├── nodes/__init__.py    # Node definitions (NodeSpec)
+├── mcp_servers.json     # MCP tool server config
+└── tests/               # pytest tests
+```
+
+## Agent Loading Contract
+
+`AgentRunner.load()` imports the package (`__init__.py`) and reads these
+module-level variables via `getattr()`:
+
+| Variable | Required | Default if missing | Consequence |
+|----------|----------|--------------------|-------------|
+| `goal` | YES | `None` | **FATAL** — "must define goal, nodes, edges" |
+| `nodes` | YES | `None` | **FATAL** — same error |
+| `edges` | YES | `None` | **FATAL** — same error |
+| `entry_node` | no | `nodes[0].id` | Probably wrong node |
+| `entry_points` | no | `{}` | **Nodes unreachable** — validation fails |
+| `terminal_nodes` | no | `[]` | OK for forever-alive |
+| `pause_nodes` | no | `[]` | OK |
+| `conversation_mode` | no | not passed | Isolated mode (no context carryover) |
+| `identity_prompt` | no | not passed | No agent-level identity |
+| `loop_config` | no | `{}` | No iteration limits |
+| `async_entry_points` | no | `[]` | No async triggers (timers, webhooks, events) |
+| `runtime_config` | no | `None` | No webhook server |
+
+**CRITICAL:** `__init__.py` MUST import and re-export ALL of these from
+`agent.py`. Missing exports silently fall back to defaults, causing
+hard-to-debug failures.
+
+**Why `default_agent.validate()` is NOT sufficient:**
+`validate()` checks the agent CLASS's internal graph (self.nodes, self.edges).
+These are always correct because the constructor references agent.py's module
+vars directly. But `AgentRunner.load()` reads from the PACKAGE (`__init__.py`),
+not the class. So `validate()` passes while `AgentRunner.load()` fails.
+Always test with `AgentRunner.load("exports/{name}")` — this is the same
+code path the TUI and `hive run` use.
+
+## Goal
+
+Defines success criteria and constraints:
+```python
+goal = Goal(
+    id="kebab-case-id",
+    name="Display Name",
+    description="What the agent does",
+    success_criteria=[
+        SuccessCriterion(id="sc-id", description="...", metric="...", target="...", weight=0.25),
+    ],
+    constraints=[
+        Constraint(id="c-id", description="...", constraint_type="hard", category="quality"),
+    ],
+)
+```
+- 3-5 success criteria, weights sum to 1.0
+- 1-5 constraints (hard/soft, categories: quality, accuracy, interaction, functional)
+
+## NodeSpec Fields
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| id | str | required | kebab-case identifier |
+| name | str | required | Display name |
+| description | str | required | What the node does |
+| node_type | str | required | Always `"event_loop"` |
+| input_keys | list[str] | required | Memory keys this node reads |
+| output_keys | list[str] | required | Memory keys this node writes via set_output |
+| system_prompt | str | "" | LLM instructions |
+| tools | list[str] | [] | Tool names from MCP servers |
+| client_facing | bool | False | If True, streams to user and blocks for input |
+| nullable_output_keys | list[str] | [] | Keys that may remain unset |
+| max_node_visits | int | 0 | 0=unlimited (default); >1 for one-shot feedback loops |
+| max_retries | int | 3 | Retries on failure |
+| success_criteria | str | "" | Natural language for judge evaluation |
+
+## EdgeSpec Fields
+
+| Field | Type | Description |
+|-------|------|-------------|
+| id | str | kebab-case identifier |
+| source | str | Source node ID |
+| target | str | Target node ID |
+| condition | EdgeCondition | ON_SUCCESS, ON_FAILURE, ALWAYS, CONDITIONAL |
+| condition_expr | str | Python expression evaluated against memory (for CONDITIONAL) |
+| priority | int | Positive=forward (evaluated first), negative=feedback (loop-back) |
+
+## Key Patterns
+
+### STEP 1/STEP 2 (Client-Facing Nodes)
+```
+**STEP 1 — Respond to the user (text only, NO tool calls):**
+[Present information, ask questions]
+
+**STEP 2 — After the user responds, call set_output:**
+- set_output("key", "value based on user response")
+```
+This prevents premature set_output before user interaction.
+
+### Fewer, Richer Nodes (CRITICAL)
+
+**Hard limit: 2-4 nodes for most agents.** Never exceed 5 unless the user
+explicitly requests a complex multi-phase pipeline.
+
+Each node boundary serializes outputs to shared memory and **destroys** all
+in-context information: tool call results, intermediate reasoning, conversation
+history. A research node that searches, fetches, and analyzes in ONE node keeps
+all source material in its conversation context. Split across 3 nodes, each
+downstream node only sees the serialized summary string.
+
+**Decision framework — merge unless ANY of these apply:**
+1. **Client-facing boundary** — Autonomous and client-facing work MUST be
+   separate nodes (different interaction models)
+2. **Disjoint tool sets** — If tools are fundamentally different (e.g., web
+   search vs database), separate nodes make sense
+3. **Parallel execution** — Fan-out branches must be separate nodes
+
+**Red flags that you have too many nodes:**
+- An autonomous (non-client-facing) node with 0 tools (pure LLM reasoning) → merge into predecessor/successor
+- A node that sets only 1 trivial output → collapse into predecessor
+- Multiple consecutive autonomous nodes → combine into one rich node
+- A "report" node that presents analysis → merge into the client-facing node
+- A "confirm" or "schedule" node that doesn't call any external service → remove
+
+**Typical agent structure (3 nodes):**
+```
+intake (client-facing) ←→ process (autonomous) ←→ review (client-facing)
+```
+Or for simpler agents, just 2 nodes:
+```
+interact (client-facing) → process (autonomous) → interact (loop)
+```
+
+### nullable_output_keys
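+For keys that are only present when the node is reached via certain edges (e.g., `feedback` on a feedback edge) — list them in `nullable_output_keys`: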
+```python
+research_node = NodeSpec(
+    input_keys=["brief", "feedback"],
+    nullable_output_keys=["feedback"],  # Only present on feedback edge
+    max_node_visits=3,
+)
+```
+
+### Mutually Exclusive Outputs
+For routing decisions:
+```python
+review_node = NodeSpec(
+    output_keys=["approved", "feedback"],
+    nullable_output_keys=["approved", "feedback"],  # Node sets one or the other
+)
+```
+
+### Forever-Alive Pattern
+`terminal_nodes=[]` — every node has outgoing edges, graph loops until user exits.
+Use `conversation_mode="continuous"` to preserve context across transitions.
+
+### set_output
+- Synthetic tool injected by framework
+- Call separately from real tool calls (separate turn)
+- `set_output("key", "value")` stores to shared memory
+
+## Edge Conditions
+
+| Condition | When |
+|-----------|------|
+| ON_SUCCESS | Node completed successfully |
+| ON_FAILURE | Node failed |
+| ALWAYS | Unconditional |
+| CONDITIONAL | condition_expr evaluates to True against memory |
+
+condition_expr examples:
+- `"needs_more_research == True"`
+- `"str(next_action).lower() == 'new_agent'"`
+- `"feedback is not None"`
+
+## Graph Lifecycle
+
+| Pattern | terminal_nodes | When |
+|---------|---------------|------|
+| **Forever-alive** | `[]` | **DEFAULT for all agents** |
+| Linear | `["last-node"]` | Only if user explicitly requests one-shot/batch |
+
+**Forever-alive is the default.** Always use `terminal_nodes=[]`.
+The framework default for `max_node_visits` is 0 (unbounded), so
+nodes work correctly in forever-alive loops without explicit override.
+Only set `max_node_visits > 0` in one-shot agents with feedback loops.
+Every node must have at least one outgoing edge — no dead ends. The
+user exits by closing the TUI. Only use terminal nodes if the user
+explicitly asks for a batch/one-shot agent that runs once and exits.
+
+## Continuous Conversation Mode
+
+`conversation_mode` has ONLY two valid states:
+- `"continuous"` — recommended for interactive agents
+- Omit entirely — isolated per-node conversations (each node starts fresh)
+
+**INVALID values** (do NOT use): `"client_facing"`, `"interactive"`,
+`"adaptive"`, `"shared"`. These do not exist in the framework.
+
+When `conversation_mode="continuous"`:
+- Same conversation thread carries across node transitions
+- Layered system prompts: identity (agent-level) + narrative + focus (per-node)
+- Transition markers inserted at boundaries
+- Compaction happens opportunistically at phase transitions
+
+## loop_config
+
+Only three valid keys:
+```python
+loop_config = {
+    "max_iterations": 100,          # Max LLM turns per node visit
+    "max_tool_calls_per_turn": 20,  # Max tool calls per LLM response
+    "max_history_tokens": 32000,    # Triggers conversation compaction
+}
+```
+**INVALID keys** (do NOT use): `"strategy"`, `"mode"`, `"timeout"`,
+`"temperature"`. These are silently ignored or cause errors.
+
+## Data Tools (Spillover)
+
+For large data that exceeds context:
+- `save_data(filename, data)` — Write to session data dir
+- `load_data(filename, offset, limit)` — Read with pagination
+- `list_data_files()` — List files
+- `serve_file_to_user(filename, label)` — Clickable file:// URI
+
+`data_dir` is auto-injected by framework — LLM never sees it.
+
+## Fan-Out / Fan-In
+
+Multiple ON_SUCCESS edges from the same source node → parallel execution via `asyncio.gather()`.
+- Parallel nodes must have disjoint output_keys
+- Only one branch may have client_facing nodes
+- Fan-in node gets all outputs in shared memory
+
+## Judge System
+
+- **Implicit** (default): ACCEPTs when LLM finishes with no tool calls and all required outputs set
+- **SchemaJudge**: Validates against Pydantic model
+- **Custom**: Implement `evaluate(context) -> JudgeVerdict`
+
+Judge is the SOLE acceptance mechanism — no ad-hoc framework gating.
+
+## Async Entry Points (Webhooks, Timers, Events)
+
+For agents that need to react to external events (incoming emails, scheduled
+tasks, API calls), use `AsyncEntryPointSpec` and optionally `AgentRuntimeConfig`.
+
+### Imports
+```python
+from framework.graph.edge import GraphSpec, AsyncEntryPointSpec
+from framework.runtime.agent_runtime import AgentRuntime, AgentRuntimeConfig, create_agent_runtime
+```
+Note: `AsyncEntryPointSpec` is in `framework.graph.edge` (the graph/declarative layer).
+`AgentRuntimeConfig` is in `framework.runtime.agent_runtime` (the runtime layer).
+
+### AsyncEntryPointSpec Fields
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| id | str | required | Unique identifier |
+| name | str | required | Human-readable name |
+| entry_node | str | required | Node ID to start execution from |
+| trigger_type | str | `"manual"` | `webhook`, `api`, `timer`, `event`, `manual` |
+| trigger_config | dict | `{}` | Trigger-specific config (see below) |
+| isolation_level | str | `"shared"` | `isolated`, `shared`, `synchronized` |
+| priority | int | `0` | Execution priority (higher = more priority) |
+| max_concurrent | int | `10` | Max concurrent executions |
+
+### Trigger Types
+
+**timer** — Fires on a schedule. Two modes: cron expressions or fixed interval.
+
+Cron (preferred for precise scheduling):
+```python
+AsyncEntryPointSpec(
+    id="daily-digest",
+    name="Daily Digest",
+    entry_node="check-node",
+    trigger_type="timer",
+    trigger_config={"cron": "0 9 * * *"},  # daily at 9am
+    isolation_level="shared",
+    max_concurrent=1,
+)
+```
+- `cron` (str) — standard cron expression (5 fields: min hour dom month dow)
+- Examples: `"0 9 * * *"` (daily 9am), `"0 9 * * MON-FRI"` (weekdays 9am), `"*/30 * * * *"` (every 30 min)
+
+Fixed interval (simpler, for polling-style tasks):
+```python
+AsyncEntryPointSpec(
+    id="scheduled-check",
+    name="Scheduled Check",
+    entry_node="check-node",
+    trigger_type="timer",
+    trigger_config={"interval_minutes": 20, "run_immediately": False},
+    isolation_level="shared",
+    max_concurrent=1,
+)
+```
+- `interval_minutes` (float) — how often to fire
+- `run_immediately` (bool, default False) — when True, also fire once on startup
+
+**event** — Subscribes to EventBus (e.g., webhook events):
+```python
+AsyncEntryPointSpec(
+    id="email-event",
+    name="Email Event Handler",
+    entry_node="process-emails",
+    trigger_type="event",
+    trigger_config={"event_types": ["webhook_received"]},
+    isolation_level="shared",
+    max_concurrent=10,
+)
+```
+- `event_types` (list[str]) — EventType values to subscribe to
+- `filter_stream` (str, optional) — only receive from this stream
+- `filter_node` (str, optional) — only receive from this node
+
+**webhook** — HTTP endpoint (requires AgentRuntimeConfig):
+The webhook server publishes `WEBHOOK_RECEIVED` events on the EventBus.
+An `event` trigger type with `event_types: ["webhook_received"]` subscribes
+to those events. The flow is:
+```
+HTTP POST /webhooks/gmail → WebhookServer → EventBus (WEBHOOK_RECEIVED)
+  → event entry point → triggers graph execution from entry_node
+```
+
+**manual** — Triggered programmatically via `runtime.trigger()`.
+
+### Isolation Levels
+
+| Level | Meaning |
+|-------|---------|
+| `isolated` | Private state per execution |
+| `shared` | Eventual consistency — async executions can read primary session memory |
+| `synchronized` | Shared with write locks (use when ordering matters) |
+
+For most async patterns, use `shared` — the async execution reads the primary
+session's memory (e.g., user-configured rules) and runs its own workflow.
+
+### AgentRuntimeConfig (for webhook servers)
+
+```python
+from framework.runtime.agent_runtime import AgentRuntimeConfig
+
+runtime_config = AgentRuntimeConfig(
+    webhook_host="127.0.0.1",
+    webhook_port=8080,
+    webhook_routes=[
+        {
+            "source_id": "gmail",
+            "path": "/webhooks/gmail",
+            "methods": ["POST"],
+            "secret": None,  # Optional HMAC-SHA256 secret
+        },
+    ],
+)
+```
+`runtime_config` is a module-level variable read by `AgentRunner.load()`.
+The runner passes it to `create_agent_runtime()`. On `runtime.start()`,
+if webhook_routes is non-empty, an embedded HTTP server starts.
+
+### Session Sharing
+
+Timer and event triggers automatically call `_get_primary_session_state()`
+before execution. This finds the active user-facing session and provides
+its memory to the async execution, filtered to only the async entry node's
+`input_keys`. This means the async flow can read user-configured values
+(like rules, preferences) without needing separate configuration.
+
+### Module-Level Variables
+
+Agents with async entry points must export `async_entry_points`, and additionally `runtime_config` when they use webhooks:
+```python
+# In agent.py:
+async_entry_points = [AsyncEntryPointSpec(...), ...]
+runtime_config = AgentRuntimeConfig(...)  # Only if using webhooks
+```
+
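+Each of these variables that the agent defines must also be re-exported from `__init__.py` (omit `runtime_config` if the agent does not define it):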
+```python
+from .agent import (
+    ..., async_entry_points, runtime_config,
+)
+```
+
+### Reference Agent
+
+See `exports/gmail_inbox_guardian/agent.py` for a complete example with:
+- Primary client-facing intake node (user configures rules)
+- Timer-based scheduled inbox checks (every 20 min)
+- Webhook-triggered email event handling
+- Shared isolation for memory access across streams
+
+## Framework Capabilities
+
+**Works well:** Multi-turn conversations, HITL review, tool orchestration, structured outputs, parallel execution, context management, error recovery, session persistence.
+
+**Limitations:** LLM latency (2-10s/turn), context window limits (~128K), cost per run, rate limits, node boundaries lose context.
+
+**Not designed for:** Sub-second responses, millions of items, real-time streaming, guaranteed determinism, offline/air-gapped.
+
+## Tool Discovery
+
+Do NOT rely on a static tool list — it will be outdated. Always use
+`discover_mcp_tools()` to get the current tool catalog from the
+hive-tools MCP server. This returns full schemas including parameter
+names, types, and descriptions.
+
+```
+discover_mcp_tools()                          # default: hive-tools
+discover_mcp_tools("exports/my_agent/mcp_servers.json")  # specific agent
+```
+
+Common tool categories (verify via discover_mcp_tools):
+- **Web**: search, scrape, PDF
+- **Data**: save/load/append/list data files, serve to user
+- **File**: view, write, replace, diff, list, grep
+- **Communication**: email, gmail, slack, telegram
+- **CRM**: hubspot, apollo, calcom
+- **GitHub**: stargazers, user profiles, repos
+- **Vision**: image analysis
+- **Time**: current time
@@ -0,0 +1,31 @@
+"""Test fixtures for Hive Coder agent."""
+
+import sys
+from pathlib import Path
+
+import pytest
+import pytest_asyncio
+
+# Repository root: three directory levels above the directory containing this file.
+_repo_root = Path(__file__).resolve().parents[3]
+# Put the repo's "exports" and "core" directories at the front of sys.path
+# (skipping any that are already present). Presumably this makes packages such
+# as `framework` importable from the fixtures below — confirm against repo layout.
+for _p in ["exports", "core"]:
+    _path = str(_repo_root / _p)
+    if _path not in sys.path:
+        sys.path.insert(0, _path)
+
+# Directory one level above this file's directory, as a string.
+AGENT_PATH = str(Path(__file__).resolve().parents[1])
+
+
+@pytest.fixture(scope="session")
+def mock_mode():
+    """Session-scoped flag that is always True; fixtures may request it by name."""
+    return True
+
+
+@pytest_asyncio.fixture(scope="session")
+async def runner(tmp_path_factory, mock_mode):
+    """Yield a session-scoped ``AgentRunner`` loaded from ``AGENT_PATH``.
+
+    The runner is given a fresh temporary directory as its ``storage_path``,
+    has ``_setup()`` called before being handed to tests, and is torn down
+    with ``cleanup_async()`` once the session ends.
+    """
+    from framework.runner.runner import AgentRunner
+
+    storage_dir = tmp_path_factory.mktemp("agent_storage")
+    agent_runner = AgentRunner.load(
+        AGENT_PATH,
+        mock_mode=mock_mode,
+        storage_path=storage_dir,
+    )
+    agent_runner._setup()
+    yield agent_runner
+    await agent_runner.cleanup_async()
@@ -0,0 +1,27 @@
+"""Queen's ticket receiver entry point.
+
+When the Worker Health Judge emits a WORKER_ESCALATION_TICKET event on the
+shared EventBus, this entry point fires and routes to the ``ticket_triage``
+node, where the Queen deliberates and decides whether to notify the operator.
+
+Isolation level is ``isolated`` — the queen's triage memory is kept separate
+from the worker's shared memory. Each ticket triage runs in its own context.
+"""
+
+from __future__ import annotations
+
+from framework.graph.edge import AsyncEntryPointSpec
+
+# Event-triggered entry point: subscribes to "worker_escalation_ticket" events
+# and starts execution at the "ticket_triage" node with isolation_level "isolated".
+TICKET_RECEIVER_ENTRY_POINT = AsyncEntryPointSpec(
+    id="ticket_receiver",
+    name="Worker Escalation Ticket Receiver",
+    entry_node="ticket_triage",
+    trigger_type="event",
+    trigger_config={
+        "event_types": ["worker_escalation_ticket"],
+        # Do not fire on our own graph's events (prevents loops if queen
+        # somehow emits a worker_escalation_ticket for herself)
+        "exclude_own_graph": True,
+    },
+    # Keep each triage run's state separate (see the module docstring).
+    isolation_level="isolated",
+)
@@ -245,20 +245,14 @@ class GraphBuilder:
            warnings.append(f"Node '{node.id}' should have a description")

        # Type-specific validation
-        if node.node_type == "llm_tool_use":
-            if not node.tools:
-                errors.append(f"LLM tool node '{node.id}' must specify tools")
-            if not node.system_prompt:
-                warnings.append(f"LLM node '{node.id}' should have a system_prompt")
+        if node.node_type == "event_loop":
+            if node.tools and not node.system_prompt:
+                warnings.append(f"Event loop node '{node.id}' should have a system_prompt")

        if node.node_type == "router":
            if not node.routes:
                errors.append(f"Router node '{node.id}' must specify routes")

-        if node.node_type == "function":
-            if not node.function:
-                errors.append(f"Function node '{node.id}' must specify function name")
-
        # Check input/output keys
        if not node.input_keys:
            suggestions.append(f"Consider specifying input_keys for '{node.id}'")
@@ -400,9 +394,13 @@ class GraphBuilder:
        if not terminal_candidates and self.session.nodes:
            warnings.append("No terminal nodes found (all nodes have outgoing edges)")

-        # Check reachability
+        # Check reachability from ALL entry candidates (not just the first one).
+        # Agents with async entry points have multiple nodes with no incoming
+        # edges (e.g., a primary entry node and an event-driven entry node).
        if entry_candidates and self.session.nodes:
-            reachable = self._compute_reachable(entry_candidates[0])
+            reachable = set()
+            for candidate in entry_candidates:
+                reachable |= self._compute_reachable(candidate)
            unreachable = [n.id for n in self.session.nodes if n.id not in reachable]
            if unreachable:
                errors.append(f"Unreachable nodes: {unreachable}")
@@ -443,14 +441,15 @@ class GraphBuilder:
        self.session.test_cases.append(test)
        self._save_session()

-    def run_test(
+    async def run_test_async(
        self,
        test: TestCase,
        executor_factory: Callable,
    ) -> TestResult:
        """
-        Run a single test case.
+        Run a single test case asynchronously.

+        This method is safe to call from async contexts (Jupyter, FastAPI, etc.).
        executor_factory should return a configured GraphExecutor.
        """
        self._require_phase([BuildPhase.ADDING_NODES, BuildPhase.ADDING_EDGES, BuildPhase.TESTING])
@@ -462,14 +461,10 @@ class GraphBuilder:
            executor = executor_factory()

            # Run the test
-            import asyncio
-
-            result = asyncio.run(
-                executor.execute(
-                    graph=graph,
-                    goal=self.session.goal,
-                    input_data=test.input,
-                )
+            result = await executor.execute(
+                graph=graph,
+                goal=self.session.goal,
+                input_data=test.input,
            )

            # Check result
@@ -499,6 +494,36 @@ class GraphBuilder:

        return test_result

+    def run_test(
+        self,
+        test: TestCase,
+        executor_factory: Callable,
+    ) -> TestResult:
+        """
+        Run a single test case.
+
+        This is a synchronous wrapper around run_test_async().
+        If called from an async context (Jupyter, FastAPI, etc.), use run_test_async() instead.
+
+        executor_factory should return a configured GraphExecutor.
+
+        Raises:
+            RuntimeError: If an event loop is already running in the calling thread.
+        """
+        import asyncio
+
+        # get_running_loop() raises RuntimeError when no loop is running in this thread.
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            loop_running = False
+        else:
+            loop_running = True
+
+        if loop_running:
+            # asyncio.run() cannot be nested inside an already-running loop.
+            raise RuntimeError(
+                "Cannot call run_test() from an async context. "
+                "An event loop is already running. "
+                "Please use 'await builder.run_test_async(test, executor_factory)' instead."
+            )
+
+        # Run outside the except block so an exception raised by the test is not
+        # implicitly chained to the "no running event loop" RuntimeError above.
+        return asyncio.run(self.run_test_async(test, executor_factory))
+
    def run_all_tests(self, executor_factory: Callable) -> list[TestResult]:
        """Run all test cases."""
        results = []
@@ -11,9 +11,9 @@ Usage:

 Testing commands:
    hive test-run <agent_path> --goal <goal_id>
-    hive test-debug <goal_id> <test_id>
-    hive test-list <goal_id>
-    hive test-stats <goal_id>
+    hive test-debug <agent_path> <test_name>
+    hive test-list <agent_path>
+    hive test-stats <agent_path>
 """

 import argparse
@@ -56,6 +56,13 @@ def _configure_paths():
    if (project_root / "core").is_dir() and core_str not in sys.path:
        sys.path.insert(0, core_str)

+    # Add core/framework/agents/ so framework agents are importable as top-level packages
+    framework_agents_dir = project_root / "core" / "framework" / "agents"
+    if framework_agents_dir.is_dir():
+        fa_str = str(framework_agents_dir)
+        if fa_str not in sys.path:
+            sys.path.insert(0, fa_str)
+

 def main():
    _configure_paths()
@@ -0,0 +1,116 @@
+"""Shared Hive configuration utilities.
+
+Centralises reading of ~/.hive/configuration.json so that the runner
+and every agent template share one implementation instead of copy-pasting
+helper functions.
+"""
+
+import json
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from framework.graph.edge import DEFAULT_MAX_TOKENS
+
+# ---------------------------------------------------------------------------
+# Low-level config file access
+# ---------------------------------------------------------------------------
+
+HIVE_CONFIG_FILE = Path.home() / ".hive" / "configuration.json"
+
+
+def get_hive_config() -> dict[str, Any]:
+    """Load hive configuration from ~/.hive/configuration.json.
+
+    Returns:
+        The parsed top-level JSON object, or an empty dict when the file is
+        missing, unreadable, not decodable as UTF-8, not valid JSON, or when
+        its top-level value is not an object. Callers chain ``.get()`` on the
+        result, so a non-dict value is never returned.
+    """
+    try:
+        # "utf-8-sig" drops a leading BOM if one is present.
+        with open(HIVE_CONFIG_FILE, encoding="utf-8-sig") as f:
+            config = json.load(f)
+    except (OSError, ValueError):
+        # OSError covers a missing or unreadable file; ValueError covers both
+        # json.JSONDecodeError and UnicodeDecodeError.
+        return {}
+    return config if isinstance(config, dict) else {}
+
+
+# ---------------------------------------------------------------------------
+# Derived helpers
+# ---------------------------------------------------------------------------
+
+
+def get_preferred_model() -> str:
+    """Return the preferred LLM model as a ``provider/model`` string.
+
+    Uses the ``provider`` and ``model`` values from the ``llm`` config section
+    when both are set; otherwise returns 'anthropic/claude-sonnet-4-20250514'.
+    """
+    llm_cfg = get_hive_config().get("llm", {})
+    provider = llm_cfg.get("provider")
+    model = llm_cfg.get("model")
+    if not (provider and model):
+        return "anthropic/claude-sonnet-4-20250514"
+    return f"{provider}/{model}"
+
+
+def get_max_tokens() -> int:
+    """Return ``llm.max_tokens`` from the Hive config, or DEFAULT_MAX_TOKENS if unset."""
+    llm_cfg = get_hive_config().get("llm", {})
+    return llm_cfg.get("max_tokens", DEFAULT_MAX_TOKENS)
+
+
+def get_api_key() -> str | None:
+    """Resolve the LLM API key from the ``llm`` section of the Hive config.
+
+    Resolution order:
+    1. When ``use_claude_code_subscription`` is truthy, the token returned by
+       ``get_claude_code_token()`` (expected to read the OAuth token from
+       ``~/.claude/.credentials.json``). It is used only when non-empty; an
+       ImportError while obtaining it is ignored.
+    2. The value of the environment variable named by ``api_key_env_var``
+       (covers ZAI Code and all API-key providers).
+
+    Returns:
+        The key or token string, or None when neither source yields one.
+    """
+    llm_cfg = get_hive_config().get("llm", {})
+
+    if llm_cfg.get("use_claude_code_subscription"):
+        try:
+            from framework.runner.runner import get_claude_code_token
+
+            oauth_token = get_claude_code_token()
+        except ImportError:
+            oauth_token = None
+        if oauth_token:
+            return oauth_token
+
+    # Fall back to the environment variable named in the config, if any.
+    env_var_name = llm_cfg.get("api_key_env_var")
+    return os.environ.get(env_var_name) if env_var_name else None
+
+
+def get_api_base() -> str | None:
+    """Return ``llm.api_base`` (URL for OpenAI-compatible endpoints), or None if unset."""
+    llm_cfg = get_hive_config().get("llm", {})
+    return llm_cfg.get("api_base")
+
+
+def get_llm_extra_kwargs() -> dict[str, Any]:
+    """Return extra kwargs for LiteLLMProvider (e.g. OAuth headers).
+
+    When ``use_claude_code_subscription`` is enabled and ``get_api_key()``
+    yields a token, the result carries ``extra_headers`` with an
+    ``authorization: Bearer <token>`` entry, intended to let litellm's
+    Anthropic OAuth handling add the required beta headers. In every other
+    case an empty dict is returned.
+    """
+    llm_cfg = get_hive_config().get("llm", {})
+    if not llm_cfg.get("use_claude_code_subscription"):
+        return {}
+
+    token = get_api_key()
+    if not token:
+        return {}
+    return {"extra_headers": {"authorization": f"Bearer {token}"}}
+
+
+# ---------------------------------------------------------------------------
+# RuntimeConfig – shared across agent templates
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class RuntimeConfig:
+    """Agent runtime configuration loaded from ~/.hive/configuration.json.
+
+    Every field except ``temperature`` takes its default from one of this
+    module's helper functions, which is called whenever an instance is
+    created without an explicit value for that field.
+    """
+
+    # "provider/model" string from get_preferred_model().
+    model: str = field(default_factory=get_preferred_model)
+    # Fixed default; not read from the configuration file.
+    temperature: float = 0.7
+    # From get_max_tokens(): the configured value or DEFAULT_MAX_TOKENS.
+    max_tokens: int = field(default_factory=get_max_tokens)
+    # From get_api_key(); None when no key could be resolved.
+    api_key: str | None = field(default_factory=get_api_key)
+    # From get_api_base(); None when no api_base is configured.
+    api_base: str | None = field(default_factory=get_api_base)
+    # From get_llm_extra_kwargs(); empty unless the Claude Code subscription path yields a token.
+    extra_kwargs: dict[str, Any] = field(default_factory=get_llm_extra_kwargs)
@@ -59,6 +59,13 @@ from .provider import (
    CredentialProvider,
    StaticProvider,
 )
+from .setup import (
+    CredentialSetupSession,
+    MissingCredential,
+    SetupResult,
+    detect_missing_credentials_from_nodes,
+    run_credential_setup_cli,
+)
 from .storage import (
    CompositeStorage,
    CredentialStorage,
@@ -68,6 +75,7 @@ from .storage import (
 )
 from .store import CredentialStore
 from .template import TemplateResolver
+from .validation import ensure_credential_key_env, validate_agent_credentials

 # Aden sync components (lazy import to avoid httpx dependency when not needed)
 # Usage: from core.framework.credentials.aden import AdenSyncProvider
@@ -84,6 +92,14 @@ try:
 except ImportError:
    _ADEN_AVAILABLE = False

+# Local credential registry (named API key accounts with identity metadata)
+try:
+    from .local import LocalAccountInfo, LocalCredentialRegistry
+
+    _LOCAL_AVAILABLE = True
+except ImportError:
+    _LOCAL_AVAILABLE = False
+
 __all__ = [
    # Main store
    "CredentialStore",
@@ -111,12 +127,25 @@ __all__ = [
    "CredentialRefreshError",
    "CredentialValidationError",
    "CredentialDecryptionError",
+    # Validation
+    "ensure_credential_key_env",
+    "validate_agent_credentials",
+    # Interactive setup
+    "CredentialSetupSession",
+    "MissingCredential",
+    "SetupResult",
+    "detect_missing_credentials_from_nodes",
+    "run_credential_setup_cli",
    # Aden sync (optional - requires httpx)
    "AdenSyncProvider",
    "AdenCredentialClient",
    "AdenClientConfig",
    "AdenCachedStorage",
+    # Local credential registry (optional - requires cryptography)
+    "LocalCredentialRegistry",
+    "LocalAccountInfo",
 ]

 # Track Aden availability for runtime checks
 ADEN_AVAILABLE = _ADEN_AVAILABLE
+LOCAL_AVAILABLE = _LOCAL_AVAILABLE
@@ -1,29 +1,31 @@
 """
 Aden Credential Client.

-HTTP client for communicating with the Aden authentication server.
-The Aden server handles OAuth2 authorization flows and token management.
-This client fetches tokens and delegates refresh operations to Aden.
+HTTP client for the Aden authentication server.
+Aden holds all OAuth secrets; agents receive only short-lived access tokens.
+
+API (all endpoints authenticated with Bearer {api_key}):
+
+    GET  /v1/credentials                          — list integrations
+    GET  /v1/credentials/{integration_id}          — get access token (auto-refreshes)
+    POST /v1/credentials/{integration_id}/refresh  — force refresh
+    GET  /v1/credentials/{integration_id}/validate — check validity
+
+Integration IDs are base64-encoded hashes assigned by the Aden platform
+(e.g. "Z29vZ2xlOlRpbW90aHk6MTYwNjc6MTM2ODQ"), NOT provider names.

 Usage:
-    # API key loaded from ADEN_API_KEY environment variable by default
    client = AdenCredentialClient(AdenClientConfig(
        base_url="https://api.adenhq.com",
    ))

-    # Or explicitly provide the API key
-    client = AdenCredentialClient(AdenClientConfig(
-        base_url="https://api.adenhq.com",
-        api_key="your-api-key",
-    ))
+    # List what's connected
+    for info in client.list_integrations():
+        print(f"{info.provider}/{info.alias}: {info.status}")

-    # Fetch a credential
-    response = client.get_credential("hubspot")
-    if response:
-        print(f"Token expires at: {response.expires_at}")
-
-    # Request a refresh
-    refreshed = client.request_refresh("hubspot")
+    # Get an access token (get_credential returns None if the integration is not found)
+    cred = client.get_credential(info.integration_id)
+    if cred:
+        print(cred.access_token)
 """

 from __future__ import annotations
@@ -88,8 +90,7 @@ class AdenClientConfig:
    """Base URL of the Aden server (e.g., 'https://api.adenhq.com')."""

    api_key: str | None = None
-    """Agent's API key for authenticating with Aden.
-    If not provided, loaded from ADEN_API_KEY environment variable."""
+    """Agent API key. Loaded from ADEN_API_KEY env var if not provided."""

    tenant_id: str | None = None
    """Optional tenant ID for multi-tenant deployments."""
@@ -104,7 +105,6 @@ class AdenClientConfig:
    """Base delay between retries in seconds (exponential backoff)."""

    def __post_init__(self) -> None:
-        """Load API key from environment if not provided."""
        if self.api_key is None:
            self.api_key = os.environ.get("ADEN_API_KEY")
            if not self.api_key:
@@ -115,71 +115,124 @@ class AdenClientConfig:


@dataclass
-class AdenCredentialResponse:
-    """Response from Aden server containing credential data."""
+class AdenIntegrationInfo:
+    """An integration from GET /v1/credentials.
+
+    Example response item::
+
+        {
+            "integration_id": "Z29vZ2xlOlRpbW90aHk6MTYwNjc6MTM2ODQ",
+            "provider": "google",
+            "alias": "Timothy",
+            "status": "active",
+            "email": "timothy@acho.io",
+            "expires_at": "2026-02-20T21:46:04.863Z"
+        }
+    """

    integration_id: str
-    """Unique identifier for the integration (e.g., 'hubspot')."""
+    """Base64-encoded hash ID assigned by Aden."""

-    integration_type: str
-    """Type of integration (e.g., 'hubspot', 'github', 'slack')."""
+    provider: str
+    """Provider type (e.g. "google", "slack", "hubspot")."""

-    access_token: str
-    """The access token for API calls."""
+    alias: str
+    """User-set alias on the Aden platform."""

-    token_type: str = "Bearer"
-    """Token type (usually 'Bearer')."""
+    status: str
+    """Status: "active", "expired", "requires_reauth"."""
+
+    email: str = ""
+    """Email associated with this connection."""

    expires_at: datetime | None = None
-    """When the access token expires (UTC)."""
+    """When the current access token expires."""

-    scopes: list[str] = field(default_factory=list)
-    """OAuth2 scopes granted to this token."""
-
-    metadata: dict[str, Any] = field(default_factory=dict)
-    """Additional integration-specific metadata."""
+    # Backward compat — old code reads integration_type
+    @property
+    def integration_type(self) -> str:
+        return self.provider

    @classmethod
-    def from_dict(
-        cls, data: dict[str, Any], integration_id: str | None = None
-    ) -> AdenCredentialResponse:
-        """Create from API response dictionary."""
+    def from_dict(cls, data: dict[str, Any]) -> AdenIntegrationInfo:
        expires_at = None
        if data.get("expires_at"):
            expires_at = datetime.fromisoformat(data["expires_at"].replace("Z", "+00:00"))

        return cls(
-            integration_id=integration_id or data.get("alias", data.get("provider", "")),
-            integration_type=data.get("provider", ""),
-            access_token=data["access_token"],
-            token_type=data.get("token_type", "Bearer"),
+            integration_id=data.get("integration_id", ""),
+            provider=data.get("provider", ""),
+            alias=data.get("alias", ""),
+            status=data.get("status", "unknown"),
+            email=data.get("email", ""),
            expires_at=expires_at,
-            scopes=data.get("scopes", []),
-            metadata={"email": data.get("email")} if data.get("email") else {},
        )


@dataclass
-class AdenIntegrationInfo:
-    """Information about an available integration."""
+class AdenCredentialResponse:
+    """Response from GET /v1/credentials/{integration_id}.
+
+    Example::
+
+        {
+            "access_token": "ya29.a0AfH6SM...",
+            "token_type": "Bearer",
+            "expires_at": "2026-02-20T12:00:00.000Z",
+            "provider": "google",
+            "alias": "Timothy",
+            "email": "timothy@acho.io"
+        }
+    """

    integration_id: str
-    integration_type: str
-    status: str  # "active", "requires_reauth", "expired"
+    """The integration_id used in the request."""
+
+    access_token: str
+    """Short-lived access token for API calls."""
+
+    token_type: str = "Bearer"
+
    expires_at: datetime | None = None

+    provider: str = ""
+    """Provider type (e.g. "google")."""
+
+    alias: str = ""
+    """User-set alias."""
+
+    email: str = ""
+    """Email associated with this connection."""
+
+    scopes: list[str] = field(default_factory=list)
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    # Backward compat
+    @property
+    def integration_type(self) -> str:
+        return self.provider
+
    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> AdenIntegrationInfo:
-        """Create from API response dictionary."""
+    def from_dict(cls, data: dict[str, Any], integration_id: str = "") -> AdenCredentialResponse:
        expires_at = None
        if data.get("expires_at"):
            expires_at = datetime.fromisoformat(data["expires_at"].replace("Z", "+00:00"))

+        # Build metadata from email if present
+        metadata = data.get("metadata") or {}
+        if not metadata and data.get("email"):
+            metadata = {"email": data["email"]}
+
        return cls(
-            integration_id=data["integration_id"],
-            integration_type=data.get("provider", data["integration_id"]),
-            status=data.get("status", "unknown"),
+            integration_id=integration_id or data.get("integration_id", ""),
+            access_token=data["access_token"],
+            token_type=data.get("token_type", "Bearer"),
            expires_at=expires_at,
+            provider=data.get("provider", ""),
+            alias=data.get("alias", ""),
+            email=data.get("email", ""),
+            scopes=data.get("scopes", []),
+            metadata=metadata,
        )


@@ -187,56 +240,33 @@ class AdenCredentialClient:
    """
    HTTP client for Aden credential server.

-    Handles communication with the Aden authentication server,
-    including fetching credentials, requesting refreshes, and
-    reporting usage statistics.
-
-    The client automatically handles:
-    - Retries with exponential backoff for transient failures
-    - Proper error classification (auth, not found, rate limit, etc.)
-    - Request headers for authentication and tenant isolation
-
    Usage:
-        # API key loaded from ADEN_API_KEY environment variable
-        config = AdenClientConfig(
+        client = AdenCredentialClient(AdenClientConfig(
            base_url="https://api.adenhq.com",
-        )
+        ))

-        client = AdenCredentialClient(config)
+        # List integrations
+        for info in client.list_integrations():
+            print(f"{info.provider}/{info.alias}: {info.status}")

-        # Fetch a credential
-        cred = client.get_credential("hubspot")
-        if cred:
-            headers = {"Authorization": f"Bearer {cred.access_token}"}
+        # Get access token (uses base64 integration_id, NOT provider name)
+        cred = client.get_credential(info.integration_id)
+        headers = {"Authorization": f"Bearer {cred.access_token}"}

-        # List all integrations
-        integrations = client.list_integrations()
-        for info in integrations:
-            print(f"{info.integration_id}: {info.status}")
-
-        # Clean up
        client.close()
    """

    def __init__(self, config: AdenClientConfig):
-        """
-        Initialize the Aden client.
-
-        Args:
-            config: Client configuration including base URL and API key.
-        """
        self.config = config
        self._client: httpx.Client | None = None

    def _get_client(self) -> httpx.Client:
-        """Get or create the HTTP client."""
        if self._client is None:
            headers = {
                "Authorization": f"Bearer {self.config.api_key}",
                "Content-Type": "application/json",
                "User-Agent": "hive-credential-store/1.0",
            }
-
            if self.config.tenant_id:
                headers["X-Tenant-ID"] = self.config.tenant_id

@@ -245,7 +275,6 @@ class AdenCredentialClient:
                timeout=self.config.timeout,
                headers=headers,
            )
-
        return self._client

    def _request_with_retry(
@@ -262,10 +291,13 @@ class AdenCredentialClient:
            try:
                response = client.request(method, path, **kwargs)

-                # Handle specific error codes
                if response.status_code == 401:
                    raise AdenAuthenticationError("Agent API key is invalid or revoked")

+                if response.status_code == 403:
+                    data = response.json()
+                    raise AdenClientError(data.get("message", "Forbidden"))
+
                if response.status_code == 404:
                    raise AdenNotFoundError(f"Integration not found: {path}")

@@ -278,14 +310,15 @@ class AdenCredentialClient:

                if response.status_code == 400:
                    data = response.json()
-                    if data.get("error") == "refresh_failed":
+                    msg = data.get("message", "Bad request")
+                    if data.get("error") == "refresh_failed" or "refresh" in msg.lower():
                        raise AdenRefreshError(
-                            data.get("message", "Token refresh failed"),
+                            msg,
                            requires_reauthorization=data.get("requires_reauthorization", False),
                            reauthorization_url=data.get("reauthorization_url"),
                        )
+                    raise AdenClientError(f"Bad request: {msg}")

-                # Success or other error
                response.raise_for_status()
                return response

@@ -306,30 +339,40 @@ class AdenCredentialClient:
                AdenRefreshError,
                AdenRateLimitError,
            ):
-                # Don't retry these errors
                raise

-        # Should not reach here, but just in case
        raise AdenClientError(
            f"Request failed after {self.config.retry_attempts} attempts"
        ) from last_error

-    def get_credential(self, integration_id: str) -> AdenCredentialResponse | None:
+    def list_integrations(self) -> list[AdenIntegrationInfo]:
        """
-        Fetch the current credential for an integration.
+        List all integrations for this agent's team.

-        The Aden server may refresh the token internally if it's expired
-        before returning it.
-
-        Args:
-            integration_id: The integration identifier (e.g., 'hubspot').
+        GET /v1/credentials → {"integrations": [...]}

        Returns:
-            Credential response with access token, or None if not found.
+            List of AdenIntegrationInfo with integration_id, provider,
+            alias, status, email, expires_at.
+        """
+        response = self._request_with_retry("GET", "/v1/credentials")
+        data = response.json()
+        return [AdenIntegrationInfo.from_dict(item) for item in data.get("integrations", [])]

-        Raises:
-            AdenAuthenticationError: If API key is invalid.
-            AdenClientError: For connection failures.
+    # Alias
+    list_connections = list_integrations
+
+    def get_credential(self, integration_id: str) -> AdenCredentialResponse | None:
+        """
+        Get access token for an integration. Auto-refreshes if near expiry.
+
+        GET /v1/credentials/{integration_id}
+
+        Args:
+            integration_id: Base64 hash ID from list_integrations().
+
+        Returns:
+            AdenCredentialResponse with access_token, or None if not found.
        """
        try:
            response = self._request_with_retry("GET", f"/v1/credentials/{integration_id}")
@@ -340,100 +383,34 @@ class AdenCredentialClient:

    def request_refresh(self, integration_id: str) -> AdenCredentialResponse:
        """
-        Request the Aden server to refresh the token.
+        Force refresh the access token.

-        Use this when the local store detects an expired or near-expiry token.
-        The Aden server handles the actual OAuth2 refresh token flow.
+        POST /v1/credentials/{integration_id}/refresh

        Args:
-            integration_id: The integration identifier.
+            integration_id: Base64 hash ID.

        Returns:
-            Credential response with new access token.
-
-        Raises:
-            AdenRefreshError: If refresh fails (may require re-authorization).
-            AdenNotFoundError: If integration not found.
-            AdenAuthenticationError: If API key is invalid.
-            AdenRateLimitError: If rate limited.
+            AdenCredentialResponse with new access_token.
        """
        response = self._request_with_retry("POST", f"/v1/credentials/{integration_id}/refresh")
        data = response.json()
        return AdenCredentialResponse.from_dict(data, integration_id=integration_id)

-    def list_integrations(self) -> list[AdenIntegrationInfo]:
-        """
-        List all integrations available for this agent/tenant.
-
-        Returns:
-            List of integration info objects.
-
-        Raises:
-            AdenAuthenticationError: If API key is invalid.
-            AdenClientError: For connection failures.
-        """
-        response = self._request_with_retry("GET", "/v1/credentials")
-        data = response.json()
-        return [AdenIntegrationInfo.from_dict(item) for item in data.get("integrations", [])]
-
    def validate_token(self, integration_id: str) -> dict[str, Any]:
        """
-        Check if a token is still valid without fetching it.
+        Check if an integration's OAuth connection is valid.

-        Args:
-            integration_id: The integration identifier.
+        GET /v1/credentials/{integration_id}/validate

        Returns:
-            Dict with 'valid' bool and optional 'expires_at', 'reason',
-            'requires_reauthorization', 'reauthorization_url'.
-
-        Raises:
-            AdenNotFoundError: If integration not found.
-            AdenAuthenticationError: If API key is invalid.
+            {"valid": bool, "status": str, "expires_at": str, "error": str|null}
        """
        response = self._request_with_retry("GET", f"/v1/credentials/{integration_id}/validate")
        return response.json()

-    def report_usage(
-        self,
-        integration_id: str,
-        operation: str,
-        status: str = "success",
-        metadata: dict[str, Any] | None = None,
-    ) -> None:
-        """
-        Report credential usage statistics to Aden.
-
-        This is optional and used for analytics/billing.
-
-        Args:
-            integration_id: The integration identifier.
-            operation: Operation name (e.g., 'api_call').
-            status: Operation status ('success', 'error').
-            metadata: Additional operation metadata.
-        """
-        try:
-            self._request_with_retry(
-                "POST",
-                f"/v1/credentials/{integration_id}/usage",
-                json={
-                    "operation": operation,
-                    "status": status,
-                    "timestamp": datetime.utcnow().isoformat() + "Z",
-                    "metadata": metadata or {},
-                },
-            )
-        except Exception as e:
-            # Usage reporting is best-effort, don't fail on errors
-            logger.warning(f"Failed to report usage for '{integration_id}': {e}")
-
    def health_check(self) -> dict[str, Any]:
-        """
-        Check Aden server health and connectivity.
-
-        Returns:
-            Dict with 'status', 'version', 'timestamp', and optionally 'error'.
-        """
+        """Check Aden server health."""
        try:
            client = self._get_client()
            response = client.get("/health")
@@ -441,26 +418,17 @@ class AdenCredentialClient:
                data = response.json()
                data["latency_ms"] = response.elapsed.total_seconds() * 1000
                return data
-            return {
-                "status": "degraded",
-                "error": f"Unexpected status code: {response.status_code}",
-            }
+            return {"status": "degraded", "error": f"HTTP {response.status_code}"}
        except Exception as e:
-            return {
-                "status": "unhealthy",
-                "error": str(e),
-            }
+            return {"status": "unhealthy", "error": str(e)}

    def close(self) -> None:
-        """Close the HTTP client and release resources."""
        if self._client:
            self._client.close()
            self._client = None

    def __enter__(self) -> AdenCredentialClient:
-        """Context manager entry."""
        return self

    def __exit__(self, *args: Any) -> None:
-        """Context manager exit."""
        self.close()
@@ -282,8 +282,8 @@ class AdenSyncProvider(CredentialProvider):
        """
        Sync all credentials from Aden server to local store.

-        Fetches the list of available integrations from Aden and
-        populates the local credential store with current tokens.
+        Calls GET /v1/credentials to list integrations, then fetches
+        access tokens for each active one.

        Args:
            store: The credential store to populate.
@@ -298,9 +298,7 @@ class AdenSyncProvider(CredentialProvider):

            for info in integrations:
                if info.status != "active":
-                    logger.warning(
-                        f"Skipping integration '{info.integration_id}': status={info.status}"
-                    )
+                    logger.warning(f"Skipping connection '{info.alias}': status={info.status}")
                    continue

                try:
@@ -308,9 +306,9 @@ class AdenSyncProvider(CredentialProvider):
                    if cred:
                        store.save_credential(cred)
                        synced += 1
-                        logger.info(f"Synced credential '{info.integration_id}' from Aden")
+                        logger.info(f"Synced credential '{info.alias}' from Aden")
                except Exception as e:
-                    logger.warning(f"Failed to sync '{info.integration_id}': {e}")
+                    logger.warning(f"Failed to sync '{info.alias}': {e}")

        except AdenClientError as e:
            logger.error(f"Failed to list integrations from Aden: {e}")
@@ -373,6 +371,21 @@ class AdenSyncProvider(CredentialProvider):
            value=SecretStr(aden_response.integration_type),
        )

+        # Store alias (user-set name from Aden platform)
+        if aden_response.alias:
+            credential.keys["_alias"] = CredentialKey(
+                name="_alias",
+                value=SecretStr(aden_response.alias),
+            )
+
+        # Persist Aden metadata as identity keys
+        for meta_key, meta_value in (aden_response.metadata or {}).items():
+            if meta_value and isinstance(meta_value, str):
+                credential.keys[f"_identity_{meta_key}"] = CredentialKey(
+                    name=f"_identity_{meta_key}",
+                    value=SecretStr(meta_value),
+                )
+
        # Update timestamps
        credential.last_refreshed = datetime.now(UTC)
        credential.provider_id = self.provider_id
@@ -400,12 +413,27 @@ class AdenSyncProvider(CredentialProvider):
            ),
        }

+        # Store alias (user-set name from Aden platform)
+        if aden_response.alias:
+            keys["_alias"] = CredentialKey(
+                name="_alias",
+                value=SecretStr(aden_response.alias),
+            )
+
        if aden_response.scopes:
            keys["scope"] = CredentialKey(
                name="scope",
                value=SecretStr(" ".join(aden_response.scopes)),
            )

+        # Persist Aden metadata as identity keys
+        for meta_key, meta_value in (aden_response.metadata or {}).items():
+            if meta_value and isinstance(meta_value, str):
+                keys[f"_identity_{meta_key}"] = CredentialKey(
+                    name=f"_identity_{meta_key}",
+                    value=SecretStr(meta_value),
+                )
+
        return CredentialObject(
            id=aden_response.integration_id,
            credential_type=CredentialType.OAUTH2,
@@ -114,8 +114,10 @@ class AdenCachedStorage(CredentialStorage):
        self._cache_ttl = timedelta(seconds=cache_ttl_seconds)
        self._prefer_local = prefer_local
        self._cache_timestamps: dict[str, datetime] = {}
-        # Index: provider name (e.g., "hubspot") -> credential hash ID
-        self._provider_index: dict[str, str] = {}
+        # Index: provider name (e.g., "hubspot") -> list of credential hash IDs
+        self._provider_index: dict[str, list[str]] = {}
+        # Index: "provider:alias" -> credential hash ID (for alias-based routing)
+        self._alias_index: dict[str, str] = {}

    def save(self, credential: CredentialObject) -> None:
        """
@@ -160,14 +162,16 @@ class AdenCachedStorage(CredentialStorage):
            CredentialObject if found, None otherwise.
        """
        # Check provider index first — Aden-synced credentials take priority
-        resolved_id = self._provider_index.get(credential_id)
-        if resolved_id and resolved_id != credential_id:
-            result = self._load_by_id(resolved_id)
-            if result is not None:
-                logger.info(
-                    f"Loaded credential '{credential_id}' via provider index (id='{resolved_id}')"
-                )
-                return result
+        resolved_ids = self._provider_index.get(credential_id)
+        if resolved_ids:
+            for rid in resolved_ids:
+                if rid != credential_id:
+                    result = self._load_by_id(rid)
+                    if result is not None:
+                        logger.info(
+                            f"Loaded credential '{credential_id}' via provider index (id='{rid}')"
+                        )
+                        return result

        # Direct lookup (exact credential_id match)
        return self._load_by_id(credential_id)
@@ -208,6 +212,22 @@ class AdenCachedStorage(CredentialStorage):
        # Return local credential if it exists (may be None)
        return local_cred

+    def load_all_for_provider(self, provider_name: str) -> list[CredentialObject]:
+        """Collect every loadable credential indexed under one provider.
+
+        Args:
+            provider_name: Provider name (e.g. "google", "slack").
+
+        Returns:
+            CredentialObjects for the provider's indexed IDs, in index order.
+            IDs for which ``_load_by_id`` returns a falsy value are left out;
+            an unindexed provider yields an empty list.
+        """
+        indexed_ids = self._provider_index.get(provider_name, [])
+        # Lazy generator: each ID is loaded once, in order, as the list is built.
+        candidates = (self._load_by_id(indexed_id) for indexed_id in indexed_ids)
+        return [credential for credential in candidates if credential]
+
    def delete(self, credential_id: str) -> bool:
        """
        Delete credential from local cache.
@@ -246,9 +266,11 @@ class AdenCachedStorage(CredentialStorage):
        if self._local.exists(credential_id):
            return True
        # Check provider index
-        resolved_id = self._provider_index.get(credential_id)
-        if resolved_id and resolved_id != credential_id:
-            return self._local.exists(resolved_id)
+        resolved_ids = self._provider_index.get(credential_id)
+        if resolved_ids:
+            for rid in resolved_ids:
+                if rid != credential_id and self._local.exists(rid):
+                    return True
        return False

    def _is_cache_fresh(self, credential_id: str) -> bool:
@@ -285,13 +307,15 @@ class AdenCachedStorage(CredentialStorage):

    def _index_provider(self, credential: CredentialObject) -> None:
        """
-        Index a credential by its provider/integration type.
+        Index a credential by its provider/integration type and alias.

        Aden credentials carry an ``_integration_type`` key whose value is
        the provider name (e.g., ``hubspot``).  This method maps that
        provider name to the credential's hash ID so that subsequent
        ``load("hubspot")`` calls resolve to the correct credential.

+        Also indexes by ``_alias`` for alias-based multi-account routing.
+
        Args:
            credential: The credential to index.
        """
@@ -300,19 +324,45 @@ class AdenCachedStorage(CredentialStorage):
            return
        provider_name = integration_type_key.value.get_secret_value()
        if provider_name:
-            self._provider_index[provider_name] = credential.id
+            if provider_name not in self._provider_index:
+                self._provider_index[provider_name] = []
+            if credential.id not in self._provider_index[provider_name]:
+                self._provider_index[provider_name].append(credential.id)
            logger.debug(f"Indexed provider '{provider_name}' -> '{credential.id}'")

+            # Index by alias for multi-account routing
+            alias_key = credential.keys.get("_alias")
+            if alias_key:
+                alias = alias_key.value.get_secret_value()
+                if alias:
+                    self._alias_index[f"{provider_name}:{alias}"] = credential.id
+
+    def load_by_alias(self, provider_name: str, alias: str) -> CredentialObject | None:
+        """Resolve a ``provider:alias`` pair through the alias index.
+
+        Args:
+            provider_name: Provider type (e.g. "google", "slack").
+            alias: User-set alias from the Aden platform.
+
+        Returns:
+            None when the pair is not in the alias index (or maps to an empty
+            ID); otherwise the result of ``_load_by_id`` for the indexed ID.
+        """
+        indexed_id = self._alias_index.get(f"{provider_name}:{alias}")
+        return self._load_by_id(indexed_id) if indexed_id else None
+
+
    def rebuild_provider_index(self) -> int:
        """
-        Rebuild the provider index from all locally cached credentials.
+        Rebuild the provider and alias indexes from all locally cached credentials.

-        Useful after loading from disk when the in-memory index is empty.
+        Useful after loading from disk when the in-memory indexes are empty.

        Returns:
            Number of provider mappings indexed.
        """
        self._provider_index.clear()
+        self._alias_index.clear()
        indexed = 0
        for cred_id in self._local.list_all():
            cred = self._local.load(cred_id)
@@ -328,8 +378,8 @@ class AdenCachedStorage(CredentialStorage):
        """
        Sync all credentials from Aden server to local cache.

-        Fetches the list of available integrations from Aden and
-        updates the local cache with current tokens.
+        Calls GET /v1/credentials to list active integrations,
+        then fetches tokens for each.

        Returns:
            Number of credentials synced.
@@ -341,9 +391,7 @@ class AdenCachedStorage(CredentialStorage):

            for info in integrations:
                if info.status != "active":
-                    logger.warning(
-                        f"Skipping integration '{info.integration_id}': status={info.status}"
-                    )
+                    logger.warning(f"Skipping integration '{info.alias}': status={info.status}")
                    continue

                try:
@@ -351,9 +399,9 @@ class AdenCachedStorage(CredentialStorage):
                    if cred:
                        self.save(cred)
                        synced += 1
-                        logger.info(f"Synced credential '{info.integration_id}' from Aden")
+                        logger.info(f"Synced credential '{info.alias}' from Aden")
                except Exception as e:
-                    logger.warning(f"Failed to sync '{info.integration_id}': {e}")
+                    logger.warning(f"Failed to sync '{info.alias}': {e}")

        except Exception as e:
            logger.error(f"Failed to list integrations from Aden: {e}")
@@ -61,11 +61,13 @@ def mock_client(aden_config):
 def aden_response():
    """Create a sample Aden credential response."""
    return AdenCredentialResponse(
-        integration_id="hubspot",
-        integration_type="hubspot",
+        integration_id="aHVic3BvdDp0ZXN0OjEzNjExOjExNTI1",
        access_token="test-access-token",
        token_type="Bearer",
        expires_at=datetime.now(UTC) + timedelta(hours=1),
+        provider="hubspot",
+        alias="My HubSpot",
+        email="test@example.com",
        scopes=["crm.objects.contacts.read", "crm.objects.contacts.write"],
        metadata={"portal_id": "12345"},
    )
@@ -108,18 +110,20 @@ class TestAdenCredentialResponse:
    """Tests for AdenCredentialResponse dataclass."""

    def test_from_dict_basic(self):
-        """Test creating response from dict."""
+        """Test creating response from dict (real get-token format)."""
        data = {
-            "integration_id": "github",
-            "integration_type": "github",
            "access_token": "ghp_xxxxx",
+            "token_type": "Bearer",
+            "provider": "github",
+            "alias": "Work",
        }

-        response = AdenCredentialResponse.from_dict(data)
+        response = AdenCredentialResponse.from_dict(data, integration_id="Z2l0aHViOldvcms6MTIzNDU")

-        assert response.integration_id == "github"
-        assert response.integration_type == "github"
+        assert response.integration_id == "Z2l0aHViOldvcms6MTIzNDU"
        assert response.access_token == "ghp_xxxxx"
+        assert response.provider == "github"
+        assert response.integration_type == "github"  # backward compat property
        assert response.token_type == "Bearer"
        assert response.expires_at is None
        assert response.scopes == []
@@ -127,19 +131,23 @@ class TestAdenCredentialResponse:
    def test_from_dict_full(self):
        """Test creating response with all fields."""
        data = {
-            "integration_id": "hubspot",
-            "integration_type": "hubspot",
            "access_token": "token123",
            "token_type": "Bearer",
            "expires_at": "2026-01-28T15:30:00Z",
+            "provider": "hubspot",
+            "alias": "My HubSpot",
+            "email": "test@example.com",
            "scopes": ["read", "write"],
            "metadata": {"key": "value"},
        }

-        response = AdenCredentialResponse.from_dict(data)
+        response = AdenCredentialResponse.from_dict(data, integration_id="aHVic3BvdDp0ZXN0")

-        assert response.integration_id == "hubspot"
+        assert response.integration_id == "aHVic3BvdDp0ZXN0"
        assert response.access_token == "token123"
+        assert response.provider == "hubspot"
+        assert response.alias == "My HubSpot"
+        assert response.email == "test@example.com"
        assert response.expires_at is not None
        assert response.scopes == ["read", "write"]
        assert response.metadata == {"key": "value"}
@@ -149,21 +157,44 @@ class TestAdenIntegrationInfo:
    """Tests for AdenIntegrationInfo dataclass."""

    def test_from_dict(self):
-        """Test creating integration info from dict."""
+        """Test creating integration info from real API format."""
        data = {
-            "integration_id": "slack",
-            "integration_type": "slack",
+            "integration_id": "c2xhY2s6V29yayBTbGFjazoxMjM0NQ",
+            "provider": "slack",
+            "alias": "Work Slack",
            "status": "active",
-            "expires_at": "2026-02-01T00:00:00Z",
+            "email": "user@example.com",
+            "expires_at": "2026-02-20T21:46:04.863Z",
        }

        info = AdenIntegrationInfo.from_dict(data)

-        assert info.integration_id == "slack"
-        assert info.integration_type == "slack"
+        assert info.integration_id == "c2xhY2s6V29yayBTbGFjazoxMjM0NQ"
+        assert info.provider == "slack"
+        assert info.integration_type == "slack"  # backward compat property
+        assert info.alias == "Work Slack"
+        assert info.email == "user@example.com"
        assert info.status == "active"
        assert info.expires_at is not None

+    def test_from_dict_minimal(self):
+        """A payload without email/expires_at parses to email == "" and expires_at None."""
+        payload = {
+            "integration_id": "Z29vZ2xlOlRpbW90aHk6MTYwNjc",
+            "provider": "google",
+            "alias": "Timothy",
+            "status": "requires_reauth",
+        }
+
+        parsed = AdenIntegrationInfo.from_dict(payload)
+
+        assert parsed.email == ""
+        assert parsed.expires_at is None
+        assert parsed.status == "requires_reauth"
+        assert parsed.alias == "Timothy"
+        assert parsed.provider == "google"
+        assert parsed.integration_id == "Z29vZ2xlOlRpbW90aHk6MTYwNjc"
+

 # =============================================================================
 # AdenSyncProvider Tests
@@ -220,10 +251,11 @@ class TestAdenSyncProvider:

    def test_refresh_success(self, provider, mock_client, aden_response):
        """Test successful credential refresh."""
+        hash_id = "aHVic3BvdDp0ZXN0OjEzNjExOjExNTI1"
        mock_client.request_refresh.return_value = aden_response

        cred = CredentialObject(
-            id="hubspot",
+            id=hash_id,
            credential_type=CredentialType.OAUTH2,
            keys={
                "access_token": CredentialKey(
@@ -239,7 +271,7 @@ class TestAdenSyncProvider:
        assert refreshed.keys["access_token"].value.get_secret_value() == "test-access-token"
        assert refreshed.keys["_aden_managed"].value.get_secret_value() == "true"
        assert refreshed.last_refreshed is not None
-        mock_client.request_refresh.assert_called_once_with("hubspot")
+        mock_client.request_refresh.assert_called_once_with(hash_id)

    def test_refresh_requires_reauth(self, provider, mock_client):
        """Test refresh that requires re-authorization."""
@@ -339,12 +371,13 @@ class TestAdenSyncProvider:

    def test_fetch_from_aden(self, provider, mock_client, aden_response):
        """Test fetching credential from Aden."""
+        hash_id = "aHVic3BvdDp0ZXN0OjEzNjExOjExNTI1"
        mock_client.get_credential.return_value = aden_response

-        cred = provider.fetch_from_aden("hubspot")
+        cred = provider.fetch_from_aden(hash_id)

        assert cred is not None
-        assert cred.id == "hubspot"
+        assert cred.id == hash_id
        assert cred.keys["access_token"].value.get_secret_value() == "test-access-token"
        assert cred.auto_refresh is True

@@ -360,13 +393,15 @@ class TestAdenSyncProvider:
        """Test syncing all credentials."""
        mock_client.list_integrations.return_value = [
            AdenIntegrationInfo(
-                integration_id="hubspot",
-                integration_type="hubspot",
+                integration_id="aHVic3BvdDp0ZXN0OjEzNjExOjExNTI1",
+                provider="hubspot",
+                alias="My HubSpot",
                status="active",
            ),
            AdenIntegrationInfo(
-                integration_id="github",
-                integration_type="github",
+                integration_id="Z2l0aHViOnRlc3Q6OTk5",
+                provider="github",
+                alias="Work GitHub",
                status="requires_reauth",  # Should be skipped
            ),
        ]
@@ -376,7 +411,7 @@ class TestAdenSyncProvider:
        synced = provider.sync_all(store)

        assert synced == 1  # Only active one was synced
-        assert store.get_credential("hubspot") is not None
+        assert store.get_credential("aHVic3BvdDp0ZXN0OjEzNjExOjExNTI1") is not None

    def test_validate_via_aden(self, provider, mock_client):
        """Test validation via Aden introspection."""
@@ -608,7 +643,7 @@ class TestAdenCachedStorage:

        cached_storage.save(cred)

-        assert cached_storage._provider_index["hubspot"] == "aHVic3BvdDp0ZXN0OjEzNjExOjExNTI1"
+        assert cached_storage._provider_index["hubspot"] == ["aHVic3BvdDp0ZXN0OjEzNjExOjExNTI1"]

    def test_load_by_provider_name(self, cached_storage):
        """Test load resolves provider name to hash-based credential ID."""
@@ -711,8 +746,8 @@ class TestAdenCachedStorage:
        indexed = cached_storage.rebuild_provider_index()

        assert indexed == 2
-        assert cached_storage._provider_index["hubspot"] == "hash_hub"
-        assert cached_storage._provider_index["slack"] == "hash_slack"
+        assert cached_storage._provider_index["hubspot"] == ["hash_hub"]
+        assert cached_storage._provider_index["slack"] == ["hash_slack"]

    def test_save_without_integration_type_no_index(self, cached_storage):
        """Test save does not index credentials without _integration_type key."""
@@ -743,19 +778,23 @@ class TestAdenIntegration:

    def test_full_workflow(self, mock_client, aden_response):
        """Test full workflow: sync, get, refresh."""
+        hash_id = "aHVic3BvdDp0ZXN0OjEzNjExOjExNTI1"
+
        # Setup
        mock_client.list_integrations.return_value = [
            AdenIntegrationInfo(
-                integration_id="hubspot",
-                integration_type="hubspot",
+                integration_id=hash_id,
+                provider="hubspot",
+                alias="My HubSpot",
                status="active",
            ),
        ]
        mock_client.get_credential.return_value = aden_response
        mock_client.request_refresh.return_value = AdenCredentialResponse(
-            integration_id="hubspot",
-            integration_type="hubspot",
+            integration_id=hash_id,
            access_token="refreshed-token",
+            provider="hubspot",
+            alias="My HubSpot",
            expires_at=datetime.now(UTC) + timedelta(hours=2),
            scopes=[],
        )
@@ -772,8 +811,8 @@ class TestAdenIntegration:
        synced = provider.sync_all(store)
        assert synced == 1

-        # Get credential
-        cred = store.get_credential("hubspot")
+        # Get credential by hash ID
+        cred = store.get_credential(hash_id)
        assert cred is not None
        assert cred.keys["access_token"].value.get_secret_value() == "test-access-token"

@@ -0,0 +1,31 @@
+"""
+Local credential registry — named API key accounts with identity metadata.
+
+Provides feature parity with Aden OAuth credentials for locally-stored API keys:
+aliases, identity metadata, status tracking, CRUD, and health validation.
+
+Usage:
+    from framework.credentials.local import LocalCredentialRegistry, LocalAccountInfo
+
+    registry = LocalCredentialRegistry.default()
+
+    # Add a named account
+    info, health = registry.save_account("brave_search", "work", "BSA-xxx")
+
+    # List all stored local accounts
+    for account in registry.list_accounts():
+        print(f"{account.credential_id}/{account.alias}: {account.status}")
+        if account.identity.is_known:
+            print(f"  Identity: {account.identity.label}")
+
+    # Re-validate a stored account
+    result = registry.validate_account("github", "personal")
+"""
+
+from .models import LocalAccountInfo
+from .registry import LocalCredentialRegistry
+
+__all__ = [
+    "LocalAccountInfo",
+    "LocalCredentialRegistry",
+]
@@ -0,0 +1,58 @@
+"""
+Data models for the local credential registry.
+
+LocalAccountInfo mirrors AdenIntegrationInfo, giving local API key credentials
+the same identity/status metadata as Aden OAuth credentials.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+
+from framework.credentials.models import CredentialIdentity
+
+
+@dataclass
+class LocalAccountInfo:
+    """
+    A locally-stored named credential account.
+
+    Mirrors AdenIntegrationInfo so local and Aden accounts can be treated
+    uniformly in the credential tester and account selection UI.
+
+    Attributes:
+        credential_id: The logical credential name (e.g. "brave_search", "github")
+        alias: User-provided name for this account (e.g. "work", "personal")
+        status: "active" | "failed" | "unknown"
+        identity: Email, username, workspace, or account_id extracted from health check
+        last_validated: When the key was last verified against the live API
+        created_at: When this account was first stored (timezone-aware UTC by default)
+    """
+
+    credential_id: str
+    alias: str
+    status: str = "unknown"
+    identity: CredentialIdentity = field(default_factory=CredentialIdentity)
+    last_validated: datetime | None = None
+    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+
+    @property
+    def storage_id(self) -> str:
+        """The key used in EncryptedFileStorage: '{credential_id}/{alias}'."""
+        return f"{self.credential_id}/{self.alias}"
+
+    def to_account_dict(self) -> dict:
+        """
+        Format compatible with AccountSelectionScreen and configure_for_account().
+
+        Same shape as Aden account dicts, with source='local' added.
+        """
+        return {
+            "provider": self.credential_id,
+            "alias": self.alias,
+            "identity": self.identity.to_dict(),
+            "integration_id": None,
+            "source": "local",
+            "status": self.status,
+        }
@@ -0,0 +1,326 @@
+"""
+Local Credential Registry.
+
+Manages named local API key accounts stored in EncryptedFileStorage.
+Mirrors the Aden integration model so local credentials have feature parity:
+aliases, identity metadata, status tracking, CRUD, and health validation.
+
+Storage convention:
+    {credential_id}/{alias}  →  CredentialObject
+    e.g. "brave_search/work" →  { api_key: "BSA-xxx", _alias: "work",
+                                   _integration_type: "brave_search",
+                                   _status: "active",
+                                   _identity_username: "acme", ... }
+
+Usage:
+    registry = LocalCredentialRegistry.default()
+
+    # Add a new account
+    info, health = registry.save_account("brave_search", "work", "BSA-xxx")
+    print(info.status, info.identity.label)
+
+    # List all accounts
+    for account in registry.list_accounts():
+        print(f"{account.credential_id}/{account.alias}: {account.status}")
+
+    # Get the raw API key for a specific account
+    key = registry.get_key("github", "personal")
+
+    # Re-validate a stored account
+    result = registry.validate_account("github", "personal")
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from framework.credentials.models import CredentialIdentity, CredentialObject
+from framework.credentials.storage import EncryptedFileStorage
+
+from .models import LocalAccountInfo
+
+if TYPE_CHECKING:
+    from aden_tools.credentials.health_check import HealthCheckResult
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Joins credential_id and alias into a storage id: "{credential_id}/{alias}".
+_SEPARATOR = "/"
+
+
+class LocalCredentialRegistry:
+    """
+    Named local API key account store backed by EncryptedFileStorage.
+
+    Provides the same list/save/get/delete/validate surface as the Aden
+    client, but for locally-stored API keys.
+    """
+
+    def __init__(self, storage: EncryptedFileStorage) -> None:
+        self._storage = storage
+
+    # ------------------------------------------------------------------
+    # Listing
+    # ------------------------------------------------------------------
+
+    def list_accounts(self, credential_id: str | None = None) -> list[LocalAccountInfo]:
+        """
+        List all stored local accounts.
+
+        Args:
+            credential_id: If given, filter to this credential type only.
+
+        Returns:
+            List of LocalAccountInfo sorted by credential_id then alias.
+        """
+        all_ids = self._storage.list_all()
+        accounts: list[LocalAccountInfo] = []
+
+        for storage_id in all_ids:
+            if _SEPARATOR not in storage_id:
+                continue  # Skip legacy un-aliased entries
+
+            try:
+                cred_obj = self._storage.load(storage_id)
+            except Exception as exc:
+                logger.debug("Skipping unreadable credential %s: %s", storage_id, exc)
+                continue
+
+            if cred_obj is None:
+                continue
+
+            info = self._to_account_info(cred_obj)
+            if info is None:
+                continue
+
+            if credential_id and info.credential_id != credential_id:
+                continue
+
+            accounts.append(info)
+
+        return sorted(accounts, key=lambda a: (a.credential_id, a.alias))
+
+    # ------------------------------------------------------------------
+    # Save / add
+    # ------------------------------------------------------------------
+
+    def save_account(
+        self,
+        credential_id: str,
+        alias: str,
+        api_key: str,
+        run_health_check: bool = True,
+        extra_keys: dict[str, str] | None = None,
+    ) -> tuple[LocalAccountInfo, HealthCheckResult | None]:
+        """
+        Store a named account, optionally validating it first.
+
+        Args:
+            credential_id: Logical credential name (e.g. "brave_search").
+            alias: User-chosen name (e.g. "work"). Defaults to "default".
+            api_key: The raw API key / token value.
+            run_health_check: If True, verify the key against the live API
+                and extract identity metadata. Failure still saves with
+                status="failed" so the user can re-validate later.
+            extra_keys: Additional key/value pairs to store (e.g.
+                cse_id for google_custom_search).
+
+        Returns:
+            (LocalAccountInfo, HealthCheckResult | None)
+        """
+        alias = alias or "default"
+        health_result: HealthCheckResult | None = None
+        identity: dict[str, str] = {}
+        status = "active"
+
+        if run_health_check:
+            try:
+                from aden_tools.credentials.health_check import check_credential_health
+
+                kwargs: dict[str, Any] = {}
+                if extra_keys and "cse_id" in extra_keys:
+                    kwargs["cse_id"] = extra_keys["cse_id"]
+
+                health_result = check_credential_health(credential_id, api_key, **kwargs)
+                status = "active" if health_result.valid else "failed"
+                identity = health_result.details.get("identity", {})
+            except Exception as exc:
+                logger.warning("Health check failed for %s/%s: %s", credential_id, alias, exc)
+                status = "unknown"
+
+        storage_id = f"{credential_id}{_SEPARATOR}{alias}"
+        now = datetime.now(UTC)
+
+        cred_obj = CredentialObject(id=storage_id)
+        cred_obj.set_key("api_key", api_key)
+        cred_obj.set_key("_alias", alias)
+        cred_obj.set_key("_integration_type", credential_id)
+        cred_obj.set_key("_status", status)
+
+        if extra_keys:
+            for k, v in extra_keys.items():
+                cred_obj.set_key(k, v)
+
+        if identity:
+            valid_fields = set(CredentialIdentity.model_fields)
+            filtered = {k: v for k, v in identity.items() if k in valid_fields}
+            if filtered:
+                cred_obj.set_identity(**filtered)
+
+        cred_obj.last_refreshed = now if run_health_check else None
+        self._storage.save(cred_obj)
+
+        account_info = LocalAccountInfo(
+            credential_id=credential_id,
+            alias=alias,
+            status=status,
+            identity=cred_obj.identity,
+            last_validated=cred_obj.last_refreshed,
+            created_at=cred_obj.created_at,
+        )
+        return account_info, health_result
+
+    # ------------------------------------------------------------------
+    # Get
+    # ------------------------------------------------------------------
+
+    def get_account(self, credential_id: str, alias: str) -> CredentialObject | None:
+        """Load the raw CredentialObject for a specific account."""
+        return self._storage.load(f"{credential_id}{_SEPARATOR}{alias}")
+
+    def get_key(self, credential_id: str, alias: str, key_name: str = "api_key") -> str | None:
+        """
+        Return the stored secret value for a specific account.
+
+        Args:
+            credential_id: Logical credential name (e.g. "brave_search").
+            alias: Account alias (e.g. "work").
+            key_name: Key within the credential (default "api_key").
+
+        Returns:
+            The secret value, or None if not found.
+        """
+        cred = self.get_account(credential_id, alias)
+        if cred is None:
+            return None
+        return cred.get_key(key_name)
+
+    def get_account_info(self, credential_id: str, alias: str) -> LocalAccountInfo | None:
+        """Load a LocalAccountInfo for a specific account."""
+        cred = self.get_account(credential_id, alias)
+        if cred is None:
+            return None
+        return self._to_account_info(cred)
+
+    # ------------------------------------------------------------------
+    # Delete
+    # ------------------------------------------------------------------
+
+    def delete_account(self, credential_id: str, alias: str) -> bool:
+        """
+        Remove a stored account.
+
+        Returns:
+            True if the account existed and was deleted, False otherwise.
+        """
+        return self._storage.delete(f"{credential_id}{_SEPARATOR}{alias}")
+
+    # ------------------------------------------------------------------
+    # Validate
+    # ------------------------------------------------------------------
+
+    def validate_account(self, credential_id: str, alias: str) -> HealthCheckResult:
+        """
+        Re-run health check for a stored account and update its status.
+
+        Args:
+            credential_id: Logical credential name.
+            alias: Account alias.
+
+        Returns:
+            HealthCheckResult from the live API check.
+
+        Raises:
+            KeyError: If the account doesn't exist.
+        """
+        from aden_tools.credentials.health_check import HealthCheckResult, check_credential_health
+
+        cred = self.get_account(credential_id, alias)
+        if cred is None:
+            raise KeyError(f"No local account found: {credential_id}/{alias}")
+
+        api_key = cred.get_key("api_key")
+        if not api_key:
+            return HealthCheckResult(valid=False, message="No api_key stored for this account")
+
+        try:
+            kwargs: dict[str, Any] = {}
+            cse_id = cred.get_key("cse_id")
+            if cse_id:
+                kwargs["cse_id"] = cse_id
+
+            result = check_credential_health(credential_id, api_key, **kwargs)
+        except Exception as exc:
+            result = HealthCheckResult(
+                valid=False,
+                message=f"Health check error: {exc}",
+                details={"error": str(exc)},
+            )
+
+        # Update status and timestamp in-place
+        new_status = "active" if result.valid else "failed"
+        cred.set_key("_status", new_status)
+        cred.last_refreshed = datetime.now(UTC)
+
+        # Re-extract identity if available
+        identity = result.details.get("identity", {})
+        if identity:
+            valid_fields = set(CredentialIdentity.model_fields)
+            filtered = {k: v for k, v in identity.items() if k in valid_fields}
+            if filtered:
+                cred.set_identity(**filtered)
+
+        self._storage.save(cred)
+        return result
+
+    # ------------------------------------------------------------------
+    # Factory
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def default(cls) -> LocalCredentialRegistry:
+        """Create a registry using the default encrypted storage at ~/.hive/credentials."""
+        return cls(EncryptedFileStorage())
+
+    @classmethod
+    def at_path(cls, path: str | Path) -> LocalCredentialRegistry:
+        """Create a registry using a custom storage path."""
+        return cls(EncryptedFileStorage(base_path=path))
+
+    # ------------------------------------------------------------------
+    # Internals
+    # ------------------------------------------------------------------
+
+    def _to_account_info(self, cred_obj: CredentialObject) -> LocalAccountInfo | None:
+        """Build LocalAccountInfo from a CredentialObject."""
+        cred_type_key = cred_obj.keys.get("_integration_type")
+        if cred_type_key is None:
+            return None
+        cred_id = cred_type_key.get_secret_value()
+
+        alias_key = cred_obj.keys.get("_alias")
+        alias = alias_key.get_secret_value() if alias_key else cred_obj.id.split(_SEPARATOR, 1)[-1]
+
+        status_key = cred_obj.keys.get("_status")
+        status = status_key.get_secret_value() if status_key else "unknown"
+
+        return LocalAccountInfo(
+            credential_id=cred_id,
+            alias=alias,
+            status=status,
+            identity=cred_obj.identity,
+            last_validated=cred_obj.last_refreshed,
+            created_at=cred_obj.created_at,
+        )
@@ -70,6 +70,29 @@ class CredentialKey(BaseModel):
        return self.value.get_secret_value()


+class CredentialIdentity(BaseModel):
+    """Identity information for a credential (whose account is this?)."""
+
+    email: str | None = None
+    username: str | None = None
+    workspace: str | None = None
+    account_id: str | None = None
+
+    @property
+    def label(self) -> str:
+        """Best human-readable identifier for display."""
+        return self.email or self.username or self.workspace or self.account_id or "unknown"
+
+    @property
+    def is_known(self) -> bool:
+        """Whether any identity field is populated."""
+        return bool(self.email or self.username or self.workspace or self.account_id)
+
+    def to_dict(self) -> dict[str, str]:
+        """Return only non-None identity fields."""
+        return {k: v for k, v in self.model_dump().items() if v is not None}
+
+
 class CredentialObject(BaseModel):
    """
    A credential object containing one or more keys.
@@ -202,6 +225,35 @@ class CredentialObject(BaseModel):

        return None

+    @property
+    def identity(self) -> CredentialIdentity:
+        """Extract identity from ``_identity_*`` keys in the vault."""
+        fields = {}
+        for key_name, key_obj in self.keys.items():
+            if key_name.startswith("_identity_"):
+                field_name = key_name[len("_identity_") :]
+                if field_name in CredentialIdentity.model_fields:
+                    fields[field_name] = key_obj.value.get_secret_value()
+        return CredentialIdentity(**fields)
+
+    @property
+    def provider_type(self) -> str | None:
+        """Return the integration/provider type (e.g. 'google', 'slack')."""
+        key = self.keys.get("_integration_type")
+        return key.value.get_secret_value() if key else None
+
+    @property
+    def alias(self) -> str | None:
+        """Return the user-set alias from the Aden platform."""
+        key = self.keys.get("_alias")
+        return key.value.get_secret_value() if key else None
+
+    def set_identity(self, **fields: str) -> None:
+        """Persist identity fields as ``_identity_*`` keys."""
+        for field_name, value in fields.items():
+            if value:
+                self.set_key(f"_identity_{field_name}", value)
+

 class CredentialUsageSpec(BaseModel):
    """
@@ -0,0 +1,744 @@
+"""
+Interactive credential setup for CLI applications.
+
+Provides a modular, reusable credential setup flow that can be triggered
+when validate_agent_credentials() fails. Works with both TUI and headless CLIs.
+
+Usage:
+    from framework.credentials.setup import CredentialSetupSession
+
+    # From agent path
+    session = CredentialSetupSession.from_agent_path("exports/my-agent")
+    result = session.run_interactive()
+
+    # From nodes directly
+    session = CredentialSetupSession.from_nodes(nodes)
+    result = session.run_interactive()
+
+    # With custom I/O (for integration with other UIs)
+    session = CredentialSetupSession(
+        missing=missing_creds,
+        input_fn=my_input,
+        print_fn=my_print,
+    )
+"""
+
+from __future__ import annotations
+
+import getpass
+import json
+import os
+import sys
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from framework.graph import NodeSpec
+
+
+# ANSI colors for terminal output
+class Colors:
+    RED = "\033[0;31m"
+    GREEN = "\033[0;32m"
+    YELLOW = "\033[1;33m"
+    BLUE = "\033[0;34m"
+    CYAN = "\033[0;36m"
+    BOLD = "\033[1m"
+    DIM = "\033[2m"
+    NC = "\033[0m"  # No Color
+
+    @classmethod
+    def disable(cls):
+        """Disable colors (for non-TTY output)."""
+        cls.RED = cls.GREEN = cls.YELLOW = cls.BLUE = ""
+        cls.CYAN = cls.BOLD = cls.DIM = cls.NC = ""
+
+
+@dataclass
+class MissingCredential:
+    """A credential that needs to be configured."""
+
+    credential_name: str
+    """Internal credential name (e.g., 'brave_search')"""
+
+    env_var: str
+    """Environment variable name (e.g., 'BRAVE_SEARCH_API_KEY')"""
+
+    description: str
+    """Human-readable description"""
+
+    help_url: str
+    """URL where user can obtain credential"""
+
+    api_key_instructions: str
+    """Step-by-step instructions for getting API key"""
+
+    tools: list[str] = field(default_factory=list)
+    """Tools that require this credential"""
+
+    node_types: list[str] = field(default_factory=list)
+    """Node types that require this credential"""
+
+    aden_supported: bool = False
+    """Whether Aden OAuth flow is supported"""
+
+    direct_api_key_supported: bool = True
+    """Whether direct API key entry is supported"""
+
+    credential_id: str = ""
+    """Credential store ID"""
+
+    credential_key: str = "api_key"
+    """Key name within the credential"""
+
+
+@dataclass
+class SetupResult:
+    """Result of credential setup session."""
+
+    success: bool
+    """Whether all required credentials were configured"""
+
+    configured: list[str] = field(default_factory=list)
+    """Credentials that were successfully set up"""
+
+    skipped: list[str] = field(default_factory=list)
+    """Credentials user chose to skip"""
+
+    errors: list[str] = field(default_factory=list)
+    """Any errors encountered"""
+
+
+class CredentialSetupSession:
+    """
+    Interactive credential setup session.
+
+    Can be used by any CLI (runner, coding agent, etc.) to guide users
+    through credential configuration when validation fails.
+
+    Example:
+        from framework.credentials.setup import CredentialSetupSession
+        from framework.credentials.models import CredentialError
+
+        try:
+            validate_agent_credentials(nodes)
+        except CredentialError:
+            session = CredentialSetupSession.from_nodes(nodes)
+            result = session.run_interactive()
+            if result.success:
+                # Retry - credentials are now configured
+                validate_agent_credentials(nodes)
+    """
+
+    def __init__(
+        self,
+        missing: list[MissingCredential],
+        input_fn: Callable[[str], str] | None = None,
+        print_fn: Callable[[str], None] | None = None,
+        password_fn: Callable[[str], str] | None = None,
+    ):
+        """
+        Initialize the setup session.
+
+        Args:
+            missing: List of credentials that need setup
+            input_fn: Custom input function (default: built-in input)
+            print_fn: Custom print function (default: built-in print)
+            password_fn: Custom password input function (default: getpass.getpass)
+        """
+        self.missing = missing
+        self.input_fn = input_fn or input
+        self.print_fn = print_fn or print
+        self.password_fn = password_fn or getpass.getpass
+
+        # Disable colors if not a TTY
+        if not sys.stdout.isatty():
+            Colors.disable()
+
+    @classmethod
+    def from_nodes(cls, nodes: list[NodeSpec]) -> CredentialSetupSession:
+        """Create a setup session by detecting missing credentials from nodes."""
+        missing = detect_missing_credentials_from_nodes(nodes)
+        return cls(missing)
+
+    @classmethod
+    def from_agent_path(cls, agent_path: str | Path) -> CredentialSetupSession:
+        """Create a setup session for an agent by path."""
+        agent_path = Path(agent_path)
+
+        # Load agent to get nodes
+        agent_json = agent_path / "agent.json"
+        agent_py = agent_path / "agent.py"
+
+        nodes = []
+        if agent_py.exists():
+            # Python-based agent
+            nodes = _load_nodes_from_python_agent(agent_path)
+        elif agent_json.exists():
+            # JSON-based agent
+            nodes = _load_nodes_from_json_agent(agent_json)
+
+        missing = detect_missing_credentials_from_nodes(nodes)
+        return cls(missing)
+
+    def run_interactive(self) -> SetupResult:
+        """Run the interactive setup flow."""
+        configured: list[str] = []
+        skipped: list[str] = []
+        errors: list[str] = []
+
+        if not self.missing:
+            self._print(f"\n{Colors.GREEN}✓ All credentials are already configured!{Colors.NC}\n")
+            return SetupResult(success=True)
+
+        self._print_header()
+
+        # Ensure HIVE_CREDENTIAL_KEY is set before storing anything
+        if not self._ensure_credential_key():
+            return SetupResult(
+                success=False,
+                errors=["Failed to initialize credential store encryption key"],
+            )
+
+        for cred in self.missing:
+            try:
+                result = self._setup_single_credential(cred)
+                if result:
+                    configured.append(cred.credential_name)
+                else:
+                    skipped.append(cred.credential_name)
+            except KeyboardInterrupt:
+                self._print(f"\n{Colors.YELLOW}Setup interrupted.{Colors.NC}")
+                skipped.append(cred.credential_name)
+                break
+            except Exception as e:
+                errors.append(f"{cred.credential_name}: {e}")
+
+        self._print_summary(configured, skipped, errors)
+
+        return SetupResult(
+            success=len(errors) == 0 and len(skipped) == 0,
+            configured=configured,
+            skipped=skipped,
+            errors=errors,
+        )
+
+    def _print(self, msg: str) -> None:
+        """Print a message."""
+        self.print_fn(msg)
+
+    def _input(self, prompt: str) -> str:
+        """Get input from user."""
+        return self.input_fn(prompt)
+
+    def _print_header(self) -> None:
+        """Print the setup header."""
+        self._print("")
+        self._print(f"{Colors.YELLOW}{'=' * 60}{Colors.NC}")
+        self._print(f"{Colors.BOLD}  CREDENTIAL SETUP{Colors.NC}")
+        self._print(f"{Colors.YELLOW}{'=' * 60}{Colors.NC}")
+        self._print("")
+        self._print(f"  {len(self.missing)} credential(s) need to be configured:")
+        for cred in self.missing:
+            affected = cred.tools or cred.node_types
+            self._print(f"    • {cred.env_var} ({', '.join(affected)})")
+        self._print("")
+
+    def _ensure_credential_key(self) -> bool:
+        """Ensure HIVE_CREDENTIAL_KEY is available for encrypted storage."""
+        if os.environ.get("HIVE_CREDENTIAL_KEY"):
+            return True
+
+        # Try to load from shell config
+        try:
+            from aden_tools.credentials.shell_config import check_env_var_in_shell_config
+
+            found, value = check_env_var_in_shell_config("HIVE_CREDENTIAL_KEY")
+            if found and value:
+                os.environ["HIVE_CREDENTIAL_KEY"] = value
+                return True
+        except ImportError:
+            pass
+
+        # Generate a new key
+        self._print(f"{Colors.YELLOW}Initializing credential store...{Colors.NC}")
+        try:
+            from cryptography.fernet import Fernet
+
+            generated_key = Fernet.generate_key().decode()
+            os.environ["HIVE_CREDENTIAL_KEY"] = generated_key
+
+            # Save to shell config
+            self._save_key_to_shell_config(generated_key)
+            return True
+        except Exception as e:
+            self._print(f"{Colors.RED}Failed to initialize credential store: {e}{Colors.NC}")
+            return False
+
+    def _save_key_to_shell_config(self, key: str) -> None:
+        """Save HIVE_CREDENTIAL_KEY to shell config."""
+        try:
+            from aden_tools.credentials.shell_config import (
+                add_env_var_to_shell_config,
+            )
+
+            success, config_path = add_env_var_to_shell_config(
+                "HIVE_CREDENTIAL_KEY",
+                key,
+                comment="Encryption key for Hive credential store",
+            )
+            if success:
+                self._print(f"{Colors.GREEN}✓ Encryption key saved to {config_path}{Colors.NC}")
+        except Exception:
+            # Fallback: just tell the user
+            self._print("\n")
+            self._print(
+                f"{Colors.YELLOW}Add this to your shell config (~/.zshrc or ~/.bashrc):{Colors.NC}"
+            )
+            self._print(f'  export HIVE_CREDENTIAL_KEY="{key}"')
+
+    def _setup_single_credential(self, cred: MissingCredential) -> bool:
+        """Set up a single credential. Returns True if configured."""
+        self._print(f"\n{Colors.CYAN}{'─' * 60}{Colors.NC}")
+        self._print(f"{Colors.BOLD}Setting up: {cred.credential_name}{Colors.NC}")
+        affected = cred.tools or cred.node_types
+        self._print(f"{Colors.DIM}Required for: {', '.join(affected)}{Colors.NC}")
+        if cred.description:
+            self._print(f"{Colors.DIM}{cred.description}{Colors.NC}")
+        self._print(f"{Colors.CYAN}{'─' * 60}{Colors.NC}")
+
+        # Show auth options
+        options = self._get_auth_options(cred)
+        choice = self._prompt_choice(options)
+
+        if choice == "skip":
+            return False
+        elif choice == "aden":
+            return self._setup_via_aden(cred)
+        elif choice == "direct":
+            return self._setup_direct_api_key(cred)
+
+        return False
+
+    def _get_auth_options(self, cred: MissingCredential) -> list[tuple[str, str, str]]:
+        """Get available auth options as (key, label, description) tuples."""
+        options = []
+
+        if cred.direct_api_key_supported:
+            options.append(
+                (
+                    "direct",
+                    "Enter API key directly",
+                    "Paste your API key from the provider's dashboard",
+                )
+            )
+
+        if cred.aden_supported:
+            options.append(
+                (
+                    "aden",
+                    "Use Aden Platform (OAuth)",
+                    "Secure OAuth2 flow via hive.adenhq.com",
+                )
+            )
+
+        options.append(
+            (
+                "skip",
+                "Skip for now",
+                "Configure this credential later",
+            )
+        )
+
+        return options
+
+    def _prompt_choice(self, options: list[tuple[str, str, str]]) -> str:
+        """Prompt user to choose from options."""
+        self._print("")
+        for i, (key, label, desc) in enumerate(options, 1):
+            if key == "skip":
+                self._print(f"  {Colors.DIM}{i}) {label}{Colors.NC}")
+            else:
+                self._print(f"  {Colors.CYAN}{i}){Colors.NC} {label}")
+                self._print(f"     {Colors.DIM}{desc}{Colors.NC}")
+        self._print("")
+
+        while True:
+            try:
+                choice_str = self._input(f"Select option (1-{len(options)}): ").strip()
+                if not choice_str:
+                    continue
+                choice_num = int(choice_str)
+                if 1 <= choice_num <= len(options):
+                    return options[choice_num - 1][0]
+            except ValueError:
+                pass
+            self._print(f"{Colors.RED}Invalid choice. Enter 1-{len(options)}{Colors.NC}")
+
+    def _setup_direct_api_key(self, cred: MissingCredential) -> bool:
+        """Guide user through direct API key setup."""
+        # Show instructions
+        if cred.api_key_instructions:
+            self._print(f"\n{Colors.BOLD}Setup Instructions:{Colors.NC}")
+            self._print(cred.api_key_instructions)
+
+        if cred.help_url:
+            self._print(f"\n{Colors.CYAN}Get your API key at:{Colors.NC} {cred.help_url}")
+
+        # Collect key (use password input to hide the value)
+        self._print("")
+        try:
+            api_key = self.password_fn(f"Paste your {cred.env_var}: ").strip()
+        except Exception:
+            # Fallback to regular input if password input fails
+            api_key = self._input(f"Paste your {cred.env_var}: ").strip()
+
+        if not api_key:
+            self._print(f"{Colors.YELLOW}No value entered. Skipping.{Colors.NC}")
+            return False
+
+        # Health check
+        health_result = self._run_health_check(cred, api_key)
+        if health_result is not None:
+            if health_result["valid"]:
+                self._print(f"{Colors.GREEN}✓ {health_result['message']}{Colors.NC}")
+            else:
+                self._print(f"{Colors.YELLOW}⚠ {health_result['message']}{Colors.NC}")
+                confirm = self._input("Continue anyway? [y/N]: ").strip().lower()
+                if confirm != "y":
+                    return False
+
+        # Store credential
+        self._store_credential(cred, api_key)
+        return True
+
+    def _setup_via_aden(self, cred: MissingCredential) -> bool:
+        """Guide user through Aden OAuth flow."""
+        self._print(f"\n{Colors.BOLD}Aden Platform Setup{Colors.NC}")
+        self._print("This will sync credentials from your Aden account.")
+        self._print("")
+
+        # Check for ADEN_API_KEY
+        aden_key = os.environ.get("ADEN_API_KEY")
+        if not aden_key:
+            self._print("You need an Aden API key to use this method.")
+            self._print(f"{Colors.CYAN}Get one at:{Colors.NC} https://hive.adenhq.com")
+            self._print("")
+
+            try:
+                aden_key = self.password_fn("Paste your ADEN_API_KEY: ").strip()
+            except Exception:
+                aden_key = self._input("Paste your ADEN_API_KEY: ").strip()
+
+            if not aden_key:
+                self._print(f"{Colors.YELLOW}No key entered. Skipping.{Colors.NC}")
+                return False
+
+            os.environ["ADEN_API_KEY"] = aden_key
+
+            # Save to shell config
+            try:
+                from aden_tools.credentials.shell_config import add_env_var_to_shell_config
+
+                add_env_var_to_shell_config(
+                    "ADEN_API_KEY",
+                    aden_key,
+                    comment="Aden Platform API key",
+                )
+            except Exception:
+                pass
+
+        # Sync from Aden
+        try:
+            from framework.credentials import CredentialStore
+
+            store = CredentialStore.with_aden_sync(
+                base_url="https://api.adenhq.com",
+                auto_sync=True,
+            )
+
+            # Check if the credential was synced
+            cred_id = cred.credential_id or cred.credential_name
+            if store.is_available(cred_id):
+                self._print(f"{Colors.GREEN}✓ {cred.credential_name} synced from Aden{Colors.NC}")
+                # Export to current session
+                try:
+                    value = store.get_key(cred_id, cred.credential_key)
+                    if value:
+                        os.environ[cred.env_var] = value
+                except Exception:
+                    pass
+                return True
+            else:
+                self._print(
+                    f"{Colors.YELLOW}⚠ {cred.credential_name} not found in Aden account.{Colors.NC}"
+                )
+                self._print("Please connect this integration on https://hive.adenhq.com first.")
+                return False
+        except Exception as e:
+            self._print(f"{Colors.RED}Failed to sync from Aden: {e}{Colors.NC}")
+            return False
+
+    def _run_health_check(self, cred: MissingCredential, value: str) -> dict[str, Any] | None:
+        """Probe ``value`` with the ``aden_tools`` credential health checker.
+
+        Args:
+            cred: Credential being configured; only ``credential_name`` is read.
+            value: Candidate secret to check.
+
+        Returns:
+            A dict with ``valid``, ``message`` and ``details`` copied from the
+            checker's result, or ``None`` when importing or running the checker
+            raises any exception.
+        """
+        try:
+            from aden_tools.credentials import check_credential_health
+
+            outcome = check_credential_health(cred.credential_name, value)
+            report: dict[str, Any] = {}
+            for field_name in ("valid", "message", "details"):
+                report[field_name] = getattr(outcome, field_name)
+        except Exception:
+            # Import failure and checker failure are both reported as "no result".
+            return None
+        return report
+
+    def _store_credential(self, cred: MissingCredential, value: str) -> None:
+        """Persist ``value`` to the encrypted store, then export it to the environment.
+
+        A failure while saving is reported through ``self._print`` and does not
+        stop the export: ``os.environ[cred.env_var]`` is always set afterwards.
+
+        Args:
+            cred: Credential metadata (id, name, key name, description, env var).
+            value: The secret to store and export.
+        """
+        from pydantic import SecretStr
+
+        from framework.credentials import CredentialKey, CredentialObject, CredentialStore
+
+        try:
+            vault = CredentialStore.with_encrypted_storage()
+            identifier = cred.credential_id or cred.credential_name
+            slot = cred.credential_key or "api_key"
+            display_name = cred.description or cred.credential_name
+            secret_entry = CredentialKey(name=slot, value=SecretStr(value))
+            vault.save_credential(
+                CredentialObject(id=identifier, name=display_name, keys={slot: secret_entry})
+            )
+            self._print(f"{Colors.GREEN}✓ Stored in ~/.hive/credentials/{Colors.NC}")
+        except Exception as e:
+            self._print(f"{Colors.YELLOW}⚠ Could not store in credential store: {e}{Colors.NC}")
+
+        # The running process gets the value regardless of the store outcome.
+        os.environ[cred.env_var] = value
+        self._print(f"{Colors.GREEN}✓ Exported to current session{Colors.NC}")
+
+    def _print_summary(self, configured: list[str], skipped: list[str], errors: list[str]) -> None:
+        """Print the closing banner and a per-category recap.
+
+        Args:
+            configured: Names of credentials that were set up.
+            skipped: Names of credentials the user skipped.
+            errors: Error descriptions collected during setup.
+        """
+        rule = f"{Colors.YELLOW}{'=' * 60}{Colors.NC}"
+        self._print("")
+        self._print(rule)
+        self._print(f"{Colors.BOLD}  SETUP COMPLETE{Colors.NC}")
+        self._print(rule)
+
+        # Each non-empty category gets a heading followed by one bullet per entry.
+        sections = (
+            (configured, f"\n{Colors.GREEN}✓ Configured:{Colors.NC}"),
+            (skipped, f"\n{Colors.YELLOW}⏭ Skipped:{Colors.NC}"),
+            (errors, f"\n{Colors.RED}✗ Errors:{Colors.NC}"),
+        )
+        for entries, heading in sections:
+            if not entries:
+                continue
+            self._print(heading)
+            for entry in entries:
+                self._print(f"    • {entry}")
+
+        if skipped:
+            self._print(f"\n{Colors.YELLOW}Note: Skipped credentials must be configured ")
+            self._print(f"before running the agent.{Colors.NC}")
+        elif not errors:
+            self._print(f"\n{Colors.GREEN}All credentials configured successfully!{Colors.NC}")
+
+        self._print("")
+
+
+def detect_missing_credentials_from_nodes(nodes: list) -> list[MissingCredential]:
+    """
+    Work out which required credentials the given nodes still lack.
+
+    Args:
+        nodes: Node objects; each one's ``tools`` and ``node_type`` attributes
+            are read when present.
+
+    Returns:
+        One MissingCredential per required-but-unavailable credential: tool
+        credentials first, then node-type credentials. An empty list is
+        returned when the credential packages cannot be imported.
+    """
+    try:
+        from aden_tools.credentials import CREDENTIAL_SPECS
+
+        from framework.credentials import CredentialStore
+        from framework.credentials.storage import (
+            CompositeStorage,
+            EncryptedFileStorage,
+            EnvVarStorage,
+        )
+    except ImportError:
+        return []
+
+    # Gather every tool name and node type referenced by the nodes.
+    tools_in_use: set[str] = set()
+    types_in_use: set[str] = set()
+    for node in nodes:
+        node_tools = getattr(node, "tools", None)
+        if node_tools:
+            tools_in_use.update(node_tools)
+        if hasattr(node, "node_type"):
+            types_in_use.add(node.node_type)
+
+    # Availability store: env vars are the primary source; the encrypted file
+    # store is only consulted as a fallback when HIVE_CREDENTIAL_KEY is set.
+    env_storage = EnvVarStorage(
+        env_mapping={
+            (spec.credential_id or name): spec.env_var
+            for name, spec in CREDENTIAL_SPECS.items()
+        }
+    )
+    storage = env_storage
+    if os.environ.get("HIVE_CREDENTIAL_KEY"):
+        storage = CompositeStorage(primary=env_storage, fallbacks=[EncryptedFileStorage()])
+    store = CredentialStore(storage=storage)
+
+    # Reverse lookups: tool name / node type -> owning credential name.
+    cred_for_tool: dict[str, str] = {
+        tool: name for name, spec in CREDENTIAL_SPECS.items() for tool in spec.tools
+    }
+    cred_for_type: dict[str, str] = {
+        nt: name for name, spec in CREDENTIAL_SPECS.items() for nt in spec.node_types
+    }
+
+    missing: list[MissingCredential] = []
+    seen: set[str] = set()
+
+    def _visit(cred_name, in_use, field_name):
+        """Record ``cred_name`` as missing if it is required and unavailable."""
+        if cred_name is None or cred_name in seen:
+            return
+        seen.add(cred_name)
+
+        spec = CREDENTIAL_SPECS[cred_name]
+        if not spec.required or store.is_available(spec.credential_id or cred_name):
+            return
+
+        # Only the tools / node types this graph actually uses are reported.
+        affected = sorted(item for item in in_use if item in getattr(spec, field_name))
+        missing.append(
+            MissingCredential(
+                credential_name=cred_name,
+                env_var=spec.env_var,
+                description=spec.description,
+                help_url=spec.help_url,
+                api_key_instructions=spec.api_key_instructions,
+                aden_supported=spec.aden_supported,
+                direct_api_key_supported=spec.direct_api_key_supported,
+                credential_id=spec.credential_id,
+                credential_key=spec.credential_key,
+                **{field_name: affected},
+            )
+        )
+
+    for tool_name in sorted(tools_in_use):
+        _visit(cred_for_tool.get(tool_name), tools_in_use, "tools")
+
+    for node_type in sorted(types_in_use):
+        _visit(cred_for_type.get(node_type), types_in_use, "node_types")
+
+    return missing
+
+
+def _load_nodes_from_python_agent(agent_path: Path) -> list:
+    """Load the ``nodes`` attribute exported by ``<agent_path>/agent.py``.
+
+    The file is executed as a module named ``"<dir name>.agent"``. The agent
+    directory and its parent are prepended to ``sys.path`` (and left there) so
+    the agent's own imports resolve.
+
+    Args:
+        agent_path: Directory expected to contain ``agent.py``.
+
+    Returns:
+        The module's ``nodes`` attribute, or ``[]`` when ``agent.py`` is absent,
+        has no ``nodes`` attribute, or cannot be loaded. Load errors are
+        swallowed on purpose: callers treat this as best-effort discovery.
+    """
+    import importlib.util
+
+    agent_py = agent_path / "agent.py"
+    if not agent_py.exists():
+        return []
+
+    module_name = f"{agent_path.name}.agent"
+    # Remember any earlier registration so a failed load can be rolled back.
+    not_registered = object()
+    previous = sys.modules.get(module_name, not_registered)
+
+    try:
+        # Add agent path and its parent to sys.path so imports work
+        for p in (str(agent_path), str(agent_path.parent)):
+            if p not in sys.path:
+                sys.path.insert(0, p)
+
+        spec = importlib.util.spec_from_file_location(
+            module_name,
+            agent_py,
+            submodule_search_locations=[str(agent_path)],
+        )
+        if spec is None or spec.loader is None:
+            return []
+
+        module = importlib.util.module_from_spec(spec)
+        # Registered before execution so the module can be found by name
+        # while its own body is running.
+        sys.modules[module_name] = module
+        spec.loader.exec_module(module)
+        return getattr(module, "nodes", [])
+    except Exception:
+        # Do not leave a half-initialised module behind for later imports.
+        if previous is not_registered:
+            sys.modules.pop(module_name, None)
+        else:
+            sys.modules[module_name] = previous
+        return []
+
+
+def _load_nodes_from_json_agent(agent_json: Path) -> list:
+    """Build ``NodeSpec`` objects from the ``graph.nodes`` array of an agent JSON file.
+
+    Args:
+        agent_json: Path to the agent's JSON definition.
+
+    Returns:
+        One ``NodeSpec`` per entry in ``graph.nodes``; fields missing from an
+        entry fall back to ``""`` or ``[]``. Returns ``[]`` when the file cannot
+        be read or parsed, ``framework.graph`` cannot be imported, or a node
+        cannot be constructed (best-effort discovery).
+    """
+    try:
+        # JSON interchange text is UTF-8; do not depend on the locale's
+        # default encoding, which differs between platforms.
+        with open(agent_json, encoding="utf-8") as f:
+            data = json.load(f)
+
+        from framework.graph import NodeSpec
+
+        nodes_data = data.get("graph", {}).get("nodes", [])
+        return [
+            NodeSpec(
+                id=node_data.get("id", ""),
+                name=node_data.get("name", ""),
+                description=node_data.get("description", ""),
+                node_type=node_data.get("node_type", ""),
+                tools=node_data.get("tools", []),
+                input_keys=node_data.get("input_keys", []),
+                output_keys=node_data.get("output_keys", []),
+            )
+            for node_data in nodes_data
+        ]
+    except Exception:
+        return []
+
+
+def run_credential_setup_cli(agent_path: str | Path | None = None) -> int:
+    """
+    Run the interactive credential setup for one agent and return an exit code.
+
+    Intended call sites:
+    - `hive setup-credentials <agent>`
+    - After CredentialError in runner CLI
+    - From coding agent CLI
+
+    Args:
+        agent_path: Agent directory; when omitted or empty a usage line is printed.
+
+    Returns:
+        0 when the interactive session reports success, otherwise 1
+        (including the case where no agent path was given).
+    """
+    if not agent_path:
+        # Nothing to inspect without an agent directory.
+        print("Usage: hive setup-credentials <agent_path>")
+        return 1
+
+    outcome = CredentialSetupSession.from_agent_path(agent_path).run_interactive()
+    if outcome.success:
+        return 0
+    return 1
@@ -362,6 +362,54 @@ class CredentialStore:
        """
        return self._storage.list_all()

+    def list_accounts(self, provider_name: str) -> list[dict[str, Any]]:
+        """Describe every stored account for one provider type.
+
+        Storage backends exposing ``load_all_for_provider`` are asked for all
+        matching credentials; otherwise the single credential whose id equals
+        ``provider_name`` is used, if it exists.
+
+        Args:
+            provider_name: Provider type name (e.g. "google", "slack").
+
+        Returns:
+            One dict per credential with the keys ``credential_id``,
+            ``provider``, ``alias`` and ``identity``.
+        """
+        backend = self._storage
+        if hasattr(backend, "load_all_for_provider"):
+            matches = backend.load_all_for_provider(provider_name)
+        else:
+            single = self.get_credential(provider_name)
+            matches = [single] if single else []
+
+        accounts: list[dict[str, Any]] = []
+        for entry in matches:
+            accounts.append(
+                {
+                    "credential_id": entry.id,
+                    "provider": provider_name,
+                    "alias": entry.alias,
+                    "identity": entry.identity.to_dict(),
+                }
+            )
+        return accounts
+
+    def get_credential_by_alias(self, provider_name: str, alias: str) -> CredentialObject | None:
+        """Look up the credential of ``provider_name`` that carries ``alias``.
+
+        Args:
+            provider_name: Provider type name (e.g. "google").
+            alias: User-set alias from the Aden platform.
+
+        Returns:
+            The matching CredentialObject, or None when nothing matches or the
+            storage backend supports neither alias lookup nor provider listing.
+        """
+        backend = self._storage
+
+        # Preferred path: the backend resolves the alias itself.
+        if hasattr(backend, "load_by_alias"):
+            return backend.load_by_alias(provider_name, alias)
+
+        # Otherwise scan the provider's credentials, if the backend can list them.
+        if not hasattr(backend, "load_all_for_provider"):
+            return None
+        candidates = backend.load_all_for_provider(provider_name)
+        return next((c for c in candidates if c.alias == alias), None)
+
+    def get_credential_by_identity(self, provider_name: str, label: str) -> CredentialObject | None:
+        """Backward-compatible name for ``get_credential_by_alias``.
+
+        ``label`` is passed through unchanged as the alias to look up.
+        """
+        match = self.get_credential_by_alias(provider_name, label)
+        return match
+
    def is_available(self, credential_id: str) -> bool:
        """
        Check if a credential is available.
@@ -0,0 +1,351 @@
+"""Credential validation utilities.
+
+Provides reusable credential validation for agents, whether run through
+the AgentRunner or directly via GraphExecutor.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+def ensure_credential_key_env() -> None:
+    """Fill in missing credential env vars from the user's shell config.
+
+    quickstart.sh and the setup-credentials skill write API keys to ~/.zshrc
+    or ~/.bashrc. When the current shell has not sourced that file yet, the
+    values are read directly so the runner (and any MCP subprocesses) see them.
+
+    Variables considered, in order:
+    - HIVE_CREDENTIAL_KEY (encrypted credential store)
+    - ADEN_API_KEY (Aden OAuth sync)
+    - every ``env_var`` declared in CREDENTIAL_SPECS, when that table is importable
+
+    A variable that already has a non-empty value in ``os.environ`` is never
+    overwritten. Does nothing if the shell-config helper cannot be imported.
+    """
+    try:
+        from aden_tools.credentials.shell_config import check_env_var_in_shell_config
+    except ImportError:
+        return
+
+    # Insertion-ordered, duplicate-free collection of variable names.
+    wanted = dict.fromkeys(["HIVE_CREDENTIAL_KEY", "ADEN_API_KEY"])
+
+    try:
+        from aden_tools.credentials import CREDENTIAL_SPECS
+
+        for spec in CREDENTIAL_SPECS.values():
+            if spec.env_var:
+                wanted.setdefault(spec.env_var)
+    except ImportError:
+        pass
+
+    for var_name in wanted:
+        if not os.environ.get(var_name):
+            found, value = check_env_var_in_shell_config(var_name)
+            if not (found and value):
+                continue
+            os.environ[var_name] = value
+            logger.debug("Loaded %s from shell config", var_name)
+
+
+@dataclass
+class _CredentialCheck:
+    """Result of checking a single credential.
+
+    NOTE(review): nothing in the visible part of this module constructs or
+    reads this class, so the field notes below are inferred from the names
+    only and should be confirmed against the callers.
+    """
+
+    # presumably the environment variable that carries the credential — TODO confirm
+    env_var: str
+    # presumably where the value was found (env, store, ...) — TODO confirm
+    source: str
+    # presumably a label of the tools / node types needing it — TODO confirm
+    used_by: str
+    # whether the credential was found
+    available: bool
+    # optional help link; defaults to an empty string
+    help_url: str = ""
+
+
+def _presync_aden_tokens(credential_specs: dict) -> None:
+    """Export Aden-provided values into env vars ahead of validation.
+
+    Builds a ``CredentialStore`` with Aden sync enabled and, for every spec
+    that is ``aden_supported`` and whose env var is currently unset or empty,
+    copies the value of the spec's ``credential_key`` into that env var.
+    Env vars that already hold a value are left alone, so explicit user
+    exports always win.
+
+    Failures are logged as warnings and never raised: if the store cannot be
+    built nothing is synced, and a failing credential is simply skipped.
+
+    Args:
+        credential_specs: Mapping of credential name to its spec object.
+    """
+    from framework.credentials.store import CredentialStore
+
+    try:
+        aden_store = CredentialStore.with_aden_sync(auto_sync=True)
+    except Exception as e:
+        logger.warning("Aden pre-sync unavailable: %s", e)
+        return
+
+    for name, spec in credential_specs.items():
+        # Skip specs Aden cannot serve and env vars that are already set.
+        if not spec.aden_supported or os.environ.get(spec.env_var):
+            continue
+
+        cred_id = spec.credential_id or name
+        try:
+            token = aden_store.get_key(cred_id, spec.credential_key)
+            if not token:
+                logger.warning(
+                    "Pre-sync: %s (id=%s) available but key '%s' returned None",
+                    spec.env_var,
+                    cred_id,
+                    spec.credential_key,
+                )
+                continue
+            os.environ[spec.env_var] = token
+            logger.debug("Pre-synced %s from Aden", spec.env_var)
+        except Exception as e:
+            logger.warning(
+                "Pre-sync failed for %s (id=%s): %s",
+                spec.env_var,
+                cred_id,
+                e,
+            )
+
+
+def validate_agent_credentials(nodes: list, quiet: bool = False, verify: bool = True) -> None:
+    """Check that required credentials are available and valid before running an agent.
+
+    Two-phase validation:
+    1. **Presence** — is the credential set (env var, encrypted store, or Aden sync)?
+    2. **Health check** — does the credential actually work? Uses each tool's
+       registered ``check_credential_health`` endpoint (lightweight HTTP call).
+
+    Only credentials whose spec is marked ``required`` are checked. If
+    ``aden_tools`` cannot be imported the function returns without checking
+    anything.
+
+    Args:
+        nodes: List of NodeSpec objects from the agent graph.
+        quiet: If True, suppress the credential summary output.
+            NOTE(review): this parameter is not read anywhere in the function
+            body; no summary is printed either way.
+        verify: If True (default), run health checks on present credentials.
+
+    Raises:
+        CredentialError: When any credential is missing, fails its health
+            check, or is an Aden-only integration that is not connected. The
+            exception carries a ``failed_cred_names`` list holding the names of
+            the missing and invalid credentials (not-connected Aden
+            integrations are reported in the message but not added to it).
+    """
+    # Collect required tools and node types
+    required_tools = {tool for node in nodes if node.tools for tool in node.tools}
+    node_types = {node.node_type for node in nodes}
+
+    try:
+        from aden_tools.credentials import CREDENTIAL_SPECS
+    except ImportError:
+        return  # aden_tools not installed, skip check
+
+    from framework.credentials.storage import CompositeStorage, EncryptedFileStorage, EnvVarStorage
+    from framework.credentials.store import CredentialStore
+
+    # Build credential store.
+    # Env vars take priority — if a user explicitly exports a fresh key it
+    # must win over a potentially stale value in the encrypted store.
+    #
+    # Pre-sync: when ADEN_API_KEY is available, sync OAuth tokens from Aden
+    # into env vars so validation sees fresh tokens instead of stale values
+    # in the encrypted store (e.g., a previously mis-stored google.enc).
+    if os.environ.get("ADEN_API_KEY"):
+        _presync_aden_tokens(CREDENTIAL_SPECS)
+
+    env_mapping = {
+        (spec.credential_id or name): spec.env_var for name, spec in CREDENTIAL_SPECS.items()
+    }
+    env_storage = EnvVarStorage(env_mapping=env_mapping)
+    # The encrypted file store is only added as a fallback when its key is set.
+    if os.environ.get("HIVE_CREDENTIAL_KEY"):
+        storage = CompositeStorage(primary=env_storage, fallbacks=[EncryptedFileStorage()])
+    else:
+        storage = env_storage
+    store = CredentialStore(storage=storage)
+
+    # Build reverse mappings
+    # (if two specs list the same tool or node type, the later spec wins)
+    tool_to_cred: dict[str, str] = {}
+    node_type_to_cred: dict[str, str] = {}
+    for cred_name, spec in CREDENTIAL_SPECS.items():
+        for tool_name in spec.tools:
+            tool_to_cred[tool_name] = cred_name
+        for nt in spec.node_types:
+            node_type_to_cred[nt] = cred_name
+
+    missing: list[str] = []
+    invalid: list[str] = []
+    # Aden-backed creds where ADEN_API_KEY is set but integration not connected
+    aden_not_connected: list[str] = []
+    failed_cred_names: list[str] = []  # all cred names that need (re-)collection
+    has_aden_key = bool(os.environ.get("ADEN_API_KEY"))
+    # Credential names already examined; shared by the tool and node-type
+    # passes so each credential is checked at most once.
+    checked: set[str] = set()
+    # Credentials that are present and should be health-checked
+    to_verify: list[tuple[str, str]] = []  # (cred_name, used_by_label)
+
+    def _check_credential(spec, cred_name: str, label: str) -> None:
+        # Phase 1 for one credential: sort it into missing / aden_not_connected,
+        # or queue it for the health check when it is present.
+        cred_id = spec.credential_id or cred_name
+        if not store.is_available(cred_id):
+            # If ADEN_API_KEY is set and this is an Aden-only credential,
+            # the issue is that the integration isn't connected on hive.adenhq.com,
+            # NOT that the user needs to re-enter ADEN_API_KEY.
+            if has_aden_key and spec.aden_supported and not spec.direct_api_key_supported:
+                aden_not_connected.append(
+                    f"  {spec.env_var} for {label}"
+                    f"\n    Connect this integration at hive.adenhq.com first."
+                )
+            else:
+                entry = f"  {spec.env_var} for {label}"
+                if spec.help_url:
+                    entry += f"\n    Get it at: {spec.help_url}"
+                missing.append(entry)
+                failed_cred_names.append(cred_name)
+        elif verify and spec.health_check_endpoint:
+            to_verify.append((cred_name, label))
+
+    # Check tool credentials
+    for tool_name in sorted(required_tools):
+        cred_name = tool_to_cred.get(tool_name)
+        if cred_name is None or cred_name in checked:
+            continue
+        checked.add(cred_name)
+        spec = CREDENTIAL_SPECS[cred_name]
+        if not spec.required:
+            continue
+        affected = sorted(t for t in required_tools if t in spec.tools)
+        label = ", ".join(affected)
+        _check_credential(spec, cred_name, label)
+
+    # Check node type credentials (e.g., ANTHROPIC_API_KEY for LLM nodes)
+    for nt in sorted(node_types):
+        cred_name = node_type_to_cred.get(nt)
+        if cred_name is None or cred_name in checked:
+            continue
+        checked.add(cred_name)
+        spec = CREDENTIAL_SPECS[cred_name]
+        if not spec.required:
+            continue
+        affected_types = sorted(t for t in node_types if t in spec.node_types)
+        label = ", ".join(affected_types) + " nodes"
+        _check_credential(spec, cred_name, label)
+
+    # Phase 2: health-check present credentials
+    if to_verify:
+        try:
+            from aden_tools.credentials import check_credential_health
+        except ImportError:
+            # No health checker available: phase 2 is skipped entirely.
+            check_credential_health = None  # type: ignore[assignment]
+
+        if check_credential_health is not None:
+            for cred_name, label in to_verify:
+                spec = CREDENTIAL_SPECS[cred_name]
+                cred_id = spec.credential_id or cred_name
+                value = store.get(cred_id)
+                if not value:
+                    continue
+                try:
+                    result = check_credential_health(
+                        cred_name,
+                        value,
+                        health_check_endpoint=spec.health_check_endpoint,
+                        health_check_method=spec.health_check_method,
+                    )
+                    if not result.valid:
+                        entry = f"  {spec.env_var} for {label} — {result.message}"
+                        if spec.help_url:
+                            entry += f"\n    Get a new key at: {spec.help_url}"
+                        invalid.append(entry)
+                        failed_cred_names.append(cred_name)
+                    elif result.valid:
+                        # Persist identity from health check (best-effort)
+                        identity_data = result.details.get("identity")
+                        if identity_data and isinstance(identity_data, dict):
+                            try:
+                                cred_obj = store.get_credential(cred_id, refresh_if_needed=False)
+                                if cred_obj:
+                                    cred_obj.set_identity(**identity_data)
+                                    store.save_credential(cred_obj)
+                            except Exception:
+                                pass  # Identity persistence is best-effort
+                except Exception as exc:
+                    # A checker that raises does not mark the credential
+                    # invalid; the failure is only logged at debug level.
+                    logger.debug("Health check for %s failed: %s", cred_name, exc)
+
+    errors = missing + invalid + aden_not_connected
+    if errors:
+        from framework.credentials.models import CredentialError
+
+        # Assemble one message with a section per failure category,
+        # separated by blank lines.
+        lines: list[str] = []
+        if missing:
+            lines.append("Missing credentials:\n")
+            lines.extend(missing)
+        if invalid:
+            if missing:
+                lines.append("")
+            lines.append("Invalid or expired credentials:\n")
+            lines.extend(invalid)
+        if aden_not_connected:
+            if missing or invalid:
+                lines.append("")
+            lines.append(
+                "Aden integrations not connected "
+                "(ADEN_API_KEY is set but OAuth tokens unavailable):\n"
+            )
+            lines.extend(aden_not_connected)
+        lines.append(
+            "\nTo fix: run /hive-credentials in Claude Code."
+            "\nIf you've already set up credentials, "
+            "restart your terminal to load them."
+        )
+        exc = CredentialError("\n".join(lines))
+        # Attached so callers can tell which credentials need (re-)collection.
+        exc.failed_cred_names = failed_cred_names  # type: ignore[attr-defined]
+        raise exc
+
+
+def build_setup_session_from_error(
+    credential_error: Exception,
+    nodes: list | None = None,
+    agent_path: str | None = None,
+):
+    """Create a ``CredentialSetupSession`` covering every failed credential.
+
+    The session is first built by the usual detection (``from_nodes`` when
+    ``nodes`` is given, else ``from_agent_path``, else an empty session).
+    Names listed in the error's ``failed_cred_names`` attribute that detection
+    did not already report are then appended as ``MissingCredential`` entries,
+    so credentials that are present but invalid get re-collected too.
+
+    When the attribute is absent or empty, or ``aden_tools`` cannot be
+    imported, the detected session is returned unchanged.
+
+    Args:
+        credential_error: The ``CredentialError`` raised by validation.
+        nodes: Graph nodes (preferred — avoids re-loading from disk).
+        agent_path: Agent directory path (used when nodes aren't available).
+
+    Returns:
+        The populated ``CredentialSetupSession``.
+    """
+    from framework.credentials.setup import CredentialSetupSession, MissingCredential
+
+    # Baseline: whatever ordinary detection reports as missing.
+    if nodes is not None:
+        session = CredentialSetupSession.from_nodes(nodes)
+    elif agent_path is not None:
+        session = CredentialSetupSession.from_agent_path(agent_path)
+    else:
+        session = CredentialSetupSession(missing=[])
+
+    # Snapshot of names detection already covers (taken before any appends).
+    known = {entry.credential_name for entry in session.missing}
+    failed_names: list[str] = getattr(credential_error, "failed_cred_names", [])
+    if not failed_names:
+        return session
+
+    try:
+        from aden_tools.credentials import CREDENTIAL_SPECS
+
+        for name in failed_names:
+            spec = None if name in known else CREDENTIAL_SPECS.get(name)
+            if spec is None:
+                continue
+            extra = MissingCredential(
+                credential_name=name,
+                env_var=spec.env_var,
+                description=spec.description,
+                help_url=spec.help_url,
+                api_key_instructions=spec.api_key_instructions,
+                tools=list(spec.tools),
+                aden_supported=spec.aden_supported,
+                direct_api_key_supported=spec.direct_api_key_supported,
+                credential_id=spec.credential_id,
+                credential_key=spec.credential_key,
+            )
+            session.missing.append(extra)
+    except ImportError:
+        pass
+
+    return session
@@ -1,4 +1,4 @@
-"""Graph structures: Goals, Nodes, Edges, and Flexible Execution."""
+"""Graph structures: Goals, Nodes, Edges, and Execution."""

 from framework.graph.client_io import (
    ActiveNodeClientIO,
@@ -6,10 +6,9 @@ from framework.graph.client_io import (
    InertNodeClientIO,
    NodeClientIO,
 )
-from framework.graph.code_sandbox import CodeSandbox, safe_eval, safe_exec
 from framework.graph.context_handoff import ContextHandoff, HandoffContext
 from framework.graph.conversation import ConversationStore, Message, NodeConversation
-from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
+from framework.graph.edge import DEFAULT_MAX_TOKENS, EdgeCondition, EdgeSpec, GraphSpec
 from framework.graph.event_loop_node import (
    EventLoopNode,
    JudgeProtocol,
@@ -18,31 +17,9 @@ from framework.graph.event_loop_node import (
    OutputAccumulator,
 )
 from framework.graph.executor import GraphExecutor
-from framework.graph.flexible_executor import ExecutorConfig, FlexibleGraphExecutor
 from framework.graph.goal import Constraint, Goal, GoalStatus, SuccessCriterion
-from framework.graph.judge import HybridJudge, create_default_judge
 from framework.graph.node import NodeContext, NodeProtocol, NodeResult, NodeSpec

-# Flexible execution (Worker-Judge pattern)
-from framework.graph.plan import (
-    ActionSpec,
-    ActionType,
-    # HITL (Human-in-the-loop)
-    ApprovalDecision,
-    ApprovalRequest,
-    ApprovalResult,
-    EvaluationRule,
-    ExecutionStatus,
-    Judgment,
-    JudgmentAction,
-    Plan,
-    PlanExecutionResult,
-    PlanStep,
-    StepStatus,
-    load_export,
-)
-from framework.graph.worker_node import StepExecutionResult, WorkerNode
-
 __all__ = [
    # Goal
    "Goal",
@@ -58,35 +35,9 @@ __all__ = [
    "EdgeSpec",
    "EdgeCondition",
    "GraphSpec",
-    # Executor (fixed graph)
+    "DEFAULT_MAX_TOKENS",
+    # Executor
    "GraphExecutor",
-    # Plan (flexible execution)
-    "Plan",
-    "PlanStep",
-    "ActionSpec",
-    "ActionType",
-    "StepStatus",
-    "Judgment",
-    "JudgmentAction",
-    "EvaluationRule",
-    "PlanExecutionResult",
-    "ExecutionStatus",
-    "load_export",
-    # HITL (Human-in-the-loop)
-    "ApprovalDecision",
-    "ApprovalRequest",
-    "ApprovalResult",
-    # Worker-Judge
-    "HybridJudge",
-    "create_default_judge",
-    "WorkerNode",
-    "StepExecutionResult",
-    "FlexibleGraphExecutor",
-    "ExecutorConfig",
-    # Code Sandbox
-    "CodeSandbox",
-    "safe_exec",
-    "safe_eval",
    # Conversation
    "NodeConversation",
    "ConversationStore",
@@ -0,0 +1,85 @@
+"""
+Checkpoint Configuration - Controls checkpoint behavior during execution.
+"""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class CheckpointConfig:
+    """
+    Settings that govern checkpointing while a graph executes.
+
+    Covers whether checkpoints are taken at all, at which node boundaries,
+    how often pruning is considered, and what each checkpoint contains.
+    """
+
+    # Master switch; when False every ``should_*`` helper answers False
+    enabled: bool = True
+
+    # Node boundaries at which a checkpoint is taken
+    checkpoint_on_node_start: bool = True
+    checkpoint_on_node_complete: bool = True
+
+    # Time-based pruning
+    checkpoint_max_age_days: int = 7  # Checkpoints older than this are prunable
+    prune_every_n_nodes: int = 10  # Pruning cadence, in executed nodes
+
+    # Performance
+    async_checkpoint: bool = True  # Don't block execution on checkpoint writes
+
+    # Checkpoint contents
+    include_full_memory: bool = True
+    include_metrics: bool = True
+
+    def should_checkpoint_node_start(self) -> bool:
+        """Tell whether a checkpoint is wanted before a node runs."""
+        if not self.enabled:
+            return False
+        return self.checkpoint_on_node_start
+
+    def should_checkpoint_node_complete(self) -> bool:
+        """Tell whether a checkpoint is wanted after a node finishes."""
+        if not self.enabled:
+            return False
+        return self.checkpoint_on_node_complete
+
+    def should_prune_checkpoints(self, nodes_executed: int) -> bool:
+        """
+        Tell whether this is a point at which old checkpoints should be pruned.
+
+        Args:
+            nodes_executed: Number of nodes executed so far
+
+        Returns:
+            True when checkpointing is enabled, the pruning interval is
+            positive, and ``nodes_executed`` is an exact multiple of it
+        """
+        interval = self.prune_every_n_nodes
+        if not self.enabled or interval <= 0:
+            return False
+        return nodes_executed % interval == 0
+
+
+# Standard preset: checkpoint at both node boundaries, prune check every 10 nodes
+DEFAULT_CHECKPOINT_CONFIG = CheckpointConfig(
+    enabled=True,
+    async_checkpoint=True,
+    checkpoint_on_node_start=True,
+    checkpoint_on_node_complete=True,
+    prune_every_n_nodes=10,
+    checkpoint_max_age_days=7,
+)
+
+
+# Lean preset: checkpoint only after a node completes, prune check every 20 nodes
+MINIMAL_CHECKPOINT_CONFIG = CheckpointConfig(
+    enabled=True,
+    async_checkpoint=True,
+    checkpoint_on_node_start=False,
+    checkpoint_on_node_complete=True,
+    prune_every_n_nodes=20,
+    checkpoint_max_age_days=7,
+)
+
+
+# Off preset: checkpointing disabled
+DISABLED_CHECKPOINT_CONFIG = CheckpointConfig(enabled=False)
@@ -1,413 +0,0 @@
-"""
-Code Sandbox for Safe Execution of Dynamic Code.
-
-Provides a restricted execution environment for code generated by
-the external planner. This is critical for open-ended planning where
-the planner can create arbitrary code actions.
-
-Security measures:
-1. Restricted builtins (no file I/O, no imports of dangerous modules)
-2. Timeout enforcement
-3. Memory limits (via resource module on Unix)
-4. Namespace isolation
-"""
-
-import ast
-import signal
-import sys
-from contextlib import contextmanager
-from dataclasses import dataclass, field
-from typing import Any
-
-# Safe builtins whitelist
-SAFE_BUILTINS = {
-    # Basic types
-    "True": True,
-    "False": False,
-    "None": None,
-    # Type constructors
-    "bool": bool,
-    "int": int,
-    "float": float,
-    "str": str,
-    "list": list,
-    "dict": dict,
-    "set": set,
-    "tuple": tuple,
-    "frozenset": frozenset,
-    # Basic functions
-    "abs": abs,
-    "all": all,
-    "any": any,
-    "bin": bin,
-    "chr": chr,
-    "divmod": divmod,
-    "enumerate": enumerate,
-    "filter": filter,
-    "format": format,
-    "hex": hex,
-    "isinstance": isinstance,
-    "issubclass": issubclass,
-    "iter": iter,
-    "len": len,
-    "map": map,
-    "max": max,
-    "min": min,
-    "next": next,
-    "oct": oct,
-    "ord": ord,
-    "pow": pow,
-    "range": range,
-    "repr": repr,
-    "reversed": reversed,
-    "round": round,
-    "slice": slice,
-    "sorted": sorted,
-    "sum": sum,
-    "zip": zip,
-}
-
-# Modules that can be imported
-ALLOWED_MODULES = {
-    "math",
-    "json",
-    "re",
-    "datetime",
-    "collections",
-    "itertools",
-    "functools",
-    "operator",
-    "string",
-    "random",
-    "statistics",
-    "decimal",
-    "fractions",
-}
-
-# Dangerous AST nodes to block
-BLOCKED_AST_NODES = {
-    ast.Import,
-    ast.ImportFrom,
-    ast.Global,
-    ast.Nonlocal,
-}
-
-
-class CodeSandboxError(Exception):
-    """Error during sandboxed code execution."""
-
-    pass
-
-
-class TimeoutError(CodeSandboxError):
-    """Code execution timed out."""
-
-    pass
-
-
-class SecurityError(CodeSandboxError):
-    """Code contains potentially dangerous operations."""
-
-    pass
-
-
-@dataclass
-class SandboxResult:
-    """Result of sandboxed code execution."""
-
-    success: bool
-    result: Any = None
-    error: str | None = None
-    stdout: str = ""
-    variables: dict[str, Any] = field(default_factory=dict)
-    execution_time_ms: int = 0
-
-
-class RestrictedImporter:
-    """Custom importer that only allows whitelisted modules."""
-
-    def __init__(self, allowed_modules: set[str]):
-        self.allowed_modules = allowed_modules
-        self._cache: dict[str, Any] = {}
-
-    def __call__(self, name: str, *args, **kwargs):
-        if name not in self.allowed_modules:
-            raise SecurityError(f"Import of module '{name}' is not allowed")
-
-        if name not in self._cache:
-            import importlib
-
-            self._cache[name] = importlib.import_module(name)
-
-        return self._cache[name]
-
-
-class CodeValidator:
-    """Validates code for safety before execution."""
-
-    def __init__(self, blocked_nodes: set[type] | None = None):
-        self.blocked_nodes = blocked_nodes or BLOCKED_AST_NODES
-
-    def validate(self, code: str) -> list[str]:
-        """
-        Validate code and return list of issues.
-
-        Returns empty list if code is safe.
-        """
-        issues = []
-
-        try:
-            tree = ast.parse(code)
-        except SyntaxError as e:
-            return [f"Syntax error: {e}"]
-
-        for node in ast.walk(tree):
-            # Check for blocked node types
-            if type(node) in self.blocked_nodes:
-                lineno = getattr(node, "lineno", "?")
-                issues.append(f"Blocked operation: {type(node).__name__} at line {lineno}")
-
-            # Check for dangerous attribute access
-            if isinstance(node, ast.Attribute):
-                if node.attr.startswith("_"):
-                    issues.append(
-                        f"Access to private attribute '{node.attr}' at line {node.lineno}"
-                    )
-
-            # Check for exec/eval calls
-            if isinstance(node, ast.Call):
-                if isinstance(node.func, ast.Name):
-                    if node.func.id in ("exec", "eval", "compile", "__import__"):
-                        issues.append(
-                            f"Blocked function call: {node.func.id} at line {node.lineno}"
-                        )
-
-        return issues
-
-
-class CodeSandbox:
-    """
-    Sandboxed environment for executing dynamic code.
-
-    Usage:
-        sandbox = CodeSandbox(timeout_seconds=5)
-        result = sandbox.execute(
-            code="x = 1 + 2\\nresult = x * 3",
-            inputs={"multiplier": 2},
-        )
-        if result.success:
-            print(result.variables["result"])  # 6
-    """
-
-    def __init__(
-        self,
-        timeout_seconds: int = 10,
-        allowed_modules: set[str] | None = None,
-        safe_builtins: dict[str, Any] | None = None,
-    ):
-        self.timeout_seconds = timeout_seconds
-        self.allowed_modules = allowed_modules or ALLOWED_MODULES
-        self.safe_builtins = safe_builtins or SAFE_BUILTINS
-        self.validator = CodeValidator()
-        self.importer = RestrictedImporter(self.allowed_modules)
-
-    @contextmanager
-    def _timeout_context(self, seconds: int):
-        """Context manager for timeout enforcement."""
-
-        def handler(signum, frame):
-            raise TimeoutError(f"Code execution timed out after {seconds} seconds")
-
-        # Only works on Unix-like systems
-        if hasattr(signal, "SIGALRM"):
-            old_handler = signal.signal(signal.SIGALRM, handler)
-            signal.alarm(seconds)
-            try:
-                yield
-            finally:
-                signal.alarm(0)
-                signal.signal(signal.SIGALRM, old_handler)
-        else:
-            # Windows: no timeout support, just execute
-            yield
-
-    def _create_namespace(self, inputs: dict[str, Any]) -> dict[str, Any]:
-        """Create isolated namespace for code execution."""
-        namespace = {
-            "__builtins__": dict(self.safe_builtins),
-            "__import__": self.importer,
-        }
-
-        # Add input variables
-        namespace.update(inputs)
-
-        return namespace
-
-    def execute(
-        self,
-        code: str,
-        inputs: dict[str, Any] | None = None,
-        extract_vars: list[str] | None = None,
-    ) -> SandboxResult:
-        """
-        Execute code in sandbox.
-
-        Args:
-            code: Python code to execute
-            inputs: Variables to inject into namespace
-            extract_vars: Variable names to extract from namespace after execution
-
-        Returns:
-            SandboxResult with execution outcome
-        """
-        import time
-
-        inputs = inputs or {}
-        extract_vars = extract_vars or []
-
-        # Validate code first
-        issues = self.validator.validate(code)
-        if issues:
-            return SandboxResult(
-                success=False,
-                error=f"Code validation failed: {'; '.join(issues)}",
-            )
-
-        # Create isolated namespace
-        namespace = self._create_namespace(inputs)
-
-        # Capture stdout
-        import io
-
-        old_stdout = sys.stdout
-        sys.stdout = captured_stdout = io.StringIO()
-
-        start_time = time.time()
-
-        try:
-            with self._timeout_context(self.timeout_seconds):
-                # Compile and execute
-                compiled = compile(code, "<sandbox>", "exec")
-                exec(compiled, namespace)
-
-            execution_time_ms = int((time.time() - start_time) * 1000)
-
-            # Extract requested variables
-            extracted = {}
-            for var in extract_vars:
-                if var in namespace:
-                    extracted[var] = namespace[var]
-
-            # Also extract any new variables (not in inputs or builtins)
-            for key, value in namespace.items():
-                if key not in inputs and key not in self.safe_builtins and not key.startswith("_"):
-                    extracted[key] = value
-
-            return SandboxResult(
-                success=True,
-                result=namespace.get("result"),  # Convention: 'result' is the return value
-                stdout=captured_stdout.getvalue(),
-                variables=extracted,
-                execution_time_ms=execution_time_ms,
-            )
-
-        except TimeoutError as e:
-            return SandboxResult(
-                success=False,
-                error=str(e),
-                execution_time_ms=self.timeout_seconds * 1000,
-            )
-
-        except SecurityError as e:
-            return SandboxResult(
-                success=False,
-                error=f"Security violation: {e}",
-                execution_time_ms=int((time.time() - start_time) * 1000),
-            )
-
-        except Exception as e:
-            return SandboxResult(
-                success=False,
-                error=f"{type(e).__name__}: {e}",
-                stdout=captured_stdout.getvalue(),
-                execution_time_ms=int((time.time() - start_time) * 1000),
-            )
-
-        finally:
-            sys.stdout = old_stdout
-
-    def execute_expression(
-        self,
-        expression: str,
-        inputs: dict[str, Any] | None = None,
-    ) -> SandboxResult:
-        """
-        Execute a single expression and return its value.
-
-        Simpler than execute() - just evaluates one expression.
-        """
-        inputs = inputs or {}
-
-        # Validate
-        try:
-            ast.parse(expression, mode="eval")
-        except SyntaxError as e:
-            return SandboxResult(success=False, error=f"Syntax error: {e}")
-
-        namespace = self._create_namespace(inputs)
-
-        try:
-            with self._timeout_context(self.timeout_seconds):
-                result = eval(expression, namespace)
-
-            return SandboxResult(success=True, result=result)
-
-        except Exception as e:
-            return SandboxResult(
-                success=False,
-                error=f"{type(e).__name__}: {e}",
-            )
-
-
-# Singleton instance with default settings
-default_sandbox = CodeSandbox()
-
-
-def safe_exec(
-    code: str,
-    inputs: dict[str, Any] | None = None,
-    timeout_seconds: int = 10,
-) -> SandboxResult:
-    """
-    Convenience function for safe code execution.
-
-    Args:
-        code: Python code to execute
-        inputs: Variables to inject
-        timeout_seconds: Max execution time
-
-    Returns:
-        SandboxResult
-    """
-    sandbox = CodeSandbox(timeout_seconds=timeout_seconds)
-    return sandbox.execute(code, inputs)
-
-
-def safe_eval(
-    expression: str,
-    inputs: dict[str, Any] | None = None,
-    timeout_seconds: int = 5,
-) -> SandboxResult:
-    """
-    Convenience function for safe expression evaluation.
-
-    Args:
-        expression: Python expression to evaluate
-        inputs: Variables to inject
-        timeout_seconds: Max execution time
-
-    Returns:
-        SandboxResult
-    """
-    sandbox = CodeSandbox(timeout_seconds=timeout_seconds)
-    return sandbox.execute_expression(expression, inputs)
@@ -27,6 +27,9 @@ class Message:
    tool_use_id: str | None = None
    tool_calls: list[dict[str, Any]] | None = None
    is_error: bool = False
+    # Phase-aware compaction metadata (continuous mode)
+    phase_id: str | None = None
+    is_transition_marker: bool = False

    def to_llm_dict(self) -> dict[str, Any]:
        """Convert to OpenAI-format message dict."""
@@ -60,6 +63,10 @@ class Message:
            d["tool_calls"] = self.tool_calls
        if self.is_error:
            d["is_error"] = self.is_error
+        if self.phase_id is not None:
+            d["phase_id"] = self.phase_id
+        if self.is_transition_marker:
+            d["is_transition_marker"] = self.is_transition_marker
        return d

    @classmethod
@@ -72,6 +79,8 @@ class Message:
            tool_use_id=data.get("tool_use_id"),
            tool_calls=data.get("tool_calls"),
            is_error=data.get("is_error", False),
+            phase_id=data.get("phase_id"),
+            is_transition_marker=data.get("is_transition_marker", False),
        )


@@ -188,6 +197,7 @@ class NodeConversation:
        self._next_seq: int = 0
        self._meta_persisted: bool = False
        self._last_api_input_tokens: int | None = None
+        self._current_phase: str | None = None

    # --- Properties --------------------------------------------------------

@@ -195,6 +205,23 @@ class NodeConversation:
    def system_prompt(self) -> str:
        return self._system_prompt

+    def update_system_prompt(self, new_prompt: str) -> None:
+        """Update the system prompt.
+
+        Used in continuous conversation mode at phase transitions to swap
+        Layer 3 (focus) while preserving the conversation history.
+
+        Args:
+            new_prompt: Replacement system prompt text. The stored messages
+                are not touched by this method.
+        """
+        self._system_prompt = new_prompt
+        # Clear the persisted flag so the metadata carrying the prompt gets
+        # written again — presumably on the next persist; TODO confirm in _persist.
+        self._meta_persisted = False  # re-persist with new prompt
+
+    def set_current_phase(self, phase_id: str) -> None:
+        """Set the current phase ID. Subsequent messages will be stamped with it.
+
+        Args:
+            phase_id: Identifier recorded as ``phase_id`` on messages added
+                after this call; messages already stored keep their old value.
+        """
+        self._current_phase = phase_id
+
+    @property
+    def current_phase(self) -> str | None:
+        """Phase ID last given to ``set_current_phase``, or ``None`` if never set."""
+        return self._current_phase
+
    @property
    def messages(self) -> list[Message]:
        """Return a defensive copy of the message list."""
@@ -216,8 +243,19 @@ class NodeConversation:

    # --- Add messages ------------------------------------------------------

-    async def add_user_message(self, content: str) -> Message:
-        msg = Message(seq=self._next_seq, role="user", content=content)
+    async def add_user_message(
+        self,
+        content: str,
+        *,
+        is_transition_marker: bool = False,
+    ) -> Message:
+        msg = Message(
+            seq=self._next_seq,
+            role="user",
+            content=content,
+            phase_id=self._current_phase,
+            is_transition_marker=is_transition_marker,
+        )
        self._messages.append(msg)
        self._next_seq += 1
        await self._persist(msg)
@@ -233,6 +271,7 @@ class NodeConversation:
            role="assistant",
            content=content,
            tool_calls=tool_calls,
+            phase_id=self._current_phase,
        )
        self._messages.append(msg)
        self._next_seq += 1
@@ -251,6 +290,7 @@ class NodeConversation:
            content=content,
            tool_use_id=tool_use_id,
            is_error=is_error,
+            phase_id=self._current_phase,
        )
        self._messages.append(msg)
        self._next_seq += 1
@@ -380,6 +420,11 @@ class NodeConversation:
        spillover filename reference (if any). Message structure (role,
        seq, tool_use_id) stays valid for the LLM API.

+        Phase-aware behavior (continuous mode): when messages have ``phase_id``
+        metadata, all messages in the current phase are protected regardless of
+        token budget. Transition markers are never pruned. Older phases' tool
+        results are pruned more aggressively.
+
        Error tool results are never pruned — they prevent re-calling
        failing tools.

@@ -388,13 +433,18 @@ class NodeConversation:
        if not self._messages:
            return 0

-        # Phase 1: Walk backward, classify tool results as protected vs pruneable
+        # Walk backward, classify tool results as protected vs pruneable
        protected_tokens = 0
        pruneable: list[int] = []  # indices into self._messages
        pruneable_tokens = 0

        for i in range(len(self._messages) - 1, -1, -1):
            msg = self._messages[i]
+
+            # Transition markers are never pruned (any role)
+            if msg.is_transition_marker:
+                continue
+
            if msg.role != "tool":
                continue
            if msg.is_error:
@@ -402,6 +452,10 @@ class NodeConversation:
            if msg.content.startswith("[Pruned tool result"):
                continue  # already pruned

+            # Phase-aware: protect current phase messages
+            if self._current_phase and msg.phase_id == self._current_phase:
+                continue
+
            est = len(msg.content) // 4
            if protected_tokens < protect_tokens:
                protected_tokens += est
@@ -409,11 +463,11 @@ class NodeConversation:
                pruneable.append(i)
                pruneable_tokens += est

-        # Phase 2: Only prune if enough to be worthwhile
+        # Only prune if enough to be worthwhile
        if pruneable_tokens < min_prune_tokens:
            return 0

-        # Phase 3: Replace content with compact placeholder
+        # Replace content with compact placeholder
        count = 0
        for i in pruneable:
            msg = self._messages[i]
@@ -436,6 +490,8 @@ class NodeConversation:
                tool_use_id=msg.tool_use_id,
                tool_calls=msg.tool_calls,
                is_error=msg.is_error,
+                phase_id=msg.phase_id,
+                is_transition_marker=msg.is_transition_marker,
            )
            count += 1

@@ -446,22 +502,38 @@ class NodeConversation:
        self._last_api_input_tokens = None
        return count

-    async def compact(self, summary: str, keep_recent: int = 2) -> None:
+    async def compact(
+        self,
+        summary: str,
+        keep_recent: int = 2,
+        phase_graduated: bool = False,
+    ) -> None:
        """Replace old messages with a summary, optionally keeping recent ones.

        Args:
            summary: Caller-provided summary text.
            keep_recent: Number of recent messages to preserve (default 2).
                         Clamped to [0, len(messages) - 1].
+            phase_graduated: When True and messages have phase_id metadata,
+                split at phase boundaries instead of using keep_recent.
+                Keeps current + previous phase intact; compacts older phases.
        """
        if not self._messages:
            return

-        # Clamp: must discard at least 1 message
-        keep_recent = max(0, min(keep_recent, len(self._messages) - 1))
-
        total = len(self._messages)
-        split = total - keep_recent if keep_recent > 0 else total
+
+        # Phase-graduated: find the split point based on phase boundaries.
+        # Keeps current phase + previous phase intact, compacts older phases.
+        if phase_graduated and self._current_phase:
+            split = self._find_phase_graduated_split()
+        else:
+            split = None
+
+        if split is None:
+            # Fallback: use keep_recent (non-phase or single-phase conversation)
+            keep_recent = max(0, min(keep_recent, total - 1))
+            split = total - keep_recent if keep_recent > 0 else total

        # Advance split past orphaned tool results at the boundary.
        # Tool-role messages reference a tool_use from the preceding
@@ -470,6 +542,10 @@ class NodeConversation:
        while split < total and self._messages[split].role == "tool":
            split += 1

+        # Nothing to compact
+        if split == 0:
+            return
+
        old_messages = list(self._messages[:split])
        recent_messages = list(self._messages[split:])

@@ -504,6 +580,33 @@ class NodeConversation:
        self._messages = [summary_msg] + recent_messages
        self._last_api_input_tokens = None  # reset; next LLM call will recalibrate

+    def _find_phase_graduated_split(self) -> int | None:
+        """Locate the boundary between compactable and protected phases.
+
+        The two most recently introduced phases (by first appearance in the
+        message list) are protected; everything before the first message of
+        either one is eligible for compaction.
+
+        Returns:
+            Index of the first message that belongs to a protected phase, or
+            ``None`` when fewer than three distinct phases are present, in
+            which case phase graduation does not apply.
+        """
+        # dict.fromkeys de-duplicates while keeping first-appearance order;
+        # messages without a phase_id are ignored.
+        ordered_phases = list(
+            dict.fromkeys(m.phase_id for m in self._messages if m.phase_id)
+        )
+
+        # With only one or two phases there is no "older" phase to compact.
+        if len(ordered_phases) < 3:
+            return None
+
+        keep = set(ordered_phases[-2:])
+        return next(
+            (idx for idx, m in enumerate(self._messages) if m.phase_id in keep),
+            None,
+        )
+
    async def clear(self) -> None:
        """Remove all messages, keep system prompt, preserve ``_next_seq``."""
        if self._store:
@@ -569,9 +672,20 @@ class NodeConversation:
    # --- Restore -----------------------------------------------------------

    @classmethod
-    async def restore(cls, store: ConversationStore) -> NodeConversation | None:
+    async def restore(
+        cls,
+        store: ConversationStore,
+        phase_id: str | None = None,
+    ) -> NodeConversation | None:
        """Reconstruct a NodeConversation from a store.

+        Args:
+            store: The conversation store to read from.
+            phase_id: If set, only load parts matching this phase_id.
+                Used in isolated mode so a node only sees its own
+                messages in the shared flat store.  In continuous mode
+                pass ``None`` to load all parts.
+
        Returns ``None`` if the store contains no metadata (i.e. the
        conversation was never persisted).
        """
@@ -589,6 +703,8 @@ class NodeConversation:
        conv._meta_persisted = True

        parts = await store.read_parts()
+        if phase_id:
+            parts = [p for p in parts if p.get("phase_id") == phase_id]
        conv._messages = [Message.from_storage_dict(p) for p in parts]

        cursor = await store.read_cursor()
@@ -0,0 +1,177 @@
+"""Level 2 Conversation-Aware Judge.
+
+When a node has `success_criteria` set, the implicit judge upgrades:
+after Level 0 passes (all output keys set), a fast LLM call evaluates
+whether the conversation actually meets the criteria.
+
+This prevents nodes from "checking boxes" (setting output keys) without
+doing quality work. The LLM reads the recent conversation and assesses
+whether the phase's goal was genuinely accomplished.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+from framework.graph.conversation import NodeConversation
+from framework.llm.provider import LLMProvider
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PhaseVerdict:
+    """Result of Level 2 conversation-aware evaluation.
+
+    Built by ``_parse_verdict`` from the judge LLM's reply, or constructed
+    directly as an ACCEPT verdict with confidence 0.5 when the judge call
+    fails or returns an empty response.
+    """
+
+    # Decision taken by the judge.
+    action: str  # "ACCEPT" or "RETRY"
+    # Confidence reported by the judge; 0.8 when none was supplied.
+    confidence: float = 0.8
+    # Judge's explanation: a reason for RETRY, empty for ACCEPT.
+    feedback: str = ""
+
+
+async def evaluate_phase_completion(
+    llm: LLMProvider,
+    conversation: NodeConversation,
+    phase_name: str,
+    phase_description: str,
+    success_criteria: str,
+    accumulator_state: dict[str, Any],
+    max_history_tokens: int = 8_196,
+) -> PhaseVerdict:
+    """Level 2 judge: ask an LLM whether the conversation meets the criteria.
+
+    Only called after Level 0 passes (all output keys set). The judge is
+    fail-open: an empty reply or any exception yields an ACCEPT verdict with
+    confidence 0.5, so a flaky judge never blocks a phase.
+
+    Args:
+        llm: LLM provider for evaluation
+        conversation: The current conversation to evaluate (last 10 messages are shown
+            to the judge)
+        phase_name: Name of the current phase/node
+        phase_description: Description of the phase
+        success_criteria: Natural-language criteria for phase completion
+        accumulator_state: Current output key values
+        max_history_tokens: Main conversation token budget; the judge's reply budget
+            is one fifth of it, but never below 1024 tokens.
+            NOTE(review): the default 8_196 looks like a typo for 8_192 — confirm.
+
+    Returns:
+        PhaseVerdict with action and optional feedback
+    """
+    # Compact views of what the node produced and what was recently said.
+    transcript = _extract_recent_context(conversation, max_messages=10)
+    outputs_block = _format_outputs(accumulator_state)
+
+    judge_system = (
+        "You are a quality judge evaluating whether a phase of work is complete. "
+        "Be concise. Evaluate based on the success criteria, not on style."
+    )
+
+    judge_request = f"""Evaluate this phase:
+
+PHASE: {phase_name}
+DESCRIPTION: {phase_description}
+
+SUCCESS CRITERIA:
+{success_criteria}
+
+OUTPUTS SET:
+{outputs_block}
+
+RECENT CONVERSATION:
+{transcript}
+
+Has this phase accomplished its goal based on the success criteria?
+
+Respond in exactly this format:
+ACTION: ACCEPT or RETRY
+CONFIDENCE: 0.X
+FEEDBACK: (reason if RETRY, empty if ACCEPT)"""
+
+    def _accept_by_default() -> PhaseVerdict:
+        # Level 0 already passed, so an unusable judge result must not block.
+        return PhaseVerdict(action="ACCEPT", confidence=0.5, feedback="")
+
+    try:
+        reply_budget = max(1024, max_history_tokens // 5)
+        reply = await llm.acomplete(
+            messages=[{"role": "user", "content": judge_request}],
+            system=judge_system,
+            max_tokens=reply_budget,
+            max_retries=1,
+        )
+        text = reply.content
+        if text and text.strip():
+            return _parse_verdict(text)
+
+        logger.debug("Level 2 judge: empty response, accepting by default")
+        return _accept_by_default()
+    except Exception as exc:
+        logger.warning(f"Level 2 judge failed, accepting by default: {exc}")
+        return _accept_by_default()
+
+
+def _extract_recent_context(conversation: NodeConversation, max_messages: int = 10) -> str:
+    """Render the tail of a conversation as plain text for the judge prompt.
+
+    Args:
+        conversation: Object exposing ``messages``, a list whose items have
+            ``role`` and ``content`` attributes.
+        max_messages: Upper bound on how many trailing messages to include.
+            Zero or a negative value selects no messages.
+
+    Returns:
+        One ``[ROLE]: content`` line per non-blank message, joined by newlines,
+        or ``"(no messages)"`` when nothing qualifies.
+    """
+    # Guard first: a slice of [-0:] would select the WHOLE list, and a negative
+    # bound would drop the oldest messages instead of limiting the newest.
+    if max_messages <= 0:
+        return "(no messages)"
+
+    # Slicing past the start of a list is safe, so no length check is needed.
+    recent = conversation.messages[-max_messages:]
+
+    parts = []
+    for msg in recent:
+        role = msg.role.upper()
+        content = msg.content or ""
+        # Truncate long tool results
+        if msg.role == "tool" and len(content) > 200:
+            content = content[:200] + "..."
+        # Skip messages that are empty or whitespace-only.
+        if content.strip():
+            parts.append(f"[{role}]: {content.strip()}")
+
+    return "\n".join(parts) if parts else "(no messages)"
+
+
+def _format_outputs(accumulator_state: dict[str, Any]) -> str:
+    """Summarise the output keys a node has set, for inclusion in the judge prompt.
+
+    Lists are shown as an item count plus a preview of the first 8 items
+    (each clipped to 150 characters); dicts are stringified and clipped to 400
+    characters; every other value is stringified and clipped to 300 characters.
+
+    Returns:
+        One indented ``key: value`` entry per output, joined by newlines, or
+        ``"(none)"`` when there are no outputs.
+    """
+    if not accumulator_state:
+        return "(none)"
+
+    def _clip(text: str, limit: int) -> str:
+        # Append an ellipsis only when something was actually cut off.
+        return text[:limit] + "..." if len(text) > limit else text
+
+    rendered = []
+    for key, value in accumulator_state.items():
+        if isinstance(value, list):
+            # Count + per-item preview lets the judge verify quantity without
+            # the full serialization.
+            preview = [
+                f"    [{idx}]: {_clip(str(item), 150)}" for idx, item in enumerate(value[:8])
+            ]
+            shown = f"list ({len(value)} items):\n" + "\n".join(preview)
+            hidden = len(value) - 8
+            if hidden > 0:
+                shown += f"\n    ... and {hidden} more"
+        else:
+            shown = _clip(str(value), 400 if isinstance(value, dict) else 300)
+        rendered.append(f"  {key}: {shown}")
+    return "\n".join(rendered)
+
+
+def _parse_verdict(response: str) -> PhaseVerdict:
+    """Parse the judge LLM's reply into a PhaseVerdict.
+
+    The reply is scanned line by line for ``ACTION:``, ``CONFIDENCE:`` and
+    ``FEEDBACK:`` prefixes. Anything missing or malformed keeps its default
+    (ACCEPT, 0.8, empty feedback), so a sloppy reply never raises.
+
+    Args:
+        response: Raw text returned by the judge.
+
+    Returns:
+        PhaseVerdict whose ``action`` is "ACCEPT" or "RETRY", whose
+        ``confidence`` lies in [0.0, 1.0], and whose ``feedback`` holds the
+        text after ``FEEDBACK:`` including any continuation lines.
+    """
+    action = "ACCEPT"
+    confidence = 0.8
+    feedback_lines: list[str] = []
+    collecting_feedback = False
+
+    for raw_line in response.strip().split("\n"):
+        line = raw_line.strip()
+        if line.startswith("ACTION:"):
+            collecting_feedback = False
+            # Judge on the first word only, minus surrounding punctuation or
+            # markdown, so "RETRY." or "**RETRY**" is not mistaken for ACCEPT.
+            words = line.split(":", 1)[1].upper().split()
+            candidate = words[0].strip(".,;:!*`'\"") if words else ""
+            if candidate in ("ACCEPT", "RETRY"):
+                action = candidate
+        elif line.startswith("CONFIDENCE:"):
+            collecting_feedback = False
+            try:
+                parsed = float(line.split(":", 1)[1].strip())
+            except ValueError:
+                continue
+            # float() also accepts "nan", "inf" and out-of-range numbers; the
+            # range test rejects all of them (NaN fails every comparison).
+            if 0.0 <= parsed <= 1.0:
+                confidence = parsed
+        elif line.startswith("FEEDBACK:"):
+            # A later FEEDBACK line replaces an earlier one.
+            feedback_lines = [line.split(":", 1)[1].strip()]
+            collecting_feedback = True
+        elif collecting_feedback and line:
+            # Feedback may wrap over several lines; keep the continuation.
+            feedback_lines.append(line)
+
+    feedback = "\n".join(feedback_lines).strip()
+    return PhaseVerdict(action=action, confidence=confidence, feedback=feedback)
@@ -21,13 +21,20 @@ allowing the LLM to evaluate whether proceeding along an edge makes sense
 given the current goal, context, and execution state.
 """

+import json
+import logging
+import re
 from enum import StrEnum
 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_validator

 from framework.graph.safe_eval import safe_eval

+logger = logging.getLogger(__name__)
+
+DEFAULT_MAX_TOKENS = 8192
+

 class EdgeCondition(StrEnum):
    """When an edge should be traversed."""
@@ -97,7 +104,7 @@ class EdgeSpec(BaseModel):

    model_config = {"extra": "allow"}

-    def should_traverse(
+    async def should_traverse(
        self,
        source_success: bool,
        source_output: dict[str, Any],
@@ -138,7 +145,7 @@ class EdgeSpec(BaseModel):
            if llm is None or goal is None:
                # Fallback to ON_SUCCESS if LLM not available
                return source_success
-            return self._llm_decide(
+            return await self._llm_decide(
                llm=llm,
                goal=goal,
                source_success=source_success,
@@ -156,9 +163,6 @@ class EdgeSpec(BaseModel):
        memory: dict[str, Any],
    ) -> bool:
        """Evaluate a conditional expression."""
-        import logging
-
-        logger = logging.getLogger(__name__)

        if not self.condition_expr:
            return True
@@ -199,7 +203,7 @@ class EdgeSpec(BaseModel):
            logger.warning(f"         Available context keys: {list(context.keys())}")
            return False

-    def _llm_decide(
+    async def _llm_decide(
        self,
        llm: Any,
        goal: Any,
@@ -215,8 +219,6 @@ class EdgeSpec(BaseModel):
        The LLM evaluates whether proceeding to the target node
        is the best next step toward achieving the goal.
        """
-        import json
-
        # Build context for LLM
        prompt = f"""You are evaluating whether to proceed along an edge in an agent workflow.

@@ -245,15 +247,13 @@ Respond with ONLY a JSON object:
 {{"proceed": true/false, "reasoning": "brief explanation"}}"""

        try:
-            response = llm.complete(
+            response = await llm.acomplete(
                messages=[{"role": "user", "content": prompt}],
                system="You are a routing agent. Respond with JSON only.",
                max_tokens=150,
            )

            # Parse response
-            import re
-
            json_match = re.search(r"\{[^{}]*\}", response.content, re.DOTALL)
            if json_match:
                data = json.loads(json_match.group())
@@ -261,9 +261,6 @@ Respond with ONLY a JSON object:
                reasoning = data.get("reasoning", "")

                # Log the decision (using basic print for now)
-                import logging
-
-                logger = logging.getLogger(__name__)
                logger.info(f"      🤔 LLM routing decision: {'PROCEED' if proceed else 'SKIP'}")
                logger.info(f"         Reason: {reasoning}")

@@ -271,9 +268,6 @@ Respond with ONLY a JSON object:

        except Exception as e:
            # Fallback: proceed on success
-            import logging
-
-            logger = logging.getLogger(__name__)
            logger.warning(f"      ⚠ LLM routing failed, defaulting to on_success: {e}")
            return source_success

@@ -347,6 +341,12 @@ class AsyncEntryPointSpec(BaseModel):

    model_config = {"extra": "allow"}

+    def get_isolation_level(self):
+        """Return isolation_level as an IsolationLevel enum member (mirrors EntryPointSpec)."""
+        from framework.runtime.execution_stream import IsolationLevel
+
+        return IsolationLevel(self.isolation_level)
+

 class GraphSpec(BaseModel):
    """
@@ -424,11 +424,11 @@ class GraphSpec(BaseModel):

    # Default LLM settings
    default_model: str = "claude-haiku-4-5-20251001"
-    max_tokens: int = 1024
+    max_tokens: int = Field(default=None)  # resolved by _resolve_max_tokens validator

    # Cleanup LLM for JSON extraction fallback (fast/cheap model preferred)
    # If not set, uses CEREBRAS_API_KEY -> cerebras/llama-3.3-70b or
-    # ANTHROPIC_API_KEY -> claude-3-5-haiku as fallback
+    # ANTHROPIC_API_KEY -> claude-haiku-4-5 as fallback
    cleanup_llm_model: str | None = None

    # Execution limits
@@ -441,12 +441,41 @@ class GraphSpec(BaseModel):
        description="EventLoopNode configuration (max_iterations, max_tool_calls_per_turn, etc.)",
    )

+    # Conversation mode
+    conversation_mode: str = Field(
+        default="continuous",
+        description=(
+            "How conversations flow between event_loop nodes. "
+            "'continuous' (default): one conversation threads through all "
+            "event_loop nodes with cumulative tools and layered prompt composition. "
+            "'isolated': each node gets a fresh conversation."
+        ),
+    )
+    identity_prompt: str | None = Field(
+        default=None,
+        description=(
+            "Agent-level identity prompt (Layer 1 of the onion model). "
+            "In continuous mode, this is the static identity that persists "
+            "unchanged across all node transitions. In isolated mode, ignored."
+        ),
+    )
+
    # Metadata
    description: str = ""
    created_by: str = ""  # "human" or "builder_agent"

    model_config = {"extra": "allow"}

+    @model_validator(mode="before")
+    @classmethod
+    def _resolve_max_tokens(cls, values: Any) -> Any:
+        """Default max_tokens from the global config store; never mutates the caller's dict."""
+        if isinstance(values, dict) and values.get("max_tokens") is None:
+            from framework.config import get_max_tokens
+
+            values = {**values, "max_tokens": get_max_tokens()}
+        return values
+
    def get_node(self, node_id: str) -> Any | None:
        """Get a node by ID."""
        for node in self.nodes:
@@ -1,552 +0,0 @@
-"""
-Flexible Graph Executor with Worker-Judge Loop.
-
-Executes plans created by external planner (Claude Code, etc.)
-using a Worker-Judge loop:
-
-1. External planner creates Plan
-2. FlexibleGraphExecutor receives Plan
-3. Worker executes each step
-4. Judge evaluates each result
-5. If Judge says "replan" → return to external planner with feedback
-6. If Judge says "escalate" → request human intervention
-7. If all steps complete → return success
-
-This keeps planning external while execution/evaluation is internal.
-"""
-
-from collections.abc import Callable
-from dataclasses import dataclass
-from datetime import datetime
-from typing import Any
-
-from framework.graph.code_sandbox import CodeSandbox
-from framework.graph.goal import Goal
-from framework.graph.judge import HybridJudge, create_default_judge
-from framework.graph.plan import (
-    ApprovalDecision,
-    ApprovalRequest,
-    ApprovalResult,
-    ExecutionStatus,
-    Judgment,
-    JudgmentAction,
-    Plan,
-    PlanExecutionResult,
-    PlanStep,
-    StepStatus,
-)
-from framework.graph.worker_node import StepExecutionResult, WorkerNode
-from framework.llm.provider import LLMProvider, Tool
-from framework.runtime.core import Runtime
-
-# Type alias for approval callback
-ApprovalCallback = Callable[[ApprovalRequest], ApprovalResult]
-
-
-@dataclass
-class ExecutorConfig:
-    """Configuration for FlexibleGraphExecutor."""
-
-    max_retries_per_step: int = 3
-    max_total_steps: int = 100
-    timeout_seconds: int = 300
-    enable_parallel_execution: bool = False  # Future: parallel step execution
-
-
-class FlexibleGraphExecutor:
-    """
-    Executes plans with Worker-Judge loop.
-
-    Plans come from external source (Claude Code, etc.).
-    Returns feedback for replanning if needed.
-
-    Usage:
-        executor = FlexibleGraphExecutor(
-            runtime=runtime,
-            llm=llm_provider,
-            tools=tools,
-        )
-
-        result = await executor.execute_plan(plan, goal, context)
-
-        if result.status == ExecutionStatus.NEEDS_REPLAN:
-            # External planner should create new plan using result.feedback
-            new_plan = external_planner.replan(result.feedback_context)
-            result = await executor.execute_plan(new_plan, goal, result.feedback_context)
-    """
-
-    def __init__(
-        self,
-        runtime: Runtime,
-        llm: LLMProvider | None = None,
-        tools: dict[str, Tool] | None = None,
-        tool_executor: Callable | None = None,
-        functions: dict[str, Callable] | None = None,
-        judge: HybridJudge | None = None,
-        config: ExecutorConfig | None = None,
-        approval_callback: ApprovalCallback | None = None,
-    ):
-        """
-        Initialize the FlexibleGraphExecutor.
-
-        Args:
-            runtime: Runtime for decision logging
-            llm: LLM provider for Worker and Judge
-            tools: Available tools
-            tool_executor: Function to execute tools
-            functions: Registered functions
-            judge: Custom judge (defaults to HybridJudge with default rules)
-            config: Executor configuration
-            approval_callback: Callback for human-in-the-loop approval.
-                If None, steps requiring approval will pause execution.
-        """
-        self.runtime = runtime
-        self.llm = llm
-        self.tools = tools or {}
-        self.tool_executor = tool_executor
-        self.functions = functions or {}
-        self.config = config or ExecutorConfig()
-        self.approval_callback = approval_callback
-
-        # Create judge
-        self.judge = judge or create_default_judge(llm)
-
-        # Create worker
-        self.worker = WorkerNode(
-            runtime=runtime,
-            llm=llm,
-            tools=tools,
-            tool_executor=tool_executor,
-            functions=functions,
-            sandbox=CodeSandbox(),
-        )
-
-    async def execute_plan(
-        self,
-        plan: Plan,
-        goal: Goal,
-        context: dict[str, Any] | None = None,
-    ) -> PlanExecutionResult:
-        """
-        Execute a plan created by external planner.
-
-        Args:
-            plan: The plan to execute
-            goal: The goal context
-            context: Initial context (e.g., from previous execution)
-
-        Returns:
-            PlanExecutionResult with status and feedback
-        """
-        context = context or {}
-        context.update(plan.context)  # Merge plan's accumulated context
-
-        # Start run
-        _run_id = self.runtime.start_run(
-            goal_id=goal.id,
-            goal_description=goal.description,
-            input_data={"plan_id": plan.id, "revision": plan.revision},
-        )
-
-        steps_executed = 0
-        total_tokens = 0
-        total_latency = 0
-
-        try:
-            while steps_executed < self.config.max_total_steps:
-                # Get next ready steps
-                ready_steps = plan.get_ready_steps()
-
-                if not ready_steps:
-                    # Check if we're done or stuck
-                    if plan.is_complete():
-                        break
-                    else:
-                        # No ready steps but not complete - something's wrong
-                        return self._create_result(
-                            status=ExecutionStatus.NEEDS_REPLAN,
-                            plan=plan,
-                            context=context,
-                            feedback=(
-                                "No executable steps available but plan not complete. "
-                                "Check dependencies."
-                            ),
-                            steps_executed=steps_executed,
-                            total_tokens=total_tokens,
-                            total_latency=total_latency,
-                        )
-
-                # Execute next step (for now, sequential; could be parallel)
-                step = ready_steps[0]
-                # Debug: show ready steps
-                # ready_ids = [s.id for s in ready_steps]
-                # print(f"  [DEBUG] Ready steps: {ready_ids}, executing: {step.id}")
-
-                # APPROVAL CHECK - before execution
-                if step.requires_approval:
-                    approval_result = await self._request_approval(step, context)
-
-                    if approval_result is None:
-                        # No callback, pause execution
-                        step.status = StepStatus.AWAITING_APPROVAL
-                        return self._create_result(
-                            status=ExecutionStatus.AWAITING_APPROVAL,
-                            plan=plan,
-                            context=context,
-                            feedback=f"Step '{step.id}' requires approval: {step.description}",
-                            steps_executed=steps_executed,
-                            total_tokens=total_tokens,
-                            total_latency=total_latency,
-                        )
-
-                    if approval_result.decision == ApprovalDecision.REJECT:
-                        step.status = StepStatus.REJECTED
-                        step.error = approval_result.reason or "Rejected by human"
-                        # Skip this step and continue with dependents marked as skipped
-                        self._skip_dependent_steps(plan, step.id)
-                        continue
-
-                    if approval_result.decision == ApprovalDecision.ABORT:
-                        return self._create_result(
-                            status=ExecutionStatus.ABORTED,
-                            plan=plan,
-                            context=context,
-                            feedback=approval_result.reason or "Aborted by human",
-                            steps_executed=steps_executed,
-                            total_tokens=total_tokens,
-                            total_latency=total_latency,
-                        )
-
-                    if approval_result.decision == ApprovalDecision.MODIFY:
-                        # Apply modifications to step
-                        if approval_result.modifications:
-                            self._apply_modifications(step, approval_result.modifications)
-
-                    # APPROVE - continue to execution
-
-                step.status = StepStatus.IN_PROGRESS
-                step.started_at = datetime.now()
-                step.attempts += 1
-
-                # WORK
-                work_result = await self.worker.execute(step, context)
-                steps_executed += 1
-                total_tokens += work_result.tokens_used
-                total_latency += work_result.latency_ms
-
-                # JUDGE
-                judgment = await self.judge.evaluate(
-                    step=step,
-                    result=work_result.__dict__,
-                    goal=goal,
-                    context=context,
-                )
-
-                # Handle judgment
-                result = await self._handle_judgment(
-                    step=step,
-                    work_result=work_result,
-                    judgment=judgment,
-                    plan=plan,
-                    goal=goal,
-                    context=context,
-                    steps_executed=steps_executed,
-                    total_tokens=total_tokens,
-                    total_latency=total_latency,
-                )
-
-                if result is not None:
-                    # Judgment resulted in early return (replan/escalate)
-                    self.runtime.end_run(
-                        success=False,
-                        narrative=f"Execution stopped: {result.status.value}",
-                    )
-                    return result
-
-            # All steps completed successfully
-            self.runtime.end_run(
-                success=True,
-                output_data=context,
-                narrative=f"Plan completed: {steps_executed} steps executed",
-            )
-
-            return self._create_result(
-                status=ExecutionStatus.COMPLETED,
-                plan=plan,
-                context=context,
-                steps_executed=steps_executed,
-                total_tokens=total_tokens,
-                total_latency=total_latency,
-            )
-
-        except Exception as e:
-            self.runtime.report_problem(
-                severity="critical",
-                description=str(e),
-            )
-            self.runtime.end_run(
-                success=False,
-                narrative=f"Execution failed: {e}",
-            )
-
-            return PlanExecutionResult(
-                status=ExecutionStatus.FAILED,
-                error=str(e),
-                feedback=f"Execution error: {e}",
-                feedback_context=plan.to_feedback_context(),
-                completed_steps=[s.id for s in plan.get_completed_steps()],
-                steps_executed=steps_executed,
-                total_tokens=total_tokens,
-                total_latency_ms=total_latency,
-            )
-
-    async def _handle_judgment(
-        self,
-        step: PlanStep,
-        work_result: StepExecutionResult,
-        judgment: Judgment,
-        plan: Plan,
-        goal: Goal,
-        context: dict[str, Any],
-        steps_executed: int,
-        total_tokens: int,
-        total_latency: int,
-    ) -> PlanExecutionResult | None:
-        """
-        Handle judgment and return result if execution should stop.
-
-        Returns None to continue execution, or PlanExecutionResult to stop.
-        """
-        if judgment.action == JudgmentAction.ACCEPT:
-            # Step succeeded - update state and continue
-            step.status = StepStatus.COMPLETED
-            step.completed_at = datetime.now()
-            step.result = work_result.outputs
-
-            # Map outputs to expected output keys
-            # If output has generic "result" key but step expects specific keys, map it
-            outputs_to_store = work_result.outputs.copy()
-            if step.expected_outputs and "result" in outputs_to_store:
-                result_value = outputs_to_store["result"]
-                # For each expected output key that's not in outputs, map from "result"
-                for expected_key in step.expected_outputs:
-                    if expected_key not in outputs_to_store:
-                        outputs_to_store[expected_key] = result_value
-
-            # Update context with mapped outputs
-            context.update(outputs_to_store)
-
-            # Store in plan context for replanning feedback
-            plan.context[step.id] = outputs_to_store
-
-            return None  # Continue execution
-
-        elif judgment.action == JudgmentAction.RETRY:
-            # Retry step if under limit
-            if step.attempts < step.max_retries:
-                step.status = StepStatus.PENDING
-                step.error = judgment.feedback
-
-                # Record retry decision
-                self.runtime.decide(
-                    intent=f"Retry step {step.id}",
-                    options=[{"id": "retry", "description": "Retry with feedback"}],
-                    chosen="retry",
-                    reasoning=judgment.reasoning,
-                    context={"attempt": step.attempts, "feedback": judgment.feedback},
-                )
-
-                return None  # Continue (step will be retried)
-            else:
-                # Max retries exceeded - escalate to replan
-                step.status = StepStatus.FAILED
-                step.error = f"Max retries ({step.max_retries}) exceeded: {judgment.feedback}"
-
-                return self._create_result(
-                    status=ExecutionStatus.NEEDS_REPLAN,
-                    plan=plan,
-                    context=context,
-                    feedback=(
-                        f"Step '{step.id}' failed after {step.attempts} attempts: "
-                        f"{judgment.feedback}"
-                    ),
-                    steps_executed=steps_executed,
-                    total_tokens=total_tokens,
-                    total_latency=total_latency,
-                )
-
-        elif judgment.action == JudgmentAction.REPLAN:
-            # Return to external planner
-            step.status = StepStatus.FAILED
-            step.error = judgment.feedback
-
-            return self._create_result(
-                status=ExecutionStatus.NEEDS_REPLAN,
-                plan=plan,
-                context=context,
-                feedback=judgment.feedback or f"Step '{step.id}' requires replanning",
-                steps_executed=steps_executed,
-                total_tokens=total_tokens,
-                total_latency=total_latency,
-            )
-
-        elif judgment.action == JudgmentAction.ESCALATE:
-            # Request human intervention
-            return self._create_result(
-                status=ExecutionStatus.NEEDS_ESCALATION,
-                plan=plan,
-                context=context,
-                feedback=judgment.feedback or f"Step '{step.id}' requires human intervention",
-                steps_executed=steps_executed,
-                total_tokens=total_tokens,
-                total_latency=total_latency,
-            )
-
-        return None  # Unknown action - continue
-
-    def _create_result(
-        self,
-        status: ExecutionStatus,
-        plan: Plan,
-        context: dict[str, Any],
-        feedback: str | None = None,
-        steps_executed: int = 0,
-        total_tokens: int = 0,
-        total_latency: int = 0,
-    ) -> PlanExecutionResult:
-        """Create a PlanExecutionResult."""
-        return PlanExecutionResult(
-            status=status,
-            results=context,
-            feedback=feedback,
-            feedback_context=plan.to_feedback_context(),
-            completed_steps=[s.id for s in plan.get_completed_steps()],
-            steps_executed=steps_executed,
-            total_tokens=total_tokens,
-            total_latency_ms=total_latency,
-        )
-
-    def register_function(self, name: str, func: Callable) -> None:
-        """Register a function for FUNCTION actions."""
-        self.functions[name] = func
-        self.worker.register_function(name, func)
-
-    def register_tool(self, tool: Tool) -> None:
-        """Register a tool for TOOL_USE actions."""
-        self.tools[tool.name] = tool
-        self.worker.register_tool(tool)
-
-    def add_evaluation_rule(self, rule) -> None:
-        """Add an evaluation rule to the judge."""
-        self.judge.add_rule(rule)
-
-    async def _request_approval(
-        self,
-        step: PlanStep,
-        context: dict[str, Any],
-    ) -> ApprovalResult | None:
-        """
-        Request human approval for a step.
-
-        Returns None if no callback is set (execution should pause).
-        """
-        if self.approval_callback is None:
-            return None
-
-        # Build preview of what will happen
-        preview_parts = []
-        if step.action.tool_name:
-            preview_parts.append(f"Tool: {step.action.tool_name}")
-            if step.action.tool_args:
-                import json
-
-                args_preview = json.dumps(step.action.tool_args, indent=2, default=str)
-                if len(args_preview) > 500:
-                    args_preview = args_preview[:500] + "..."
-                preview_parts.append(f"Args: {args_preview}")
-        elif step.action.prompt:
-            prompt_preview = (
-                step.action.prompt[:300] + "..."
-                if len(step.action.prompt) > 300
-                else step.action.prompt
-            )
-            preview_parts.append(f"Prompt: {prompt_preview}")
-
-        # Include step inputs resolved from context (what will be sent/used)
-        relevant_context = {}
-        for input_key, input_value in step.inputs.items():
-            # Resolve variable references like "$email_sequence"
-            if isinstance(input_value, str) and input_value.startswith("$"):
-                context_key = input_value[1:]  # Remove $ prefix
-                if context_key in context:
-                    relevant_context[input_key] = context[context_key]
-            else:
-                relevant_context[input_key] = input_value
-
-        request = ApprovalRequest(
-            step_id=step.id,
-            step_description=step.description,
-            action_type=step.action.action_type.value,
-            action_details={
-                "tool_name": step.action.tool_name,
-                "tool_args": step.action.tool_args,
-                "prompt": step.action.prompt,
-            },
-            context=relevant_context,
-            approval_message=step.approval_message,
-            preview="\n".join(preview_parts) if preview_parts else None,
-        )
-
-        return self.approval_callback(request)
-
-    def _skip_dependent_steps(self, plan: Plan, rejected_step_id: str) -> None:
-        """Mark steps that depend on a rejected step as skipped."""
-        for step in plan.steps:
-            if rejected_step_id in step.dependencies:
-                if step.status == StepStatus.PENDING:
-                    step.status = StepStatus.SKIPPED
-                    step.error = f"Skipped because dependency '{rejected_step_id}' was rejected"
-                    # Recursively skip dependents
-                    self._skip_dependent_steps(plan, step.id)
-
-    def _apply_modifications(self, step: PlanStep, modifications: dict[str, Any]) -> None:
-        """Apply human modifications to a step before execution."""
-        # Allow modifying tool args
-        if "tool_args" in modifications and step.action.tool_args:
-            step.action.tool_args.update(modifications["tool_args"])
-
-        # Allow modifying prompt
-        if "prompt" in modifications:
-            step.action.prompt = modifications["prompt"]
-
-        # Allow modifying inputs
-        if "inputs" in modifications:
-            step.inputs.update(modifications["inputs"])
-
-    def set_approval_callback(self, callback: ApprovalCallback) -> None:
-        """Set the approval callback for HITL steps."""
-        self.approval_callback = callback
-
-
-# Convenience function for simple execution
-async def execute_plan(
-    plan: Plan,
-    goal: Goal,
-    runtime: Runtime,
-    llm: LLMProvider | None = None,
-    tools: dict[str, Tool] | None = None,
-    tool_executor: Callable | None = None,
-    context: dict[str, Any] | None = None,
-) -> PlanExecutionResult:
-    """
-    Execute a plan with default configuration.
-
-    Convenience function for simple use cases.
-    """
-    executor = FlexibleGraphExecutor(
-        runtime=runtime,
-        llm=llm,
-        tools=tools,
-        tool_executor=tool_executor,
-    )
-    return await executor.execute_plan(plan, goal, context)
@@ -44,6 +44,11 @@ class SuccessCriterion(BaseModel):
    metric: str = Field(
        description="How to measure: 'output_contains', 'output_equals', 'llm_judge', 'custom'"
    )
+    # Runtime evaluation type (separate from metric)
+    type: str = Field(
+        default="success_rate", description="Runtime evaluation type, e.g. 'success_rate'"
+    )
+
    target: Any = Field(description="The target value or condition")
    weight: float = Field(default=1.0, ge=0.0, le=1.0, description="Relative importance (0-1)")
    met: bool = False
@@ -197,7 +197,7 @@ Example format:

            client = anthropic.Anthropic(api_key=api_key)
            message = client.messages.create(
-                model="claude-3-5-haiku-20241022",
+                model="claude-haiku-4-5-20251001",
                max_tokens=500,
                messages=[{"role": "user", "content": prompt}],
            )
@@ -1,406 +0,0 @@
-"""
-Hybrid Judge for Evaluating Plan Step Results.
-
-The HybridJudge evaluates step execution results using:
-1. Rule-based evaluation (fast, deterministic)
-2. LLM-based evaluation (fallback for ambiguous cases)
-
-Escalation path: rules → LLM → human
-"""
-
-from dataclasses import dataclass, field
-from typing import Any
-
-from framework.graph.code_sandbox import safe_eval
-from framework.graph.goal import Goal
-from framework.graph.plan import (
-    EvaluationRule,
-    Judgment,
-    JudgmentAction,
-    PlanStep,
-)
-from framework.llm.provider import LLMProvider
-
-
-@dataclass
-class RuleEvaluationResult:
-    """Result of rule-based evaluation."""
-
-    is_definitive: bool  # True if a rule matched definitively
-    judgment: Judgment | None = None
-    context: dict[str, Any] = field(default_factory=dict)
-    rules_checked: int = 0
-    rule_matched: str | None = None
-
-
-class HybridJudge:
-    """
-    Evaluates plan step results using rules first, then LLM fallback.
-
-    Usage:
-        judge = HybridJudge(llm=llm_provider)
-        judge.add_rule(EvaluationRule(
-            id="success_check",
-            condition="result.get('success') == True",
-            action=JudgmentAction.ACCEPT,
-        ))
-
-        judgment = await judge.evaluate(step, result, goal)
-    """
-
-    def __init__(
-        self,
-        llm: LLMProvider | None = None,
-        rules: list[EvaluationRule] | None = None,
-        llm_confidence_threshold: float = 0.7,
-    ):
-        """
-        Initialize the HybridJudge.
-
-        Args:
-            llm: LLM provider for ambiguous cases
-            rules: Initial evaluation rules
-            llm_confidence_threshold: Confidence below this triggers escalation
-        """
-        self.llm = llm
-        self.rules: list[EvaluationRule] = rules or []
-        self.llm_confidence_threshold = llm_confidence_threshold
-
-        # Sort rules by priority (higher first)
-        self._sort_rules()
-
-    def _sort_rules(self):
-        """Sort rules by priority."""
-        self.rules.sort(key=lambda r: -r.priority)
-
-    def add_rule(self, rule: EvaluationRule) -> None:
-        """Add an evaluation rule."""
-        self.rules.append(rule)
-        self._sort_rules()
-
-    def remove_rule(self, rule_id: str) -> bool:
-        """Remove a rule by ID. Returns True if found and removed."""
-        for i, rule in enumerate(self.rules):
-            if rule.id == rule_id:
-                self.rules.pop(i)
-                return True
-        return False
-
-    async def evaluate(
-        self,
-        step: PlanStep,
-        result: Any,
-        goal: Goal,
-        context: dict[str, Any] | None = None,
-    ) -> Judgment:
-        """
-        Evaluate a step result.
-
-        Args:
-            step: The executed plan step
-            result: The result of executing the step
-            goal: The goal context for evaluation
-            context: Additional context from previous steps
-
-        Returns:
-            Judgment with action and feedback
-        """
-        context = context or {}
-
-        # Try rule-based evaluation first
-        rule_result = self._evaluate_rules(step, result, goal, context)
-
-        if rule_result.is_definitive:
-            return rule_result.judgment
-
-        # Fall back to LLM evaluation
-        if self.llm:
-            return await self._evaluate_llm(step, result, goal, context, rule_result)
-
-        # No LLM available - default to accept with low confidence
-        return Judgment(
-            action=JudgmentAction.ACCEPT,
-            reasoning="No definitive rule matched and no LLM available for evaluation",
-            confidence=0.5,
-            llm_used=False,
-        )
-
-    def _evaluate_rules(
-        self,
-        step: PlanStep,
-        result: Any,
-        goal: Goal,
-        context: dict[str, Any],
-    ) -> RuleEvaluationResult:
-        """Evaluate step using rules."""
-        rules_checked = 0
-
-        # Build evaluation context
-        eval_context = {
-            "step": step.model_dump() if hasattr(step, "model_dump") else step,
-            "result": result,
-            "goal": goal.model_dump() if hasattr(goal, "model_dump") else goal,
-            "context": context,
-            "success": isinstance(result, dict) and result.get("success", False),
-            "error": isinstance(result, dict) and result.get("error"),
-        }
-
-        for rule in self.rules:
-            rules_checked += 1
-
-            # Evaluate rule condition
-            eval_result = safe_eval(rule.condition, eval_context)
-
-            if eval_result.success and eval_result.result:
-                # Rule matched!
-                feedback = self._format_feedback(rule.feedback_template, eval_context)
-
-                return RuleEvaluationResult(
-                    is_definitive=True,
-                    judgment=Judgment(
-                        action=rule.action,
-                        reasoning=rule.description,
-                        feedback=feedback if feedback else None,
-                        rule_matched=rule.id,
-                        confidence=1.0,
-                        llm_used=False,
-                    ),
-                    rules_checked=rules_checked,
-                    rule_matched=rule.id,
-                )
-
-        # No rule matched definitively
-        return RuleEvaluationResult(
-            is_definitive=False,
-            context=eval_context,
-            rules_checked=rules_checked,
-        )
-
-    def _format_feedback(
-        self,
-        template: str,
-        context: dict[str, Any],
-    ) -> str:
-        """Format feedback template with context values."""
-        if not template:
-            return ""
-
-        try:
-            return template.format(**context)
-        except (KeyError, ValueError):
-            return template
-
-    async def _evaluate_llm(
-        self,
-        step: PlanStep,
-        result: Any,
-        goal: Goal,
-        context: dict[str, Any],
-        rule_result: RuleEvaluationResult,
-    ) -> Judgment:
-        """Evaluate step using LLM."""
-        system_prompt = self._build_llm_system_prompt(goal)
-        user_prompt = self._build_llm_user_prompt(step, result, context, rule_result)
-
-        try:
-            response = self.llm.complete(
-                messages=[{"role": "user", "content": user_prompt}],
-                system=system_prompt,
-            )
-
-            # Parse LLM response
-            judgment = self._parse_llm_response(response.content)
-            judgment.llm_used = True
-
-            # Check confidence threshold
-            if judgment.confidence < self.llm_confidence_threshold:
-                # Low confidence - escalate
-                return Judgment(
-                    action=JudgmentAction.ESCALATE,
-                    reasoning=(
-                        f"LLM confidence ({judgment.confidence:.2f}) "
-                        f"below threshold ({self.llm_confidence_threshold})"
-                    ),
-                    feedback=judgment.feedback,
-                    confidence=judgment.confidence,
-                    llm_used=True,
-                    context={"original_judgment": judgment.model_dump()},
-                )
-
-            return judgment
-
-        except Exception as e:
-            # LLM failed - escalate
-            return Judgment(
-                action=JudgmentAction.ESCALATE,
-                reasoning=f"LLM evaluation failed: {e}",
-                feedback="Human review needed due to LLM error",
-                llm_used=True,
-            )
-
-    def _build_llm_system_prompt(self, goal: Goal) -> str:
-        """Build system prompt for LLM judge."""
-        return f"""You are a judge evaluating the execution of a plan step.
-
-GOAL: {goal.description}
-
-SUCCESS CRITERIA:
-{chr(10).join(f"- {sc.description}" for sc in goal.success_criteria)}
-
-CONSTRAINTS:
-{chr(10).join(f"- {c.description}" for c in goal.constraints)}
-
-Your task is to evaluate whether the step was executed successfully and decide the next action.
-
-Respond in this exact format:
-ACTION: [ACCEPT|RETRY|REPLAN|ESCALATE]
-CONFIDENCE: [0.0-1.0]
-REASONING: [Your reasoning]
-FEEDBACK: [Feedback for retry/replan, or empty if accepting]
-
-Actions:
- ACCEPT: Step completed successfully, continue to next step
- RETRY: Step failed but can be retried with feedback
- REPLAN: Step failed in a way that requires replanning
- ESCALATE: Requires human intervention
-"""
-
-    def _build_llm_user_prompt(
-        self,
-        step: PlanStep,
-        result: Any,
-        context: dict[str, Any],
-        rule_result: RuleEvaluationResult,
-    ) -> str:
-        """Build user prompt for LLM judge."""
-        return f"""Evaluate this step execution:
-
-STEP: {step.description}
-STEP ID: {step.id}
-ACTION TYPE: {step.action.action_type}
-EXPECTED OUTPUTS: {step.expected_outputs}
-
-RESULT:
-{result}
-
-CONTEXT FROM PREVIOUS STEPS:
-{context}
-
-RULES CHECKED: {rule_result.rules_checked} (none matched definitively)
-
-Please evaluate and provide your judgment."""
-
-    def _parse_llm_response(self, response: str) -> Judgment:
-        """Parse LLM response into Judgment."""
-        lines = response.strip().split("\n")
-
-        action = JudgmentAction.ACCEPT
-        confidence = 0.8
-        reasoning = ""
-        feedback = ""
-
-        for line in lines:
-            line = line.strip()
-            if line.startswith("ACTION:"):
-                action_str = line.split(":", 1)[1].strip().upper()
-                try:
-                    action = JudgmentAction(action_str.lower())
-                except ValueError:
-                    action = JudgmentAction.ESCALATE
-
-            elif line.startswith("CONFIDENCE:"):
-                try:
-                    confidence = float(line.split(":", 1)[1].strip())
-                except ValueError:
-                    confidence = 0.5
-
-            elif line.startswith("REASONING:"):
-                reasoning = line.split(":", 1)[1].strip()
-
-            elif line.startswith("FEEDBACK:"):
-                feedback = line.split(":", 1)[1].strip()
-
-        return Judgment(
-            action=action,
-            reasoning=reasoning or "LLM evaluation",
-            feedback=feedback if feedback else None,
-            confidence=confidence,
-        )
-
-
-# Factory function for creating judge with common rules
-def create_default_judge(llm: LLMProvider | None = None) -> HybridJudge:
-    """
-    Create a HybridJudge with commonly useful default rules.
-
-    Args:
-        llm: LLM provider for fallback evaluation
-
-    Returns:
-        Configured HybridJudge instance
-    """
-    judge = HybridJudge(llm=llm)
-
-    # Rule: Accept on explicit success flag
-    judge.add_rule(
-        EvaluationRule(
-            id="explicit_success",
-            description="Step explicitly marked as successful",
-            condition="isinstance(result, dict) and result.get('success') == True",
-            action=JudgmentAction.ACCEPT,
-            priority=100,
-        )
-    )
-
-    # Rule: Retry on transient errors
-    judge.add_rule(
-        EvaluationRule(
-            id="transient_error_retry",
-            description="Transient error that can be retried",
-            condition=(
-                "isinstance(result, dict) and "
-                "result.get('error_type') in ['timeout', 'rate_limit', 'connection_error']"
-            ),
-            action=JudgmentAction.RETRY,
-            feedback_template="Transient error: {result[error]}. Please retry.",
-            priority=90,
-        )
-    )
-
-    # Rule: Replan on missing data
-    judge.add_rule(
-        EvaluationRule(
-            id="missing_data_replan",
-            description="Required data not available",
-            condition="isinstance(result, dict) and result.get('error_type') == 'missing_data'",
-            action=JudgmentAction.REPLAN,
-            feedback_template="Missing required data: {result[error]}. Plan needs adjustment.",
-            priority=80,
-        )
-    )
-
-    # Rule: Escalate on security issues
-    judge.add_rule(
-        EvaluationRule(
-            id="security_escalate",
-            description="Security issue detected",
-            condition="isinstance(result, dict) and result.get('error_type') == 'security'",
-            action=JudgmentAction.ESCALATE,
-            feedback_template="Security issue detected: {result[error]}",
-            priority=200,
-        )
-    )
-
-    # Rule: Fail on max retries exceeded
-    judge.add_rule(
-        EvaluationRule(
-            id="max_retries_fail",
-            description="Maximum retries exceeded",
-            condition="step.get('attempts', 0) >= step.get('max_retries', 3)",
-            action=JudgmentAction.REPLAN,
-            feedback_template="Step '{step[id]}' failed after {step[attempts]} attempts",
-            priority=150,
-        )
-    )
-
-    return judge
--- a/Show More
+++ b/Show More