Compare commits
30 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| be8ec867e5 | |||
| b2ba42e541 | |||
| 94d0038e03 | |||
| e1bf300e3c | |||
| c6b922e831 | |||
| 71d12a7904 | |||
| 24c25d408c | |||
| 2e99fc9fe5 | |||
| c1f066b8ba | |||
| e7a6074800 | |||
| 719942d29a | |||
| 190450a2b2 | |||
| 44d609b719 | |||
| 8c9892f9f6 | |||
| 6ade844722 | |||
| b9a3c67fea | |||
| 219bbe00fc | |||
| ef6af5404f | |||
| b7d57f3d49 | |||
| 58c892babb | |||
| 9e2004e33b | |||
| b8be3056ed | |||
| 39029b82d6 | |||
| 232890b970 | |||
| 13a8e28ae2 | |||
| 34a44aa83c | |||
| 8468c45dc2 | |||
| d2c3649566 | |||
| 71226d9625 | |||
| 9102328d1c |
@@ -1,9 +0,0 @@
|
||||
{
|
||||
"mcpServers": {
|
||||
"agent-builder": {
|
||||
"command": "uv",
|
||||
"args": ["run", "--directory", "core", "-m", "framework.mcp.agent_builder_server"],
|
||||
"disabled": false
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-concepts
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-create
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-credentials
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-patterns
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-test
|
||||
@@ -1,5 +0,0 @@
|
||||
---
|
||||
description: hive-concepts
|
||||
---
|
||||
|
||||
use hive-concepts skill
|
||||
@@ -1,5 +0,0 @@
|
||||
---
|
||||
description: hive-create
|
||||
---
|
||||
|
||||
use hive-create skill
|
||||
@@ -1,5 +0,0 @@
|
||||
---
|
||||
description: hive-credentials
|
||||
---
|
||||
|
||||
use hive-credentials skill
|
||||
@@ -1,5 +0,0 @@
|
||||
---
|
||||
description: hive-patterns
|
||||
---
|
||||
|
||||
use hive-patterns skill
|
||||
@@ -1,5 +0,0 @@
|
||||
---
|
||||
description: hive-test
|
||||
---
|
||||
|
||||
use hive-test skill
|
||||
@@ -1,5 +0,0 @@
|
||||
---
|
||||
description: hive
|
||||
---
|
||||
|
||||
use hive skill
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-concepts
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-create
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-credentials
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-patterns
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-test
|
||||
@@ -1,399 +0,0 @@
|
||||
---
|
||||
name: hive-concepts
|
||||
description: Core concepts for goal-driven agents - architecture, node types (event_loop, function), tool discovery, and workflow overview. Use when starting agent development or need to understand agent fundamentals.
|
||||
license: Apache-2.0
|
||||
metadata:
|
||||
author: hive
|
||||
version: "2.0"
|
||||
type: foundational
|
||||
part_of: hive
|
||||
---
|
||||
|
||||
# Building Agents - Core Concepts
|
||||
|
||||
Foundational knowledge for building goal-driven agents as Python packages.
|
||||
|
||||
## Architecture: Python Services (Not JSON Configs)
|
||||
|
||||
Agents are built as Python packages:
|
||||
|
||||
```
|
||||
exports/my_agent/
|
||||
├── __init__.py # Package exports
|
||||
├── __main__.py # CLI (run, info, validate, shell)
|
||||
├── agent.py # Graph construction (goal, edges, agent class)
|
||||
├── nodes/__init__.py # Node definitions (NodeSpec)
|
||||
├── config.py # Runtime config
|
||||
└── README.md # Documentation
|
||||
```
|
||||
|
||||
**Key Principle: Agent is visible and editable during build**
|
||||
|
||||
- Files created immediately as components are approved
|
||||
- User can watch files grow in their editor
|
||||
- No session state - just direct file writes
|
||||
- No "export" step - agent is ready when build completes
|
||||
|
||||
## Core Concepts
|
||||
|
||||
### Goal
|
||||
|
||||
Success criteria and constraints (written to agent.py)
|
||||
|
||||
```python
|
||||
goal = Goal(
|
||||
id="research-goal",
|
||||
name="Technical Research Agent",
|
||||
description="Research technical topics thoroughly",
|
||||
success_criteria=[
|
||||
SuccessCriterion(
|
||||
id="completeness",
|
||||
description="Cover all aspects of topic",
|
||||
metric="coverage_score",
|
||||
target=">=0.9",
|
||||
weight=0.4,
|
||||
),
|
||||
# 3-5 success criteria total
|
||||
],
|
||||
constraints=[
|
||||
Constraint(
|
||||
id="accuracy",
|
||||
description="All information must be verified",
|
||||
constraint_type="hard",
|
||||
category="quality",
|
||||
),
|
||||
# 1-5 constraints total
|
||||
],
|
||||
)
|
||||
```
|
||||
|
||||
### Node
|
||||
|
||||
Unit of work (written to nodes/__init__.py)
|
||||
|
||||
**Node Types:**
|
||||
|
||||
- `event_loop` — Multi-turn streaming loop with tool execution and judge-based evaluation. Works with or without tools.
|
||||
- `function` — Deterministic Python operations. No LLM involved.
|
||||
|
||||
```python
|
||||
search_node = NodeSpec(
|
||||
id="search-web",
|
||||
name="Search Web",
|
||||
description="Search for information and extract results",
|
||||
node_type="event_loop",
|
||||
input_keys=["query"],
|
||||
output_keys=["search_results"],
|
||||
system_prompt="Search the web for: {query}. Use the web_search tool to find results, then call set_output to store them.",
|
||||
tools=["web_search"],
|
||||
)
|
||||
```
|
||||
|
||||
**NodeSpec Fields for Event Loop Nodes:**
|
||||
|
||||
| Field | Default | Description |
|
||||
|-------|---------|-------------|
|
||||
| `client_facing` | `False` | If True, streams output to user and blocks for input between turns |
|
||||
| `nullable_output_keys` | `[]` | Output keys that may remain unset (for mutually exclusive outputs) |
|
||||
| `max_node_visits` | `1` | Max times this node executes per run. Set >1 for feedback loop targets |
|
||||
|
||||
### Edge
|
||||
|
||||
Connection between nodes (written to agent.py)
|
||||
|
||||
**Edge Conditions:**
|
||||
|
||||
- `on_success` — Proceed if node succeeds (most common)
|
||||
- `on_failure` — Handle errors
|
||||
- `always` — Always proceed
|
||||
- `conditional` — Based on expression evaluating node output
|
||||
|
||||
**Edge Priority:**
|
||||
|
||||
Priority controls evaluation order when multiple edges leave the same node. Higher priority edges are evaluated first. Use negative priority for feedback edges (edges that loop back to earlier nodes).
|
||||
|
||||
```python
|
||||
# Forward edge (evaluated first)
|
||||
EdgeSpec(
|
||||
id="review-to-campaign",
|
||||
source="review",
|
||||
target="campaign-builder",
|
||||
condition=EdgeCondition.CONDITIONAL,
|
||||
condition_expr="output.get('approved_contacts') is not None",
|
||||
priority=1,
|
||||
)
|
||||
|
||||
# Feedback edge (evaluated after forward edges)
|
||||
EdgeSpec(
|
||||
id="review-feedback",
|
||||
source="review",
|
||||
target="extractor",
|
||||
condition=EdgeCondition.CONDITIONAL,
|
||||
condition_expr="output.get('redo_extraction') is not None",
|
||||
priority=-1,
|
||||
)
|
||||
```
|
||||
|
||||
### Client-Facing Nodes
|
||||
|
||||
For multi-turn conversations with the user, set `client_facing=True` on a node. The node will:
|
||||
- Stream its LLM output directly to the end user
|
||||
- Block for user input between conversational turns
|
||||
- Resume when new input is injected via `inject_event()`
|
||||
|
||||
```python
|
||||
intake_node = NodeSpec(
|
||||
id="intake",
|
||||
name="Intake",
|
||||
description="Gather requirements from the user",
|
||||
node_type="event_loop",
|
||||
client_facing=True,
|
||||
input_keys=[],
|
||||
output_keys=["repo_url", "project_url"],
|
||||
system_prompt="You are the intake agent. Ask the user for the repo URL and project URL.",
|
||||
)
|
||||
```
|
||||
|
||||
> **Legacy Note:** The old `pause_nodes` / `entry_points` pattern still works but `client_facing=True` is preferred for new agents.
|
||||
|
||||
**STEP 1 / STEP 2 Prompt Pattern:** For client-facing nodes, structure the system prompt with two explicit phases:
|
||||
|
||||
```python
|
||||
system_prompt="""\
|
||||
**STEP 1 — Respond to the user (text only, NO tool calls):**
|
||||
[Present information, ask questions, etc.]
|
||||
|
||||
**STEP 2 — After the user responds, call set_output:**
|
||||
[Call set_output with the structured outputs]
|
||||
"""
|
||||
```
|
||||
|
||||
This prevents the LLM from calling `set_output` prematurely before the user has had a chance to respond.
|
||||
|
||||
### Node Design: Fewer, Richer Nodes
|
||||
|
||||
Prefer fewer nodes that do more work over many thin single-purpose nodes:
|
||||
|
||||
- **Bad**: 8 thin nodes (parse query → search → fetch → evaluate → synthesize → write → check → save)
|
||||
- **Good**: 4 rich nodes (intake → research → review → report)
|
||||
|
||||
Why: Each node boundary requires serializing outputs and passing context. Fewer nodes means the LLM retains full context of its work within the node. A research node that searches, fetches, and analyzes keeps all the source material in its conversation history.
|
||||
|
||||
### nullable_output_keys for Cross-Edge Inputs
|
||||
|
||||
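When a node receives inputs that only arrive on certain edges (e.g., `feedback` only comes from the review → research feedback loop, not from intake → research), list those keys in `nullable_output_keys` in addition to `input_keys`, so that they are allowed to remain unset on visits where that edge was not taken: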
|
||||
|
||||
```python
|
||||
research_node = NodeSpec(
|
||||
id="research",
|
||||
input_keys=["research_brief", "feedback"],
|
||||
nullable_output_keys=["feedback"], # Not present on first visit
|
||||
max_node_visits=3,
|
||||
...
|
||||
)
|
||||
```
|
||||
|
||||
## Event Loop Architecture Concepts
|
||||
|
||||
### How EventLoopNode Works
|
||||
|
||||
An event loop node runs a multi-turn loop:
|
||||
1. LLM receives system prompt + conversation history
|
||||
2. LLM responds (text and/or tool calls)
|
||||
3. Tool calls are executed, results added to conversation
|
||||
4. Judge evaluates: ACCEPT (exit loop), RETRY (loop again), or ESCALATE
|
||||
5. Repeat until judge ACCEPTs or max_iterations reached
|
||||
|
||||
### EventLoopNode Runtime
|
||||
|
||||
EventLoopNodes are **auto-created** by `GraphExecutor` at runtime. You do NOT need to manually register them. Both `GraphExecutor` (direct) and `AgentRuntime` / `create_agent_runtime()` handle event_loop nodes automatically.
|
||||
|
||||
```python
|
||||
# Direct execution — executor auto-creates EventLoopNodes
|
||||
from framework.graph.executor import GraphExecutor
|
||||
from framework.runtime.core import Runtime
|
||||
|
||||
runtime = Runtime(storage_path)
|
||||
executor = GraphExecutor(
|
||||
runtime=runtime,
|
||||
llm=llm,
|
||||
tools=tools,
|
||||
tool_executor=tool_executor,
|
||||
storage_path=storage_path,
|
||||
)
|
||||
result = await executor.execute(graph=graph, goal=goal, input_data=input_data)
|
||||
|
||||
# TUI execution — AgentRuntime also works
|
||||
from framework.runtime.agent_runtime import create_agent_runtime
|
||||
runtime = create_agent_runtime(
|
||||
graph=graph, goal=goal, storage_path=storage_path,
|
||||
entry_points=[...], llm=llm, tools=tools, tool_executor=tool_executor,
|
||||
)
|
||||
```
|
||||
|
||||
### set_output
|
||||
|
||||
Nodes produce structured outputs by calling `set_output(key, value)` — a synthetic tool injected by the framework. When the LLM calls `set_output`, the value is stored in the output accumulator and made available to downstream nodes via shared memory.
|
||||
|
||||
`set_output` is NOT a real tool — it is excluded from `real_tool_results`. For client-facing nodes, this means a turn where the LLM only calls `set_output` (no other tools) is treated as a conversational boundary and will block for user input.
|
||||
|
||||
### JudgeProtocol
|
||||
|
||||
**The judge is the SOLE mechanism for acceptance decisions.** Do not add ad-hoc framework gating, output rollback, or premature rejection logic. If the LLM calls `set_output` too early, fix it with better prompts or a custom judge — not framework-level guards.
|
||||
|
||||
The judge controls when a node's loop exits:
|
||||
- **Implicit judge** (default, no judge configured): ACCEPTs when the LLM finishes with no tool calls and all required output keys are set
|
||||
- **SchemaJudge**: Validates outputs against a Pydantic model
|
||||
- **Custom judges**: Implement `evaluate(context) -> JudgeVerdict`
|
||||
|
||||
### LoopConfig
|
||||
|
||||
Controls loop behavior:
|
||||
- `max_iterations` (default 50) — prevents infinite loops
|
||||
- `max_tool_calls_per_turn` (default 10) — limits tool calls per LLM response
|
||||
- `tool_call_overflow_margin` (default 0.5) — fractional headroom before extra tool calls are discarded (the default 0.5, i.e. 50%, means a hard cutoff at 150% of the limit)
|
||||
- `stall_detection_threshold` (default 3) — detects repeated identical responses
|
||||
- `max_history_tokens` (default 32000) — triggers conversation compaction
|
||||
|
||||
### Data Tools (Spillover Management)
|
||||
|
||||
When tool results exceed the context window, the framework automatically saves them to a spillover directory and truncates with a hint. Nodes that produce or consume large data should include the data tools:
|
||||
|
||||
- `save_data(filename, data)` — Write data to a file in the data directory
|
||||
- `load_data(filename, offset=0, limit=50)` — Read data with line-based pagination
|
||||
- `list_data_files()` — List available data files
|
||||
- `serve_file_to_user(filename, label="")` — Get a clickable file:// URI for the user
|
||||
|
||||
Note: `data_dir` is a framework-injected context parameter — the LLM never sees or passes it. `GraphExecutor.execute()` sets it per-execution via `contextvars`, so data tools and spillover always share the same session-scoped directory.
|
||||
|
||||
These are real MCP tools (not synthetic). Add them to nodes that handle large tool results:
|
||||
|
||||
```python
|
||||
research_node = NodeSpec(
|
||||
...
|
||||
tools=["web_search", "web_scrape", "load_data", "save_data", "list_data_files"],
|
||||
)
|
||||
```
|
||||
|
||||
### Fan-Out / Fan-In
|
||||
|
||||
Multiple ON_SUCCESS edges from the same source create parallel execution. All branches run concurrently via `asyncio.gather()`. Parallel event_loop nodes must have disjoint `output_keys`.
|
||||
|
||||
### max_node_visits
|
||||
|
||||
Controls how many times a node can execute in one graph run. The default is 1. Set it higher for nodes that are targets of feedback edges (review-reject loops). Set it to 0 for unlimited visits (still guarded by `max_steps`).
|
||||
|
||||
## Tool Discovery & Validation
|
||||
|
||||
**CRITICAL:** Before adding a node with tools, you MUST verify the tools exist.
|
||||
|
||||
Tools are provided by MCP servers. Never assume a tool exists - always discover dynamically.
|
||||
|
||||
### Step 1: Register MCP Server (if not already done)
|
||||
|
||||
```python
|
||||
mcp__agent-builder__add_mcp_server(
|
||||
name="tools",
|
||||
transport="stdio",
|
||||
command="python",
|
||||
args='["mcp_server.py", "--stdio"]',
|
||||
cwd="../tools"
|
||||
)
|
||||
```
|
||||
|
||||
### Step 2: Discover Available Tools
|
||||
|
||||
```python
|
||||
# List all tools from all registered servers
|
||||
mcp__agent-builder__list_mcp_tools()
|
||||
|
||||
# Or list tools from a specific server
|
||||
mcp__agent-builder__list_mcp_tools(server_name="tools")
|
||||
```
|
||||
|
||||
### Step 3: Validate Before Adding Nodes
|
||||
|
||||
Before writing a node with `tools=[...]`:
|
||||
|
||||
1. Call `list_mcp_tools()` to get available tools
|
||||
2. Check each tool in your node exists in the response
|
||||
3. If a tool doesn't exist:
|
||||
- **DO NOT proceed** with the node
|
||||
- Inform the user: "The tool 'X' is not available. Available tools are: ..."
|
||||
- Ask if they want to use an alternative or proceed without the tool
|
||||
|
||||
### Tool Validation Anti-Patterns
|
||||
|
||||
- **Never assume a tool exists** - always call `list_mcp_tools()` first
|
||||
- **Never write a node with unverified tools** - validate before writing
|
||||
- **Never silently drop tools** - if a tool doesn't exist, inform the user
|
||||
- **Never guess tool names** - use exact names from discovery response
|
||||
|
||||
## Workflow Overview: Incremental File Construction
|
||||
|
||||
```
|
||||
1. CREATE PACKAGE → mkdir + write skeletons
|
||||
2. DEFINE GOAL → Write to agent.py + config.py
|
||||
3. FOR EACH NODE:
|
||||
- Propose design (event_loop for LLM work, function for deterministic)
|
||||
- User approves
|
||||
- Write to nodes/__init__.py IMMEDIATELY
|
||||
- (Optional) Validate with test_node
|
||||
4. CONNECT EDGES → Update agent.py
|
||||
- Use priority for feedback edges (negative priority)
|
||||
- (Optional) Validate with validate_graph
|
||||
5. FINALIZE → Write agent class to agent.py
|
||||
6. DONE - Agent ready at exports/my_agent/
|
||||
```
|
||||
|
||||
**Files are written immediately. MCP tools are optional and are used only for validation and testing bookkeeping.**
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use hive-concepts when:
|
||||
- Starting a new agent project and need to understand fundamentals
|
||||
- Need to understand agent architecture before building
|
||||
- Want to validate tool availability before proceeding
|
||||
- Learning about node types, edges, and graph execution
|
||||
|
||||
**Next Steps:**
|
||||
- Ready to build? → Use `hive-create` skill
|
||||
- Need patterns and examples? → Use `hive-patterns` skill
|
||||
|
||||
## MCP Tools for Validation
|
||||
|
||||
After writing files, optionally use MCP tools for validation:
|
||||
|
||||
**test_node** - Validate node configuration with mock inputs
|
||||
```python
|
||||
mcp__agent-builder__test_node(
|
||||
node_id="search-web",
|
||||
test_input='{"query": "test query"}',
|
||||
mock_llm_response='{"results": "mock output"}'
|
||||
)
|
||||
```
|
||||
|
||||
**validate_graph** - Check graph structure
|
||||
```python
|
||||
mcp__agent-builder__validate_graph()
|
||||
# Returns: unreachable nodes, missing connections, event_loop validation, etc.
|
||||
```
|
||||
|
||||
**configure_loop** - Set event loop parameters
|
||||
```python
|
||||
mcp__agent-builder__configure_loop(
|
||||
max_iterations=50,
|
||||
max_tool_calls_per_turn=10,
|
||||
stall_detection_threshold=3,
|
||||
max_history_tokens=32000
|
||||
)
|
||||
```
|
||||
|
||||
**Key Point:** Files are written FIRST. MCP tools are for validation only.
|
||||
|
||||
## Related Skills
|
||||
|
||||
- **hive-create** - Step-by-step building process
|
||||
- **hive-patterns** - Best practices: judges, feedback edges, fan-out, context management
|
||||
- **hive** - Complete workflow orchestrator
|
||||
- **hive-test** - Test and validate completed agents
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,24 +0,0 @@
|
||||
"""
|
||||
Deep Research Agent - Interactive, rigorous research with TUI conversation.
|
||||
|
||||
Research any topic through multi-source web search, quality evaluation,
|
||||
and synthesis. Features client-facing TUI interaction at key checkpoints
|
||||
for user guidance and iterative deepening.
|
||||
"""
|
||||
|
||||
from .agent import DeepResearchAgent, default_agent, goal, nodes, edges
|
||||
from .config import RuntimeConfig, AgentMetadata, default_config, metadata
|
||||
|
||||
# Package version string.
__version__ = "1.0.0"

# Public API of the package: the agent pieces first, then the configuration pieces.
__all__ = [
    # re-exported from .agent
    "DeepResearchAgent", "default_agent", "goal", "nodes", "edges",
    # re-exported from .config
    "RuntimeConfig", "AgentMetadata", "default_config", "metadata",
]
|
||||
@@ -1,241 +0,0 @@
|
||||
"""
|
||||
CLI entry point for Deep Research Agent.
|
||||
|
||||
Uses AgentRuntime for multi-entrypoint support with HITL pause/resume.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import click
|
||||
|
||||
from .agent import default_agent, DeepResearchAgent
|
||||
|
||||
|
||||
def setup_logging(verbose=False, debug=False):
    """Set up stderr logging at a verbosity chosen from the CLI flags.

    ``debug`` takes precedence over ``verbose``; when neither is set only
    warnings and above are shown. The "framework" logger is set to the same
    level as the root configuration.
    """
    if debug:
        chosen = (logging.DEBUG, "%(asctime)s %(name)s: %(message)s")
    elif verbose:
        chosen = (logging.INFO, "%(message)s")
    else:
        chosen = (logging.WARNING, "%(levelname)s: %(message)s")

    log_level, log_format = chosen
    logging.basicConfig(stream=sys.stderr, format=log_format, level=log_level)

    framework_logger = logging.getLogger("framework")
    framework_logger.setLevel(log_level)
|
||||
|
||||
|
||||
# Root click command group; the sub-commands below attach themselves to it.
@click.group()
@click.version_option(version="1.0.0")
def cli():
    """Deep Research Agent - Interactive, rigorous research with TUI conversation."""
|
||||
|
||||
|
||||
@cli.command()
@click.option("--topic", "-t", type=str, required=True, help="Research topic")
@click.option("--mock", is_flag=True, help="Run in mock mode")
@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def run(topic, mock, quiet, verbose, debug):
    """Execute research on a topic."""
    # In quiet mode logging is left unconfigured so only the JSON is echoed.
    if not quiet:
        setup_logging(verbose=verbose, debug=debug)

    # Run the default agent to completion with the topic as its only input.
    outcome = asyncio.run(default_agent.run({"topic": topic}, mock_mode=mock))

    payload = {
        "success": outcome.success,
        "steps_executed": outcome.steps_executed,
        "output": outcome.output,
    }
    # The "error" key is only present when the run reported one.
    if outcome.error:
        payload["error"] = outcome.error

    click.echo(json.dumps(payload, indent=2, default=str))

    # Process exit status mirrors the run's success flag.
    exit_code = 0 if outcome.success else 1
    sys.exit(exit_code)
|
||||
|
||||
|
||||
@cli.command()
@click.option("--mock", is_flag=True, help="Run in mock mode")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def tui(mock, verbose, debug):
    """Launch the TUI dashboard for interactive research."""
    setup_logging(verbose=verbose, debug=debug)

    # The TUI module is imported lazily so the other commands work without it.
    try:
        from framework.tui.app import AdenTUI
    except ImportError:
        click.echo(
            "TUI requires the 'textual' package. Install with: pip install textual"
        )
        sys.exit(1)

    from pathlib import Path

    from framework.llm import LiteLLMProvider
    from framework.runner.tool_registry import ToolRegistry
    from framework.runtime.agent_runtime import create_agent_runtime
    from framework.runtime.event_bus import EventBus
    from framework.runtime.execution_stream import EntryPointSpec

    async def _launch():
        # Fresh agent with its own event bus and tool registry.
        agent = DeepResearchAgent()
        agent._event_bus = EventBus()
        registry = ToolRegistry()
        agent._tool_registry = registry

        # Storage lives under the user's home directory; create it on first use.
        agent_home = Path.home().joinpath(".hive", "agents", "deep_research_agent")
        agent_home.mkdir(parents=True, exist_ok=True)

        # Load MCP server definitions that sit next to this module, if any.
        mcp_config = Path(__file__).with_name("mcp_servers.json")
        if mcp_config.exists():
            registry.load_mcp_config(mcp_config)

        # Mock mode runs without an LLM provider.
        if mock:
            provider = None
        else:
            provider = LiteLLMProvider(
                model=agent.config.model,
                api_key=agent.config.api_key,
                api_base=agent.config.api_base,
            )

        available_tools = [*registry.get_tools().values()]
        executor = registry.get_executor()
        graph_spec = agent._build_graph()

        start_entry = EntryPointSpec(
            id="start",
            name="Start Research",
            entry_node="intake",
            trigger_type="manual",
            isolation_level="isolated",
        )
        agent_runtime = create_agent_runtime(
            graph=graph_spec,
            goal=agent.goal,
            storage_path=agent_home,
            entry_points=[start_entry],
            llm=provider,
            tools=available_tools,
            tool_executor=executor,
        )

        await agent_runtime.start()
        try:
            dashboard = AdenTUI(agent_runtime)
            await dashboard.run_async()
        finally:
            # Always stop the runtime, even if the TUI exits with an error.
            await agent_runtime.stop()

    asyncio.run(_launch())
|
||||
|
||||
|
||||
@cli.command()
@click.option("--json", "output_json", is_flag=True)
def info(output_json):
    """Show agent information."""
    details = default_agent.info()

    # Machine-readable form: dump the whole info mapping and stop.
    if output_json:
        click.echo(json.dumps(details, indent=2))
        return

    # Human-readable form: one labelled line per field.
    click.echo(f"Agent: {details['name']}")
    click.echo(f"Version: {details['version']}")
    click.echo(f"Description: {details['description']}")
    click.echo(f"\nNodes: {', '.join(details['nodes'])}")
    click.echo(f"Client-facing: {', '.join(details['client_facing_nodes'])}")
    click.echo(f"Entry: {details['entry_node']}")
    click.echo(f"Terminal: {', '.join(details['terminal_nodes'])}")
|
||||
|
||||
|
||||
@cli.command()
def validate():
    """Validate agent structure."""
    report = default_agent.validate()

    # Failure path first: list every error and exit non-zero.
    if not report["valid"]:
        click.echo("Agent has errors:")
        for problem in report["errors"]:
            click.echo(f" ERROR: {problem}")
        sys.exit(1)

    # Success path: confirm validity, then surface any warnings.
    click.echo("Agent is valid")
    for note in report["warnings"] or []:
        click.echo(f" WARNING: {note}")
    sys.exit(0)
|
||||
|
||||
|
||||
@cli.command()
@click.option("--verbose", "-v", is_flag=True)
def shell(verbose):
    """Interactive research session (CLI, no TUI)."""
    # Drive the async prompt loop to completion on a fresh event loop.
    session = _interactive_shell(verbose)
    asyncio.run(session)
|
||||
|
||||
|
||||
async def _interactive_shell(verbose=False):
    """Run a prompt/response research loop on the terminal.

    Starts a ``DeepResearchAgent``, then repeatedly reads a topic from stdin,
    triggers the agent's "start" entry point with it, and prints the returned
    report and references. The agent is always stopped when the loop ends.

    The loop ends when the user types ``quit``/``exit``/``q`` (surrounding
    whitespace and case are ignored), presses Ctrl-C, or stdin reaches
    end-of-file.

    Args:
        verbose: Forwarded to ``setup_logging``.
    """
    setup_logging(verbose=verbose)

    click.echo("=== Deep Research Agent ===")
    click.echo("Enter a topic to research (or 'quit' to exit):\n")

    agent = DeepResearchAgent()
    await agent.start()

    try:
        # We are inside a coroutine, so the running loop is the right one to
        # hand the blocking input() call to.
        loop = asyncio.get_running_loop()

        while True:
            try:
                # input() blocks, so run it in the default executor.
                raw_topic = await loop.run_in_executor(None, input, "Topic> ")
                topic = raw_topic.strip()

                if topic.lower() in ("quit", "exit", "q"):
                    click.echo("Goodbye!")
                    break

                if not topic:
                    continue

                click.echo("\nResearching...\n")

                result = await agent.trigger_and_wait("start", {"topic": topic})

                if result is None:
                    click.echo("\n[Execution timed out]\n")
                    continue

                if not result.success:
                    click.echo(f"\nResearch failed: {result.error}\n")
                    continue

                output = result.output
                if "report_content" in output:
                    click.echo("\n--- Report ---\n")
                    click.echo(output["report_content"])
                    click.echo("\n")
                if "references" in output:
                    click.echo("--- References ---\n")
                    for ref in output["references"] or []:
                        click.echo(
                            f" [{ref.get('number', '?')}] {ref.get('title', '')} - {ref.get('url', '')}"
                        )
                    click.echo("\n")

            except (KeyboardInterrupt, EOFError):
                # EOFError: stdin was closed (Ctrl-D / piped input exhausted).
                # Without this, every later input() would raise again and the
                # generic handler below would loop forever.
                click.echo("\nGoodbye!")
                break
            except Exception as e:
                # Best effort: report the failure and keep the session alive.
                click.echo(f"Error: {e}", err=True)
                import traceback

                traceback.print_exc()
    finally:
        await agent.stop()
|
||||
|
||||
|
||||
# Run the click command group when this module is executed as a script.
if __name__ == "__main__":
    cli()
|
||||
@@ -1,358 +0,0 @@
|
||||
"""Agent graph construction for Deep Research Agent."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from framework.graph import EdgeSpec, EdgeCondition, Goal, SuccessCriterion, Constraint
|
||||
from framework.graph.edge import GraphSpec
|
||||
from framework.graph.executor import ExecutionResult
|
||||
from framework.graph.checkpoint_config import CheckpointConfig
|
||||
from framework.llm import LiteLLMProvider
|
||||
from framework.runner.tool_registry import ToolRegistry
|
||||
from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
|
||||
from framework.runtime.execution_stream import EntryPointSpec
|
||||
|
||||
from .config import default_config, metadata
|
||||
from .nodes import (
|
||||
intake_node,
|
||||
research_node,
|
||||
review_node,
|
||||
report_node,
|
||||
)
|
||||
|
||||
# Goal definition
# Four equally weighted success criteria (0.25 each).
_success_criteria = [
    SuccessCriterion(
        id="source-diversity",
        description="Use multiple diverse, authoritative sources",
        metric="source_count", target=">=5", weight=0.25,
    ),
    SuccessCriterion(
        id="citation-coverage",
        description="Every factual claim in the report cites its source",
        metric="citation_coverage", target="100%", weight=0.25,
    ),
    SuccessCriterion(
        id="user-satisfaction",
        description="User reviews findings before report generation",
        metric="user_approval", target="true", weight=0.25,
    ),
    SuccessCriterion(
        id="report-completeness",
        description="Final report answers the original research questions",
        metric="question_coverage", target="90%", weight=0.25,
    ),
]

# Two accuracy constraints plus one interaction constraint.
_constraints = [
    Constraint(
        id="no-hallucination",
        description="Only include information found in fetched sources",
        constraint_type="quality", category="accuracy",
    ),
    Constraint(
        id="source-attribution",
        description="Every claim must cite its source with a numbered reference",
        constraint_type="quality", category="accuracy",
    ),
    Constraint(
        id="user-checkpoint",
        description="Present findings to the user before writing the final report",
        constraint_type="functional", category="interaction",
    ),
]

goal = Goal(
    id="rigorous-interactive-research",
    name="Rigorous Interactive Research",
    description=(
        "Research any topic by searching diverse sources, analyzing findings, "
        "and producing a cited report — with user checkpoints to guide direction."
    ),
    success_criteria=_success_criteria,
    constraints=_constraints,
)
|
||||
|
||||
# Node list, in pipeline order: intake, research, review, report.
nodes = [intake_node, research_node, review_node, report_node]
|
||||
|
||||
# Edge definitions: routing between the four nodes.
edges = [
    # intake -> research
    EdgeSpec(
        id="intake-to-research", source="intake", target="research",
        condition=EdgeCondition.ON_SUCCESS, priority=1,
    ),
    # research -> review
    EdgeSpec(
        id="research-to-review", source="research", target="review",
        condition=EdgeCondition.ON_SUCCESS, priority=1,
    ),
    # review -> research (feedback loop)
    # NOTE(review): the two edges leaving `review` test complementary values of
    # needs_more_research, so priority presumably only affects evaluation
    # order here; confirm against the executor.
    EdgeSpec(
        id="review-to-research-feedback", source="review", target="research",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="needs_more_research == True",
        priority=1,
    ),
    # review -> report (user satisfied)
    EdgeSpec(
        id="review-to-report", source="review", target="report",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="needs_more_research == False",
        priority=2,
    ),
    # report -> research (user wants deeper research on current topic)
    EdgeSpec(
        id="report-to-research", source="report", target="research",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="str(next_action).lower() == 'more_research'",
        priority=2,
    ),
    # report -> intake (user wants a new topic — default when not more_research)
    EdgeSpec(
        id="report-to-intake", source="report", target="intake",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="str(next_action).lower() != 'more_research'",
        priority=1,
    ),
]
|
||||
|
||||
# Graph configuration
# Execution starts at the intake node; the single "start" entry point maps to it.
entry_node = "intake"
entry_points = {"start": entry_node}
# No pause nodes and no terminal nodes are declared for this graph.
pause_nodes = list()
terminal_nodes = list()
|
||||
|
||||
|
||||
class DeepResearchAgent:
    """
    Deep Research Agent — 4-node pipeline with user checkpoints.

    Flow: intake -> research -> review -> report
                        ^          |
                        +-- feedback loop (if user wants more)

    Uses AgentRuntime for proper session management:
    - Session-scoped storage (sessions/{session_id}/)
    - Checkpointing for resume capability
    - Runtime logging
    - Data folder for save_data/load_data
    """

    def __init__(self, config=None):
        """Bind the module-level graph definition to this instance.

        Args:
            config: Runtime configuration. Falls back to the module-level
                ``default_config`` when omitted (or falsy).
        """
        self.config = config or default_config
        self.goal = goal
        self.nodes = nodes
        self.edges = edges
        self.entry_node = entry_node
        self.entry_points = entry_points
        self.pause_nodes = pause_nodes
        self.terminal_nodes = terminal_nodes
        # Runtime objects are created lazily by _setup().
        self._graph: GraphSpec | None = None
        self._agent_runtime: AgentRuntime | None = None
        self._tool_registry: ToolRegistry | None = None
        self._storage_path: Path | None = None

    def _build_graph(self) -> GraphSpec:
        """Build the GraphSpec from this instance's nodes, edges and config."""
        return GraphSpec(
            id="deep-research-agent-graph",
            goal_id=self.goal.id,
            version="1.0.0",
            entry_node=self.entry_node,
            entry_points=self.entry_points,
            terminal_nodes=self.terminal_nodes,
            pause_nodes=self.pause_nodes,
            nodes=self.nodes,
            edges=self.edges,
            default_model=self.config.model,
            max_tokens=self.config.max_tokens,
            loop_config={
                "max_iterations": 100,
                "max_tool_calls_per_turn": 30,
                "max_history_tokens": 32000,
            },
            conversation_mode="continuous",
            identity_prompt=(
                "You are a rigorous research agent. You search for information "
                "from diverse, authoritative sources, analyze findings critically, "
                "and produce well-cited reports. You never fabricate information — "
                "every claim must trace back to a source you actually retrieved."
            ),
        )

    def _setup(self, mock_mode=False) -> None:
        """Set up the agent runtime with sessions, checkpoints, and logging.

        Args:
            mock_mode: When true, no LLM provider is created and the runtime
                is built with ``llm=None``.
        """
        self._storage_path = Path.home() / ".hive" / "agents" / "deep_research_agent"
        self._storage_path.mkdir(parents=True, exist_ok=True)

        self._tool_registry = ToolRegistry()

        # The MCP server config is optional; it is loaded only if it sits
        # next to this module.
        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
        if mcp_config_path.exists():
            self._tool_registry.load_mcp_config(mcp_config_path)

        llm = None
        if not mock_mode:
            llm = LiteLLMProvider(
                model=self.config.model,
                api_key=self.config.api_key,
                api_base=self.config.api_base,
            )

        tool_executor = self._tool_registry.get_executor()
        tools = list(self._tool_registry.get_tools().values())

        self._graph = self._build_graph()

        checkpoint_config = CheckpointConfig(
            enabled=True,
            checkpoint_on_node_start=False,
            checkpoint_on_node_complete=True,
            checkpoint_max_age_days=7,
            async_checkpoint=True,
        )

        entry_point_specs = [
            EntryPointSpec(
                id="default",
                name="Default",
                entry_node=self.entry_node,
                trigger_type="manual",
                isolation_level="shared",
            )
        ]

        self._agent_runtime = create_agent_runtime(
            graph=self._graph,
            goal=self.goal,
            storage_path=self._storage_path,
            entry_points=entry_point_specs,
            llm=llm,
            tools=tools,
            tool_executor=tool_executor,
            checkpoint_config=checkpoint_config,
        )

    async def start(self, mock_mode=False) -> None:
        """Set up (if needed) and start the agent runtime.

        Args:
            mock_mode: Forwarded to ``_setup`` when the runtime has not been
                created yet; ignored on subsequent calls.
        """
        if self._agent_runtime is None:
            self._setup(mock_mode=mock_mode)
        if not self._agent_runtime.is_running:
            await self._agent_runtime.start()

    async def stop(self) -> None:
        """Stop the agent runtime and clean up."""
        if self._agent_runtime and self._agent_runtime.is_running:
            await self._agent_runtime.stop()
        # Drop the reference so a later start() builds a fresh runtime.
        self._agent_runtime = None

    async def trigger_and_wait(
        self,
        entry_point: str = "default",
        input_data: dict | None = None,
        timeout: float | None = None,
        session_state: dict | None = None,
    ) -> ExecutionResult | None:
        """Execute the graph and wait for completion.

        Args:
            entry_point: Id of the entry point to trigger.
            input_data: Initial input for the run; ``None`` becomes ``{}``.
            timeout: Maximum time to wait, forwarded to the runtime.
                ``None`` leaves the runtime's own default in effect.
            session_state: Optional state forwarded to the runtime.

        Returns:
            Whatever the runtime returns; ``run`` treats ``None`` as a timeout.

        Raises:
            RuntimeError: If the agent has not been started.
        """
        if self._agent_runtime is None:
            raise RuntimeError("Agent not started. Call start() first.")

        return await self._agent_runtime.trigger_and_wait(
            entry_point_id=entry_point,
            input_data=input_data or {},
            # Previously accepted by this method but never passed on, so the
            # caller's timeout was silently ignored.
            timeout=timeout,
            session_state=session_state,
        )

    async def run(
        self, context: dict, mock_mode=False, session_state=None
    ) -> ExecutionResult:
        """Run the agent (convenience method for single execution).

        Starts the runtime, triggers the "default" entry point with
        ``context``, and always stops the runtime afterwards.
        """
        await self.start(mock_mode=mock_mode)
        try:
            result = await self.trigger_and_wait(
                "default", context, session_state=session_state
            )
            return result or ExecutionResult(success=False, error="Execution timeout")
        finally:
            await self.stop()

    def info(self):
        """Get agent information as a plain dict (metadata, goal, graph ids)."""
        return {
            "name": metadata.name,
            "version": metadata.version,
            "description": metadata.description,
            "goal": {
                "name": self.goal.name,
                "description": self.goal.description,
            },
            "nodes": [n.id for n in self.nodes],
            "edges": [e.id for e in self.edges],
            "entry_node": self.entry_node,
            "entry_points": self.entry_points,
            "pause_nodes": self.pause_nodes,
            "terminal_nodes": self.terminal_nodes,
            "client_facing_nodes": [n.id for n in self.nodes if n.client_facing],
        }

    def validate(self):
        """Validate agent structure.

        Checks that every edge endpoint, the entry node, each terminal node
        and each entry-point target refers to a known node id.

        Returns:
            A dict with ``valid`` (bool), ``errors`` (list of messages) and
            ``warnings`` (currently always empty).
        """
        errors = []
        warnings = []

        node_ids = {node.id for node in self.nodes}
        for edge in self.edges:
            if edge.source not in node_ids:
                errors.append(f"Edge {edge.id}: source '{edge.source}' not found")
            if edge.target not in node_ids:
                errors.append(f"Edge {edge.id}: target '{edge.target}' not found")

        if self.entry_node not in node_ids:
            errors.append(f"Entry node '{self.entry_node}' not found")

        for terminal in self.terminal_nodes:
            if terminal not in node_ids:
                errors.append(f"Terminal node '{terminal}' not found")

        for ep_id, node_id in self.entry_points.items():
            if node_id not in node_ids:
                errors.append(
                    f"Entry point '{ep_id}' references unknown node '{node_id}'"
                )

        return {
            "valid": len(errors) == 0,
            "errors": errors,
            "warnings": warnings,
        }
|
||||
|
||||
|
||||
# Create default instance at import time; with no config argument the
# constructor falls back to the module-level default_config.
default_agent = DeepResearchAgent()
|
||||
@@ -1,26 +0,0 @@
|
||||
"""Runtime configuration."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from framework.config import RuntimeConfig
|
||||
|
||||
# Shared runtime configuration built from RuntimeConfig's defaults.
default_config = RuntimeConfig()


@dataclass
class AgentMetadata:
    """Static descriptive metadata for the Deep Research Agent."""

    # Human-readable agent name.
    name: str = "Deep Research Agent"
    # Version string of the agent definition.
    version: str = "1.0.0"
    # One-paragraph summary of what the agent does.
    description: str = (
        "Interactive research agent that rigorously investigates topics through "
        "multi-source search, quality evaluation, and synthesis - with TUI conversation "
        "at key checkpoints for user guidance and feedback."
    )
    # Greeting text; presumably shown to the user at session start — confirm
    # against the consumer of this field.
    intro_message: str = (
        "Hi! I'm your deep research assistant. Tell me a topic and I'll investigate it "
        "thoroughly — searching multiple sources, evaluating quality, and synthesizing "
        "a comprehensive report. What would you like me to research?"
    )


# Module-level metadata instance using the defaults above.
metadata = AgentMetadata()
|
||||
@@ -1,9 +0,0 @@
|
||||
{
|
||||
"hive-tools": {
|
||||
"transport": "stdio",
|
||||
"command": "uv",
|
||||
"args": ["run", "python", "mcp_server.py", "--stdio"],
|
||||
"cwd": "../../tools",
|
||||
"description": "Hive tools MCP server providing web_search, web_scrape, and write_to_file"
|
||||
}
|
||||
}
|
||||
@@ -1,213 +0,0 @@
|
||||
"""Node definitions for Deep Research Agent."""
|
||||
|
||||
from framework.graph import NodeSpec
|
||||
|
||||
# Node 1: Intake (client-facing)
# Brief conversation to clarify what the user wants researched.
# Reads "topic" and produces "research_brief"; no tools are attached, and the
# prompt has the model finish by calling set_output.
intake_node = NodeSpec(
    id="intake",
    name="Research Intake",
    description="Discuss the research topic with the user, clarify scope, and confirm direction",
    node_type="event_loop",
    client_facing=True,
    # NOTE(review): presumably 0 means "no visit limit" — confirm against NodeSpec.
    max_node_visits=0,
    input_keys=["topic"],
    output_keys=["research_brief"],
    success_criteria=(
        "The research brief is specific and actionable: it states the topic, "
        "the key questions to answer, the desired scope, and depth."
    ),
    system_prompt="""\
You are a research intake specialist. The user wants to research a topic.
Have a brief conversation to clarify what they need.

**STEP 1 — Read and respond (text only, NO tool calls):**
1. Read the topic provided
2. If it's vague, ask 1-2 clarifying questions (scope, angle, depth)
3. If it's already clear, confirm your understanding and ask the user to confirm

Keep it short. Don't over-ask.

**STEP 2 — After the user confirms, call set_output:**
- set_output("research_brief", "A clear paragraph describing exactly what to research, \
what questions to answer, what scope to cover, and how deep to go.")
""",
    tools=[],
)
|
||||
|
||||
# Node 2: Research
# The workhorse — searches the web, fetches content, analyzes sources.
# One node with both tools avoids the context-passing overhead of 5 separate nodes.
# Reads "research_brief" and "feedback"; produces "findings", "sources", "gaps".
research_node = NodeSpec(
    id="research",
    name="Research",
    description="Search the web, fetch source content, and compile findings",
    node_type="event_loop",
    # NOTE(review): presumably 0 means "no visit limit" — confirm against NodeSpec.
    max_node_visits=0,
    input_keys=["research_brief", "feedback"],
    output_keys=["findings", "sources", "gaps"],
    # NOTE(review): "feedback" is listed here but is an input key of this
    # node, not one of its output_keys — confirm this is intentional.
    nullable_output_keys=["feedback"],
    success_criteria=(
        "Findings reference at least 3 distinct sources with URLs. "
        "Key claims are substantiated by fetched content, not generated."
    ),
    system_prompt="""\
You are a research agent. Given a research brief, find and analyze sources.

If feedback is provided, this is a follow-up round — focus on the gaps identified.

Work in phases:
1. **Search**: Use web_search with 3-5 diverse queries covering different angles.
Prioritize authoritative sources (.edu, .gov, established publications).
2. **Fetch**: Use web_scrape on the most promising URLs (aim for 5-8 sources).
Skip URLs that fail. Extract the substantive content.
3. **Analyze**: Review what you've collected. Identify key findings, themes,
and any contradictions between sources.

Important:
- Work in batches of 3-4 tool calls at a time — never more than 10 per turn
- After each batch, assess whether you have enough material
- Prefer quality over quantity — 5 good sources beat 15 thin ones
- Track which URL each finding comes from (you'll need citations later)
- Call set_output for each key in a SEPARATE turn (not in the same turn as other tool calls)

Context management:
- Your tool results are automatically saved to files. After compaction, the file \
references remain in the conversation — use load_data() to recover any content you need.
- Use append_data('research_notes.md', ...) to maintain a running log of key findings \
as you go. This survives compaction and helps the report node produce a detailed report.

When done, use set_output (one key at a time, separate turns):
- set_output("findings", "Structured summary: key findings with source URLs for each claim. \
Include themes, contradictions, and confidence levels.")
- set_output("sources", [{"url": "...", "title": "...", "summary": "..."}])
- set_output("gaps", "What aspects of the research brief are NOT well-covered yet, if any.")
""",
    # Tool names this node may call: web search/scrape plus the data-file helpers.
    tools=[
        "web_search",
        "web_scrape",
        "load_data",
        "save_data",
        "append_data",
        "list_data_files",
    ],
)
|
||||
|
||||
# Node 3: Review (client-facing)
# Shows the user what was found and asks whether to dig deeper or proceed.
# Produces "needs_more_research" and "feedback"; the prompt has the model set
# "needs_more_research" to the string "true" or "false".
review_node = NodeSpec(
    id="review",
    name="Review Findings",
    description="Present findings to user and decide whether to research more or write the report",
    node_type="event_loop",
    client_facing=True,
    # NOTE(review): presumably 0 means "no visit limit" — confirm against NodeSpec.
    max_node_visits=0,
    input_keys=["findings", "sources", "gaps", "research_brief"],
    output_keys=["needs_more_research", "feedback"],
    success_criteria=(
        "The user has been presented with findings and has explicitly indicated "
        "whether they want more research or are ready for the report."
    ),
    system_prompt="""\
Present the research findings to the user clearly and concisely.

**STEP 1 — Present (your first message, text only, NO tool calls):**
1. **Summary** (2-3 sentences of what was found)
2. **Key Findings** (bulleted, with confidence levels)
3. **Sources Used** (count and quality assessment)
4. **Gaps** (what's still unclear or under-covered)

End by asking: Are they satisfied, or do they want deeper research? \
Should we proceed to writing the final report?

**STEP 2 — After the user responds, call set_output:**
- set_output("needs_more_research", "true") — if they want more
- set_output("needs_more_research", "false") — if they're satisfied
- set_output("feedback", "What the user wants explored further, or empty string")
""",
    tools=[],
)
|
||||
|
||||
# Node 4: Report (client-facing)
# Writes an HTML report, serves the link to the user, and answers follow-ups.
# Produces "delivery_status" and "next_action"; the prompt has the model set
# "next_action" to "new_topic" or "more_research".
report_node = NodeSpec(
    id="report",
    name="Write & Deliver Report",
    description="Write a cited HTML report from the findings and present it to the user",
    node_type="event_loop",
    client_facing=True,
    # NOTE(review): presumably 0 means "no visit limit" — confirm against NodeSpec.
    max_node_visits=0,
    input_keys=["findings", "sources", "research_brief"],
    output_keys=["delivery_status", "next_action"],
    success_criteria=(
        "An HTML report has been saved, the file link has been presented to the user, "
        "and the user has indicated what they want to do next."
    ),
    system_prompt="""\
Write a research report as an HTML file and present it to the user.

IMPORTANT: save_data requires TWO separate arguments: filename and data.
Call it like: save_data(filename="report.html", data="<html>...</html>")
Do NOT use _raw, do NOT nest arguments inside a JSON string.

**STEP 1 — Write and save the HTML report (tool calls, NO text to user yet):**

Build a clean HTML document. Keep the HTML concise — aim for clarity over length.
Use minimal embedded CSS (a few lines of style, not a full framework).

Report structure:
- Title & date
- Executive Summary (2-3 paragraphs)
- Key Findings (organized by theme, with [n] citation links)
- Analysis (synthesis, implications)
- Conclusion (key takeaways)
- References (numbered list with clickable URLs)

Requirements:
- Every factual claim must cite its source with [n] notation
- Be objective — present multiple viewpoints where sources disagree
- Answer the original research questions from the brief
- If findings appear incomplete or summarized, call list_data_files() and load_data() \
to access the detailed source material from the research phase. The research node's \
tool results and research_notes.md contain the full data.

Save the HTML:
save_data(filename="report.html", data="<html>...</html>")

Then get the clickable link:
serve_file_to_user(filename="report.html", label="Research Report")

If save_data fails, simplify and shorten the HTML, then retry.

**STEP 2 — Present the link to the user (text only, NO tool calls):**

Tell the user the report is ready and include the file:// URI from
serve_file_to_user so they can click it to open. Give a brief summary
of what the report covers. Ask if they have questions or want to continue.

**STEP 3 — After the user responds:**
- Answer any follow-up questions from the research material
- When the user is ready to move on, ask what they'd like to do next:
- Research a new topic?
- Dig deeper into the current topic?
- Then call set_output:
- set_output("delivery_status", "completed")
- set_output("next_action", "new_topic") — if they want a new topic
- set_output("next_action", "more_research") — if they want deeper research
""",
    # Tool names this node may call: data-file helpers plus file serving.
    tools=[
        "save_data",
        "append_data",
        "edit_data",
        "serve_file_to_user",
        "load_data",
        "list_data_files",
    ],
)
|
||||
|
||||
__all__ = [
|
||||
"intake_node",
|
||||
"research_node",
|
||||
"review_node",
|
||||
"report_node",
|
||||
]
|
||||
@@ -1,640 +0,0 @@
|
||||
---
|
||||
name: hive-credentials
|
||||
description: Set up and install credentials for an agent. Detects missing credentials from agent config, collects them from the user, and stores them securely in the local encrypted store at ~/.hive/credentials.
|
||||
license: Apache-2.0
|
||||
metadata:
|
||||
author: hive
|
||||
version: "2.3"
|
||||
type: utility
|
||||
---
|
||||
|
||||
# Setup Credentials
|
||||
|
||||
Interactive credential setup for agents with multiple authentication options. Detects what's missing, offers auth method choices, validates with health checks, and stores credentials securely.
|
||||
|
||||
## When to Use
|
||||
|
||||
- Before running or testing an agent for the first time
|
||||
- When `AgentRunner.run()` fails with "missing required credentials"
|
||||
- When a user asks to configure credentials for an agent
|
||||
- After building a new agent that uses tools requiring API keys
|
||||
|
||||
## Workflow
|
||||
|
||||
### Step 1: Identify the Agent
|
||||
|
||||
Determine which agent needs credentials. The user will either:
|
||||
|
||||
- Name the agent directly (e.g., "set up credentials for hubspot-agent")
|
||||
- Have an agent directory open (check `exports/` for agent dirs)
|
||||
- Be working on an agent in the current session
|
||||
|
||||
Locate the agent's directory under `exports/{agent_name}/`.
|
||||
|
||||
### Step 2: Detect Missing Credentials
|
||||
|
||||
Use the `check_missing_credentials` MCP tool to detect what the agent needs and what's already configured. This tool loads the agent, inspects its required tools and node types, maps them to credentials via `CREDENTIAL_SPECS`, and checks both the encrypted store and environment variables.
|
||||
|
||||
```
|
||||
check_missing_credentials(agent_path="exports/{agent_name}")
|
||||
```
|
||||
|
||||
The tool returns a JSON response:
|
||||
|
||||
```json
|
||||
{
|
||||
"agent": "exports/{agent_name}",
|
||||
"missing": [
|
||||
{
|
||||
"credential_name": "brave_search",
|
||||
"env_var": "BRAVE_SEARCH_API_KEY",
|
||||
"description": "Brave Search API key for web search",
|
||||
"help_url": "https://brave.com/search/api/",
|
||||
"tools": ["web_search"]
|
||||
}
|
||||
],
|
||||
"available": [
|
||||
{
|
||||
"credential_name": "anthropic",
|
||||
"env_var": "ANTHROPIC_API_KEY",
|
||||
"source": "encrypted_store"
|
||||
}
|
||||
],
|
||||
"total_missing": 1,
|
||||
"ready": false
|
||||
}
|
||||
```
|
||||
|
||||
**If `ready` is true (nothing missing):** Report all credentials as configured and skip Steps 3-5. Example:
|
||||
|
||||
```
|
||||
All required credentials are already configured:
|
||||
✓ anthropic (ANTHROPIC_API_KEY)
|
||||
✓ brave_search (BRAVE_SEARCH_API_KEY)
|
||||
Your agent is ready to run!
|
||||
```
|
||||
|
||||
**If credentials are missing:** Continue to Step 3 with the `missing` list.
|
||||
|
||||
### Step 3: Present Auth Options for Each Missing Credential
|
||||
|
||||
For each missing credential, check what authentication methods are available:
|
||||
|
||||
```python
|
||||
from aden_tools.credentials import CREDENTIAL_SPECS
|
||||
|
||||
spec = CREDENTIAL_SPECS.get("hubspot")
|
||||
if spec:
|
||||
# Determine available auth options
|
||||
auth_options = []
|
||||
if spec.aden_supported:
|
||||
auth_options.append("aden")
|
||||
if spec.direct_api_key_supported:
|
||||
auth_options.append("direct")
|
||||
auth_options.append("custom") # Always available
|
||||
|
||||
# Get setup info
|
||||
setup_info = {
|
||||
"env_var": spec.env_var,
|
||||
"description": spec.description,
|
||||
"help_url": spec.help_url,
|
||||
"api_key_instructions": spec.api_key_instructions,
|
||||
}
|
||||
```
|
||||
|
||||
Present the available options using AskUserQuestion:
|
||||
|
||||
```
|
||||
Choose how to configure HUBSPOT_ACCESS_TOKEN:
|
||||
|
||||
1) Aden Platform (OAuth) (Recommended)
|
||||
Secure OAuth2 flow via hive.adenhq.com
|
||||
- Quick setup with automatic token refresh
|
||||
- No need to manage API keys manually
|
||||
|
||||
2) Direct API Key
|
||||
Enter your own API key manually
|
||||
- Requires creating a HubSpot Private App
|
||||
- Full control over scopes and permissions
|
||||
|
||||
3) Local Credential Setup (Advanced)
|
||||
Programmatic configuration for CI/CD
|
||||
- For automated deployments
|
||||
- Requires manual API calls
|
||||
```
|
||||
|
||||
### Step 4: Execute Auth Flow Based on User Choice
|
||||
|
||||
#### Prerequisite: Ensure HIVE_CREDENTIAL_KEY Is Available
|
||||
|
||||
Before storing any credentials, verify `HIVE_CREDENTIAL_KEY` is set (needed to encrypt/decrypt the local store). Check both the current session and shell config:
|
||||
|
||||
```bash
|
||||
# Check current session
|
||||
printenv HIVE_CREDENTIAL_KEY > /dev/null 2>&1 && echo "session: set" || echo "session: not set"
|
||||
|
||||
# Check shell config files
|
||||
for f in ~/.zshrc ~/.bashrc ~/.profile; do [ -f "$f" ] && grep -q 'HIVE_CREDENTIAL_KEY' "$f" && echo "$f"; done
|
||||
```
|
||||
|
||||
- **In current session** — proceed to store credentials
|
||||
- **In shell config but NOT in current session** — run `source ~/.zshrc` (or `~/.bashrc`) first, then proceed
|
||||
- **Not set anywhere** — `EncryptedFileStorage` will auto-generate one. After storing, tell the user to persist it: `export HIVE_CREDENTIAL_KEY="{generated_key}"` in their shell profile
|
||||
|
||||
> **⚠️ IMPORTANT: After adding `HIVE_CREDENTIAL_KEY` to the user's shell config, always display:**
|
||||
> ```
|
||||
> ⚠️ Environment variables were added to your shell config.
|
||||
> Open a NEW TERMINAL for them to take effect outside this session.
|
||||
> ```
|
||||
|
||||
#### Option 1: Aden Platform (OAuth)
|
||||
|
||||
This is the recommended flow for supported integrations (HubSpot, etc.).
|
||||
|
||||
**How Aden OAuth Works:**
|
||||
|
||||
The ADEN_API_KEY represents a user who has already completed OAuth authorization on Aden's platform. When users sign up and connect integrations on Aden, those OAuth tokens are stored server-side. Having an ADEN_API_KEY means:
|
||||
|
||||
1. User has an Aden account
|
||||
2. User has already authorized integrations (HubSpot, etc.) via OAuth on Aden
|
||||
3. We just need to sync those credentials down to the local credential store
|
||||
|
||||
**4.1a. Check for ADEN_API_KEY**
|
||||
|
||||
```python
|
||||
import os
|
||||
aden_key = os.environ.get("ADEN_API_KEY")
|
||||
```
|
||||
|
||||
If not set, guide user to get one from Aden (this is where they do OAuth):
|
||||
|
||||
```python
|
||||
from aden_tools.credentials import open_browser, get_aden_setup_url
|
||||
|
||||
# Open browser to Aden - user will sign up and connect integrations there
|
||||
url = get_aden_setup_url() # https://hive.adenhq.com
|
||||
success, msg = open_browser(url)
|
||||
|
||||
print("Please sign in to Aden and connect your integrations (HubSpot, etc.).")
|
||||
print("Once done, copy your API key and return here.")
|
||||
```
|
||||
|
||||
Ask user to provide the ADEN_API_KEY they received.
|
||||
|
||||
**4.1b. Save ADEN_API_KEY to Shell Config**
|
||||
|
||||
With user approval, persist ADEN_API_KEY to their shell config:
|
||||
|
||||
```python
|
||||
from aden_tools.credentials import (
|
||||
detect_shell,
|
||||
add_env_var_to_shell_config,
|
||||
get_shell_source_command,
|
||||
)
|
||||
|
||||
shell_type = detect_shell() # 'bash', 'zsh', or 'unknown'
|
||||
|
||||
# Ask user for approval before modifying shell config
|
||||
# If approved:
|
||||
success, config_path = add_env_var_to_shell_config(
|
||||
"ADEN_API_KEY",
|
||||
user_provided_key,
|
||||
comment="Aden Platform (OAuth) API key"
|
||||
)
|
||||
|
||||
if success:
|
||||
source_cmd = get_shell_source_command()
|
||||
print(f"Saved to {config_path}")
|
||||
print(f"Run: {source_cmd}")
|
||||
```
|
||||
|
||||
> **⚠️ IMPORTANT: After adding `ADEN_API_KEY` to the user's shell config, always display:**
|
||||
> ```
|
||||
> ⚠️ Environment variables were added to your shell config.
|
||||
> Open a NEW TERMINAL for them to take effect outside this session.
|
||||
> ```
|
||||
|
||||
Also save to `~/.hive/configuration.json` for the framework:
|
||||
|
||||
```python
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
config_path = Path.home() / ".hive" / "configuration.json"
|
||||
config = json.loads(config_path.read_text()) if config_path.exists() else {}
|
||||
|
||||
config["aden"] = {
|
||||
"api_key_configured": True,
|
||||
"api_url": "https://api.adenhq.com"
|
||||
}
|
||||
|
||||
config_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
config_path.write_text(json.dumps(config, indent=2))
|
||||
```
|
||||
|
||||
**4.1c. Sync Credentials from Aden Server**
|
||||
|
||||
Since the user has already authorized integrations on Aden, use the one-liner factory method:
|
||||
|
||||
```python
|
||||
from core.framework.credentials import CredentialStore
|
||||
|
||||
# This single call handles everything:
|
||||
# - Creates encrypted local storage at ~/.hive/credentials
|
||||
# - Configures Aden client from ADEN_API_KEY env var
|
||||
# - Syncs all credentials from Aden server automatically
|
||||
store = CredentialStore.with_aden_sync(
|
||||
base_url="https://api.adenhq.com",
|
||||
auto_sync=True, # Syncs on creation
|
||||
)
|
||||
|
||||
# Check what was synced
|
||||
synced = store.list_credentials()
|
||||
print(f"Synced credentials: {synced}")
|
||||
|
||||
# If the required credential wasn't synced, the user hasn't authorized it on Aden yet
|
||||
if "hubspot" not in synced:
|
||||
print("HubSpot not found in your Aden account.")
|
||||
print("Please visit https://hive.adenhq.com to connect HubSpot, then try again.")
|
||||
```
|
||||
|
||||
For more control over the sync process:
|
||||
|
||||
```python
|
||||
from core.framework.credentials import CredentialStore
|
||||
from core.framework.credentials.aden import (
|
||||
AdenCredentialClient,
|
||||
AdenClientConfig,
|
||||
AdenSyncProvider,
|
||||
)
|
||||
|
||||
# Create client (API key loaded from ADEN_API_KEY env var)
|
||||
client = AdenCredentialClient(AdenClientConfig(
|
||||
base_url="https://api.adenhq.com",
|
||||
))
|
||||
|
||||
# Create provider and store
|
||||
provider = AdenSyncProvider(client=client)
|
||||
store = CredentialStore.with_encrypted_storage()
|
||||
|
||||
# Manual sync
|
||||
synced_count = provider.sync_all(store)
|
||||
print(f"Synced {synced_count} credentials from Aden")
|
||||
```
|
||||
|
||||
**4.1d. Run Health Check**
|
||||
|
||||
```python
|
||||
from aden_tools.credentials import check_credential_health
|
||||
|
||||
# Get the token from the store
|
||||
cred = store.get_credential("hubspot")
|
||||
token = cred.keys["access_token"].value.get_secret_value()
|
||||
|
||||
result = check_credential_health("hubspot", token)
|
||||
if result.valid:
|
||||
print("HubSpot credentials validated successfully!")
|
||||
else:
|
||||
print(f"Validation failed: {result.message}")
|
||||
# Offer to retry the OAuth flow
|
||||
```
|
||||
|
||||
#### Option 2: Direct API Key
|
||||
|
||||
For users who prefer manual API key management.
|
||||
|
||||
**4.2a. Show Setup Instructions**
|
||||
|
||||
```python
|
||||
from aden_tools.credentials import CREDENTIAL_SPECS
|
||||
|
||||
spec = CREDENTIAL_SPECS.get("hubspot")
|
||||
if spec and spec.api_key_instructions:
|
||||
print(spec.api_key_instructions)
|
||||
# Output:
|
||||
# To get a HubSpot Private App token:
|
||||
# 1. Go to HubSpot Settings > Integrations > Private Apps
|
||||
# 2. Click "Create a private app"
|
||||
# 3. Name your app (e.g., "Hive Agent")
|
||||
# ...
|
||||
|
||||
if spec and spec.help_url:
|
||||
print(f"More info: {spec.help_url}")
|
||||
```
|
||||
|
||||
**4.2b. Collect API Key from User**
|
||||
|
||||
Use AskUserQuestion to securely collect the API key:
|
||||
|
||||
```
|
||||
Please provide your HubSpot access token:
|
||||
(This will be stored securely in ~/.hive/credentials)
|
||||
```
|
||||
|
||||
**4.2c. Run Health Check Before Storing**
|
||||
|
||||
```python
|
||||
from aden_tools.credentials import check_credential_health
|
||||
|
||||
result = check_credential_health("hubspot", user_provided_token)
|
||||
if not result.valid:
|
||||
print(f"Warning: {result.message}")
|
||||
# Ask user if they want to:
|
||||
# 1. Try a different token
|
||||
# 2. Continue anyway (not recommended)
|
||||
```
|
||||
|
||||
**4.2d. Store in Local Encrypted Store**
|
||||
|
||||
```python
|
||||
from core.framework.credentials import CredentialStore, CredentialObject, CredentialKey
|
||||
from pydantic import SecretStr
|
||||
|
||||
store = CredentialStore.with_encrypted_storage()
|
||||
|
||||
cred = CredentialObject(
|
||||
id="hubspot",
|
||||
name="HubSpot Access Token",
|
||||
keys={
|
||||
"access_token": CredentialKey(
|
||||
name="access_token",
|
||||
value=SecretStr(user_provided_token),
|
||||
)
|
||||
},
|
||||
)
|
||||
store.save_credential(cred)
|
||||
```
|
||||
|
||||
**4.2e. Export to Current Session**
|
||||
|
||||
```bash
|
||||
export HUBSPOT_ACCESS_TOKEN="the-value"
|
||||
```
|
||||
|
||||
#### Option 3: Local Credential Setup (Advanced)
|
||||
|
||||
For programmatic/CI/CD setups.
|
||||
|
||||
**4.3a. Show Documentation**
|
||||
|
||||
```
|
||||
For advanced credential management, you can use the CredentialStore API directly:
|
||||
|
||||
from core.framework.credentials import CredentialStore, CredentialObject, CredentialKey
|
||||
from pydantic import SecretStr
|
||||
|
||||
store = CredentialStore.with_encrypted_storage()
|
||||
|
||||
cred = CredentialObject(
|
||||
id="hubspot",
|
||||
name="HubSpot Access Token",
|
||||
keys={"access_token": CredentialKey(name="access_token", value=SecretStr("..."))}
|
||||
)
|
||||
store.save_credential(cred)
|
||||
|
||||
For CI/CD environments:
|
||||
- Set HIVE_CREDENTIAL_KEY for encryption
|
||||
- Pre-populate ~/.hive/credentials programmatically
|
||||
- Or use environment variables directly (HUBSPOT_ACCESS_TOKEN)
|
||||
|
||||
Documentation: See core/framework/credentials/README.md
|
||||
```
|
||||
|
||||
### Step 5: Record Configuration Method
|
||||
|
||||
Track which auth method was used for each credential in `~/.hive/configuration.json`:
|
||||
|
||||
```python
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
config_path = Path.home() / ".hive" / "configuration.json"
|
||||
config = json.loads(config_path.read_text()) if config_path.exists() else {}
|
||||
|
||||
if "credential_methods" not in config:
|
||||
config["credential_methods"] = {}
|
||||
|
||||
config["credential_methods"]["hubspot"] = {
|
||||
"method": "aden", # or "direct" or "custom"
|
||||
"configured_at": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
config_path.write_text(json.dumps(config, indent=2))
|
||||
```
|
||||
|
||||
### Step 6: Verify All Credentials
|
||||
|
||||
Use the `verify_credentials` MCP tool to confirm everything is properly configured:
|
||||
|
||||
```
|
||||
verify_credentials(agent_path="exports/{agent_name}")
|
||||
```
|
||||
|
||||
The tool returns:
|
||||
|
||||
```json
|
||||
{
|
||||
"agent": "exports/{agent_name}",
|
||||
"ready": true,
|
||||
"missing_credentials": [],
|
||||
"warnings": [],
|
||||
"errors": []
|
||||
}
|
||||
```
|
||||
|
||||
If `ready` is true, report success. If `missing_credentials` is non-empty, identify what failed and loop back to Step 3 for the remaining credentials.
|
||||
|
||||
## Health Check Reference
|
||||
|
||||
Health checks validate credentials by making lightweight API calls:
|
||||
|
||||
| Credential | Endpoint | What It Checks |
|
||||
| --------------- | --------------------------------------- | --------------------------------- |
|
||||
| `anthropic` | `POST /v1/messages` | API key validity |
|
||||
| `brave_search` | `GET /res/v1/web/search?q=test&count=1` | API key validity |
|
||||
| `google_search` | `GET /customsearch/v1?q=test&num=1` | API key + CSE ID validity |
|
||||
| `github` | `GET /user` | Token validity, user identity |
|
||||
| `hubspot` | `GET /crm/v3/objects/contacts?limit=1` | Bearer token validity, CRM scopes |
|
||||
| `resend` | `GET /domains` | API key validity |
|
||||
|
||||
```python
|
||||
from aden_tools.credentials import check_credential_health, HealthCheckResult
|
||||
|
||||
result: HealthCheckResult = check_credential_health("hubspot", token_value)
|
||||
# result.valid: bool
|
||||
# result.message: str
|
||||
# result.details: dict (status_code, rate_limited, etc.)
|
||||
```
|
||||
|
||||
## Encryption Key (HIVE_CREDENTIAL_KEY)
|
||||
|
||||
The local encrypted store requires `HIVE_CREDENTIAL_KEY` to encrypt/decrypt credentials.
|
||||
|
||||
- If the user doesn't have one, `EncryptedFileStorage` will auto-generate one and log it
|
||||
- The user MUST persist this key (e.g., in `~/.bashrc`/`~/.zshrc` or a secrets manager)
|
||||
- Without this key, stored credentials cannot be decrypted
|
||||
|
||||
**Shell config rule:** Only TWO keys belong in shell config (`~/.zshrc`/`~/.bashrc`):
|
||||
- `HIVE_CREDENTIAL_KEY` — encryption key for the credential store
|
||||
- `ADEN_API_KEY` — Aden platform auth key (needed before the store can sync)
|
||||
|
||||
All other API keys (Brave, Google, HubSpot, etc.) must go in the encrypted store only. **Never offer to add them to shell config.**
|
||||
|
||||
If `HIVE_CREDENTIAL_KEY` is not set:
|
||||
|
||||
1. Let the store generate one
|
||||
2. Tell the user to save it: `export HIVE_CREDENTIAL_KEY="{generated_key}"`
|
||||
3. Recommend adding it to `~/.bashrc` or their shell profile
|
||||
|
||||
## Security Rules
|
||||
|
||||
- **NEVER** log, print, or echo credential values in tool output
|
||||
- **NEVER** store credentials in plaintext files, git-tracked files, or agent configs
|
||||
- **NEVER** hardcode credentials in source code
|
||||
- **NEVER** offer to save API keys to shell config (`~/.zshrc`/`~/.bashrc`) — the **only** keys that belong in shell config are `HIVE_CREDENTIAL_KEY` and `ADEN_API_KEY`. All other credentials (Brave, Google, HubSpot, GitHub, Resend, etc.) go in the encrypted store only.
|
||||
- **ALWAYS** use `SecretStr` from Pydantic when handling credential values in Python
|
||||
- **ALWAYS** use the local encrypted store (`~/.hive/credentials`) for persistence
|
||||
- **ALWAYS** run health checks before storing credentials (when possible)
|
||||
- **ALWAYS** verify credentials were stored by re-running validation, not by reading them back
|
||||
- When modifying `~/.bashrc` or `~/.zshrc`, confirm with the user first
|
||||
|
||||
## Credential Sources Reference
|
||||
|
||||
All credential specs are defined in `tools/src/aden_tools/credentials/`:
|
||||
|
||||
| File | Category | Credentials | Aden Supported |
|
||||
| ----------------- | ------------- | --------------------------------------------- | -------------- |
|
||||
| `llm.py` | LLM Providers | `anthropic` | No |
|
||||
| `search.py` | Search Tools | `brave_search`, `google_search`, `google_cse` | No |
|
||||
| `email.py` | Email | `resend` | No |
|
||||
| `integrations.py` | Integrations | `github`, `hubspot`, `google_calendar_oauth` | No / Yes |
|
||||
|
||||
**Note:** Additional LLM providers (Cerebras, Groq, OpenAI) are handled by LiteLLM via environment
|
||||
variables (`CEREBRAS_API_KEY`, `GROQ_API_KEY`, `OPENAI_API_KEY`) but are not yet in CREDENTIAL_SPECS.
|
||||
Add them to `llm.py` as needed.
|
||||
|
||||
To check what's registered:
|
||||
|
||||
```python
|
||||
from aden_tools.credentials import CREDENTIAL_SPECS
|
||||
for name, spec in CREDENTIAL_SPECS.items():
|
||||
print(f"{name}: aden={spec.aden_supported}, direct={spec.direct_api_key_supported}")
|
||||
```
|
||||
|
||||
## Migration: CredentialManager → CredentialStore
|
||||
|
||||
**CredentialManager is deprecated.** Use CredentialStore instead.
|
||||
|
||||
| Old (Deprecated) | New (Recommended) |
|
||||
| ----------------------------------------- | -------------------------------------------------------------------- |
|
||||
| `CredentialManager()` | `CredentialStore.with_encrypted_storage()` |
|
||||
| `creds.get("hubspot")` | `store.get("hubspot")` or `store.get_key("hubspot", "access_token")` |
|
||||
| `creds.validate_for_tools(tools)` | Use `store.is_available(cred_id)` per credential |
|
||||
| `creds.get_auth_options("hubspot")` | Check `CREDENTIAL_SPECS["hubspot"].aden_supported` |
|
||||
| `creds.get_setup_instructions("hubspot")` | Access `CREDENTIAL_SPECS["hubspot"]` directly |
|
||||
|
||||
**Why migrate?**
|
||||
|
||||
- **CredentialStore** supports encrypted storage, multi-key credentials, template resolution, and automatic token refresh
|
||||
- **CredentialManager** only reads from environment variables and .env files (no encryption, no refresh)
|
||||
- **CredentialStoreAdapter** exists for backward compatibility during migration
|
||||
|
||||
```python
|
||||
# Old way (deprecated)
|
||||
from aden_tools.credentials import CredentialManager
|
||||
creds = CredentialManager()
|
||||
token = creds.get("hubspot")
|
||||
|
||||
# New way (recommended)
|
||||
from core.framework.credentials import CredentialStore
|
||||
store = CredentialStore.with_encrypted_storage()
|
||||
token = store.get("hubspot")
|
||||
|
||||
# With Aden sync (recommended for OAuth integrations)
|
||||
store = CredentialStore.with_aden_sync()
|
||||
token = store.get_key("hubspot", "access_token")
|
||||
```
|
||||
|
||||
## Example Session
|
||||
|
||||
```
|
||||
User: /hive-credentials for my research-agent
|
||||
|
||||
Agent: Let me check what credentials your research-agent needs.
|
||||
|
||||
[Calls check_missing_credentials(agent_path="exports/research-agent")]
|
||||
→ Returns:
|
||||
available: anthropic (encrypted_store), brave_search (encrypted_store)
|
||||
missing: google_search (GOOGLE_API_KEY), google_cse (GOOGLE_CSE_ID)
|
||||
ready: false
|
||||
|
||||
Agent: 2 of 4 required credentials are already configured. Only Google Custom
|
||||
Search needs setup (2 values).
|
||||
|
||||
--- Setting up Google Custom Search (google_search + google_cse) ---
|
||||
|
||||
This requires two values that work together.
|
||||
|
||||
[Checks HIVE_CREDENTIAL_KEY before storing]
|
||||
$ printenv HIVE_CREDENTIAL_KEY > /dev/null 2>&1 && echo "set" || echo "not set"
|
||||
set
|
||||
|
||||
First, the Google API Key:
|
||||
1. Go to https://console.cloud.google.com/apis/credentials
|
||||
2. Create a new project (or select an existing one)
|
||||
3. Enable the "Custom Search API" from the API Library
|
||||
4. Go to Credentials > Create Credentials > API Key
|
||||
5. Copy the generated API key
|
||||
|
||||
[AskUserQuestion: "Please provide your Google API key:"]
|
||||
[User provides key]
|
||||
|
||||
Now, the Custom Search Engine ID:
|
||||
1. Go to https://programmablesearchengine.google.com/controlpanel/all
|
||||
2. Click "Add" to create a new search engine
|
||||
3. Under "What to search", select "Search the entire web"
|
||||
4. Give your search engine a name
|
||||
5. Click "Create"
|
||||
6. Copy the Search Engine ID (cx value)
|
||||
|
||||
[AskUserQuestion: "Please provide your Google CSE ID:"]
|
||||
[User provides ID]
|
||||
|
||||
[Runs health check with both values - GET /customsearch/v1?q=test&num=1 → 200 OK]
|
||||
[Stores both in local encrypted store, exports to env]
|
||||
|
||||
✓ Google Custom Search credentials valid
|
||||
|
||||
[Calls verify_credentials(agent_path="exports/research-agent")]
|
||||
→ Returns: ready: true, missing_credentials: []
|
||||
|
||||
All credentials are now configured:
|
||||
✓ anthropic (ANTHROPIC_API_KEY) — already in encrypted store
|
||||
✓ brave_search (BRAVE_SEARCH_API_KEY) — already in encrypted store
|
||||
✓ google_search (GOOGLE_API_KEY) — stored in encrypted store
|
||||
✓ google_cse (GOOGLE_CSE_ID) — stored in encrypted store
|
||||
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ ✅ CREDENTIALS CONFIGURED │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ OPEN A NEW TERMINAL before running commands below. │
|
||||
│ Environment variables were saved to your shell config but │
|
||||
│ only take effect in new terminal sessions. │
|
||||
│ │
|
||||
│ NEXT STEPS: │
|
||||
│ │
|
||||
│ 1. RUN YOUR AGENT: │
|
||||
│ │
|
||||
│ hive tui │
|
||||
│ │
|
||||
│ 2. IF YOU ENCOUNTER ISSUES, USE THE DEBUGGER: │
|
||||
│ │
|
||||
│ /hive-debugger │
|
||||
│ │
|
||||
│ The debugger analyzes runtime logs, identifies retry loops, tool │
|
||||
│ failures, stalled execution, and provides actionable fix suggestions. │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,385 +0,0 @@
|
||||
---
|
||||
name: hive-patterns
|
||||
description: Best practices, patterns, and examples for building goal-driven agents. Includes client-facing interaction, feedback edges, judge patterns, fan-out/fan-in, context management, and anti-patterns.
|
||||
license: Apache-2.0
|
||||
metadata:
|
||||
author: hive
|
||||
version: "2.0"
|
||||
type: reference
|
||||
part_of: hive
|
||||
---
|
||||
|
||||
# Building Agents - Patterns & Best Practices
|
||||
|
||||
Design patterns, examples, and best practices for building robust goal-driven agents.
|
||||
|
||||
**Prerequisites:** Complete agent structure using `hive-create`.
|
||||
|
||||
## Practical Example: Hybrid Workflow
|
||||
|
||||
How to build a node using both direct file writes and optional MCP validation:
|
||||
|
||||
```python
|
||||
# 1. WRITE TO FILE FIRST (Primary - makes it visible)
|
||||
node_code = '''
|
||||
search_node = NodeSpec(
|
||||
id="search-web",
|
||||
node_type="event_loop",
|
||||
input_keys=["query"],
|
||||
output_keys=["search_results"],
|
||||
system_prompt="Search the web for: {query}. Use web_search, then call set_output to store results.",
|
||||
tools=["web_search"],
|
||||
)
|
||||
'''
|
||||
|
||||
Edit(
|
||||
file_path="exports/research_agent/nodes/__init__.py",
|
||||
old_string="# Nodes will be added here",
|
||||
new_string=node_code
|
||||
)
|
||||
|
||||
# 2. OPTIONALLY VALIDATE WITH MCP (Secondary - bookkeeping)
|
||||
validation = mcp__agent-builder__test_node(
|
||||
node_id="search-web",
|
||||
test_input='{"query": "python tutorials"}',
|
||||
mock_llm_response='{"search_results": [...mock results...]}'
|
||||
)
|
||||
```
|
||||
|
||||
**User experience:**
|
||||
|
||||
- Immediately sees node in their editor (from step 1)
|
||||
- Gets validation feedback (from step 2)
|
||||
- Can edit the file directly if needed
|
||||
|
||||
## Multi-Turn Interaction Patterns
|
||||
|
||||
For agents needing multi-turn conversations with users, use `client_facing=True` on event_loop nodes.
|
||||
|
||||
### Client-Facing Nodes
|
||||
|
||||
A client-facing node streams LLM output to the user and blocks for user input between conversational turns. This replaces the old pause/resume pattern.
|
||||
|
||||
```python
|
||||
# Client-facing node with STEP 1/STEP 2 prompt pattern
|
||||
intake_node = NodeSpec(
|
||||
id="intake",
|
||||
name="Intake",
|
||||
description="Gather requirements from the user",
|
||||
node_type="event_loop",
|
||||
client_facing=True,
|
||||
input_keys=["topic"],
|
||||
output_keys=["research_brief"],
|
||||
system_prompt="""\
|
||||
You are an intake specialist.
|
||||
|
||||
**STEP 1 — Read and respond (text only, NO tool calls):**
|
||||
1. Read the topic provided
|
||||
2. If it's vague, ask 1-2 clarifying questions
|
||||
3. If it's clear, confirm your understanding
|
||||
|
||||
**STEP 2 — After the user confirms, call set_output:**
|
||||
- set_output("research_brief", "Clear description of what to research")
|
||||
""",
|
||||
)
|
||||
|
||||
# Internal node runs without user interaction
|
||||
research_node = NodeSpec(
|
||||
id="research",
|
||||
name="Research",
|
||||
description="Search and analyze sources",
|
||||
node_type="event_loop",
|
||||
input_keys=["research_brief"],
|
||||
output_keys=["findings", "sources"],
|
||||
system_prompt="Research the topic using web_search and web_scrape...",
|
||||
tools=["web_search", "web_scrape", "load_data", "save_data"],
|
||||
)
|
||||
```
|
||||
|
||||
**How it works:**
|
||||
|
||||
- Client-facing nodes stream LLM text to the user and block for input after each response
|
||||
- User input is injected via `node.inject_event(text)`
|
||||
- When the LLM calls `set_output` to produce structured outputs, the judge evaluates and ACCEPTs
|
||||
- Internal nodes (non-client-facing) run their entire loop without blocking
|
||||
- `set_output` is a synthetic tool — a turn with only `set_output` calls (no real tools) triggers user input blocking
|
||||
|
||||
**STEP 1/STEP 2 pattern:** Always structure client-facing prompts with explicit phases. STEP 1 is text-only conversation. STEP 2 calls `set_output` after user confirmation. This prevents the LLM from calling `set_output` prematurely before the user responds.
|
||||
|
||||
### When to Use client_facing
|
||||
|
||||
| Scenario | client_facing | Why |
|
||||
| ----------------------------------- | :-----------: | ---------------------- |
|
||||
| Gathering user requirements | Yes | Need user input |
|
||||
| Human review/approval checkpoint | Yes | Need human decision |
|
||||
| Data processing (scanning, scoring) | No | Runs autonomously |
|
||||
| Report generation | No | No user input needed |
|
||||
| Final confirmation before action | Yes | Need explicit approval |
|
||||
|
||||
> **Legacy Note:** The `pause_nodes` / `entry_points` pattern still works for backward compatibility but `client_facing=True` is preferred for new agents.
|
||||
|
||||
## Edge-Based Routing and Feedback Loops
|
||||
|
||||
### Conditional Edge Routing
|
||||
|
||||
Multiple conditional edges from the same source replace the old `router` node type. Each edge checks a condition on the node's output.
|
||||
|
||||
```python
|
||||
# Node with mutually exclusive outputs
|
||||
review_node = NodeSpec(
|
||||
id="review",
|
||||
name="Review",
|
||||
node_type="event_loop",
|
||||
client_facing=True,
|
||||
output_keys=["approved_contacts", "redo_extraction"],
|
||||
nullable_output_keys=["approved_contacts", "redo_extraction"],
|
||||
max_node_visits=3,
|
||||
system_prompt="Present the contact list to the operator. If they approve, call set_output('approved_contacts', ...). If they want changes, call set_output('redo_extraction', 'true').",
|
||||
)
|
||||
|
||||
# Forward edge (positive priority, evaluated first)
|
||||
EdgeSpec(
|
||||
id="review-to-campaign",
|
||||
source="review",
|
||||
target="campaign-builder",
|
||||
condition=EdgeCondition.CONDITIONAL,
|
||||
condition_expr="output.get('approved_contacts') is not None",
|
||||
priority=1,
|
||||
)
|
||||
|
||||
# Feedback edge (negative priority, evaluated after forward edges)
|
||||
EdgeSpec(
|
||||
id="review-feedback",
|
||||
source="review",
|
||||
target="extractor",
|
||||
condition=EdgeCondition.CONDITIONAL,
|
||||
condition_expr="output.get('redo_extraction') is not None",
|
||||
priority=-1,
|
||||
)
|
||||
```
|
||||
|
||||
**Key concepts:**
|
||||
|
||||
- `nullable_output_keys`: Lists output keys that may remain unset. The node sets exactly one of the mutually exclusive keys per execution.
|
||||
- `max_node_visits`: Must be >1 on the feedback target (extractor) so it can re-execute. Default is 1.
|
||||
- `priority`: Positive = forward edge (evaluated first). Negative = feedback edge. The executor tries forward edges first; if none match, falls back to feedback edges.
|
||||
|
||||
### Routing Decision Table
|
||||
|
||||
| Pattern | Old Approach | New Approach |
|
||||
| ---------------------- | ----------------------- | --------------------------------------------- |
|
||||
| Conditional branching | `router` node | Conditional edges with `condition_expr` |
|
||||
| Binary approve/reject | `pause_nodes` + resume | `client_facing=True` + `nullable_output_keys` |
|
||||
| Loop-back on rejection | Manual entry_points | Feedback edge with `priority=-1` |
|
||||
| Multi-way routing | Router with routes dict | Multiple conditional edges with priorities |
|
||||
|
||||
## Judge Patterns
|
||||
|
||||
**Core Principle: The judge is the SOLE mechanism for acceptance decisions.** Never add ad-hoc framework gating to compensate for LLM behavior. If the LLM calls `set_output` prematurely, fix the system prompt or use a custom judge. Anti-patterns to avoid:
|
||||
|
||||
- Output rollback logic
|
||||
- `_user_has_responded` flags
|
||||
- Premature set_output rejection
|
||||
- Interaction protocol injection into system prompts
|
||||
|
||||
Judges control when an event_loop node's loop exits. Choose based on validation needs.
|
||||
|
||||
### Implicit Judge (Default)
|
||||
|
||||
When no judge is configured, the implicit judge ACCEPTs when:
|
||||
|
||||
- The LLM finishes its response with no tool calls
|
||||
- All required output keys have been set via `set_output`
|
||||
|
||||
Best for simple nodes where "all outputs set" is sufficient validation.
|
||||
|
||||
### SchemaJudge
|
||||
|
||||
Validates outputs against a Pydantic model. Use when you need structural validation.
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
|
||||
class ScannerOutput(BaseModel):
|
||||
github_users: list[dict] # Must be a list of user objects
|
||||
|
||||
class SchemaJudge:
|
||||
def __init__(self, output_model: type[BaseModel]):
|
||||
self._model = output_model
|
||||
|
||||
async def evaluate(self, context: dict) -> JudgeVerdict:
|
||||
missing = context.get("missing_keys", [])
|
||||
if missing:
|
||||
return JudgeVerdict(
|
||||
action="RETRY",
|
||||
feedback=f"Missing output keys: {missing}. Use set_output to provide them.",
|
||||
)
|
||||
try:
|
||||
self._model.model_validate(context["output_accumulator"])
|
||||
return JudgeVerdict(action="ACCEPT")
|
||||
except ValidationError as e:
|
||||
return JudgeVerdict(action="RETRY", feedback=str(e))
|
||||
```
|
||||
|
||||
### When to Use Which Judge
|
||||
|
||||
| Judge | Use When | Example |
|
||||
| --------------- | ------------------------------------- | ---------------------- |
|
||||
| Implicit (None) | Output keys are sufficient validation | Simple data extraction |
|
||||
| SchemaJudge | Need structural validation of outputs | API response parsing |
|
||||
| Custom | Domain-specific validation logic | Score must be 0.0-1.0 |
|
||||
|
||||
## Fan-Out / Fan-In (Parallel Execution)
|
||||
|
||||
Multiple ON_SUCCESS edges from the same source trigger parallel execution. All branches run concurrently via `asyncio.gather()`.
|
||||
|
||||
```python
|
||||
# Scanner fans out to Profiler and Scorer in parallel
|
||||
EdgeSpec(id="scanner-to-profiler", source="scanner", target="profiler",
|
||||
condition=EdgeCondition.ON_SUCCESS)
|
||||
EdgeSpec(id="scanner-to-scorer", source="scanner", target="scorer",
|
||||
condition=EdgeCondition.ON_SUCCESS)
|
||||
|
||||
# Both fan in to Extractor
|
||||
EdgeSpec(id="profiler-to-extractor", source="profiler", target="extractor",
|
||||
condition=EdgeCondition.ON_SUCCESS)
|
||||
EdgeSpec(id="scorer-to-extractor", source="scorer", target="extractor",
|
||||
condition=EdgeCondition.ON_SUCCESS)
|
||||
```
|
||||
|
||||
**Requirements:**
|
||||
|
||||
- Parallel event_loop nodes must have **disjoint output_keys** (no key written by both)
|
||||
- Only one parallel branch may contain a `client_facing` node
|
||||
- Fan-in node receives outputs from all completed branches in shared memory
|
||||
|
||||
## Context Management Patterns
|
||||
|
||||
### Tiered Compaction
|
||||
|
||||
EventLoopNode automatically manages context window usage with tiered compaction:
|
||||
|
||||
1. **Pruning** — Old tool results replaced with compact placeholders (zero-cost, no LLM call)
|
||||
2. **Normal compaction** — LLM summarizes older messages
|
||||
3. **Aggressive compaction** — Keeps only recent messages + summary
|
||||
4. **Emergency** — Hard reset with tool history preservation
|
||||
|
||||
### Spillover Pattern
|
||||
|
||||
The framework automatically truncates large tool results and saves full content to a spillover directory. The LLM receives a truncation message with instructions to use `load_data` to read the full result.
|
||||
|
||||
For explicit data management, use the data tools (real MCP tools, not synthetic):
|
||||
|
||||
```python
|
||||
# save_data, load_data, list_data_files, serve_file_to_user are real MCP tools
|
||||
# data_dir is auto-injected by the framework — the LLM never sees it
|
||||
|
||||
# Saving large results
|
||||
save_data(filename="sources.json", data=large_json_string)
|
||||
|
||||
# Reading with pagination (line-based offset/limit)
|
||||
load_data(filename="sources.json", offset=0, limit=50)
|
||||
|
||||
# Listing available files
|
||||
list_data_files()
|
||||
|
||||
# Serving a file to the user as a clickable link
|
||||
serve_file_to_user(filename="report.html", label="Research Report")
|
||||
```
|
||||
|
||||
Add data tools to nodes that handle large tool results:
|
||||
|
||||
```python
|
||||
research_node = NodeSpec(
|
||||
...
|
||||
tools=["web_search", "web_scrape", "load_data", "save_data", "list_data_files"],
|
||||
)
|
||||
```
|
||||
|
||||
`data_dir` is a framework context parameter — auto-injected at call time. `GraphExecutor.execute()` sets it per-execution via `ToolRegistry.set_execution_context(data_dir=...)` (using `contextvars` for concurrency safety), ensuring it matches the session-scoped spillover directory.
|
||||
|
||||
## Anti-Patterns
|
||||
|
||||
### What NOT to Do
|
||||
|
||||
- **Don't rely on `export_graph`** — Write files immediately, not at end
|
||||
- **Don't hide code in session** — Write to files as components are approved
|
||||
- **Don't wait to write files** — The agent should be visible from the first step
|
||||
- **Don't batch everything** — Write incrementally, one component at a time
|
||||
- **Don't create too many thin nodes** — Prefer fewer, richer nodes (see below)
|
||||
- **Don't add framework gating for LLM behavior** — Fix prompts or use judges instead
|
||||
|
||||
### Fewer, Richer Nodes
|
||||
|
||||
A common mistake is splitting work into too many small single-purpose nodes. Each node boundary requires serializing outputs, losing in-context information, and adding edge complexity.
|
||||
|
||||
| Bad (8 thin nodes) | Good (4 rich nodes) |
|
||||
| ------------------- | ----------------------------------- |
|
||||
| parse-query | intake (client-facing) |
|
||||
| search-sources | research (search + fetch + analyze) |
|
||||
| fetch-content | review (client-facing) |
|
||||
| evaluate-sources | report (write + deliver) |
|
||||
| synthesize-findings | |
|
||||
| write-report | |
|
||||
| quality-check | |
|
||||
| save-report | |
|
||||
|
||||
**Why fewer nodes are better:**
|
||||
|
||||
- The LLM retains full context of its work within a single node
|
||||
- A research node that searches, fetches, and analyzes keeps all source material in its conversation history
|
||||
- Fewer edges means simpler graph and fewer failure points
|
||||
- Data tools (`save_data`/`load_data`) handle context window limits within a single node
|
||||
|
||||
### MCP Tools - Correct Usage
|
||||
|
||||
**MCP tools OK for:**
|
||||
|
||||
- `test_node` — Validate node configuration with mock inputs
|
||||
- `validate_graph` — Check graph structure
|
||||
- `configure_loop` — Set event loop parameters
|
||||
- `create_session` — Track session state for bookkeeping
|
||||
|
||||
**Just don't:** Use MCP as the primary construction method or rely on `export_graph`.
|
||||
|
||||
## Error Handling Patterns
|
||||
|
||||
### Graceful Failure with Fallback
|
||||
|
||||
```python
|
||||
edges = [
|
||||
# Success path
|
||||
EdgeSpec(id="api-success", source="api-call", target="process-results",
|
||||
condition=EdgeCondition.ON_SUCCESS),
|
||||
# Fallback on failure
|
||||
EdgeSpec(id="api-to-fallback", source="api-call", target="fallback-cache",
|
||||
condition=EdgeCondition.ON_FAILURE, priority=1),
|
||||
# Report if fallback also fails
|
||||
EdgeSpec(id="fallback-to-error", source="fallback-cache", target="report-error",
|
||||
condition=EdgeCondition.ON_FAILURE, priority=1),
|
||||
]
|
||||
```
|
||||
|
||||
## Handoff to Testing
|
||||
|
||||
When the agent is complete, transition to the testing phase:
|
||||
|
||||
### Pre-Testing Checklist
|
||||
|
||||
- [ ] Agent structure validates: `uv run python -m agent_name validate`
|
||||
- [ ] All nodes defined in `nodes/__init__.py`
|
||||
- [ ] All edges connect valid nodes with correct priorities
|
||||
- [ ] Feedback edge targets have `max_node_visits > 1`
|
||||
- [ ] Client-facing nodes have meaningful system prompts
|
||||
- [ ] Agent can be imported: `from exports.agent_name import default_agent`
|
||||
|
||||
## Related Skills
|
||||
|
||||
- **hive-concepts** — Fundamental concepts (node types, edges, event loop architecture)
|
||||
- **hive-create** — Step-by-step building process
|
||||
- **hive-test** — Test and validate agents
|
||||
- **hive** — Complete workflow orchestrator
|
||||
|
||||
---
|
||||
|
||||
**Remember: Agent is actively constructed, visible the whole time. No hidden state. No surprise exports. Just transparent, incremental file building.**
|
||||
@@ -1,940 +0,0 @@
|
||||
---
|
||||
name: hive-test
|
||||
description: Iterative agent testing with session recovery. Execute, analyze, fix, resume from checkpoints. Use when testing an agent, debugging test failures, or verifying fixes without re-running from scratch.
|
||||
---
|
||||
|
||||
# Agent Testing
|
||||
|
||||
Test agents iteratively: execute, analyze failures, fix, resume from checkpoint, repeat.
|
||||
|
||||
## When to Use
|
||||
|
||||
- Testing a newly built agent against its goal
|
||||
- Debugging a failing agent iteratively
|
||||
- Verifying fixes without re-running expensive early nodes
|
||||
- Running final regression tests before deployment
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. Agent package at `exports/{agent_name}/` (built with `/hive-create`)
|
||||
2. Credentials configured (`/hive-credentials`)
|
||||
3. `ANTHROPIC_API_KEY` set (or appropriate LLM provider key)
|
||||
|
||||
**Path distinction** (critical — don't confuse these):
|
||||
- `exports/{agent_name}/` — agent source code (edit here)
|
||||
- `~/.hive/agents/{agent_name}/` — runtime data: sessions, checkpoints, logs (read here)
|
||||
|
||||
---
|
||||
|
||||
## The Iterative Test Loop
|
||||
|
||||
This is the core workflow. Don't re-run the entire agent when a late node fails — analyze, fix, and resume from the last clean checkpoint.
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────┐
|
||||
│ PHASE 1: Generate Test Scenarios │
|
||||
│ Goal → synthetic test inputs + tests │
|
||||
└──────────────┬───────────────────────┘
|
||||
↓
|
||||
┌──────────────────────────────────────┐
|
||||
│ PHASE 2: Execute │◄────────────────┐
|
||||
│ Run agent (CLI or pytest) │ │
|
||||
└──────────────┬───────────────────────┘ │
|
||||
↓ │
|
||||
Pass? ──yes──► PHASE 6: Final Verification │
|
||||
│ │
|
||||
no │
|
||||
↓ │
|
||||
┌──────────────────────────────────────┐ │
|
||||
│ PHASE 3: Analyze │ │
|
||||
│ Session + runtime logs + checkpoints │ │
|
||||
└──────────────┬───────────────────────┘ │
|
||||
↓ │
|
||||
┌──────────────────────────────────────┐ │
|
||||
│ PHASE 4: Fix │ │
|
||||
│ Prompt / code / graph / goal │ │
|
||||
└──────────────┬───────────────────────┘ │
|
||||
↓ │
|
||||
┌──────────────────────────────────────┐ │
|
||||
│ PHASE 5: Recover & Resume │─────────────────┘
|
||||
│ Checkpoint resume OR fresh re-run │
|
||||
└──────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Phase 1: Generate Test Scenarios
|
||||
|
||||
Create synthetic tests from the agent's goal, constraints, and success criteria.
|
||||
|
||||
#### Step 1a: Read the goal
|
||||
|
||||
```python
|
||||
# Read goal from agent.py
|
||||
Read(file_path="exports/{agent_name}/agent.py")
|
||||
# Extract the Goal definition and convert to JSON string
|
||||
```
|
||||
|
||||
#### Step 1b: Get test guidelines
|
||||
|
||||
```python
|
||||
# Get constraint test guidelines
|
||||
generate_constraint_tests(
|
||||
goal_id="your-goal-id",
|
||||
goal_json='{"id": "...", "constraints": [...]}',
|
||||
agent_path="exports/{agent_name}"
|
||||
)
|
||||
|
||||
# Get success criteria test guidelines
|
||||
generate_success_tests(
|
||||
goal_id="your-goal-id",
|
||||
goal_json='{"id": "...", "success_criteria": [...]}',
|
||||
node_names="intake,research,review,report",
|
||||
tool_names="web_search,web_scrape",
|
||||
agent_path="exports/{agent_name}"
|
||||
)
|
||||
```
|
||||
|
||||
These return `file_header`, `test_template`, `constraints_formatted`/`success_criteria_formatted`, and `test_guidelines`. They do NOT generate test code — you write the tests.
|
||||
|
||||
#### Step 1c: Write tests
|
||||
|
||||
```python
|
||||
Write(
|
||||
file_path=result["output_file"],
|
||||
content=result["file_header"] + "\n\n" + your_test_code
|
||||
)
|
||||
```
|
||||
|
||||
#### Test writing rules
|
||||
|
||||
- Every test MUST be `async` with `@pytest.mark.asyncio`
|
||||
- Every test MUST accept `runner, auto_responder, mock_mode` fixtures
|
||||
- Use `await auto_responder.start()` before running, `await auto_responder.stop()` in `finally`
|
||||
- Use `await runner.run(input_dict)` — this goes through AgentRunner → AgentRuntime → ExecutionStream
|
||||
- Access output via `result.output.get("key")` — NEVER `result.output["key"]`
|
||||
- `result.success=True` means no exception, NOT goal achieved — always check output
|
||||
- Write 8-15 tests total, not 30+
|
||||
- Each real test costs ~3 seconds + LLM tokens
|
||||
- NEVER use `default_agent.run()` — it bypasses the runtime (no sessions, no logs, client-facing nodes hang)
|
||||
|
||||
#### Step 1d: Check existing tests
|
||||
|
||||
Before generating and writing tests (Steps 1b–1c), check whether tests already exist:
|
||||
|
||||
```python
|
||||
list_tests(
|
||||
goal_id="your-goal-id",
|
||||
agent_path="exports/{agent_name}"
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Phase 2: Execute
|
||||
|
||||
There are two execution paths; use the right one for your situation.
|
||||
|
||||
#### Iterative debugging (for complex agents)
|
||||
|
||||
Run the agent via CLI. This creates sessions with checkpoints at `~/.hive/agents/{agent_name}/sessions/`:
|
||||
|
||||
```bash
|
||||
uv run hive run exports/{agent_name} --input '{"query": "test topic"}'
|
||||
```
|
||||
|
||||
Sessions and checkpoints are saved automatically.
|
||||
|
||||
**Client-facing nodes**: Agents with `client_facing=True` nodes (interactive conversation) work in headless mode when run from a real terminal — the agent streams output to stdout and reads user input from stdin via a `>>> ` prompt. In non-interactive shells (like Claude Code's Bash tool), client-facing nodes will hang because there is no stdin. For testing interactive agents from Claude Code, use `run_tests` with mock mode or have the user run the agent manually in their terminal.
|
||||
|
||||
#### Automated regression (for CI or final verification)
|
||||
|
||||
Use the `run_tests` MCP tool to run all pytest tests:
|
||||
|
||||
```python
|
||||
run_tests(
|
||||
goal_id="your-goal-id",
|
||||
agent_path="exports/{agent_name}"
|
||||
)
|
||||
```
|
||||
|
||||
Returns structured results:
|
||||
```json
|
||||
{
|
||||
"overall_passed": false,
|
||||
"summary": {"total": 12, "passed": 10, "failed": 2, "pass_rate": "83.3%"},
|
||||
"test_results": [{"test_name": "test_success_source_diversity", "status": "failed"}],
|
||||
"failures": [{"test_name": "test_success_source_diversity", "details": "..."}]
|
||||
}
|
||||
```
|
||||
|
||||
**Options:**
|
||||
```python
|
||||
# Run only constraint tests
|
||||
run_tests(goal_id, agent_path, test_types='["constraint"]')
|
||||
|
||||
# Stop on first failure
|
||||
run_tests(goal_id, agent_path, fail_fast=True)
|
||||
|
||||
# Parallel execution
|
||||
run_tests(goal_id, agent_path, parallel=4)
|
||||
```
|
||||
|
||||
**Note:** `run_tests` uses `AgentRunner` with `tmp_path` storage, so sessions are isolated per test run. For checkpoint-based recovery with persistent sessions, use CLI execution. Use `run_tests` for quick regression checks and final verification.
|
||||
|
||||
---
|
||||
|
||||
### Phase 3: Analyze Failures
|
||||
|
||||
When a test fails, drill down systematically. Don't guess — use the tools.
|
||||
|
||||
#### Step 3a: Get error category
|
||||
|
||||
```python
|
||||
debug_test(
|
||||
goal_id="your-goal-id",
|
||||
test_name="test_success_source_diversity",
|
||||
agent_path="exports/{agent_name}"
|
||||
)
|
||||
```
|
||||
|
||||
Returns error category (`IMPLEMENTATION_ERROR`, `ASSERTION_FAILURE`, `TIMEOUT`, `IMPORT_ERROR`, `API_ERROR`) plus full traceback and suggestions.
|
||||
|
||||
#### Step 3b: Find the failed session
|
||||
|
||||
```python
|
||||
list_agent_sessions(
|
||||
agent_work_dir="~/.hive/agents/{agent_name}",
|
||||
status="failed",
|
||||
limit=5
|
||||
)
|
||||
```
|
||||
|
||||
Returns session list with IDs, timestamps, current_node (where it failed), execution_quality.
|
||||
|
||||
#### Step 3c: Inspect session state
|
||||
|
||||
```python
|
||||
get_agent_session_state(
|
||||
agent_work_dir="~/.hive/agents/{agent_name}",
|
||||
session_id="session_20260209_143022_abc12345"
|
||||
)
|
||||
```
|
||||
|
||||
Returns execution path, which node was current, step count, timestamps — but excludes memory values (to avoid context bloat). Shows `memory_keys` and `memory_size` instead.
|
||||
|
||||
#### Step 3d: Examine runtime logs (L2/L3)
|
||||
|
||||
```python
|
||||
# L2: Per-node success/failure, retry counts
|
||||
query_runtime_log_details(
|
||||
agent_work_dir="~/.hive/agents/{agent_name}",
|
||||
run_id="session_20260209_143022_abc12345",
|
||||
needs_attention_only=True
|
||||
)
|
||||
|
||||
# L3: Exact LLM responses, tool call inputs/outputs
|
||||
query_runtime_log_raw(
|
||||
agent_work_dir="~/.hive/agents/{agent_name}",
|
||||
run_id="session_20260209_143022_abc12345",
|
||||
node_id="research"
|
||||
)
|
||||
```
|
||||
|
||||
#### Step 3e: Inspect memory data
|
||||
|
||||
```python
|
||||
# See what data a node actually produced
|
||||
get_agent_session_memory(
|
||||
agent_work_dir="~/.hive/agents/{agent_name}",
|
||||
session_id="session_20260209_143022_abc12345",
|
||||
key="research_results"
|
||||
)
|
||||
```
|
||||
|
||||
#### Step 3f: Find recovery points
|
||||
|
||||
```python
|
||||
list_agent_checkpoints(
|
||||
agent_work_dir="~/.hive/agents/{agent_name}",
|
||||
session_id="session_20260209_143022_abc12345",
|
||||
is_clean="true"
|
||||
)
|
||||
```
|
||||
|
||||
Returns checkpoint summaries with IDs, types (`node_start`, `node_complete`), which node, and `is_clean` flag. Clean checkpoints are safe resume points.
|
||||
|
||||
#### Step 3g: Compare checkpoints (optional)
|
||||
|
||||
To understand what changed between two points in execution:
|
||||
|
||||
```python
|
||||
compare_agent_checkpoints(
|
||||
agent_work_dir="~/.hive/agents/{agent_name}",
|
||||
session_id="session_20260209_143022_abc12345",
|
||||
checkpoint_id_before="cp_node_complete_research_143030",
|
||||
checkpoint_id_after="cp_node_complete_review_143115"
|
||||
)
|
||||
```
|
||||
|
||||
Returns memory diff (added/removed/changed keys) and execution path diff.
|
||||
|
||||
---
|
||||
|
||||
### Phase 4: Fix Based on Root Cause
|
||||
|
||||
Use the analysis from Phase 3 to determine what to fix and where.
|
||||
|
||||
| Root Cause | What to Fix | Where to Edit |
|
||||
|------------|------------|---------------|
|
||||
| **Prompt issue** — LLM produces wrong output format, misses instructions | Node `system_prompt` | `exports/{agent}/nodes/__init__.py` |
|
||||
| **Code bug** — TypeError, KeyError, logic error in Python | Agent code | `exports/{agent}/agent.py`, `nodes/__init__.py` |
|
||||
| **Graph issue** — wrong routing, missing edge, bad condition_expr | Edges, node config | `exports/{agent}/agent.py` |
|
||||
| **Tool issue** — MCP tool fails, wrong config, missing credential | Tool config | `exports/{agent}/mcp_servers.json`, `/hive-credentials` |
|
||||
| **Goal issue** — success criteria too strict/vague, wrong constraints | Goal definition | `exports/{agent}/agent.py` (goal section) |
|
||||
| **Test issue** — test expectations don't match actual agent behavior | Test code | `exports/{agent}/tests/test_*.py` |
|
||||
|
||||
#### Fix strategies by error category
|
||||
|
||||
**IMPLEMENTATION_ERROR** (TypeError, AttributeError, KeyError):
|
||||
```python
|
||||
# Read the failing code
|
||||
Read(file_path="exports/{agent_name}/nodes/__init__.py")
|
||||
|
||||
# Fix the bug
|
||||
Edit(
|
||||
file_path="exports/{agent_name}/nodes/__init__.py",
|
||||
old_string="results.get('videos')",
|
||||
new_string="(results or {}).get('videos', [])"
|
||||
)
|
||||
```
|
||||
|
||||
**ASSERTION_FAILURE** (test assertions fail but agent ran successfully):
|
||||
- Check if the agent's output is actually wrong → fix the prompt
|
||||
- Check if the test's expectations are unrealistic → fix the test
|
||||
- Use `get_agent_session_memory` to see what the agent actually produced
|
||||
|
||||
**TIMEOUT / STALL** (agent runs too long):
|
||||
- Check `node_visit_counts` for feedback loops hitting max_node_visits
|
||||
- Check L3 logs for tool calls that hang
|
||||
- Reduce `max_iterations` in loop_config or fix the prompt to converge faster
|
||||
|
||||
**API_ERROR** (connection, rate limit, auth):
|
||||
- Verify credentials with `/hive-credentials`
|
||||
- Check MCP server configuration
|
||||
|
||||
---
|
||||
|
||||
### Phase 5: Recover & Resume
|
||||
|
||||
After fixing the agent, decide whether to resume or re-run.
|
||||
|
||||
#### When to resume from checkpoint
|
||||
|
||||
Resume when ALL of these are true:
|
||||
- The fix is to a node that comes AFTER existing clean checkpoints
|
||||
- Clean checkpoints exist (from a CLI execution with checkpointing)
|
||||
- The early nodes are expensive (web scraping, API calls, long LLM chains)
|
||||
|
||||
```bash
|
||||
# Resume from the last clean checkpoint before the failing node
|
||||
uv run hive run exports/{agent_name} \
|
||||
--resume-session session_20260209_143022_abc12345 \
|
||||
--checkpoint cp_node_complete_research_143030
|
||||
```
|
||||
|
||||
This skips all nodes before the checkpoint and only re-runs the fixed node onward.
|
||||
|
||||
#### When to re-run from scratch
|
||||
|
||||
Re-run when ANY of these are true:
|
||||
- The fix is to the entry node or an early node
|
||||
- No checkpoints exist (e.g., agent was run via `run_tests`)
|
||||
- The agent is fast (2-3 nodes, completes in seconds)
|
||||
- You changed the graph structure (added/removed nodes/edges)
|
||||
|
||||
```bash
|
||||
uv run hive run exports/{agent_name} --input '{"query": "test topic"}'
|
||||
```
|
||||
|
||||
#### Inspecting a checkpoint before resuming
|
||||
|
||||
```python
|
||||
get_agent_checkpoint(
|
||||
agent_work_dir="~/.hive/agents/{agent_name}",
|
||||
session_id="session_20260209_143022_abc12345",
|
||||
checkpoint_id="cp_node_complete_research_143030"
|
||||
)
|
||||
```
|
||||
|
||||
Returns the full checkpoint: shared_memory snapshot, execution_path, current_node, next_node, is_clean.
|
||||
|
||||
#### Loop back to Phase 2
|
||||
|
||||
After resuming or re-running, check if the fix worked. If not, go back to Phase 3.
|
||||
|
||||
---
|
||||
|
||||
### Phase 6: Final Verification
|
||||
|
||||
Once the iterative fix loop converges (the agent produces correct output), run the full automated test suite:
|
||||
|
||||
```python
|
||||
run_tests(
|
||||
goal_id="your-goal-id",
|
||||
agent_path="exports/{agent_name}"
|
||||
)
|
||||
```
|
||||
|
||||
All tests should pass. If not, repeat the loop for remaining failures.
|
||||
|
||||
---
|
||||
|
||||
## Credential Requirements
|
||||
|
||||
**CRITICAL: Testing requires ALL credentials the agent depends on.** This includes both the LLM API key AND any tool-specific credentials (HubSpot, Brave Search, etc.).
|
||||
|
||||
### Prerequisites
|
||||
|
||||
Before running agent tests, you MUST collect ALL required credentials from the user.
|
||||
|
||||
**Step 1: LLM API Key (always required)**
|
||||
```bash
|
||||
export ANTHROPIC_API_KEY="your-key-here"
|
||||
```
|
||||
|
||||
**Step 2: Tool-specific credentials (depends on agent's tools)**
|
||||
|
||||
Inspect the agent's `mcp_servers.json` and tool configuration to determine which tools the agent uses, then check for all required credentials:
|
||||
|
||||
```python
|
||||
from aden_tools.credentials import CredentialManager, CREDENTIAL_SPECS
|
||||
|
||||
creds = CredentialManager()
|
||||
|
||||
# Determine which tools the agent uses (from agent.json or mcp_servers.json)
|
||||
agent_tools = [...] # e.g., ["hubspot_search_contacts", "web_search", ...]
|
||||
|
||||
# Find all missing credentials for those tools
|
||||
missing = creds.get_missing_for_tools(agent_tools)
|
||||
```
|
||||
|
||||
Common tool credentials:
|
||||
| Tool | Env Var | Help URL |
|
||||
|------|---------|----------|
|
||||
| HubSpot CRM | `HUBSPOT_ACCESS_TOKEN` | https://developers.hubspot.com/docs/api/private-apps |
|
||||
| Brave Search | `BRAVE_SEARCH_API_KEY` | https://brave.com/search/api/ |
|
||||
| Google Search | `GOOGLE_SEARCH_API_KEY` + `GOOGLE_SEARCH_CX` | https://developers.google.com/custom-search |
|
||||
|
||||
**Why ALL credentials are required:**
|
||||
- Tests need to execute the agent's LLM nodes to validate behavior
|
||||
- Tools with missing credentials will return error dicts instead of real data
|
||||
- Mock mode bypasses everything, providing no confidence in real-world performance
|
||||
|
||||
### Mock Mode Limitations
|
||||
|
||||
Mock mode (`--mock` flag or `MOCK_MODE=1`) is **ONLY for structure validation**:
|
||||
|
||||
- Validates graph structure (nodes, edges, connections)
|
||||
- Validates that `AgentRunner.load()` succeeds and the agent is importable
|
||||
- Does NOT execute event_loop agents — MockLLMProvider never calls `set_output`, so event_loop nodes loop forever
|
||||
- Does NOT test LLM reasoning, content quality, or constraint validation
|
||||
- Does NOT test real API integrations or tool use
|
||||
|
||||
**Bottom line:** If you're testing whether an agent achieves its goal, you MUST use real credentials.
|
||||
|
||||
### Enforcing Credentials in Tests
|
||||
|
||||
When writing tests, **ALWAYS include credential checks**:
|
||||
|
||||
```python
|
||||
import os
|
||||
import pytest
|
||||
from aden_tools.credentials import CredentialManager
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not CredentialManager().is_available("anthropic") and not os.environ.get("MOCK_MODE"),
|
||||
reason="API key required for real testing. Set ANTHROPIC_API_KEY or use MOCK_MODE=1."
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def check_credentials():
|
||||
"""Ensure ALL required credentials are set for real testing."""
|
||||
creds = CredentialManager()
|
||||
mock_mode = os.environ.get("MOCK_MODE")
|
||||
|
||||
if not creds.is_available("anthropic"):
|
||||
if mock_mode:
|
||||
print("\nRunning in MOCK MODE - structure validation only")
|
||||
else:
|
||||
pytest.fail(
|
||||
"\nANTHROPIC_API_KEY not set!\n"
|
||||
"Set API key: export ANTHROPIC_API_KEY='your-key-here'\n"
|
||||
"Or run structure validation: MOCK_MODE=1 pytest exports/{agent}/tests/"
|
||||
)
|
||||
|
||||
if not mock_mode:
|
||||
agent_tools = [] # Update per agent
|
||||
missing = creds.get_missing_for_tools(agent_tools)
|
||||
if missing:
|
||||
lines = ["\nMissing tool credentials!"]
|
||||
for name in missing:
|
||||
spec = creds.specs.get(name)
|
||||
if spec:
|
||||
lines.append(f" {spec.env_var} - {spec.description}")
|
||||
pytest.fail("\n".join(lines))
|
||||
```
|
||||
|
||||
### User Communication
|
||||
|
||||
When the user asks to test an agent, **ALWAYS check for ALL credentials first**:
|
||||
|
||||
1. **Identify the agent's tools** from `mcp_servers.json`
|
||||
2. **Check ALL required credentials** using `CredentialManager`
|
||||
3. **Ask the user to provide any missing credentials** before proceeding
|
||||
4. Collect ALL missing credentials in a single prompt — not one at a time
|
||||
|
||||
---
|
||||
|
||||
## Safe Test Patterns
|
||||
|
||||
### OutputCleaner
|
||||
|
||||
The framework automatically validates and cleans node outputs using a fast LLM at edge traversal time. Tests should still use safe patterns because OutputCleaner may not catch all issues.
|
||||
|
||||
### Safe Access (REQUIRED)
|
||||
|
||||
```python
|
||||
# UNSAFE - will crash on missing keys
|
||||
approval = result.output["approval_decision"]
|
||||
category = result.output["analysis"]["category"]
|
||||
|
||||
# SAFE - use .get() with defaults
|
||||
output = result.output or {}
|
||||
approval = output.get("approval_decision", "UNKNOWN")
|
||||
|
||||
# SAFE - type check before operations
|
||||
analysis = output.get("analysis", {})
|
||||
if isinstance(analysis, dict):
|
||||
category = analysis.get("category", "unknown")
|
||||
|
||||
# SAFE - handle JSON parsing trap (LLM response as string)
|
||||
import json
|
||||
recommendation = output.get("recommendation", "{}")
|
||||
if isinstance(recommendation, str):
|
||||
try:
|
||||
parsed = json.loads(recommendation)
|
||||
if isinstance(parsed, dict):
|
||||
approval = parsed.get("approval_decision", "UNKNOWN")
|
||||
except json.JSONDecodeError:
|
||||
approval = "UNKNOWN"
|
||||
elif isinstance(recommendation, dict):
|
||||
approval = recommendation.get("approval_decision", "UNKNOWN")
|
||||
|
||||
# SAFE - type check before iteration
|
||||
items = output.get("items", [])
|
||||
if isinstance(items, list):
|
||||
for item in items:
|
||||
...
|
||||
```
|
||||
|
||||
### Helper Functions for conftest.py
|
||||
|
||||
```python
|
||||
import json
|
||||
import re
|
||||
|
||||
def _parse_json_from_output(result, key):
|
||||
"""Parse JSON from agent output (framework may store full LLM response as string)."""
|
||||
response_text = result.output.get(key, "")
|
||||
json_text = re.sub(r'```json\s*|\s*```', '', response_text).strip()
|
||||
try:
|
||||
return json.loads(json_text)
|
||||
except (json.JSONDecodeError, AttributeError, TypeError):
|
||||
return result.output.get(key)
|
||||
|
||||
def safe_get_nested(result, key_path, default=None):
|
||||
"""Safely get nested value from result.output."""
|
||||
output = result.output or {}
|
||||
current = output
|
||||
for key in key_path:
|
||||
if isinstance(current, dict):
|
||||
current = current.get(key)
|
||||
elif isinstance(current, str):
|
||||
try:
|
||||
json_text = re.sub(r'```json\s*|\s*```', '', current).strip()
|
||||
parsed = json.loads(json_text)
|
||||
if isinstance(parsed, dict):
|
||||
current = parsed.get(key)
|
||||
else:
|
||||
return default
|
||||
except json.JSONDecodeError:
|
||||
return default
|
||||
else:
|
||||
return default
|
||||
return current if current is not None else default
|
||||
|
||||
# Make available in tests
|
||||
pytest.parse_json_from_output = _parse_json_from_output
|
||||
pytest.safe_get_nested = safe_get_nested
|
||||
```
|
||||
|
||||
### ExecutionResult Fields
|
||||
|
||||
**`result.success=True` means NO exception, NOT goal achieved**
|
||||
|
||||
```python
|
||||
# WRONG - a bare success check only proves no exception was raised, not that the goal was achieved
|
||||
assert result.success
|
||||
|
||||
# RIGHT
|
||||
assert result.success, f"Agent failed: {result.error}"
|
||||
output = result.output or {}
|
||||
approval = output.get("approval_decision")
|
||||
assert approval == "APPROVED", f"Expected APPROVED, got {approval}"
|
||||
```
|
||||
|
||||
All fields:
|
||||
- `success: bool` — Completed without exception (NOT goal achieved!)
|
||||
- `output: dict` — Complete memory snapshot (may contain raw strings)
|
||||
- `error: str | None` — Error message if failed
|
||||
- `steps_executed: int` — Number of nodes executed
|
||||
- `total_tokens: int` — Cumulative token usage
|
||||
- `total_latency_ms: int` — Total execution time
|
||||
- `path: list[str]` — Node IDs traversed (may repeat in feedback loops)
|
||||
- `paused_at: str | None` — Node ID if paused
|
||||
- `session_state: dict` — State for resuming
|
||||
- `node_visit_counts: dict[str, int]` — Visit counts per node (feedback loop testing)
|
||||
- `execution_quality: str` — "clean", "degraded", or "failed"
|
||||
|
||||
### Test Count Guidance
|
||||
|
||||
**Write 8-15 tests, not 30+**
|
||||
|
||||
- 2-3 tests per success criterion
|
||||
- 1 happy path test
|
||||
- 1 boundary/edge case test
|
||||
- 1 error handling test (optional)
|
||||
|
||||
Each real test costs ~3 seconds + LLM tokens. 12 tests = ~36 seconds, $0.12.
|
||||
|
||||
---
|
||||
|
||||
## Test Patterns
|
||||
|
||||
### Happy Path
|
||||
```python
|
||||
@pytest.mark.asyncio
|
||||
async def test_happy_path(runner, auto_responder, mock_mode):
|
||||
"""Test normal successful execution."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": "python tutorials"})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
assert result.success, f"Agent failed: {result.error}"
|
||||
output = result.output or {}
|
||||
assert output.get("report"), "No report produced"
|
||||
```
|
||||
|
||||
### Boundary Condition
|
||||
```python
|
||||
@pytest.mark.asyncio
|
||||
async def test_minimum_sources(runner, auto_responder, mock_mode):
|
||||
"""Test at minimum source threshold."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": "niche topic"})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
assert result.success, f"Agent failed: {result.error}"
|
||||
output = result.output or {}
|
||||
sources = output.get("sources", [])
|
||||
if isinstance(sources, list):
|
||||
assert len(sources) >= 3, f"Expected >= 3 sources, got {len(sources)}"
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
```python
|
||||
@pytest.mark.asyncio
|
||||
async def test_empty_input(runner, auto_responder, mock_mode):
|
||||
"""Test graceful handling of empty input."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": ""})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
# Agent should either fail gracefully or produce an error message
|
||||
output = result.output or {}
|
||||
assert not result.success or output.get("error"), "Should handle empty input"
|
||||
```
|
||||
|
||||
### Feedback Loop
|
||||
```python
|
||||
@pytest.mark.asyncio
|
||||
async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
|
||||
"""Test that feedback loops don't run forever."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": "test"})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
visits = result.node_visit_counts or {}
|
||||
for node_id, count in visits.items():
|
||||
assert count <= 5, f"Node {node_id} visited {count} times — possible infinite loop"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## MCP Tool Reference
|
||||
|
||||
### Phase 1: Test Generation
|
||||
|
||||
```python
|
||||
# Check existing tests
|
||||
list_tests(goal_id, agent_path)
|
||||
|
||||
# Get constraint test guidelines (returns templates, NOT generated tests)
|
||||
generate_constraint_tests(goal_id, goal_json, agent_path)
|
||||
# Returns: output_file, file_header, test_template, constraints_formatted, test_guidelines
|
||||
|
||||
# Get success criteria test guidelines
|
||||
generate_success_tests(goal_id, goal_json, node_names, tool_names, agent_path)
|
||||
# Returns: output_file, file_header, test_template, success_criteria_formatted, test_guidelines
|
||||
```
|
||||
|
||||
### Phase 2: Execution
|
||||
|
||||
```python
|
||||
# Automated regression (no checkpoints, fresh runs)
|
||||
run_tests(goal_id, agent_path, test_types='["all"]', parallel=-1, fail_fast=False)
|
||||
|
||||
# Run only specific test types
|
||||
run_tests(goal_id, agent_path, test_types='["constraint"]')
|
||||
run_tests(goal_id, agent_path, test_types='["success"]')
|
||||
```
|
||||
|
||||
```bash
|
||||
# Iterative debugging with checkpoints (via CLI)
|
||||
uv run hive run exports/{agent_name} --input '{"query": "test"}'
|
||||
```
|
||||
|
||||
### Phase 3: Analysis
|
||||
|
||||
```python
|
||||
# Debug a specific failed test
|
||||
debug_test(goal_id, test_name, agent_path)
|
||||
|
||||
# Find failed sessions
|
||||
list_agent_sessions(agent_work_dir, status="failed", limit=5)
|
||||
|
||||
# Inspect session state (excludes memory values)
|
||||
get_agent_session_state(agent_work_dir, session_id)
|
||||
|
||||
# Inspect memory data
|
||||
get_agent_session_memory(agent_work_dir, session_id, key="research_results")
|
||||
|
||||
# Runtime logs: L1 summaries
|
||||
query_runtime_logs(agent_work_dir, status="needs_attention")
|
||||
|
||||
# Runtime logs: L2 per-node details
|
||||
query_runtime_log_details(agent_work_dir, run_id, needs_attention_only=True)
|
||||
|
||||
# Runtime logs: L3 tool/LLM raw data
|
||||
query_runtime_log_raw(agent_work_dir, run_id, node_id="research")
|
||||
|
||||
# Find clean checkpoints
|
||||
list_agent_checkpoints(agent_work_dir, session_id, is_clean="true")
|
||||
|
||||
# Compare checkpoints (memory diff)
|
||||
compare_agent_checkpoints(agent_work_dir, session_id, cp_before, cp_after)
|
||||
```
|
||||
|
||||
### Phase 5: Recovery
|
||||
|
||||
```python
|
||||
# Inspect checkpoint before resuming
|
||||
get_agent_checkpoint(agent_work_dir, session_id, checkpoint_id)
|
||||
# Empty checkpoint_id = latest checkpoint
|
||||
```
|
||||
|
||||
```bash
|
||||
# Resume from checkpoint via CLI (headless)
|
||||
uv run hive run exports/{agent_name} \
|
||||
--resume-session {session_id} --checkpoint {checkpoint_id}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Anti-Patterns
|
||||
|
||||
| Don't | Do Instead |
|
||||
|-------|-----------|
|
||||
| Use `default_agent.run()` in tests | Use `runner.run()` with `auto_responder` fixtures (goes through AgentRuntime) |
|
||||
| Re-run entire agent when a late node fails | Resume from last clean checkpoint |
|
||||
| Treat `result.success` as goal achieved | Check `result.output` for actual criteria |
|
||||
| Access `result.output["key"]` directly | Use `result.output.get("key")` |
|
||||
| Fix random things hoping tests pass | Analyze L2/L3 logs to find root cause first |
|
||||
| Write 30+ tests | Write 8-15 focused tests |
|
||||
| Skip credential check | Use `/hive-credentials` before testing |
|
||||
| Confuse `exports/` with `~/.hive/agents/` | Code in `exports/`, runtime data in `~/.hive/` |
|
||||
| Use `run_tests` for iterative debugging | Use headless CLI with checkpoints for iterative debugging |
|
||||
| Use headless CLI for final regression | Use `run_tests` for automated regression |
|
||||
| Use `--tui` from Claude Code | Use headless `run` command — TUI hangs in non-interactive shells |
|
||||
| Test client-facing nodes from Claude Code | Use mock mode, or have the user run the agent in their terminal |
|
||||
| Run tests without reading goal first | Always understand the goal before writing tests |
|
||||
| Skip Phase 3 analysis and guess | Use session + log tools to identify root cause |
|
||||
|
||||
---
|
||||
|
||||
## Example Walkthrough: Deep Research Agent
|
||||
|
||||
A complete iteration showing the test loop for an agent with nodes: `intake → research → review → report`.
|
||||
|
||||
### Phase 1: Generate tests
|
||||
|
||||
```python
|
||||
# Read the goal
|
||||
Read(file_path="exports/deep_research_agent/agent.py")
|
||||
|
||||
# Get success criteria test guidelines
|
||||
result = generate_success_tests(
|
||||
goal_id="rigorous-interactive-research",
|
||||
goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "target": ">=5"}, {"id": "citation-coverage", "target": "100%"}, {"id": "report-completeness", "target": "90%"}]}',
|
||||
node_names="intake,research,review,report",
|
||||
tool_names="web_search,web_scrape",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
|
||||
# Write tests
|
||||
Write(
|
||||
file_path=result["output_file"],
|
||||
content=result["file_header"] + "\n\n" + test_code
|
||||
)
|
||||
```
|
||||
|
||||
### Phase 2: First execution
|
||||
|
||||
```python
|
||||
run_tests(
|
||||
goal_id="rigorous-interactive-research",
|
||||
agent_path="exports/deep_research_agent",
|
||||
fail_fast=True
|
||||
)
|
||||
```
|
||||
|
||||
Result: `test_success_source_diversity` fails — agent only found 2 sources instead of 5.
|
||||
|
||||
### Phase 3: Analyze
|
||||
|
||||
```python
|
||||
# Debug the failing test
|
||||
debug_test(
|
||||
goal_id="rigorous-interactive-research",
|
||||
test_name="test_success_source_diversity",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
# → ASSERTION_FAILURE: Expected >= 5 sources, got 2
|
||||
|
||||
# Find the session
|
||||
list_agent_sessions(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
status="completed",
|
||||
limit=1
|
||||
)
|
||||
# → session_20260209_150000_abc12345
|
||||
|
||||
# See what the research node produced
|
||||
get_agent_session_memory(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
session_id="session_20260209_150000_abc12345",
|
||||
key="research_results"
|
||||
)
|
||||
# → Only 2 web_search calls made, each returned 1 source
|
||||
|
||||
# Check the LLM's behavior in the research node
|
||||
query_runtime_log_raw(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
run_id="session_20260209_150000_abc12345",
|
||||
node_id="research"
|
||||
)
|
||||
# → LLM called web_search only twice, then called set_output
|
||||
```
|
||||
|
||||
Root cause: The research node's prompt doesn't tell the LLM to search for at least 5 diverse sources. It stops after the first couple of searches.
|
||||
|
||||
### Phase 4: Fix the prompt
|
||||
|
||||
```python
|
||||
Read(file_path="exports/deep_research_agent/nodes/__init__.py")
|
||||
|
||||
Edit(
|
||||
file_path="exports/deep_research_agent/nodes/__init__.py",
|
||||
old_string='system_prompt="Search for information on the user\'s topic."',
|
||||
new_string='system_prompt="Search for information on the user\'s topic. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries to ensure source diversity. Do not stop searching until you have at least 5 distinct sources."'
|
||||
)
|
||||
```
|
||||
|
||||
### Phase 5: Resume from checkpoint
|
||||
|
||||
For this example, the fix is to the `research` node. If we had run via CLI with checkpointing, we could resume from the checkpoint after `intake` to skip re-running intake:
|
||||
|
||||
```bash
|
||||
# Check if a clean checkpoint exists after intake (this is an MCP tool call, not a shell command)
|
||||
list_agent_checkpoints(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
session_id="session_20260209_150000_abc12345",
|
||||
is_clean="true"
|
||||
)
|
||||
# → cp_node_complete_intake_150005
|
||||
|
||||
# Resume from after intake, re-run research with fixed prompt
|
||||
uv run hive run exports/deep_research_agent \
|
||||
--resume-session session_20260209_150000_abc12345 \
|
||||
--checkpoint cp_node_complete_intake_150005
|
||||
```
|
||||
|
||||
Or for this simple case (intake is fast), just re-run:
|
||||
|
||||
```bash
|
||||
uv run hive run exports/deep_research_agent --input '{"topic": "test"}'
|
||||
```
|
||||
|
||||
### Phase 6: Final verification
|
||||
|
||||
```python
|
||||
run_tests(
|
||||
goal_id="rigorous-interactive-research",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
# → All 12 tests pass
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Test File Structure
|
||||
|
||||
```
|
||||
exports/{agent_name}/
|
||||
├── agent.py ← Agent to test (goal, nodes, edges)
|
||||
├── nodes/__init__.py ← Node implementations (prompts, config)
|
||||
├── config.py ← Agent configuration
|
||||
├── mcp_servers.json ← Tool server config
|
||||
└── tests/
|
||||
├── conftest.py ← Shared fixtures + safe access helpers
|
||||
├── test_constraints.py ← Constraint tests
|
||||
├── test_success_criteria.py ← Success criteria tests
|
||||
└── test_edge_cases.py ← Edge case tests
|
||||
```
|
||||
|
||||
## Integration with Other Skills
|
||||
|
||||
| Scenario | From | To | Action |
|
||||
|----------|------|----|--------|
|
||||
| Agent built, ready to test | `/hive-create` | `/hive-test` | Generate tests, start loop |
|
||||
| Prompt fix needed | `/hive-test` Phase 4 | Direct edit | Edit `nodes/__init__.py`, resume |
|
||||
| Goal definition wrong | `/hive-test` Phase 4 | `/hive-create` | Update goal, may need rebuild |
|
||||
| Missing credentials | `/hive-test` Phase 3 | `/hive-credentials` | Set up credentials |
|
||||
| Complex runtime failure | `/hive-test` Phase 3 | `/hive-debugger` | Deep L1/L2/L3 analysis |
|
||||
| All tests pass | `/hive-test` Phase 6 | Done | Agent validated |
|
||||
@@ -1,333 +0,0 @@
|
||||
# Example: Iterative Testing of a Research Agent
|
||||
|
||||
This example walks through the full iterative test loop for a research agent that searches the web, reviews findings, and produces a cited report.
|
||||
|
||||
## Agent Structure
|
||||
|
||||
```
|
||||
exports/deep_research_agent/
|
||||
├── agent.py # Goal + graph: intake → research → review → report
|
||||
├── nodes/__init__.py # Node definitions (system_prompt, input/output keys)
|
||||
├── config.py # Model config
|
||||
├── mcp_servers.json # Tools: web_search, web_scrape
|
||||
└── tests/ # Test files (we'll create these)
|
||||
```
|
||||
|
||||
**Goal:** "Rigorous Interactive Research" — find 5+ diverse sources, cite every claim, produce a complete report.
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Generate Tests
|
||||
|
||||
### Read the goal
|
||||
|
||||
```python
|
||||
Read(file_path="exports/deep_research_agent/agent.py")
|
||||
# Extract: goal_id="rigorous-interactive-research"
|
||||
# success_criteria: source-diversity (>=5), citation-coverage (100%), report-completeness (90%)
|
||||
# constraints: no-hallucination, source-attribution
|
||||
```
|
||||
|
||||
### Get test guidelines
|
||||
|
||||
```python
|
||||
result = generate_success_tests(
|
||||
goal_id="rigorous-interactive-research",
|
||||
goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "description": "Use multiple diverse sources", "target": ">=5"}, {"id": "citation-coverage", "description": "Every claim cites its source", "target": "100%"}, {"id": "report-completeness", "description": "Report answers the research questions", "target": "90%"}]}',
|
||||
node_names="intake,research,review,report",
|
||||
tool_names="web_search,web_scrape",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
```
|
||||
|
||||
### Write tests
|
||||
|
||||
```python
|
||||
Write(
|
||||
file_path="exports/deep_research_agent/tests/test_success_criteria.py",
|
||||
content=result["file_header"] + '''
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_success_source_diversity(runner, auto_responder, mock_mode):
|
||||
"""At least 5 diverse sources are found."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": "impact of remote work on productivity"})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
assert result.success, f"Agent failed: {result.error}"
|
||||
output = result.output or {}
|
||||
sources = output.get("sources", [])
|
||||
if isinstance(sources, list):
|
||||
assert len(sources) >= 5, f"Expected >= 5 sources, got {len(sources)}"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_success_citation_coverage(runner, auto_responder, mock_mode):
|
||||
"""Every factual claim in the report cites its source."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": "climate change effects on agriculture"})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
assert result.success, f"Agent failed: {result.error}"
|
||||
output = result.output or {}
|
||||
report = output.get("report", "")
|
||||
# Check that report contains numbered references
|
||||
assert "[1]" in str(report) or "[source" in str(report).lower(), "Report lacks citations"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_success_report_completeness(runner, auto_responder, mock_mode):
|
||||
"""Report addresses the original research question."""
|
||||
query = "pros and cons of nuclear energy"
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": query})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
assert result.success, f"Agent failed: {result.error}"
|
||||
output = result.output or {}
|
||||
report = output.get("report", "")
|
||||
assert len(str(report)) > 200, f"Report too short: {len(str(report))} chars"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_empty_query_handling(runner, auto_responder, mock_mode):
|
||||
"""Agent handles empty input gracefully."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": ""})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
output = result.output or {}
|
||||
assert not result.success or output.get("error"), "Should handle empty query"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
|
||||
"""Feedback loop between review and research terminates."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": "quantum computing basics"})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
visits = result.node_visit_counts or {}
|
||||
for node_id, count in visits.items():
|
||||
assert count <= 5, f"Node {node_id} visited {count} times"
|
||||
'''
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: First Execution
|
||||
|
||||
```python
|
||||
run_tests(
|
||||
goal_id="rigorous-interactive-research",
|
||||
agent_path="exports/deep_research_agent",
|
||||
fail_fast=True
|
||||
)
|
||||
```
|
||||
|
||||
**Result:**
|
||||
```json
|
||||
{
|
||||
"overall_passed": false,
|
||||
"summary": {"total": 5, "passed": 3, "failed": 2, "pass_rate": "60.0%"},
|
||||
"failures": [
|
||||
{"test_name": "test_success_source_diversity", "details": "AssertionError: Expected >= 5 sources, got 2"},
|
||||
{"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Analyze (Iteration 1)
|
||||
|
||||
### Debug the first failure
|
||||
|
||||
```python
|
||||
debug_test(
|
||||
goal_id="rigorous-interactive-research",
|
||||
test_name="test_success_source_diversity",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
# Category: ASSERTION_FAILURE — Expected >= 5 sources, got 2
|
||||
```
|
||||
|
||||
### Find the session and inspect memory
|
||||
|
||||
```python
|
||||
list_agent_sessions(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
status="completed",
|
||||
limit=1
|
||||
)
|
||||
# → session_20260209_150000_abc12345
|
||||
|
||||
get_agent_session_memory(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
session_id="session_20260209_150000_abc12345",
|
||||
key="research_results"
|
||||
)
|
||||
# → Only 2 sources found. LLM stopped searching after 2 queries.
|
||||
```
|
||||
|
||||
### Check LLM behavior in the research node
|
||||
|
||||
```python
|
||||
query_runtime_log_raw(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
run_id="session_20260209_150000_abc12345",
|
||||
node_id="research"
|
||||
)
|
||||
# → LLM called web_search twice, got results, immediately called set_output.
|
||||
# → Prompt doesn't instruct it to find at least 5 sources.
|
||||
```
|
||||
|
||||
**Root cause:** The research node's system_prompt doesn't specify minimum source requirements.
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: Fix (Iteration 1)
|
||||
|
||||
```python
|
||||
Read(file_path="exports/deep_research_agent/nodes/__init__.py")
|
||||
|
||||
# Fix the research node prompt
|
||||
Edit(
|
||||
file_path="exports/deep_research_agent/nodes/__init__.py",
|
||||
old_string='system_prompt="Search for information on the user\'s topic using web search."',
|
||||
new_string='system_prompt="Search for information on the user\'s topic using web search. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries with varied keywords. Do NOT call set_output until you have gathered at least 5 distinct sources from different domains."'
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: Recover & Resume (Iteration 1)
|
||||
|
||||
The fix is to the `research` node. Since this was a `run_tests` execution (no checkpoints), we re-run from scratch:
|
||||
|
||||
```python
|
||||
run_tests(
|
||||
goal_id="rigorous-interactive-research",
|
||||
agent_path="exports/deep_research_agent",
|
||||
fail_fast=True
|
||||
)
|
||||
```
|
||||
|
||||
**Result:**
|
||||
```json
|
||||
{
|
||||
"overall_passed": false,
|
||||
"summary": {"total": 5, "passed": 4, "failed": 1, "pass_rate": "80.0%"},
|
||||
"failures": [
|
||||
{"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Source diversity now passes. Citation coverage still fails.
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Analyze (Iteration 2)
|
||||
|
||||
```python
|
||||
debug_test(
|
||||
goal_id="rigorous-interactive-research",
|
||||
test_name="test_success_citation_coverage",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
# Category: ASSERTION_FAILURE — Report lacks citations
|
||||
|
||||
# Check what the report node produced
|
||||
list_agent_sessions(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
status="completed",
|
||||
limit=1
|
||||
)
|
||||
# → session_20260209_151500_def67890
|
||||
|
||||
get_agent_session_memory(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
session_id="session_20260209_151500_def67890",
|
||||
key="report"
|
||||
)
|
||||
# → Report text exists but uses no numbered references.
|
||||
# → Sources are in memory but report node doesn't cite them.
|
||||
```
|
||||
|
||||
**Root cause:** The report node's prompt doesn't instruct the LLM to include numbered citations.
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: Fix (Iteration 2)
|
||||
|
||||
```python
|
||||
Edit(
|
||||
file_path="exports/deep_research_agent/nodes/__init__.py",
|
||||
old_string='system_prompt="Write a comprehensive report based on the research findings."',
|
||||
new_string='system_prompt="Write a comprehensive report based on the research findings. You MUST include numbered citations [1], [2], etc. for every factual claim. At the end, include a References section listing all sources with their URLs. Every claim must be traceable to a specific source."'
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: Resume (Iteration 2)
|
||||
|
||||
The fix is to the `report` node (the last node). To demonstrate checkpoint recovery, run via CLI:
|
||||
|
||||
```bash
|
||||
# Run via CLI to get checkpoints
|
||||
uv run hive run exports/deep_research_agent --input '{"topic": "climate change effects"}'
|
||||
|
||||
# After it runs, find the clean checkpoint before report (the call below is an agent tool call, not a shell command)
|
||||
list_agent_checkpoints(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
session_id="session_20260209_152000_ghi34567",
|
||||
is_clean="true"
|
||||
)
|
||||
# → cp_node_complete_review_152100 (after review, before report)
|
||||
|
||||
# Resume — skips intake, research, review entirely
|
||||
uv run hive run exports/deep_research_agent \
|
||||
--resume-session session_20260209_152000_ghi34567 \
|
||||
--checkpoint cp_node_complete_review_152100
|
||||
```
|
||||
|
||||
Only the `report` node re-runs with the fixed prompt, using research data from the checkpoint.
|
||||
|
||||
---
|
||||
|
||||
## Phase 6: Final Verification
|
||||
|
||||
```python
|
||||
run_tests(
|
||||
goal_id="rigorous-interactive-research",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
```
|
||||
|
||||
**Result:**
|
||||
```json
|
||||
{
|
||||
"overall_passed": true,
|
||||
"summary": {"total": 5, "passed": 5, "failed": 0, "pass_rate": "100.0%"}
|
||||
}
|
||||
```
|
||||
|
||||
All tests pass.
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
| Iteration | Failure | Root Cause | Fix | Recovery |
|
||||
|-----------|---------|------------|-----|----------|
|
||||
| 1 | Source diversity (2 < 5) | Research prompt too vague | Added "at least 5 sources" to prompt | Re-run (no checkpoints) |
|
||||
| 2 | No citations in report | Report prompt lacks citation instructions | Added citation requirements | Checkpoint resume (skipped 3 nodes) |
|
||||
|
||||
**Key takeaways:**
|
||||
- Phase 3 analysis (session memory + L3 logs) identified root causes without guessing
|
||||
- Checkpoint recovery in iteration 2 saved time by skipping 3 expensive nodes
|
||||
- Final `run_tests` confirms all scenarios pass end-to-end
|
||||
@@ -1,526 +0,0 @@
|
||||
---
|
||||
name: hive
|
||||
description: Complete workflow for building, implementing, and testing goal-driven agents. Orchestrates hive-* skills. Use when starting a new agent project, unsure which skill to use, or need end-to-end guidance.
|
||||
license: Apache-2.0
|
||||
metadata:
|
||||
author: hive
|
||||
version: "2.0"
|
||||
type: workflow-orchestrator
|
||||
orchestrates:
|
||||
- hive-concepts
|
||||
- hive-create
|
||||
- hive-patterns
|
||||
- hive-test
|
||||
- hive-credentials
|
||||
- hive-debugger
|
||||
---
|
||||
|
||||
# Agent Development Workflow
|
||||
|
||||
**THIS IS AN EXECUTABLE WORKFLOW. DO NOT explore the codebase or read source files. ROUTE to the correct skill IMMEDIATELY.**
|
||||
|
||||
When this skill is loaded, **ALWAYS use the AskUserQuestion tool** to present options:
|
||||
|
||||
```
|
||||
Use AskUserQuestion with these options:
|
||||
- "Build a new agent" → Then invoke /hive-create
|
||||
- "Test an existing agent" → Then invoke /hive-test
|
||||
- "Learn agent concepts" → Then invoke /hive-concepts
|
||||
- "Optimize agent design" → Then invoke /hive-patterns
|
||||
- "Set up credentials" → Then invoke /hive-credentials
|
||||
- "Debug a failing agent" → Then invoke /hive-debugger
|
||||
- "Other" (please describe what you want to achieve)
|
||||
```
|
||||
|
||||
**DO NOT:** Read source files, explore the codebase, search for code, or do any investigation before routing. The sub-skills handle all of that.
|
||||
|
||||
---
|
||||
|
||||
Complete Standard Operating Procedure (SOP) for building production-ready goal-driven agents.
|
||||
|
||||
## Overview
|
||||
|
||||
This workflow orchestrates specialized skills to take you from initial concept to production-ready agent:
|
||||
|
||||
1. **Understand Concepts** → `/hive-concepts` (optional)
|
||||
2. **Build Structure** → `/hive-create`
|
||||
3. **Optimize Design** → `/hive-patterns` (optional)
|
||||
4. **Setup Credentials** → `/hive-credentials` (if agent uses tools requiring API keys)
|
||||
5. **Test & Validate** → `/hive-test`
|
||||
6. **Debug Issues** → `/hive-debugger` (if agent fails at runtime)
|
||||
|
||||
## When to Use This Workflow
|
||||
|
||||
Use this meta-skill when:
|
||||
- Starting a new agent from scratch
|
||||
- Unclear which skill to use first
|
||||
- Need end-to-end guidance for agent development
|
||||
- Want consistent, repeatable agent builds
|
||||
|
||||
**Skip this workflow** if:
|
||||
- You only need to test an existing agent → use `/hive-test` directly
|
||||
- You know exactly which phase you're in → use specific skill directly
|
||||
|
||||
## Quick Decision Tree
|
||||
|
||||
```
|
||||
"Need to understand agent concepts" → hive-concepts
|
||||
"Build a new agent" → hive-create
|
||||
"Optimize my agent design" → hive-patterns
|
||||
"Need client-facing nodes or feedback loops" → hive-patterns
|
||||
"Set up API keys for my agent" → hive-credentials
|
||||
"Test my agent" → hive-test
|
||||
"My agent is failing/stuck/has errors" → hive-debugger
|
||||
"Not sure what I need" → Read phases below, then decide
|
||||
"Agent has structure but needs implementation" → See agent directory STATUS.md
|
||||
```
|
||||
|
||||
## Phase 0: Understand Concepts (Optional)
|
||||
|
||||
**Skill**: `/hive-concepts`
|
||||
**Input**: Questions about agent architecture
|
||||
|
||||
### When to Use
|
||||
|
||||
- First time building an agent
|
||||
- Need to understand node types, edges, goals
|
||||
- Want to validate tool availability
|
||||
- Learning about event loop architecture and client-facing nodes
|
||||
|
||||
### What This Phase Provides
|
||||
|
||||
- Architecture overview (Python packages, not JSON)
|
||||
- Core concepts (Goal, Node, Edge, Event Loop, Judges)
|
||||
- Tool discovery and validation procedures
|
||||
- Workflow overview
|
||||
|
||||
**Skip this phase** if you already understand agent fundamentals.
|
||||
|
||||
## Phase 1: Build Agent Structure
|
||||
|
||||
**Skill**: `/hive-create`
|
||||
**Input**: User requirements ("Build an agent that...") or a template to start from
|
||||
|
||||
### What This Phase Does
|
||||
|
||||
Creates the complete agent architecture:
|
||||
- Package structure (`exports/agent_name/`)
|
||||
- Goal with success criteria and constraints
|
||||
- Workflow graph (nodes and edges)
|
||||
- Node specifications
|
||||
- CLI interface
|
||||
- Documentation
|
||||
|
||||
### Process
|
||||
|
||||
1. **Create package** - Directory structure with skeleton files
|
||||
2. **Define goal** - Success criteria and constraints written to agent.py
|
||||
3. **Design nodes** - Each node approved and written incrementally
|
||||
4. **Connect edges** - Workflow graph with conditional routing
|
||||
5. **Finalize** - Agent class, exports, and documentation
|
||||
|
||||
### Outputs
|
||||
|
||||
- ✅ `exports/agent_name/` package created
|
||||
- ✅ Goal defined in agent.py
|
||||
- ✅ 3-5 success criteria defined
|
||||
- ✅ 1-5 constraints defined
|
||||
- ✅ 5-10 nodes specified in nodes/__init__.py
|
||||
- ✅ 8-15 edges connecting workflow
|
||||
- ✅ Validated structure (passes `PYTHONPATH=exports uv run python -m agent_name validate`)
|
||||
- ✅ README.md with usage instructions
|
||||
- ✅ CLI commands (info, validate, run, shell)
|
||||
|
||||
### Success Criteria
|
||||
|
||||
You're ready for Phase 2 when:
|
||||
- Agent structure validates without errors
|
||||
- All nodes and edges are defined
|
||||
- CLI commands work (info, validate)
|
||||
- You see: "Agent complete: exports/agent_name/"
|
||||
|
||||
### Common Outputs
|
||||
|
||||
The hive-create skill produces:
|
||||
```
|
||||
exports/agent_name/
|
||||
├── __init__.py (package exports)
|
||||
├── __main__.py (CLI interface)
|
||||
├── agent.py (goal, graph, agent class)
|
||||
├── nodes/__init__.py (node specifications)
|
||||
├── config.py (configuration)
|
||||
├── implementations.py (may be created for Python functions)
|
||||
└── README.md (documentation)
|
||||
```
|
||||
|
||||
### Next Steps
|
||||
|
||||
**If the structure is complete and validated:**
|
||||
→ Check `exports/agent_name/STATUS.md` or `IMPLEMENTATION_GUIDE.md`
|
||||
→ These files explain implementation options
|
||||
→ You may need to add Python functions or MCP tools (not covered by current skills)
|
||||
|
||||
**If you want to optimize the design:**
|
||||
→ Proceed to Phase 1.5 (hive-patterns)
|
||||
|
||||
**If you are ready to test:**
|
||||
→ Proceed to Phase 2
|
||||
|
||||
## Phase 1.5: Optimize Design (Optional)
|
||||
|
||||
**Skill**: `/hive-patterns`
|
||||
**Input**: Completed agent structure
|
||||
|
||||
### When to Use
|
||||
|
||||
- Want to add client-facing blocking or feedback edges
|
||||
- Need judge patterns for output validation
|
||||
- Want fan-out/fan-in (parallel execution)
|
||||
- Need error handling patterns
|
||||
- Want best practices guidance
|
||||
|
||||
### What This Phase Provides
|
||||
|
||||
- Client-facing interaction patterns
|
||||
- Feedback edge routing with nullable output keys
|
||||
- Judge patterns (implicit, SchemaJudge)
|
||||
- Fan-out/fan-in parallel execution
|
||||
- Context management and spillover patterns
|
||||
- Anti-patterns to avoid
|
||||
|
||||
**Skip this phase** if your agent design is straightforward.
|
||||
|
||||
## Phase 2: Test & Validate
|
||||
|
||||
**Skill**: `/hive-test`
|
||||
**Input**: Working agent from Phase 1
|
||||
|
||||
### What This Phase Does
|
||||
|
||||
Guides the creation and execution of a comprehensive test suite:
|
||||
- Constraint tests
|
||||
- Success criteria tests
|
||||
- Edge case tests
|
||||
- Integration tests
|
||||
|
||||
### Process
|
||||
|
||||
1. **Analyze agent** - Read goal, constraints, success criteria
|
||||
2. **Generate tests** - The calling agent writes pytest files in `exports/agent_name/tests/` using hive-test guidelines and templates
|
||||
3. **User approval** - Review and approve each test
|
||||
4. **Run evaluation** - Execute tests and collect results
|
||||
5. **Debug failures** - Identify and fix issues
|
||||
6. **Iterate** - Repeat until all tests pass
|
||||
|
||||
### Outputs
|
||||
|
||||
- ✅ Test files in `exports/agent_name/tests/`
|
||||
- ✅ Test report with pass/fail metrics
|
||||
- ✅ Coverage of all success criteria
|
||||
- ✅ Coverage of all constraints
|
||||
- ✅ Edge case handling verified
|
||||
|
||||
### Success Criteria
|
||||
|
||||
You're done when:
|
||||
- All tests pass
|
||||
- All success criteria validated
|
||||
- All constraints verified
|
||||
- Agent handles edge cases
|
||||
- Test coverage is comprehensive
|
||||
|
||||
### Next Steps
|
||||
|
||||
**Agent ready for:**
|
||||
- Production deployment
|
||||
- Integration into larger systems
|
||||
- Documentation and handoff
|
||||
- Continuous monitoring
|
||||
|
||||
## Phase Transitions
|
||||
|
||||
### From Phase 1 to Phase 2
|
||||
|
||||
**Trigger signals:**
|
||||
- "Agent complete: exports/..."
|
||||
- Structure validation passes
|
||||
- README indicates implementation complete
|
||||
|
||||
**Before proceeding:**
|
||||
- Verify agent can be imported: `from exports.agent_name import default_agent`
|
||||
- Check if implementation is needed (see STATUS.md or IMPLEMENTATION_GUIDE.md)
|
||||
- Confirm agent executes without import errors
|
||||
|
||||
### Skipping Phases
|
||||
|
||||
**When to skip Phase 1:**
|
||||
- Agent structure already exists
|
||||
- Only need to add tests
|
||||
- Modifying existing agent
|
||||
|
||||
**When to skip Phase 2:**
|
||||
- Prototyping or exploring
|
||||
- Agent not production-bound
|
||||
- Manual testing sufficient
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Pattern 1: Complete New Build (Simple)
|
||||
|
||||
```
|
||||
User: "Build an agent that monitors files"
|
||||
→ Use /hive-create
|
||||
→ Agent structure created
|
||||
→ Use /hive-test
|
||||
→ Tests created and passing
|
||||
→ Done: Production-ready agent
|
||||
```
|
||||
|
||||
### Pattern 1b: Complete New Build (With Learning)
|
||||
|
||||
```
|
||||
User: "Build an agent (first time)"
|
||||
→ Use /hive-concepts (understand concepts)
|
||||
→ Use /hive-create (build structure)
|
||||
→ Use /hive-patterns (optimize design)
|
||||
→ Use /hive-test (validate)
|
||||
→ Done: Production-ready agent
|
||||
```
|
||||
|
||||
### Pattern 1c: Build from Template
|
||||
|
||||
```
|
||||
User: "Build an agent based on the deep research template"
|
||||
→ Use /hive-create
|
||||
→ Select "From a template" path
|
||||
→ Pick template, name new agent
|
||||
→ Review/modify goal, nodes, graph
|
||||
→ Agent exported with customizations
|
||||
→ Use /hive-test
|
||||
→ Done: Customized agent
|
||||
```
|
||||
|
||||
### Pattern 2: Test Existing Agent
|
||||
|
||||
```
|
||||
User: "Test my agent at exports/my_agent"
|
||||
→ Skip Phase 1
|
||||
→ Use /hive-test directly
|
||||
→ Tests created
|
||||
→ Done: Validated agent
|
||||
```
|
||||
|
||||
### Pattern 3: Iterative Development
|
||||
|
||||
```
|
||||
User: "Build an agent"
|
||||
→ Use /hive-create (Phase 1)
|
||||
→ Implementation needed (see STATUS.md)
|
||||
→ [User implements functions]
|
||||
→ Use /hive-test (Phase 2)
|
||||
→ Tests reveal bugs
|
||||
→ [Fix bugs manually]
|
||||
→ Re-run tests
|
||||
→ Done: Working agent
|
||||
```
|
||||
|
||||
### Pattern 4: Agent with Review Loops and HITL Checkpoints
|
||||
|
||||
```
|
||||
User: "Build an agent with human review and feedback loops"
|
||||
→ Use /hive-concepts (learn event loop, client-facing nodes)
|
||||
→ Use /hive-create (build structure with feedback edges)
|
||||
→ Use /hive-patterns (implement client-facing + feedback patterns)
|
||||
→ Use /hive-test (validate review flows and edge routing)
|
||||
→ Done: Agent with HITL checkpoints and review loops
|
||||
```
|
||||
|
||||
## Skill Dependencies
|
||||
|
||||
```
|
||||
hive (meta-skill)
|
||||
│
|
||||
├── hive-concepts (foundational)
|
||||
│ ├── Architecture concepts (event loop, judges)
|
||||
│ ├── Node types (event_loop, function)
|
||||
│ ├── Edge routing and priority
|
||||
│ ├── Tool discovery procedures
|
||||
│ └── Workflow overview
|
||||
│
|
||||
├── hive-create (procedural)
|
||||
│ ├── Creates package structure
|
||||
│ ├── Defines goal
|
||||
│ ├── Adds nodes (event_loop, function)
|
||||
│ ├── Connects edges with priority routing
|
||||
│ ├── Finalizes agent class
|
||||
│ └── Requires: hive-concepts
|
||||
│
|
||||
├── hive-patterns (reference)
|
||||
│ ├── Client-facing interaction patterns
|
||||
│ ├── Feedback edges and review loops
|
||||
│ ├── Judge patterns (implicit, SchemaJudge)
|
||||
│ ├── Fan-out/fan-in parallel execution
|
||||
│ └── Context management and anti-patterns
|
||||
│
|
||||
├── hive-credentials (utility)
|
||||
│ ├── Detects missing credentials
|
||||
│ ├── Offers auth method choices (Aden OAuth, direct API key)
|
||||
│ ├── Stores securely in ~/.hive/credentials
|
||||
│ └── Validates with health checks
|
||||
│
|
||||
├── hive-test (validation)
|
||||
│ ├── Reads agent goal
|
||||
│ ├── Generates tests
|
||||
│ ├── Runs evaluation
|
||||
│ └── Reports results
|
||||
│
|
||||
└── hive-debugger (troubleshooting)
|
||||
├── Monitors runtime logs (L1/L2/L3)
|
||||
├── Identifies retry loops, tool failures
|
||||
├── Categorizes issues (10 categories)
|
||||
└── Provides fix recommendations
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "Agent structure won't validate"
|
||||
|
||||
- Check node IDs match between nodes/__init__.py and agent.py
|
||||
- Verify all edges reference valid node IDs
|
||||
- Ensure entry_node exists in nodes list
|
||||
- Run: `PYTHONPATH=exports uv run python -m agent_name validate`
|
||||
|
||||
### "Agent has structure but won't run"
|
||||
|
||||
- Check for STATUS.md or IMPLEMENTATION_GUIDE.md in agent directory
|
||||
- Implementation may be needed (Python functions or MCP tools)
|
||||
- This is expected - hive-create creates structure, not implementation
|
||||
- See implementation guide for completion options
|
||||
|
||||
### "Tests are failing"
|
||||
|
||||
- Review test output for specific failures
|
||||
- Check agent goal and success criteria
|
||||
- Verify constraints are met
|
||||
- Use `/hive-test` to debug and iterate
|
||||
- Fix agent code and re-run tests
|
||||
|
||||
### "Agent is failing at runtime"
|
||||
|
||||
- Use `/hive-debugger` to analyze runtime logs
|
||||
- The debugger identifies retry loops, tool failures, and stalled execution
|
||||
- Get actionable fix recommendations with code changes
|
||||
- Monitor the agent in real-time during TUI sessions
|
||||
|
||||
### "Not sure which phase I'm in"
|
||||
|
||||
Run these checks:
|
||||
|
||||
```bash
|
||||
# Check if agent structure exists
|
||||
ls exports/my_agent/agent.py
|
||||
|
||||
# Check if it validates
|
||||
PYTHONPATH=exports uv run python -m my_agent validate
|
||||
|
||||
# Check if tests exist
|
||||
ls exports/my_agent/tests/
|
||||
|
||||
# If structure exists and validates → Phase 2 (testing)
|
||||
# If structure doesn't exist → Phase 1 (building)
|
||||
# If tests exist but are failing → Debug phase
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### For Phase 1 (Building)
|
||||
|
||||
1. **Start with clear requirements** - Know what the agent should do
|
||||
2. **Define success criteria early** - Measurable goals drive design
|
||||
3. **Keep nodes focused** - One responsibility per node
|
||||
4. **Use descriptive names** - Node IDs should explain purpose
|
||||
5. **Validate incrementally** - Check structure after each major addition
|
||||
|
||||
### For Phase 2 (Testing)
|
||||
|
||||
1. **Test constraints first** - Hard requirements must pass
|
||||
2. **Mock external dependencies** - Use mock mode for LLMs/APIs
|
||||
3. **Cover edge cases** - Test failures, not just success paths
|
||||
4. **Iterate quickly** - Fix one test at a time
|
||||
5. **Document test patterns** - Future tests follow same structure
|
||||
|
||||
### General Workflow
|
||||
|
||||
1. **Use version control** - Git commit after each phase
|
||||
2. **Document decisions** - Update README with changes
|
||||
3. **Keep iterations small** - Build → Test → Fix → Repeat
|
||||
4. **Preserve working states** - Tag successful iterations
|
||||
5. **Learn from failures** - Failed tests reveal design issues
|
||||
|
||||
## Exit Criteria
|
||||
|
||||
You're done with the workflow when:
|
||||
|
||||
✅ Agent structure validates
|
||||
✅ All tests pass
|
||||
✅ Success criteria met
|
||||
✅ Constraints verified
|
||||
✅ Documentation complete
|
||||
✅ Agent ready for deployment
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **hive-concepts**: See `.claude/skills/hive-concepts/SKILL.md`
|
||||
- **hive-create**: See `.claude/skills/hive-create/SKILL.md`
|
||||
- **hive-patterns**: See `.claude/skills/hive-patterns/SKILL.md`
|
||||
- **hive-test**: See `.claude/skills/hive-test/SKILL.md`
|
||||
- **Agent framework docs**: See `core/README.md`
|
||||
- **Example agents**: See `exports/` directory
|
||||
|
||||
## Summary
|
||||
|
||||
This workflow provides a proven path from concept to production-ready agent:
|
||||
|
||||
1. **Learn** with `/hive-concepts` → Understand fundamentals (optional)
|
||||
2. **Build** with `/hive-create` → Get validated structure
|
||||
3. **Optimize** with `/hive-patterns` → Apply best practices (optional)
|
||||
4. **Configure** with `/hive-credentials` → Set up API keys (if needed)
|
||||
5. **Test** with `/hive-test` → Get verified functionality
|
||||
6. **Debug** with `/hive-debugger` → Fix runtime issues (if needed)
|
||||
|
||||
The workflow is **flexible** - skip phases as needed, iterate freely, and adapt to your specific requirements. The goal is **production-ready agents** built with **consistent, repeatable processes**.
|
||||
|
||||
## Skill Selection Guide
|
||||
|
||||
**Choose hive-concepts when:**
|
||||
- First time building agents
|
||||
- Need to understand event loop architecture
|
||||
- Validating tool availability
|
||||
- Learning about node types, edges, and judges
|
||||
|
||||
**Choose hive-create when:**
|
||||
- Actually building an agent
|
||||
- Have clear requirements
|
||||
- Ready to write code
|
||||
- Want step-by-step guidance
|
||||
- Want to start from an existing template and customize it
|
||||
|
||||
**Choose hive-patterns when:**
|
||||
- Agent structure complete
|
||||
- Need client-facing nodes or feedback edges
|
||||
- Implementing review loops or fan-out/fan-in
|
||||
- Want judge patterns or context management
|
||||
- Want best practices
|
||||
|
||||
**Choose hive-test when:**
|
||||
- Agent structure complete
|
||||
- Ready to validate functionality
|
||||
- Need comprehensive test coverage
|
||||
- Testing feedback loops, output keys, or fan-out
|
||||
|
||||
**Choose hive-debugger when:**
|
||||
- Agent is failing or stuck at runtime
|
||||
- Seeing retry loops or escalations
|
||||
- Tool calls are failing
|
||||
- Need to understand why a node isn't completing
|
||||
- Want real-time monitoring of agent execution
|
||||
@@ -1,199 +0,0 @@
|
||||
# Example: File Monitor Agent
|
||||
|
||||
This example shows the complete /hive workflow in action for building a file monitoring agent.
|
||||
|
||||
## Initial Request
|
||||
|
||||
```
|
||||
User: "Build an agent that monitors ~/Downloads and copies new files to ~/Documents"
|
||||
```
|
||||
|
||||
## Phase 1: Building (20 minutes)
|
||||
|
||||
### Step 1: Create Structure
|
||||
|
||||
Agent invokes `/hive-create` skill and:
|
||||
|
||||
1. Creates `exports/file_monitor_agent/` package
|
||||
2. Writes skeleton files (__init__.py, __main__.py, agent.py, etc.)
|
||||
|
||||
**Output**: Package structure visible immediately
|
||||
|
||||
### Step 2: Define Goal
|
||||
|
||||
```python
|
||||
goal = Goal(
|
||||
id="file-monitor-copy",
|
||||
name="Automated File Monitor & Copy",
|
||||
success_criteria=[
|
||||
# 100% detection rate
|
||||
# 100% copy success
|
||||
# 100% conflict resolution
|
||||
# >99% uptime
|
||||
],
|
||||
constraints=[
|
||||
# Preserve originals
|
||||
# Handle errors gracefully
|
||||
# Track state
|
||||
# Respect permissions
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
**Output**: Goal written to agent.py
|
||||
|
||||
### Step 3: Design Nodes
|
||||
|
||||
7 nodes approved and written incrementally:
|
||||
|
||||
1. `initialize-state` - Set up tracking
|
||||
2. `list-downloads` - Scan directory
|
||||
3. `identify-new-files` - Find new files
|
||||
4. `check-for-new-files` - Router
|
||||
5. `copy-files` - Copy with conflict resolution
|
||||
6. `update-state` - Mark as processed
|
||||
7. `wait-interval` - Sleep between cycles
|
||||
|
||||
**Output**: All nodes in nodes/__init__.py
|
||||
|
||||
### Step 4: Connect Edges
|
||||
|
||||
8 edges connecting the workflow loop:
|
||||
|
||||
```
|
||||
initialize → list → identify → check
|
||||
↓ ↓
|
||||
copy wait
|
||||
↓ ↑
|
||||
update ↓
|
||||
↓ ↓
|
||||
wait → list (loop)
|
||||
```
|
||||
|
||||
**Output**: Edges written to agent.py
|
||||
|
||||
### Step 5: Finalize
|
||||
|
||||
```bash
|
||||
$ PYTHONPATH=exports uv run python -m file_monitor_agent validate
|
||||
✓ Agent is valid
|
||||
|
||||
$ PYTHONPATH=exports uv run python -m file_monitor_agent info
|
||||
Agent: File Monitor & Copy Agent
|
||||
Nodes: 7
|
||||
Edges: 8
|
||||
```
|
||||
|
||||
**Phase 1 Complete**: Structure validated ✅
|
||||
|
||||
### Status After Phase 1
|
||||
|
||||
```
|
||||
exports/file_monitor_agent/
|
||||
├── __init__.py ✅ (exports)
|
||||
├── __main__.py ✅ (CLI)
|
||||
├── agent.py ✅ (goal, graph, agent class)
|
||||
├── nodes/__init__.py ✅ (7 nodes)
|
||||
├── config.py ✅ (configuration)
|
||||
├── implementations.py ✅ (Python functions)
|
||||
├── README.md ✅ (documentation)
|
||||
├── IMPLEMENTATION_GUIDE.md ✅ (next steps)
|
||||
└── STATUS.md ✅ (current state)
|
||||
```
|
||||
|
||||
**Note**: Implementation gap exists - data flow needs connection (covered in STATUS.md)
|
||||
|
||||
## Phase 2: Testing (25 minutes)
|
||||
|
||||
### Step 1: Analyze Agent
|
||||
|
||||
Agent invokes `/hive-test` skill and:
|
||||
|
||||
1. Reads goal from `exports/file_monitor_agent/agent.py`
|
||||
2. Identifies 4 success criteria to test
|
||||
3. Identifies 4 constraints to verify
|
||||
4. Plans test coverage
|
||||
|
||||
### Step 2: Generate Tests
|
||||
|
||||
Creates test files:
|
||||
|
||||
```
|
||||
exports/file_monitor_agent/tests/
|
||||
├── conftest.py (fixtures)
|
||||
├── test_constraints.py (4 constraint tests)
|
||||
├── test_success_criteria.py (4 success tests)
|
||||
└── test_edge_cases.py (error handling)
|
||||
```
|
||||
|
||||
Tests approved incrementally by user.
|
||||
|
||||
### Step 3: Run Tests
|
||||
|
||||
```bash
|
||||
$ PYTHONPATH=exports uv run pytest exports/file_monitor_agent/tests/
|
||||
|
||||
test_constraints.py::test_preserves_originals PASSED
|
||||
test_constraints.py::test_handles_errors PASSED
|
||||
test_constraints.py::test_tracks_state PASSED
|
||||
test_constraints.py::test_respects_permissions PASSED
|
||||
|
||||
test_success_criteria.py::test_detects_all_files PASSED
|
||||
test_success_criteria.py::test_copies_all_files PASSED
|
||||
test_success_criteria.py::test_resolves_conflicts PASSED
|
||||
test_success_criteria.py::test_continuous_run PASSED
|
||||
|
||||
test_edge_cases.py::test_empty_directory PASSED
|
||||
test_edge_cases.py::test_permission_denied PASSED
|
||||
test_edge_cases.py::test_disk_full PASSED
|
||||
test_edge_cases.py::test_large_files PASSED
|
||||
|
||||
========================== 12 passed in 3.42s ==========================
|
||||
```
|
||||
|
||||
**Phase 2 Complete**: All tests pass ✅
|
||||
|
||||
## Final Output
|
||||
|
||||
**Production-Ready Agent:**
|
||||
|
||||
```bash
|
||||
# Run the agent
|
||||
./RUN_AGENT.sh
|
||||
|
||||
# Or manually
|
||||
PYTHONPATH=exports uv run python -m file_monitor_agent run
|
||||
```
|
||||
|
||||
**Capabilities:**
|
||||
- Monitors ~/Downloads continuously
|
||||
- Copies new files to ~/Documents
|
||||
- Resolves conflicts with timestamps
|
||||
- Handles errors gracefully
|
||||
- Tracks processed files
|
||||
- Runs as background service
|
||||
|
||||
**Total Time**: ~45 minutes from concept to production
|
||||
|
||||
## Key Learnings
|
||||
|
||||
1. **Incremental building** - Files written immediately, visible throughout
|
||||
2. **Validation early** - Structure validated before moving to implementation
|
||||
3. **Test-driven** - Tests reveal real behavior
|
||||
4. **Documentation included** - README, STATUS, and guides auto-generated
|
||||
5. **Repeatable process** - Same workflow for any agent type
|
||||
|
||||
## Variations
|
||||
|
||||
**For simpler agents:**
|
||||
- Fewer nodes (3-5 instead of 7)
|
||||
- Simpler workflow (linear instead of looping)
|
||||
- Faster build time (10-15 minutes)
|
||||
|
||||
**For complex agents:**
|
||||
- More nodes (10-15+)
|
||||
- Multiple subgraphs
|
||||
- Pause/resume points for human-in-the-loop
|
||||
- Longer build time (45-60 minutes)
|
||||
|
||||
The workflow scales to your needs!
|
||||
@@ -1,7 +0,0 @@
|
||||
# Project-level Codex config for Hive.
|
||||
# Keep this file minimal: MCP connectivity + skill discovery.
|
||||
|
||||
[mcp_servers.agent-builder]
|
||||
command = "uv"
|
||||
args = ["run", "--directory", "core", "-m", "framework.mcp.agent_builder_server"]
|
||||
cwd = "."
|
||||
@@ -1,20 +0,0 @@
|
||||
{
|
||||
"mcpServers": {
|
||||
"agent-builder": {
|
||||
"command": "python",
|
||||
"args": ["-m", "framework.mcp.agent_builder_server"],
|
||||
"cwd": "core",
|
||||
"env": {
|
||||
"PYTHONPATH": "../tools/src"
|
||||
}
|
||||
},
|
||||
"tools": {
|
||||
"command": "python",
|
||||
"args": ["mcp_server.py", "--stdio"],
|
||||
"cwd": "tools",
|
||||
"env": {
|
||||
"PYTHONPATH": "src"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-concepts
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-create
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-credentials
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-patterns
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-test
|
||||
@@ -1,30 +0,0 @@
|
||||
{
|
||||
"mcpServers": {
|
||||
"agent-builder": {
|
||||
"command": "uv",
|
||||
"args": [
|
||||
"run",
|
||||
"python",
|
||||
"-m",
|
||||
"framework.mcp.agent_builder_server"
|
||||
],
|
||||
"cwd": "core",
|
||||
"env": {
|
||||
"PYTHONPATH": "../tools/src"
|
||||
}
|
||||
},
|
||||
"tools": {
|
||||
"command": "uv",
|
||||
"args": [
|
||||
"run",
|
||||
"python",
|
||||
"mcp_server.py",
|
||||
"--stdio"
|
||||
],
|
||||
"cwd": "tools",
|
||||
"env": {
|
||||
"PYTHONPATH": "src"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-concepts
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-create
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-credentials
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-debugger
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-patterns
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/hive-test
|
||||
@@ -1 +0,0 @@
|
||||
../../.claude/skills/triage-issue
|
||||
Vendored
-7
@@ -1,7 +0,0 @@
|
||||
{
|
||||
"recommendations": [
|
||||
"charliermarsh.ruff",
|
||||
"editorconfig.editorconfig",
|
||||
"ms-python.python"
|
||||
]
|
||||
}
|
||||
+1
-1
@@ -145,7 +145,7 @@ uv run python -m framework test-debug <agent_path> <test_name>
|
||||
uv run python -m framework test-list <agent_path>
|
||||
```
|
||||
|
||||
For detailed testing workflows, see the [hive-test skill](../.claude/skills/hive-test/SKILL.md).
|
||||
For detailed testing workflows, see [developer-guide.md](../docs/developer-guide.md).
|
||||
|
||||
### Analyzing Agent Behavior with Builder
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ goal = Goal(
|
||||
id="framework-compliance",
|
||||
description=(
|
||||
"Generated code follows framework patterns: STEP 1/STEP 2 "
|
||||
"for client-facing, correct imports, entry_points format"
|
||||
"for client-facing and correct imports"
|
||||
),
|
||||
metric="pattern_compliance",
|
||||
target="100%",
|
||||
|
||||
@@ -8,7 +8,6 @@ from framework.graph import NodeSpec
|
||||
# No voluntary read_file() calls needed — the LLM gets everything upfront.
|
||||
_ref_dir = Path(__file__).parent.parent / "reference"
|
||||
_framework_guide = (_ref_dir / "framework_guide.md").read_text(encoding="utf-8")
|
||||
_file_templates = (_ref_dir / "file_templates.md").read_text(encoding="utf-8")
|
||||
_anti_patterns = (_ref_dir / "anti_patterns.md").read_text(encoding="utf-8")
|
||||
_gcu_guide_path = _ref_dir / "gcu_guide.md"
|
||||
_gcu_guide = _gcu_guide_path.read_text(encoding="utf-8") if _gcu_guide_path.exists() else ""
|
||||
@@ -27,19 +26,22 @@ def _build_appendices() -> str:
|
||||
parts = (
|
||||
"\n\n# Appendix: Framework Reference\n\n"
|
||||
+ _framework_guide
|
||||
+ "\n\n# Appendix: File Templates\n\n"
|
||||
+ _file_templates
|
||||
+ "\n\n# Appendix: Anti-Patterns\n\n"
|
||||
+ _anti_patterns
|
||||
)
|
||||
if _is_gcu_enabled() and _gcu_guide:
|
||||
parts += "\n\n# Appendix: GCU Browser Automation Guide\n\n" + _gcu_guide
|
||||
return parts
|
||||
|
||||
|
||||
# Shared appendices — appended to every coding node's system prompt.
|
||||
_appendices = _build_appendices()
|
||||
|
||||
# GCU first-class section for building phase (when GCU is enabled).
|
||||
# This is placed prominently in the main prompt body, not as an appendix.
|
||||
_gcu_building_section = (
|
||||
"\n\n# GCU Nodes — Browser Automation\n\n"
|
||||
+ _gcu_guide
|
||||
) if _is_gcu_enabled() and _gcu_guide else ""
|
||||
|
||||
# Tools available to both coder (worker) and queen.
|
||||
_SHARED_TOOLS = [
|
||||
# File I/O
|
||||
@@ -59,16 +61,17 @@ _SHARED_TOOLS = [
|
||||
"list_agent_checkpoints",
|
||||
"get_agent_checkpoint",
|
||||
"run_agent_tests",
|
||||
"initialize_agent_package",
|
||||
]
|
||||
|
||||
# Queen mode-specific tool sets.
|
||||
# Building mode: full coding + agent construction tools.
|
||||
# Queen phase-specific tool sets.
|
||||
# Building phase: full coding + agent construction tools.
|
||||
_QUEEN_BUILDING_TOOLS = _SHARED_TOOLS + [
|
||||
"load_built_agent",
|
||||
"list_credentials",
|
||||
]
|
||||
|
||||
# Staging mode: agent loaded but not yet running — inspect, configure, launch.
|
||||
# Staging phase: agent loaded but not yet running — inspect, configure, launch.
|
||||
_QUEEN_STAGING_TOOLS = [
|
||||
# Read-only (inspect agent files, logs)
|
||||
"read_file",
|
||||
@@ -83,7 +86,7 @@ _QUEEN_STAGING_TOOLS = [
|
||||
"stop_worker_and_edit",
|
||||
]
|
||||
|
||||
# Running mode: worker is executing — monitor and control.
|
||||
# Running phase: worker is executing — monitor and control.
|
||||
_QUEEN_RUNNING_TOOLS = [
|
||||
# Read-only coding (for inspecting logs, files)
|
||||
"read_file",
|
||||
@@ -111,9 +114,16 @@ _QUEEN_RUNNING_TOOLS = [
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_agent_builder_knowledge = """\
|
||||
**A responsible engineer doesn't jump into building. First, \
|
||||
understand the problem and be transparent about what the framework can and cannot do.**
|
||||
|
||||
Use the user's selection (or their custom description if they chose "Other") \
|
||||
as context when shaping the goal below. If the user already described \
|
||||
what they want before this step, skip the question and proceed directly.
|
||||
|
||||
# Core Mandates
|
||||
|
||||
- **DO NOT propose a complete goal on your own.** Instead, \
|
||||
collaborate with the user to define it.
|
||||
- **Read before writing.** NEVER write code from assumptions. Read \
|
||||
reference agents and templates first. Read every file before editing.
|
||||
- **Conventions first.** Follow existing project patterns exactly. \
|
||||
@@ -169,55 +179,61 @@ You are not just a file writer. You have deep integration with the \
|
||||
Hive framework:
|
||||
|
||||
## Tool Discovery (MANDATORY before designing)
|
||||
Before designing any agent, run list_agent_tools() to discover all \
|
||||
available tools. ONLY use tools from this list in your node definitions. \
|
||||
Before designing any agent, run list_agent_tools() with NO arguments \
|
||||
to see ALL available tools (names + descriptions, grouped by category). \
|
||||
ONLY use tools from this list in your node definitions. \
|
||||
NEVER guess or fabricate tool names from memory.
|
||||
|
||||
list_agent_tools() # names + descriptions
|
||||
list_agent_tools(output_schema="full") # include input_schema
|
||||
list_agent_tools(group="gmail") # only gmail_* tools
|
||||
list_agent_tools("exports/{agent_name}/mcp_servers.json") # specific agent
|
||||
list_agent_tools() # ALWAYS call this first
|
||||
list_agent_tools(group="gmail", output_schema="full") # then drill into a category
|
||||
|
||||
## Agent Awareness
|
||||
Run list_agents() to see what agents already exist. Read their code \
|
||||
for patterns:
|
||||
read_file("exports/{name}/agent.py")
|
||||
read_file("exports/{name}/nodes/__init__.py")
|
||||
NEVER skip the first call. Always start with the full list \
|
||||
so you know what categories and tools exist before drilling in.
|
||||
|
||||
## Post-Build Testing
|
||||
After writing agent code, validate structurally AND run tests:
|
||||
run_command("python -c 'from {name} import default_agent; \\
|
||||
run_command("uv run python -c 'from {name} import default_agent; \\
|
||||
print(default_agent.validate())'")
|
||||
run_agent_tests("{name}")
|
||||
|
||||
## Debugging Built Agents
|
||||
When a user says "my agent is failing" or "debug this agent":
|
||||
1. list_agent_sessions("{agent_name}") — find the session
|
||||
2. get_worker_status
|
||||
4. list_agent_checkpoints / get_agent_checkpoint — trace execution
|
||||
2. get_worker_status(focus="issues") — check for problems
|
||||
3. list_agent_checkpoints / get_agent_checkpoint — trace execution
|
||||
|
||||
# Agent Building Workflow
|
||||
|
||||
You operate in a continuous loop. The user describes what they want, \
|
||||
you build it. No rigid phases — use judgment. But the general flow is:
|
||||
|
||||
## 1. Understand & Qualify (3-5 turns)
|
||||
## 1: Fast Discovery (3-6 Turns)
|
||||
|
||||
This is ONE conversation, not two phases. Discovery and qualification \
|
||||
happen together. Surface problems as you find them, not in a batch.
|
||||
**The core principle**: Discovery should feel like progress, not paperwork. The stakeholder should walk away feeling like you understood them faster than anyone else would have.
|
||||
|
||||
**Before your first response**, silently run list_agent_tools() and \
|
||||
consult the **Framework Reference** appendix. Know what's possible \
|
||||
before you speak.
|
||||
**Communication style**: Be concise. Say less. Mean more. Impatient stakeholders don't want a wall of text — they want to know you get it. Every sentence you say should either move the conversation forward or prove you understood something. If it does neither, cut it.
|
||||
|
||||
### How to respond to the user's first message
|
||||
**Ask Question Rules: Respect Their Time.** Every question must earn its place by:
|
||||
1. **Preventing a costly wrong turn** — you're about to build the wrong thing
|
||||
2. **Unlocking a shortcut** — their answer lets you simplify the design
|
||||
3. **Surfacing a dealbreaker** — there's a constraint that changes everything
|
||||
4. **Provide Options** - Provide options with your questions where possible, but always allow the user to type something beyond the options.
|
||||
|
||||
If a question doesn't do one of these, don't ask it. Make an assumption, state it, and move on.
|
||||
|
||||
---
|
||||
|
||||
### 1.1: Let Them Talk, But Listen Like an Architect
|
||||
|
||||
When the stakeholder describes what they want, don't just hear the words — listen for the architecture underneath. While they talk, mentally construct:
|
||||
|
||||
**Listen like an architect.** While they talk, hear the structure:
|
||||
- **The actors**: Who are the people/systems involved?
|
||||
- **The trigger**: What kicks off the workflow?
|
||||
- **The core loop**: What's the main thing that happens repeatedly?
|
||||
- **The output**: What's the valuable thing produced?
|
||||
- **The pain**: What about today is broken, slow, or missing?
|
||||
- **The output**: What's the valuable thing produced at the end?
|
||||
- **The pain**: What about today's situation is broken, slow, or missing?
|
||||
|
||||
You are extracting a **domain model** from natural language in real time. Most stakeholders won't give you this structure explicitly — they'll give you a story. Your job is to hear the structure inside the story.
|
||||
|
||||
| They say... | You're hearing... |
|
||||
|-------------|-------------------|
|
||||
@@ -225,67 +241,127 @@ before you speak.
|
||||
| Verbs they emphasize | Your core operations |
|
||||
| Frustrations they mention | Your design constraints |
|
||||
| Workarounds they describe | What the system must replace |
|
||||
| People they name | Your user types |
|
||||
|
||||
**Use domain knowledge aggressively.** If they say "research agent," \
|
||||
you already know it involves search, summarization, source tracking, \
|
||||
iteration. Don't ask about each — use them as defaults and let their \
|
||||
specifics override. Merge your general knowledge with their specifics: \
|
||||
60-80% right before you ask a single question.
|
||||
---
|
||||
|
||||
### Play back a model WITH qualification baked in
|
||||
### 1.2: Use Domain Knowledge to Fill In the Blanks
|
||||
|
||||
Don't separate "here's what I understood" from "here's what might be \
|
||||
a problem." Weave them together. Your playback should sound like:
|
||||
You have broad knowledge of how systems work. Use it aggressively.
|
||||
|
||||
"Here's how I'm picturing this: [concrete proposed solution]. \
|
||||
The framework handles [X and Y] well for this. [One concern: Z tool \
|
||||
doesn't exist, so we'd use W instead / Z would need real-time which \
|
||||
isn't a fit, but we could do polling]. For MVP I'd focus on \
|
||||
[highest-value thing]. Before I start — [1-2 questions]."
|
||||
If they say "I need a research agent," you already know it probably involves: search, summarization, source tracking, and iteration. Don't ask about each — use them as your starting mental model and let their specifics override your defaults.
|
||||
|
||||
If there's a deal-breaker, lead with it: "Before I go further — \
|
||||
this needs [X] which the framework can't do because [Y]. We could \
|
||||
[workaround] or reconsider the approach. What do you think?"
|
||||
If they say "I need to monitor files and alert me," you know this probably involves: watch patterns, triggers, notifications, and state tracking.
|
||||
|
||||
**Surface problems immediately. Don't save them for a formal review.**
|
||||
**The key move**: Take your general knowledge of the domain and merge it with the specifics they've given you. The result is a draft understanding that's 60-80% right before you've asked a single question. Your questions close the remaining 20-40%.
|
||||
|
||||
### Ask only what you CANNOT infer
|
||||
---
|
||||
|
||||
Every question must earn its place by preventing a costly wrong turn, \
|
||||
unlocking a shortcut, or surfacing a dealbreaker.
|
||||
### 1.3: Play Back a Proposed Model (Not a List of Questions)
|
||||
|
||||
Good questions: "Who's the primary user?", "Is this replacing \
|
||||
something or net new?", "Does this integrate with anything?"
|
||||
After listening, present a **concrete picture** of what you think they need. Make it specific enough that they can spot what's wrong.
|
||||
|
||||
Bad questions (DON'T ask): "What should happen on error?", "Should \
|
||||
it have search?", "What tools should I use?" — these are your job.
|
||||
**Pattern: "Here's what I heard — tell me where I'm off"**
|
||||
|
||||
### Conversation flow
|
||||
> "OK here's how I'm picturing this: [User type] needs to [core action]. Right now they're [current painful workflow]. What you want is [proposed solution that replaces the pain].
|
||||
>
|
||||
> The way I'd structure this: [key entities] connected by [key relationships], with the main flow being [trigger → steps → outcome].
|
||||
>
|
||||
> For the MVP, I'd focus on [the one thing that delivers the most value] and hold off on [things that can wait].
|
||||
>
|
||||
> Before I start — [1-2 specific questions you genuinely can't infer]."
|
||||
|
||||
---
|
||||
|
||||
### 1.4: Ask Only What You Cannot Infer
|
||||
|
||||
Your questions should be **narrow, specific, and consequential**. Never ask what you could answer yourself.
|
||||
|
||||
**Good questions** (high-stakes, can't infer):
|
||||
- "Who's the primary user — you or your end customers?"
|
||||
- "Is this replacing a spreadsheet, or is there literally nothing today?"
|
||||
- "Does this need to integrate with anything, or standalone?"
|
||||
- "Is there existing data to migrate, or starting fresh?"
|
||||
|
||||
**Bad questions** (low-stakes, inferable):
|
||||
- "What should happen if there's an error?" *(handle gracefully, obviously)*
|
||||
- "Should it have search?" *(if there's a list, yes)*
|
||||
- "How should we handle permissions?" *(follow standard patterns)*
|
||||
- "What tools should I use?" *(your call, not theirs)*
|
||||
|
||||
---
|
||||
|
||||
#### Conversation Flow (3-6 Turns)
|
||||
|
||||
| Turn | Who | What |
|
||||
|------|-----|------|
|
||||
| 1 | User | Describes what they need |
|
||||
| 2 | You | Play back model with concerns baked in. 1-2 questions max. |
|
||||
| 2 | Agent | Plays back understanding as a proposed model. Asks 1-2 critical questions max. |
|
||||
| 3 | User | Corrects, confirms, or adds detail |
|
||||
| 4 | You | Adjust model, confirm scope, move to design |
|
||||
| 4 | Agent | Adjusts model, confirms MVP scope, states assumptions, declares starting point |
|
||||
| *(5)* | | *(Only if Turn 3 revealed something that fundamentally changes the approach)* |
|
||||
|
||||
### Anti-patterns
|
||||
**AFTER the conversation, IMMEDIATELY proceed to step 2 (Capability Assessment). DO NOT skip to building.**
|
||||
|
||||
| Don't | Do instead |
|
||||
---
|
||||
|
||||
#### Anti-Patterns
|
||||
|
||||
| Don't | Do Instead |
|
||||
|-------|------------|
|
||||
| Open with a list of questions | Open with what you understood |
|
||||
| Separate "assessment" dump | Weave concerns into your playback |
|
||||
| Good/Bad/Ugly formal section | Mention issues naturally in context |
|
||||
| Ask about every edge case | Smart defaults, flag in summary |
|
||||
| 10+ turn discovery | 3-5 turns, then start building |
|
||||
| Wait for certainty | Start at 80% confidence, iterate |
|
||||
| Ask what tech/tools to use | Decide, disclose, move on |
|
||||
| Open with a list of questions | Open with what you understood from their request |
|
||||
| "What are your requirements?" | "Here's what I think you need — am I right?" |
|
||||
| Ask about every edge case | Handle with smart defaults, flag in summary |
|
||||
| 10+ turn discovery conversation | 3-6 turns. Start building, iterate with real software. |
|
||||
| Being lazy and not understanding what the user wants to achieve | Understand the "what" and the "why" |
|
||||
| Ask for permission to start | State your plan and start |
|
||||
| Wait for certainty | Start at 80% confidence, iterate the rest |
|
||||
| Ask what tech/tools to use | That's your job. Decide, disclose, move on. |
|
||||
|
||||
## 3. Design
|
||||
---
|
||||
|
||||
## 2: Capability Assessment
|
||||
|
||||
**After the user responds, analyze the fit.** Present this assessment honestly:
|
||||
|
||||
> **Framework Fit Assessment**
|
||||
>
|
||||
> Based on what you've described, here's my honest assessment of how well this framework fits your use case:
|
||||
>
|
||||
> **What Works Well (The Good):**
|
||||
> - [List 2-4 things the framework handles well for this use case]
|
||||
> - Examples: multi-turn conversations, human-in-the-loop review, tool orchestration, structured outputs
|
||||
>
|
||||
> **Limitations to Be Aware Of (The Bad):**
|
||||
> - [List 2-3 limitations that apply but are workable]
|
||||
> - Examples: LLM latency means not suitable for sub-second responses, context window limits for very large documents, cost per run for heavy tool usage
|
||||
>
|
||||
> **Potential Deal-Breakers (The Ugly):**
|
||||
> - [List any significant challenges or missing capabilities — be honest]
|
||||
> - Examples: no tool available for X, would require custom MCP server, framework not designed for Y
|
||||
|
||||
**Be specific.** Reference the actual tools discovered in Step 1. If the user needs `send_email` but it's not available, say so. If they need real-time streaming from a database, explain that's not how the framework works.
|
||||
|
||||
## 3: Gap Analysis
|
||||
|
||||
**Identify specific gaps** between what the user wants and what you can deliver:
|
||||
|
||||
| Requirement | Framework Support | Gap/Workaround |
|
||||
|-------------|-------------------|----------------|
|
||||
| [User need] | [✅ Supported / ⚠️ Partial / ❌ Not supported] | [How to handle or why it's a problem] |
|
||||
|
||||
**Examples of gaps to identify:**
|
||||
- Missing tools (user needs X, but only Y and Z are available)
|
||||
- Scope issues (user wants to process 10,000 items, but LLM rate limits apply)
|
||||
- Interaction mismatches (user wants CLI-only, but agent is designed for TUI)
|
||||
- Data flow issues (user needs to persist state across runs, but sessions are isolated)
|
||||
- Latency requirements (user needs instant responses, but LLM calls take seconds)
|
||||
|
||||
## 4: Design Graph and Propose
|
||||
|
||||
Design the agent architecture:
|
||||
- Goal: id, name, description, 3-5 success criteria, 2-4 constraints
|
||||
- Nodes: **2-4 nodes MAXIMUM** (see rules below)
|
||||
- Nodes: **2-5 nodes** (warn if <2 or >5)
|
||||
- Edges: on_success for linear, conditional for routing
|
||||
- Lifecycle: ALWAYS forever-alive (`terminal_nodes=[]`) unless the user \
|
||||
explicitly requests a one-shot/batch agent. Forever-alive agents loop \
|
||||
@@ -303,19 +379,18 @@ tools — merge them into nodes that do real work.
|
||||
- Node has NO tools (pure LLM reasoning) → merge into predecessor/successor
|
||||
- Node sets only 1 trivial output → collapse into predecessor
|
||||
- Multiple consecutive autonomous nodes → combine into one rich node
|
||||
- A "report" or "summary" node → merge into the client-facing node
|
||||
- A "report" or "summary" node → merge into a processing node and return results to queen
|
||||
- A "confirm" or "schedule" node that calls no external service → remove
|
||||
|
||||
**SEPARATE nodes only when:**
|
||||
- Client-facing vs autonomous (different interaction models)
|
||||
- Fundamentally different tool sets
|
||||
- Fan-out parallelism (parallel branches MUST be separate)
|
||||
|
||||
**Typical patterns (queen manages intake — NO client-facing intake node):**
|
||||
- 2 nodes: `process (autonomous) → review (client-facing) → process`
|
||||
- 1 node: `process (autonomous)` — simplest; queen handles all interaction
|
||||
**Typical patterns (queen manages all user interaction):**
|
||||
- 2 nodes: `process (autonomous) → validate (autonomous) → process`
|
||||
- 1 node: `process (autonomous)` — simplest; queen handles intake/review
|
||||
- WRONG: 7 nodes where half have no tools and just do LLM reasoning
|
||||
- WRONG: Intake node that asks the user for requirements — the queen does intake
|
||||
- WRONG: Any worker node with `client_facing=True`
|
||||
|
||||
Read reference agents before designing:
|
||||
list_agents()
|
||||
@@ -328,16 +403,16 @@ use box-drawing characters and clear flow arrows:
|
||||
|
||||
```
|
||||
┌─────────────────────────┐
|
||||
│ process (autonomous) │
|
||||
│ in: user_request │
|
||||
│ tools: web_search, │
|
||||
│ save_data │
|
||||
│ process │
|
||||
│ in: user_request │
|
||||
│ tools: web_search, │
|
||||
│ save_data │
|
||||
└────────────┬────────────┘
|
||||
│ on_success
|
||||
▼
|
||||
┌─────────────────────────┐
|
||||
│ review (client-facing) │
|
||||
│ tools: set_output │
|
||||
│ review │
|
||||
│ tools: set_output │
|
||||
└────────────┬────────────┘
|
||||
│ on_success
|
||||
└──────► back to process
|
||||
@@ -346,186 +421,61 @@ use box-drawing characters and clear flow arrows:
|
||||
The queen owns intake: she gathers user requirements, then calls \
|
||||
`run_agent_with_input(task)` with a structured task description. \
|
||||
When building the agent, design the entry node's `input_keys` to \
|
||||
match what the queen will provide at run time. No client-facing \
|
||||
intake node in the worker.
|
||||
match what the queen will provide at run time. Worker nodes should \
|
||||
use `escalate_to_coder` for blockers.
|
||||
|
||||
Follow the graph with a brief summary of each node's purpose. \
|
||||
Get user approval before implementing.
|
||||
|
||||
## 4. Implement
|
||||
## 5: Get Explicit Acknowledgment
|
||||
|
||||
Consult the **File Templates** and **Anti-Patterns** appendices below.
|
||||
**CALL AskUserQuestion:**
|
||||
"options": [
|
||||
{"label": "Proceed as described"},
|
||||
{"label": "Adjust scope", "description": "Let's modify the requirements to fit better"},
|
||||
{"label": "More questions", "description": "I have questions about the assessment"},
|
||||
{"label": "Reconsider", "description": "Maybe this isn't the right approach"}
|
||||
]
|
||||
|
||||
Write files in order:
|
||||
1. mkdir -p exports/{name}/nodes exports/{name}/tests
|
||||
2. config.py — RuntimeConfig + AgentMetadata
|
||||
3. nodes/__init__.py — NodeSpec definitions with system prompts
|
||||
4. agent.py — Goal, edges, graph, agent class
|
||||
5. __init__.py — package exports
|
||||
6. __main__.py — CLI with click
|
||||
7. mcp_servers.json — tool server config
|
||||
8. tests/ — fixtures
|
||||
**WAIT for user response.**
|
||||
|
||||
### Critical Rules
|
||||
- If **Proceed**: Move on to implementing
|
||||
- If **Adjust scope**: Discuss what to change, update your notes, re-assess if needed
|
||||
- If **More questions**: Answer them honestly, then ask again
|
||||
- If **Reconsider**: Discuss alternatives. If they decide to proceed anyway, that's their informed choice
|
||||
|
||||
**Imports** (must match exactly — only import what you use):
|
||||
```python
|
||||
from framework.graph import (
|
||||
NodeSpec, EdgeSpec, EdgeCondition,
|
||||
Goal, SuccessCriterion, Constraint,
|
||||
)
|
||||
from framework.graph.edge import GraphSpec
|
||||
from framework.graph.executor import ExecutionResult
|
||||
from framework.graph.checkpoint_config import CheckpointConfig
|
||||
from framework.llm import LiteLLMProvider
|
||||
from framework.runner.tool_registry import ToolRegistry
|
||||
from framework.runtime.agent_runtime import (
|
||||
AgentRuntime, create_agent_runtime,
|
||||
)
|
||||
from framework.runtime.execution_stream import EntryPointSpec
|
||||
```
|
||||
For agents with async entry points (timers, webhooks, events), also add:
|
||||
```python
|
||||
from framework.graph.edge import GraphSpec, AsyncEntryPointSpec
|
||||
from framework.runtime.agent_runtime import (
|
||||
AgentRuntime, AgentRuntimeConfig, create_agent_runtime,
|
||||
)
|
||||
```
|
||||
NEVER `from core.framework...` — PYTHONPATH includes core/.
|
||||
|
||||
**__init__.py MUST re-export ALL module-level variables** \
|
||||
(THIS IS THE #1 SOURCE OF AGENT LOAD FAILURES):
|
||||
The runner imports the package (__init__.py), NOT agent.py. It reads \
|
||||
goal, nodes, edges, entry_node, entry_points, pause_nodes, \
|
||||
terminal_nodes, conversation_mode, identity_prompt, loop_config via \
|
||||
getattr(). If ANY are missing from __init__.py, they silently default \
|
||||
to None or {} — causing "must define goal, nodes, edges" or "node X \
|
||||
is unreachable" errors. The __init__.py MUST import and re-export \
|
||||
ALL of these from .agent:
|
||||
```python
|
||||
from .agent import (
|
||||
MyAgent, default_agent, goal, nodes, edges,
|
||||
entry_node, entry_points, pause_nodes, terminal_nodes,
|
||||
conversation_mode, identity_prompt, loop_config,
|
||||
)
|
||||
```
|
||||
## 6. Implement
|
||||
|
||||
**entry_points**: `{"start": "first-node-id"}`
|
||||
The first node should be an autonomous processing node (NOT a \
|
||||
client-facing intake). For agents with multiple entry points, \
|
||||
add them: `{"start": "process", "reminder": "check"}`
|
||||
Call `initialize_agent_package(agent_name)` to generate all package files \
|
||||
from your graph session. The agent_name must be snake_case (e.g., "my_agent").
|
||||
The tool creates: config.py, nodes/__init__.py, agent.py, \
|
||||
__init__.py, __main__.py, mcp_servers.json, tests/conftest.py, \
|
||||
agent.json, README.md.
|
||||
|
||||
**conversation_mode** — ONLY two valid values:
|
||||
- `"continuous"` — recommended for interactive agents (context carries \
|
||||
across node transitions)
|
||||
- Omit entirely — for isolated per-node conversations
|
||||
NEVER use: "client_facing", "interactive", "adaptive", or any other \
|
||||
value. These DO NOT EXIST.
|
||||
After initialization, review and customize if needed:
|
||||
- System prompts in nodes/__init__.py
|
||||
- CLI options in __main__.py
|
||||
- Identity prompt in agent.py
|
||||
- For async entry points (timers/webhooks), add AsyncEntryPointSpec \
|
||||
and AgentRuntimeConfig to agent.py manually
|
||||
|
||||
**loop_config** — ONLY three valid keys:
|
||||
```python
|
||||
loop_config = {
|
||||
"max_iterations": 100,
|
||||
"max_tool_calls_per_turn": 30,
|
||||
"max_history_tokens": 32000,
|
||||
}
|
||||
```
|
||||
NEVER add: "strategy", "mode", "timeout", or other keys.
|
||||
Do NOT manually write these files from scratch — always use the tool.
|
||||
|
||||
**mcp_servers.json**:
|
||||
```json
|
||||
{
|
||||
"hive-tools": {
|
||||
"transport": "stdio",
|
||||
"command": "uv",
|
||||
"args": ["run", "python", "mcp_server.py", "--stdio"],
|
||||
"cwd": "../../tools"
|
||||
}
|
||||
}
|
||||
```
|
||||
NO "mcpServers" wrapper. cwd "../../tools". command "uv".
|
||||
|
||||
**Storage**: `Path.home() / ".hive" / "agents" / "{name}"`
|
||||
|
||||
**Client-facing system prompts** (review/approval nodes only, NOT intake) \
|
||||
— STEP 1/STEP 2 pattern:
|
||||
```
|
||||
STEP 1 — Present to user (text only, NO tool calls):
|
||||
[instructions]
|
||||
|
||||
STEP 2 — After user responds, call set_output:
|
||||
[set_output calls]
|
||||
```
|
||||
The queen manages intake. Workers should NOT have a client-facing node \
|
||||
that asks for requirements. Use client_facing=True only for review or \
|
||||
approval checkpoints mid-execution.
|
||||
|
||||
**Autonomous system prompts** — set_output in SEPARATE turn.
|
||||
|
||||
**Tools** — NEVER fabricate tool names. Common hallucinations: \
|
||||
csv_read, csv_write, csv_append, file_upload, database_query. \
|
||||
If list_agent_tools() shows these don't exist, use alternatives \
|
||||
(e.g. save_data/load_data for data persistence).
|
||||
|
||||
**Node rules**:
|
||||
- **NO intake nodes.** The queen owns intake. She defines the entry \
|
||||
node's input_keys at build time and fills them via \
|
||||
`run_agent_with_input(task)` at run time.
|
||||
- Don't create nodes that have no tools — merge them into a node that does real work.
|
||||
- A node with 0 tools is NOT a real node — merge it.
|
||||
- node_type "event_loop" for all regular graph nodes. Use "gcu" ONLY for
|
||||
browser automation subagents (see GCU appendix). GCU nodes MUST be in a
|
||||
parent node's sub_agents list, NEVER connected via edges, and NEVER used
|
||||
as entry/terminal nodes.
|
||||
- max_node_visits default is 0 (unbounded) — correct for forever-alive. \
|
||||
Only set >0 in one-shot agents with bounded feedback loops.
|
||||
- Feedback inputs that arrive only on some edges: mark them via nullable_output_keys
|
||||
- terminal_nodes=[] for forever-alive (the default)
|
||||
- Every node MUST have at least one outgoing edge (no dead ends)
|
||||
- Agents are forever-alive unless user explicitly asks for one-shot
|
||||
|
||||
**Agent class**: CamelCase name, default_agent at module level. \
|
||||
Constructor takes `config=None`. Follow the exact pattern in \
|
||||
file_templates.md — do NOT invent constructor params like \
|
||||
`llm_provider` or `tool_registry`.
|
||||
|
||||
**Module-level variables** (read by AgentRunner.load()):
|
||||
goal, nodes, edges, entry_node, entry_points, pause_nodes,
|
||||
terminal_nodes, conversation_mode, identity_prompt, loop_config
|
||||
|
||||
For agents with async triggers, also export:
|
||||
async_entry_points, runtime_config
|
||||
|
||||
**Async entry points** (timers, webhooks, events):
|
||||
When an agent needs scheduled tasks, webhook reactions, or event-driven \
|
||||
triggers, use `AsyncEntryPointSpec` (from framework.graph.edge) and \
|
||||
`AgentRuntimeConfig` (from framework.runtime.agent_runtime):
|
||||
- Timer (cron): `trigger_type="timer"`, \
|
||||
`trigger_config={"cron": "0 9 * * *"}` — standard 5-field cron expression \
|
||||
(e.g. `"0 9 * * MON-FRI"` weekdays 9am, `"*/30 * * * *"` every 30 min)
|
||||
- Timer (interval): `trigger_type="timer"`, \
|
||||
`trigger_config={"interval_minutes": 20, "run_immediately": False}`
|
||||
- Event (for webhooks): `trigger_type="event"`, \
|
||||
`trigger_config={"event_types": ["webhook_received"]}`
|
||||
- `isolation_level="shared"` so async runs can read primary session memory
|
||||
- `runtime_config = AgentRuntimeConfig(webhook_routes=[...])` for HTTP webhooks
|
||||
- Reference: `exports/gmail_inbox_guardian/agent.py`
|
||||
- Full docs: see **Framework Reference** appendix (Async Entry Points section)
|
||||
|
||||
## 5. Verify
|
||||
## 7. Verify
|
||||
|
||||
Run FOUR validation steps after writing. All must pass:
|
||||
|
||||
**Step A — Class validation** (checks graph structure):
|
||||
**Step A — Class validation** (checks graph structure and entry_points contract):
|
||||
```
|
||||
run_command("python -c 'from {name} import default_agent; \\
|
||||
run_command("uv run python -c 'from {name} import default_agent; \\
|
||||
print(default_agent.validate())'")
|
||||
```
|
||||
|
||||
**Step B — Runner load test** (checks package export contract — \
|
||||
THIS IS THE SAME PATH THE TUI USES):
|
||||
```
|
||||
run_command("python -c 'from framework.runner.runner import \\
|
||||
run_command("uv run python -c 'from framework.runner.runner import \\
|
||||
AgentRunner; r = AgentRunner.load(\"exports/{name}\"); \\
|
||||
print(\"AgentRunner.load: OK\")'")
|
||||
```
|
||||
@@ -603,59 +553,66 @@ start_agent("{name}") # triggers default entry point
|
||||
# Queen-specific: extra tool docs, behavior, phase 7, style
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_queen_tools_docs = """
|
||||
# -- Queen identity (all phases) --
|
||||
|
||||
## Operating Modes
|
||||
_queen_identity = """\
|
||||
You are the Queen — the user's primary interface. You are a coding agent \
|
||||
with the same capabilities as the Hive Coder worker, PLUS the ability to \
|
||||
manage the worker's lifecycle.
|
||||
"""
|
||||
|
||||
You operate in one of three modes. Your available tools change based on the \
|
||||
mode. The system notifies you when a mode change occurs.
|
||||
# -- Phase-specific tool docs --
|
||||
|
||||
_queen_tools_building = """
|
||||
# Tools (BUILDING phase)
|
||||
|
||||
### BUILDING mode (default)
|
||||
You have full coding tools for building and modifying agents:
|
||||
- File I/O: read_file, write_file, edit_file, list_directory, search_files, \
|
||||
run_command, undo_changes
|
||||
- Meta-agent: list_agent_tools, validate_agent_tools, \
|
||||
list_agents, list_agent_sessions, \
|
||||
list_agent_checkpoints, get_agent_checkpoint, run_agent_tests
|
||||
- load_built_agent(agent_path) — Load the agent and switch to STAGING mode
|
||||
- load_built_agent(agent_path) — Load the agent and switch to STAGING phase
|
||||
- list_credentials(credential_id?) — List authorized credentials
|
||||
|
||||
When you finish building an agent, call load_built_agent(path) to stage it.
|
||||
"""
|
||||
|
||||
_queen_tools_staging = """
|
||||
# Tools (STAGING phase)
|
||||
|
||||
### STAGING mode (agent loaded, not yet running)
|
||||
The agent is loaded and ready to run. You can inspect it and launch it:
|
||||
- Read-only: read_file, list_directory, search_files, run_command
|
||||
- list_credentials(credential_id?) — Verify credentials are configured
|
||||
- get_worker_status() — Check the loaded worker
|
||||
- run_agent_with_input(task) — Start the worker and switch to RUNNING mode
|
||||
- stop_worker_and_edit() — Go back to BUILDING mode
|
||||
- get_worker_status(focus?) — Brief status. Drill in with focus: memory, tools, issues, progress
|
||||
- run_agent_with_input(task) — Start the worker and switch to RUNNING phase
|
||||
- stop_worker_and_edit() — Go back to BUILDING phase
|
||||
|
||||
In STAGING mode you do NOT have write tools. If you need to modify the agent, \
|
||||
call stop_worker_and_edit() to go back to BUILDING mode.
|
||||
You do NOT have write tools. If you need to modify the agent, \
|
||||
call stop_worker_and_edit() to go back to BUILDING phase.
|
||||
"""
|
||||
|
||||
_queen_tools_running = """
|
||||
# Tools (RUNNING phase)
|
||||
|
||||
### RUNNING mode (worker is executing)
|
||||
The worker is running. You have monitoring and lifecycle tools:
|
||||
- Read-only: read_file, list_directory, search_files, run_command
|
||||
- get_worker_status() — Check worker status (idle, running, waiting)
|
||||
- get_worker_status(focus?) — Brief status. Drill in: activity, memory, tools, issues, progress
|
||||
- inject_worker_message(content) — Send a message to the running worker
|
||||
- get_worker_health_summary() — Read the latest health data
|
||||
- notify_operator(ticket_id, analysis, urgency) — Alert the user (use sparingly)
|
||||
- stop_worker() — Stop the worker and return to STAGING mode, then ask the user what to do next
|
||||
- stop_worker_and_edit() — Stop the worker and switch back to BUILDING mode
|
||||
- stop_worker() — Stop the worker and return to STAGING phase, then ask the user what to do next
|
||||
- stop_worker_and_edit() — Stop the worker and switch back to BUILDING phase
|
||||
|
||||
In RUNNING mode you do NOT have write tools or agent construction tools. \
|
||||
You do NOT have write tools or agent construction tools. \
|
||||
If you need to modify the agent, call stop_worker_and_edit() to switch back \
|
||||
to BUILDING mode. To stop the worker and ask the user what to do next, call \
|
||||
stop_worker() to return to STAGING mode.
|
||||
|
||||
### Mode transitions
|
||||
- load_built_agent(path) → switches to STAGING mode
|
||||
- run_agent_with_input(task) → starts worker, switches to RUNNING mode
|
||||
- stop_worker() → stops worker, switches to STAGING mode (ask user: re-run or edit?)
|
||||
- stop_worker_and_edit() → stops worker (if running), switches to BUILDING mode
|
||||
to BUILDING phase. To stop the worker and ask the user what to do next, call \
|
||||
stop_worker() to return to STAGING phase.
|
||||
"""
|
||||
|
||||
_queen_behavior = """
|
||||
# -- Behavior shared across all phases --
|
||||
|
||||
_queen_behavior_always = """
|
||||
# Behavior
|
||||
|
||||
## CRITICAL RULE — ask_user tool
|
||||
@@ -692,7 +649,27 @@ If no worker is loaded, say so.
|
||||
## Direct coding
|
||||
You can do any coding task directly — reading files, writing code, running \
|
||||
commands, building agents, debugging. For quick tasks, do them yourself.
|
||||
"""
|
||||
|
||||
# -- BUILDING phase behavior --
|
||||
|
||||
_queen_behavior_building = """
|
||||
## Worker delegation
|
||||
The worker is a specialized agent (see Worker Profile at the end of this \
|
||||
prompt). It can ONLY do what its goal and tools allow.
|
||||
|
||||
**Decision rule — read the Worker Profile first:**
|
||||
- The user's request directly matches the worker's goal → use \
|
||||
run_agent_with_input(task) (if in staging) or load then run (if in building)
|
||||
- Anything else → do it yourself. Do NOT reframe user requests into \
|
||||
subtasks to justify delegation.
|
||||
- Building, modifying, or configuring agents is ALWAYS your job. Never \
|
||||
delegate agent construction to the worker, even as a "research" subtask.
|
||||
"""
|
||||
|
||||
# -- STAGING phase behavior --
|
||||
|
||||
_queen_behavior_staging = """
|
||||
## Worker delegation
|
||||
The worker is a specialized agent (see Worker Profile at the end of this \
|
||||
prompt). It can ONLY do what its goal and tools allow.
|
||||
@@ -716,7 +693,7 @@ NEVER call run_agent_with_input until the user has provided their input.
|
||||
|
||||
If NO worker is loaded, say so and offer to build one.
|
||||
|
||||
## When in staging mode (agent loaded, not running):
|
||||
## When in staging phase (agent loaded, not running):
|
||||
- Tell the user the agent is loaded and ready.
|
||||
- For tasks matching the worker's goal: ALWAYS ask the user for their \
|
||||
specific input BEFORE calling run_agent_with_input(task). NEVER make up \
|
||||
@@ -742,38 +719,67 @@ explain the problem clearly and help fix it. For credential errors, \
|
||||
guide the user to set up the missing credentials. For structural \
|
||||
issues, offer to fix the agent graph directly.
|
||||
|
||||
## When worker is running — GO SILENT
|
||||
## Showing or describing the loaded worker
|
||||
|
||||
Once you call start_worker(), your job is DONE. Do NOT call ask_user, \
|
||||
do NOT call get_worker_status(), do NOT emit any text. Just stop. \
|
||||
The worker owns the conversation now — it has its own client-facing \
|
||||
nodes that talk to the user directly.
|
||||
When the user asks to "show the graph", "describe the agent", or \
|
||||
"re-generate the graph", read the Worker Profile and present the \
|
||||
worker's current architecture as an ASCII diagram. Use the processing \
|
||||
stages, tools, and edges from the loaded worker. Do NOT enter the \
|
||||
agent building workflow — you are describing what already exists, not \
|
||||
building something new.
|
||||
|
||||
**After start_worker, your ENTIRE response should be ONE short \
|
||||
confirmation sentence with NO tool calls.** Example: \
|
||||
"Started the vulnerability assessment." — that's it. No ask_user, \
|
||||
no get_worker_status, no follow-up questions.
|
||||
## Modifying the loaded worker
|
||||
|
||||
You only wake up again when:
|
||||
- The user explicitly addresses you (not answering a worker question)
|
||||
- A worker question is forwarded to you for relay
|
||||
When the user asks to change, modify, or update the loaded worker \
|
||||
(e.g., "change the report node", "add a node", "delete node X"):
|
||||
|
||||
1. Call stop_worker_and_edit() — this stops the worker and gives you \
|
||||
coding tools (switches to BUILDING phase).
|
||||
2. Use the **Path** from the Worker Profile to locate the agent files.
|
||||
3. Read the relevant files (nodes/__init__.py, agent.py, etc.).
|
||||
4. Make the requested changes using edit_file / write_file.
|
||||
5. Run validation (default_agent.validate(), AgentRunner.load(), \
|
||||
validate_agent_tools()).
|
||||
6. **Reload the modified worker**: call load_built_agent("{path}") \
|
||||
so the changes take effect immediately (switches to STAGING phase). \
|
||||
Then call run_agent_with_input(task) to restart execution.
|
||||
|
||||
Do NOT skip step 6 — without reloading, the user will still be \
|
||||
interacting with the old version.
|
||||
"""
|
||||
|
||||
# -- RUNNING phase behavior --
|
||||
|
||||
_queen_behavior_running = """
|
||||
## When worker is running — queen is the only user interface
|
||||
|
||||
After run_agent_with_input(task), the worker should run autonomously and \
|
||||
talk to YOU (queen) via escalate_to_coder when blocked. The worker should \
|
||||
NOT ask the user directly.
|
||||
|
||||
You wake up when:
|
||||
- The user explicitly addresses you
|
||||
- A worker escalation arrives (`[WORKER_ESCALATION_REQUEST]`)
|
||||
- An escalation ticket arrives from the judge
|
||||
- The worker finishes
|
||||
|
||||
If the user explicitly asks about progress, call get_worker_status() \
|
||||
ONCE and report. Do NOT poll or check proactively.
|
||||
If the user asks for progress, call get_worker_status() ONCE and report. \
|
||||
If the summary mentions issues, follow up with get_worker_status(focus="issues").
|
||||
|
||||
For escalation tickets: low/transient → acknowledge silently. \
|
||||
## Handling worker escalations
|
||||
|
||||
When a worker escalation arrives:
|
||||
1. Read reason/context from the escalation message.
|
||||
2. Call get_worker_status(focus="issues") or get_worker_status(focus="activity") for details.
|
||||
3. Decide the next action:
|
||||
- Quick unblock guidance → inject_worker_message(...)
|
||||
- Requires worker code/graph changes → stop_worker_and_edit()
|
||||
- Requires user decision/business input → ask_user(...), then relay via inject_worker_message(...)
|
||||
4. Keep the user loop on queen. Do not instruct the worker to ask the user directly.
|
||||
|
||||
For judge escalation tickets: low/transient → acknowledge silently. \
|
||||
High/critical → notify the user with a brief analysis.
|
||||
|
||||
## When the worker asks the user a question:
|
||||
- The user's answer is routed to you with context: \
|
||||
[Worker asked: "...", Options: ...] User answered: "...".
|
||||
- If the user is answering the worker's question normally, relay it \
|
||||
using inject_worker_message(answer_text). Then go silent again.
|
||||
- If the user is rejecting the approach, asking to stop, or giving \
|
||||
you an instruction, handle it yourself — do NOT relay.
|
||||
|
||||
## Showing or describing the loaded worker
|
||||
|
||||
When the user asks to "show the graph", "describe the agent", or \
|
||||
@@ -789,26 +795,52 @@ When the user asks to change, modify, or update the loaded worker \
|
||||
(e.g., "change the report node", "add a node", "delete node X"):
|
||||
|
||||
1. Call stop_worker_and_edit() — this stops the worker and gives you \
|
||||
coding tools (switches to BUILDING mode).
|
||||
coding tools (switches to BUILDING phase).
|
||||
2. Use the **Path** from the Worker Profile to locate the agent files.
|
||||
3. Read the relevant files (nodes/__init__.py, agent.py, etc.).
|
||||
4. Make the requested changes using edit_file / write_file.
|
||||
5. Run validation (default_agent.validate(), AgentRunner.load(), \
|
||||
validate_agent_tools()).
|
||||
6. **Reload the modified worker**: call load_built_agent("{path}") \
|
||||
so the changes take effect immediately (switches to STAGING mode). \
|
||||
so the changes take effect immediately (switches to STAGING phase). \
|
||||
Then call run_agent_with_input(task) to restart execution.
|
||||
|
||||
Do NOT skip step 6 — without reloading, the user will still be \
|
||||
interacting with the old version.
|
||||
"""
|
||||
|
||||
# -- Backward-compatible composed versions (used by queen_node.system_prompt default) --
|
||||
|
||||
_queen_tools_docs = (
|
||||
"\n\n## Queen Operating Phases\n\n"
|
||||
"You operate in one of three phases. Your available tools change based on the "
|
||||
"phase. The system notifies you when a phase change occurs.\n\n"
|
||||
"### BUILDING phase (default)\n"
|
||||
+ _queen_tools_building.strip()
|
||||
+ "\n\n### STAGING phase (agent loaded, not yet running)\n"
|
||||
+ _queen_tools_staging.strip()
|
||||
+ "\n\n### RUNNING phase (worker is executing)\n"
|
||||
+ _queen_tools_running.strip()
|
||||
+ "\n\n### Phase transitions\n"
|
||||
"- load_built_agent(path) → switches to STAGING phase\n"
|
||||
"- run_agent_with_input(task) → starts worker, switches to RUNNING phase\n"
|
||||
"- stop_worker() → stops worker, switches to STAGING phase (ask user: re-run or edit?)\n"
|
||||
"- stop_worker_and_edit() → stops worker (if running), switches to BUILDING phase\n"
|
||||
)
|
||||
|
||||
_queen_behavior = (
|
||||
_queen_behavior_always
|
||||
+ _queen_behavior_building
|
||||
+ _queen_behavior_staging
|
||||
+ _queen_behavior_running
|
||||
)
|
||||
|
||||
_queen_phase_7 = """
|
||||
## 7. Load into Session
|
||||
|
||||
After building and verifying, load the agent into the current session:
|
||||
load_built_agent("exports/{name}")
|
||||
This switches to STAGING mode — the user sees the agent's graph and \
|
||||
This switches to STAGING phase — the user sees the agent's graph and \
|
||||
the tab name updates. Then call run_agent_with_input(task) to start it. \
|
||||
Do NOT tell the user to run `python -m {name} run` — load and run it here.
|
||||
"""
|
||||
@@ -942,14 +974,13 @@ queen_node = NodeSpec(
|
||||
),
|
||||
tools=sorted(set(_QUEEN_BUILDING_TOOLS + _QUEEN_STAGING_TOOLS + _QUEEN_RUNNING_TOOLS)),
|
||||
system_prompt=(
|
||||
"You are the Queen — the user's primary interface. You are a coding agent "
|
||||
"with the same capabilities as the Hive Coder worker, PLUS the ability to "
|
||||
"manage the worker's lifecycle.\n"
|
||||
_queen_identity
|
||||
+ _queen_style
|
||||
+ _agent_builder_knowledge
|
||||
+ _gcu_building_section # GCU as first-class citizen (not appendix)
|
||||
+ _queen_tools_docs
|
||||
+ _queen_behavior
|
||||
+ _queen_phase_7
|
||||
+ _queen_style
|
||||
+ _appendices
|
||||
),
|
||||
)
|
||||
@@ -965,4 +996,18 @@ __all__ = [
|
||||
"_QUEEN_BUILDING_TOOLS",
|
||||
"_QUEEN_STAGING_TOOLS",
|
||||
"_QUEEN_RUNNING_TOOLS",
|
||||
# Phase-specific prompt segments (used by session_manager for dynamic prompts)
|
||||
"_queen_identity",
|
||||
"_queen_tools_building",
|
||||
"_queen_tools_staging",
|
||||
"_queen_tools_running",
|
||||
"_queen_behavior_always",
|
||||
"_queen_behavior_building",
|
||||
"_queen_behavior_staging",
|
||||
"_queen_behavior_running",
|
||||
"_queen_phase_7",
|
||||
"_queen_style",
|
||||
"_agent_builder_knowledge",
|
||||
"_appendices",
|
||||
"_gcu_building_section",
|
||||
]
|
||||
|
||||
@@ -1,113 +1,32 @@
|
||||
# Common Mistakes When Building Hive Agents
|
||||
|
||||
## Critical Errors
|
||||
|
||||
1. **Using tools that don't exist** — Always verify tools are available in the hive-tools MCP server before assigning them to nodes. Never guess tool names.
|
||||
|
||||
2. **Wrong entry_points format** — MUST be `{"start": "first-node-id"}`. NOT a set, NOT `{node_id: [keys]}`.
|
||||
|
||||
3. **Wrong mcp_servers.json format** — Flat dict (no `"mcpServers"` wrapper). `cwd` must be `"../../tools"`. `command` must be `"uv"` with args `["run", "python", ...]`.
|
||||
|
||||
4. **Missing STEP 1/STEP 2 in client-facing prompts** — Without explicit phases, the LLM calls set_output before the user responds. Always use the pattern.
|
||||
|
||||
5. **Forgetting nullable_output_keys** — When a node receives inputs from multiple edges and some inputs only arrive on certain edges (e.g., feedback), mark those as nullable. Without this, the executor blocks waiting for a value that will never arrive.
|
||||
|
||||
6. **Creating dead-end nodes in forever-alive graphs** — Every node must have at least one outgoing edge. A node with no outgoing edges ends the execution, breaking the loop.
|
||||
|
||||
7. **Setting max_node_visits to a non-zero value in forever-alive agents** — The framework default is `max_node_visits=0` (unbounded). Setting it to any positive value (e.g., 1) means the node stops executing after that many visits, silently breaking the forever-alive loop. Only set `max_node_visits > 0` in one-shot agents with feedback loops that need bounded retries.
|
||||
|
||||
7. **Missing module-level exports in `__init__.py`** — The runner loads agents via `importlib.import_module(package_name)`, which imports `__init__.py`. It then reads `goal`, `nodes`, `edges`, `entry_node`, `entry_points`, `pause_nodes`, `terminal_nodes`, `conversation_mode`, `identity_prompt`, `loop_config` via `getattr()`. If ANY of these are missing from `__init__.py`, they default to `None` or `{}` — causing "must define goal, nodes, edges" errors or "node X is unreachable" validation failures. **ALL module-level variables from agent.py must be re-exported in `__init__.py`.**
|
||||
1. **Using tools that don't exist** — Always verify tools via `list_agent_tools()` before designing. Common hallucinations: `csv_read`, `csv_write`, `file_upload`, `database_query`, `bulk_fetch_emails`.
|
||||
2. **Wrong mcp_servers.json format** — Flat dict (no `"mcpServers"` wrapper). `cwd` must be `"../../tools"`. `command` must be `"uv"` with args `["run", "python", ...]`.
|
||||
3. **Missing module-level exports in `__init__.py`** — The runner reads `goal`, `nodes`, `edges`, `entry_node`, `entry_points`, `terminal_nodes`, `conversation_mode`, `identity_prompt`, `loop_config` via `getattr()`. ALL module-level variables from agent.py must be re-exported in `__init__.py`.
|
||||
|
||||
## Value Errors
|
||||
|
||||
8. **Invalid `conversation_mode` value** — Only two valid values: `"continuous"` (recommended for interactive agents) or omit entirely (for isolated per-node conversations). Values like `"client_facing"`, `"interactive"`, `"adaptive"` do NOT exist and will cause runtime errors.
|
||||
|
||||
9. **Invalid `loop_config` keys** — Only three valid keys: `max_iterations` (int), `max_tool_calls_per_turn` (int), `max_history_tokens` (int). Keys like `"strategy"`, `"mode"`, `"timeout"` are NOT valid and are silently ignored or cause errors.
|
||||
|
||||
10. **Fabricating tools that don't exist** — Never guess tool names. Always verify via `list_agent_tools()` before designing and `validate_agent_tools()` after building. Common hallucinations: `csv_read`, `csv_write`, `csv_append`, `file_upload`, `database_query`, `bulk_fetch_emails`. If a required tool doesn't exist, redesign the agent to use tools that DO exist (e.g., `save_data`/`load_data` for data persistence).
|
||||
4. **Fabricating tools** — Always verify via `list_agent_tools()` before designing and `validate_agent_tools()` after building.
|
||||
|
||||
## Design Errors
|
||||
|
||||
11. **Too many thin nodes** — Hard limit: **2-4 nodes** for most agents. Each node boundary serializes outputs to shared memory and loses all in-context information (tool results, intermediate reasoning, conversation history). A node with 0 tools that just does LLM reasoning is NOT a real node — merge it into its predecessor or successor.
|
||||
|
||||
**Merge when:**
|
||||
- Node has NO tools — pure LLM reasoning belongs in the node that produces or consumes its data
|
||||
- Node sets only 1 trivial output (e.g., `set_output("done", "true")`) — collapse into predecessor
|
||||
- Multiple consecutive autonomous nodes with same/similar tools — combine into one
|
||||
- A "report" or "summary" node that just presents analysis — merge into the client-facing node
|
||||
- A "schedule" or "confirm" node that doesn't actually schedule anything — remove entirely
|
||||
|
||||
**Keep separate when:**
|
||||
- Client-facing vs autonomous — different interaction models require separate nodes
|
||||
- Fundamentally different tool sets (e.g., web search vs file I/O)
|
||||
- Fan-out parallelism — parallel branches MUST be separate nodes
|
||||
|
||||
**Bad example** (7 nodes — WAY too many):
|
||||
```
|
||||
profile_setup → daily_intake → update_tracker → analyze_progress → generate_plan → schedule_reminders → report
|
||||
```
|
||||
`analyze_progress` has no tools. `schedule_reminders` just sets one boolean. `report` just presents analysis. `update_tracker` and `generate_plan` are sequential autonomous work.
|
||||
|
||||
**Good example** (2 nodes):
|
||||
```
|
||||
process (autonomous: track + analyze + plan) → review (client-facing) → process (loop back)
|
||||
```
|
||||
The queen handles intake (gathering requirements from the user) and passes the task via `run_agent_with_input(task)`. One autonomous node handles ALL backend work (CSV update, analysis, plan generation) with tools and context preserved. One client-facing node handles review/approval when needed.
|
||||
|
||||
12. **Adding framework gating for LLM behavior** — Don't add output rollback, premature rejection, or interaction protocol injection. Fix with better prompts or custom judges.
|
||||
|
||||
13. **Not using continuous conversation mode** — Interactive agents should use `conversation_mode="continuous"`. Without it, each node starts with blank context.
|
||||
|
||||
14. **Adding terminal nodes by default** — ALL agents should use `terminal_nodes=[]` (forever-alive) unless the user explicitly requests a one-shot/batch agent. Forever-alive is the standard pattern. Every node must have at least one outgoing edge. Dead-end nodes break the loop.
|
||||
|
||||
15. **Calling set_output in same turn as tool calls** — Instruct the LLM to call set_output in a SEPARATE turn from real tool calls.
|
||||
5. **Adding framework gating for LLM behavior** — Don't add output rollback or premature rejection. Fix with better prompts or custom judges.
|
||||
6. **Calling set_output in same turn as tool calls** — Call set_output in a SEPARATE turn.
|
||||
|
||||
## File Template Errors
|
||||
|
||||
16. **Wrong import paths** — Use `from framework.graph import ...`, NOT `from core.framework.graph import ...`. The PYTHONPATH includes `core/`.
|
||||
|
||||
17. **Missing storage path** — Agent class must set `self._storage_path = Path.home() / ".hive" / "agents" / "agent_name"`.
|
||||
|
||||
18. **Missing mcp_servers.json** — Without this, the agent has no tools at runtime.
|
||||
|
||||
19. **Bare `python` command in mcp_servers.json** — Use `"command": "uv"` with args `["run", "python", ...]`.
|
||||
7. **Wrong import paths** — Use `from framework.graph import ...`, NOT `from core.framework.graph import ...`.
|
||||
8. **Missing storage path** — Agent class must set `self._storage_path = Path.home() / ".hive" / "agents" / "agent_name"`.
|
||||
9. **Missing mcp_servers.json** — Without this, the agent has no tools at runtime.
|
||||
10. **Bare `python` command** — Use `"command": "uv"` with args `["run", "python", ...]`.
|
||||
|
||||
## Testing Errors
|
||||
11. **Using `runner.run()` on forever-alive agents** — `runner.run()` hangs forever because forever-alive agents have no terminal node. Write structural tests instead: validate graph structure, verify node specs, test `AgentRunner.load()` succeeds (no API key needed).
|
||||
12. **Stale tests after restructuring** — When changing nodes/edges, update tests to match. Tests referencing old node names will fail.
|
||||
13. **Running integration tests without API keys** — Use `pytest.skip()` when credentials are missing.
|
||||
14. **Forgetting sys.path setup in conftest.py** — Tests need `exports/` and `core/` on sys.path.
|
||||
|
||||
20. **Using `runner.run()` on forever-alive agents** — `runner.run()` calls `trigger_and_wait()` which blocks until the graph reaches a terminal node. Forever-alive agents have `terminal_nodes=[]`, so **`runner.run()` hangs forever**. This is the #1 cause of stuck test suites.
|
||||
## GCU Errors
|
||||
15. **Manually wiring browser tools on event_loop nodes** — Use `node_type="gcu"` which auto-includes browser tools. Do NOT manually list browser tool names.
|
||||
16. **Using GCU nodes as regular graph nodes** — GCU nodes are subagents only. They must ONLY appear in `sub_agents=["gcu-node-id"]` and be invoked via `delegate_to_sub_agent()`. Never connect via edges or use as entry/terminal nodes.
|
||||
|
||||
**For forever-alive agents, write structural tests instead:**
|
||||
- Validate graph structure (nodes, edges, entry points)
|
||||
- Verify node specs (tools, prompts, client-facing flag)
|
||||
- Check goal/constraints/success criteria definitions
|
||||
- Test that `AgentRunner.load()` succeeds (structural, no API key needed)
|
||||
|
||||
**What NOT to do:**
|
||||
```python
|
||||
# WRONG — hangs forever on forever-alive agents
|
||||
result = await runner.run({"topic": "quantum computing"})
|
||||
```
|
||||
|
||||
**Correct pattern for structure tests:**
|
||||
```python
|
||||
def test_research_has_web_tools(self):
|
||||
assert "web_search" in research_node.tools
|
||||
|
||||
def test_research_routes_back_to_interact(self):
|
||||
edges_to_interact = [e for e in edges if e.source == "research" and e.target == "interact"]
|
||||
assert edges_to_interact
|
||||
```
|
||||
|
||||
21. **Stale tests after agent restructuring** — When you change an agent's node count or names (e.g., 4 nodes → 2 nodes), the tests MUST be updated too. Tests referencing old node names (e.g., `"review"`, `"report"`) will fail or hang. Always check that test assertions match the current `nodes/__init__.py`.
|
||||
|
||||
22. **Running full integration tests without API keys** — Structural tests (validate, import) work without keys. Full integration tests need `ANTHROPIC_API_KEY`. Use `pytest.skip()` in the runner fixture when `_setup()` fails due to missing credentials.
|
||||
|
||||
23. **Forgetting sys.path setup in conftest.py** — Tests need `exports/` and `core/` on sys.path.
|
||||
|
||||
24. **Not using auto_responder for client-facing nodes** — Tests with client-facing nodes hang without an auto-responder that injects input. But note: even WITH auto_responder, forever-alive agents still hang because the graph never terminates. Auto-responder only helps for agents with terminal nodes.
|
||||
|
||||
25. **Manually wiring browser tools on event_loop nodes** — If the agent needs browser automation, use `node_type="gcu"` which auto-includes all browser tools and prepends best-practices guidance. Do NOT manually list browser tool names on event_loop nodes — they may not exist in the MCP server or may be incomplete. See the GCU Guide appendix.
|
||||
|
||||
26. **Using GCU nodes as regular graph nodes** — GCU nodes (`node_type="gcu"`) are exclusively subagents. They must ONLY appear in a parent node's `sub_agents=["gcu-node-id"]` list and be invoked via `delegate_to_sub_agent()`. They must NEVER be connected via edges, used as entry nodes, or used as terminal nodes. If a GCU node appears as an edge source or target, the graph will fail pre-load validation.
|
||||
|
||||
27. **Adding a client-facing intake node to worker agents** — The queen owns intake. She defines the entry node's `input_keys` at build time and fills them via `run_agent_with_input(task)` at run time. Worker agents should start with an autonomous processing node, NOT a client-facing intake node that asks the user for requirements. Client-facing nodes in workers are for mid-execution review/approval only.
|
||||
## Worker Agent Errors
|
||||
17. **Adding client-facing intake node to workers** — The queen owns intake. Workers should start with an autonomous processing node. Client-facing nodes in workers are for mid-execution review/approval only.
|
||||
|
||||
@@ -84,35 +84,36 @@ Work in phases:
|
||||
tools=["web_search", "web_scrape", "save_data", "load_data", "list_data_files"],
|
||||
)
|
||||
|
||||
# Node 3: Review (client-facing)
|
||||
review_node = NodeSpec(
|
||||
id="review",
|
||||
name="Review",
|
||||
description="Present results for user approval",
|
||||
# Node 2: Handoff (autonomous)
|
||||
handoff_node = NodeSpec(
|
||||
id="handoff",
|
||||
name="Handoff",
|
||||
description="Prepare worker results for queen review",
|
||||
node_type="event_loop",
|
||||
client_facing=True,
|
||||
client_facing=False,
|
||||
max_node_visits=0,
|
||||
input_keys=["results", "user_request"],
|
||||
output_keys=["next_action", "feedback"],
|
||||
nullable_output_keys=["feedback"],
|
||||
success_criteria="User has reviewed and decided next steps.",
|
||||
output_keys=["next_action", "feedback", "worker_summary"],
|
||||
nullable_output_keys=["feedback", "worker_summary"],
|
||||
success_criteria="Results are packaged for queen decision-making.",
|
||||
system_prompt="""\
|
||||
Present the results to the user.
|
||||
Do NOT talk to the user directly. The queen is the only user interface.
|
||||
|
||||
**STEP 1 — Present (text only, NO tool calls):**
|
||||
1. Summary of work done
|
||||
2. Key results
|
||||
3. Ask: satisfied, or want changes?
|
||||
If blocked by tool failures, missing credentials, or unclear constraints, call:
|
||||
- escalate_to_coder(reason, context)
|
||||
Then set:
|
||||
- set_output("next_action", "escalated")
|
||||
- set_output("feedback", "what help is needed")
|
||||
|
||||
**STEP 2 — After user responds, call set_output:**
|
||||
- set_output("next_action", "done") — if satisfied
|
||||
- set_output("next_action", "revise") — if changes needed
|
||||
- set_output("feedback", "what to change") — only if revising
|
||||
Otherwise summarize findings for queen and set:
|
||||
- set_output("worker_summary", "short summary for queen")
|
||||
- set_output("next_action", "done") or set_output("next_action", "revise")
|
||||
- set_output("feedback", "what to revise") only when revising
|
||||
""",
|
||||
tools=[],
|
||||
)
|
||||
|
||||
__all__ = ["process_node", "review_node"]
|
||||
__all__ = ["process_node", "handoff_node"]
|
||||
```
|
||||
|
||||
## agent.py
|
||||
@@ -132,7 +133,7 @@ from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
|
||||
from framework.runtime.execution_stream import EntryPointSpec
|
||||
|
||||
from .config import default_config, metadata
|
||||
from .nodes import process_node, review_node
|
||||
from .nodes import process_node, handoff_node
|
||||
|
||||
# Goal definition
|
||||
goal = Goal(
|
||||
@@ -149,18 +150,22 @@ goal = Goal(
|
||||
)
|
||||
|
||||
# Node list
|
||||
nodes = [process_node, review_node]
|
||||
nodes = [process_node, handoff_node]
|
||||
|
||||
# Edge definitions
|
||||
edges = [
|
||||
EdgeSpec(id="process-to-review", source="process", target="review",
|
||||
EdgeSpec(id="process-to-handoff", source="process", target="handoff",
|
||||
condition=EdgeCondition.ON_SUCCESS, priority=1),
|
||||
# Feedback loop — revise results
|
||||
EdgeSpec(id="review-to-process", source="review", target="process",
|
||||
EdgeSpec(id="handoff-to-process", source="handoff", target="process",
|
||||
condition=EdgeCondition.CONDITIONAL,
|
||||
condition_expr="str(next_action).lower() == 'revise'", priority=2),
|
||||
# Loop back for next task (queen sends new input)
|
||||
EdgeSpec(id="review-done", source="review", target="process",
|
||||
# Escalation loop — queen injects guidance and worker retries
|
||||
EdgeSpec(id="handoff-escalated", source="handoff", target="process",
|
||||
condition=EdgeCondition.CONDITIONAL,
|
||||
condition_expr="str(next_action).lower() == 'escalated'", priority=3),
|
||||
# Loop back for next task after queen decision
|
||||
EdgeSpec(id="handoff-done", source="handoff", target="process",
|
||||
condition=EdgeCondition.CONDITIONAL,
|
||||
condition_expr="str(next_action).lower() == 'done'", priority=1),
|
||||
]
|
||||
@@ -267,16 +272,60 @@ class MyAgent:
|
||||
}
|
||||
|
||||
def validate(self):
|
||||
"""Validate graph wiring and entry-point contract."""
|
||||
errors, warnings = [], []
|
||||
node_ids = {n.id for n in self.nodes}
|
||||
for e in self.edges:
|
||||
if e.source not in node_ids: errors.append(f"Edge {e.id}: source '{e.source}' not found")
|
||||
if e.target not in node_ids: errors.append(f"Edge {e.id}: target '{e.target}' not found")
|
||||
if self.entry_node not in node_ids: errors.append(f"Entry node '{self.entry_node}' not found")
|
||||
if e.source not in node_ids:
|
||||
errors.append(f"Edge {e.id}: source '{e.source}' not found")
|
||||
if e.target not in node_ids:
|
||||
errors.append(f"Edge {e.id}: target '{e.target}' not found")
|
||||
if self.entry_node not in node_ids:
|
||||
errors.append(f"Entry node '{self.entry_node}' not found")
|
||||
for t in self.terminal_nodes:
|
||||
if t not in node_ids: errors.append(f"Terminal node '{t}' not found")
|
||||
for ep_id, nid in self.entry_points.items():
|
||||
if nid not in node_ids: errors.append(f"Entry point '{ep_id}' references unknown node '{nid}'")
|
||||
if t not in node_ids:
|
||||
errors.append(f"Terminal node '{t}' not found")
|
||||
|
||||
if not isinstance(self.entry_points, dict):
|
||||
errors.append(
|
||||
"Invalid entry_points: expected dict[str, str] like "
|
||||
"{'start': '<entry-node-id>'}. "
|
||||
f"Got {type(self.entry_points).__name__}. "
|
||||
"Fix agent.py: set entry_points = {'start': '<entry-node-id>'}."
|
||||
)
|
||||
else:
|
||||
if "start" not in self.entry_points:
|
||||
errors.append(
|
||||
"entry_points must include 'start' mapped to entry_node. "
|
||||
"Example: {'start': '<entry-node-id>'}."
|
||||
)
|
||||
else:
|
||||
start_node = self.entry_points.get("start")
|
||||
if start_node != self.entry_node:
|
||||
errors.append(
|
||||
f"entry_points['start'] points to '{start_node}' "
|
||||
f"but entry_node is '{self.entry_node}'. Keep these aligned."
|
||||
)
|
||||
|
||||
for ep_id, nid in self.entry_points.items():
|
||||
if not isinstance(ep_id, str):
|
||||
errors.append(
|
||||
f"Invalid entry_points key {ep_id!r} "
|
||||
f"({type(ep_id).__name__}). Entry point names must be strings."
|
||||
)
|
||||
continue
|
||||
if not isinstance(nid, str):
|
||||
errors.append(
|
||||
f"Invalid entry_points['{ep_id}']={nid!r} "
|
||||
f"({type(nid).__name__}). Node ids must be strings."
|
||||
)
|
||||
continue
|
||||
if nid not in node_ids:
|
||||
errors.append(
|
||||
f"Entry point '{ep_id}' references unknown node '{nid}'. "
|
||||
f"Known nodes: {sorted(node_ids)}"
|
||||
)
|
||||
|
||||
return {"valid": len(errors) == 0, "errors": errors, "warnings": warnings}
|
||||
|
||||
|
||||
|
||||
@@ -258,173 +258,53 @@ Judge is the SOLE acceptance mechanism — no ad-hoc framework gating.
|
||||
|
||||
## Async Entry Points (Webhooks, Timers, Events)
|
||||
|
||||
For agents that need to react to external events (incoming emails, scheduled
|
||||
tasks, API calls), use `AsyncEntryPointSpec` and optionally `AgentRuntimeConfig`.
|
||||
|
||||
### Imports
|
||||
```python
|
||||
from framework.graph.edge import GraphSpec, AsyncEntryPointSpec
|
||||
from framework.runtime.agent_runtime import AgentRuntime, AgentRuntimeConfig, create_agent_runtime
|
||||
```
|
||||
Note: `AsyncEntryPointSpec` is in `framework.graph.edge` (the graph/declarative layer).
|
||||
`AgentRuntimeConfig` is in `framework.runtime.agent_runtime` (the runtime layer).
|
||||
|
||||
### AsyncEntryPointSpec Fields
|
||||
|
||||
| Field | Type | Default | Description |
|
||||
|-------|------|---------|-------------|
|
||||
| id | str | required | Unique identifier |
|
||||
| name | str | required | Human-readable name |
|
||||
| entry_node | str | required | Node ID to start execution from |
|
||||
| trigger_type | str | `"manual"` | `webhook`, `api`, `timer`, `event`, `manual` |
|
||||
| trigger_config | dict | `{}` | Trigger-specific config (see below) |
|
||||
| isolation_level | str | `"shared"` | `isolated`, `shared`, `synchronized` |
|
||||
| priority | int | `0` | Execution priority (higher = more priority) |
|
||||
| max_concurrent | int | `10` | Max concurrent executions |
|
||||
|
||||
### Trigger Types
|
||||
|
||||
**timer** — Fires on a schedule. Two modes: cron expressions or fixed interval.
|
||||
|
||||
Cron (preferred for precise scheduling):
|
||||
```python
|
||||
AsyncEntryPointSpec(
|
||||
id="daily-digest",
|
||||
name="Daily Digest",
|
||||
entry_node="check-node",
|
||||
trigger_type="timer",
|
||||
trigger_config={"cron": "0 9 * * *"}, # daily at 9am
|
||||
isolation_level="shared",
|
||||
max_concurrent=1,
|
||||
)
|
||||
```
|
||||
- `cron` (str) — standard cron expression (5 fields: min hour dom month dow)
|
||||
- Examples: `"0 9 * * *"` (daily 9am), `"0 9 * * MON-FRI"` (weekdays 9am), `"*/30 * * * *"` (every 30 min)
|
||||
|
||||
Fixed interval (simpler, for polling-style tasks):
|
||||
```python
|
||||
AsyncEntryPointSpec(
|
||||
id="scheduled-check",
|
||||
name="Scheduled Check",
|
||||
entry_node="check-node",
|
||||
trigger_type="timer",
|
||||
trigger_config={"interval_minutes": 20, "run_immediately": False},
|
||||
isolation_level="shared",
|
||||
max_concurrent=1,
|
||||
)
|
||||
```
|
||||
- `interval_minutes` (float) — how often to fire
|
||||
- `run_immediately` (bool, default False) — fire once on startup
|
||||
|
||||
**event** — Subscribes to EventBus (e.g., webhook events):
|
||||
```python
|
||||
AsyncEntryPointSpec(
|
||||
id="email-event",
|
||||
name="Email Event Handler",
|
||||
entry_node="process-emails",
|
||||
trigger_type="event",
|
||||
trigger_config={"event_types": ["webhook_received"]},
|
||||
isolation_level="shared",
|
||||
max_concurrent=10,
|
||||
)
|
||||
```
|
||||
- `event_types` (list[str]) — EventType values to subscribe to
|
||||
- `filter_stream` (str, optional) — only receive from this stream
|
||||
- `filter_node` (str, optional) — only receive from this node
|
||||
|
||||
**webhook** — HTTP endpoint (requires AgentRuntimeConfig):
|
||||
The webhook server publishes `WEBHOOK_RECEIVED` events on the EventBus.
|
||||
An `event` trigger type with `event_types: ["webhook_received"]` subscribes
|
||||
to those events. The flow is:
|
||||
```
|
||||
HTTP POST /webhooks/gmail → WebhookServer → EventBus (WEBHOOK_RECEIVED)
|
||||
→ event entry point → triggers graph execution from entry_node
|
||||
```
|
||||
|
||||
**manual** — Triggered programmatically via `runtime.trigger()`.
|
||||
|
||||
### Isolation Levels
|
||||
|
||||
| Level | Meaning |
|
||||
|-------|---------|
|
||||
| `isolated` | Private state per execution |
|
||||
| `shared` | Eventual consistency — async executions can read primary session memory |
|
||||
| `synchronized` | Shared with write locks (use when ordering matters) |
|
||||
|
||||
For most async patterns, use `shared` — the async execution reads the primary
|
||||
session's memory (e.g., user-configured rules) and runs its own workflow.
|
||||
|
||||
### AgentRuntimeConfig (for webhook servers)
|
||||
For agents that react to external events, use `AsyncEntryPointSpec`:
|
||||
|
||||
```python
|
||||
from framework.graph.edge import AsyncEntryPointSpec
|
||||
from framework.runtime.agent_runtime import AgentRuntimeConfig
|
||||
|
||||
# Timer trigger (cron or interval)
|
||||
async_entry_points = [
|
||||
AsyncEntryPointSpec(
|
||||
id="daily-check",
|
||||
name="Daily Check",
|
||||
entry_node="process",
|
||||
trigger_type="timer",
|
||||
trigger_config={"cron": "0 9 * * *"}, # daily at 9am
|
||||
isolation_level="shared",
|
||||
)
|
||||
]
|
||||
|
||||
# Webhook server (optional)
|
||||
runtime_config = AgentRuntimeConfig(
|
||||
webhook_host="127.0.0.1",
|
||||
webhook_port=8080,
|
||||
webhook_routes=[
|
||||
{
|
||||
"source_id": "gmail",
|
||||
"path": "/webhooks/gmail",
|
||||
"methods": ["POST"],
|
||||
"secret": None, # Optional HMAC-SHA256 secret
|
||||
},
|
||||
],
|
||||
)
|
||||
```
|
||||
`runtime_config` is a module-level variable read by `AgentRunner.load()`.
|
||||
The runner passes it to `create_agent_runtime()`. On `runtime.start()`,
|
||||
if webhook_routes is non-empty, an embedded HTTP server starts.
|
||||
|
||||
### Session Sharing
|
||||
|
||||
Timer and event triggers automatically call `_get_primary_session_state()`
|
||||
before execution. This finds the active user-facing session and provides
|
||||
its memory to the async execution, filtered to only the async entry node's
|
||||
`input_keys`. This means the async flow can read user-configured values
|
||||
(like rules, preferences) without needing separate configuration.
|
||||
|
||||
### Module-Level Variables
|
||||
|
||||
Agents with async entry points must export two additional variables:
|
||||
```python
|
||||
# In agent.py:
|
||||
async_entry_points = [AsyncEntryPointSpec(...), ...]
|
||||
runtime_config = AgentRuntimeConfig(...) # Only if using webhooks
|
||||
```
|
||||
|
||||
Both must be re-exported from `__init__.py`:
|
||||
```python
|
||||
from .agent import (
|
||||
..., async_entry_points, runtime_config,
|
||||
webhook_routes=[{"source_id": "gmail", "path": "/webhooks/gmail", "methods": ["POST"]}],
|
||||
)
|
||||
```
|
||||
|
||||
### Reference Agent
|
||||
### Key Fields
|
||||
- `trigger_type`: `"timer"`, `"event"`, `"webhook"`, `"manual"`
|
||||
- `trigger_config`: `{"cron": "0 9 * * *"}` or `{"interval_minutes": 20}`
|
||||
- `isolation_level`: `"shared"` (recommended), `"isolated"`, `"synchronized"`
|
||||
- `event_types`: Key inside `trigger_config` for event triggers, e.g., `{"event_types": ["webhook_received"]}`
|
||||
|
||||
See `exports/gmail_inbox_guardian/agent.py` for a complete example with:
|
||||
- Primary client-facing node (user configures rules)
|
||||
- Timer-based scheduled inbox checks (every 20 min)
|
||||
- Webhook-triggered email event handling
|
||||
- Shared isolation for memory access across streams
|
||||
### Exports Required
|
||||
Both `async_entry_points` and `runtime_config` must be exported from `__init__.py`.
|
||||
|
||||
## Framework Capabilities
|
||||
|
||||
**Works well:** Multi-turn conversations, HITL review, tool orchestration, structured outputs, parallel execution, context management, error recovery, session persistence.
|
||||
|
||||
**Limitations:** LLM latency (2-10s/turn), context window limits (~128K), cost per run, rate limits, node boundaries lose context.
|
||||
|
||||
**Not designed for:** Sub-second responses, millions of items, real-time streaming, guaranteed determinism, offline/air-gapped.
|
||||
See `exports/gmail_inbox_guardian/agent.py` for a complete example.
|
||||
|
||||
## Tool Discovery
|
||||
|
||||
Do NOT rely on a static tool list — it will be outdated. Always use
|
||||
`list_agent_tools()` to discover available tools, grouped by category.
|
||||
Do NOT rely on a static tool list — it will be outdated. Always call
|
||||
`list_agent_tools()` with NO arguments first to see ALL available tools.
|
||||
Only use `group=` or `output_schema=` as follow-up calls after seeing the
|
||||
full list.
|
||||
|
||||
```
|
||||
list_agent_tools() # names + descriptions, all groups
|
||||
list_agent_tools(output_schema="full") # include input_schema
|
||||
list_agent_tools(group="gmail") # only gmail_* tools
|
||||
list_agent_tools() # ALWAYS call this first
|
||||
list_agent_tools(group="gmail", output_schema="full") # then drill into a category
|
||||
list_agent_tools("exports/my_agent/mcp_servers.json") # specific agent's tools
|
||||
```
|
||||
|
||||
|
||||
@@ -165,6 +165,7 @@ class LoopConfig:
|
||||
max_tool_calls_per_turn: int = 30
|
||||
judge_every_n_turns: int = 1
|
||||
stall_detection_threshold: int = 3
|
||||
stall_similarity_threshold: float = 0.7
|
||||
max_history_tokens: int = 32_000
|
||||
store_prefix: str = ""
|
||||
|
||||
@@ -488,13 +489,16 @@ class EventLoopNode(NodeProtocol):
|
||||
# 2b. Restore spill counter from existing files (resume safety)
|
||||
self._restore_spill_counter()
|
||||
|
||||
# 3. Build tool list: node tools + synthetic set_output + ask_user + delegate tools
|
||||
# 3. Build tool list: node tools + synthetic framework tools + delegate tools
|
||||
tools = list(ctx.available_tools)
|
||||
set_output_tool = self._build_set_output_tool(ctx.node_spec.output_keys)
|
||||
if set_output_tool:
|
||||
tools.append(set_output_tool)
|
||||
if ctx.node_spec.client_facing and not ctx.event_triggered:
|
||||
tools.append(self._build_ask_user_tool())
|
||||
# Workers/subagents can escalate blockers to the queen.
|
||||
if stream_id not in ("queen", "judge"):
|
||||
tools.append(self._build_escalate_to_coder_tool())
|
||||
|
||||
# Add delegate_to_sub_agent tool if:
|
||||
# - Node has sub_agents defined
|
||||
@@ -578,6 +582,7 @@ class EventLoopNode(NodeProtocol):
|
||||
_synthetic_names = {
|
||||
"set_output",
|
||||
"ask_user",
|
||||
"escalate_to_coder",
|
||||
"delegate_to_sub_agent",
|
||||
"report_to_parent",
|
||||
}
|
||||
@@ -586,6 +591,15 @@ class EventLoopNode(NodeProtocol):
|
||||
tools.extend(ctx.dynamic_tools_provider())
|
||||
tools.extend(synthetic)
|
||||
|
||||
# 6b3. Dynamic prompt refresh (phase switching)
|
||||
if ctx.dynamic_prompt_provider is not None:
|
||||
from framework.graph.prompt_composer import _with_datetime
|
||||
|
||||
_new_prompt = _with_datetime(ctx.dynamic_prompt_provider())
|
||||
if _new_prompt != conversation.system_prompt:
|
||||
conversation.update_system_prompt(_new_prompt)
|
||||
logger.info("[%s] Dynamic prompt updated (phase switch)", node_id)
|
||||
|
||||
# 6c. Publish iteration event
|
||||
await self._publish_iteration(stream_id, node_id, iteration, execution_id)
|
||||
|
||||
@@ -613,6 +627,9 @@ class EventLoopNode(NodeProtocol):
|
||||
user_input_requested,
|
||||
ask_user_prompt,
|
||||
ask_user_options,
|
||||
queen_input_requested,
|
||||
request_system_prompt,
|
||||
request_messages,
|
||||
) = await self._run_single_turn(
|
||||
ctx, conversation, tools, iteration, accumulator
|
||||
)
|
||||
@@ -647,6 +664,8 @@ class EventLoopNode(NodeProtocol):
|
||||
stream_id=stream_id,
|
||||
execution_id=execution_id,
|
||||
iteration=iteration,
|
||||
system_prompt=request_system_prompt,
|
||||
messages=request_messages,
|
||||
assistant_text=assistant_text,
|
||||
tool_calls=logged_tool_calls,
|
||||
tool_results=real_tool_results,
|
||||
@@ -808,6 +827,7 @@ class EventLoopNode(NodeProtocol):
|
||||
and not real_tool_results
|
||||
and not outputs_set
|
||||
and not user_input_requested
|
||||
and not queen_input_requested
|
||||
)
|
||||
if truly_empty and accumulator is not None:
|
||||
missing = self._get_missing_output_keys(
|
||||
@@ -959,8 +979,8 @@ class EventLoopNode(NodeProtocol):
|
||||
return NodeResult(
|
||||
success=False,
|
||||
error=(
|
||||
f"Node stalled: {self._config.stall_detection_threshold} "
|
||||
"consecutive identical responses"
|
||||
f"Node stalled: {self._config.stall_detection_threshold} similar "
|
||||
f"responses ({self._config.stall_similarity_threshold*100:.0f}+ threshold)"
|
||||
),
|
||||
output=accumulator.to_dict(),
|
||||
tokens_used=total_input_tokens + total_output_tokens,
|
||||
@@ -977,7 +997,7 @@ class EventLoopNode(NodeProtocol):
|
||||
mcp_tool_calls = [
|
||||
tc
|
||||
for tc in logged_tool_calls
|
||||
if tc.get("tool_name") not in ("set_output", "ask_user")
|
||||
if tc.get("tool_name") not in ("set_output", "ask_user", "escalate_to_coder")
|
||||
]
|
||||
if mcp_tool_calls:
|
||||
fps = self._fingerprint_tool_calls(mcp_tool_calls)
|
||||
@@ -1002,7 +1022,25 @@ class EventLoopNode(NodeProtocol):
|
||||
"same tool calls with identical arguments. "
|
||||
"Try a different approach or different arguments."
|
||||
)
|
||||
if ctx.node_spec.client_facing and not ctx.event_triggered:
|
||||
if (
|
||||
ctx.node_spec.client_facing
|
||||
and not ctx.event_triggered
|
||||
and stream_id not in ("queen", "judge")
|
||||
and self._event_bus is not None
|
||||
):
|
||||
await self._event_bus.emit_escalation_requested(
|
||||
stream_id=stream_id,
|
||||
node_id=node_id,
|
||||
reason="Tool doom loop detected",
|
||||
context=doom_desc,
|
||||
execution_id=execution_id,
|
||||
)
|
||||
await conversation.add_user_message(
|
||||
"[SYSTEM] Escalated tool doom loop to queen for intervention."
|
||||
)
|
||||
recent_tool_fingerprints.clear()
|
||||
recent_responses.clear()
|
||||
elif ctx.node_spec.client_facing and not ctx.event_triggered:
|
||||
await conversation.add_user_message(warning_msg)
|
||||
await self._await_user_input(ctx, prompt=doom_desc)
|
||||
recent_tool_fingerprints.clear()
|
||||
@@ -1223,25 +1261,105 @@ class EventLoopNode(NodeProtocol):
|
||||
if not _outputs_complete:
|
||||
_cf_text_only_streak = 0
|
||||
_continue_count += 1
|
||||
if ctx.runtime_logger:
|
||||
iter_latency_ms = int((time.time() - iter_start) * 1000)
|
||||
ctx.runtime_logger.log_step(
|
||||
node_id=node_id,
|
||||
node_type="event_loop",
|
||||
step_index=iteration,
|
||||
verdict="CONTINUE",
|
||||
verdict_feedback=("Blocked for ask_user input (skip judge)"),
|
||||
tool_calls=logged_tool_calls,
|
||||
llm_text=assistant_text,
|
||||
input_tokens=turn_tokens.get("input", 0),
|
||||
output_tokens=turn_tokens.get("output", 0),
|
||||
latency_ms=iter_latency_ms,
|
||||
)
|
||||
self._log_skip_judge(
|
||||
ctx, node_id, iteration,
|
||||
"Blocked for ask_user input (skip judge)",
|
||||
logged_tool_calls, assistant_text, turn_tokens, iter_start,
|
||||
)
|
||||
continue
|
||||
# All outputs set -- fall through to judge
|
||||
|
||||
# Auto-block beyond grace -- fall through to judge (6i)
|
||||
|
||||
# 6h''. Worker wait for queen guidance
|
||||
# If a worker escalates with wait_for_response=true, pause here and
|
||||
# skip judge evaluation until queen injects guidance.
|
||||
if queen_input_requested:
|
||||
if self._shutdown:
|
||||
await self._publish_loop_completed(
|
||||
stream_id, node_id, iteration + 1, execution_id
|
||||
)
|
||||
latency_ms = int((time.time() - start_time) * 1000)
|
||||
_continue_count += 1
|
||||
self._log_skip_judge(
|
||||
ctx, node_id, iteration,
|
||||
"Shutdown signaled (waiting for queen input)",
|
||||
logged_tool_calls, assistant_text, turn_tokens, iter_start,
|
||||
)
|
||||
if ctx.runtime_logger:
|
||||
ctx.runtime_logger.log_node_complete(
|
||||
node_id=node_id,
|
||||
node_name=ctx.node_spec.name,
|
||||
node_type="event_loop",
|
||||
success=True,
|
||||
total_steps=iteration + 1,
|
||||
tokens_used=total_input_tokens + total_output_tokens,
|
||||
input_tokens=total_input_tokens,
|
||||
output_tokens=total_output_tokens,
|
||||
latency_ms=latency_ms,
|
||||
exit_status="success",
|
||||
accept_count=_accept_count,
|
||||
retry_count=_retry_count,
|
||||
escalate_count=_escalate_count,
|
||||
continue_count=_continue_count,
|
||||
)
|
||||
return NodeResult(
|
||||
success=True,
|
||||
output=accumulator.to_dict(),
|
||||
tokens_used=total_input_tokens + total_output_tokens,
|
||||
latency_ms=latency_ms,
|
||||
conversation=conversation if _is_continuous else None,
|
||||
)
|
||||
|
||||
logger.info("[%s] iter=%d: waiting for queen input...", node_id, iteration)
|
||||
got_input = await self._await_user_input(ctx, prompt="", emit_client_request=False)
|
||||
logger.info("[%s] iter=%d: queen wait unblocked, got_input=%s", node_id, iteration, got_input)
|
||||
if not got_input:
|
||||
await self._publish_loop_completed(
|
||||
stream_id, node_id, iteration + 1, execution_id
|
||||
)
|
||||
latency_ms = int((time.time() - start_time) * 1000)
|
||||
_continue_count += 1
|
||||
self._log_skip_judge(
|
||||
ctx, node_id, iteration,
|
||||
"No queen input received (shutdown during wait)",
|
||||
logged_tool_calls, assistant_text, turn_tokens, iter_start,
|
||||
)
|
||||
if ctx.runtime_logger:
|
||||
ctx.runtime_logger.log_node_complete(
|
||||
node_id=node_id,
|
||||
node_name=ctx.node_spec.name,
|
||||
node_type="event_loop",
|
||||
success=True,
|
||||
total_steps=iteration + 1,
|
||||
tokens_used=total_input_tokens + total_output_tokens,
|
||||
input_tokens=total_input_tokens,
|
||||
output_tokens=total_output_tokens,
|
||||
latency_ms=latency_ms,
|
||||
exit_status="success",
|
||||
accept_count=_accept_count,
|
||||
retry_count=_retry_count,
|
||||
escalate_count=_escalate_count,
|
||||
continue_count=_continue_count,
|
||||
)
|
||||
return NodeResult(
|
||||
success=True,
|
||||
output=accumulator.to_dict(),
|
||||
tokens_used=total_input_tokens + total_output_tokens,
|
||||
latency_ms=latency_ms,
|
||||
conversation=conversation if _is_continuous else None,
|
||||
)
|
||||
|
||||
recent_responses.clear()
|
||||
_cf_text_only_streak = 0
|
||||
_continue_count += 1
|
||||
self._log_skip_judge(
|
||||
ctx, node_id, iteration,
|
||||
"Blocked for queen input (skip judge)",
|
||||
logged_tool_calls, assistant_text, turn_tokens, iter_start,
|
||||
)
|
||||
continue
|
||||
|
||||
# 6i. Judge evaluation
|
||||
should_judge = (
|
||||
ctx.is_subagent_mode # Always evaluate subagents
|
||||
@@ -1253,20 +1371,11 @@ class EventLoopNode(NodeProtocol):
|
||||
if not should_judge:
|
||||
# Gap C: unjudged iteration — log as CONTINUE
|
||||
_continue_count += 1
|
||||
if ctx.runtime_logger:
|
||||
iter_latency_ms = int((time.time() - iter_start) * 1000)
|
||||
ctx.runtime_logger.log_step(
|
||||
node_id=node_id,
|
||||
node_type="event_loop",
|
||||
step_index=iteration,
|
||||
verdict="CONTINUE",
|
||||
verdict_feedback="Unjudged (judge_every_n_turns skip)",
|
||||
tool_calls=logged_tool_calls,
|
||||
llm_text=assistant_text,
|
||||
input_tokens=turn_tokens.get("input", 0),
|
||||
output_tokens=turn_tokens.get("output", 0),
|
||||
latency_ms=iter_latency_ms,
|
||||
)
|
||||
self._log_skip_judge(
|
||||
ctx, node_id, iteration,
|
||||
"Unjudged (judge_every_n_turns skip)",
|
||||
logged_tool_calls, assistant_text, turn_tokens, iter_start,
|
||||
)
|
||||
continue
|
||||
|
||||
# Judge evaluation (should_judge is always True here)
|
||||
@@ -1522,6 +1631,7 @@ class EventLoopNode(NodeProtocol):
|
||||
prompt: str = "",
|
||||
*,
|
||||
options: list[str] | None = None,
|
||||
emit_client_request: bool = True,
|
||||
) -> bool:
|
||||
"""Block until user input arrives or shutdown is signaled.
|
||||
|
||||
@@ -1535,6 +1645,9 @@ class EventLoopNode(NodeProtocol):
|
||||
options: Optional predefined choices for the user (from ask_user).
|
||||
Passed through to the CLIENT_INPUT_REQUESTED event so the
|
||||
frontend can render a QuestionWidget with buttons.
|
||||
emit_client_request: When False, wait silently without publishing
|
||||
CLIENT_INPUT_REQUESTED. Used for worker waits where input is
|
||||
expected from the queen via inject_worker_message().
|
||||
|
||||
Returns True if input arrived, False if shutdown was signaled.
|
||||
"""
|
||||
@@ -1549,7 +1662,7 @@ class EventLoopNode(NodeProtocol):
|
||||
# without injecting, so the wait still blocks until the user types.
|
||||
self._input_ready.clear()
|
||||
|
||||
if self._event_bus:
|
||||
if emit_client_request and self._event_bus:
|
||||
await self._event_bus.emit_client_input_requested(
|
||||
stream_id=ctx.stream_id or ctx.node_id,
|
||||
node_id=ctx.node_id,
|
||||
@@ -1576,18 +1689,35 @@ class EventLoopNode(NodeProtocol):
|
||||
tools: list[Tool],
|
||||
iteration: int,
|
||||
accumulator: OutputAccumulator,
|
||||
) -> tuple[str, list[dict], list[str], dict[str, int], list[dict], bool, str, list[str] | None]:
|
||||
) -> tuple[
|
||||
str,
|
||||
list[dict],
|
||||
list[str],
|
||||
dict[str, int],
|
||||
list[dict],
|
||||
bool,
|
||||
str,
|
||||
list[str] | None,
|
||||
bool,
|
||||
str,
|
||||
list[dict[str, Any]],
|
||||
]:
|
||||
"""Run a single LLM turn with streaming and tool execution.
|
||||
|
||||
Returns (assistant_text, real_tool_results, outputs_set, token_counts, logged_tool_calls,
|
||||
user_input_requested, ask_user_prompt, ask_user_options).
|
||||
user_input_requested, ask_user_prompt, ask_user_options, queen_input_requested,
|
||||
system_prompt, messages).
|
||||
|
||||
``real_tool_results`` contains only results from actual tools (web_search,
|
||||
etc.), NOT from the synthetic ``set_output`` or ``ask_user`` tools.
|
||||
etc.), NOT from synthetic framework tools such as ``set_output``,
|
||||
``ask_user``, or ``escalate_to_coder``.
|
||||
``outputs_set`` lists the output keys written via ``set_output`` during
|
||||
this turn. ``user_input_requested`` is True if the LLM called
|
||||
``ask_user`` during this turn. This separation lets the caller treat
|
||||
synthetic tools as framework concerns rather than tool-execution concerns.
|
||||
``queen_input_requested`` is True when the worker called
|
||||
``escalate_to_coder(wait_for_response=true)`` and should wait for
|
||||
queen guidance before judge evaluation.
|
||||
|
||||
``logged_tool_calls`` accumulates ALL tool calls across inner iterations
|
||||
(real tools, set_output, and discarded calls) for L3 logging. Unlike
|
||||
@@ -1600,11 +1730,14 @@ class EventLoopNode(NodeProtocol):
|
||||
token_counts: dict[str, int] = {"input": 0, "output": 0}
|
||||
tool_call_count = 0
|
||||
final_text = ""
|
||||
final_system_prompt = conversation.system_prompt
|
||||
final_messages: list[dict[str, Any]] = []
|
||||
# Track output keys set via set_output across all inner iterations
|
||||
outputs_set_this_turn: list[str] = []
|
||||
user_input_requested = False
|
||||
ask_user_prompt = ""
|
||||
ask_user_options: list[str] | None = None
|
||||
queen_input_requested = False
|
||||
# Accumulate ALL tool calls across inner iterations for L3 logging.
|
||||
# Unlike real_tool_results (reset each inner iteration), this persists.
|
||||
logged_tool_calls: list[dict] = []
|
||||
@@ -1635,6 +1768,8 @@ class EventLoopNode(NodeProtocol):
|
||||
)
|
||||
await conversation.add_user_message("[Continue working on your current task.]")
|
||||
messages = conversation.to_llm_messages()
|
||||
final_system_prompt = conversation.system_prompt
|
||||
final_messages = messages
|
||||
|
||||
accumulated_text = ""
|
||||
tool_calls: list[ToolCallEvent] = []
|
||||
@@ -1753,6 +1888,9 @@ class EventLoopNode(NodeProtocol):
|
||||
user_input_requested,
|
||||
ask_user_prompt,
|
||||
ask_user_options,
|
||||
queen_input_requested,
|
||||
final_system_prompt,
|
||||
final_messages,
|
||||
)
|
||||
|
||||
# Execute tool calls — framework tools (set_output, ask_user)
|
||||
@@ -1896,6 +2034,56 @@ class EventLoopNode(NodeProtocol):
|
||||
)
|
||||
results_by_id[tc.tool_use_id] = result
|
||||
|
||||
elif tc.tool_name == "escalate_to_coder":
|
||||
# --- Framework-level escalate_to_coder handling ---
|
||||
reason = str(tc.tool_input.get("reason", "")).strip()
|
||||
context = str(tc.tool_input.get("context", "")).strip()
|
||||
wait_for_response = bool(tc.tool_input.get("wait_for_response", True))
|
||||
|
||||
if stream_id in ("queen", "judge"):
|
||||
result = ToolResult(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=(
|
||||
"ERROR: escalate_to_coder is only available to worker "
|
||||
"nodes/sub-agents, not queen/judge streams."
|
||||
),
|
||||
is_error=True,
|
||||
)
|
||||
results_by_id[tc.tool_use_id] = result
|
||||
continue
|
||||
|
||||
if self._event_bus is None:
|
||||
result = ToolResult(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=(
|
||||
"ERROR: EventBus unavailable. Could not emit escalation request."
|
||||
),
|
||||
is_error=True,
|
||||
)
|
||||
results_by_id[tc.tool_use_id] = result
|
||||
continue
|
||||
|
||||
await self._event_bus.emit_escalation_requested(
|
||||
stream_id=stream_id,
|
||||
node_id=node_id,
|
||||
reason=reason,
|
||||
context=context,
|
||||
execution_id=execution_id,
|
||||
)
|
||||
if wait_for_response:
|
||||
queen_input_requested = True
|
||||
|
||||
result = ToolResult(
|
||||
tool_use_id=tc.tool_use_id,
|
||||
content=(
|
||||
"Escalation requested to hive_coder (queen); waiting for guidance."
|
||||
if wait_for_response
|
||||
else "Escalation requested to hive_coder (queen)."
|
||||
),
|
||||
is_error=False,
|
||||
)
|
||||
results_by_id[tc.tool_use_id] = result
|
||||
|
||||
elif tc.tool_name == "delegate_to_sub_agent":
|
||||
# --- Framework-level subagent delegation ---
|
||||
# Queue for parallel execution in Phase 2
|
||||
@@ -2091,6 +2279,7 @@ class EventLoopNode(NodeProtocol):
|
||||
if tc.tool_name not in (
|
||||
"set_output",
|
||||
"ask_user",
|
||||
"escalate_to_coder",
|
||||
"delegate_to_sub_agent",
|
||||
"report_to_parent",
|
||||
):
|
||||
@@ -2181,6 +2370,9 @@ class EventLoopNode(NodeProtocol):
|
||||
user_input_requested,
|
||||
ask_user_prompt,
|
||||
ask_user_options,
|
||||
queen_input_requested,
|
||||
final_system_prompt,
|
||||
final_messages,
|
||||
)
|
||||
|
||||
# --- Mid-turn pruning: prevent context blowup within a single turn ---
|
||||
@@ -2197,9 +2389,9 @@ class EventLoopNode(NodeProtocol):
|
||||
conversation.usage_ratio() * 100,
|
||||
)
|
||||
|
||||
# If ask_user was called, return immediately so the outer loop
|
||||
# can block for user input instead of re-invoking the LLM.
|
||||
if user_input_requested:
|
||||
# If the turn requested external input (ask_user or queen handoff),
|
||||
# return immediately so the outer loop can block before judge eval.
|
||||
if user_input_requested or queen_input_requested:
|
||||
return (
|
||||
final_text,
|
||||
real_tool_results,
|
||||
@@ -2209,12 +2401,15 @@ class EventLoopNode(NodeProtocol):
|
||||
user_input_requested,
|
||||
ask_user_prompt,
|
||||
ask_user_options,
|
||||
queen_input_requested,
|
||||
final_system_prompt,
|
||||
final_messages,
|
||||
)
|
||||
|
||||
# Tool calls processed -- loop back to stream with updated conversation
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Synthetic tools: set_output, ask_user
|
||||
# Synthetic tools: set_output, ask_user, escalate_to_coder
|
||||
# -------------------------------------------------------------------
|
||||
|
||||
def _build_ask_user_tool(self) -> Tool:
|
||||
@@ -2299,6 +2494,42 @@ class EventLoopNode(NodeProtocol):
|
||||
},
|
||||
)
|
||||
|
||||
def _build_escalate_to_coder_tool(self) -> Tool:
    """Return the synthetic ``escalate_to_coder`` tool definition.

    The tool lets a worker hand a blocker over to the queen. Only
    ``reason`` is required; ``wait_for_response`` is advertised with a
    default of ``True``.
    """
    tool_description = (
        "Escalate to the Hive Coder queen when blocked by errors, missing "
        "credentials, or ambiguous constraints that require supervisor "
        "guidance. Include a concise reason and optional context. Set "
        "wait_for_response=true to pause until the queen injects guidance."
    )

    # JSON-schema fragments, one per argument the LLM may supply.
    reason_schema = {
        "type": "string",
        "description": (
            "Short reason for escalation (e.g. 'Tool repeatedly failing')."
        ),
    }
    context_schema = {
        "type": "string",
        "description": "Optional diagnostic details for the queen.",
    }
    wait_schema = {
        "type": "boolean",
        "description": (
            "When true (default), block this node until queen guidance "
            "arrives via injected input."
        ),
        "default": True,
    }

    schema = {
        "type": "object",
        "properties": {
            "reason": reason_schema,
            "context": context_schema,
            "wait_for_response": wait_schema,
        },
        "required": ["reason"],
    }
    return Tool(
        name="escalate_to_coder",
        description=tool_description,
        parameters=schema,
    )
|
||||
|
||||
def _build_delegate_tool(
|
||||
self, sub_agents: list[str], node_registry: dict[str, Any]
|
||||
) -> Tool | None:
|
||||
@@ -2620,13 +2851,46 @@ class EventLoopNode(NodeProtocol):
|
||||
skip = set(nullable_keys) if nullable_keys else set()
|
||||
return [k for k in output_keys if k not in skip and accumulator.get(k) is None]
|
||||
|
||||
@staticmethod
|
||||
def _ngram_similarity(s1: str, s2: str, n: int = 2) -> float:
|
||||
"""Jaccard similarity of n-gram sets.
|
||||
|
||||
Returns 0.0-1.0, where 1.0 is exact match.
|
||||
Fast: O(len(s) + len(s2)) using set operations.
|
||||
"""
|
||||
def _ngrams(s: str) -> set[str]:
|
||||
return {s[i:i+n] for i in range(len(s) - n + 1) if s.strip()}
|
||||
|
||||
if not s1 or not s2:
|
||||
return 0.0
|
||||
|
||||
ngrams1, ngrams2 = _ngrams(s1.lower()), _ngrams(s2.lower())
|
||||
if not ngrams1 or not ngrams2:
|
||||
return 0.0
|
||||
|
||||
intersection = len(ngrams1 & ngrams2)
|
||||
union = len(ngrams1 | ngrams2)
|
||||
return intersection / union if union else 0.0
|
||||
|
||||
def _is_stalled(self, recent_responses: list[str]) -> bool:
|
||||
"""Detect stall: N consecutive identical non-empty responses."""
|
||||
"""Detect stall using n-gram similarity.
|
||||
|
||||
Detects when N consecutive responses have similarity >= threshold.
|
||||
This catches phrases like "I'm still stuck" vs "I'm stuck".
|
||||
"""
|
||||
if len(recent_responses) < self._config.stall_detection_threshold:
|
||||
return False
|
||||
if not recent_responses[0]:
|
||||
return False
|
||||
return all(r == recent_responses[0] for r in recent_responses)
|
||||
|
||||
threshold = self._config.stall_similarity_threshold
|
||||
# Check similarity against all recent responses (excluding self)
|
||||
for i, resp in enumerate(recent_responses):
|
||||
# Compare against all previous responses
|
||||
for prev in recent_responses[:i]:
|
||||
if self._ngram_similarity(resp, prev) >= threshold:
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _is_transient_error(exc: BaseException) -> bool:
|
||||
@@ -2705,7 +2969,10 @@ class EventLoopNode(NodeProtocol):
|
||||
self,
|
||||
recent_tool_fingerprints: list[list[tuple[str, str]]],
|
||||
) -> tuple[bool, str]:
|
||||
"""Detect doom loop: N consecutive turns with identical tool calls.
|
||||
"""Detect doom loop using n-gram similarity on tool inputs.
|
||||
|
||||
Detects when N consecutive turns have similar tool calls.
|
||||
Similarity applies to the canonicalized tool input strings.
|
||||
|
||||
Returns (is_doom_loop, description).
|
||||
"""
|
||||
@@ -2714,15 +2981,24 @@ class EventLoopNode(NodeProtocol):
|
||||
threshold = self._config.tool_doom_loop_threshold
|
||||
if len(recent_tool_fingerprints) < threshold:
|
||||
return False, ""
|
||||
# All entries must be non-empty and identical
|
||||
first = recent_tool_fingerprints[0]
|
||||
if not first:
|
||||
return False, ""
|
||||
if all(fp == first for fp in recent_tool_fingerprints):
|
||||
tool_names = [name for name, _ in first]
|
||||
|
||||
# Check similarity against all recent fingerprints
|
||||
similarity_threshold = self._config.stall_similarity_threshold
|
||||
similar_count = sum(
|
||||
1
|
||||
for fp in recent_tool_fingerprints
|
||||
# Compare canonicalized tool input strings using n-gram similarity
|
||||
if self._ngram_similarity(fp[1], first[1]) >= similarity_threshold
|
||||
)
|
||||
|
||||
if similar_count >= threshold:
|
||||
tool_names = [name for name, _ in recent_tool_fingerprints]
|
||||
desc = (
|
||||
f"Doom loop detected: {threshold} consecutive identical "
|
||||
f"tool calls ({', '.join(tool_names)})"
|
||||
f"Doom loop detected: {similar_count}/{len(recent_tool_fingerprints)} "
|
||||
f"consecutive similar tool calls ({', '.join(tool_names)})"
|
||||
)
|
||||
return True, desc
|
||||
return False, ""
|
||||
@@ -3611,6 +3887,32 @@ class EventLoopNode(NodeProtocol):
|
||||
iteration=iteration,
|
||||
)
|
||||
|
||||
def _log_skip_judge(
|
||||
self,
|
||||
ctx: NodeContext,
|
||||
node_id: str,
|
||||
iteration: int,
|
||||
feedback: str,
|
||||
tool_calls: list[dict],
|
||||
llm_text: str,
|
||||
turn_tokens: dict[str, int],
|
||||
iter_start: float,
|
||||
) -> None:
|
||||
"""Log a CONTINUE step that skips judge evaluation (e.g., waiting for input)."""
|
||||
if ctx.runtime_logger:
|
||||
ctx.runtime_logger.log_step(
|
||||
node_id=node_id,
|
||||
node_type="event_loop",
|
||||
step_index=iteration,
|
||||
verdict="CONTINUE",
|
||||
verdict_feedback=feedback,
|
||||
tool_calls=tool_calls,
|
||||
llm_text=llm_text,
|
||||
input_tokens=turn_tokens.get("input", 0),
|
||||
output_tokens=turn_tokens.get("output", 0),
|
||||
latency_ms=int((time.time() - iter_start) * 1000),
|
||||
)
|
||||
|
||||
async def _publish_loop_completed(
|
||||
self, stream_id: str, node_id: str, iterations: int, execution_id: str = ""
|
||||
) -> None:
|
||||
@@ -3627,7 +3929,7 @@ class EventLoopNode(NodeProtocol):
|
||||
await self._event_bus.emit_node_stalled(
|
||||
stream_id=stream_id,
|
||||
node_id=node_id,
|
||||
reason="Consecutive identical responses detected",
|
||||
reason="Consecutive similar responses detected",
|
||||
execution_id=execution_id,
|
||||
)
|
||||
|
||||
|
||||
@@ -139,6 +139,7 @@ class GraphExecutor:
|
||||
accounts_data: list[dict] | None = None,
|
||||
tool_provider_map: dict[str, str] | None = None,
|
||||
dynamic_tools_provider: Callable | None = None,
|
||||
dynamic_prompt_provider: Callable | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize the executor.
|
||||
@@ -163,6 +164,8 @@ class GraphExecutor:
|
||||
tool_provider_map: Tool name to provider name mapping for account routing
|
||||
dynamic_tools_provider: Optional callback returning current
|
||||
tool list (for mode switching)
|
||||
dynamic_prompt_provider: Optional callback returning current
|
||||
system prompt (for phase switching)
|
||||
"""
|
||||
self.runtime = runtime
|
||||
self.llm = llm
|
||||
@@ -182,6 +185,7 @@ class GraphExecutor:
|
||||
self.accounts_data = accounts_data
|
||||
self.tool_provider_map = tool_provider_map
|
||||
self.dynamic_tools_provider = dynamic_tools_provider
|
||||
self.dynamic_prompt_provider = dynamic_prompt_provider
|
||||
|
||||
# Initialize output cleaner — uses its own dedicated fast model (CEREBRAS_API_KEY),
|
||||
# never the main agent LLM. Passing the main LLM here would cause expensive
|
||||
@@ -1798,6 +1802,7 @@ class GraphExecutor:
|
||||
all_tools=list(self.tools), # Full catalog for subagent tool resolution
|
||||
shared_node_registry=self.node_registry, # For subagent escalation routing
|
||||
dynamic_tools_provider=self.dynamic_tools_provider,
|
||||
dynamic_prompt_provider=self.dynamic_prompt_provider,
|
||||
)
|
||||
|
||||
VALID_NODE_TYPES = {
|
||||
|
||||
@@ -549,6 +549,12 @@ class NodeContext:
|
||||
# the queen to switch between building-mode and running-mode tools.
|
||||
dynamic_tools_provider: Any = None # Callable[[], list[Tool]] | None
|
||||
|
||||
# Dynamic prompt provider — when set, EventLoopNode checks each
|
||||
# iteration and updates the system prompt if it changed. Used by
|
||||
# the queen to switch between phase-specific prompts (building /
|
||||
# staging / running) without restarting the conversation.
|
||||
dynamic_prompt_provider: Any = None # Callable[[], str] | None
|
||||
|
||||
|
||||
@dataclass
|
||||
class NodeResult:
|
||||
|
||||
@@ -611,10 +611,9 @@ def add_node(
|
||||
] = "{}",
|
||||
client_facing: Annotated[
|
||||
bool,
|
||||
"If True, an ask_user() tool is injected so the LLM can explicitly request user input. "
|
||||
"The node blocks ONLY when ask_user() is called — text-only turns stream freely. "
|
||||
"Set True for nodes that interact with users (intake, review, approval). "
|
||||
"Nodes that do autonomous work (research, data processing, API calls) MUST be False.",
|
||||
"Workers should be autonomous: set False. client_facing=True routes user-facing "
|
||||
"conversation to the node; this architecture requires worker->queen handoff "
|
||||
"instead via escalate_to_coder when blocked.",
|
||||
] = False,
|
||||
nullable_output_keys: Annotated[
|
||||
str, "JSON array of output keys that may remain unset (for mutually exclusive outputs)"
|
||||
@@ -704,12 +703,11 @@ def add_node(
|
||||
f"after the browser instructions."
|
||||
)
|
||||
|
||||
# Warn about client_facing on nodes with tools (likely autonomous work)
|
||||
if node_type in ("event_loop", "gcu") and client_facing and tools_list:
|
||||
warnings.append(
|
||||
f"Node '{node_id}' is client_facing=True but has tools {tools_list}. "
|
||||
"Nodes with tools typically do autonomous work and should be "
|
||||
"client_facing=False. Only set True if this node needs user approval."
|
||||
# Worker nodes must remain autonomous (queen handles user interaction).
|
||||
if node_type in ("event_loop", "gcu") and client_facing:
|
||||
errors.append(
|
||||
f"Node '{node_id}' has client_facing=True. Worker nodes must use "
|
||||
"client_facing=False and escalate_to_coder for blockers/errors."
|
||||
)
|
||||
|
||||
# nullable_output_keys must be a subset of output_keys
|
||||
@@ -1405,15 +1403,14 @@ def validate_graph() -> str:
|
||||
f"must be a subset of output_keys {node.output_keys}"
|
||||
)
|
||||
|
||||
# Warn if all event_loop nodes are client_facing (common misconfiguration)
|
||||
# Worker nodes should be autonomous; queen owns user interaction.
|
||||
el_nodes = [n for n in session.nodes if n.node_type == "event_loop"]
|
||||
cf_el_nodes = [n for n in el_nodes if n.client_facing]
|
||||
if len(el_nodes) > 1 and len(cf_el_nodes) == len(el_nodes):
|
||||
warnings.append(
|
||||
f"ALL {len(el_nodes)} event_loop nodes are client_facing=True. "
|
||||
"This injects ask_user() on every node. Only nodes that need user "
|
||||
"interaction (intake, review, approval) should be client_facing. Set "
|
||||
"client_facing=False on autonomous processing nodes."
|
||||
if cf_el_nodes:
|
||||
errors.append(
|
||||
"event_loop nodes must not be client_facing in worker graphs. "
|
||||
f"Set client_facing=False for: {[n.id for n in cf_el_nodes]} and use "
|
||||
"escalate_to_coder for handoff to queen."
|
||||
)
|
||||
|
||||
# Collect summary info
|
||||
@@ -1878,6 +1875,892 @@ def export_graph() -> str:
|
||||
)
|
||||
|
||||
|
||||
def _snake_to_camel(name: str) -> str:
|
||||
"""Convert snake_case to CamelCase. e.g. 'twitter_outreach_agent' -> 'TwitterOutreachAgent'."""
|
||||
return "".join(word.capitalize() for word in name.split("_"))
|
||||
|
||||
|
||||
def _node_var_name(node_id: str) -> str:
|
||||
"""Convert node id to a Python variable name. e.g. 'check-inbox' -> 'check_inbox_node'."""
|
||||
return node_id.replace("-", "_") + "_node"
|
||||
|
||||
|
||||
def _generate_config_py(session: BuildSession) -> str:
|
||||
"""Generate config.py content."""
|
||||
goal_name = session.goal.name if session.goal else session.name
|
||||
goal_desc = session.goal.description if session.goal else ""
|
||||
return f'''\
|
||||
"""Runtime configuration."""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _load_preferred_model() -> str:
|
||||
"""Load preferred model from ~/.hive/configuration.json."""
|
||||
config_path = Path.home() / ".hive" / "configuration.json"
|
||||
if config_path.exists():
|
||||
try:
|
||||
with open(config_path) as f:
|
||||
config = json.load(f)
|
||||
llm = config.get("llm", {{}})
|
||||
if llm.get("provider") and llm.get("model"):
|
||||
return f"{{llm[\'provider\']}}/{{llm[\'model\']}}"
|
||||
except Exception:
|
||||
pass
|
||||
return "anthropic/claude-sonnet-4-20250514"
|
||||
|
||||
|
||||
@dataclass
|
||||
class RuntimeConfig:
|
||||
model: str = field(default_factory=_load_preferred_model)
|
||||
temperature: float = 0.7
|
||||
max_tokens: int = 40000
|
||||
api_key: str | None = None
|
||||
api_base: str | None = None
|
||||
|
||||
|
||||
default_config = RuntimeConfig()
|
||||
|
||||
|
||||
@dataclass
|
||||
class AgentMetadata:
|
||||
name: str = {json.dumps(goal_name)}
|
||||
version: str = "1.0.0"
|
||||
description: str = {json.dumps(goal_desc)}
|
||||
intro_message: str = {json.dumps(f"{goal_name} ready.")}
|
||||
|
||||
|
||||
metadata = AgentMetadata()
|
||||
'''
|
||||
|
||||
|
||||
# Default system prompt for GCU (browser automation) nodes.
# _generate_nodes_init_py substitutes this text when a node of type "gcu"
# has an empty or whitespace-only system_prompt. The text is emitted
# verbatim into the generated nodes module, so edits here change the
# prompt of every newly exported GCU node that relies on the default.
_GCU_DEFAULT_PROMPT = """\
You are a browser automation agent. Your job is to complete the assigned task using browser tools.

## Workflow
1. browser_start (only if no browser is running yet)
2. browser_open(url=TARGET_URL) — note the returned targetId
3. browser_snapshot to read the page
4. [task-specific steps]
5. set_output("result", JSON)

## Best Practices
- Prefer browser_snapshot over browser_get_text("body") — compact accessibility tree
- Always browser_wait after navigation
- Use large scroll amounts (~2000-5000) for lazy-loaded content
- If auth wall detected, report immediately — do not attempt login
- Keep tool calls per turn ≤10
- Tab isolation: use browser_open(background=true) and pass target_id to every call

## Output format
set_output("result", JSON) with your findings.
"""
|
||||
|
||||
|
||||
def _generate_nodes_init_py(session: BuildSession) -> str:
    """Render the source of ``nodes/__init__.py`` for the exported agent.

    Emits one ``NodeSpec(...)`` assignment per session node, followed by
    an ``__all__`` list of the variable names. For nodes of type ``"gcu"``
    unset values get defaults: ``client_facing`` False,
    ``max_node_visits`` 1 when it is 0, ``output_keys`` ``["result"]``
    when empty, and the default GCU prompt when the system prompt is
    blank.
    """
    chunks = ['"""Node definitions."""\n', "from framework.graph import NodeSpec\n"]
    exported = []

    for spec in session.nodes:
        var_name = _node_var_name(spec.id)
        exported.append(var_name)

        # GCU auto-configuration: fill in defaults only where nothing was set.
        gcu = spec.node_type == "gcu"

        client_facing = spec.client_facing
        if gcu and not client_facing:
            client_facing = False

        visits = spec.max_node_visits
        if gcu and visits == 0:
            visits = 1

        out_keys = spec.output_keys
        if gcu and not out_keys:
            out_keys = ["result"]

        # (keyword, rendered value) pairs in emission order.
        fields = [
            ("id", json.dumps(spec.id)),
            ("name", json.dumps(spec.name)),
            ("description", json.dumps(spec.description)),
            ("node_type", json.dumps(spec.node_type)),
            ("client_facing", repr(client_facing)),
            ("max_node_visits", f"{visits}"),
            ("input_keys", json.dumps(spec.input_keys)),
            ("output_keys", json.dumps(out_keys)),
        ]

        # Optional kwargs are emitted only when they carry a value.
        for attr in ("nullable_output_keys", "success_criteria", "routes", "sub_agents"):
            value = getattr(spec, attr)
            if value:
                fields.append((attr, json.dumps(value)))

        # System prompt: GCU nodes fall back to the default template.
        prompt = spec.system_prompt or ""
        if gcu and not prompt.strip():
            prompt = _GCU_DEFAULT_PROMPT
        fields.append(("system_prompt", json.dumps(prompt)))

        # Tools: GCU nodes auto-include browser tools at runtime.
        fields.append(("tools", json.dumps(spec.tools)))

        body = "\n".join(f"    {key}={rendered}," for key, rendered in fields)
        chunks.append(f"\n{var_name} = NodeSpec(\n")
        chunks.append(body)
        chunks.append("\n)\n")

    chunks.append(f"\n__all__ = {json.dumps(exported)}\n")
    return "".join(chunks)
|
||||
|
||||
|
||||
def _generate_agent_py(
    session: BuildSession,
    entry_node: str,
    entry_points: dict,
    terminal_nodes: list,
    pause_nodes: list,
    has_async: bool,
) -> str:
    """Generate the source text of the exported agent's ``agent.py``.

    The module contains the imports, the ``Goal`` definition, the node and
    edge lists, graph configuration variables, module-level runtime
    settings, and an agent class with ``start``/``stop``/
    ``trigger_and_wait``/``run``/``info``/``validate`` methods, followed
    by a ``default_agent`` instance.

    Args:
        session: Build session providing ``name``, ``goal``, ``nodes``,
            ``edges`` and ``loop_config``.
        entry_node: Id of the graph entry node.
        entry_points: Mapping of entry-point name to node id.
        terminal_nodes: Ids of terminal nodes.
        pause_nodes: Ids of pause nodes.
        has_async: When True, also emit the async entry-point placeholder
            and the webhook runtime config, and wire both into the class.

    Returns:
        Complete module source as a string.

    Raises:
        ValueError: If ``session.goal`` is not set; the generated module
            cannot be written without a goal.
    """
    goal = session.goal
    if goal is None:
        raise ValueError("Cannot generate agent.py: session has no goal defined")

    class_name = _snake_to_camel(session.name)
    agent_name = session.name

    # Build node variable imports
    node_imports = ", ".join(_node_var_name(n.id) for n in session.nodes)

    # Imports block
    imports = [
        '"""Agent graph construction."""\n',
        "from pathlib import Path\n",
        "from framework.graph import EdgeSpec, EdgeCondition, Goal, SuccessCriterion, Constraint",
    ]
    if has_async:
        imports.append("from framework.graph.edge import GraphSpec, AsyncEntryPointSpec")
        imports.append(
            "from framework.runtime.agent_runtime import (\n"
            "    AgentRuntime, AgentRuntimeConfig, create_agent_runtime,\n"
            ")"
        )
    else:
        imports.append("from framework.graph.edge import GraphSpec")
        imports.append(
            "from framework.runtime.agent_runtime import (\n"
            "    AgentRuntime, create_agent_runtime,\n"
            ")"
        )
    imports.append("from framework.graph.executor import ExecutionResult")
    imports.append("from framework.graph.checkpoint_config import CheckpointConfig")
    imports.append("from framework.llm import LiteLLMProvider")
    imports.append("from framework.runner.tool_registry import ToolRegistry")
    imports.append("from framework.runtime.execution_stream import EntryPointSpec")
    imports.append("\nfrom .config import default_config, metadata")
    imports.append(f"from .nodes import {node_imports}")

    # Collect fragments and join once at the end instead of repeated "+=".
    parts = ["\n".join(imports) + "\n\n"]
    emit = parts.append

    # Goal definition
    emit("# Goal definition\n")
    emit("goal = Goal(\n")
    emit(f"    id={json.dumps(goal.id)},\n")
    emit(f"    name={json.dumps(goal.name)},\n")
    emit(f"    description={json.dumps(goal.description)},\n")

    if goal.success_criteria:
        emit("    success_criteria=[\n")
        for sc in goal.success_criteria:
            # Criteria may be model objects or plain dicts.
            sc_dict = sc.model_dump() if hasattr(sc, "model_dump") else sc
            emit("        SuccessCriterion(\n")
            emit(f"            id={json.dumps(sc_dict['id'])},\n")
            emit(f"            description={json.dumps(sc_dict['description'])},\n")
            emit(f"            metric={json.dumps(sc_dict.get('metric', ''))},\n")
            emit(f"            target={json.dumps(sc_dict.get('target', ''))},\n")
            emit(f"            weight={sc_dict.get('weight', 1.0)},\n")
            emit("        ),\n")
        emit("    ],\n")

    if goal.constraints:
        emit("    constraints=[\n")
        for c in goal.constraints:
            c_dict = c.model_dump() if hasattr(c, "model_dump") else c
            emit("        Constraint(\n")
            emit(f"            id={json.dumps(c_dict['id'])},\n")
            emit(f"            description={json.dumps(c_dict['description'])},\n")
            emit(f"            constraint_type={json.dumps(c_dict.get('constraint_type', 'hard'))},\n")
            emit(f"            category={json.dumps(c_dict.get('category', 'quality'))},\n")
            emit("        ),\n")
        emit("    ],\n")

    emit(")\n\n")

    # Nodes list
    emit(f"# Node list\nnodes = [{node_imports}]\n\n")

    # Edges
    emit("# Edge definitions\nedges = [\n")
    for edge in session.edges:
        emit("    EdgeSpec(\n")
        emit(f"        id={json.dumps(edge.id)},\n")
        emit(f"        source={json.dumps(edge.source)},\n")
        emit(f"        target={json.dumps(edge.target)},\n")
        emit(f"        condition=EdgeCondition.{edge.condition.name},\n")
        if edge.condition_expr:
            emit(f"        condition_expr={json.dumps(edge.condition_expr)},\n")
        emit(f"        priority={edge.priority},\n")
        emit("    ),\n")
    emit("]\n\n")

    # Graph config
    emit("# Graph configuration\n")
    emit(f"entry_node = {json.dumps(entry_node)}\n")
    emit(f"entry_points = {json.dumps(entry_points)}\n")
    emit(f"pause_nodes = {json.dumps(pause_nodes)}\n")
    emit(f"terminal_nodes = {json.dumps(terminal_nodes)}\n\n")

    # Async entry points placeholder (if has_async, emit a TODO skeleton)
    if has_async:
        emit("# Async entry points — customize triggers as needed\n")
        emit("async_entry_points = []\n\n")
        emit("# Runtime config for webhooks (optional)\n")
        emit("runtime_config = AgentRuntimeConfig(\n")
        emit('    webhook_host="127.0.0.1",\n')
        emit("    webhook_port=8080,\n")
        emit("    webhook_routes=[],\n")
        emit(")\n\n")

    # Module-level vars
    emit("# Module-level vars read by AgentRunner.load()\n")
    emit('conversation_mode = "continuous"\n')

    identity = f"You are {goal.name}. {goal.description}"
    if len(identity) > 200:
        # Keep the identity prompt to 200 characters including the ellipsis.
        identity = identity[:197] + "..."
    emit(f"identity_prompt = {json.dumps(identity)}\n")

    loop_cfg = session.loop_config or {
        "max_iterations": 100,
        "max_tool_calls_per_turn": 30,
        "max_history_tokens": 32000,
    }
    # Rendered as a Python literal: plain JSON would emit true/false/null,
    # which are not valid Python names.
    emit(f"loop_config = {_py_literal(loop_cfg)}\n\n")

    # Agent class
    graph_id = f"{agent_name}-graph"
    emit(f"\nclass {class_name}:\n")
    emit("    def __init__(self, config=None):\n")
    emit("        self.config = config or default_config\n")
    emit("        self.goal = goal\n")
    emit("        self.nodes = nodes\n")
    emit("        self.edges = edges\n")
    emit("        self.entry_node = entry_node\n")
    emit("        self.entry_points = entry_points\n")
    emit("        self.pause_nodes = pause_nodes\n")
    emit("        self.terminal_nodes = terminal_nodes\n")
    emit("        self._graph = None\n")
    emit("        self._agent_runtime = None\n")
    emit("        self._tool_registry = None\n")
    emit("        self._storage_path = None\n\n")

    # _build_graph
    emit("    def _build_graph(self):\n")
    emit("        return GraphSpec(\n")
    emit(f"            id={json.dumps(graph_id)},\n")
    emit("            goal_id=self.goal.id,\n")
    emit('            version="1.0.0",\n')
    emit("            entry_node=self.entry_node,\n")
    emit("            entry_points=self.entry_points,\n")
    emit("            terminal_nodes=self.terminal_nodes,\n")
    emit("            pause_nodes=self.pause_nodes,\n")
    emit("            nodes=self.nodes,\n")
    emit("            edges=self.edges,\n")
    if has_async:
        emit("            async_entry_points=async_entry_points,\n")
    emit("            default_model=self.config.model,\n")
    emit("            max_tokens=self.config.max_tokens,\n")
    emit("            loop_config=loop_config,\n")
    emit("            conversation_mode=conversation_mode,\n")
    emit("            identity_prompt=identity_prompt,\n")
    emit("        )\n\n")

    # _setup
    storage = f".hive/agents/{agent_name}"
    emit("    def _setup(self):\n")
    emit(f"        self._storage_path = Path.home() / {json.dumps(storage)}\n")
    emit("        self._storage_path.mkdir(parents=True, exist_ok=True)\n")
    emit("        self._tool_registry = ToolRegistry()\n")
    emit('        mcp_config = Path(__file__).parent / "mcp_servers.json"\n')
    emit("        if mcp_config.exists():\n")
    emit("            self._tool_registry.load_mcp_config(mcp_config)\n")
    emit("        llm = LiteLLMProvider(\n")
    emit("            model=self.config.model,\n")
    emit("            api_key=self.config.api_key,\n")
    emit("            api_base=self.config.api_base,\n")
    emit("        )\n")
    emit("        tools = list(self._tool_registry.get_tools().values())\n")
    emit("        tool_executor = self._tool_registry.get_executor()\n")
    emit("        self._graph = self._build_graph()\n")
    emit("        self._agent_runtime = create_agent_runtime(\n")
    emit("            graph=self._graph,\n")
    emit("            goal=self.goal,\n")
    emit("            storage_path=self._storage_path,\n")
    emit("            entry_points=[\n")
    emit("                EntryPointSpec(\n")
    emit('                    id="default",\n')
    emit('                    name="Default",\n')
    emit("                    entry_node=self.entry_node,\n")
    emit('                    trigger_type="manual",\n')
    emit('                    isolation_level="shared",\n')
    emit("                ),\n")
    emit("            ],\n")
    if has_async:
        emit("            runtime_config=runtime_config,\n")
    emit("            llm=llm,\n")
    emit("            tools=tools,\n")
    emit("            tool_executor=tool_executor,\n")
    emit("            checkpoint_config=CheckpointConfig(\n")
    emit("                enabled=True,\n")
    emit("                checkpoint_on_node_complete=True,\n")
    emit("                checkpoint_max_age_days=7,\n")
    emit("                async_checkpoint=True,\n")
    emit("            ),\n")
    emit("        )\n\n")

    # start / stop / trigger_and_wait / run
    emit("    async def start(self):\n")
    emit("        if self._agent_runtime is None:\n")
    emit("            self._setup()\n")
    emit("        if not self._agent_runtime.is_running:\n")
    emit("            await self._agent_runtime.start()\n\n")

    emit("    async def stop(self):\n")
    emit("        if self._agent_runtime and self._agent_runtime.is_running:\n")
    emit("            await self._agent_runtime.stop()\n")
    emit("        self._agent_runtime = None\n\n")

    # NOTE(review): the generated trigger_and_wait accepts ``timeout`` but
    # does not forward it to the runtime — confirm whether
    # AgentRuntime.trigger_and_wait takes a timeout before wiring it up.
    emit("    async def trigger_and_wait(\n")
    emit("        self,\n")
    emit('        entry_point="default",\n')
    emit("        input_data=None,\n")
    emit("        timeout=None,\n")
    emit("        session_state=None,\n")
    emit("    ):\n")
    emit("        if self._agent_runtime is None:\n")
    emit('            raise RuntimeError("Agent not started. Call start() first.")\n')
    emit("        return await self._agent_runtime.trigger_and_wait(\n")
    emit("            entry_point_id=entry_point,\n")
    emit("            input_data=input_data or {},\n")
    emit("            session_state=session_state,\n")
    emit("        )\n\n")

    emit("    async def run(self, context, session_state=None):\n")
    emit("        await self.start()\n")
    emit("        try:\n")
    emit("            result = await self.trigger_and_wait(\n")
    emit('                "default", context, session_state=session_state\n')
    emit("            )\n")
    emit('            return result or ExecutionResult(success=False, error="Execution timeout")\n')
    emit("        finally:\n")
    emit("            await self.stop()\n\n")

    # info
    emit("    def info(self):\n")
    emit("        return {\n")
    emit('            "name": metadata.name,\n')
    emit('            "version": metadata.version,\n')
    emit('            "description": metadata.description,\n')
    emit('            "goal": {\n')
    emit('                "name": self.goal.name,\n')
    emit('                "description": self.goal.description,\n')
    emit("            },\n")
    emit('            "nodes": [n.id for n in self.nodes],\n')
    emit('            "edges": [e.id for e in self.edges],\n')
    emit('            "entry_node": self.entry_node,\n')
    emit('            "entry_points": self.entry_points,\n')
    emit('            "terminal_nodes": self.terminal_nodes,\n')
    emit('            "client_facing_nodes": [n.id for n in self.nodes if n.client_facing],\n')
    emit("        }\n\n")

    # validate
    emit("    def validate(self):\n")
    emit('        """Validate graph wiring and entry-point contract."""\n')
    emit("        errors, warnings = [], []\n")
    emit("        node_ids = {n.id for n in self.nodes}\n")
    emit("        for e in self.edges:\n")
    emit("            if e.source not in node_ids:\n")
    emit("                errors.append(f\"Edge {e.id}: source '{e.source}' not found\")\n")
    emit("            if e.target not in node_ids:\n")
    emit("                errors.append(f\"Edge {e.id}: target '{e.target}' not found\")\n")
    emit("        if self.entry_node not in node_ids:\n")
    emit("            errors.append(f\"Entry node '{self.entry_node}' not found\")\n")
    emit("        for t in self.terminal_nodes:\n")
    emit("            if t not in node_ids:\n")
    emit("                errors.append(f\"Terminal node '{t}' not found\")\n")
    emit("        if not isinstance(self.entry_points, dict):\n")
    emit("            errors.append(\n")
    emit("                \"Invalid entry_points: expected dict[str, str] like \"\n")
    emit("                \"{'start': '<entry-node-id>'}. \"\n")
    emit("                f\"Got {type(self.entry_points).__name__}. \"\n")
    emit("                \"Fix agent.py: set entry_points = {'start': '<entry-node-id>'}.\"\n")
    emit("            )\n")
    emit("        else:\n")
    emit("            if 'start' not in self.entry_points:\n")
    emit("                errors.append(\n")
    emit("                    \"entry_points must include 'start' mapped to entry_node. \"\n")
    emit("                    \"Example: {'start': '<entry-node-id>'}.\"\n")
    emit("                )\n")
    emit("            else:\n")
    emit("                start_node = self.entry_points.get('start')\n")
    emit("                if start_node != self.entry_node:\n")
    emit("                    errors.append(\n")
    emit("                        f\"entry_points['start'] points to '{start_node}' \"\n")
    emit("                        f\"but entry_node is '{self.entry_node}'. \"\n")
    emit("                        \"Keep these aligned.\"\n")
    emit("                    )\n")
    emit("            for ep_id, nid in self.entry_points.items():\n")
    emit("                if not isinstance(ep_id, str):\n")
    emit("                    errors.append(\n")
    emit("                        f\"Invalid entry_points key {ep_id!r} \"\n")
    emit("                        f\"({type(ep_id).__name__}). Entry point names must be strings.\"\n")
    emit("                    )\n")
    emit("                    continue\n")
    emit("                if not isinstance(nid, str):\n")
    emit("                    errors.append(\n")
    emit("                        f\"Invalid entry_points['{ep_id}']={nid!r} \"\n")
    emit("                        f\"({type(nid).__name__}). Node ids must be strings.\"\n")
    emit("                    )\n")
    emit("                    continue\n")
    emit("                if nid not in node_ids:\n")
    emit("                    errors.append(\n")
    emit("                        f\"Entry point '{ep_id}' references unknown node '{nid}'. \"\n")
    emit("                        f\"Known nodes: {sorted(node_ids)}\"\n")
    emit("                    )\n")
    emit('        return {"valid": len(errors) == 0, "errors": errors, "warnings": warnings}\n\n\n')

    emit(f"default_agent = {class_name}()\n")
    return "".join(parts)


def _py_literal(value: object) -> str:
    """Render a JSON-like value as Python source text.

    Strings use the double-quoted ``json.dumps`` form; dicts and
    lists/tuples are rendered recursively; everything else (numbers,
    ``True``/``False``/``None``) uses ``repr``. For values made only of
    strings and numbers the output matches ``json.dumps``.
    """
    if isinstance(value, str):
        return json.dumps(value)
    if isinstance(value, dict):
        items = ", ".join(
            f"{_py_literal(key)}: {_py_literal(val)}" for key, val in value.items()
        )
        return "{" + items + "}"
    if isinstance(value, (list, tuple)):
        return "[" + ", ".join(_py_literal(item) for item in value) + "]"
    return repr(value)
|
||||
|
||||
|
||||
def _generate_init_py(session: BuildSession, has_async: bool) -> str:
    """Render the source text of the generated package's ``__init__.py``.

    The rendered module re-exports the agent class, ``default_agent`` and the
    graph-level names from ``.agent``, plus ``default_config`` and ``metadata``
    from ``.config``, and lists all of them in ``__all__``.

    Args:
        session: Build session; supplies the package name and, when a goal is
            set, the goal name used in the module docstring.
        has_async: When true, ``async_entry_points`` and ``runtime_config``
            are added to the names re-exported from ``.agent``.

    Returns:
        The complete file content, ending with a newline.
    """
    agent_class = _snake_to_camel(session.name)
    # Fall back to the package name when the session has no goal.
    title = session.goal.name if session.goal else session.name

    from_agent = [
        agent_class,
        "default_agent",
        "goal",
        "nodes",
        "edges",
        "entry_node",
        "entry_points",
        "pause_nodes",
        "terminal_nodes",
        "conversation_mode",
        "identity_prompt",
        "loop_config",
    ]
    if has_async:
        from_agent += ["async_entry_points", "runtime_config"]
    from_config = ["default_config", "metadata"]

    # One name per line, indented to sit inside the parentheses/brackets.
    item_sep = ",\n    "
    agent_block = item_sep.join(from_agent)
    config_block = ", ".join(from_config)
    all_block = item_sep.join(f'"{name}"' for name in from_agent + from_config)

    rendered = [
        f'"""{title}."""',
        "",
        "from .agent import (",
        f"    {agent_block},",
        ")",
        f"from .config import {config_block}",
        "",
        "__all__ = [",
        f"    {all_block},",
        "]",
        "",  # trailing newline
    ]
    return "\n".join(rendered)
|
||||
|
||||
|
||||
def _generate_main_py(session: BuildSession, has_async: bool) -> str:
    """Generate __main__.py content.

    The returned text is a click-based CLI module for the generated agent
    package with four commands: ``run`` (execute ``default_agent`` and print
    a JSON result), ``tui`` (launch the ``AdenTUI`` dashboard), ``info`` and
    ``validate``.

    Args:
        session: Build session; supplies the package name and, when a goal is
            set, the goal name used in the generated docstrings.
        has_async: Accepted for signature parity with the other generators;
            this function does not read it.

    Returns:
        The complete file content as a string.
    """
    class_name = _snake_to_camel(session.name)
    agent_name = session.name
    # Fall back to the package name when the session has no goal.
    goal_name = session.goal.name if session.goal else session.name
    # Relative to the user's home directory in the generated ``tui`` command.
    storage_path = f".hive/agents/{agent_name}"

    # Everything below is a single f-string template:
    #   * ``{{`` / ``}}`` are literal braces in the generated file, so the
    #     generated f-strings and dict literals keep their own braces;
    #   * ``\'`` and ``\\n`` become ``'`` and ``\n`` in the generated source;
    #   * ``json.dumps(storage_path)`` emits the path as a quoted string literal.
    out = f'''\
"""CLI entry point for {goal_name}."""

import asyncio
import json
import logging
import sys

import click

from .agent import default_agent, {class_name}


def setup_logging(verbose=False, debug=False):
    if debug:
        level, fmt = logging.DEBUG, "%(asctime)s %(name)s: %(message)s"
    elif verbose:
        level, fmt = logging.INFO, "%(message)s"
    else:
        level, fmt = logging.WARNING, "%(levelname)s: %(message)s"
    logging.basicConfig(level=level, format=fmt, stream=sys.stderr)


@click.group()
@click.version_option(version="1.0.0")
def cli():
    """{goal_name}."""
    pass


@cli.command()
@click.option("--verbose", "-v", is_flag=True)
def run(verbose):
    """Execute the agent."""
    setup_logging(verbose=verbose)
    result = asyncio.run(default_agent.run({{}}))
    click.echo(
        json.dumps(
            {{"success": result.success, "output": result.output}}, indent=2, default=str
        )
    )
    sys.exit(0 if result.success else 1)


@cli.command()
def tui():
    """Launch TUI dashboard."""
    from pathlib import Path

    from framework.runtime.agent_runtime import create_agent_runtime
    from framework.runtime.execution_stream import EntryPointSpec
    from framework.tui.app import AdenTUI

    async def run_tui():
        agent = {class_name}()
        storage = Path.home() / {json.dumps(storage_path)}
        storage.mkdir(parents=True, exist_ok=True)
        agent._setup()
        runtime = agent._agent_runtime
        app = AdenTUI(runtime)
        await app.run_async()
        await runtime.stop()

    asyncio.run(run_tui())


@cli.command()
def info():
    """Show agent info."""
    data = default_agent.info()
    click.echo(f"Agent: {{data[\'name\']}}\\nVersion: {{data[\'version\']}}")
    click.echo(f"Description: {{data[\'description\']}}")
    click.echo(f"Nodes: {{', '.join(data[\'nodes\'])}}")
    click.echo(
        f"Client-facing: {{', '.join(data[\'client_facing_nodes\'])}}"
    )


@cli.command()
def validate():
    """Validate agent structure."""
    v = default_agent.validate()
    if v["valid"]:
        click.echo("Agent is valid")
    else:
        click.echo("Errors:")
        for e in v["errors"]:
            click.echo(f" {{e}}")
    sys.exit(0 if v["valid"] else 1)


if __name__ == "__main__":
    cli()
'''
    return out
|
||||
|
||||
|
||||
def _generate_conftest_py() -> str:
    """Generate tests/conftest.py content — pure boilerplate.

    The returned text takes no input from the build session. The generated
    conftest:

    * treats ``parents[3]`` of its own path as the repository root (the file
      is written to ``exports/<agent>/tests/conftest.py``) and puts that
      root's ``exports`` and ``core`` directories on ``sys.path``;
    * defines ``AGENT_PATH`` as the agent package directory (``parents[1]``);
    * provides two session-scoped fixtures: ``agent_module`` (imports the
      package by directory name) and ``runner_loaded`` (loads it through
      ``AgentRunner.load``).

    Returns:
        The complete file content as a string.
    """
    # Plain (non-f) string: nothing is interpolated, so braces need no escaping.
    return '''\
"""Test fixtures."""

import sys
from pathlib import Path

import pytest

_repo_root = Path(__file__).resolve().parents[3]
for _p in ["exports", "core"]:
    _path = str(_repo_root / _p)
    if _path not in sys.path:
        sys.path.insert(0, _path)

AGENT_PATH = str(Path(__file__).resolve().parents[1])


@pytest.fixture(scope="session")
def agent_module():
    """Import the agent package for structural validation."""
    import importlib

    return importlib.import_module(Path(AGENT_PATH).name)


@pytest.fixture(scope="session")
def runner_loaded():
    """Load the agent through AgentRunner (structural only, no LLM needed)."""
    from framework.runner.runner import AgentRunner

    return AgentRunner.load(AGENT_PATH)
'''
|
||||
|
||||
|
||||
def _generate_mcp_servers_json(session: BuildSession) -> str | None:
|
||||
"""Generate mcp_servers.json in flat dict format. Returns None if no servers."""
|
||||
if not session.mcp_servers:
|
||||
return None
|
||||
flat: dict[str, dict] = {}
|
||||
for server in session.mcp_servers:
|
||||
name = server.get("name", "unnamed")
|
||||
entry: dict = {}
|
||||
for key in ("transport", "command", "args", "cwd", "env", "url", "headers", "description"):
|
||||
if key in server and server[key]:
|
||||
entry[key] = server[key]
|
||||
# Default cwd for stdio servers
|
||||
if entry.get("transport") == "stdio" and "cwd" not in entry:
|
||||
entry["cwd"] = "../../tools"
|
||||
flat[name] = entry
|
||||
return json.dumps(flat, indent=2)
|
||||
|
||||
|
||||
def _pair_pause_resume_nodes(nodes, pause_nodes, resume_entry_points) -> dict[str, str]:
    """Pair each pause node id with the resume node id that continues it.

    A pause node is first matched to the first resume node that consumes at
    least one of its output keys. Pause and resume ids left over after that
    pass are paired positionally; surplus ids on either side stay unpaired.

    Args:
        nodes: Session node objects exposing ``id``, ``output_keys`` and
            ``input_keys``.
        pause_nodes: Pause node ids, in priority order.
        resume_entry_points: Candidate resume node ids, in priority order.

    Returns:
        Mapping of pause node id to resume node id, in pairing order.
    """
    # First definition wins for a duplicated id, like a linear first-match search.
    nodes_by_id: dict = {}
    for node in nodes:
        nodes_by_id.setdefault(node.id, node)

    pause_to_resume: dict[str, str] = {}
    for pause_id in pause_nodes:
        pause_node = nodes_by_id.get(pause_id)
        if not pause_node:
            continue
        for resume_id in resume_entry_points:
            resume_node = nodes_by_id.get(resume_id)
            if not resume_node:
                continue
            if set(pause_node.output_keys) & set(resume_node.input_keys):
                pause_to_resume[pause_id] = resume_id
                break

    # Positional fallback for whatever the key-overlap pass could not match.
    already_used = set(pause_to_resume.values())
    unmatched_pause = [p for p in pause_nodes if p not in pause_to_resume]
    unmatched_resume = [r for r in resume_entry_points if r not in already_used]
    for pause_id, resume_id in zip(unmatched_pause, unmatched_resume, strict=False):
        pause_to_resume[pause_id] = resume_id
    return pause_to_resume


def _collect_node_design_warnings(nodes, entry_node) -> list[dict]:
    """Build advisory warnings about the node layout of a generated agent.

    Args:
        nodes: Session node objects exposing ``id``, ``tools``, ``node_type``,
            ``client_facing`` and ``max_node_visits``.
        entry_node: Id of the graph's entry node.

    Returns:
        A list of ``{"node_id", "type", "message", "severity"}`` dicts;
        ``node_id`` is ``None`` for graph-wide warnings.
    """
    found: list[dict] = []

    def _add(node_id, kind: str, message: str, severity: str = "warning") -> None:
        found.append(
            {"node_id": node_id, "type": kind, "message": message, "severity": severity}
        )

    for node in nodes:
        # Warn about nodes with no tools
        if not node.tools and node.node_type == "event_loop":
            _add(
                node.id,
                "no_tools",
                f"Node '{node.id}' has no tools. Consider merging into another node or adding tools.",
            )
        # Warn about client-facing nodes that aren't entry nodes
        if node.client_facing and node.id != entry_node:
            _add(
                node.id,
                "client_facing_not_entry",
                f"Node '{node.id}' is client_facing but not the entry node. Worker agents should not have client-facing nodes (queen handles user interaction).",
            )
        # GCU nodes should not be client_facing
        if node.node_type == "gcu" and node.client_facing:
            _add(
                node.id,
                "gcu_client_facing",
                f"GCU node '{node.id}' is client_facing. GCU nodes should be autonomous subagents (client_facing=False).",
            )
        # GCU nodes should have max_node_visits=1
        if node.node_type == "gcu" and node.max_node_visits != 1:
            _add(
                node.id,
                "gcu_max_visits",
                f"GCU node '{node.id}' should have max_node_visits=1 (single execution per delegation).",
                severity="info",
            )

    # Warn about node count (prefer 2-5 nodes)
    node_count = len(nodes)
    if node_count < 2:
        _add(
            None,
            "too_few_nodes",
            f"Agent has only {node_count} node. Consider adding nodes for better separation of concerns.",
        )
    elif node_count > 5:
        _add(
            None,
            "too_many_nodes",
            f"Agent has {node_count} nodes. Consider consolidating to 2-5 nodes for simpler architecture.",
        )
    return found


@mcp.tool()
def initialize_agent_package(
    agent_name: Annotated[str, "Name for the agent package. Must be snake_case (e.g., 'my_research_agent')."],
) -> str:
    """
    Generate the full Python agent package from the current build session.

    Creates all files needed for a runnable agent in exports/{agent_name}/:
    config.py, nodes/__init__.py, agent.py, __init__.py, __main__.py,
    mcp_servers.json, tests/conftest.py, agent.json, README.md.

    Call this INSTEAD of manually writing package files. Requires a valid
    graph (goal, nodes, edges). Uses the same validation as export_graph.

    Args:
        agent_name: Name for the agent. Must be valid snake_case for Python package.
            Examples: 'my_agent', 'research_bot', 'data_processor'
    """
    # NOTE: the docstring text above is deliberately left as-is because the
    # tool decorator may expose it to callers; extra documentation lives here.
    #
    # Returns a JSON string. On failure: {"success": false, "errors": [...]}.
    # On success: the written files, node/edge counts, the entry-point map,
    # follow-up validation commands, and advisory design warnings.
    import keyword
    import re
    from pathlib import Path

    session = get_session()

    # Validate agent name (must be valid snake_case for Python package).
    # fullmatch is required: with re.match, a trailing "$" also matches just
    # before a final newline, so "my_agent\n" would slip through.
    if not re.fullmatch(r"[a-z][a-z0-9_]*", agent_name):
        return json.dumps({
            "success": False,
            "errors": [
                f"Invalid agent_name '{agent_name}'. Must be snake_case: lowercase letters, numbers, underscores. "
                "Must start with a letter. Examples: 'my_agent', 'research_bot', 'data_processor'"
            ],
        })
    # A keyword such as "class" is snake_case but can never be imported.
    if keyword.iskeyword(agent_name):
        return json.dumps({
            "success": False,
            "errors": [
                f"Invalid agent_name '{agent_name}'. Python keywords cannot be used as a package name."
            ],
        })

    # Update session name
    session.name = agent_name
    _save_session(session)

    # Validate first
    validation = json.loads(validate_graph())
    if not validation["valid"]:
        return json.dumps({"success": False, "errors": validation["errors"]})

    entry_node = validation["entry_node"]
    terminal_nodes = validation["terminal_nodes"]
    pause_nodes = validation.get("pause_nodes", [])
    resume_entry_points = validation.get("resume_entry_points", [])

    # Build entry_points dict (same logic as export_graph)
    entry_points: dict[str, str] = {}
    if entry_node:
        entry_points["start"] = entry_node
    if pause_nodes and resume_entry_points:
        pairs = _pair_pause_resume_nodes(session.nodes, pause_nodes, resume_entry_points)
        for pause_id, resume_id in pairs.items():
            entry_points[f"{pause_id}_resume"] = resume_id

    # Detect whether this agent needs async entry points
    has_async = False  # Placeholder; the coder can customize after generation

    # Create directory structure
    exports_dir = Path("exports") / session.name
    nodes_dir = exports_dir / "nodes"
    tests_dir = exports_dir / "tests"
    nodes_dir.mkdir(parents=True, exist_ok=True)
    tests_dir.mkdir(parents=True, exist_ok=True)

    files_written: dict[str, dict] = {}

    def _write(rel_path: str, content: str) -> None:
        """Atomically write one package file and record its path and size."""
        full = exports_dir / rel_path
        full.parent.mkdir(parents=True, exist_ok=True)
        with atomic_write(full) as f:
            f.write(content)
        files_written[rel_path] = {
            "path": str(full),
            "size_bytes": full.stat().st_size,
        }

    # 1. config.py
    _write("config.py", _generate_config_py(session))

    # 2. nodes/__init__.py
    _write("nodes/__init__.py", _generate_nodes_init_py(session))

    # 3. agent.py
    _write(
        "agent.py",
        _generate_agent_py(session, entry_node, entry_points, terminal_nodes, pause_nodes, has_async),
    )

    # 4. __init__.py
    _write("__init__.py", _generate_init_py(session, has_async))

    # 5. __main__.py
    _write("__main__.py", _generate_main_py(session, has_async))

    # 6. mcp_servers.json
    mcp_content = _generate_mcp_servers_json(session)
    if mcp_content is not None:
        _write("mcp_servers.json", mcp_content)

    # 7. tests/conftest.py
    _write("tests/conftest.py", _generate_conftest_py())

    # 8. agent.json + README.md — reuse export_graph logic
    export_result = json.loads(export_graph())
    export_ok = bool(export_result.get("success"))
    if export_ok:
        exported = export_result.get("files_written", {})
        for key in ("agent_json", "readme", "mcp_servers"):
            if key not in exported:
                continue
            info = exported[key]
            # Map to a package-relative key when possible. relative_to raises
            # ValueError for paths outside exports_dir (e.g. absolute paths);
            # fall back to the reported path rather than failing after every
            # file has already been written.
            try:
                rel = str(Path(info["path"]).relative_to(exports_dir))
            except ValueError:
                rel = info["path"]
            files_written[rel] = info

    # 9. Generate validation commands
    pkg = session.name
    validation_commands = [
        f'run_command("uv run python -c \'from {pkg} import default_agent; print(default_agent.validate())\'")',
        f'run_command("uv run python -c \'from framework.runner.runner import AgentRunner; r = AgentRunner.load(\\"exports/{pkg}\\"); print(\\"AgentRunner.load: OK\\")\'")',
        f'validate_agent_tools("exports/{pkg}")',
        f'run_agent_tests("{pkg}")',
    ]

    # 10. Generate node design warnings
    design_warnings = _collect_node_design_warnings(session.nodes, entry_node)
    if not export_ok:
        # Surface the failure instead of silently reporting a complete package.
        design_warnings.append({
            "node_id": None,
            "type": "export_graph_failed",
            "message": "export_graph did not report success; agent.json and README.md may be missing from the package.",
            "severity": "warning",
        })

    return json.dumps(
        {
            "success": True,
            "agent_name": session.name,
            "class_name": _snake_to_camel(session.name),
            "files_written": files_written,
            "file_count": len(files_written),
            "node_count": len(session.nodes),
            "edge_count": len(session.edges),
            "has_async": has_async,
            "entry_node": entry_node,
            "entry_points": entry_points,
            "validation_commands": validation_commands,
            "design_warnings": design_warnings,
            "summary": (
                f"Agent package '{session.name}' initialized at exports/{session.name}/. "
                f"Generated {len(files_written)} files. "
                "Review and customize system prompts in nodes/__init__.py, "
                "then run validation commands."
            ),
        },
        default=str,
        indent=2,
    )
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def import_from_export(
|
||||
agent_json_path: Annotated[str, "Path to the agent.json file to import"],
|
||||
|
||||
@@ -561,7 +561,7 @@ The runtime logger automatically flags issues based on execution metrics:
|
||||
|
||||
### Attention Categories
|
||||
|
||||
Used by `/hive-debugger` skill for issue categorization:
|
||||
Used for runtime issue categorization:
|
||||
|
||||
1. **Missing Outputs**: Node didn't set required output keys
|
||||
2. **Tool Errors**: Tool calls failed (API errors, timeouts)
|
||||
@@ -690,7 +690,7 @@ rm -rf session_2025*
|
||||
|
||||
**Documentation:**
|
||||
- `EXECUTION_STORAGE_REDESIGN.md` - Unified session storage design
|
||||
- `/.claude/skills/hive-debugger/SKILL.md` - Interactive debugging skill
|
||||
- `docs/developer-guide.md` - Debugging and troubleshooting workflows
|
||||
|
||||
**Related:**
|
||||
- `core/framework/schemas/session_state.py` - Session state schema
|
||||
|
||||
@@ -137,8 +137,8 @@ class EventType(StrEnum):
|
||||
WORKER_LOADED = "worker_loaded"
|
||||
CREDENTIALS_REQUIRED = "credentials_required"
|
||||
|
||||
# Queen mode changes (building ↔ running)
|
||||
QUEEN_MODE_CHANGED = "queen_mode_changed"
|
||||
# Queen phase changes (building <-> staging <-> running)
|
||||
QUEEN_PHASE_CHANGED = "queen_phase_changed"
|
||||
|
||||
# Subagent reports (one-way progress updates from sub-agents)
|
||||
SUBAGENT_REPORT = "subagent_report"
|
||||
|
||||
@@ -1,45 +1,30 @@
|
||||
"""HIVE_LLM_DEBUG — write every LLM turn to a JSONL file for replay/debugging.
|
||||
"""Write every LLM turn to ~/.hive/llm_logs/<ts>.jsonl for replay/debugging.
|
||||
|
||||
Set the env var to enable:
|
||||
HIVE_LLM_DEBUG=1 → writes to ~/.hive/llm_logs/<ts>.jsonl
|
||||
HIVE_LLM_DEBUG=/some/path → writes to that directory
|
||||
|
||||
Each line is a JSON object with the full LLM turn: assistant text, tool calls,
|
||||
tool results, and token counts. The file is opened lazily on first call and
|
||||
flushed after every write. Errors are silently swallowed — this must never
|
||||
break the agent.
|
||||
Each line is a JSON object with the full LLM turn: the request payload
|
||||
(system prompt + messages), assistant text, tool calls, tool results, and
|
||||
token counts. The file is opened lazily on first call and flushed after every
|
||||
write. Errors are silently swallowed — this must never break the agent.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import IO, Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_LLM_DEBUG_RAW = os.environ.get("HIVE_LLM_DEBUG", "").strip()
|
||||
_LLM_DEBUG_ENABLED = _LLM_DEBUG_RAW.lower() in ("1", "true") or (
|
||||
bool(_LLM_DEBUG_RAW) and _LLM_DEBUG_RAW.lower() not in ("0", "false", "")
|
||||
)
|
||||
_LLM_DEBUG_DIR = Path.home() / ".hive" / "llm_logs"
|
||||
|
||||
_log_file: IO[str] | None = None
|
||||
_log_ready = False # lazy init guard
|
||||
|
||||
|
||||
def _open_log() -> IO[str] | None:
|
||||
"""Open a JSONL log file. Returns None if disabled."""
|
||||
if not _LLM_DEBUG_ENABLED:
|
||||
return None
|
||||
raw = _LLM_DEBUG_RAW
|
||||
if raw.lower() in ("1", "true"):
|
||||
log_dir = Path.home() / ".hive" / "llm_logs"
|
||||
else:
|
||||
log_dir = Path(raw)
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
"""Open the JSONL log file for this process."""
|
||||
_LLM_DEBUG_DIR.mkdir(parents=True, exist_ok=True)
|
||||
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
path = log_dir / f"{ts}.jsonl"
|
||||
path = _LLM_DEBUG_DIR / f"{ts}.jsonl"
|
||||
logger.info("LLM debug log → %s", path)
|
||||
return open(path, "a", encoding="utf-8") # noqa: SIM115
|
||||
|
||||
@@ -50,6 +35,8 @@ def log_llm_turn(
|
||||
stream_id: str,
|
||||
execution_id: str,
|
||||
iteration: int,
|
||||
system_prompt: str,
|
||||
messages: list[dict[str, Any]],
|
||||
assistant_text: str,
|
||||
tool_calls: list[dict[str, Any]],
|
||||
tool_results: list[dict[str, Any]],
|
||||
@@ -57,10 +44,8 @@ def log_llm_turn(
|
||||
) -> None:
|
||||
"""Write one JSONL line capturing a complete LLM turn.
|
||||
|
||||
No-op when HIVE_LLM_DEBUG is not set. Never raises.
|
||||
Never raises.
|
||||
"""
|
||||
if not _LLM_DEBUG_ENABLED:
|
||||
return
|
||||
try:
|
||||
global _log_file, _log_ready # noqa: PLW0603
|
||||
if not _log_ready:
|
||||
@@ -74,6 +59,8 @@ def log_llm_turn(
|
||||
"stream_id": stream_id,
|
||||
"execution_id": execution_id,
|
||||
"iteration": iteration,
|
||||
"system_prompt": system_prompt,
|
||||
"messages": messages,
|
||||
"assistant_text": assistant_text,
|
||||
"tool_calls": tool_calls,
|
||||
"tool_results": tool_results,
|
||||
|
||||
@@ -38,7 +38,7 @@ DEFAULT_EVENT_TYPES = [
|
||||
EventType.WORKER_LOADED,
|
||||
EventType.CREDENTIALS_REQUIRED,
|
||||
EventType.SUBAGENT_REPORT,
|
||||
EventType.QUEEN_MODE_CHANGED,
|
||||
EventType.QUEEN_PHASE_CHANGED,
|
||||
]
|
||||
|
||||
# Keepalive interval in seconds
|
||||
@@ -92,7 +92,7 @@ async def handle_events(request: web.Request) -> web.StreamResponse:
|
||||
"node_loop_started",
|
||||
"credentials_required",
|
||||
"worker_loaded",
|
||||
"queen_mode_changed",
|
||||
"queen_phase_changed",
|
||||
}
|
||||
|
||||
client_disconnected = asyncio.Event()
|
||||
|
||||
@@ -64,15 +64,15 @@ async def handle_trigger(request: web.Request) -> web.Response:
|
||||
session_state=session_state,
|
||||
)
|
||||
|
||||
# Cancel queen's in-progress LLM turn so it picks up the mode change cleanly
|
||||
# Cancel queen's in-progress LLM turn so it picks up the phase change cleanly
|
||||
if session.queen_executor:
|
||||
node = session.queen_executor.node_registry.get("queen")
|
||||
if node and hasattr(node, "cancel_current_turn"):
|
||||
node.cancel_current_turn()
|
||||
|
||||
# Switch queen to running mode (mirrors run_agent_with_input tool behavior)
|
||||
if session.mode_state is not None:
|
||||
await session.mode_state.switch_to_running(source="frontend")
|
||||
# Switch queen to running phase (mirrors run_agent_with_input tool behavior)
|
||||
if session.phase_state is not None:
|
||||
await session.phase_state.switch_to_running(source="frontend")
|
||||
|
||||
return web.json_response({"execution_id": execution_id})
|
||||
|
||||
@@ -382,8 +382,8 @@ async def handle_stop(request: web.Request) -> web.Response:
|
||||
node.cancel_current_turn()
|
||||
|
||||
# Switch to staging (agent still loaded, ready to re-run)
|
||||
if session.mode_state is not None:
|
||||
await session.mode_state.switch_to_staging(source="frontend")
|
||||
if session.phase_state is not None:
|
||||
await session.phase_state.switch_to_staging(source="frontend")
|
||||
|
||||
return web.json_response(
|
||||
{
|
||||
|
||||
@@ -48,7 +48,7 @@ def _get_manager(request: web.Request) -> SessionManager:
|
||||
def _session_to_live_dict(session) -> dict:
|
||||
"""Serialize a live Session to the session-primary JSON shape."""
|
||||
info = session.worker_info
|
||||
mode_state = getattr(session, "mode_state", None)
|
||||
phase_state = getattr(session, "phase_state", None)
|
||||
return {
|
||||
"session_id": session.id,
|
||||
"worker_id": session.worker_id,
|
||||
@@ -61,7 +61,7 @@ def _session_to_live_dict(session) -> dict:
|
||||
"loaded_at": session.loaded_at,
|
||||
"uptime_seconds": round(time.time() - session.loaded_at, 1),
|
||||
"intro_message": getattr(session.runner, "intro_message", "") or "",
|
||||
"queen_mode": mode_state.mode if mode_state else "building",
|
||||
"queen_phase": phase_state.phase if phase_state else "building",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -40,11 +40,12 @@ class Session:
|
||||
runner: Any | None = None # AgentRunner
|
||||
worker_runtime: Any | None = None # AgentRuntime
|
||||
worker_info: Any | None = None # AgentInfo
|
||||
# Queen mode state (building/staging/running)
|
||||
mode_state: Any = None # QueenModeState
|
||||
# Queen phase state (building/staging/running)
|
||||
phase_state: Any = None # QueenPhaseState
|
||||
# Judge (active when worker is loaded)
|
||||
judge_task: asyncio.Task | None = None
|
||||
escalation_sub: str | None = None
|
||||
worker_handoff_sub: str | None = None
|
||||
|
||||
|
||||
class SessionManager:
|
||||
@@ -374,6 +375,12 @@ class SessionManager:
|
||||
|
||||
# Stop judge
|
||||
self._stop_judge(session)
|
||||
if session.worker_handoff_sub is not None:
|
||||
try:
|
||||
session.event_bus.unsubscribe(session.worker_handoff_sub)
|
||||
except Exception:
|
||||
pass
|
||||
session.worker_handoff_sub = None
|
||||
|
||||
# Stop queen
|
||||
if session.queen_task is not None:
|
||||
@@ -395,6 +402,47 @@ class SessionManager:
|
||||
# Queen startup
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _handle_worker_handoff(self, session: Session, executor: Any, event: Any) -> None:
|
||||
"""Route worker escalation events into the queen conversation."""
|
||||
if event.stream_id in ("queen", "judge"):
|
||||
return
|
||||
|
||||
reason = str(event.data.get("reason", "")).strip()
|
||||
context = str(event.data.get("context", "")).strip()
|
||||
node_label = event.node_id or "unknown_node"
|
||||
stream_label = event.stream_id or "unknown_stream"
|
||||
|
||||
handoff = (
|
||||
"[WORKER_ESCALATION_REQUEST]\n"
|
||||
f"stream_id: {stream_label}\n"
|
||||
f"node_id: {node_label}\n"
|
||||
f"reason: {reason or 'unspecified'}\n"
|
||||
)
|
||||
if context:
|
||||
handoff += f"context:\n{context}\n"
|
||||
|
||||
node = executor.node_registry.get("queen")
|
||||
if node is not None and hasattr(node, "inject_event"):
|
||||
await node.inject_event(handoff, is_client_input=False)
|
||||
else:
|
||||
logger.warning("Worker handoff received but queen node not ready")
|
||||
|
||||
def _subscribe_worker_handoffs(self, session: Session, executor: Any) -> None:
    """Register the queen's handler for ``ESCALATION_REQUESTED`` events.

    Any subscription already stored on ``session.worker_handoff_sub`` is
    removed from the session's event bus first, so at most one handoff
    subscription is kept per session. The new subscription handle is stored
    back on ``session.worker_handoff_sub``.

    Args:
        session: Session whose event bus is subscribed to.
        executor: Executor passed through to ``_handle_worker_handoff``.
    """
    from framework.runtime.event_bus import EventType as _ET

    stale_sub = session.worker_handoff_sub
    if stale_sub is not None:
        session.event_bus.unsubscribe(stale_sub)
        session.worker_handoff_sub = None

    async def _on_worker_handoff(event):
        # Bind this session and executor so the bus only has to pass the event.
        await self._handle_worker_handoff(session, executor, event)

    new_sub = session.event_bus.subscribe(
        event_types=[_ET.ESCALATION_REQUESTED],
        handler=_on_worker_handoff,
    )
    session.worker_handoff_sub = new_sub
|
||||
|
||||
async def _start_queen(
|
||||
self,
|
||||
session: Session,
|
||||
@@ -427,16 +475,16 @@ class SessionManager:
|
||||
except Exception:
|
||||
logger.warning("Queen: MCP config failed to load", exc_info=True)
|
||||
|
||||
# Mode state for building/running mode switching
|
||||
# Phase state for building/running phase switching
|
||||
from framework.tools.queen_lifecycle_tools import (
|
||||
QueenModeState,
|
||||
QueenPhaseState,
|
||||
register_queen_lifecycle_tools,
|
||||
)
|
||||
|
||||
# Start in staging when the caller provided an agent, building otherwise.
|
||||
initial_mode = "staging" if worker_identity else "building"
|
||||
mode_state = QueenModeState(mode=initial_mode, event_bus=session.event_bus)
|
||||
session.mode_state = mode_state
|
||||
initial_phase = "staging" if worker_identity else "building"
|
||||
phase_state = QueenPhaseState(phase=initial_phase, event_bus=session.event_bus)
|
||||
session.phase_state = phase_state
|
||||
|
||||
# Always register lifecycle tools — they check session.worker_runtime
|
||||
# at call time, so they work even if no worker is loaded yet.
|
||||
@@ -446,7 +494,7 @@ class SessionManager:
|
||||
session_id=session.id,
|
||||
session_manager=self,
|
||||
manager_session_id=session.id,
|
||||
mode_state=mode_state,
|
||||
phase_state=phase_state,
|
||||
)
|
||||
|
||||
# Monitoring tools need concrete worker paths — only register when present
|
||||
@@ -464,11 +512,23 @@ class SessionManager:
|
||||
queen_tools = list(queen_registry.get_tools().values())
|
||||
queen_tool_executor = queen_registry.get_executor()
|
||||
|
||||
# Partition tools into mode-specific sets
|
||||
# Partition tools into phase-specific sets and import prompt segments
|
||||
from framework.agents.hive_coder.nodes import (
|
||||
_QUEEN_BUILDING_TOOLS,
|
||||
_QUEEN_RUNNING_TOOLS,
|
||||
_QUEEN_STAGING_TOOLS,
|
||||
_agent_builder_knowledge,
|
||||
_appendices,
|
||||
_queen_behavior_always,
|
||||
_queen_behavior_building,
|
||||
_queen_behavior_running,
|
||||
_queen_behavior_staging,
|
||||
_queen_identity,
|
||||
_queen_phase_7,
|
||||
_queen_style,
|
||||
_queen_tools_building,
|
||||
_queen_tools_running,
|
||||
_queen_tools_staging,
|
||||
)
|
||||
|
||||
building_names = set(_QUEEN_BUILDING_TOOLS)
|
||||
@@ -486,13 +546,12 @@ class SessionManager:
|
||||
)
|
||||
logger.info("Queen: registered tools: %s", sorted(registered_names))
|
||||
|
||||
mode_state.building_tools = [t for t in queen_tools if t.name in building_names]
|
||||
mode_state.staging_tools = [t for t in queen_tools if t.name in staging_names]
|
||||
mode_state.running_tools = [t for t in queen_tools if t.name in running_names]
|
||||
phase_state.building_tools = [t for t in queen_tools if t.name in building_names]
|
||||
phase_state.staging_tools = [t for t in queen_tools if t.name in staging_names]
|
||||
phase_state.running_tools = [t for t in queen_tools if t.name in running_names]
|
||||
|
||||
# Build queen graph with adjusted prompt + tools
|
||||
_orig_node = _queen_graph.nodes[0]
|
||||
base_prompt = _orig_node.system_prompt or ""
|
||||
|
||||
if worker_identity is None:
|
||||
worker_identity = (
|
||||
@@ -501,12 +560,44 @@ class SessionManager:
|
||||
"Handle all tasks directly using your coding tools."
|
||||
)
|
||||
|
||||
# Compose phase-specific prompts
|
||||
phase_state.prompt_building = (
|
||||
_queen_identity
|
||||
+ _agent_builder_knowledge
|
||||
+ _queen_tools_building
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_building
|
||||
+ _queen_phase_7
|
||||
+ _queen_style
|
||||
+ _appendices
|
||||
+ worker_identity
|
||||
)
|
||||
phase_state.prompt_staging = (
|
||||
_queen_identity
|
||||
+ _queen_tools_staging
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_staging
|
||||
+ _queen_style
|
||||
+ worker_identity
|
||||
)
|
||||
phase_state.prompt_running = (
|
||||
_queen_identity
|
||||
+ _queen_tools_running
|
||||
+ _queen_behavior_always
|
||||
+ _queen_behavior_running
|
||||
+ _queen_style
|
||||
+ worker_identity
|
||||
)
|
||||
|
||||
# Use the initial phase prompt as the node's system_prompt
|
||||
initial_prompt_text = phase_state.get_current_prompt()
|
||||
|
||||
registered_tool_names = set(queen_registry.get_tools().keys())
|
||||
declared_tools = _orig_node.tools or []
|
||||
available_tools = [t for t in declared_tools if t in registered_tool_names]
|
||||
|
||||
node_updates: dict = {
|
||||
"system_prompt": base_prompt + worker_identity,
|
||||
"system_prompt": initial_prompt_text,
|
||||
}
|
||||
if set(available_tools) != set(declared_tools):
|
||||
missing = sorted(set(declared_tools) - registered_tool_names)
|
||||
@@ -531,17 +622,18 @@ class SessionManager:
|
||||
storage_path=queen_dir,
|
||||
loop_config=queen_graph.loop_config,
|
||||
execution_id=session.id,
|
||||
dynamic_tools_provider=mode_state.get_current_tools,
|
||||
dynamic_tools_provider=phase_state.get_current_tools,
|
||||
dynamic_prompt_provider=phase_state.get_current_prompt,
|
||||
)
|
||||
session.queen_executor = executor
|
||||
|
||||
# Wire inject_notification so mode switches notify the queen LLM
|
||||
async def _inject_mode_notification(content: str) -> None:
|
||||
# Wire inject_notification so phase switches notify the queen LLM
|
||||
async def _inject_phase_notification(content: str) -> None:
|
||||
node = executor.node_registry.get("queen")
|
||||
if node is not None and hasattr(node, "inject_event"):
|
||||
await node.inject_event(content)
|
||||
|
||||
mode_state.inject_notification = _inject_mode_notification
|
||||
phase_state.inject_notification = _inject_phase_notification
|
||||
|
||||
# Auto-switch to staging when worker execution finishes naturally
|
||||
from framework.runtime.event_bus import EventType as _ET
|
||||
@@ -549,19 +641,20 @@ class SessionManager:
|
||||
async def _on_worker_done(event):
|
||||
if event.stream_id == "queen":
|
||||
return
|
||||
if mode_state.mode == "running":
|
||||
await mode_state.switch_to_staging(source="auto")
|
||||
if phase_state.phase == "running":
|
||||
await phase_state.switch_to_staging(source="auto")
|
||||
|
||||
session.event_bus.subscribe(
|
||||
event_types=[_ET.EXECUTION_COMPLETED, _ET.EXECUTION_FAILED],
|
||||
handler=_on_worker_done,
|
||||
)
|
||||
self._subscribe_worker_handoffs(session, executor)
|
||||
|
||||
logger.info(
|
||||
"Queen starting in %s mode with %d tools: %s",
|
||||
mode_state.mode,
|
||||
len(mode_state.get_current_tools()),
|
||||
[t.name for t in mode_state.get_current_tools()],
|
||||
"Queen starting in %s phase with %d tools: %s",
|
||||
phase_state.phase,
|
||||
len(phase_state.get_current_tools()),
|
||||
[t.name for t in phase_state.get_current_tools()],
|
||||
)
|
||||
result = await executor.execute(
|
||||
graph=queen_graph,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
+16
-16
@@ -476,7 +476,7 @@ class AdenTUI(App):
|
||||
from framework.runner.tool_registry import ToolRegistry
|
||||
from framework.runtime.core import Runtime
|
||||
from framework.tools.queen_lifecycle_tools import (
|
||||
QueenModeState,
|
||||
QueenPhaseState,
|
||||
register_queen_lifecycle_tools,
|
||||
)
|
||||
from framework.tools.worker_monitoring_tools import register_worker_monitoring_tools
|
||||
@@ -539,8 +539,8 @@ class AdenTUI(App):
|
||||
except Exception:
|
||||
log.warning("Queen: MCP config failed to load", exc_info=True)
|
||||
|
||||
# Worker is already loaded in TUI path → start in staging mode.
|
||||
mode_state = QueenModeState(mode="staging", event_bus=event_bus)
|
||||
# Worker is already loaded in TUI path -> start in staging phase.
|
||||
phase_state = QueenPhaseState(phase="staging", event_bus=event_bus)
|
||||
|
||||
register_queen_lifecycle_tools(
|
||||
queen_registry,
|
||||
@@ -548,7 +548,7 @@ class AdenTUI(App):
|
||||
event_bus=event_bus,
|
||||
storage_path=storage_path,
|
||||
session_id=session_id,
|
||||
mode_state=mode_state,
|
||||
phase_state=phase_state,
|
||||
)
|
||||
register_worker_monitoring_tools(
|
||||
queen_registry,
|
||||
@@ -560,7 +560,7 @@ class AdenTUI(App):
|
||||
queen_tools = list(queen_registry.get_tools().values())
|
||||
queen_tool_executor = queen_registry.get_executor()
|
||||
|
||||
# Partition tools into mode-specific sets
|
||||
# Partition tools into phase-specific sets
|
||||
from framework.agents.hive_coder.nodes import (
|
||||
_QUEEN_BUILDING_TOOLS,
|
||||
_QUEEN_RUNNING_TOOLS,
|
||||
@@ -570,9 +570,9 @@ class AdenTUI(App):
|
||||
building_names = set(_QUEEN_BUILDING_TOOLS)
|
||||
staging_names = set(_QUEEN_STAGING_TOOLS)
|
||||
running_names = set(_QUEEN_RUNNING_TOOLS)
|
||||
mode_state.building_tools = [t for t in queen_tools if t.name in building_names]
|
||||
mode_state.staging_tools = [t for t in queen_tools if t.name in staging_names]
|
||||
mode_state.running_tools = [t for t in queen_tools if t.name in running_names]
|
||||
phase_state.building_tools = [t for t in queen_tools if t.name in building_names]
|
||||
phase_state.staging_tools = [t for t in queen_tools if t.name in staging_names]
|
||||
phase_state.running_tools = [t for t in queen_tools if t.name in running_names]
|
||||
|
||||
# Build worker profile for queen's system prompt.
|
||||
from framework.tools.queen_lifecycle_tools import build_worker_profile
|
||||
@@ -614,23 +614,23 @@ class AdenTUI(App):
|
||||
stream_id="queen",
|
||||
storage_path=queen_dir,
|
||||
loop_config=queen_graph.loop_config,
|
||||
dynamic_tools_provider=mode_state.get_current_tools,
|
||||
dynamic_tools_provider=phase_state.get_current_tools,
|
||||
)
|
||||
self._queen_executor = executor
|
||||
|
||||
# Wire inject_notification so mode switches notify the queen LLM
|
||||
async def _inject_mode_notification(content: str) -> None:
|
||||
# Wire inject_notification so phase switches notify the queen LLM
|
||||
async def _inject_phase_notification(content: str) -> None:
|
||||
node = executor.node_registry.get("queen")
|
||||
if node is not None and hasattr(node, "inject_event"):
|
||||
await node.inject_event(content)
|
||||
|
||||
mode_state.inject_notification = _inject_mode_notification
|
||||
phase_state.inject_notification = _inject_phase_notification
|
||||
|
||||
log.info(
|
||||
"Queen starting in %s mode with %d tools: %s",
|
||||
mode_state.mode,
|
||||
len(mode_state.get_current_tools()),
|
||||
[t.name for t in mode_state.get_current_tools()],
|
||||
"Queen starting in %s phase with %d tools: %s",
|
||||
phase_state.phase,
|
||||
len(phase_state.get_current_tools()),
|
||||
[t.name for t in phase_state.get_current_tools()],
|
||||
)
|
||||
# The queen's event_loop node runs forever (continuous mode).
|
||||
# It blocks on _await_user_input() after each LLM turn,
|
||||
|
||||
@@ -12,8 +12,8 @@ export interface LiveSession {
|
||||
loaded_at: number;
|
||||
uptime_seconds: number;
|
||||
intro_message?: string;
|
||||
/** Queen operating mode — "building", "staging", or "running" */
|
||||
queen_mode?: "building" | "staging" | "running";
|
||||
/** Queen operating phase — "building", "staging", or "running" */
|
||||
queen_phase?: "building" | "staging" | "running";
|
||||
/** Present in 409 conflict responses when worker is still loading */
|
||||
loading?: boolean;
|
||||
}
|
||||
@@ -273,7 +273,7 @@ export type EventTypeName =
|
||||
| "escalation_requested"
|
||||
| "worker_loaded"
|
||||
| "credentials_required"
|
||||
| "queen_mode_changed"
|
||||
| "queen_phase_changed"
|
||||
| "subagent_report";
|
||||
|
||||
export interface AgentEvent {
|
||||
|
||||
@@ -31,7 +31,7 @@ interface AgentGraphProps {
|
||||
version?: string;
|
||||
runState?: RunState;
|
||||
building?: boolean;
|
||||
queenMode?: "building" | "staging" | "running";
|
||||
queenPhase?: "building" | "staging" | "running";
|
||||
}
|
||||
|
||||
// --- Extracted RunButton so hover state survives parent re-renders ---
|
||||
@@ -146,7 +146,7 @@ function truncateLabel(label: string, availablePx: number, fontSize: number): st
|
||||
return label.slice(0, Math.max(maxChars - 1, 1)) + "\u2026";
|
||||
}
|
||||
|
||||
export default function AgentGraph({ nodes, title: _title, onNodeClick, onRun, onPause, version, runState: externalRunState, building, queenMode }: AgentGraphProps) {
|
||||
export default function AgentGraph({ nodes, title: _title, onNodeClick, onRun, onPause, version, runState: externalRunState, building, queenPhase }: AgentGraphProps) {
|
||||
const [localRunState, setLocalRunState] = useState<RunState>("idle");
|
||||
const runState = externalRunState ?? localRunState;
|
||||
const runBtnRef = useRef<HTMLButtonElement>(null);
|
||||
@@ -278,7 +278,7 @@ export default function AgentGraph({ nodes, title: _title, onNodeClick, onRun, o
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<RunButton runState={runState} disabled={nodes.length === 0 || queenMode === "building"} onRun={handleRun} onPause={onPause ?? (() => {})} btnRef={runBtnRef} />
|
||||
<RunButton runState={runState} disabled={nodes.length === 0 || queenPhase === "building"} onRun={handleRun} onPause={onPause ?? (() => {})} btnRef={runBtnRef} />
|
||||
</div>
|
||||
<div className="flex-1 flex items-center justify-center px-5">
|
||||
{building ? (
|
||||
|
||||
@@ -38,8 +38,8 @@ interface ChatPanelProps {
|
||||
onQuestionSubmit?: (answer: string, isOther: boolean) => void;
|
||||
/** Called when user dismisses the pending question without answering */
|
||||
onQuestionDismiss?: () => void;
|
||||
/** Queen operating mode — shown as a tag on queen messages */
|
||||
queenMode?: "building" | "staging" | "running";
|
||||
/** Queen operating phase — shown as a tag on queen messages */
|
||||
queenPhase?: "building" | "staging" | "running";
|
||||
}
|
||||
|
||||
const queenColor = "hsl(45,95%,58%)";
|
||||
@@ -144,7 +144,7 @@ function ToolActivityRow({ content }: { content: string }) {
|
||||
);
|
||||
}
|
||||
|
||||
const MessageBubble = memo(function MessageBubble({ msg, queenMode }: { msg: ChatMessage; queenMode?: "building" | "staging" | "running" }) {
|
||||
const MessageBubble = memo(function MessageBubble({ msg, queenPhase }: { msg: ChatMessage; queenPhase?: "building" | "staging" | "running" }) {
|
||||
const isUser = msg.type === "user";
|
||||
const isQueen = msg.role === "queen";
|
||||
const color = getColor(msg.agent, msg.role);
|
||||
@@ -200,11 +200,11 @@ const MessageBubble = memo(function MessageBubble({ msg, queenMode }: { msg: Cha
|
||||
}`}
|
||||
>
|
||||
{isQueen
|
||||
? queenMode === "running"
|
||||
? "running mode"
|
||||
: queenMode === "staging"
|
||||
? "staging mode"
|
||||
: "building mode"
|
||||
? queenPhase === "running"
|
||||
? "running phase"
|
||||
: queenPhase === "staging"
|
||||
? "staging phase"
|
||||
: "building phase"
|
||||
: "Worker"}
|
||||
</span>
|
||||
</div>
|
||||
@@ -218,9 +218,9 @@ const MessageBubble = memo(function MessageBubble({ msg, queenMode }: { msg: Cha
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}, (prev, next) => prev.msg.id === next.msg.id && prev.msg.content === next.msg.content && prev.queenMode === next.queenMode);
|
||||
}, (prev, next) => prev.msg.id === next.msg.id && prev.msg.content === next.msg.content && prev.queenPhase === next.queenPhase);
|
||||
|
||||
export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting, isBusy, activeThread, disabled, onCancel, pendingQuestion, pendingOptions, onQuestionSubmit, onQuestionDismiss, queenMode }: ChatPanelProps) {
|
||||
export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting, isBusy, activeThread, disabled, onCancel, pendingQuestion, pendingOptions, onQuestionSubmit, onQuestionDismiss, queenPhase }: ChatPanelProps) {
|
||||
const [input, setInput] = useState("");
|
||||
const [readMap, setReadMap] = useState<Record<string, number>>({});
|
||||
const bottomRef = useRef<HTMLDivElement>(null);
|
||||
@@ -280,7 +280,7 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
|
||||
<div ref={scrollRef} onScroll={handleScroll} className="flex-1 overflow-auto px-5 py-4 space-y-3">
|
||||
{threadMessages.map((msg) => (
|
||||
<div key={msg.id}>
|
||||
<MessageBubble msg={msg} queenMode={queenMode} />
|
||||
<MessageBubble msg={msg} queenPhase={queenPhase} />
|
||||
</div>
|
||||
))}
|
||||
|
||||
|
||||
@@ -241,8 +241,8 @@ interface AgentBackendState {
|
||||
/** The message ID of the current worker input request (for inline reply box) */
|
||||
workerInputMessageId: string | null;
|
||||
queenBuilding: boolean;
|
||||
/** Queen operating mode — "building" (coding), "staging" (loaded), or "running" (executing) */
|
||||
queenMode: "building" | "staging" | "running";
|
||||
/** Queen operating phase — "building" (coding), "staging" (loaded), or "running" (executing) */
|
||||
queenPhase: "building" | "staging" | "running";
|
||||
workerRunState: "idle" | "deploying" | "running";
|
||||
currentExecutionId: string | null;
|
||||
nodeLogs: Record<string, string[]>;
|
||||
@@ -277,7 +277,7 @@ function defaultAgentState(): AgentBackendState {
|
||||
awaitingInput: false,
|
||||
workerInputMessageId: null,
|
||||
queenBuilding: false,
|
||||
queenMode: "building",
|
||||
queenPhase: "building",
|
||||
workerRunState: "idle",
|
||||
currentExecutionId: null,
|
||||
nodeLogs: {},
|
||||
@@ -317,6 +317,11 @@ export default function Workspace() {
|
||||
|
||||
if (persisted) {
|
||||
for (const tab of persisted.tabs) {
|
||||
// Skip new-agent tabs when starting fresh from home with a prompt
|
||||
// to avoid creating duplicate sessions
|
||||
if (initialPrompt && hasExplicitAgent && (tab.agentType === "new-agent" || tab.agentType.startsWith("new-agent-"))) {
|
||||
continue;
|
||||
}
|
||||
if (!initial[tab.agentType]) initial[tab.agentType] = [];
|
||||
const session = createSession(tab.agentType, tab.label);
|
||||
session.id = tab.id;
|
||||
@@ -369,7 +374,15 @@ export default function Workspace() {
|
||||
const [activeSessionByAgent, setActiveSessionByAgent] = useState<Record<string, string>>(() => {
|
||||
const persisted = loadPersistedTabs();
|
||||
if (persisted) {
|
||||
const restored = { ...persisted.activeSessionByAgent };
|
||||
let restored = { ...persisted.activeSessionByAgent };
|
||||
// Remove stale new-agent-* entries when starting fresh from home
|
||||
if (initialPrompt && hasExplicitAgent) {
|
||||
restored = Object.fromEntries(
|
||||
Object.entries(restored).filter(([key]) =>
|
||||
key !== "new-agent" && !key.startsWith("new-agent-")
|
||||
)
|
||||
);
|
||||
}
|
||||
const urlSessions = sessionsByAgent[initialAgent];
|
||||
if (urlSessions?.length) {
|
||||
// When a prompt was submitted from home, activate the newly created
|
||||
@@ -518,6 +531,10 @@ export default function Workspace() {
|
||||
// --- Agent loading: loadAgentForType ---
|
||||
const loadingRef = useRef(new Set<string>());
|
||||
const loadAgentForType = useCallback(async (agentType: string) => {
|
||||
// Ref-based guard: prevents double-load from React StrictMode (must be first check)
|
||||
if (loadingRef.current.has(agentType)) return;
|
||||
loadingRef.current.add(agentType);
|
||||
|
||||
if (agentType === "new-agent" || agentType.startsWith("new-agent-")) {
|
||||
// Create a queen-only session (no worker) for agent building
|
||||
updateAgentState(agentType, { loading: true, error: null, ready: false, sessionId: null });
|
||||
@@ -592,10 +609,6 @@ export default function Workspace() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Ref-based guard: prevents double-load from React StrictMode
|
||||
if (loadingRef.current.has(agentType)) return;
|
||||
loadingRef.current.add(agentType);
|
||||
|
||||
updateAgentState(agentType, { loading: true, error: null, ready: false, sessionId: null });
|
||||
|
||||
try {
|
||||
@@ -677,12 +690,12 @@ export default function Workspace() {
|
||||
// failed, the throw inside the catch exits the outer try block.
|
||||
const session = liveSession!;
|
||||
const displayName = formatAgentDisplayName(session.worker_name || agentType);
|
||||
const initialMode = session.queen_mode || (session.has_worker ? "staging" : "building");
|
||||
const initialPhase = session.queen_phase || (session.has_worker ? "staging" : "building");
|
||||
updateAgentState(agentType, {
|
||||
sessionId: session.session_id,
|
||||
displayName,
|
||||
queenMode: initialMode,
|
||||
queenBuilding: initialMode === "building",
|
||||
queenPhase: initialPhase,
|
||||
queenBuilding: initialPhase === "building",
|
||||
});
|
||||
|
||||
// Update the session label
|
||||
@@ -1296,7 +1309,7 @@ export default function Workspace() {
|
||||
case "tool_call_started": {
|
||||
console.log('[TOOL_PILL] tool_call_started received:', { isQueen, nodeId: event.node_id, streamId: event.stream_id, agentType, executionId: event.execution_id, toolName: event.data?.tool_name });
|
||||
|
||||
// queenBuilding is now driven by queen_mode_changed events
|
||||
// queenBuilding is now driven by queen_phase_changed events
|
||||
|
||||
if (event.node_id) {
|
||||
if (!isQueen) {
|
||||
@@ -1531,15 +1544,15 @@ export default function Workspace() {
|
||||
break;
|
||||
}
|
||||
|
||||
case "queen_mode_changed": {
|
||||
const rawMode = event.data?.mode as string;
|
||||
const newMode: "building" | "staging" | "running" =
|
||||
rawMode === "running" ? "running" : rawMode === "staging" ? "staging" : "building";
|
||||
case "queen_phase_changed": {
|
||||
const rawPhase = event.data?.phase as string;
|
||||
const newPhase: "building" | "staging" | "running" =
|
||||
rawPhase === "running" ? "running" : rawPhase === "staging" ? "staging" : "building";
|
||||
updateAgentState(agentType, {
|
||||
queenMode: newMode,
|
||||
queenBuilding: newMode === "building",
|
||||
// Sync workerRunState so the RunButton reflects the mode
|
||||
workerRunState: newMode === "running" ? "running" : "idle",
|
||||
queenPhase: newPhase,
|
||||
queenBuilding: newPhase === "building",
|
||||
// Sync workerRunState so the RunButton reflects the phase
|
||||
workerRunState: newPhase === "running" ? "running" : "idle",
|
||||
});
|
||||
break;
|
||||
}
|
||||
@@ -1975,7 +1988,7 @@ export default function Workspace() {
|
||||
onPause={handlePause}
|
||||
runState={activeAgentState?.workerRunState ?? "idle"}
|
||||
building={activeAgentState?.queenBuilding ?? false}
|
||||
queenMode={activeAgentState?.queenMode ?? "building"}
|
||||
queenPhase={activeAgentState?.queenPhase ?? "building"}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
@@ -2045,7 +2058,7 @@ export default function Workspace() {
|
||||
(activeAgentState?.loading ?? true) ||
|
||||
!(activeAgentState?.queenReady)
|
||||
}
|
||||
queenMode={activeAgentState?.queenMode ?? "building"}
|
||||
queenPhase={activeAgentState?.queenPhase ?? "building"}
|
||||
pendingQuestion={activeAgentState?.awaitingInput ? activeAgentState.pendingQuestion : null}
|
||||
pendingOptions={activeAgentState?.awaitingInput ? activeAgentState.pendingOptions : null}
|
||||
onQuestionSubmit={
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
"""Worker-autonomy guardrails for agent_builder_server."""
|
||||
|
||||
import json
|
||||
|
||||
from framework.graph import Goal, NodeSpec
|
||||
from framework.mcp import agent_builder_server as builder
|
||||
|
||||
|
||||
def _make_session(name: str = "autonomy_test"):
    """Return a ``BuildSession`` called *name* that already carries a goal.

    The attached goal has an empty ``success_criteria`` list and an empty
    ``constraints`` list.
    """
    build_session = builder.BuildSession(name=name)
    goal_fields = {
        "id": "g1",
        "name": "Autonomy Goal",
        "description": "Workers stay autonomous.",
        "success_criteria": [],
        "constraints": [],
    }
    build_session.goal = Goal(**goal_fields)
    return build_session
|
||||
|
||||
|
||||
def test_add_node_rejects_client_facing_event_loop(monkeypatch):
    """``add_node`` reports an invalid result for a client_facing event_loop node."""
    # Swap in a fresh session and make persistence a no-op for this test.
    monkeypatch.setattr(builder, "_session", _make_session())
    monkeypatch.setattr(builder, "_save_session", lambda _: None)

    node_kwargs = {
        "node_id": "worker",
        "name": "Worker",
        "description": "Autonomous worker node",
        "node_type": "event_loop",
        "input_keys": '["task"]',
        "output_keys": '["result"]',
        "system_prompt": "Do work.",
        "client_facing": True,
    }
    response = json.loads(builder.add_node(**node_kwargs))

    assert response["valid"] is False
    assert any("client_facing=True" in message for message in response["errors"])
|
||||
|
||||
|
||||
def test_validate_graph_rejects_client_facing_event_loop(monkeypatch):
    """``validate_graph`` flags a session whose event_loop node is client_facing."""
    client_facing_worker = NodeSpec(
        id="worker",
        name="Worker",
        description="Autonomous worker node",
        node_type="event_loop",
        client_facing=True,
        input_keys=[],
        output_keys=[],
        system_prompt="Do work.",
    )
    build_session = _make_session()
    build_session.nodes = [client_facing_worker]
    monkeypatch.setattr(builder, "_session", build_session)

    report = json.loads(builder.validate_graph())

    assert report["valid"] is False
    assert any("must not be client_facing" in message for message in report["errors"])
|
||||
@@ -31,6 +31,7 @@ from framework.llm.stream_events import (
|
||||
)
|
||||
from framework.runtime.core import Runtime
|
||||
from framework.runtime.event_bus import EventBus, EventType
|
||||
from framework.server.session_manager import Session, SessionManager
|
||||
from framework.storage.conversation_store import FileConversationStore
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -137,7 +138,16 @@ def memory():
|
||||
return SharedMemory()
|
||||
|
||||
|
||||
def build_ctx(runtime, node_spec, memory, llm, tools=None, input_data=None, goal_context=""):
|
||||
def build_ctx(
|
||||
runtime,
|
||||
node_spec,
|
||||
memory,
|
||||
llm,
|
||||
tools=None,
|
||||
input_data=None,
|
||||
goal_context="",
|
||||
stream_id=None,
|
||||
):
|
||||
"""Build a NodeContext for testing."""
|
||||
return NodeContext(
|
||||
runtime=runtime,
|
||||
@@ -148,6 +158,7 @@ def build_ctx(runtime, node_spec, memory, llm, tools=None, input_data=None, goal
|
||||
llm=llm,
|
||||
available_tools=tools or [],
|
||||
goal_context=goal_context,
|
||||
stream_id=stream_id,
|
||||
)
|
||||
|
||||
|
||||
@@ -708,6 +719,180 @@ class TestClientFacingBlocking:
|
||||
tool_names = [t.name for t in (call["tools"] or [])]
|
||||
assert "ask_user" not in tool_names
|
||||
|
||||
@pytest.mark.asyncio
async def test_escalate_to_coder_available_for_worker_stream(self, runtime, memory):
    """A node run with stream_id="worker" is offered the escalate_to_coder tool."""
    worker_spec = NodeSpec(
        id="internal",
        name="Internal",
        description="internal node",
        node_type="event_loop",
        output_keys=[],
    )
    mock_llm = MockStreamingLLM(scenarios=[text_scenario("thinking...")])
    loop_node = EventLoopNode(config=LoopConfig(max_iterations=2))

    await loop_node.execute(
        build_ctx(runtime, worker_spec, memory, mock_llm, stream_id="worker")
    )

    # The LLM must have been called at least once before its tools are inspected.
    assert mock_llm._call_index >= 1
    first_call_tools = mock_llm.stream_calls[0]["tools"] or []
    offered_names = [tool.name for tool in first_call_tools]
    assert "escalate_to_coder" in offered_names
|
||||
|
||||
@pytest.mark.asyncio
async def test_escalate_to_coder_not_available_for_queen_stream(self, runtime, memory):
    """A node run with stream_id="queen" is not offered the escalate_to_coder tool."""
    queen_spec = NodeSpec(
        id="queen",
        name="Queen",
        description="queen node",
        node_type="event_loop",
        output_keys=[],
    )
    mock_llm = MockStreamingLLM(scenarios=[text_scenario("monitoring...")])
    loop_node = EventLoopNode(config=LoopConfig(max_iterations=2))

    await loop_node.execute(
        build_ctx(runtime, queen_spec, memory, mock_llm, stream_id="queen")
    )

    # The LLM must have been called at least once before its tools are inspected.
    assert mock_llm._call_index >= 1
    first_call_tools = mock_llm.stream_calls[0]["tools"] or []
    offered_names = [tool.name for tool in first_call_tools]
    assert "escalate_to_coder" not in offered_names
|
||||
|
||||
|
||||
class TestEscalateToCoder:
    """Tests that drive the ``escalate_to_coder`` tool through ``EventLoopNode.execute``."""

    @pytest.mark.asyncio
    async def test_escalate_to_coder_emits_event(self, runtime, node_spec, memory):
        """escalate_to_coder() should publish ESCALATION_REQUESTED."""
        node_spec.output_keys = []
        llm = MockStreamingLLM(
            scenarios=[
                tool_call_scenario(
                    "escalate_to_coder",
                    {
                        "reason": "tool failure",
                        "context": "HTTP 401 from upstream",
                        "wait_for_response": False,
                    },
                    tool_use_id="escalate_1",
                ),
                text_scenario("Escalated to queen."),
            ]
        )
        bus = EventBus()
        received = []

        async def capture(event):
            received.append(event)

        bus.subscribe(event_types=[EventType.ESCALATION_REQUESTED], handler=capture)

        ctx = build_ctx(runtime, node_spec, memory, llm, stream_id="worker")
        node = EventLoopNode(event_bus=bus, config=LoopConfig(max_iterations=5))
        result = await node.execute(ctx)

        assert result.success is True
        assert len(received) == 1
        assert received[0].type == EventType.ESCALATION_REQUESTED
        assert received[0].data["reason"] == "tool failure"
        assert "HTTP 401" in received[0].data["context"]

    @pytest.mark.asyncio
    async def test_escalate_to_coder_handoff_reaches_queen(self, runtime, node_spec, memory):
        """Worker escalation should reach the queen via the SessionManager handoff subscription."""
        node_spec.output_keys = []
        llm = MockStreamingLLM(
            scenarios=[
                tool_call_scenario(
                    "escalate_to_coder",
                    {
                        "reason": "blocked",
                        "context": "dependency missing",
                        "wait_for_response": False,
                    },
                    tool_use_id="escalate_1",
                ),
                text_scenario("Escalation sent."),
            ]
        )
        bus = EventBus()

        # Stand-in queen: only inject_event is needed, and it is looked up
        # through the executor's node_registry under the "queen" key.
        manager = SessionManager()
        session = Session(id="handoff_test", event_bus=bus, llm=object(), loaded_at=0.0)
        queen_node = MagicMock()
        queen_node.inject_event = AsyncMock()
        queen_executor = MagicMock()
        queen_executor.node_registry = {"queen": queen_node}
        manager._subscribe_worker_handoffs(session, queen_executor)

        ctx = build_ctx(runtime, node_spec, memory, llm, stream_id="worker")
        node = EventLoopNode(event_bus=bus, config=LoopConfig(max_iterations=5))
        result = await node.execute(ctx)

        assert result.success is True
        queen_node.inject_event.assert_awaited_once()
        injected = queen_node.inject_event.await_args.args[0]
        kwargs = queen_node.inject_event.await_args.kwargs
        assert "[WORKER_ESCALATION_REQUEST]" in injected
        assert "stream_id: worker" in injected
        assert "node_id: test_loop" in injected
        assert "reason: blocked" in injected
        assert "dependency missing" in injected
        assert kwargs["is_client_input"] is False

    @pytest.mark.asyncio
    async def test_escalate_waits_for_queen_input_and_skips_judge(
        self, runtime, node_spec, memory
    ):
        """wait_for_response=true should block for queen input before judge evaluation."""
        node_spec.output_keys = ["result"]
        llm = MockStreamingLLM(
            scenarios=[
                tool_call_scenario(
                    "escalate_to_coder",
                    {
                        "reason": "need direction",
                        "context": "conflicting constraints",
                        "wait_for_response": True,
                    },
                    tool_use_id="escalate_1",
                ),
                tool_call_scenario(
                    "set_output",
                    {"key": "result", "value": "resolved after queen guidance"},
                    tool_use_id="set_1",
                ),
                text_scenario("Completed."),
            ]
        )
        bus = EventBus()
        client_input_events = []

        async def capture_input(event):
            client_input_events.append(event)

        bus.subscribe(event_types=[EventType.CLIENT_INPUT_REQUESTED], handler=capture_input)

        judge = AsyncMock(spec=JudgeProtocol)
        judge.evaluate = AsyncMock(return_value=JudgeVerdict(action="ACCEPT"))

        ctx = build_ctx(runtime, node_spec, memory, llm, stream_id="worker")
        node = EventLoopNode(judge=judge, event_bus=bus, config=LoopConfig(max_iterations=5))

        judge_awaits_before_reply: list = []

        async def queen_reply():
            await asyncio.sleep(0.05)
            # Record the count here and assert after execute() returns. An
            # assert that failed inside this task would skip inject_event, so a
            # node still waiting for the reply would hang instead of failing.
            judge_awaits_before_reply.append(judge.evaluate.await_count)
            await node.inject_event("Use fallback mode and continue.")

        task = asyncio.create_task(queen_reply())
        result = await node.execute(ctx)
        await task

        # The judge must not have been consulted before the queen replied.
        assert judge_awaits_before_reply == [0]
        assert result.success is True
        assert result.output["result"] == "resolved after queen guidance"
        assert judge.evaluate.await_count >= 1
        # Waiting on the queen must not surface as a client input request.
        assert len(client_input_events) == 0
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Client-facing: _cf_expecting_work state machine
|
||||
@@ -1765,6 +1950,71 @@ class TestToolDoomLoopIntegration:
|
||||
assert len(doom_events) == 1
|
||||
assert "search" in doom_events[0].data["description"]
|
||||
|
||||
@pytest.mark.asyncio
async def test_client_facing_worker_doom_loop_escalates_to_queen(
    self,
    runtime,
    memory,
):
    """Client-facing worker doom loops should escalate instead of blocking for user input."""
    worker_spec = NodeSpec(
        id="worker",
        name="Worker",
        description="worker node",
        node_type="event_loop",
        output_keys=[],
        client_facing=True,
    )
    judge = AsyncMock(spec=JudgeProtocol)
    verdict_calls = 0

    async def judge_eval(*args, **kwargs):
        # RETRY for the first three evaluations, ACCEPT from the fourth on.
        nonlocal verdict_calls
        verdict_calls += 1
        action = "ACCEPT" if verdict_calls >= 4 else "RETRY"
        return JudgeVerdict(action=action)

    judge.evaluate = judge_eval

    repeating_llm = ToolRepeatLLM("search", {"q": "hello"}, tool_turns=3)
    event_bus = EventBus()
    escalation_events: list = []
    event_bus.subscribe(
        event_types=[EventType.ESCALATION_REQUESTED],
        handler=lambda evt: escalation_events.append(evt),
    )

    def tool_exec(tool_use: ToolUse) -> ToolResult:
        # Every tool call succeeds with the same canned content.
        canned = ToolResult(
            tool_use_id=tool_use.id,
            content="result",
            is_error=False,
        )
        return canned

    search_tool = Tool(name="search", description="s", parameters={})
    ctx = build_ctx(
        runtime,
        worker_spec,
        memory,
        repeating_llm,
        tools=[search_tool],
        stream_id="worker",
    )
    loop_config = LoopConfig(
        max_iterations=10,
        tool_doom_loop_threshold=3,
    )
    node = EventLoopNode(
        judge=judge,
        tool_executor=tool_exec,
        event_bus=event_bus,
        config=loop_config,
    )
    result = await node.execute(ctx)

    assert result.success is True
    assert len(escalation_events) >= 1
    assert escalation_events[0].data["reason"] == "Tool doom loop detected"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_doom_loop_disabled(
|
||||
self,
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.runtime.event_bus import EventBus
|
||||
from framework.server.session_manager import Session, SessionManager
|
||||
|
||||
|
||||
def _make_session(event_bus: EventBus, session_id: str = "session_handoff") -> Session:
    """Build a ``Session`` on *event_bus* with a placeholder ``llm`` object and ``loaded_at=0.0``."""
    session_fields = {
        "id": session_id,
        "event_bus": event_bus,
        "llm": object(),
        "loaded_at": 0.0,
    }
    return Session(**session_fields)
|
||||
|
||||
|
||||
def _make_executor(queen_node) -> SimpleNamespace:
|
||||
node_registry = {}
|
||||
if queen_node is not None:
|
||||
node_registry["queen"] = queen_node
|
||||
return SimpleNamespace(node_registry=node_registry)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_worker_handoff_injects_formatted_request_into_queen() -> None:
    """A worker escalation is injected into the queen node as a formatted request."""
    event_bus = EventBus()
    session_manager = SessionManager()
    handoff_session = _make_session(event_bus)

    queen = SimpleNamespace(inject_event=AsyncMock())
    session_manager._subscribe_worker_handoffs(handoff_session, _make_executor(queen))

    await event_bus.emit_escalation_requested(
        stream_id="worker_a",
        node_id="research_node",
        reason="Credential wall",
        context="HTTP 401 while calling external API",
        execution_id="exec_123",
    )

    queen.inject_event.assert_awaited_once()
    call = queen.inject_event.await_args
    message = call.args[0]

    # Each emitted field must show up in the injected text.
    expected_fragments = (
        "[WORKER_ESCALATION_REQUEST]",
        "stream_id: worker_a",
        "node_id: research_node",
        "reason: Credential wall",
        "context:\nHTTP 401 while calling external API",
    )
    for fragment in expected_fragments:
        assert fragment in message
    assert call.kwargs["is_client_input"] is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_worker_handoff_ignores_queen_and_judge_streams() -> None:
    """Escalations emitted on the "queen" or "judge" stream never reach the queen node."""
    event_bus = EventBus()
    session_manager = SessionManager()
    handoff_session = _make_session(event_bus)

    queen = SimpleNamespace(inject_event=AsyncMock())
    session_manager._subscribe_worker_handoffs(handoff_session, _make_executor(queen))

    # Emit once per non-worker stream, using the stream name as the node id too.
    for non_worker in ("queen", "judge"):
        await event_bus.emit_escalation_requested(
            stream_id=non_worker,
            node_id=non_worker,
            reason="should be ignored",
        )

    assert queen.inject_event.await_count == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_worker_handoff_resubscribe_replaces_previous_subscription() -> None:
    """Subscribing a second time swaps the handoff subscription to the new queen node."""
    event_bus = EventBus()
    session_manager = SessionManager()
    handoff_session = _make_session(event_bus)

    stale_queen = SimpleNamespace(inject_event=AsyncMock())
    session_manager._subscribe_worker_handoffs(handoff_session, _make_executor(stale_queen))
    original_sub = handoff_session.worker_handoff_sub
    assert original_sub is not None

    fresh_queen = SimpleNamespace(inject_event=AsyncMock())
    session_manager._subscribe_worker_handoffs(handoff_session, _make_executor(fresh_queen))
    replacement_sub = handoff_session.worker_handoff_sub

    # The session holds a new subscription and the bus no longer tracks the old one.
    assert replacement_sub is not None
    assert replacement_sub != original_sub
    assert original_sub not in event_bus._subscriptions

    await event_bus.emit_escalation_requested(
        stream_id="worker_b",
        node_id="planner",
        reason="stuck",
    )

    assert stale_queen.inject_event.await_count == 0
    fresh_queen.inject_event.assert_awaited_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_stop_session_unsubscribes_worker_handoff() -> None:
    """After ``stop_session`` the queen node receives no further worker escalations."""
    event_bus = EventBus()
    session_manager = SessionManager()
    handoff_session = _make_session(event_bus, session_id="session_stop")

    queen = SimpleNamespace(inject_event=AsyncMock())
    session_manager._subscribe_worker_handoffs(handoff_session, _make_executor(queen))
    # Register the session by id so stop_session() can be called with that id.
    session_manager._sessions[handoff_session.id] = handoff_session

    escalation = {"stream_id": "worker_main", "node_id": "node_1"}

    await event_bus.emit_escalation_requested(reason="before stop", **escalation)
    assert queen.inject_event.await_count == 1

    was_stopped = await session_manager.stop_session(handoff_session.id)
    assert was_stopped is True
    assert handoff_session.worker_handoff_sub is None

    # The count stays at one: the post-stop escalation is not delivered.
    await event_bus.emit_escalation_requested(reason="after stop", **escalation)
    assert queen.inject_event.await_count == 1
|
||||
@@ -25,7 +25,7 @@ Done. For details, prerequisites, and troubleshooting, read on.
|
||||
|
||||
- **agent-builder** – Create and manage agents (goals, nodes, edges).
|
||||
- **tools** – File operations, web search, and other agent tools.
|
||||
- **Skills** – Guided docs for building and testing agents (in `.agent/skills/` or `.claude/skills/`).
|
||||
- **Documentation** – Guided docs for building and testing agents.
|
||||
|
||||
---
|
||||
|
||||
@@ -80,18 +80,9 @@ That writes `~/.claude/mcp.json` as well.
|
||||
|
||||
**Prefer to do it manually?** See [Manual MCP config](#manual-mcp-config-template) below. You’ll create `~/.gemini/mcp.json` (or `~/.claude/mcp.json`) with absolute paths to your repo’s `core` and `tools` folders.
|
||||
|
||||
### Step 3: Use skills
|
||||
### Step 3: Use MCP tools + docs
|
||||
|
||||
Skills are guides (workflow, building, testing) in `.agent/skills/` (they point to `.claude/skills/`). If Antigravity doesn’t show a “skills” UI, open those folders in the project and use the files as reference while you use the MCP tools.
|
||||
|
||||
| Skill | What it's for |
|
||||
|-------|----------------|
|
||||
| **hive** | End-to-end workflow for building and testing agents |
|
||||
| **hive-concepts** | Core ideas for goal-driven agents |
|
||||
| **hive-create** | Step-by-step agent construction |
|
||||
| **hive-patterns** | Patterns and best practices |
|
||||
| **hive-test** | Goal-based evaluation and testing |
|
||||
| **hive-credentials** | Set up and manage agent credentials |
|
||||
Use the `agent-builder` and `tools` MCP servers in Antigravity, and use docs in `docs/` for workflow guidance.
|
||||
|
||||
---
|
||||
|
||||
@@ -100,7 +91,6 @@ Skills are guides (workflow, building, testing) in `.agent/skills/` (they point
|
||||
```
|
||||
.agent/
|
||||
├── mcp_config.json # Template for MCP servers (agent-builder, tools)
|
||||
└── skills/ # Symlinks to .claude/skills/
|
||||
```
|
||||
|
||||
The **setup script** writes your **user** config (`~/.gemini/antigravity/mcp_config.json`) using paths from **this repo**. The file in `.agent/` is the template; Antigravity itself uses the file in your home directory.
|
||||
@@ -123,24 +113,24 @@ The **setup script** writes your **user** config (`~/.gemini/antigravity/mcp_con
|
||||
- Open the **repo root** as the project in the IDE (the folder that has `core/` and `tools/`).
|
||||
- If you edited `~/.gemini/antigravity/mcp_config.json` by hand, make sure `--directory` paths are **absolute** (e.g. `/Users/you/hive/core` and `/Users/you/hive/tools`).
|
||||
|
||||
**Skills don’t show up in the UI**
|
||||
**MCP tools don’t show up in the UI**
|
||||
|
||||
- Antigravity may not have a dedicated “skills” panel. Use the files in `.claude/skills/` or `.agent/skills/` as docs; the MCP tools (agent-builder, tools) still work.
|
||||
- Antigravity may need a restart. Use the files in `docs/` as documentation; the MCP tools (`agent-builder`, `tools`) are the required integration point.
|
||||
|
||||
---
|
||||
|
||||
## Verification prompt (optional)
|
||||
|
||||
Paste this into Antigravity to check that MCP and skills are set up. It doesn’t use your machine’s paths; anyone can use it.
|
||||
Paste this into Antigravity to check that MCP is set up. It doesn’t use your machine’s paths; anyone can use it.
|
||||
|
||||
```
|
||||
Check the Hive + Antigravity integration:
|
||||
|
||||
1. MCP: List available MCP servers/tools. Confirm that "agent-builder" and "tools" (or equivalent) are connected. If not, tell the user to run ./scripts/setup-antigravity-mcp.sh from the hive repo root, then restart Antigravity (see docs/antigravity-setup.md).
|
||||
|
||||
2. Skills: Confirm that the project has .agent/skills/ (or .claude/skills/) with: hive, hive-concepts, hive-create, hive-patterns, hive-test, hive-credentials.
|
||||
2. Docs: Confirm that the project has `docs/` with setup/developer guides for the workflow.
|
||||
|
||||
3. Result: Reply with PASS (MCP + skills OK), PARTIAL (only skills or only MCP), or FAIL (neither), and one line on what to fix if not PASS.
|
||||
3. Result: Reply with PASS (MCP OK), PARTIAL (some MCP tools missing), or FAIL (MCP unavailable), and one line on what to fix if not PASS.
|
||||
```
|
||||
|
||||
If you get **PARTIAL** or **FAIL** (e.g. some MCP tools are missing or MCP is not connected), run `./scripts/setup-antigravity-mcp.sh` from the repo root and restart Antigravity.
|
||||
@@ -178,13 +168,10 @@ Make sure `uv` is installed and available in your PATH. Note: Use `--directory`
|
||||
|
||||
From the **repo root**:
|
||||
|
||||
**Check that config and skills exist**
|
||||
**Check that config exists**
|
||||
|
||||
```bash
|
||||
test -f .agent/mcp_config.json && echo "OK: mcp_config.json" || echo "MISSING"
|
||||
for s in hive hive-concepts hive-create hive-patterns hive-test hive-credentials; do
|
||||
test -L .agent/skills/$s && test -d .agent/skills/$s && echo "OK: $s" || echo "BROKEN: $s"
|
||||
done
|
||||
```
|
||||
|
||||
**Check that the config is valid JSON**
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
| File | Action |
|
||||
|---|---|
|
||||
| `.claude/skills/hive-create/SKILL.md` | Remove `"function"` from node type table (~L495, L856) |
|
||||
| `docs/developer-guide.md` | Remove `"function"` from node type table (~L495, L856) |
|
||||
| `docs/developer-guide.md` | Remove `"function"` node type reference (~L613) |
|
||||
| `core/MCP_SERVER_GUIDE.md` | Audit for `"function"` references |
|
||||
| `docs/why-conditional-edge-priority.md` | Remove or repurpose (entire doc framed around function nodes) |
|
||||
|
||||
@@ -104,11 +104,11 @@ The `.cursorrules` file at the repo root tells Cursor's AI the project's style r
|
||||
|
||||
### Antigravity IDE
|
||||
|
||||
Antigravity IDE (Google's AI-powered IDE) is supported via `.antigravity/mcp_config.json` and `.antigravity/skills/` (symlinks to `.claude/skills/`). See [antigravity-setup.md](antigravity-setup.md) for setup and troubleshooting.
|
||||
Antigravity IDE (Google's AI-powered IDE) is supported via `.antigravity/mcp_config.json`. See [antigravity-setup.md](antigravity-setup.md) for setup and troubleshooting.
|
||||
|
||||
### Codex CLI
|
||||
|
||||
Codex CLI (OpenAI, v0.101.0+) is supported via `.codex/config.toml` (MCP server config) and `.agents/skills/` (symlinks to `.claude/skills/`). These files are tracked in git. Run `codex` in the repo root and type `use hive` to start. See the [Codex CLI section in the README](../README.md#codex-cli) for details.
|
||||
Codex CLI (OpenAI, v0.101.0+) is supported via `.codex/config.toml` (MCP server config). This file is tracked in git. Run `codex` in the repo root to use the configured MCP tools. See the [Codex CLI section in the README](../README.md#codex-cli) for details.
|
||||
|
||||
---
|
||||
|
||||
|
||||
+20
-38
@@ -101,22 +101,16 @@ Get API keys:
|
||||
./quickstart.sh
|
||||
```
|
||||
|
||||
This installs agent-related Claude Code skills:
|
||||
|
||||
- `/hive` - Complete workflow for building agents
|
||||
- `/hive-create` - Step-by-step agent building
|
||||
- `/hive-concepts` - Fundamental agent concepts
|
||||
- `/hive-patterns` - Best practices and design patterns
|
||||
- `/hive-test` - Test and validate agents
|
||||
This sets up agent-builder and tools MCP workflows.
|
||||
|
||||
### Cursor IDE Support
|
||||
|
||||
Skills are also available in Cursor. To enable:
|
||||
MCP tools are also available in Cursor. To enable:
|
||||
|
||||
1. Open Command Palette (`Cmd+Shift+P` / `Ctrl+Shift+P`)
|
||||
2. Run `MCP: Enable` to enable MCP servers
|
||||
3. Restart Cursor to load the MCP servers from `.cursor/mcp.json`
|
||||
4. Type `/` in Agent chat and search for skills (e.g., `/hive-create`)
|
||||
4. Open Agent chat and verify MCP tools are available
|
||||
|
||||
### Codex CLI Support
|
||||
|
||||
@@ -124,15 +118,14 @@ Hive supports [OpenAI Codex CLI](https://github.com/openai/codex) (v0.101.0+).
|
||||
|
||||
Configuration files are tracked in git:
|
||||
- `.codex/config.toml` — MCP server config (`agent-builder`)
|
||||
- `.agents/skills/` — Symlinks to Hive skills
|
||||
|
||||
To use Codex with Hive:
|
||||
1. Run `codex` in the repo root
|
||||
2. Type `use hive` to start the agent workflow
|
||||
2. Start the configured MCP-assisted workflow
|
||||
|
||||
Example:
|
||||
```
|
||||
codex> use hive
|
||||
Start Codex in the repo root and use the configured MCP tools
|
||||
```
|
||||
|
||||
|
||||
@@ -153,7 +146,7 @@ uv run python -c "import framework; print('✓ framework OK')"
|
||||
uv run python -c "import aden_tools; print('✓ aden_tools OK')"
|
||||
uv run python -c "import litellm; print('✓ litellm OK')"
|
||||
|
||||
# Run an agent (after building one via /hive-create)
|
||||
# Run an agent (after building one with agent-builder)
|
||||
PYTHONPATH=exports uv run python -m your_agent_name validate
|
||||
```
|
||||
|
||||
@@ -176,17 +169,8 @@ hive/ # Repository root
|
||||
│ ├── PULL_REQUEST_TEMPLATE.md # PR description template
|
||||
│ └── CODEOWNERS # Auto-assign reviewers
|
||||
│
|
||||
├── .claude/ # Claude Code Skills
|
||||
│ └── skills/ # Skills for building
|
||||
│ ├── hive/ # Complete workflow
|
||||
│ ├── hive-create/ # Step-by-step build guide
|
||||
│ ├── hive-concepts/ # Fundamental concepts
|
||||
│ ├── hive-patterns/ # Best practices
|
||||
│ └── hive-test/ # Test and validate agents
|
||||
├── .codex/ # Codex CLI project config
|
||||
│ └── config.toml # Codex MCP server definitions
|
||||
├── .agents/ # Shared skill mountpoint
|
||||
│ └── skills/ # Symlinks to Hive skills
|
||||
│
|
||||
├── core/ # CORE FRAMEWORK PACKAGE
|
||||
│ ├── framework/ # Main package code
|
||||
@@ -222,7 +206,7 @@ hive/ # Repository root
|
||||
│ └── README.md # Tools documentation
|
||||
│
|
||||
├── exports/ # AGENT PACKAGES (user-created, gitignored)
|
||||
│ └── your_agent_name/ # Created via /hive-create
|
||||
│ └── your_agent_name/ # Created via agent-builder workflow
|
||||
│
|
||||
├── examples/ # Example agents
|
||||
│ └── templates/ # Pre-built template agents
|
||||
@@ -251,19 +235,16 @@ hive/ # Repository root
|
||||
|
||||
## Building Agents
|
||||
|
||||
### Using Claude Code Skills
|
||||
### Using Agent Builder Workflow
|
||||
|
||||
The fastest way to build agents is using the Claude Code skills:
|
||||
The fastest way to build agents is with the configured MCP workflow:
|
||||
|
||||
```bash
|
||||
# Install skills (one-time)
|
||||
# Install dependencies (one-time)
|
||||
./quickstart.sh
|
||||
|
||||
# Build a new agent
|
||||
claude> /hive
|
||||
|
||||
# Test the agent
|
||||
claude> /hive-test
|
||||
Use the agent-builder MCP tools from your IDE agent chat
|
||||
```
|
||||
|
||||
### Agent Development Workflow
|
||||
@@ -271,19 +252,19 @@ claude> /hive-test
|
||||
1. **Define Your Goal**
|
||||
|
||||
```
|
||||
claude> /hive
|
||||
Use the agent-builder workflow
|
||||
Enter goal: "Build an agent that processes customer support tickets"
|
||||
```
|
||||
|
||||
2. **Design the Workflow**
|
||||
|
||||
- The skill guides you through defining nodes
|
||||
- The workflow guides you through defining nodes
|
||||
- Each node is a unit of work (LLM call with event_loop)
|
||||
- Edges define how execution flows
|
||||
|
||||
3. **Generate the Agent**
|
||||
|
||||
- The skill generates a complete Python package in `exports/`
|
||||
- The workflow generates a complete Python package in `exports/`
|
||||
- Includes: `agent.json`, `tools.py`, `README.md`
|
||||
|
||||
4. **Validate the Agent**
|
||||
@@ -293,8 +274,9 @@ claude> /hive-test
|
||||
```
|
||||
|
||||
5. **Test the Agent**
|
||||
```
|
||||
claude> /hive-test
|
||||
Run tests with:
|
||||
```bash
|
||||
PYTHONPATH=exports uv run python -m your_agent_name test
|
||||
```
|
||||
|
||||
### Manual Agent Development
|
||||
@@ -351,11 +333,11 @@ hive run exports/my_agent --tui
|
||||
|
||||
## Testing Agents
|
||||
|
||||
### Using the Testing Agent Skill
|
||||
### Using Built-in Test Commands
|
||||
|
||||
```bash
|
||||
# Run tests for an agent
|
||||
claude> /hive-test
|
||||
PYTHONPATH=exports uv run python -m agent_name test
|
||||
```
|
||||
|
||||
This generates and runs:
|
||||
@@ -573,7 +555,7 @@ uv add <package>
|
||||
|
||||
```bash
|
||||
# Option 1: Use the agent-builder MCP workflow from your IDE agent chat (recommended)
|
||||
claude> /hive
|
||||
Use the agent-builder workflow
|
||||
|
||||
# Option 2: Create manually
|
||||
# Note: exports/ is initially empty (gitignored). Create your agent directory:
|
||||
|
||||
+17
-24
@@ -112,7 +112,7 @@ uv run python -c "import litellm; print('✓ litellm OK')"
|
||||
|
||||
### API Keys
|
||||
|
||||
We recommend using quickstart.sh for LLM API credential setup and /hive-credentials for the tools credentials
|
||||
We recommend using `quickstart.sh` for LLM API credential setup and the credentials UI/tooling for tool credentials.
|
||||
|
||||
## Running Agents
|
||||
|
||||
@@ -165,33 +165,27 @@ Build and run an agent using Claude Code CLI with the agent building skills:
|
||||
./quickstart.sh
|
||||
```
|
||||
|
||||
This verifies agent-related Claude Code skills are available:
|
||||
|
||||
- `/hive` - Complete workflow for building agents
|
||||
- `/hive-create` - Step-by-step build guide
|
||||
- `/hive-concepts` - Fundamental concepts
|
||||
- `/hive-patterns` - Best practices
|
||||
- `/hive-test` - Test and validate agents
|
||||
This sets up agent-builder and tools MCP workflows.
|
||||
|
||||
### Cursor IDE Support
|
||||
|
||||
Skills are also available in Cursor. To enable:
|
||||
MCP tools are also available in Cursor. To enable:
|
||||
|
||||
1. Open Command Palette (`Cmd+Shift+P` / `Ctrl+Shift+P`)
|
||||
2. Run `MCP: Enable` to enable MCP servers
|
||||
3. Restart Cursor to load the MCP servers from `.cursor/mcp.json`
|
||||
4. Type `/` in Agent chat and search for skills (e.g., `/hive-create`)
|
||||
4. Open Agent chat and verify MCP tools are available
|
||||
|
||||
### 2. Build an Agent
|
||||
|
||||
**Claude Code:**
|
||||
```
|
||||
claude> /hive
|
||||
Use the agent-builder workflow
|
||||
```
|
||||
|
||||
**Codex CLI:**
|
||||
```
|
||||
codex> use hive
|
||||
Start Codex in the repo root and use the configured MCP tools
|
||||
```
|
||||
|
||||
Follow the prompts to:
|
||||
@@ -206,7 +200,7 @@ This step creates the initial agent structure required for further development.
|
||||
### 3. Define Agent Logic
|
||||
|
||||
```
|
||||
claude> /hive-concepts
|
||||
claude> architecture guidance
|
||||
```
|
||||
|
||||
Follow the prompts to:
|
||||
@@ -221,7 +215,7 @@ This step establishes the core concepts and rules needed before building an agen
|
||||
### 4. Apply Agent Patterns
|
||||
|
||||
```
|
||||
claude> /hive-patterns
|
||||
claude> pattern guidance
|
||||
```
|
||||
|
||||
Follow the prompts to:
|
||||
@@ -236,7 +230,7 @@ This step helps optimize agent design before final testing.
|
||||
### 5. Test Your Agent
|
||||
|
||||
```
|
||||
claude> /hive-test
|
||||
claude> test workflow
|
||||
```
|
||||
|
||||
Follow the prompts to:
|
||||
@@ -367,7 +361,7 @@ hive/
|
||||
│ └── pyproject.toml
|
||||
│
|
||||
├── exports/ # Agent packages (user-created, gitignored)
|
||||
│ └── your_agent_name/ # Created via /hive-create
|
||||
│ └── your_agent_name/ # Created via agent-builder workflow
|
||||
│
|
||||
└── examples/
|
||||
└── templates/ # Pre-built template agents
|
||||
@@ -459,7 +453,7 @@ This design allows agents in `exports/` to be:
|
||||
### 2. Build Agent (Claude Code)
|
||||
|
||||
```
|
||||
claude> /hive
|
||||
Use the agent-builder workflow
|
||||
Enter goal: "Build an agent that processes customer support tickets"
|
||||
```
|
||||
|
||||
@@ -472,7 +466,7 @@ PYTHONPATH=exports uv run python -m your_agent_name validate
|
||||
### 4. Test Agent
|
||||
|
||||
```
|
||||
claude> /hive-test
|
||||
claude> test workflow
|
||||
```
|
||||
|
||||
### 5. Run Agent
|
||||
@@ -545,18 +539,17 @@ Run the quickstart script in the root directory:
|
||||
[OpenAI Codex CLI](https://github.com/openai/codex) (v0.101.0+) is supported with project-level config:
|
||||
|
||||
- `.codex/config.toml` — MCP server configuration (`agent-builder`)
|
||||
- `.agents/skills/` — Symlinks to Hive skills
|
||||
|
||||
These files are tracked in git and available on clone. To use Codex with Hive:
|
||||
|
||||
1. Run `codex` in the repo root
|
||||
2. Type `use hive` to start the agent workflow
|
||||
2. Start the configured MCP-assisted workflow
|
||||
|
||||
Quick verification:
|
||||
|
||||
```bash
|
||||
test -f .codex/config.toml && echo "OK: Codex config" || echo "MISSING: .codex/config.toml"
|
||||
test -d .agents/skills/hive && echo "OK: Skills" || echo "MISSING: .agents/skills/"
|
||||
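grep -q "agent-builder" .codex/config.toml 2>/dev/null && echo "OK: agent-builder MCP server configured" || echo "MISSING: agent-builder entry in .codex/config.toml"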
|
||||
```
|
||||
|
||||
## Additional Resources
|
||||
@@ -564,8 +557,8 @@ test -d .agents/skills/hive && echo "OK: Skills" || echo "MISSING: .agents/skill
|
||||
- **Framework Documentation:** [core/README.md](../core/README.md)
|
||||
- **Tools Documentation:** [tools/README.md](../tools/README.md)
|
||||
- **Example Agents:** [examples/](../examples/)
|
||||
- **Agent Building Guide:** [.claude/skills/hive-create/SKILL.md](../.claude/skills/hive-create/SKILL.md)
|
||||
- **Testing Guide:** [.claude/skills/hive-test/SKILL.md](../.claude/skills/hive-test/SKILL.md)
|
||||
- **Agent Building Guide:** [docs/developer-guide.md](./developer-guide.md)
|
||||
- **Testing Guide:** [core/README.md](../core/README.md)
|
||||
|
||||
## Contributing
|
||||
|
||||
@@ -574,7 +567,7 @@ When contributing agent packages:
|
||||
1. Place agents in `exports/agent_name/`
|
||||
2. Follow the standard agent structure (see existing agents)
|
||||
3. Include README.md with usage instructions
|
||||
4. Add tests if using `/hive-test`
|
||||
4. Add tests and run them with the built-in test command (`PYTHONPATH=exports uv run python -m agent_name test`)
|
||||
5. Document required environment variables
|
||||
|
||||
## Support
|
||||
|
||||
+5
-16
@@ -47,7 +47,7 @@ This is the recommended way to create your first agent.
|
||||
# Setup already done via quickstart.sh above
|
||||
|
||||
# Start Claude Code and build an agent
|
||||
claude> /hive
|
||||
Use the agent-builder workflow
|
||||
```
|
||||
|
||||
Follow the interactive prompts to:
|
||||
@@ -115,19 +115,11 @@ hive/
|
||||
│ └── file_system_toolkits/
|
||||
│
|
||||
├── exports/ # Agent Packages (user-generated, not in repo)
|
||||
│ └── your_agent/ # Your agents created via /hive
|
||||
│ └── your_agent/ # Your agents created via agent-builder workflow
|
||||
│
|
||||
├── examples/
|
||||
│ └── templates/ # Pre-built template agents
|
||||
│
|
||||
├── .claude/ # Claude Code Skills
|
||||
│ └── skills/
|
||||
│ ├── hive/
|
||||
│ ├── hive-create/
|
||||
│ ├── hive-concepts/
|
||||
│ ├── hive-patterns/
|
||||
│ └── hive-test/
|
||||
│
|
||||
└── docs/ # Documentation
|
||||
```
|
||||
|
||||
@@ -168,10 +160,7 @@ Get your API keys:
|
||||
## Testing Your Agent
|
||||
|
||||
```bash
|
||||
# Using Claude Code
|
||||
claude> /hive-test
|
||||
|
||||
# Or manually
|
||||
# Run tests
|
||||
PYTHONPATH=exports uv run python -m my_agent test
|
||||
|
||||
# Run with specific test type
|
||||
@@ -184,7 +173,7 @@ PYTHONPATH=exports uv run python -m my_agent test --type success
|
||||
1. **Dashboard**: Run `hive open` to launch the web dashboard, or `hive tui` for the terminal UI
|
||||
2. **Detailed Setup**: See [environment-setup.md](./environment-setup.md)
|
||||
3. **Developer Guide**: See [developer-guide.md](./developer-guide.md)
|
||||
4. **Build Agents**: Use `/hive` skill in Claude Code
|
||||
4. **Build Agents**: Use agent-builder workflow in Claude Code
|
||||
5. **Custom Tools**: Learn to integrate MCP servers
|
||||
6. **Join Community**: [Discord](https://discord.com/invite/MXE49hrKDk)
|
||||
|
||||
@@ -227,4 +216,4 @@ pip uninstall -y framework tools
|
||||
- **Documentation**: Check the `/docs` folder
|
||||
- **Issues**: [github.com/adenhq/hive/issues](https://github.com/adenhq/hive/issues)
|
||||
- **Discord**: [discord.com/invite/MXE49hrKDk](https://discord.com/invite/MXE49hrKDk)
|
||||
- **Build Agents**: Use `/hive` skill to create agents
|
||||
- **Build Agents**: Use agent-builder workflow to create agents
|
||||
|
||||
@@ -22,7 +22,7 @@ template_name/
|
||||
|
||||
### Option 1: Build from template (recommended)
|
||||
|
||||
Use the `/hive-create` skill and select "From a template" to interactively pick a template, customize the goal/nodes/graph, and export a new agent.
|
||||
Use the agent-builder workflow and select "From a template" to interactively pick a template, customize the goal/nodes/graph, and export a new agent.
|
||||
|
||||
### Option 2: Manual copy
|
||||
|
||||
|
||||
@@ -139,14 +139,6 @@ terminal_nodes = ["customize"]
|
||||
class JobHunterAgent:
|
||||
"""
|
||||
Job Hunter Agent — 4-node pipeline for job search and application materials.
|
||||
|
||||
Flow: intake -> job-search -> job-review -> customize
|
||||
|
||||
Uses AgentRuntime for proper session management:
|
||||
- Session-scoped storage (sessions/{session_id}/)
|
||||
- Checkpointing for resume capability
|
||||
- Runtime logging
|
||||
- Data folder for save_data/load_data
|
||||
"""
|
||||
|
||||
def __init__(self, config=None):
|
||||
@@ -168,7 +160,7 @@ class JobHunterAgent:
|
||||
return GraphSpec(
|
||||
id="job-hunter-graph",
|
||||
goal_id=self.goal.id,
|
||||
version="1.0.0",
|
||||
version="1.1.0",
|
||||
entry_node=self.entry_node,
|
||||
entry_points=self.entry_points,
|
||||
terminal_nodes=self.terminal_nodes,
|
||||
@@ -186,9 +178,7 @@ class JobHunterAgent:
|
||||
identity_prompt=(
|
||||
"You are a job hunting assistant. You analyze resumes to identify "
|
||||
"the strongest role fits, search for matching job opportunities, "
|
||||
"and help create personalized application materials. You only "
|
||||
"suggest roles the user is realistically qualified for, and you "
|
||||
"never fabricate experience — only enhance presentation truthfully."
|
||||
"and help create personalized application materials."
|
||||
),
|
||||
)
|
||||
|
||||
@@ -288,56 +278,18 @@ class JobHunterAgent:
|
||||
finally:
|
||||
await self.stop()
|
||||
|
||||
def info(self):
|
||||
"""Get agent information."""
|
||||
return {
|
||||
"name": metadata.name,
|
||||
"version": metadata.version,
|
||||
"description": metadata.description,
|
||||
"goal": {
|
||||
"name": self.goal.name,
|
||||
"description": self.goal.description,
|
||||
},
|
||||
"nodes": [n.id for n in self.nodes],
|
||||
"edges": [e.id for e in self.edges],
|
||||
"entry_node": self.entry_node,
|
||||
"entry_points": self.entry_points,
|
||||
"pause_nodes": self.pause_nodes,
|
||||
"terminal_nodes": self.terminal_nodes,
|
||||
"client_facing_nodes": [n.id for n in self.nodes if n.client_facing],
|
||||
}
|
||||
|
||||
def validate(self):
|
||||
"""Validate agent structure."""
|
||||
errors = []
|
||||
warnings = []
|
||||
|
||||
node_ids = {node.id for node in self.nodes}
|
||||
for edge in self.edges:
|
||||
if edge.source not in node_ids:
|
||||
errors.append(f"Edge {edge.id}: source '{edge.source}' not found")
|
||||
if edge.target not in node_ids:
|
||||
errors.append(f"Edge {edge.id}: target '{edge.target}' not found")
|
||||
|
||||
if self.entry_node not in node_ids:
|
||||
errors.append(f"Entry node '{self.entry_node}' not found")
|
||||
|
||||
for terminal in self.terminal_nodes:
|
||||
if terminal not in node_ids:
|
||||
errors.append(f"Terminal node '{terminal}' not found")
|
||||
|
||||
for ep_id, node_id in self.entry_points.items():
|
||||
if node_id not in node_ids:
|
||||
errors.append(
|
||||
f"Entry point '{ep_id}' references unknown node '{node_id}'"
|
||||
)
|
||||
|
||||
return {
|
||||
"valid": len(errors) == 0,
|
||||
"errors": errors,
|
||||
"warnings": warnings,
|
||||
}
|
||||
|
||||
return {"valid": len(errors) == 0, "errors": errors}
|
||||
|
||||
# Create default instance
|
||||
default_agent = JobHunterAgent()
|
||||
|
||||
@@ -2,62 +2,40 @@
|
||||
|
||||
from framework.graph import NodeSpec
|
||||
|
||||
# Node 1: Intake (client-facing)
|
||||
# Node 1: Intake (simple)
|
||||
# Collect resume and identify strongest role types.
|
||||
intake_node = NodeSpec(
|
||||
id="intake",
|
||||
name="Intake",
|
||||
description="Collect resume from user, analyze skills and experience, identify 3-5 strongest role types",
|
||||
description="Analyze resume and identify 3-5 strongest role types",
|
||||
node_type="event_loop",
|
||||
client_facing=True,
|
||||
client_facing=False,
|
||||
max_node_visits=1,
|
||||
input_keys=[],
|
||||
input_keys=["resume_text"],
|
||||
output_keys=["resume_text", "role_analysis"],
|
||||
success_criteria=(
|
||||
"The user's resume has been analyzed and 3-5 target roles identified "
|
||||
"based on their actual experience, with user confirmation."
|
||||
"based on their actual experience."
|
||||
),
|
||||
system_prompt="""\
|
||||
You are a career analyst helping a job seeker find their best opportunities.
|
||||
You are a career analyst. Your task is to analyze the user's resume and identify the best role fits.
|
||||
|
||||
**STEP 1 — Collect the resume:**
|
||||
**PROCESS:**
|
||||
1. Identify key skills (technical and soft skills).
|
||||
2. Summarize years and types of experience.
|
||||
3. Identify 3-5 specific role types where they're most competitive based on their ACTUAL experience.
|
||||
|
||||
Check your input context for a `pdf_file_path` key.
|
||||
|
||||
- **If `pdf_file_path` is present:** A PDF resume has been attached. Use the `pdf_read` tool \
|
||||
to extract its text: `pdf_read(file_path=<the path>)`. Then greet the user and proceed \
|
||||
directly to STEP 2 with the extracted text.
|
||||
- **If no `pdf_file_path`:** Ask the user to paste their resume. Be friendly and concise:
|
||||
"Please paste your resume below (or attach a PDF with /attach). I'll analyze your \
|
||||
experience and identify the roles where you have the strongest chance of success."
|
||||
|
||||
**STEP 2 — After you have the resume text:**
|
||||
|
||||
Analyze the resume thoroughly:
|
||||
1. Identify key skills (technical and soft skills)
|
||||
2. Summarize years and types of experience
|
||||
3. Identify 3-5 specific role types where they're most competitive based on their ACTUAL experience
|
||||
|
||||
Present your analysis to the user and ask if they agree with the role types identified.
|
||||
|
||||
**STEP 3 — After user confirms, call set_output IMMEDIATELY:**
|
||||
|
||||
IMPORTANT: When the user says any of these, treat it as CONFIRMATION and call set_output immediately:
|
||||
- "yes", "sure", "looks good", "that works", "go ahead", "find jobs", "start searching", etc.
|
||||
|
||||
DO NOT ask follow-up questions after the user confirms. DO NOT ask which roles to focus on.
|
||||
The job search will use ALL the roles you identified.
|
||||
|
||||
Use set_output to store:
|
||||
- set_output("resume_text", "<the full resume text>")
|
||||
**OUTPUT:**
|
||||
You MUST call set_output to store:
|
||||
- set_output("resume_text", "<the full resume text from input>")
|
||||
- set_output("role_analysis", "<JSON with: skills, experience_summary, target_roles (3-5 specific role titles)>")
|
||||
|
||||
NEVER ask the user to pick between roles. Your job is to identify the right roles, not make them choose.
|
||||
Do NOT wait for user confirmation. Simply perform the analysis and set the outputs.
|
||||
""",
|
||||
tools=["pdf_read"],
|
||||
tools=[],
|
||||
)
|
||||
|
||||
# Node 2: Job Search
|
||||
# Node 2: Job Search (simple)
|
||||
# Search for 10 jobs matching the identified roles.
|
||||
job_search_node = NodeSpec(
|
||||
id="job-search",
|
||||
@@ -78,33 +56,17 @@ You are a job search specialist. Your task is to find 10 relevant job openings.
|
||||
**INPUT:** You have access to role_analysis containing target roles and skills.
|
||||
|
||||
**PROCESS:**
|
||||
Use web_scrape to directly scrape job listings from these job boards. Build search URLs with the role title:
|
||||
Use web_scrape to directly scrape job listings from job boards. Build search URLs with the role title:
|
||||
- LinkedIn Jobs: https://www.linkedin.com/jobs/search/?keywords={role_title}
|
||||
- Indeed: https://www.indeed.com/jobs?q={role_title}
|
||||
|
||||
**Recommended Job Sites (scrape these directly):**
|
||||
1. **LinkedIn Jobs:** https://www.linkedin.com/jobs/search/?keywords={role_title}
|
||||
2. **Indeed:** https://www.indeed.com/jobs?q={role_title}
|
||||
3. **Glassdoor:** https://www.glassdoor.com/Job/jobs.htm?sc.keyword={role_title}
|
||||
4. **Wellfound (Startups):** https://wellfound.com/jobs?q={role_title}
|
||||
5. **RemoteOK:** https://remoteok.com/remote-{role_title}-jobs
|
||||
|
||||
**Strategy:**
|
||||
- For each target role in role_analysis, scrape 1-2 job board search result pages
|
||||
- Extract job listings from the scraped HTML
|
||||
- If a job looks promising, scrape its detail page for more info
|
||||
- Gather 10 quality job listings total across the target roles
|
||||
Gather 10 quality job listings total across the target roles.
|
||||
|
||||
**For each job, extract:**
|
||||
- Job title
|
||||
- Company name
|
||||
- Location (or "Remote" if applicable)
|
||||
- Brief job description/requirements summary
|
||||
- URL to the job posting
|
||||
- Any info about the hiring manager or company contact if visible
|
||||
- Job title, Company name, Location, Brief description, URL.
|
||||
|
||||
**OUTPUT:** Once you have 10 jobs, call:
|
||||
set_output("job_listings", "<JSON array of 10 job objects with title, company, location, description, url, contact_info>")
|
||||
|
||||
Focus on finding REAL, current job postings with actual URLs the user can visit.
|
||||
set_output("job_listings", "<JSON array of 10 job objects>")
|
||||
""",
|
||||
tools=["web_scrape"],
|
||||
)
|
||||
@@ -127,31 +89,13 @@ job_review_node = NodeSpec(
|
||||
system_prompt="""\
|
||||
You are helping a job seeker choose which positions to apply to.
|
||||
|
||||
**STEP 1 — Present the jobs (text only, NO tool calls):**
|
||||
**STEP 1 — Present the jobs:**
|
||||
Display all 10 jobs in a clear, numbered format.
|
||||
Ask: "Which jobs would you like me to create application materials for? List the numbers or say 'all'."
|
||||
|
||||
Display all 10 jobs in a clear, numbered format:
|
||||
|
||||
```
|
||||
**Job Opportunities Found:**
|
||||
|
||||
1. **[Job Title]** at [Company]
|
||||
Location: [Location]
|
||||
[Brief description - 2-3 lines]
|
||||
URL: [link]
|
||||
|
||||
2. **[Job Title]** at [Company]
|
||||
...
|
||||
```
|
||||
|
||||
After listing all jobs, ask:
|
||||
"Which jobs would you like me to create application materials for? Please list the numbers (e.g., '1, 3, 5') or say 'all' for all of them."
|
||||
|
||||
**STEP 2 — After the user responds:**
|
||||
|
||||
Confirm their selection and call set_output:
|
||||
- set_output("selected_jobs", "<JSON array of the selected job objects>")
|
||||
|
||||
Only include the jobs the user explicitly selected.
|
||||
**STEP 2 — After user responds:**
|
||||
Confirm their selection and call:
|
||||
set_output("selected_jobs", "<JSON array of the selected job objects>")
|
||||
""",
|
||||
tools=[],
|
||||
)
|
||||
@@ -174,84 +118,14 @@ customize_node = NodeSpec(
|
||||
system_prompt="""\
|
||||
You are a career coach creating personalized application materials.
|
||||
|
||||
**INPUT:** You have the user's resume and their selected jobs.
|
||||
**PROCESS:**
|
||||
1. Create application_materials.html using save_data and append_data.
|
||||
2. Generate resume customization list and professional cold email for each selected job.
|
||||
3. Serve the file to the user.
|
||||
4. Create Gmail drafts using gmail_create_draft.
|
||||
|
||||
**OUTPUT FORMAT: Single HTML Report — Built Incrementally**
|
||||
Build ONE polished HTML report, but write it in CHUNKS using append_data to avoid token limits.
|
||||
|
||||
**CRITICAL: You MUST build the file in multiple append_data calls. NEVER try to write the \
|
||||
entire HTML in a single save_data call — it will exceed the output token limit and fail.**
|
||||
|
||||
**PROCESS (follow exactly):**
|
||||
|
||||
**Step 1 — Write HTML header + table of contents:**
|
||||
Call save_data to create the file with the HTML head, styles, and TOC:
|
||||
```
|
||||
save_data(filename="application_materials.html", data="<!DOCTYPE html>\\n<html>\\n<head>...")
|
||||
```
|
||||
Include: DOCTYPE, head with styles, opening body tag, h1, and the table of contents \
|
||||
linking to each selected job. End with the TOC closing div.
|
||||
|
||||
CSS to use:
|
||||
body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; max-width: 900px; margin: 0 auto; padding: 40px; line-height: 1.6; }
|
||||
h1 { color: #1a1a1a; border-bottom: 2px solid #0066cc; padding-bottom: 10px; }
|
||||
h2 { color: #0066cc; margin-top: 40px; padding-top: 20px; border-top: 1px solid #e0e0e0; }
|
||||
h3 { color: #333; margin-top: 20px; }
|
||||
.job-section { margin-bottom: 60px; }
|
||||
.email-card { background: #f8f9fa; border-left: 4px solid #0066cc; padding: 20px; margin: 20px 0; white-space: pre-wrap; }
|
||||
.customization-list { background: #fff; border: 1px solid #e0e0e0; padding: 20px; border-radius: 8px; }
|
||||
ul { line-height: 1.8; }
|
||||
.toc { background: #f0f4f8; padding: 20px; border-radius: 8px; margin-bottom: 40px; }
|
||||
.toc a { color: #0066cc; text-decoration: none; }
|
||||
.toc a:hover { text-decoration: underline; }
|
||||
.job-url { color: #666; font-size: 0.9em; }
|
||||
|
||||
**Step 2 — Append each job section ONE AT A TIME:**
|
||||
For EACH selected job, call append_data with that job's section:
|
||||
```
|
||||
append_data(filename="application_materials.html", data="<div class='job-section' id='job-N'>...")
|
||||
```
|
||||
Each section should contain:
|
||||
- Job title + company as h2
|
||||
- Job URL link
|
||||
- Resume Customization List (Priority Changes, Keywords, Experiences to Emphasize, Suggested Rewrites)
|
||||
- Cold Outreach Email in an email-card div (subject line + body, under 150 words)
|
||||
|
||||
**Step 3 — Append HTML footer:**
|
||||
```
|
||||
append_data(filename="application_materials.html", data="</body>\\n</html>")
|
||||
```
|
||||
|
||||
**Step 4 — Serve the file:**
|
||||
Call serve_file_to_user(filename="application_materials.html")
|
||||
Print the file_path from the result so the user can access it later.
|
||||
|
||||
**Step 5 — Create Gmail Drafts (in batches of 5):**
|
||||
IMPORTANT: Do NOT create all drafts in one turn. Create at most 5 gmail_create_draft calls \
|
||||
per turn to stay within tool call limits. If there are more than 5 jobs, create the first 5 \
|
||||
drafts, then create the remaining drafts in the next turn.
|
||||
|
||||
For each selected job, call gmail_create_draft with:
|
||||
- to: hiring manager email if available, otherwise "hiring@company-domain.com"
|
||||
- subject: the cold email subject line
|
||||
- body: the cold email body as plain text
|
||||
- draft: true (create as draft, not send immediately)
|
||||
|
||||
If gmail_create_draft errors (e.g. credentials not configured), skip ALL remaining drafts and tell the user:
|
||||
"Gmail drafts could not be created (Gmail not connected). You can copy the emails from the HTML report instead."
|
||||
|
||||
**Step 6 — Confirm Gmail Drafts Created:**
|
||||
After all drafts are created, tell the user: "✓ Created {N} draft emails in your Gmail inbox. You can review and send them when ready."
|
||||
|
||||
**Step 7 — Finish:**
|
||||
Call set_output("application_materials", "Created application_materials.html with materials for {N} jobs and {N} Gmail drafts")
|
||||
|
||||
**IMPORTANT:**
|
||||
- Only suggest truthful resume changes — enhance presentation, never fabricate
|
||||
- Cold emails must be professional, personalized, and under 150 words
|
||||
- ALWAYS print the full file path so users can easily access the file later
|
||||
- If a save_data or append_data call fails with a truncation error, you are writing too much \
|
||||
in one call. Break it into smaller chunks.
|
||||
**FINISH:**
|
||||
Call set_output("application_materials", "Completed")
|
||||
""",
|
||||
tools=["save_data", "append_data", "serve_file_to_user", "gmail_create_draft"],
|
||||
)
|
||||
|
||||
@@ -1347,41 +1347,7 @@ else
|
||||
echo -e "${YELLOW}--${NC}"
|
||||
fi
|
||||
|
||||
echo -n " ⬡ skills... "
|
||||
if [ -d "$SCRIPT_DIR/.claude/skills" ]; then
|
||||
SKILL_COUNT=$(ls -1d "$SCRIPT_DIR/.claude/skills"/*/ 2>/dev/null | wc -l)
|
||||
echo -e "${GREEN}${SKILL_COUNT} found${NC}"
|
||||
else
|
||||
echo -e "${YELLOW}--${NC}"
|
||||
fi
|
||||
|
||||
echo -n " ⬡ codex CLI... "
|
||||
if command -v codex > /dev/null 2>&1; then
|
||||
CODEX_VERSION=$(codex --version 2>/dev/null | head -1 | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' || echo "0.0.0")
|
||||
# Compare version >= 0.101.0
|
||||
CODEX_MAJOR=$(echo "$CODEX_VERSION" | cut -d. -f1)
|
||||
CODEX_MINOR=$(echo "$CODEX_VERSION" | cut -d. -f2)
|
||||
if [ "$CODEX_MAJOR" -gt 0 ] 2>/dev/null || { [ "$CODEX_MAJOR" -eq 0 ] && [ "$CODEX_MINOR" -ge 101 ]; } 2>/dev/null; then
|
||||
echo -e "${GREEN}${CODEX_VERSION}${NC}"
|
||||
CODEX_AVAILABLE=true
|
||||
else
|
||||
echo -e "${YELLOW}${CODEX_VERSION} (upgrade to 0.101.0+)${NC}"
|
||||
CODEX_AVAILABLE=false
|
||||
fi
|
||||
else
|
||||
echo -e "${YELLOW}--${NC}"
|
||||
CODEX_AVAILABLE=false
|
||||
fi
|
||||
|
||||
echo -n " ⬡ local settings... "
|
||||
if [ -f "$SCRIPT_DIR/.claude/settings.local.json" ]; then
|
||||
echo -e "${GREEN}ok${NC}"
|
||||
elif [ -f "$SCRIPT_DIR/.claude/settings.local.json.example" ]; then
|
||||
cp "$SCRIPT_DIR/.claude/settings.local.json.example" "$SCRIPT_DIR/.claude/settings.local.json"
|
||||
echo -e "${GREEN}copied from example${NC}"
|
||||
else
|
||||
echo -e "${YELLOW}--${NC}"
|
||||
fi
|
||||
|
||||
echo -n " ⬡ credential store... "
|
||||
if [ -n "$HIVE_CREDENTIAL_KEY" ] && [ -d "$HOME/.hive/credentials/credentials" ]; then
|
||||
|
||||
@@ -0,0 +1,828 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Open a browser-based viewer for Hive LLM debug JSONL sessions.
|
||||
|
||||
Usage:
|
||||
uv run --no-project scripts/llm_debug_log_visualizer.py
|
||||
uv run --no-project scripts/llm_debug_log_visualizer.py --no-open
|
||||
uv run --no-project scripts/llm_debug_log_visualizer.py --session <execution_id>
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import tempfile
|
||||
import webbrowser
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass
class SessionSummary:
    """Aggregate description of one logged session, keyed by execution ID."""

    # Execution ID shared by every record grouped into this session.
    execution_id: str
    # Path of the log file a record of the session was read from.
    log_file: str
    # Raw (unformatted) timestamp strings of the session's first and last record.
    start_timestamp: str
    end_timestamp: str
    # Number of records (turns) in the session.
    turn_count: int
    # Stream IDs, node IDs and model names associated with the session.
    streams: list[str]
    nodes: list[str]
    models: list[str]
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--logs-dir",
|
||||
type=Path,
|
||||
default=Path.home() / ".hive" / "llm_logs",
|
||||
help="Directory containing Hive LLM debug JSONL files.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--session",
|
||||
help="Execution ID to select initially in the webpage.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=Path,
|
||||
help="Optional HTML output path. Defaults to a temporary file.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--limit-files",
|
||||
type=int,
|
||||
default=200,
|
||||
help="Maximum number of newest log files to scan.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-open",
|
||||
action="store_true",
|
||||
help="Generate the HTML but do not open a browser.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def _safe_read_jsonl(path: Path) -> list[dict[str, Any]]:
|
||||
records: list[dict[str, Any]] = []
|
||||
try:
|
||||
with path.open(encoding="utf-8") as handle:
|
||||
for line_number, raw_line in enumerate(handle, start=1):
|
||||
line = raw_line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
payload = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
payload = {
|
||||
"timestamp": "",
|
||||
"execution_id": "",
|
||||
"assistant_text": "",
|
||||
"_parse_error": f"{path.name}:{line_number}",
|
||||
"_raw_line": line,
|
||||
}
|
||||
payload["_log_file"] = str(path)
|
||||
records.append(payload)
|
||||
except OSError as exc:
|
||||
print(f"warning: failed to read {path}: {exc}")
|
||||
return records
|
||||
|
||||
|
||||
def _discover_records(logs_dir: Path, limit_files: int) -> list[dict[str, Any]]:
|
||||
if not logs_dir.exists():
|
||||
raise FileNotFoundError(f"log directory not found: {logs_dir}")
|
||||
|
||||
files = sorted(
|
||||
[path for path in logs_dir.iterdir() if path.is_file() and path.suffix == ".jsonl"],
|
||||
key=lambda path: path.stat().st_mtime,
|
||||
reverse=True,
|
||||
)[:limit_files]
|
||||
|
||||
records: list[dict[str, Any]] = []
|
||||
for path in files:
|
||||
records.extend(_safe_read_jsonl(path))
|
||||
return records
|
||||
|
||||
|
||||
def _format_timestamp(raw: str) -> str:
|
||||
if not raw:
|
||||
return "-"
|
||||
try:
|
||||
return datetime.fromisoformat(raw).strftime("%Y-%m-%d %H:%M:%S")
|
||||
except ValueError:
|
||||
return raw
|
||||
|
||||
|
||||
def _group_sessions(records: list[dict[str, Any]]) -> tuple[list[SessionSummary], dict[str, list[dict[str, Any]]]]:
|
||||
by_session: dict[str, list[dict[str, Any]]] = defaultdict(list)
|
||||
for record in records:
|
||||
execution_id = str(record.get("execution_id") or "").strip()
|
||||
if execution_id:
|
||||
by_session[execution_id].append(record)
|
||||
|
||||
summaries: list[SessionSummary] = []
|
||||
for execution_id, session_records in by_session.items():
|
||||
session_records.sort(key=lambda record: (str(record.get("timestamp", "")), record.get("iteration", 0)))
|
||||
first = session_records[0]
|
||||
last = session_records[-1]
|
||||
summaries.append(
|
||||
SessionSummary(
|
||||
execution_id=execution_id,
|
||||
log_file=str(first.get("_log_file", "")),
|
||||
start_timestamp=str(first.get("timestamp", "")),
|
||||
end_timestamp=str(last.get("timestamp", "")),
|
||||
turn_count=len(session_records),
|
||||
streams=sorted({str(r.get("stream_id", "")) for r in session_records if r.get("stream_id")}),
|
||||
nodes=sorted({str(r.get("node_id", "")) for r in session_records if r.get("node_id")}),
|
||||
models=sorted(
|
||||
{
|
||||
str(r.get("token_counts", {}).get("model", ""))
|
||||
for r in session_records
|
||||
if isinstance(r.get("token_counts"), dict) and r.get("token_counts", {}).get("model")
|
||||
}
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
summaries.sort(key=lambda summary: summary.start_timestamp, reverse=True)
|
||||
return summaries, by_session
|
||||
|
||||
|
||||
def _render_html(
|
||||
summaries: list[SessionSummary],
|
||||
sessions: dict[str, list[dict[str, Any]]],
|
||||
initial_session_id: str,
|
||||
) -> str:
|
||||
summaries_data = [
|
||||
{
|
||||
"execution_id": summary.execution_id,
|
||||
"log_file": summary.log_file,
|
||||
"start_timestamp": summary.start_timestamp,
|
||||
"end_timestamp": summary.end_timestamp,
|
||||
"start_display": _format_timestamp(summary.start_timestamp),
|
||||
"end_display": _format_timestamp(summary.end_timestamp),
|
||||
"turn_count": summary.turn_count,
|
||||
"streams": summary.streams,
|
||||
"nodes": summary.nodes,
|
||||
"models": summary.models,
|
||||
}
|
||||
for summary in summaries
|
||||
]
|
||||
|
||||
sessions_data = {
|
||||
execution_id: sorted(
|
||||
records,
|
||||
key=lambda record: (str(record.get("timestamp", "")), record.get("iteration", 0)),
|
||||
)
|
||||
for execution_id, records in sessions.items()
|
||||
}
|
||||
initial = initial_session_id or (summaries[0].execution_id if summaries else "")
|
||||
return f"""<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<title>Hive LLM Debug Viewer</title>
|
||||
<style>
|
||||
:root {{
|
||||
--bg: #efe6d8;
|
||||
--panel: rgba(255, 251, 245, 0.92);
|
||||
--panel-strong: #fffdfa;
|
||||
--ink: #1f1d19;
|
||||
--muted: #6d6457;
|
||||
--line: #ddceb6;
|
||||
--accent: #b64a2b;
|
||||
--accent-deep: #7a2813;
|
||||
--sidebar: #2b211d;
|
||||
--sidebar-soft: #3e302a;
|
||||
--user: #0f766e;
|
||||
--assistant: #7c3aed;
|
||||
--tool: #9a3412;
|
||||
--shadow: 0 18px 44px rgba(60, 39, 14, 0.12);
|
||||
}}
|
||||
* {{ box-sizing: border-box; }}
|
||||
body {{
|
||||
margin: 0;
|
||||
color: var(--ink);
|
||||
font-family: ui-sans-serif, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
||||
background:
|
||||
radial-gradient(circle at top left, rgba(182, 74, 43, 0.14), transparent 28rem),
|
||||
linear-gradient(180deg, #f8f3ea 0%, var(--bg) 100%);
|
||||
}}
|
||||
.app {{
|
||||
min-height: 100vh;
|
||||
display: grid;
|
||||
grid-template-columns: 340px minmax(0, 1fr);
|
||||
}}
|
||||
.sidebar {{
|
||||
background:
|
||||
linear-gradient(180deg, rgba(62, 48, 42, 0.96), rgba(29, 21, 18, 0.98));
|
||||
color: white;
|
||||
padding: 24px 18px;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
height: 100vh;
|
||||
overflow: auto;
|
||||
}}
|
||||
.brand {{
|
||||
margin-bottom: 20px;
|
||||
}}
|
||||
.brand h1 {{
|
||||
margin: 0 0 6px;
|
||||
font-size: 28px;
|
||||
line-height: 1;
|
||||
}}
|
||||
.brand p {{
|
||||
margin: 0;
|
||||
color: rgba(255, 255, 255, 0.72);
|
||||
line-height: 1.45;
|
||||
}}
|
||||
.sidebar input, .sidebar select {{
|
||||
width: 100%;
|
||||
border: 1px solid rgba(255, 255, 255, 0.14);
|
||||
border-radius: 16px;
|
||||
background: rgba(255, 255, 255, 0.08);
|
||||
color: white;
|
||||
padding: 12px 14px;
|
||||
margin: 10px 0;
|
||||
}}
|
||||
.sidebar input {{
|
||||
width: 100%;
|
||||
border: 1px solid rgba(255, 255, 255, 0.14);
|
||||
border-radius: 16px;
|
||||
background: rgba(255, 255, 255, 0.08);
|
||||
color: white;
|
||||
padding: 12px 14px;
|
||||
margin: 10px 0;
|
||||
}}
|
||||
.sidebar input::placeholder {{
|
||||
color: rgba(255, 255, 255, 0.5);
|
||||
}}
|
||||
.setup-note {{
|
||||
margin-top: 14px;
|
||||
padding: 14px;
|
||||
border-radius: 16px;
|
||||
background: rgba(255, 255, 255, 0.07);
|
||||
border: 1px solid rgba(255, 255, 255, 0.12);
|
||||
}}
|
||||
.setup-note h3 {{
|
||||
margin: 0 0 8px;
|
||||
font-size: 14px;
|
||||
}}
|
||||
.setup-note p {{
|
||||
margin: 0 0 10px;
|
||||
color: rgba(255, 255, 255, 0.76);
|
||||
line-height: 1.45;
|
||||
font-size: 13px;
|
||||
}}
|
||||
.setup-note pre {{
|
||||
margin: 0;
|
||||
background: rgba(0, 0, 0, 0.24);
|
||||
border: 1px solid rgba(255, 255, 255, 0.1);
|
||||
color: white;
|
||||
}}
|
||||
.session-list {{
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
margin-top: 16px;
|
||||
}}
|
||||
.session-card {{
|
||||
border: 1px solid rgba(255, 255, 255, 0.1);
|
||||
background: rgba(255, 255, 255, 0.06);
|
||||
color: white;
|
||||
border-radius: 18px;
|
||||
padding: 14px;
|
||||
cursor: pointer;
|
||||
text-align: left;
|
||||
width: 100%;
|
||||
}}
|
||||
.session-card.active {{
|
||||
background: linear-gradient(145deg, rgba(182, 74, 43, 0.96), rgba(122, 40, 19, 0.96));
|
||||
border-color: rgba(255, 255, 255, 0.24);
|
||||
}}
|
||||
.session-card .sid {{
|
||||
font-family: ui-monospace, "SFMono-Regular", Menlo, monospace;
|
||||
font-size: 12px;
|
||||
word-break: break-all;
|
||||
opacity: 0.95;
|
||||
}}
|
||||
.session-card .meta {{
|
||||
margin-top: 8px;
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 6px;
|
||||
font-size: 12px;
|
||||
color: rgba(255, 255, 255, 0.76);
|
||||
}}
|
||||
.session-card .meta span {{
|
||||
border-radius: 999px;
|
||||
background: rgba(255, 255, 255, 0.09);
|
||||
padding: 4px 8px;
|
||||
}}
|
||||
.main {{
|
||||
padding: 26px;
|
||||
min-width: 0;
|
||||
}}
|
||||
.hero {{
|
||||
background: linear-gradient(145deg, rgba(182, 74, 43, 0.96), rgba(122, 40, 19, 0.96));
|
||||
color: white;
|
||||
border-radius: 28px;
|
||||
padding: 28px;
|
||||
box-shadow: var(--shadow);
|
||||
}}
|
||||
.hero h2 {{
|
||||
margin: 0 0 8px;
|
||||
font-size: clamp(30px, 5vw, 46px);
|
||||
line-height: 1.02;
|
||||
}}
|
||||
.hero code {{
|
||||
display: inline-block;
|
||||
margin-top: 4px;
|
||||
padding: 4px 10px;
|
||||
border-radius: 999px;
|
||||
background: rgba(255, 255, 255, 0.14);
|
||||
font-size: 13px;
|
||||
word-break: break-all;
|
||||
}}
|
||||
.meta-grid {{
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(170px, 1fr));
|
||||
gap: 12px;
|
||||
margin-top: 18px;
|
||||
}}
|
||||
.meta-card {{
|
||||
border-radius: 16px;
|
||||
padding: 14px;
|
||||
background: rgba(255, 255, 255, 0.11);
|
||||
border: 1px solid rgba(255, 255, 255, 0.14);
|
||||
}}
|
||||
.meta-card .label {{
|
||||
display: block;
|
||||
font-size: 11px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.08em;
|
||||
color: rgba(255, 255, 255, 0.68);
|
||||
margin-bottom: 6px;
|
||||
}}
|
||||
.toolbar {{
|
||||
display: flex;
|
||||
gap: 12px;
|
||||
align-items: center;
|
||||
flex-wrap: wrap;
|
||||
margin: 22px 0 18px;
|
||||
}}
|
||||
.toolbar input {{
|
||||
flex: 1 1 320px;
|
||||
min-width: 220px;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 999px;
|
||||
padding: 12px 16px;
|
||||
background: rgba(255, 255, 255, 0.9);
|
||||
box-shadow: var(--shadow);
|
||||
}}
|
||||
.toolbar button {{
|
||||
border: 0;
|
||||
border-radius: 999px;
|
||||
padding: 12px 16px;
|
||||
background: var(--accent);
|
||||
color: white;
|
||||
cursor: pointer;
|
||||
}}
|
||||
.turn {{
|
||||
background: var(--panel);
|
||||
border: 1px solid rgba(121, 93, 44, 0.14);
|
||||
border-radius: 24px;
|
||||
padding: 20px;
|
||||
margin: 18px 0;
|
||||
box-shadow: var(--shadow);
|
||||
backdrop-filter: blur(10px);
|
||||
}}
|
||||
.turn.hidden {{
|
||||
display: none;
|
||||
}}
|
||||
.turn-head {{
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
gap: 10px;
|
||||
flex-wrap: wrap;
|
||||
margin-bottom: 14px;
|
||||
}}
|
||||
.turn-title {{
|
||||
font-size: 24px;
|
||||
font-weight: 700;
|
||||
}}
|
||||
.turn-meta {{
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 8px;
|
||||
color: var(--muted);
|
||||
font-size: 13px;
|
||||
}}
|
||||
.turn-meta span {{
|
||||
background: #efe4d1;
|
||||
border-radius: 999px;
|
||||
padding: 6px 10px;
|
||||
}}
|
||||
details.block {{
|
||||
margin-top: 12px;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 16px;
|
||||
background: var(--panel-strong);
|
||||
padding: 14px 16px;
|
||||
}}
|
||||
summary {{
|
||||
cursor: pointer;
|
||||
font-weight: 700;
|
||||
}}
|
||||
.message {{
|
||||
margin-top: 12px;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 16px;
|
||||
padding: 14px;
|
||||
background: #fffdfa;
|
||||
}}
|
||||
.message-header {{
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
flex-wrap: wrap;
|
||||
margin-bottom: 10px;
|
||||
font-size: 13px;
|
||||
color: var(--muted);
|
||||
}}
|
||||
.badge {{
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
padding: 4px 10px;
|
||||
border-radius: 999px;
|
||||
color: white;
|
||||
font-size: 12px;
|
||||
font-weight: 700;
|
||||
text-transform: uppercase;
|
||||
}}
|
||||
.badge-user {{ background: var(--user); }}
|
||||
.badge-assistant {{ background: var(--assistant); }}
|
||||
.badge-tool {{ background: var(--tool); }}
|
||||
.badge-system {{ background: #334155; }}
|
||||
pre {{
|
||||
margin: 0;
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
overflow-x: auto;
|
||||
border-radius: 14px;
|
||||
padding: 14px;
|
||||
background: #faf5ec;
|
||||
border: 1px solid #eee2cf;
|
||||
font-family: ui-monospace, "SFMono-Regular", Menlo, monospace;
|
||||
font-size: 13px;
|
||||
line-height: 1.55;
|
||||
}}
|
||||
.tool-block {{
|
||||
margin-top: 12px;
|
||||
}}
|
||||
.tool-name {{
|
||||
font-weight: 700;
|
||||
}}
|
||||
.status {{
|
||||
margin-left: auto;
|
||||
padding: 4px 10px;
|
||||
border-radius: 999px;
|
||||
font-size: 11px;
|
||||
text-transform: uppercase;
|
||||
font-weight: 700;
|
||||
}}
|
||||
.status.ok {{
|
||||
background: #dcfce7;
|
||||
color: #166534;
|
||||
}}
|
||||
.status.error {{
|
||||
background: #fee2e2;
|
||||
color: #991b1b;
|
||||
}}
|
||||
.empty {{
|
||||
padding: 32px;
|
||||
color: var(--muted);
|
||||
text-align: center;
|
||||
border: 1px dashed var(--line);
|
||||
border-radius: 18px;
|
||||
background: rgba(255, 255, 255, 0.45);
|
||||
}}
|
||||
@media (max-width: 980px) {{
|
||||
.app {{
|
||||
grid-template-columns: 1fr;
|
||||
}}
|
||||
.sidebar {{
|
||||
position: static;
|
||||
height: auto;
|
||||
}}
|
||||
.main {{
|
||||
padding-top: 14px;
|
||||
}}
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="app">
|
||||
<aside class="sidebar">
|
||||
<div class="brand">
|
||||
<h1>Hive Debug</h1>
|
||||
<p>Pick a session in the browser and inspect prompts, inputs, outputs, and tool activity turn by turn.</p>
|
||||
</div>
|
||||
<input id="sessionSearch" type="search" placeholder="Filter sessions">
|
||||
<div class="setup-note">
|
||||
<h3>Logging status</h3>
|
||||
<p>LLM turn logging is always on. If this list is empty, run Hive once and refresh after the session produces turns.</p>
|
||||
<pre>~/.hive/llm_logs</pre>
|
||||
</div>
|
||||
<div class="session-list" id="sessionList"></div>
|
||||
</aside>
|
||||
<main class="main">
|
||||
<section class="hero">
|
||||
<h2 id="heroTitle">LLM Debug Session</h2>
|
||||
<code id="heroId"></code>
|
||||
<div class="meta-grid" id="metaGrid"></div>
|
||||
</section>
|
||||
<div class="toolbar">
|
||||
<input id="turnFilter" type="search" placeholder="Filter selected session by text, tool name, role, model, or prompt content">
|
||||
<button type="button" id="expandAll">Expand all</button>
|
||||
<button type="button" id="collapseAll">Collapse all</button>
|
||||
</div>
|
||||
<div id="turns"></div>
|
||||
</main>
|
||||
</div>
|
||||
|
||||
<script id="session-summaries" type="application/json">{json.dumps(summaries_data, ensure_ascii=False)}</script>
|
||||
<script id="session-records" type="application/json">{json.dumps(sessions_data, ensure_ascii=False)}</script>
|
||||
<script>
|
||||
const summaries = JSON.parse(document.getElementById("session-summaries").textContent);
|
||||
const recordsBySession = JSON.parse(document.getElementById("session-records").textContent);
|
||||
const initialSessionId = {json.dumps(initial, ensure_ascii=False)};
|
||||
|
||||
const sessionSearch = document.getElementById("sessionSearch");
|
||||
const sessionList = document.getElementById("sessionList");
|
||||
const heroTitle = document.getElementById("heroTitle");
|
||||
const heroId = document.getElementById("heroId");
|
||||
const metaGrid = document.getElementById("metaGrid");
|
||||
const turnsEl = document.getElementById("turns");
|
||||
const turnFilter = document.getElementById("turnFilter");
|
||||
|
||||
let activeSessionId = initialSessionId || (summaries[0] ? summaries[0].execution_id : "");
|
||||
|
||||
function text(value) {{
|
||||
return value == null ? "" : String(value);
|
||||
}}
|
||||
|
||||
function escapeHtml(value) {{
|
||||
return text(value)
|
||||
.replaceAll("&", "&")
|
||||
.replaceAll("<", "<")
|
||||
.replaceAll(">", ">")
|
||||
.replaceAll('"', """);
|
||||
}}
|
||||
|
||||
function prettyJson(value) {{
|
||||
return escapeHtml(JSON.stringify(value, null, 2));
|
||||
}}
|
||||
|
||||
function sessionMatches(summary, query) {{
|
||||
if (!query) return true;
|
||||
const haystack = [
|
||||
summary.execution_id,
|
||||
summary.start_display,
|
||||
summary.end_display,
|
||||
summary.log_file,
|
||||
...(summary.streams || []),
|
||||
...(summary.nodes || []),
|
||||
...(summary.models || []),
|
||||
].join("\\n").toLowerCase();
|
||||
return haystack.includes(query);
|
||||
}}
|
||||
|
||||
function renderSessionChooser() {{
|
||||
const query = sessionSearch.value.trim().toLowerCase();
|
||||
const filtered = summaries.filter((summary) => sessionMatches(summary, query));
|
||||
|
||||
sessionList.innerHTML = filtered
|
||||
.map((summary) => {{
|
||||
const active = summary.execution_id === activeSessionId ? " active" : "";
|
||||
const chips = [
|
||||
summary.start_display,
|
||||
`${{summary.turn_count}} turns`,
|
||||
...(summary.models || []).slice(0, 2),
|
||||
];
|
||||
return `
|
||||
<button type="button" class="session-card${{active}}" data-session-id="${{escapeHtml(summary.execution_id)}}">
|
||||
<div class="sid">${{escapeHtml(summary.execution_id)}}</div>
|
||||
<div class="meta">${{chips.map((chip) => `<span>${{escapeHtml(chip)}}</span>`).join("")}}</div>
|
||||
</button>
|
||||
`;
|
||||
}})
|
||||
.join("") || '<div class="empty">No matching sessions.</div>';
|
||||
}}
|
||||
|
||||
function renderMetaCard(label, value) {{
|
||||
return `<div class="meta-card"><span class="label">${{escapeHtml(label)}}</span>${{escapeHtml(value || "-")}}</div>`;
|
||||
}}
|
||||
|
||||
function renderMessage(message, index) {{
|
||||
const role = text(message.role || "unknown");
|
||||
const content = text(message.content || "");
|
||||
const toolCalls = message.tool_calls;
|
||||
return `
|
||||
<div class="message">
|
||||
<div class="message-header">
|
||||
<span class="badge badge-${{escapeHtml(role)}}">${{escapeHtml(role)}}</span>
|
||||
<span>message ${{index}}</span>
|
||||
</div>
|
||||
${{
|
||||
content
|
||||
? `<pre>${{escapeHtml(content)}}</pre>`
|
||||
: '<div class="empty">(empty message)</div>'
|
||||
}}
|
||||
${{
|
||||
toolCalls
|
||||
? `<details class="block"><summary>tool_calls</summary><pre>${{prettyJson(toolCalls)}}</pre></details>`
|
||||
: ""
|
||||
}}
|
||||
</div>
|
||||
`;
|
||||
}}
|
||||
|
||||
function renderToolCall(toolCall, index) {{
|
||||
const name = text(toolCall.tool_name || (toolCall.function || {{}}).name || "unknown");
|
||||
const error = !!toolCall.is_error;
|
||||
return `
|
||||
<div class="tool-block">
|
||||
<div class="message-header">
|
||||
<span class="badge badge-tool">tool ${{index}}</span>
|
||||
<span class="tool-name">${{escapeHtml(name)}}</span>
|
||||
<span class="status ${{error ? "error" : "ok"}}">${{error ? "error" : "ok"}}</span>
|
||||
</div>
|
||||
<pre>${{prettyJson(toolCall)}}</pre>
|
||||
</div>
|
||||
`;
|
||||
}}
|
||||
|
||||
function renderTurn(record) {{
|
||||
const tokenCounts = record.token_counts || {{}};
|
||||
const messages = Array.isArray(record.messages) ? record.messages : [];
|
||||
const toolCalls = Array.isArray(record.tool_calls) ? record.tool_calls : [];
|
||||
const toolResults = Array.isArray(record.tool_results) ? record.tool_results : [];
|
||||
const systemPrompt = text(record.system_prompt || "");
|
||||
const assistantText = text(record.assistant_text || "");
|
||||
const parseError = text(record._parse_error || "");
|
||||
|
||||
return `
|
||||
<section class="turn">
|
||||
<div class="turn-head">
|
||||
<div class="turn-title">Iteration ${{escapeHtml(record.iteration ?? "?")}}</div>
|
||||
<div class="turn-meta">
|
||||
<span>${{escapeHtml(record.timestamp || "-")}}</span>
|
||||
<span>node=${{escapeHtml(record.node_id || "-")}}</span>
|
||||
<span>stream=${{escapeHtml(record.stream_id || "-")}}</span>
|
||||
<span>model=${{escapeHtml(tokenCounts.model || "-")}}</span>
|
||||
<span>stop=${{escapeHtml(tokenCounts.stop_reason || "-")}}</span>
|
||||
<span>in=${{escapeHtml(tokenCounts.input ?? "-")}}</span>
|
||||
<span>out=${{escapeHtml(tokenCounts.output ?? "-")}}</span>
|
||||
</div>
|
||||
</div>
|
||||
${{
|
||||
systemPrompt
|
||||
? `<details class="block" open><summary>System prompt</summary><pre>${{escapeHtml(systemPrompt)}}</pre></details>`
|
||||
: ""
|
||||
}}
|
||||
${{
|
||||
messages.length
|
||||
? `<details class="block" open><summary>Input messages (${{messages.length}})</summary>${{messages.map((message, index) => renderMessage(message, index + 1)).join("")}}</details>`
|
||||
: ""
|
||||
}}
|
||||
<details class="block" open>
|
||||
<summary>Assistant output</summary>
|
||||
<pre>${{escapeHtml(assistantText)}}</pre>
|
||||
</details>
|
||||
${{
|
||||
toolCalls.length
|
||||
? `<details class="block" open><summary>Tool calls (${{toolCalls.length}})</summary>${{toolCalls.map((toolCall, index) => renderToolCall(toolCall, index + 1)).join("")}}</details>`
|
||||
: ""
|
||||
}}
|
||||
${{
|
||||
toolResults.length
|
||||
? `<details class="block"><summary>Tool results (${{toolResults.length}})</summary><pre>${{prettyJson(toolResults)}}</pre></details>`
|
||||
: ""
|
||||
}}
|
||||
${{
|
||||
parseError
|
||||
? `<details class="block"><summary>Parse error</summary><pre>${{prettyJson(record)}}</pre></details>`
|
||||
: ""
|
||||
}}
|
||||
</section>
|
||||
`;
|
||||
}}
|
||||
|
||||
function renderSession(sessionId) {{
|
||||
activeSessionId = sessionId;
|
||||
const summary = summaries.find((entry) => entry.execution_id === sessionId);
|
||||
const records = recordsBySession[sessionId] || [];
|
||||
|
||||
renderSessionChooser();
|
||||
|
||||
if (!summary) {{
|
||||
heroTitle.textContent = "No session selected";
|
||||
heroId.textContent = "";
|
||||
metaGrid.innerHTML = "";
|
||||
turnsEl.innerHTML = '<div class="empty">No session data available.</div>';
|
||||
return;
|
||||
}}
|
||||
|
||||
heroTitle.textContent = "LLM Debug Session";
|
||||
heroId.textContent = summary.execution_id;
|
||||
metaGrid.innerHTML = [
|
||||
renderMetaCard("Started", summary.start_display),
|
||||
renderMetaCard("Ended", summary.end_display),
|
||||
renderMetaCard("Turns", String(summary.turn_count)),
|
||||
renderMetaCard("Streams", (summary.streams || []).join(", ")),
|
||||
renderMetaCard("Nodes", (summary.nodes || []).join(", ")),
|
||||
renderMetaCard("Models", (summary.models || []).join(", ")),
|
||||
renderMetaCard("Source file", summary.log_file),
|
||||
].join("");
|
||||
|
||||
turnsEl.innerHTML = records.length
|
||||
? records.map((record) => renderTurn(record)).join("")
|
||||
: '<div class="empty">This session has no turn records.</div>';
|
||||
|
||||
applyTurnFilter();
|
||||
history.replaceState(null, "", `#${{encodeURIComponent(sessionId)}}`);
|
||||
}}
|
||||
|
||||
function applyTurnFilter() {{
|
||||
const query = turnFilter.value.trim().toLowerCase();
|
||||
for (const turn of document.querySelectorAll(".turn")) {{
|
||||
const visible = !query || turn.textContent.toLowerCase().includes(query);
|
||||
turn.classList.toggle("hidden", !visible);
|
||||
}}
|
||||
}}
|
||||
|
||||
sessionSearch.addEventListener("input", renderSessionChooser);
|
||||
sessionList.addEventListener("click", (event) => {{
|
||||
const card = event.target.closest(".session-card");
|
||||
if (!card) return;
|
||||
renderSession(card.dataset.sessionId);
|
||||
}});
|
||||
turnFilter.addEventListener("input", applyTurnFilter);
|
||||
document.getElementById("expandAll").addEventListener("click", () => {{
|
||||
for (const details of document.querySelectorAll("details")) details.open = true;
|
||||
}});
|
||||
document.getElementById("collapseAll").addEventListener("click", () => {{
|
||||
for (const details of document.querySelectorAll("details")) details.open = false;
|
||||
}});
|
||||
|
||||
const hashSession = decodeURIComponent(window.location.hash.replace(/^#/, ""));
|
||||
const bootSession = recordsBySession[hashSession] ? hashSession : activeSessionId;
|
||||
renderSessionChooser();
|
||||
renderSession(bootSession);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
def _write_report(html_report: str, output: Path | None) -> Path:
|
||||
if output is not None:
|
||||
output.parent.mkdir(parents=True, exist_ok=True)
|
||||
output.write_text(html_report, encoding="utf-8")
|
||||
return output
|
||||
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w",
|
||||
encoding="utf-8",
|
||||
prefix="hive_llm_debug_",
|
||||
suffix=".html",
|
||||
delete=False,
|
||||
dir="/tmp",
|
||||
) as handle:
|
||||
handle.write(html_report)
|
||||
return Path(handle.name)
|
||||
|
||||
|
||||
def main() -> int:
    """Build the LLM debug report and, unless disabled, open it in a browser.

    Returns:
        0 on success; 1 when the chosen session id is not among the grouped
        sessions.
    """
    args = _parse_args()
    logs_dir = args.logs_dir.expanduser()
    summaries, sessions = _group_sessions(_discover_records(logs_dir, args.limit_files))

    # An explicitly requested session wins; otherwise use the first summary.
    if args.session:
        initial_session_id = args.session
    elif summaries:
        initial_session_id = summaries[0].execution_id
    else:
        initial_session_id = ""

    # An empty id means "nothing to select" and is not treated as an error.
    unknown_session = bool(initial_session_id) and initial_session_id not in sessions
    if unknown_session:
        print(f"session not found: {initial_session_id}")
        return 1

    output_path = _write_report(
        _render_html(summaries, sessions, initial_session_id),
        args.output,
    )
    print(output_path)

    if args.no_open:
        return 0
    webbrowser.open(output_path.resolve().as_uri())
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
+102
-21
@@ -336,7 +336,7 @@ def list_agent_tools(
|
||||
output_schema: str = "simple",
|
||||
group: str = "all",
|
||||
) -> str:
|
||||
"""Discover tools available for agent building, grouped by category.
|
||||
"""Discover tools available for agent building, grouped by provider.
|
||||
|
||||
Connects to each MCP server, lists tools, then disconnects. Use this
|
||||
BEFORE designing an agent to know exactly which tools exist. Only use
|
||||
@@ -348,11 +348,12 @@ def list_agent_tools(
|
||||
to see what tools that specific agent has access to.
|
||||
output_schema: "simple" (default) returns name and description per tool.
|
||||
"full" also includes server and input_schema.
|
||||
group: "all" (default) returns every category. A prefix like "gmail"
|
||||
returns only that group's tools.
|
||||
group: "all" (default) returns all providers. A provider like "google"
|
||||
returns only that provider's tools. Legacy prefix filters (e.g. "gmail")
|
||||
are still supported.
|
||||
|
||||
Returns:
|
||||
JSON with tools grouped by prefix (e.g. gmail_*, slack_*).
|
||||
JSON with tools grouped by provider.
|
||||
"""
|
||||
if output_schema not in ("simple", "full"):
|
||||
return json.dumps(
|
||||
@@ -425,28 +426,108 @@ def list_agent_tools(
|
||||
except Exception as e:
|
||||
errors.append({"server": server_name, "error": str(e)})
|
||||
|
||||
# Group by prefix (e.g., gmail_, slack_, stripe_)
|
||||
groups: dict[str, list[dict]] = {}
|
||||
for t in sorted(all_tools, key=lambda x: x["name"]):
|
||||
parts = t["name"].split("_", 1)
|
||||
prefix = parts[0] if len(parts) > 1 else "general"
|
||||
groups.setdefault(prefix, []).append(t)
|
||||
def _normalize_provider_name(raw: str | None, fallback: str) -> str:
|
||||
"""Normalize provider names to stable top-level buckets."""
|
||||
text = (raw or fallback or "unknown").strip().lower()
|
||||
text = re.sub(r"[^a-z0-9]+", "_", text).strip("_")
|
||||
if not text:
|
||||
return "unknown"
|
||||
head = text.split("_", 1)[0]
|
||||
# Collapse Google families (google_docs/google_cloud/google-custom-search -> google)
|
||||
if head == "google":
|
||||
return "google"
|
||||
return head
|
||||
|
||||
# Filter to a specific group
|
||||
def _build_provider_metadata() -> tuple[
|
||||
dict[str, dict[str, dict[str, dict]]], dict[str, set[str]]
|
||||
]:
|
||||
"""Build tool->provider->credential metadata index from CredentialSpecs."""
|
||||
try:
|
||||
from aden_tools.credentials import CREDENTIAL_SPECS
|
||||
except ImportError:
|
||||
return {}, {}
|
||||
|
||||
tool_provider_auth: dict[str, dict[str, dict[str, dict]]] = {}
|
||||
tool_providers: dict[str, set[str]] = {}
|
||||
|
||||
for cred_name, spec in CREDENTIAL_SPECS.items():
|
||||
provider_hint = spec.aden_provider_name or spec.credential_group or spec.credential_id
|
||||
provider = _normalize_provider_name(provider_hint, fallback=cred_name)
|
||||
auth_entry = {
|
||||
"env_var": spec.env_var,
|
||||
"required": spec.required,
|
||||
"description": spec.description,
|
||||
"help_url": spec.help_url,
|
||||
"credential_id": spec.credential_id,
|
||||
"credential_key": spec.credential_key,
|
||||
}
|
||||
for tool_name in spec.tools:
|
||||
tool_providers.setdefault(tool_name, set()).add(provider)
|
||||
provider_map = tool_provider_auth.setdefault(tool_name, {})
|
||||
credential_map = provider_map.setdefault(provider, {})
|
||||
credential_map[cred_name] = auth_entry
|
||||
|
||||
return tool_provider_auth, tool_providers
|
||||
|
||||
tool_provider_auth, tool_providers = _build_provider_metadata()
|
||||
|
||||
def _group_by_provider(tools: list[dict]) -> dict[str, dict]:
    """Bucket tools under each provider they belong to.

    Every bucket has a ``"tools"`` list (sorted by name) and an
    ``"authorization"`` mapping (sorted by credential name). Tools with no
    known provider are placed under ``"no_provider"``. Buckets are returned
    sorted by provider name.
    """
    buckets: dict[str, dict] = {}

    for tool in sorted(tools, key=lambda item: (item["name"], item["server"])):
        name = tool["name"]
        owners = sorted(tool_providers.get(name, [])) or ["no_provider"]

        entry = {"name": name, "description": tool["description"]}
        if output_schema == "full":
            entry.update(server=tool["server"], input_schema=tool["input_schema"])

        auth_by_provider = tool_provider_auth.get(name, {})
        for owner in owners:
            bucket = buckets.setdefault(owner, {"authorization": {}, "tools": []})
            bucket["tools"].append(entry)
            # Merge the credentials this tool needs from this provider.
            bucket["authorization"].update(auth_by_provider.get(owner, {}))

    for bucket in buckets.values():
        bucket["tools"] = sorted(bucket["tools"], key=lambda item: item["name"])
        bucket["authorization"] = {
            cred: bucket["authorization"][cred] for cred in sorted(bucket["authorization"])
        }

    return {provider: buckets[provider] for provider in sorted(buckets)}
|
||||
|
||||
provider_groups = _group_by_provider(all_tools)
|
||||
|
||||
# Filter to a specific provider (preferred) or legacy prefix (fallback)
|
||||
if group != "all":
|
||||
groups = {group: groups[group]} if group in groups else {}
|
||||
if group in provider_groups:
|
||||
provider_groups = {group: provider_groups[group]}
|
||||
else:
|
||||
prefixed_tools = []
|
||||
for t in all_tools:
|
||||
parts = t["name"].split("_", 1)
|
||||
prefix = parts[0] if len(parts) > 1 else "general"
|
||||
if prefix == group:
|
||||
prefixed_tools.append(t)
|
||||
provider_groups = _group_by_provider(prefixed_tools)
|
||||
|
||||
# Apply output schema
|
||||
if output_schema == "simple":
|
||||
groups = {
|
||||
prefix: [{"name": t["name"], "description": t["description"]} for t in tools]
|
||||
for prefix, tools in groups.items()
|
||||
}
|
||||
|
||||
all_names = sorted(t["name"] for tools in groups.values() for t in tools)
|
||||
all_names = sorted({t["name"] for p in provider_groups.values() for t in p["tools"]})
|
||||
result: dict = {
|
||||
"total": len(all_names),
|
||||
"tools_by_category": groups,
|
||||
"tools_by_provider": provider_groups,
|
||||
"tools_by_category": provider_groups, # backward-compat alias
|
||||
"all_tool_names": all_names,
|
||||
}
|
||||
if errors:
|
||||
|
||||
@@ -187,25 +187,15 @@ def _register_verified(
|
||||
register_email(mcp, credentials=credentials)
|
||||
register_gmail(mcp, credentials=credentials)
|
||||
register_hubspot(mcp, credentials=credentials)
|
||||
register_intercom(mcp, credentials=credentials)
|
||||
register_apollo(mcp, credentials=credentials)
|
||||
register_brevo(mcp, credentials=credentials)
|
||||
register_bigquery(mcp, credentials=credentials)
|
||||
register_calcom(mcp, credentials=credentials)
|
||||
register_calendar(mcp, credentials=credentials)
|
||||
register_discord(mcp, credentials=credentials)
|
||||
register_exa_search(mcp, credentials=credentials)
|
||||
register_news(mcp, credentials=credentials)
|
||||
register_razorpay(mcp, credentials=credentials)
|
||||
register_serpapi(mcp, credentials=credentials)
|
||||
register_slack(mcp, credentials=credentials)
|
||||
register_stripe(mcp, credentials=credentials)
|
||||
register_telegram(mcp, credentials=credentials)
|
||||
register_vision(mcp, credentials=credentials)
|
||||
register_google_docs(mcp, credentials=credentials)
|
||||
register_google_maps(mcp, credentials=credentials)
|
||||
register_account_info(mcp, credentials=credentials)
|
||||
register_postgres(mcp, credentials=credentials)
|
||||
|
||||
|
||||
def _register_unverified(
|
||||
@@ -225,6 +215,16 @@ def _register_unverified(
|
||||
register_attio(mcp, credentials=credentials)
|
||||
register_aws_s3(mcp, credentials=credentials)
|
||||
register_azure_sql(mcp, credentials=credentials)
|
||||
register_intercom(mcp, credentials=credentials)
|
||||
register_apollo(mcp, credentials=credentials)
|
||||
register_brevo(mcp, credentials=credentials)
|
||||
register_bigquery(mcp, credentials=credentials)
|
||||
register_calcom(mcp, credentials=credentials)
|
||||
register_razorpay(mcp, credentials=credentials)
|
||||
register_serpapi(mcp, credentials=credentials)
|
||||
register_vision(mcp, credentials=credentials)
|
||||
register_stripe(mcp, credentials=credentials)
|
||||
register_postgres(mcp, credentials=credentials)
|
||||
register_calendly(mcp, credentials=credentials)
|
||||
register_cloudinary(mcp, credentials=credentials)
|
||||
register_confluence(mcp, credentials=credentials)
|
||||
|
||||
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test tool to visualize the Queen's building phase system prompt."""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add project root to path
|
||||
project_root = Path(__file__).parent.parent
|
||||
sys.path.insert(0, str(project_root / "core"))
|
||||
|
||||
from framework.agents.hive_coder.nodes import (
|
||||
_queen_identity,
|
||||
_agent_builder_knowledge,
|
||||
_gcu_building_section,
|
||||
_queen_tools_docs,
|
||||
_queen_behavior,
|
||||
_queen_phase_7,
|
||||
_queen_style,
|
||||
_appendices,
|
||||
queen_node,
|
||||
_is_gcu_enabled,
|
||||
)
|
||||
|
||||
|
||||
def print_section_header(title: str, char: str = "=") -> None:
    """Print ``title`` framed by rules made of ``char`` repeated 80 times."""
    rule = char * 80
    # A blank line before and after sets the banner apart from nearby output.
    print(f"\n{rule}", f" {title}", f"{rule}\n", sep="\n")
|
||||
|
||||
|
||||
def print_subsection_header(title: str) -> None:
    """Print ``title`` framed by two 80-character dashed rules."""
    divider = "-" * 80
    for line in (f"\n{divider}", f" {title}", f"{divider}\n"):
        print(line)
|
||||
|
||||
|
||||
def count_lines(text: str) -> int:
    """Return how many ``\\n``-separated lines of ``text`` contain non-whitespace."""
    return sum(1 for segment in text.split("\n") if segment.strip())
|
||||
|
||||
|
||||
def analyze_prompt():
    """Print a size breakdown of the Queen's building-phase prompt.

    Reports whether GCU is enabled, the size of each prompt section (chars,
    non-empty lines, share of total chars) and a preview of the first 200
    characters of every non-blank section.

    Returns:
        The concatenation of all sections, in the order listed below.
    """
    print_section_header("QUEEN BUILDING PHASE PROMPT ANALYZER")

    # Show GCU status
    print(f"GCU Enabled: {_is_gcu_enabled()}")
    gcu_chars = len(_gcu_building_section)
    gcu_lines = count_lines(_gcu_building_section)
    print(f"GCU Section Length: {gcu_chars} chars, {gcu_lines} lines")

    # Sections in the order they are concatenated into the full prompt.
    sections = [
        ("_queen_identity", _queen_identity),
        ("_queen_style", _queen_style),
        ("_agent_builder_knowledge", _agent_builder_knowledge),
        ("_gcu_building_section", _gcu_building_section),
        ("_queen_tools_docs", _queen_tools_docs),
        ("_queen_behavior", _queen_behavior),
        ("_queen_phase_7", _queen_phase_7),
        ("_appendices", _appendices),
    ]
    full_prompt = "".join(text for _, text in sections)
    full_lines = count_lines(full_prompt)

    print(f"\nFull Prompt Length: {len(full_prompt)} chars, {full_lines} lines")

    # Show section breakdown
    print_subsection_header("SECTION BREAKDOWN")
    separator = "-" * 60
    print(f"{'Section':<30} {'Chars':>10} {'Lines':>10} {'%':>8}")
    print(separator)

    total_chars = sum(len(text) for _, text in sections)
    for name, text in sections:
        size = len(text)
        # Guard against division by zero when every section is empty.
        share = (size / total_chars * 100) if total_chars > 0 else 0
        print(f"{name:<30} {size:>10} {count_lines(text):>10} {share:>7.1f}%")

    print(separator)
    print(f"{'TOTAL':<30} {total_chars:>10} {count_lines(full_prompt):>10} {'100.0':>7}%")

    # Show prompt structure
    print_subsection_header("PROMPT STRUCTURE (First 200 chars of each section)")

    for name, text in sections:
        if not text.strip():
            continue
        print(f"\n### {name} ###")
        suffix = "..." if len(text) > 200 else ""
        print(text[:200].strip() + suffix)

    return full_prompt
|
||||
|
||||
|
||||
def print_full_prompt():
    """Print ``queen_node.system_prompt`` between banners, then its length in characters."""
    print_section_header("FULL QUEEN SYSTEM PROMPT")

    prompt_text = queen_node.system_prompt
    char_total = len(prompt_text)
    print(prompt_text)

    print_section_header("END OF PROMPT")
    print(f"Total length: {char_total} characters")
|
||||
|
||||
|
||||
def print_gcu_section():
    """Print the GCU building section between banners, or a placeholder when it is empty."""
    print_section_header("GCU BUILDING SECTION (First-Class)")

    # An empty section is falsy, which selects the placeholder text.
    print(_gcu_building_section or "(GCU is disabled or section is empty)")

    print_section_header("END OF GCU SECTION")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="Analyze Queen's building phase prompt")
    cli.add_argument("--full", "-f", action="store_true", help="Print the full system prompt")
    cli.add_argument("--gcu", "-g", action="store_true", help="Print just the GCU section")
    # This flag already defaults to True and is never read below; the
    # structure analysis simply runs when neither other flag is given.
    cli.add_argument(
        "--structure",
        "-s",
        action="store_true",
        default=True,
        help="Show prompt structure analysis (default)",
    )
    options = cli.parse_args()

    # --full takes precedence over --gcu; structure analysis is the fallback.
    if options.full:
        action = print_full_prompt
    elif options.gcu:
        action = print_gcu_section
    else:
        action = analyze_prompt
    action()
|
||||
@@ -0,0 +1,166 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
import types
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _load_coder_tools_server():
    """Load ``coder_tools_server.py`` from the directory above this file's directory.

    The file is executed under the module name ``coder_tools_server_under_test``
    and the resulting module object is returned.
    """
    server_file = Path(__file__).resolve().parents[1] / "coder_tools_server.py"
    module_spec = importlib.util.spec_from_file_location(
        "coder_tools_server_under_test",
        server_file,
    )
    assert module_spec is not None and module_spec.loader is not None
    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded
|
||||
|
||||
|
||||
def _install_fake_framework(monkeypatch, tools_by_server: dict[str, list[dict]]) -> None:
|
||||
framework_mod = types.ModuleType("framework")
|
||||
runner_mod = types.ModuleType("framework.runner")
|
||||
mcp_client_mod = types.ModuleType("framework.runner.mcp_client")
|
||||
tool_registry_mod = types.ModuleType("framework.runner.tool_registry")
|
||||
|
||||
class FakeMCPServerConfig:
|
||||
def __init__(self, **kwargs):
|
||||
self.name = kwargs.get("name", "")
|
||||
|
||||
class FakeTool:
|
||||
def __init__(self, name: str, description: str = "", input_schema: dict | None = None):
|
||||
self.name = name
|
||||
self.description = description
|
||||
self.input_schema = input_schema or {}
|
||||
|
||||
class FakeMCPClient:
|
||||
def __init__(self, config):
|
||||
self._server_name = config.name
|
||||
|
||||
def connect(self):
|
||||
return None
|
||||
|
||||
def list_tools(self):
|
||||
items = tools_by_server.get(self._server_name, [])
|
||||
return [
|
||||
FakeTool(
|
||||
name=item["name"],
|
||||
description=item.get("description", ""),
|
||||
input_schema=item.get("input_schema", {}),
|
||||
)
|
||||
for item in items
|
||||
]
|
||||
|
||||
def disconnect(self):
|
||||
return None
|
||||
|
||||
class FakeToolRegistry:
|
||||
@staticmethod
|
||||
def resolve_mcp_stdio_config(config: dict, _config_dir: Path) -> dict:
|
||||
return config
|
||||
|
||||
mcp_client_mod.MCPClient = FakeMCPClient
|
||||
mcp_client_mod.MCPServerConfig = FakeMCPServerConfig
|
||||
tool_registry_mod.ToolRegistry = FakeToolRegistry
|
||||
|
||||
framework_mod.runner = runner_mod
|
||||
runner_mod.mcp_client = mcp_client_mod
|
||||
runner_mod.tool_registry = tool_registry_mod
|
||||
|
||||
monkeypatch.setitem(sys.modules, "framework", framework_mod)
|
||||
monkeypatch.setitem(sys.modules, "framework.runner", runner_mod)
|
||||
monkeypatch.setitem(sys.modules, "framework.runner.mcp_client", mcp_client_mod)
|
||||
monkeypatch.setitem(sys.modules, "framework.runner.tool_registry", tool_registry_mod)
|
||||
|
||||
|
||||
def _call_list_agent_tools(mod, **kwargs) -> str:
|
||||
tool = mod.mcp._tool_manager._tools["list_agent_tools"]
|
||||
return tool.fn(**kwargs)
|
||||
|
||||
|
||||
def test_list_agent_tools_groups_by_provider_and_keeps_uncredentialed(monkeypatch, tmp_path):
    """Tools are bucketed per provider; tools without credentials land in ``no_provider``."""
    fake_tools = [
        {"name": "gmail_list_messages", "description": "Read Gmail"},
        {"name": "calendar_list_events", "description": "Read calendar"},
        {"name": "send_email", "description": "Send email"},
        {"name": "web_scrape", "description": "Scrape a page"},
    ]
    _install_fake_framework(monkeypatch, tools_by_server={"fake-server": fake_tools})
    mod = _load_coder_tools_server()
    mod.PROJECT_ROOT = str(tmp_path)

    server_config = {"fake-server": {"transport": "stdio", "command": "noop", "args": []}}
    (tmp_path / "mcp_servers.json").write_text(json.dumps(server_config), encoding="utf-8")

    payload = json.loads(
        _call_list_agent_tools(
            mod,
            server_config_path="mcp_servers.json",
            output_schema="simple",
            group="all",
        )
    )
    providers = payload["tools_by_provider"]

    def tool_names(provider: str) -> set[str]:
        return {tool["name"] for tool in providers[provider]["tools"]}

    for expected_provider in ("google", "resend", "no_provider"):
        assert expected_provider in providers

    # send_email is expected under both google and resend.
    google_tools = tool_names("google")
    for expected_tool in ("gmail_list_messages", "calendar_list_events", "send_email"):
        assert expected_tool in google_tools
    assert providers["google"]["authorization"]

    assert tool_names("resend") == {"send_email"}
    assert providers["resend"]["authorization"]

    assert "web_scrape" in tool_names("no_provider")
    assert providers["no_provider"]["authorization"] == {}
|
||||
|
||||
|
||||
def test_list_agent_tools_provider_filter_and_legacy_prefix_filter(monkeypatch, tmp_path):
    """Filtering by provider (``google``) or by legacy prefix (``gmail``) yields the same bucket."""
    fake_tools = [
        {"name": "gmail_list_messages", "description": "Read Gmail"},
        {"name": "web_scrape", "description": "Scrape a page"},
    ]
    _install_fake_framework(monkeypatch, tools_by_server={"fake-server": fake_tools})
    mod = _load_coder_tools_server()
    mod.PROJECT_ROOT = str(tmp_path)

    server_config = {"fake-server": {"transport": "stdio", "command": "noop", "args": []}}
    (tmp_path / "mcp_servers.json").write_text(json.dumps(server_config), encoding="utf-8")

    # Provider name filter.
    provider_data = json.loads(
        _call_list_agent_tools(
            mod,
            server_config_path="mcp_servers.json",
            output_schema="simple",
            group="google",
        )
    )
    assert list(provider_data["tools_by_provider"]) == ["google"]
    assert provider_data["all_tool_names"] == ["gmail_list_messages"]

    # Legacy tool-name prefix filter.
    legacy_data = json.loads(
        _call_list_agent_tools(
            mod,
            server_config_path="mcp_servers.json",
            output_schema="simple",
            group="gmail",
        )
    )
    assert list(legacy_data["tools_by_provider"]) == ["google"]
    assert legacy_data["all_tool_names"] == ["gmail_list_messages"]
|
||||
Reference in New Issue
Block a user