refactor: remove all old unused skills
@@ -1,9 +0,0 @@
{
  "mcpServers": {
    "agent-builder": {
      "command": "uv",
      "args": ["run", "--directory", "core", "-m", "framework.mcp.agent_builder_server"],
      "disabled": false
    }
  }
}
@@ -1 +0,0 @@
../../.claude/skills/hive
@@ -1 +0,0 @@
../../.claude/skills/hive-concepts
@@ -1 +0,0 @@
../../.claude/skills/hive-create
@@ -1 +0,0 @@
../../.claude/skills/hive-credentials
@@ -1 +0,0 @@
../../.claude/skills/hive-patterns
@@ -1 +0,0 @@
../../.claude/skills/hive-test
@@ -1,5 +0,0 @@
---
description: hive-concepts
---

use hive-concepts skill
@@ -1,5 +0,0 @@
---
description: hive-create
---

use hive-create skill
@@ -1,5 +0,0 @@
---
description: hive-credentials
---

use hive-credentials skill
@@ -1,5 +0,0 @@
---
description: hive-patterns
---

use hive-patterns skill
@@ -1,5 +0,0 @@
---
description: hive-test
---

use hive-test skill
@@ -1,5 +0,0 @@
---
description: hive
---

use hive skill
@@ -1 +0,0 @@
../../.claude/skills/hive
@@ -1 +0,0 @@
../../.claude/skills/hive-concepts
@@ -1 +0,0 @@
../../.claude/skills/hive-create
@@ -1 +0,0 @@
../../.claude/skills/hive-credentials
@@ -1 +0,0 @@
../../.claude/skills/hive-patterns
@@ -1 +0,0 @@
../../.claude/skills/hive-test
@@ -1,399 +0,0 @@
---
name: hive-concepts
description: Core concepts for goal-driven agents - architecture, node types (event_loop, function), tool discovery, and workflow overview. Use when starting agent development or need to understand agent fundamentals.
license: Apache-2.0
metadata:
  author: hive
  version: "2.0"
  type: foundational
  part_of: hive
---

# Building Agents - Core Concepts

Foundational knowledge for building goal-driven agents as Python packages.

## Architecture: Python Services (Not JSON Configs)

Agents are built as Python packages:

```
exports/my_agent/
├── __init__.py        # Package exports
├── __main__.py        # CLI (run, info, validate, shell)
├── agent.py           # Graph construction (goal, edges, agent class)
├── nodes/__init__.py  # Node definitions (NodeSpec)
├── config.py          # Runtime config
└── README.md          # Documentation
```

**Key Principle: Agent is visible and editable during build**

- Files created immediately as components are approved
- User can watch files grow in their editor
- No session state - just direct file writes
- No "export" step - agent is ready when build completes

## Core Concepts

### Goal

Success criteria and constraints (written to agent.py)

```python
goal = Goal(
    id="research-goal",
    name="Technical Research Agent",
    description="Research technical topics thoroughly",
    success_criteria=[
        SuccessCriterion(
            id="completeness",
            description="Cover all aspects of topic",
            metric="coverage_score",
            target=">=0.9",
            weight=0.4,
        ),
        # 3-5 success criteria total
    ],
    constraints=[
        Constraint(
            id="accuracy",
            description="All information must be verified",
            constraint_type="hard",
            category="quality",
        ),
        # 1-5 constraints total
    ],
)
```

### Node

Unit of work (written to nodes/__init__.py)

**Node Types:**

- `event_loop` — Multi-turn streaming loop with tool execution and judge-based evaluation. Works with or without tools.
- `function` — Deterministic Python operations. No LLM involved.

```python
search_node = NodeSpec(
    id="search-web",
    name="Search Web",
    description="Search for information and extract results",
    node_type="event_loop",
    input_keys=["query"],
    output_keys=["search_results"],
    system_prompt="Search the web for: {query}. Use the web_search tool to find results, then call set_output to store them.",
    tools=["web_search"],
)
```

**NodeSpec Fields for Event Loop Nodes:**

| Field | Default | Description |
|-------|---------|-------------|
| `client_facing` | `False` | If True, streams output to user and blocks for input between turns |
| `nullable_output_keys` | `[]` | Output keys that may remain unset (for mutually exclusive outputs) |
| `max_node_visits` | `1` | Max times this node executes per run. Set >1 for feedback loop targets |

### Edge

Connection between nodes (written to agent.py)

**Edge Conditions:**

- `on_success` — Proceed if node succeeds (most common)
- `on_failure` — Handle errors
- `always` — Always proceed
- `conditional` — Based on expression evaluating node output

**Edge Priority:**

Priority controls evaluation order when multiple edges leave the same node. Higher priority edges are evaluated first. Use negative priority for feedback edges (edges that loop back to earlier nodes).

```python
# Forward edge (evaluated first)
EdgeSpec(
    id="review-to-campaign",
    source="review",
    target="campaign-builder",
    condition=EdgeCondition.CONDITIONAL,
    condition_expr="output.get('approved_contacts') is not None",
    priority=1,
)

# Feedback edge (evaluated after forward edges)
EdgeSpec(
    id="review-feedback",
    source="review",
    target="extractor",
    condition=EdgeCondition.CONDITIONAL,
    condition_expr="output.get('redo_extraction') is not None",
    priority=-1,
)
```

### Client-Facing Nodes

For multi-turn conversations with the user, set `client_facing=True` on a node. The node will:

- Stream its LLM output directly to the end user
- Block for user input between conversational turns
- Resume when new input is injected via `inject_event()`

```python
intake_node = NodeSpec(
    id="intake",
    name="Intake",
    description="Gather requirements from the user",
    node_type="event_loop",
    client_facing=True,
    input_keys=[],
    output_keys=["repo_url", "project_url"],
    system_prompt="You are the intake agent. Ask the user for the repo URL and project URL.",
)
```

> **Legacy Note:** The old `pause_nodes` / `entry_points` pattern still works but `client_facing=True` is preferred for new agents.

**STEP 1 / STEP 2 Prompt Pattern:** For client-facing nodes, structure the system prompt with two explicit phases:

```python
system_prompt="""\
**STEP 1 — Respond to the user (text only, NO tool calls):**
[Present information, ask questions, etc.]

**STEP 2 — After the user responds, call set_output:**
[Call set_output with the structured outputs]
"""
```

This prevents the LLM from calling `set_output` prematurely before the user has had a chance to respond.

### Node Design: Fewer, Richer Nodes

Prefer fewer nodes that do more work over many thin single-purpose nodes:

- **Bad**: 8 thin nodes (parse query → search → fetch → evaluate → synthesize → write → check → save)
- **Good**: 4 rich nodes (intake → research → review → report)

Why: Each node boundary requires serializing outputs and passing context. Fewer nodes means the LLM retains full context of its work within the node. A research node that searches, fetches, and analyzes keeps all the source material in its conversation history.

### nullable_output_keys for Cross-Edge Inputs

When a node receives inputs that only arrive on certain edges (e.g., `feedback` only comes from a review → research feedback loop, not from intake → research), mark those keys as `nullable_output_keys`:

```python
research_node = NodeSpec(
    id="research",
    input_keys=["research_brief", "feedback"],
    nullable_output_keys=["feedback"],  # Not present on first visit
    max_node_visits=3,
    ...
)
```

## Event Loop Architecture Concepts

### How EventLoopNode Works

An event loop node runs a multi-turn loop:

1. LLM receives system prompt + conversation history
2. LLM responds (text and/or tool calls)
3. Tool calls are executed, results added to conversation
4. Judge evaluates: ACCEPT (exit loop), RETRY (loop again), or ESCALATE
5. Repeat until judge ACCEPTs or max_iterations reached
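
For intuition, the loop shape looks roughly like this (a minimal sketch; `llm`, `judge`, `execute_tool`, and `conversation.outputs` are illustrative stand-ins, not the framework's actual internals):

```python
async def run_event_loop(llm, judge, conversation, max_iterations=50):
    for _ in range(max_iterations):
        response = await llm.complete(conversation)    # steps 1-2: prompt, respond
        conversation.append(response)
        for call in response.tool_calls:               # step 3: execute tool calls
            result = await execute_tool(call)
            conversation.append(result)
        verdict = await judge.evaluate(conversation)   # step 4: judge decides
        if verdict == "ACCEPT":
            return conversation.outputs                # exit the loop
        if verdict == "ESCALATE":
            raise RuntimeError("judge escalated")
        # RETRY: loop again with the judge's feedback in history
    raise RuntimeError("max_iterations reached")       # step 5: hard stop
```
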
### EventLoopNode Runtime

EventLoopNodes are **auto-created** by `GraphExecutor` at runtime. You do NOT need to manually register them. Both `GraphExecutor` (direct) and `AgentRuntime` / `create_agent_runtime()` handle event_loop nodes automatically.

```python
# Direct execution — executor auto-creates EventLoopNodes
from framework.graph.executor import GraphExecutor
from framework.runtime.core import Runtime

runtime = Runtime(storage_path)
executor = GraphExecutor(
    runtime=runtime,
    llm=llm,
    tools=tools,
    tool_executor=tool_executor,
    storage_path=storage_path,
)
result = await executor.execute(graph=graph, goal=goal, input_data=input_data)

# TUI execution — AgentRuntime also works
from framework.runtime.agent_runtime import create_agent_runtime
runtime = create_agent_runtime(
    graph=graph, goal=goal, storage_path=storage_path,
    entry_points=[...], llm=llm, tools=tools, tool_executor=tool_executor,
)
```

### set_output

Nodes produce structured outputs by calling `set_output(key, value)` — a synthetic tool injected by the framework. When the LLM calls `set_output`, the value is stored in the output accumulator and made available to downstream nodes via shared memory.

`set_output` is NOT a real tool — it is excluded from `real_tool_results`. For client-facing nodes, this means a turn where the LLM only calls `set_output` (no other tools) is treated as a conversational boundary and will block for user input.
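
For illustration, a turn that stores an output looks like this from the LLM's side (a sketch; the accumulator name is an assumption about the internal bookkeeping):

```python
# The LLM emits this as a tool call inside its turn:
set_output("search_results", [{"url": "https://...", "title": "..."}])

# Conceptually the framework then records:
#   output_accumulator["search_results"] = <value>
# and any downstream node listing "search_results" in input_keys receives it.
```
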
### JudgeProtocol

**The judge is the SOLE mechanism for acceptance decisions.** Do not add ad-hoc framework gating, output rollback, or premature rejection logic. If the LLM calls `set_output` too early, fix it with better prompts or a custom judge — not framework-level guards.

The judge controls when a node's loop exits:

- **Implicit judge** (default, no judge configured): ACCEPTs when the LLM finishes with no tool calls and all required output keys are set
- **SchemaJudge**: Validates outputs against a Pydantic model
- **Custom judges**: Implement `evaluate(context) -> JudgeVerdict`
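
A custom judge might look like the sketch below, built only from the `evaluate(context) -> JudgeVerdict` signature above; the `JudgeVerdict` fields and the `context.outputs` shape are assumptions, with a placeholder class standing in for the framework's real type:

```python
from dataclasses import dataclass

@dataclass
class JudgeVerdict:        # placeholder shape; the framework's class may differ
    decision: str          # "ACCEPT" | "RETRY" | "ESCALATE"
    feedback: str = ""

class MinimumSourcesJudge:
    """ACCEPT only once the node has produced at least three sources."""

    async def evaluate(self, context) -> JudgeVerdict:
        sources = context.outputs.get("sources") or []  # context shape assumed
        if len(sources) >= 3:
            return JudgeVerdict("ACCEPT")
        return JudgeVerdict(
            "RETRY",
            feedback=f"Only {len(sources)} sources found; gather at least 3.",
        )
```
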
### LoopConfig

Controls loop behavior:

- `max_iterations` (default 50) — prevents infinite loops
- `max_tool_calls_per_turn` (default 10) — limits tool calls per LLM response
- `tool_call_overflow_margin` (default 0.5) — wiggle room before discarding extra tool calls (50% means hard cutoff at 150% of limit)
- `stall_detection_threshold` (default 3) — detects repeated identical responses
- `max_history_tokens` (default 32000) — triggers conversation compaction
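
The `agent.py` later in this diff passes these settings as a plain dict on `GraphSpec`; a sketch with the defaults spelled out:

```python
loop_config = {
    "max_iterations": 50,
    "max_tool_calls_per_turn": 10,
    "tool_call_overflow_margin": 0.5,  # hard cutoff at 15 calls (150% of 10)
    "stall_detection_threshold": 3,
    "max_history_tokens": 32000,       # compaction trigger
}
```
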
### Data Tools (Spillover Management)

When tool results exceed the context window, the framework automatically saves them to a spillover directory and truncates with a hint. Nodes that produce or consume large data should include the data tools:

- `save_data(filename, data)` — Write data to a file in the data directory
- `load_data(filename, offset=0, limit=50)` — Read data with line-based pagination
- `list_data_files()` — List available data files
- `serve_file_to_user(filename, label="")` — Get a clickable file:// URI for the user

Note: `data_dir` is a framework-injected context parameter — the LLM never sees or passes it. `GraphExecutor.execute()` sets it per-execution via `contextvars`, so data tools and spillover always share the same session-scoped directory.

These are real MCP tools (not synthetic). Add them to nodes that handle large tool results:

```python
research_node = NodeSpec(
    ...
    tools=["web_search", "web_scrape", "load_data", "save_data", "list_data_files"],
)
```

### Fan-Out / Fan-In

Multiple ON_SUCCESS edges from the same source create parallel execution. All branches run concurrently via `asyncio.gather()`. Parallel event_loop nodes must have disjoint `output_keys`.
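
For example, two ON_SUCCESS edges from one source fan out into parallel branches (node and edge ids here are illustrative):

```python
from framework.graph import EdgeSpec, EdgeCondition

edges = [
    EdgeSpec(id="plan-to-search", source="plan", target="search",
             condition=EdgeCondition.ON_SUCCESS),
    EdgeSpec(id="plan-to-scrape", source="plan", target="scrape",
             condition=EdgeCondition.ON_SUCCESS),
]
# "search" and "scrape" run concurrently and must declare disjoint output_keys.
```
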
### max_node_visits

Controls how many times a node can execute in one graph run. Default is 1. Set higher for nodes that are targets of feedback edges (review-reject loops). Set 0 for unlimited (guarded by max_steps).

## Tool Discovery & Validation

**CRITICAL:** Before adding a node with tools, you MUST verify the tools exist.

Tools are provided by MCP servers. Never assume a tool exists - always discover dynamically.

### Step 1: Register MCP Server (if not already done)

```python
mcp__agent-builder__add_mcp_server(
    name="tools",
    transport="stdio",
    command="python",
    args='["mcp_server.py", "--stdio"]',
    cwd="../tools"
)
```

### Step 2: Discover Available Tools

```python
# List all tools from all registered servers
mcp__agent-builder__list_mcp_tools()

# Or list tools from a specific server
mcp__agent-builder__list_mcp_tools(server_name="tools")
```

### Step 3: Validate Before Adding Nodes

Before writing a node with `tools=[...]`:

1. Call `list_mcp_tools()` to get available tools
2. Check each tool in your node exists in the response
3. If a tool doesn't exist:
   - **DO NOT proceed** with the node
   - Inform the user: "The tool 'X' is not available. Available tools are: ..."
   - Ask if they want to use an alternative or proceed without the tool

### Tool Validation Anti-Patterns

- **Never assume a tool exists** - always call `list_mcp_tools()` first
- **Never write a node with unverified tools** - validate before writing
- **Never silently drop tools** - if a tool doesn't exist, inform the user
- **Never guess tool names** - use exact names from discovery response

## Workflow Overview: Incremental File Construction

```
1. CREATE PACKAGE → mkdir + write skeletons
2. DEFINE GOAL    → Write to agent.py + config.py
3. FOR EACH NODE:
   - Propose design (event_loop for LLM work, function for deterministic)
   - User approves
   - Write to nodes/__init__.py IMMEDIATELY
   - (Optional) Validate with test_node
4. CONNECT EDGES  → Update agent.py
   - Use priority for feedback edges (negative priority)
   - (Optional) Validate with validate_graph
5. FINALIZE       → Write agent class to agent.py
6. DONE - Agent ready at exports/my_agent/
```

**Files written immediately. MCP tools optional for validation/testing bookkeeping.**

## When to Use This Skill

Use hive-concepts when:

- Starting a new agent project and need to understand fundamentals
- Need to understand agent architecture before building
- Want to validate tool availability before proceeding
- Learning about node types, edges, and graph execution

**Next Steps:**

- Ready to build? → Use `hive-create` skill
- Need patterns and examples? → Use `hive-patterns` skill

## MCP Tools for Validation

After writing files, optionally use MCP tools for validation:

**test_node** - Validate node configuration with mock inputs
```python
mcp__agent-builder__test_node(
    node_id="search-web",
    test_input='{"query": "test query"}',
    mock_llm_response='{"results": "mock output"}'
)
```

**validate_graph** - Check graph structure
```python
mcp__agent-builder__validate_graph()
# Returns: unreachable nodes, missing connections, event_loop validation, etc.
```

**configure_loop** - Set event loop parameters
```python
mcp__agent-builder__configure_loop(
    max_iterations=50,
    max_tool_calls_per_turn=10,
    stall_detection_threshold=3,
    max_history_tokens=32000
)
```

**Key Point:** Files are written FIRST. MCP tools are for validation only.

## Related Skills

- **hive-create** - Step-by-step building process
- **hive-patterns** - Best practices: judges, feedback edges, fan-out, context management
- **hive** - Complete workflow orchestrator
- **hive-test** - Test and validate completed agents

File diff suppressed because it is too large
@@ -1,24 +0,0 @@
"""
Deep Research Agent - Interactive, rigorous research with TUI conversation.

Research any topic through multi-source web search, quality evaluation,
and synthesis. Features client-facing TUI interaction at key checkpoints
for user guidance and iterative deepening.
"""

from .agent import DeepResearchAgent, default_agent, goal, nodes, edges
from .config import RuntimeConfig, AgentMetadata, default_config, metadata

__version__ = "1.0.0"

__all__ = [
    "DeepResearchAgent",
    "default_agent",
    "goal",
    "nodes",
    "edges",
    "RuntimeConfig",
    "AgentMetadata",
    "default_config",
    "metadata",
]
@@ -1,241 +0,0 @@
"""
CLI entry point for Deep Research Agent.

Uses AgentRuntime for multi-entrypoint support with HITL pause/resume.
"""

import asyncio
import json
import logging
import sys
import click

from .agent import default_agent, DeepResearchAgent


def setup_logging(verbose=False, debug=False):
    """Configure logging for execution visibility."""
    if debug:
        level, fmt = logging.DEBUG, "%(asctime)s %(name)s: %(message)s"
    elif verbose:
        level, fmt = logging.INFO, "%(message)s"
    else:
        level, fmt = logging.WARNING, "%(levelname)s: %(message)s"
    logging.basicConfig(level=level, format=fmt, stream=sys.stderr)
    logging.getLogger("framework").setLevel(level)


@click.group()
@click.version_option(version="1.0.0")
def cli():
    """Deep Research Agent - Interactive, rigorous research with TUI conversation."""
    pass


@cli.command()
@click.option("--topic", "-t", type=str, required=True, help="Research topic")
@click.option("--mock", is_flag=True, help="Run in mock mode")
@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def run(topic, mock, quiet, verbose, debug):
    """Execute research on a topic."""
    if not quiet:
        setup_logging(verbose=verbose, debug=debug)

    context = {"topic": topic}

    result = asyncio.run(default_agent.run(context, mock_mode=mock))

    output_data = {
        "success": result.success,
        "steps_executed": result.steps_executed,
        "output": result.output,
    }
    if result.error:
        output_data["error"] = result.error

    click.echo(json.dumps(output_data, indent=2, default=str))
    sys.exit(0 if result.success else 1)


@cli.command()
@click.option("--mock", is_flag=True, help="Run in mock mode")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def tui(mock, verbose, debug):
    """Launch the TUI dashboard for interactive research."""
    setup_logging(verbose=verbose, debug=debug)

    try:
        from framework.tui.app import AdenTUI
    except ImportError:
        click.echo(
            "TUI requires the 'textual' package. Install with: pip install textual"
        )
        sys.exit(1)

    from pathlib import Path

    from framework.llm import LiteLLMProvider
    from framework.runner.tool_registry import ToolRegistry
    from framework.runtime.agent_runtime import create_agent_runtime
    from framework.runtime.event_bus import EventBus
    from framework.runtime.execution_stream import EntryPointSpec

    async def run_with_tui():
        agent = DeepResearchAgent()

        # Build graph and tools
        agent._event_bus = EventBus()
        agent._tool_registry = ToolRegistry()

        storage_path = Path.home() / ".hive" / "agents" / "deep_research_agent"
        storage_path.mkdir(parents=True, exist_ok=True)

        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
        if mcp_config_path.exists():
            agent._tool_registry.load_mcp_config(mcp_config_path)

        llm = None
        if not mock:
            llm = LiteLLMProvider(
                model=agent.config.model,
                api_key=agent.config.api_key,
                api_base=agent.config.api_base,
            )

        tools = list(agent._tool_registry.get_tools().values())
        tool_executor = agent._tool_registry.get_executor()
        graph = agent._build_graph()

        runtime = create_agent_runtime(
            graph=graph,
            goal=agent.goal,
            storage_path=storage_path,
            entry_points=[
                EntryPointSpec(
                    id="start",
                    name="Start Research",
                    entry_node="intake",
                    trigger_type="manual",
                    isolation_level="isolated",
                ),
            ],
            llm=llm,
            tools=tools,
            tool_executor=tool_executor,
        )

        await runtime.start()

        try:
            app = AdenTUI(runtime)
            await app.run_async()
        finally:
            await runtime.stop()

    asyncio.run(run_with_tui())


@cli.command()
@click.option("--json", "output_json", is_flag=True)
def info(output_json):
    """Show agent information."""
    info_data = default_agent.info()
    if output_json:
        click.echo(json.dumps(info_data, indent=2))
    else:
        click.echo(f"Agent: {info_data['name']}")
        click.echo(f"Version: {info_data['version']}")
        click.echo(f"Description: {info_data['description']}")
        click.echo(f"\nNodes: {', '.join(info_data['nodes'])}")
        click.echo(f"Client-facing: {', '.join(info_data['client_facing_nodes'])}")
        click.echo(f"Entry: {info_data['entry_node']}")
        click.echo(f"Terminal: {', '.join(info_data['terminal_nodes'])}")


@cli.command()
def validate():
    """Validate agent structure."""
    validation = default_agent.validate()
    if validation["valid"]:
        click.echo("Agent is valid")
        if validation["warnings"]:
            for warning in validation["warnings"]:
                click.echo(f"  WARNING: {warning}")
    else:
        click.echo("Agent has errors:")
        for error in validation["errors"]:
            click.echo(f"  ERROR: {error}")
    sys.exit(0 if validation["valid"] else 1)


@cli.command()
@click.option("--verbose", "-v", is_flag=True)
def shell(verbose):
    """Interactive research session (CLI, no TUI)."""
    asyncio.run(_interactive_shell(verbose))


async def _interactive_shell(verbose=False):
    """Async interactive shell."""
    setup_logging(verbose=verbose)

    click.echo("=== Deep Research Agent ===")
    click.echo("Enter a topic to research (or 'quit' to exit):\n")

    agent = DeepResearchAgent()
    await agent.start()

    try:
        while True:
            try:
                topic = await asyncio.get_event_loop().run_in_executor(
                    None, input, "Topic> "
                )
                if topic.lower() in ["quit", "exit", "q"]:
                    click.echo("Goodbye!")
                    break

                if not topic.strip():
                    continue

                click.echo("\nResearching...\n")

                result = await agent.trigger_and_wait("start", {"topic": topic})

                if result is None:
                    click.echo("\n[Execution timed out]\n")
                    continue

                if result.success:
                    output = result.output
                    if "report_content" in output:
                        click.echo("\n--- Report ---\n")
                        click.echo(output["report_content"])
                        click.echo("\n")
                    if "references" in output:
                        click.echo("--- References ---\n")
                        for ref in output.get("references", []):
                            click.echo(
                                f"  [{ref.get('number', '?')}] {ref.get('title', '')} - {ref.get('url', '')}"
                            )
                        click.echo("\n")
                else:
                    click.echo(f"\nResearch failed: {result.error}\n")

            except KeyboardInterrupt:
                click.echo("\nGoodbye!")
                break
            except Exception as e:
                click.echo(f"Error: {e}", err=True)
                import traceback

                traceback.print_exc()
    finally:
        await agent.stop()


if __name__ == "__main__":
    cli()
@@ -1,358 +0,0 @@
"""Agent graph construction for Deep Research Agent."""

from pathlib import Path

from framework.graph import EdgeSpec, EdgeCondition, Goal, SuccessCriterion, Constraint
from framework.graph.edge import GraphSpec
from framework.graph.executor import ExecutionResult
from framework.graph.checkpoint_config import CheckpointConfig
from framework.llm import LiteLLMProvider
from framework.runner.tool_registry import ToolRegistry
from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
from framework.runtime.execution_stream import EntryPointSpec

from .config import default_config, metadata
from .nodes import (
    intake_node,
    research_node,
    review_node,
    report_node,
)

# Goal definition
goal = Goal(
    id="rigorous-interactive-research",
    name="Rigorous Interactive Research",
    description=(
        "Research any topic by searching diverse sources, analyzing findings, "
        "and producing a cited report — with user checkpoints to guide direction."
    ),
    success_criteria=[
        SuccessCriterion(
            id="source-diversity",
            description="Use multiple diverse, authoritative sources",
            metric="source_count",
            target=">=5",
            weight=0.25,
        ),
        SuccessCriterion(
            id="citation-coverage",
            description="Every factual claim in the report cites its source",
            metric="citation_coverage",
            target="100%",
            weight=0.25,
        ),
        SuccessCriterion(
            id="user-satisfaction",
            description="User reviews findings before report generation",
            metric="user_approval",
            target="true",
            weight=0.25,
        ),
        SuccessCriterion(
            id="report-completeness",
            description="Final report answers the original research questions",
            metric="question_coverage",
            target="90%",
            weight=0.25,
        ),
    ],
    constraints=[
        Constraint(
            id="no-hallucination",
            description="Only include information found in fetched sources",
            constraint_type="quality",
            category="accuracy",
        ),
        Constraint(
            id="source-attribution",
            description="Every claim must cite its source with a numbered reference",
            constraint_type="quality",
            category="accuracy",
        ),
        Constraint(
            id="user-checkpoint",
            description="Present findings to the user before writing the final report",
            constraint_type="functional",
            category="interaction",
        ),
    ],
)

# Node list
nodes = [
    intake_node,
    research_node,
    review_node,
    report_node,
]

# Edge definitions
edges = [
    # intake -> research
    EdgeSpec(
        id="intake-to-research",
        source="intake",
        target="research",
        condition=EdgeCondition.ON_SUCCESS,
        priority=1,
    ),
    # research -> review
    EdgeSpec(
        id="research-to-review",
        source="research",
        target="review",
        condition=EdgeCondition.ON_SUCCESS,
        priority=1,
    ),
    # review -> research (feedback loop)
    EdgeSpec(
        id="review-to-research-feedback",
        source="review",
        target="research",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="needs_more_research == True",
        priority=1,
    ),
    # review -> report (user satisfied)
    EdgeSpec(
        id="review-to-report",
        source="review",
        target="report",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="needs_more_research == False",
        priority=2,
    ),
    # report -> research (user wants deeper research on current topic)
    EdgeSpec(
        id="report-to-research",
        source="report",
        target="research",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="str(next_action).lower() == 'more_research'",
        priority=2,
    ),
    # report -> intake (user wants a new topic — default when not more_research)
    EdgeSpec(
        id="report-to-intake",
        source="report",
        target="intake",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="str(next_action).lower() != 'more_research'",
        priority=1,
    ),
]

# Graph configuration
entry_node = "intake"
entry_points = {"start": "intake"}
pause_nodes = []
terminal_nodes = []


class DeepResearchAgent:
    """
    Deep Research Agent — 4-node pipeline with user checkpoints.

    Flow: intake -> research -> review -> report
                       ^                    |
                       +-- feedback loop (if user wants more)

    Uses AgentRuntime for proper session management:
    - Session-scoped storage (sessions/{session_id}/)
    - Checkpointing for resume capability
    - Runtime logging
    - Data folder for save_data/load_data
    """

    def __init__(self, config=None):
        self.config = config or default_config
        self.goal = goal
        self.nodes = nodes
        self.edges = edges
        self.entry_node = entry_node
        self.entry_points = entry_points
        self.pause_nodes = pause_nodes
        self.terminal_nodes = terminal_nodes
        self._graph: GraphSpec | None = None
        self._agent_runtime: AgentRuntime | None = None
        self._tool_registry: ToolRegistry | None = None
        self._storage_path: Path | None = None

    def _build_graph(self) -> GraphSpec:
        """Build the GraphSpec."""
        return GraphSpec(
            id="deep-research-agent-graph",
            goal_id=self.goal.id,
            version="1.0.0",
            entry_node=self.entry_node,
            entry_points=self.entry_points,
            terminal_nodes=self.terminal_nodes,
            pause_nodes=self.pause_nodes,
            nodes=self.nodes,
            edges=self.edges,
            default_model=self.config.model,
            max_tokens=self.config.max_tokens,
            loop_config={
                "max_iterations": 100,
                "max_tool_calls_per_turn": 30,
                "max_history_tokens": 32000,
            },
            conversation_mode="continuous",
            identity_prompt=(
                "You are a rigorous research agent. You search for information "
                "from diverse, authoritative sources, analyze findings critically, "
                "and produce well-cited reports. You never fabricate information — "
                "every claim must trace back to a source you actually retrieved."
            ),
        )

    def _setup(self, mock_mode=False) -> None:
        """Set up the agent runtime with sessions, checkpoints, and logging."""
        self._storage_path = Path.home() / ".hive" / "agents" / "deep_research_agent"
        self._storage_path.mkdir(parents=True, exist_ok=True)

        self._tool_registry = ToolRegistry()

        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
        if mcp_config_path.exists():
            self._tool_registry.load_mcp_config(mcp_config_path)

        llm = None
        if not mock_mode:
            llm = LiteLLMProvider(
                model=self.config.model,
                api_key=self.config.api_key,
                api_base=self.config.api_base,
            )

        tool_executor = self._tool_registry.get_executor()
        tools = list(self._tool_registry.get_tools().values())

        self._graph = self._build_graph()

        checkpoint_config = CheckpointConfig(
            enabled=True,
            checkpoint_on_node_start=False,
            checkpoint_on_node_complete=True,
            checkpoint_max_age_days=7,
            async_checkpoint=True,
        )

        entry_point_specs = [
            EntryPointSpec(
                id="default",
                name="Default",
                entry_node=self.entry_node,
                trigger_type="manual",
                isolation_level="shared",
            )
        ]

        self._agent_runtime = create_agent_runtime(
            graph=self._graph,
            goal=self.goal,
            storage_path=self._storage_path,
            entry_points=entry_point_specs,
            llm=llm,
            tools=tools,
            tool_executor=tool_executor,
            checkpoint_config=checkpoint_config,
        )

    async def start(self, mock_mode=False) -> None:
        """Set up and start the agent runtime."""
        if self._agent_runtime is None:
            self._setup(mock_mode=mock_mode)
        if not self._agent_runtime.is_running:
            await self._agent_runtime.start()

    async def stop(self) -> None:
        """Stop the agent runtime and clean up."""
        if self._agent_runtime and self._agent_runtime.is_running:
            await self._agent_runtime.stop()
        self._agent_runtime = None

    async def trigger_and_wait(
        self,
        entry_point: str = "default",
        input_data: dict | None = None,
        timeout: float | None = None,
        session_state: dict | None = None,
    ) -> ExecutionResult | None:
        """Execute the graph and wait for completion."""
        if self._agent_runtime is None:
            raise RuntimeError("Agent not started. Call start() first.")

        return await self._agent_runtime.trigger_and_wait(
            entry_point_id=entry_point,
            input_data=input_data or {},
            session_state=session_state,
        )

    async def run(
        self, context: dict, mock_mode=False, session_state=None
    ) -> ExecutionResult:
        """Run the agent (convenience method for single execution)."""
        await self.start(mock_mode=mock_mode)
        try:
            result = await self.trigger_and_wait(
                "default", context, session_state=session_state
            )
            return result or ExecutionResult(success=False, error="Execution timeout")
        finally:
            await self.stop()

    def info(self):
        """Get agent information."""
        return {
            "name": metadata.name,
            "version": metadata.version,
            "description": metadata.description,
            "goal": {
                "name": self.goal.name,
                "description": self.goal.description,
            },
            "nodes": [n.id for n in self.nodes],
            "edges": [e.id for e in self.edges],
            "entry_node": self.entry_node,
            "entry_points": self.entry_points,
            "pause_nodes": self.pause_nodes,
            "terminal_nodes": self.terminal_nodes,
            "client_facing_nodes": [n.id for n in self.nodes if n.client_facing],
        }

    def validate(self):
        """Validate agent structure."""
        errors = []
        warnings = []

        node_ids = {node.id for node in self.nodes}
        for edge in self.edges:
            if edge.source not in node_ids:
                errors.append(f"Edge {edge.id}: source '{edge.source}' not found")
            if edge.target not in node_ids:
                errors.append(f"Edge {edge.id}: target '{edge.target}' not found")

        if self.entry_node not in node_ids:
            errors.append(f"Entry node '{self.entry_node}' not found")

        for terminal in self.terminal_nodes:
            if terminal not in node_ids:
                errors.append(f"Terminal node '{terminal}' not found")

        for ep_id, node_id in self.entry_points.items():
            if node_id not in node_ids:
                errors.append(
                    f"Entry point '{ep_id}' references unknown node '{node_id}'"
                )

        return {
            "valid": len(errors) == 0,
            "errors": errors,
            "warnings": warnings,
        }


# Create default instance
default_agent = DeepResearchAgent()
@@ -1,26 +0,0 @@
"""Runtime configuration."""

from dataclasses import dataclass

from framework.config import RuntimeConfig

default_config = RuntimeConfig()


@dataclass
class AgentMetadata:
    name: str = "Deep Research Agent"
    version: str = "1.0.0"
    description: str = (
        "Interactive research agent that rigorously investigates topics through "
        "multi-source search, quality evaluation, and synthesis - with TUI conversation "
        "at key checkpoints for user guidance and feedback."
    )
    intro_message: str = (
        "Hi! I'm your deep research assistant. Tell me a topic and I'll investigate it "
        "thoroughly — searching multiple sources, evaluating quality, and synthesizing "
        "a comprehensive report. What would you like me to research?"
    )


metadata = AgentMetadata()
@@ -1,9 +0,0 @@
{
  "hive-tools": {
    "transport": "stdio",
    "command": "uv",
    "args": ["run", "python", "mcp_server.py", "--stdio"],
    "cwd": "../../tools",
    "description": "Hive tools MCP server providing web_search, web_scrape, and write_to_file"
  }
}
@@ -1,213 +0,0 @@
"""Node definitions for Deep Research Agent."""

from framework.graph import NodeSpec

# Node 1: Intake (client-facing)
# Brief conversation to clarify what the user wants researched.
intake_node = NodeSpec(
    id="intake",
    name="Research Intake",
    description="Discuss the research topic with the user, clarify scope, and confirm direction",
    node_type="event_loop",
    client_facing=True,
    max_node_visits=0,
    input_keys=["topic"],
    output_keys=["research_brief"],
    success_criteria=(
        "The research brief is specific and actionable: it states the topic, "
        "the key questions to answer, the desired scope, and depth."
    ),
    system_prompt="""\
You are a research intake specialist. The user wants to research a topic.
Have a brief conversation to clarify what they need.

**STEP 1 — Read and respond (text only, NO tool calls):**
1. Read the topic provided
2. If it's vague, ask 1-2 clarifying questions (scope, angle, depth)
3. If it's already clear, confirm your understanding and ask the user to confirm

Keep it short. Don't over-ask.

**STEP 2 — After the user confirms, call set_output:**
- set_output("research_brief", "A clear paragraph describing exactly what to research, \
what questions to answer, what scope to cover, and how deep to go.")
""",
    tools=[],
)

# Node 2: Research
# The workhorse — searches the web, fetches content, analyzes sources.
# One node with both tools avoids the context-passing overhead of 5 separate nodes.
research_node = NodeSpec(
    id="research",
    name="Research",
    description="Search the web, fetch source content, and compile findings",
    node_type="event_loop",
    max_node_visits=0,
    input_keys=["research_brief", "feedback"],
    output_keys=["findings", "sources", "gaps"],
    nullable_output_keys=["feedback"],
    success_criteria=(
        "Findings reference at least 3 distinct sources with URLs. "
        "Key claims are substantiated by fetched content, not generated."
    ),
    system_prompt="""\
You are a research agent. Given a research brief, find and analyze sources.

If feedback is provided, this is a follow-up round — focus on the gaps identified.

Work in phases:
1. **Search**: Use web_search with 3-5 diverse queries covering different angles.
   Prioritize authoritative sources (.edu, .gov, established publications).
2. **Fetch**: Use web_scrape on the most promising URLs (aim for 5-8 sources).
   Skip URLs that fail. Extract the substantive content.
3. **Analyze**: Review what you've collected. Identify key findings, themes,
   and any contradictions between sources.

Important:
- Work in batches of 3-4 tool calls at a time — never more than 10 per turn
- After each batch, assess whether you have enough material
- Prefer quality over quantity — 5 good sources beat 15 thin ones
- Track which URL each finding comes from (you'll need citations later)
- Call set_output for each key in a SEPARATE turn (not in the same turn as other tool calls)

Context management:
- Your tool results are automatically saved to files. After compaction, the file \
references remain in the conversation — use load_data() to recover any content you need.
- Use append_data('research_notes.md', ...) to maintain a running log of key findings \
as you go. This survives compaction and helps the report node produce a detailed report.

When done, use set_output (one key at a time, separate turns):
- set_output("findings", "Structured summary: key findings with source URLs for each claim. \
Include themes, contradictions, and confidence levels.")
- set_output("sources", [{"url": "...", "title": "...", "summary": "..."}])
- set_output("gaps", "What aspects of the research brief are NOT well-covered yet, if any.")
""",
    tools=[
        "web_search",
        "web_scrape",
        "load_data",
        "save_data",
        "append_data",
        "list_data_files",
    ],
)

# Node 3: Review (client-facing)
# Shows the user what was found and asks whether to dig deeper or proceed.
review_node = NodeSpec(
    id="review",
    name="Review Findings",
    description="Present findings to user and decide whether to research more or write the report",
    node_type="event_loop",
    client_facing=True,
    max_node_visits=0,
    input_keys=["findings", "sources", "gaps", "research_brief"],
    output_keys=["needs_more_research", "feedback"],
    success_criteria=(
        "The user has been presented with findings and has explicitly indicated "
        "whether they want more research or are ready for the report."
    ),
    system_prompt="""\
Present the research findings to the user clearly and concisely.

**STEP 1 — Present (your first message, text only, NO tool calls):**
1. **Summary** (2-3 sentences of what was found)
2. **Key Findings** (bulleted, with confidence levels)
3. **Sources Used** (count and quality assessment)
4. **Gaps** (what's still unclear or under-covered)

End by asking: Are they satisfied, or do they want deeper research? \
Should we proceed to writing the final report?

**STEP 2 — After the user responds, call set_output:**
- set_output("needs_more_research", "true") — if they want more
- set_output("needs_more_research", "false") — if they're satisfied
- set_output("feedback", "What the user wants explored further, or empty string")
""",
    tools=[],
)

# Node 4: Report (client-facing)
# Writes an HTML report, serves the link to the user, and answers follow-ups.
report_node = NodeSpec(
    id="report",
    name="Write & Deliver Report",
    description="Write a cited HTML report from the findings and present it to the user",
    node_type="event_loop",
    client_facing=True,
    max_node_visits=0,
    input_keys=["findings", "sources", "research_brief"],
    output_keys=["delivery_status", "next_action"],
    success_criteria=(
        "An HTML report has been saved, the file link has been presented to the user, "
        "and the user has indicated what they want to do next."
    ),
    system_prompt="""\
Write a research report as an HTML file and present it to the user.

IMPORTANT: save_data requires TWO separate arguments: filename and data.
Call it like: save_data(filename="report.html", data="<html>...</html>")
Do NOT use _raw, do NOT nest arguments inside a JSON string.

**STEP 1 — Write and save the HTML report (tool calls, NO text to user yet):**

Build a clean HTML document. Keep the HTML concise — aim for clarity over length.
Use minimal embedded CSS (a few lines of style, not a full framework).

Report structure:
- Title & date
- Executive Summary (2-3 paragraphs)
- Key Findings (organized by theme, with [n] citation links)
- Analysis (synthesis, implications)
- Conclusion (key takeaways)
- References (numbered list with clickable URLs)

Requirements:
- Every factual claim must cite its source with [n] notation
- Be objective — present multiple viewpoints where sources disagree
- Answer the original research questions from the brief
- If findings appear incomplete or summarized, call list_data_files() and load_data() \
to access the detailed source material from the research phase. The research node's \
tool results and research_notes.md contain the full data.

Save the HTML:
save_data(filename="report.html", data="<html>...</html>")

Then get the clickable link:
serve_file_to_user(filename="report.html", label="Research Report")

If save_data fails, simplify and shorten the HTML, then retry.

**STEP 2 — Present the link to the user (text only, NO tool calls):**

Tell the user the report is ready and include the file:// URI from
serve_file_to_user so they can click it to open. Give a brief summary
of what the report covers. Ask if they have questions or want to continue.

**STEP 3 — After the user responds:**
- Answer any follow-up questions from the research material
- When the user is ready to move on, ask what they'd like to do next:
  - Research a new topic?
  - Dig deeper into the current topic?
- Then call set_output:
  - set_output("delivery_status", "completed")
  - set_output("next_action", "new_topic") — if they want a new topic
  - set_output("next_action", "more_research") — if they want deeper research
""",
    tools=[
        "save_data",
        "append_data",
        "edit_data",
        "serve_file_to_user",
        "load_data",
        "list_data_files",
    ],
)

__all__ = [
    "intake_node",
    "research_node",
    "review_node",
    "report_node",
]
@@ -1,640 +0,0 @@
---
name: hive-credentials
description: Set up and install credentials for an agent. Detects missing credentials from agent config, collects them from the user, and stores them securely in the local encrypted store at ~/.hive/credentials.
license: Apache-2.0
metadata:
  author: hive
  version: "2.3"
  type: utility
---

# Setup Credentials

Interactive credential setup for agents with multiple authentication options. Detects what's missing, offers auth method choices, validates with health checks, and stores credentials securely.

## When to Use

- Before running or testing an agent for the first time
- When `AgentRunner.run()` fails with "missing required credentials"
- When a user asks to configure credentials for an agent
- After building a new agent that uses tools requiring API keys

## Workflow

### Step 1: Identify the Agent

Determine which agent needs credentials. The user will either:

- Name the agent directly (e.g., "set up credentials for hubspot-agent")
- Have an agent directory open (check `exports/` for agent dirs)
- Be working on an agent in the current session

Locate the agent's directory under `exports/{agent_name}/`.

### Step 2: Detect Missing Credentials

Use the `check_missing_credentials` MCP tool to detect what the agent needs and what's already configured. This tool loads the agent, inspects its required tools and node types, maps them to credentials via `CREDENTIAL_SPECS`, and checks both the encrypted store and environment variables.

```
check_missing_credentials(agent_path="exports/{agent_name}")
```

The tool returns a JSON response:

```json
{
  "agent": "exports/{agent_name}",
  "missing": [
    {
      "credential_name": "brave_search",
      "env_var": "BRAVE_SEARCH_API_KEY",
      "description": "Brave Search API key for web search",
      "help_url": "https://brave.com/search/api/",
      "tools": ["web_search"]
    }
  ],
  "available": [
    {
      "credential_name": "anthropic",
      "env_var": "ANTHROPIC_API_KEY",
      "source": "encrypted_store"
    }
  ],
  "total_missing": 1,
  "ready": false
}
```

**If `ready` is true (nothing missing):** Report all credentials as configured and skip Steps 3-5. Example:

```
All required credentials are already configured:
✓ anthropic (ANTHROPIC_API_KEY)
✓ brave_search (BRAVE_SEARCH_API_KEY)
Your agent is ready to run!
```

**If credentials are missing:** Continue to Step 3 with the `missing` list.
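
The skip/continue decision can be scripted directly against this JSON. A minimal sketch, assuming the tool's raw response has been captured in a string (`response_text` is a placeholder name, not a framework variable):

```python
import json

# response_text is hypothetical: the raw JSON returned by check_missing_credentials
response = json.loads(response_text)

if response["ready"]:
    # Nothing missing: report what is configured and skip Steps 3-5
    for cred in response["available"]:
        print(f"✓ {cred['credential_name']} ({cred['env_var']}, {cred['source']})")
else:
    # Continue to Step 3 with the missing list
    for cred in response["missing"]:
        print(f"✗ {cred['credential_name']}: get one at {cred['help_url']}")
```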

### Step 3: Present Auth Options for Each Missing Credential

For each missing credential, check what authentication methods are available:

```python
from aden_tools.credentials import CREDENTIAL_SPECS

spec = CREDENTIAL_SPECS.get("hubspot")
if spec:
    # Determine available auth options
    auth_options = []
    if spec.aden_supported:
        auth_options.append("aden")
    if spec.direct_api_key_supported:
        auth_options.append("direct")
    auth_options.append("custom")  # Always available

    # Get setup info
    setup_info = {
        "env_var": spec.env_var,
        "description": spec.description,
        "help_url": spec.help_url,
        "api_key_instructions": spec.api_key_instructions,
    }
```

Present the available options using AskUserQuestion:

```
Choose how to configure HUBSPOT_ACCESS_TOKEN:

1) Aden Platform (OAuth) (Recommended)
   Secure OAuth2 flow via hive.adenhq.com
   - Quick setup with automatic token refresh
   - No need to manage API keys manually

2) Direct API Key
   Enter your own API key manually
   - Requires creating a HubSpot Private App
   - Full control over scopes and permissions

3) Local Credential Setup (Advanced)
   Programmatic configuration for CI/CD
   - For automated deployments
   - Requires manual API calls
```

### Step 4: Execute Auth Flow Based on User Choice

#### Prerequisite: Ensure HIVE_CREDENTIAL_KEY Is Available

Before storing any credentials, verify `HIVE_CREDENTIAL_KEY` is set (needed to encrypt/decrypt the local store). Check both the current session and shell config:

```bash
# Check current session
printenv HIVE_CREDENTIAL_KEY > /dev/null 2>&1 && echo "session: set" || echo "session: not set"

# Check shell config files
for f in ~/.zshrc ~/.bashrc ~/.profile; do [ -f "$f" ] && grep -q 'HIVE_CREDENTIAL_KEY' "$f" && echo "$f"; done
```

- **In current session** — proceed to store credentials
- **In shell config but NOT in current session** — run `source ~/.zshrc` (or `~/.bashrc`) first, then proceed
- **Not set anywhere** — `EncryptedFileStorage` will auto-generate one. After storing, tell the user to persist it: `export HIVE_CREDENTIAL_KEY="{generated_key}"` in their shell profile

> **⚠️ IMPORTANT: After adding `HIVE_CREDENTIAL_KEY` to the user's shell config, always display:**
> ```
> ⚠️ Environment variables were added to your shell config.
> Open a NEW TERMINAL for them to take effect outside this session.
> ```

#### Option 1: Aden Platform (OAuth)

This is the recommended flow for supported integrations (HubSpot, etc.).

**How Aden OAuth Works:**

The ADEN_API_KEY represents a user who has already completed OAuth authorization on Aden's platform. When users sign up and connect integrations on Aden, those OAuth tokens are stored server-side. Having an ADEN_API_KEY means:

1. User has an Aden account
2. User has already authorized integrations (HubSpot, etc.) via OAuth on Aden
3. We just need to sync those credentials down to the local credential store

**4.1a. Check for ADEN_API_KEY**

```python
import os
aden_key = os.environ.get("ADEN_API_KEY")
```

If not set, guide user to get one from Aden (this is where they do OAuth):

```python
from aden_tools.credentials import open_browser, get_aden_setup_url

# Open browser to Aden - user will sign up and connect integrations there
url = get_aden_setup_url()  # https://hive.adenhq.com
success, msg = open_browser(url)

print("Please sign in to Aden and connect your integrations (HubSpot, etc.).")
print("Once done, copy your API key and return here.")
```

Ask user to provide the ADEN_API_KEY they received.

**4.1b. Save ADEN_API_KEY to Shell Config**

With user approval, persist ADEN_API_KEY to their shell config:

```python
from aden_tools.credentials import (
    detect_shell,
    add_env_var_to_shell_config,
    get_shell_source_command,
)

shell_type = detect_shell()  # 'bash', 'zsh', or 'unknown'

# Ask user for approval before modifying shell config
# If approved:
success, config_path = add_env_var_to_shell_config(
    "ADEN_API_KEY",
    user_provided_key,
    comment="Aden Platform (OAuth) API key"
)

if success:
    source_cmd = get_shell_source_command()
    print(f"Saved to {config_path}")
    print(f"Run: {source_cmd}")
```

> **⚠️ IMPORTANT: After adding `ADEN_API_KEY` to the user's shell config, always display:**
> ```
> ⚠️ Environment variables were added to your shell config.
> Open a NEW TERMINAL for them to take effect outside this session.
> ```

Also save to `~/.hive/configuration.json` for the framework:

```python
import json
from pathlib import Path

config_path = Path.home() / ".hive" / "configuration.json"
config = json.loads(config_path.read_text()) if config_path.exists() else {}

config["aden"] = {
    "api_key_configured": True,
    "api_url": "https://api.adenhq.com"
}

config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(json.dumps(config, indent=2))
```

**4.1c. Sync Credentials from Aden Server**

Since the user has already authorized integrations on Aden, use the one-liner factory method:

```python
from core.framework.credentials import CredentialStore

# This single call handles everything:
# - Creates encrypted local storage at ~/.hive/credentials
# - Configures Aden client from ADEN_API_KEY env var
# - Syncs all credentials from Aden server automatically
store = CredentialStore.with_aden_sync(
    base_url="https://api.adenhq.com",
    auto_sync=True,  # Syncs on creation
)

# Check what was synced
synced = store.list_credentials()
print(f"Synced credentials: {synced}")

# If the required credential wasn't synced, the user hasn't authorized it on Aden yet
if "hubspot" not in synced:
    print("HubSpot not found in your Aden account.")
    print("Please visit https://hive.adenhq.com to connect HubSpot, then try again.")
```

For more control over the sync process:

```python
from core.framework.credentials import CredentialStore
from core.framework.credentials.aden import (
    AdenCredentialClient,
    AdenClientConfig,
    AdenSyncProvider,
)

# Create client (API key loaded from ADEN_API_KEY env var)
client = AdenCredentialClient(AdenClientConfig(
    base_url="https://api.adenhq.com",
))

# Create provider and store
provider = AdenSyncProvider(client=client)
store = CredentialStore.with_encrypted_storage()

# Manual sync
synced_count = provider.sync_all(store)
print(f"Synced {synced_count} credentials from Aden")
```

**4.1d. Run Health Check**

```python
from aden_tools.credentials import check_credential_health

# Get the token from the store
cred = store.get_credential("hubspot")
token = cred.keys["access_token"].value.get_secret_value()

result = check_credential_health("hubspot", token)
if result.valid:
    print("HubSpot credentials validated successfully!")
else:
    print(f"Validation failed: {result.message}")
    # Offer to retry the OAuth flow
```

#### Option 2: Direct API Key

For users who prefer manual API key management.

**4.2a. Show Setup Instructions**

```python
from aden_tools.credentials import CREDENTIAL_SPECS

spec = CREDENTIAL_SPECS.get("hubspot")
if spec and spec.api_key_instructions:
    print(spec.api_key_instructions)
    # Output:
    # To get a HubSpot Private App token:
    # 1. Go to HubSpot Settings > Integrations > Private Apps
    # 2. Click "Create a private app"
    # 3. Name your app (e.g., "Hive Agent")
    # ...

if spec and spec.help_url:
    print(f"More info: {spec.help_url}")
```

**4.2b. Collect API Key from User**

Use AskUserQuestion to securely collect the API key:

```
Please provide your HubSpot access token:
(This will be stored securely in ~/.hive/credentials)
```

**4.2c. Run Health Check Before Storing**

```python
from aden_tools.credentials import check_credential_health

result = check_credential_health("hubspot", user_provided_token)
if not result.valid:
    print(f"Warning: {result.message}")
    # Ask user if they want to:
    # 1. Try a different token
    # 2. Continue anyway (not recommended)
```

**4.2d. Store in Local Encrypted Store**

```python
from core.framework.credentials import CredentialStore, CredentialObject, CredentialKey
from pydantic import SecretStr

store = CredentialStore.with_encrypted_storage()

cred = CredentialObject(
    id="hubspot",
    name="HubSpot Access Token",
    keys={
        "access_token": CredentialKey(
            name="access_token",
            value=SecretStr(user_provided_token),
        )
    },
)
store.save_credential(cred)
```

**4.2e. Export to Current Session**

```bash
export HUBSPOT_ACCESS_TOKEN="the-value"
```

#### Option 3: Local Credential Setup (Advanced)

For programmatic/CI/CD setups.

**4.3a. Show Documentation**

```
For advanced credential management, you can use the CredentialStore API directly:

from core.framework.credentials import CredentialStore, CredentialObject, CredentialKey
from pydantic import SecretStr

store = CredentialStore.with_encrypted_storage()

cred = CredentialObject(
    id="hubspot",
    name="HubSpot Access Token",
    keys={"access_token": CredentialKey(name="access_token", value=SecretStr("..."))}
)
store.save_credential(cred)

For CI/CD environments:
- Set HIVE_CREDENTIAL_KEY for encryption
- Pre-populate ~/.hive/credentials programmatically
- Or use environment variables directly (HUBSPOT_ACCESS_TOKEN)

Documentation: See core/framework/credentials/README.md
```

### Step 5: Record Configuration Method

Track which auth method was used for each credential in `~/.hive/configuration.json`:

```python
import json
from pathlib import Path
from datetime import datetime

config_path = Path.home() / ".hive" / "configuration.json"
config = json.loads(config_path.read_text()) if config_path.exists() else {}

if "credential_methods" not in config:
    config["credential_methods"] = {}

config["credential_methods"]["hubspot"] = {
    "method": "aden",  # or "direct" or "custom"
    "configured_at": datetime.now().isoformat(),
}

config_path.write_text(json.dumps(config, indent=2))
```

### Step 6: Verify All Credentials

Use the `verify_credentials` MCP tool to confirm everything is properly configured:

```
verify_credentials(agent_path="exports/{agent_name}")
```

The tool returns:

```json
{
  "agent": "exports/{agent_name}",
  "ready": true,
  "missing_credentials": [],
  "warnings": [],
  "errors": []
}
```

If `ready` is true, report success. If `missing_credentials` is non-empty, identify what failed and loop back to Step 3 for the remaining credentials.

## Health Check Reference

Health checks validate credentials by making lightweight API calls:

| Credential      | Endpoint                                 | What It Checks                    |
| --------------- | ---------------------------------------- | --------------------------------- |
| `anthropic`     | `POST /v1/messages`                      | API key validity                  |
| `brave_search`  | `GET /res/v1/web/search?q=test&count=1`  | API key validity                  |
| `google_search` | `GET /customsearch/v1?q=test&num=1`      | API key + CSE ID validity         |
| `github`        | `GET /user`                              | Token validity, user identity     |
| `hubspot`       | `GET /crm/v3/objects/contacts?limit=1`   | Bearer token validity, CRM scopes |
| `resend`        | `GET /domains`                           | API key validity                  |

```python
from aden_tools.credentials import check_credential_health, HealthCheckResult

result: HealthCheckResult = check_credential_health("hubspot", token_value)
# result.valid: bool
# result.message: str
# result.details: dict (status_code, rate_limited, etc.)
```
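
For an ad-hoc check outside Python, the same endpoints can be hit directly. A sketch against the GitHub row of the table, assuming the token is in `$GITHUB_TOKEN` (an illustrative variable, not one the framework sets):

```bash
# Prints 200 for a valid token, 401 for an invalid one
curl -s -o /dev/null -w "%{http_code}\n" \
  -H "Authorization: Bearer $GITHUB_TOKEN" \
  https://api.github.com/user
```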

## Encryption Key (HIVE_CREDENTIAL_KEY)

The local encrypted store requires `HIVE_CREDENTIAL_KEY` to encrypt/decrypt credentials.

- If the user doesn't have one, `EncryptedFileStorage` will auto-generate one and log it
- The user MUST persist this key (e.g., in `~/.bashrc`/`~/.zshrc` or a secrets manager)
- Without this key, stored credentials cannot be decrypted

**Shell config rule:** Only TWO keys belong in shell config (`~/.zshrc`/`~/.bashrc`):
- `HIVE_CREDENTIAL_KEY` — encryption key for the credential store
- `ADEN_API_KEY` — Aden platform auth key (needed before the store can sync)

All other API keys (Brave, Google, HubSpot, etc.) must go in the encrypted store only. **Never offer to add them to shell config.**

If `HIVE_CREDENTIAL_KEY` is not set:

1. Let the store generate one
2. Tell the user to save it: `export HIVE_CREDENTIAL_KEY="{generated_key}"`
3. Recommend adding it to `~/.bashrc` or their shell profile (see the sketch below)
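
A minimal sketch of that persistence step, assuming the auto-generated key has been captured in an illustrative `$GENERATED_KEY` variable (confirm with the user before touching their shell config):

```bash
# Persist the encryption key so future sessions can decrypt the store
echo "export HIVE_CREDENTIAL_KEY=\"$GENERATED_KEY\"" >> ~/.zshrc
source ~/.zshrc  # current session only; new terminals pick it up automatically
```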

## Security Rules

- **NEVER** log, print, or echo credential values in tool output
- **NEVER** store credentials in plaintext files, git-tracked files, or agent configs
- **NEVER** hardcode credentials in source code
- **NEVER** offer to save API keys to shell config (`~/.zshrc`/`~/.bashrc`) — the **only** keys that belong in shell config are `HIVE_CREDENTIAL_KEY` and `ADEN_API_KEY`. All other credentials (Brave, Google, HubSpot, GitHub, Resend, etc.) go in the encrypted store only.
- **ALWAYS** use `SecretStr` from Pydantic when handling credential values in Python
- **ALWAYS** use the local encrypted store (`~/.hive/credentials`) for persistence
- **ALWAYS** run health checks before storing credentials (when possible)
- **ALWAYS** verify credentials were stored by re-running validation, not by reading them back
- When modifying `~/.bashrc` or `~/.zshrc`, confirm with the user first

## Credential Sources Reference

All credential specs are defined in `tools/src/aden_tools/credentials/`:

| File              | Category      | Credentials                                   | Aden Supported |
| ----------------- | ------------- | --------------------------------------------- | -------------- |
| `llm.py`          | LLM Providers | `anthropic`                                   | No             |
| `search.py`       | Search Tools  | `brave_search`, `google_search`, `google_cse` | No             |
| `email.py`        | Email         | `resend`                                      | No             |
| `integrations.py` | Integrations  | `github`, `hubspot`, `google_calendar_oauth`  | No / Yes       |

**Note:** Additional LLM providers (Cerebras, Groq, OpenAI) are handled by LiteLLM via environment
variables (`CEREBRAS_API_KEY`, `GROQ_API_KEY`, `OPENAI_API_KEY`) but are not yet in CREDENTIAL_SPECS.
Add them to `llm.py` as needed.

To check what's registered:

```python
from aden_tools.credentials import CREDENTIAL_SPECS
for name, spec in CREDENTIAL_SPECS.items():
    print(f"{name}: aden={spec.aden_supported}, direct={spec.direct_api_key_supported}")
```

## Migration: CredentialManager → CredentialStore

**CredentialManager is deprecated.** Use CredentialStore instead.

| Old (Deprecated)                           | New (Recommended)                                                     |
| ------------------------------------------ | --------------------------------------------------------------------- |
| `CredentialManager()`                      | `CredentialStore.with_encrypted_storage()`                            |
| `creds.get("hubspot")`                     | `store.get("hubspot")` or `store.get_key("hubspot", "access_token")`  |
| `creds.validate_for_tools(tools)`          | Use `store.is_available(cred_id)` per credential                      |
| `creds.get_auth_options("hubspot")`        | Check `CREDENTIAL_SPECS["hubspot"].aden_supported`                    |
| `creds.get_setup_instructions("hubspot")`  | Access `CREDENTIAL_SPECS["hubspot"]` directly                         |

**Why migrate?**

- **CredentialStore** supports encrypted storage, multi-key credentials, template resolution, and automatic token refresh
- **CredentialManager** only reads from environment variables and .env files (no encryption, no refresh)
- **CredentialStoreAdapter** exists for backward compatibility during migration

```python
# Old way (deprecated)
from aden_tools.credentials import CredentialManager
creds = CredentialManager()
token = creds.get("hubspot")

# New way (recommended)
from core.framework.credentials import CredentialStore
store = CredentialStore.with_encrypted_storage()
token = store.get("hubspot")

# With Aden sync (recommended for OAuth integrations)
store = CredentialStore.with_aden_sync()
token = store.get_key("hubspot", "access_token")
```

## Example Session

```
User: /hive-credentials for my research-agent

Agent: Let me check what credentials your research-agent needs.

[Calls check_missing_credentials(agent_path="exports/research-agent")]
→ Returns:
    available: anthropic (encrypted_store), brave_search (encrypted_store)
    missing: google_search (GOOGLE_API_KEY), google_cse (GOOGLE_CSE_ID)
    ready: false

Agent: 2 of 4 required credentials are already configured. Only Google Custom
Search needs setup (2 values).

--- Setting up Google Custom Search (google_search + google_cse) ---

This requires two values that work together.

[Checks HIVE_CREDENTIAL_KEY before storing]
$ printenv HIVE_CREDENTIAL_KEY > /dev/null 2>&1 && echo "set" || echo "not set"
set

First, the Google API Key:
1. Go to https://console.cloud.google.com/apis/credentials
2. Create a new project (or select an existing one)
3. Enable the "Custom Search API" from the API Library
4. Go to Credentials > Create Credentials > API Key
5. Copy the generated API key

[AskUserQuestion: "Please provide your Google API key:"]
[User provides key]

Now, the Custom Search Engine ID:
1. Go to https://programmablesearchengine.google.com/controlpanel/all
2. Click "Add" to create a new search engine
3. Under "What to search", select "Search the entire web"
4. Give your search engine a name
5. Click "Create"
6. Copy the Search Engine ID (cx value)

[AskUserQuestion: "Please provide your Google CSE ID:"]
[User provides ID]

[Runs health check with both values - GET /customsearch/v1?q=test&num=1 → 200 OK]
[Stores both in local encrypted store, exports to env]

✓ Google Custom Search credentials valid

[Calls verify_credentials(agent_path="exports/research-agent")]
→ Returns: ready: true, missing_credentials: []

All credentials are now configured:
✓ anthropic (ANTHROPIC_API_KEY) — already in encrypted store
✓ brave_search (BRAVE_SEARCH_API_KEY) — already in encrypted store
✓ google_search (GOOGLE_API_KEY) — stored in encrypted store
✓ google_cse (GOOGLE_CSE_ID) — stored in encrypted store

┌──────────────────────────────────────────────────────────────────────────┐
│ ✅ CREDENTIALS CONFIGURED                                                 │
├──────────────────────────────────────────────────────────────────────────┤
│                                                                          │
│ OPEN A NEW TERMINAL before running commands below.                       │
│ Environment variables were saved to your shell config but                │
│ only take effect in new terminal sessions.                               │
│                                                                          │
│ NEXT STEPS:                                                              │
│                                                                          │
│ 1. RUN YOUR AGENT:                                                       │
│                                                                          │
│      hive tui                                                            │
│                                                                          │
│ 2. IF YOU ENCOUNTER ISSUES, USE THE DEBUGGER:                            │
│                                                                          │
│      /hive-debugger                                                      │
│                                                                          │
│ The debugger analyzes runtime logs, identifies retry loops, tool         │
│ failures, stalled execution, and provides actionable fix suggestions.    │
└──────────────────────────────────────────────────────────────────────────┘
```
File diff suppressed because it is too large
@@ -1,385 +0,0 @@
---
name: hive-patterns
description: Best practices, patterns, and examples for building goal-driven agents. Includes client-facing interaction, feedback edges, judge patterns, fan-out/fan-in, context management, and anti-patterns.
license: Apache-2.0
metadata:
  author: hive
  version: "2.0"
  type: reference
  part_of: hive
---

# Building Agents - Patterns & Best Practices

Design patterns, examples, and best practices for building robust goal-driven agents.

**Prerequisites:** Complete agent structure using `hive-create`.

## Practical Example: Hybrid Workflow

How to build a node using both direct file writes and optional MCP validation:

```python
# 1. WRITE TO FILE FIRST (Primary - makes it visible)
node_code = '''
search_node = NodeSpec(
    id="search-web",
    node_type="event_loop",
    input_keys=["query"],
    output_keys=["search_results"],
    system_prompt="Search the web for: {query}. Use web_search, then call set_output to store results.",
    tools=["web_search"],
)
'''

Edit(
    file_path="exports/research_agent/nodes/__init__.py",
    old_string="# Nodes will be added here",
    new_string=node_code
)

# 2. OPTIONALLY VALIDATE WITH MCP (Secondary - bookkeeping)
validation = mcp__agent-builder__test_node(
    node_id="search-web",
    test_input='{"query": "python tutorials"}',
    mock_llm_response='{"search_results": [...mock results...]}'
)
```

**User experience:**

- Immediately sees node in their editor (from step 1)
- Gets validation feedback (from step 2)
- Can edit the file directly if needed

## Multi-Turn Interaction Patterns

For agents needing multi-turn conversations with users, use `client_facing=True` on event_loop nodes.

### Client-Facing Nodes

A client-facing node streams LLM output to the user and blocks for user input between conversational turns. This replaces the old pause/resume pattern.

```python
# Client-facing node with STEP 1/STEP 2 prompt pattern
intake_node = NodeSpec(
    id="intake",
    name="Intake",
    description="Gather requirements from the user",
    node_type="event_loop",
    client_facing=True,
    input_keys=["topic"],
    output_keys=["research_brief"],
    system_prompt="""\
You are an intake specialist.

**STEP 1 — Read and respond (text only, NO tool calls):**
1. Read the topic provided
2. If it's vague, ask 1-2 clarifying questions
3. If it's clear, confirm your understanding

**STEP 2 — After the user confirms, call set_output:**
- set_output("research_brief", "Clear description of what to research")
""",
)

# Internal node runs without user interaction
research_node = NodeSpec(
    id="research",
    name="Research",
    description="Search and analyze sources",
    node_type="event_loop",
    input_keys=["research_brief"],
    output_keys=["findings", "sources"],
    system_prompt="Research the topic using web_search and web_scrape...",
    tools=["web_search", "web_scrape", "load_data", "save_data"],
)
```

**How it works:**

- Client-facing nodes stream LLM text to the user and block for input after each response
- User input is injected via `node.inject_event(text)`
- When the LLM calls `set_output` to produce structured outputs, the judge evaluates and ACCEPTs
- Internal nodes (non-client-facing) run their entire loop without blocking
- `set_output` is a synthetic tool — a turn with only `set_output` calls (no real tools) triggers user input blocking

**STEP 1/STEP 2 pattern:** Always structure client-facing prompts with explicit phases. STEP 1 is text-only conversation. STEP 2 calls `set_output` after user confirmation. This prevents the LLM from calling `set_output` prematurely before the user responds.

### When to Use client_facing

| Scenario                            | client_facing | Why                    |
| ----------------------------------- | :-----------: | ---------------------- |
| Gathering user requirements         | Yes           | Need user input        |
| Human review/approval checkpoint    | Yes           | Need human decision    |
| Data processing (scanning, scoring) | No            | Runs autonomously      |
| Report generation                   | No            | No user input needed   |
| Final confirmation before action    | Yes           | Need explicit approval |

> **Legacy Note:** The `pause_nodes` / `entry_points` pattern still works for backward compatibility but `client_facing=True` is preferred for new agents.

## Edge-Based Routing and Feedback Loops

### Conditional Edge Routing

Multiple conditional edges from the same source replace the old `router` node type. Each edge checks a condition on the node's output.

```python
# Node with mutually exclusive outputs
review_node = NodeSpec(
    id="review",
    name="Review",
    node_type="event_loop",
    client_facing=True,
    output_keys=["approved_contacts", "redo_extraction"],
    nullable_output_keys=["approved_contacts", "redo_extraction"],
    max_node_visits=3,
    system_prompt="Present the contact list to the operator. If they approve, call set_output('approved_contacts', ...). If they want changes, call set_output('redo_extraction', 'true').",
)

# Forward edge (positive priority, evaluated first)
EdgeSpec(
    id="review-to-campaign",
    source="review",
    target="campaign-builder",
    condition=EdgeCondition.CONDITIONAL,
    condition_expr="output.get('approved_contacts') is not None",
    priority=1,
)

# Feedback edge (negative priority, evaluated after forward edges)
EdgeSpec(
    id="review-feedback",
    source="review",
    target="extractor",
    condition=EdgeCondition.CONDITIONAL,
    condition_expr="output.get('redo_extraction') is not None",
    priority=-1,
)
```

**Key concepts:**

- `nullable_output_keys`: Lists output keys that may remain unset. The node sets exactly one of the mutually exclusive keys per execution.
- `max_node_visits`: Must be >1 on the feedback target (extractor) so it can re-execute. Default is 1.
- `priority`: Positive = forward edge (evaluated first). Negative = feedback edge. The executor tries forward edges first; if none match, falls back to feedback edges (illustrated below).
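
To make the semantics concrete, here is a small illustration of how a `condition_expr` evaluates against a node's output dict. The `eval` call is purely illustrative; how the framework actually evaluates these expressions is not shown in this document:

```python
# A review-node output where the operator asked for changes
output = {"approved_contacts": None, "redo_extraction": "true"}

forward = eval("output.get('approved_contacts') is not None", {"output": output})
feedback = eval("output.get('redo_extraction') is not None", {"output": output})

print(forward, feedback)  # False True -> the priority=-1 feedback edge fires
```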

### Routing Decision Table

| Pattern                | Old Approach            | New Approach                                  |
| ---------------------- | ----------------------- | --------------------------------------------- |
| Conditional branching  | `router` node           | Conditional edges with `condition_expr`       |
| Binary approve/reject  | `pause_nodes` + resume  | `client_facing=True` + `nullable_output_keys` |
| Loop-back on rejection | Manual entry_points     | Feedback edge with `priority=-1`              |
| Multi-way routing      | Router with routes dict | Multiple conditional edges with priorities    |

## Judge Patterns

**Core Principle: The judge is the SOLE mechanism for acceptance decisions.** Never add ad-hoc framework gating to compensate for LLM behavior. If the LLM calls `set_output` prematurely, fix the system prompt or use a custom judge. Anti-patterns to avoid:

- Output rollback logic
- `_user_has_responded` flags
- Premature set_output rejection
- Interaction protocol injection into system prompts

Judges control when an event_loop node's loop exits. Choose based on validation needs.

### Implicit Judge (Default)

When no judge is configured, the implicit judge ACCEPTs when:

- The LLM finishes its response with no tool calls
- All required output keys have been set via `set_output`

Best for simple nodes where "all outputs set" is sufficient validation.

### SchemaJudge

Validates outputs against a Pydantic model. Use when you need structural validation.

```python
from pydantic import BaseModel, ValidationError

# JudgeVerdict is the framework's verdict type (import it from the framework in real code)

class ScannerOutput(BaseModel):
    github_users: list[dict]  # Must be a list of user objects

class SchemaJudge:
    def __init__(self, output_model: type[BaseModel]):
        self._model = output_model

    async def evaluate(self, context: dict) -> JudgeVerdict:
        missing = context.get("missing_keys", [])
        if missing:
            return JudgeVerdict(
                action="RETRY",
                feedback=f"Missing output keys: {missing}. Use set_output to provide them.",
            )
        try:
            self._model.model_validate(context["output_accumulator"])
            return JudgeVerdict(action="ACCEPT")
        except ValidationError as e:
            return JudgeVerdict(action="RETRY", feedback=str(e))
```

### When to Use Which Judge

| Judge           | Use When                               | Example                |
| --------------- | -------------------------------------- | ---------------------- |
| Implicit (None) | Output keys are sufficient validation  | Simple data extraction |
| SchemaJudge     | Need structural validation of outputs  | API response parsing   |
| Custom          | Domain-specific validation logic       | Score must be 0.0-1.0  |
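
As an example of the custom row, a hedged sketch of a range-checking judge, reusing the `JudgeVerdict` shape from the SchemaJudge example above (field names beyond that shape are assumptions):

```python
class ScoreRangeJudge:
    """Accept only when 'score' is present and within 0.0-1.0."""

    async def evaluate(self, context: dict) -> JudgeVerdict:
        score = context["output_accumulator"].get("score")
        if score is None:
            return JudgeVerdict(action="RETRY", feedback="Missing 'score'. Use set_output to provide it.")
        if not 0.0 <= float(score) <= 1.0:
            return JudgeVerdict(action="RETRY", feedback=f"Score {score} is out of range 0.0-1.0.")
        return JudgeVerdict(action="ACCEPT")
```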

## Fan-Out / Fan-In (Parallel Execution)

Multiple ON_SUCCESS edges from the same source trigger parallel execution. All branches run concurrently via `asyncio.gather()`.

```python
# Scanner fans out to Profiler and Scorer in parallel
EdgeSpec(id="scanner-to-profiler", source="scanner", target="profiler",
         condition=EdgeCondition.ON_SUCCESS)
EdgeSpec(id="scanner-to-scorer", source="scanner", target="scorer",
         condition=EdgeCondition.ON_SUCCESS)

# Both fan in to Extractor
EdgeSpec(id="profiler-to-extractor", source="profiler", target="extractor",
         condition=EdgeCondition.ON_SUCCESS)
EdgeSpec(id="scorer-to-extractor", source="scorer", target="extractor",
         condition=EdgeCondition.ON_SUCCESS)
```

**Requirements:**

- Parallel event_loop nodes must have **disjoint output_keys** (no key written by both; see the check after this list)
- Only one parallel branch may contain a `client_facing` node
- Fan-in node receives outputs from all completed branches in shared memory
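
The disjointness requirement is easy to check mechanically before running. A small sketch over the `output_keys` field used throughout this document (`profiler_node` and `scorer_node` are illustrative names for the branch specs):

```python
from collections import Counter

parallel_nodes = [profiler_node, scorer_node]  # the parallel branches

# Count how many branches write each output key; any count > 1 is a clash
key_counts = Counter(key for node in parallel_nodes for key in node.output_keys)
clashes = [key for key, count in key_counts.items() if count > 1]
assert not clashes, f"Parallel branches write the same output keys: {clashes}"
```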

## Context Management Patterns

### Tiered Compaction

EventLoopNode automatically manages context window usage with tiered compaction:

1. **Pruning** — Old tool results replaced with compact placeholders (zero-cost, no LLM call)
2. **Normal compaction** — LLM summarizes older messages
3. **Aggressive compaction** — Keeps only recent messages + summary
4. **Emergency** — Hard reset with tool history preservation

### Spillover Pattern

The framework automatically truncates large tool results and saves full content to a spillover directory. The LLM receives a truncation message with instructions to use `load_data` to read the full result.

For explicit data management, use the data tools (real MCP tools, not synthetic):

```python
# save_data, load_data, list_data_files, serve_file_to_user are real MCP tools
# data_dir is auto-injected by the framework — the LLM never sees it

# Saving large results
save_data(filename="sources.json", data=large_json_string)

# Reading with pagination (line-based offset/limit)
load_data(filename="sources.json", offset=0, limit=50)

# Listing available files
list_data_files()

# Serving a file to the user as a clickable link
serve_file_to_user(filename="report.html", label="Research Report")
```

Add data tools to nodes that handle large tool results:

```python
research_node = NodeSpec(
    ...
    tools=["web_search", "web_scrape", "load_data", "save_data", "list_data_files"],
)
```

`data_dir` is a framework context parameter — auto-injected at call time. `GraphExecutor.execute()` sets it per-execution via `ToolRegistry.set_execution_context(data_dir=...)` (using `contextvars` for concurrency safety), ensuring it matches the session-scoped spillover directory.

## Anti-Patterns

### What NOT to Do

- **Don't rely on `export_graph`** — Write files immediately, not at end
- **Don't hide code in session** — Write to files as components are approved
- **Don't wait to write files** — Agent visible from first step
- **Don't batch everything** — Write incrementally, one component at a time
- **Don't create too many thin nodes** — Prefer fewer, richer nodes (see below)
- **Don't add framework gating for LLM behavior** — Fix prompts or use judges instead

### Fewer, Richer Nodes

A common mistake is splitting work into too many small single-purpose nodes. Each node boundary requires serializing outputs, losing in-context information, and adding edge complexity.

| Bad (8 thin nodes)  | Good (4 rich nodes)                 |
| ------------------- | ----------------------------------- |
| parse-query         | intake (client-facing)              |
| search-sources      | research (search + fetch + analyze) |
| fetch-content       | review (client-facing)              |
| evaluate-sources    | report (write + deliver)            |
| synthesize-findings |                                     |
| write-report        |                                     |
| quality-check       |                                     |
| save-report         |                                     |

**Why fewer nodes are better:**

- The LLM retains full context of its work within a single node
- A research node that searches, fetches, and analyzes keeps all source material in its conversation history
- Fewer edges means simpler graph and fewer failure points
- Data tools (`save_data`/`load_data`) handle context window limits within a single node

### MCP Tools - Correct Usage

**MCP tools OK for:**

- `test_node` — Validate node configuration with mock inputs
- `validate_graph` — Check graph structure
- `configure_loop` — Set event loop parameters
- `create_session` — Track session state for bookkeeping

**Just don't:** Use MCP as the primary construction method or rely on export_graph

## Error Handling Patterns

### Graceful Failure with Fallback

```python
edges = [
    # Success path
    EdgeSpec(id="api-success", source="api-call", target="process-results",
             condition=EdgeCondition.ON_SUCCESS),
    # Fallback on failure
    EdgeSpec(id="api-to-fallback", source="api-call", target="fallback-cache",
             condition=EdgeCondition.ON_FAILURE, priority=1),
    # Report if fallback also fails
    EdgeSpec(id="fallback-to-error", source="fallback-cache", target="report-error",
             condition=EdgeCondition.ON_FAILURE, priority=1),
]
```

## Handoff to Testing

When agent is complete, transition to testing phase:

### Pre-Testing Checklist

- [ ] Agent structure validates: `uv run python -m agent_name validate` (see the sketch after this list)
- [ ] All nodes defined in `nodes/__init__.py`
- [ ] All edges connect valid nodes with correct priorities
- [ ] Feedback edge targets have `max_node_visits > 1`
- [ ] Client-facing nodes have meaningful system prompts
- [ ] Agent can be imported: `from exports.agent_name import default_agent`
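
The first and last items can be scripted. A quick sketch, substituting the real package name for `agent_name`:

```bash
# Structure validation via the agent's CLI
uv run python -m agent_name validate

# Import check
uv run python -c "from exports.agent_name import default_agent; print(default_agent)"
```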

## Related Skills

- **hive-concepts** — Fundamental concepts (node types, edges, event loop architecture)
- **hive-create** — Step-by-step building process
- **hive-test** — Test and validate agents
- **hive** — Complete workflow orchestrator

---

**Remember: Agent is actively constructed, visible the whole time. No hidden state. No surprise exports. Just transparent, incremental file building.**
@@ -1,940 +0,0 @@
---
name: hive-test
description: Iterative agent testing with session recovery. Execute, analyze, fix, resume from checkpoints. Use when testing an agent, debugging test failures, or verifying fixes without re-running from scratch.
---

# Agent Testing

Test agents iteratively: execute, analyze failures, fix, resume from checkpoint, repeat.

## When to Use

- Testing a newly built agent against its goal
- Debugging a failing agent iteratively
- Verifying fixes without re-running expensive early nodes
- Running final regression tests before deployment

## Prerequisites

1. Agent package at `exports/{agent_name}/` (built with `/hive-create`)
2. Credentials configured (`/hive-credentials`)
3. `ANTHROPIC_API_KEY` set (or appropriate LLM provider key)

**Path distinction** (critical — don't confuse these):
- `exports/{agent_name}/` — agent source code (edit here)
- `~/.hive/agents/{agent_name}/` — runtime data: sessions, checkpoints, logs (read here)

---

## The Iterative Test Loop

This is the core workflow. Don't re-run the entire agent when a late node fails — analyze, fix, and resume from the last clean checkpoint.

```
┌──────────────────────────────────────┐
│ PHASE 1: Generate Test Scenarios     │
│ Goal → synthetic test inputs + tests │
└──────────────┬───────────────────────┘
               ↓
┌──────────────────────────────────────┐
│ PHASE 2: Execute                     │◄────────────────┐
│ Run agent (CLI or pytest)            │                 │
└──────────────┬───────────────────────┘                 │
               ↓                                         │
        Pass? ──yes──► PHASE 6: Final Verification       │
          │                                              │
          no                                             │
          ↓                                              │
┌──────────────────────────────────────┐                 │
│ PHASE 3: Analyze                     │                 │
│ Session + runtime logs + checkpoints │                 │
└──────────────┬───────────────────────┘                 │
               ↓                                         │
┌──────────────────────────────────────┐                 │
│ PHASE 4: Fix                         │                 │
│ Prompt / code / graph / goal         │                 │
└──────────────┬───────────────────────┘                 │
               ↓                                         │
┌──────────────────────────────────────┐                 │
│ PHASE 5: Recover & Resume            │─────────────────┘
│ Checkpoint resume OR fresh re-run    │
└──────────────────────────────────────┘
```

---

### Phase 1: Generate Test Scenarios

Create synthetic tests from the agent's goal, constraints, and success criteria.

#### Step 1a: Read the goal

```python
# Read goal from agent.py
Read(file_path="exports/{agent_name}/agent.py")
# Extract the Goal definition and convert to JSON string
```

#### Step 1b: Get test guidelines

```python
# Get constraint test guidelines
generate_constraint_tests(
    goal_id="your-goal-id",
    goal_json='{"id": "...", "constraints": [...]}',
    agent_path="exports/{agent_name}"
)

# Get success criteria test guidelines
generate_success_tests(
    goal_id="your-goal-id",
    goal_json='{"id": "...", "success_criteria": [...]}',
    node_names="intake,research,review,report",
    tool_names="web_search,web_scrape",
    agent_path="exports/{agent_name}"
)
```

These return `file_header`, `test_template`, `constraints_formatted`/`success_criteria_formatted`, and `test_guidelines`. They do NOT generate test code — you write the tests.

#### Step 1c: Write tests

```python
Write(
    file_path=result["output_file"],
    content=result["file_header"] + "\n\n" + your_test_code
)
```

#### Test writing rules

- Every test MUST be `async` with `@pytest.mark.asyncio` (see the skeleton after this list)
- Every test MUST accept `runner, auto_responder, mock_mode` fixtures
- Use `await auto_responder.start()` before running, `await auto_responder.stop()` in `finally`
- Use `await runner.run(input_dict)` — this goes through AgentRunner → AgentRuntime → ExecutionStream
- Access output via `result.output.get("key")` — NEVER `result.output["key"]`
- `result.success=True` means no exception, NOT goal achieved — always check output
- Write 8-15 tests total, not 30+
- Each real test costs ~3 seconds + LLM tokens
- NEVER use `default_agent.run()` — it bypasses the runtime (no sessions, no logs, client-facing nodes hang)
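
Putting the rules together, a minimal skeleton (the input key, output key, and assertion are placeholders; the fixtures are the ones named above):

```python
import pytest


@pytest.mark.asyncio
async def test_agent_produces_report(runner, auto_responder, mock_mode):
    await auto_responder.start()
    try:
        result = await runner.run({"query": "test topic"})
        # success only means "no exception" - always check the output too
        assert result.success
        assert result.output.get("report") is not None
    finally:
        await auto_responder.stop()
```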

#### Step 1d: Check existing tests

Before generating, check if tests already exist:

```python
list_tests(
    goal_id="your-goal-id",
    agent_path="exports/{agent_name}"
)
```

---

### Phase 2: Execute

Two execution paths; use the right one for your situation.

#### Iterative debugging (for complex agents)

Run the agent via CLI. This creates sessions with checkpoints at `~/.hive/agents/{agent_name}/sessions/`:

```bash
uv run hive run exports/{agent_name} --input '{"query": "test topic"}'
```

Sessions and checkpoints are saved automatically.

**Client-facing nodes**: Agents with `client_facing=True` nodes (interactive conversation) work in headless mode when run from a real terminal — the agent streams output to stdout and reads user input from stdin via a `>>> ` prompt. In non-interactive shells (like Claude Code's Bash tool), client-facing nodes will hang because there is no stdin. For testing interactive agents from Claude Code, use `run_tests` with mock mode or have the user run the agent manually in their terminal.

#### Automated regression (for CI or final verification)

Use the `run_tests` MCP tool to run all pytest tests:

```python
run_tests(
    goal_id="your-goal-id",
    agent_path="exports/{agent_name}"
)
```

Returns structured results:
```json
{
  "overall_passed": false,
  "summary": {"total": 12, "passed": 10, "failed": 2, "pass_rate": "83.3%"},
  "test_results": [{"test_name": "test_success_source_diversity", "status": "failed"}],
  "failures": [{"test_name": "test_success_source_diversity", "details": "..."}]
}
```
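
A short sketch of triaging that structure, assuming the response has been captured as a dict named `results` (a placeholder name):

```python
if not results["overall_passed"]:
    print(f"Pass rate: {results['summary']['pass_rate']}")
    for failure in results["failures"]:
        # Each failure carries the test name and a details string
        print(f"- {failure['test_name']}: {failure['details'][:200]}")
```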

**Options:**
```python
# Run only constraint tests
run_tests(goal_id, agent_path, test_types='["constraint"]')

# Stop on first failure
run_tests(goal_id, agent_path, fail_fast=True)

# Parallel execution
run_tests(goal_id, agent_path, parallel=4)
```

**Note:** `run_tests` uses `AgentRunner` with `tmp_path` storage, so sessions are isolated per test run. For checkpoint-based recovery with persistent sessions, use CLI execution. Use `run_tests` for quick regression checks and final verification.

---

### Phase 3: Analyze Failures

When a test fails, drill down systematically. Don't guess — use the tools.

#### Step 3a: Get error category

```python
debug_test(
    goal_id="your-goal-id",
    test_name="test_success_source_diversity",
    agent_path="exports/{agent_name}"
)
```

Returns error category (`IMPLEMENTATION_ERROR`, `ASSERTION_FAILURE`, `TIMEOUT`, `IMPORT_ERROR`, `API_ERROR`) plus full traceback and suggestions.

#### Step 3b: Find the failed session

```python
list_agent_sessions(
    agent_work_dir="~/.hive/agents/{agent_name}",
    status="failed",
    limit=5
)
```

Returns session list with IDs, timestamps, current_node (where it failed), execution_quality.

#### Step 3c: Inspect session state

```python
get_agent_session_state(
    agent_work_dir="~/.hive/agents/{agent_name}",
    session_id="session_20260209_143022_abc12345"
)
```

Returns execution path, which node was current, step count, timestamps — but excludes memory values (to avoid context bloat). Shows `memory_keys` and `memory_size` instead.

#### Step 3d: Examine runtime logs (L2/L3)

```python
# L2: Per-node success/failure, retry counts
query_runtime_log_details(
    agent_work_dir="~/.hive/agents/{agent_name}",
    run_id="session_20260209_143022_abc12345",
    needs_attention_only=True
)

# L3: Exact LLM responses, tool call inputs/outputs
query_runtime_log_raw(
    agent_work_dir="~/.hive/agents/{agent_name}",
    run_id="session_20260209_143022_abc12345",
    node_id="research"
)
```

#### Step 3e: Inspect memory data

```python
# See what data a node actually produced
get_agent_session_memory(
    agent_work_dir="~/.hive/agents/{agent_name}",
    session_id="session_20260209_143022_abc12345",
    key="research_results"
)
```

#### Step 3f: Find recovery points

```python
list_agent_checkpoints(
    agent_work_dir="~/.hive/agents/{agent_name}",
    session_id="session_20260209_143022_abc12345",
    is_clean="true"
)
```

Returns checkpoint summaries with IDs, types (`node_start`, `node_complete`), which node, and `is_clean` flag. Clean checkpoints are safe resume points.

#### Step 3g: Compare checkpoints (optional)

To understand what changed between two points in execution:

```python
compare_agent_checkpoints(
    agent_work_dir="~/.hive/agents/{agent_name}",
    session_id="session_20260209_143022_abc12345",
    checkpoint_id_before="cp_node_complete_research_143030",
    checkpoint_id_after="cp_node_complete_review_143115"
)
```

Returns memory diff (added/removed/changed keys) and execution path diff.

---

### Phase 4: Fix Based on Root Cause

Use the analysis from Phase 3 to determine what to fix and where.

| Root Cause | What to Fix | Where to Edit |
|------------|------------|---------------|
| **Prompt issue** — LLM produces wrong output format, misses instructions | Node `system_prompt` | `exports/{agent}/nodes/__init__.py` |
| **Code bug** — TypeError, KeyError, logic error in Python | Agent code | `exports/{agent}/agent.py`, `nodes/__init__.py` |
| **Graph issue** — wrong routing, missing edge, bad condition_expr | Edges, node config | `exports/{agent}/agent.py` |
| **Tool issue** — MCP tool fails, wrong config, missing credential | Tool config | `exports/{agent}/mcp_servers.json`, `/hive-credentials` |
| **Goal issue** — success criteria too strict/vague, wrong constraints | Goal definition | `exports/{agent}/agent.py` (goal section) |
| **Test issue** — test expectations don't match actual agent behavior | Test code | `exports/{agent}/tests/test_*.py` |

#### Fix strategies by error category

**IMPLEMENTATION_ERROR** (TypeError, AttributeError, KeyError):
```python
# Read the failing code
Read(file_path="exports/{agent_name}/nodes/__init__.py")

# Fix the bug
Edit(
    file_path="exports/{agent_name}/nodes/__init__.py",
    old_string="results.get('videos')",
    new_string="(results or {}).get('videos', [])"
)
```

**ASSERTION_FAILURE** (test assertions fail but agent ran successfully):
- Check if the agent's output is actually wrong → fix the prompt
- Check if the test's expectations are unrealistic → fix the test
- Use `get_agent_session_memory` to see what the agent actually produced

**TIMEOUT / STALL** (agent runs too long):
- Check `node_visit_counts` for feedback loops hitting max_node_visits
- Check L3 logs for tool calls that hang
- Reduce `max_iterations` in loop_config or fix the prompt to converge faster

**API_ERROR** (connection, rate limit, auth):
- Verify credentials with `/hive-credentials`
- Check MCP server configuration

---

### Phase 5: Recover & Resume

After fixing the agent, decide whether to resume or re-run.

#### When to resume from checkpoint

Resume when ALL of these are true:
- The fix is to a node that comes AFTER existing clean checkpoints
- Clean checkpoints exist (from a CLI execution with checkpointing)
- The early nodes are expensive (web scraping, API calls, long LLM chains)

```bash
# Resume from the last clean checkpoint before the failing node
uv run hive run exports/{agent_name} \
  --resume-session session_20260209_143022_abc12345 \
  --checkpoint cp_node_complete_research_143030
```

This skips all nodes before the checkpoint and only re-runs the fixed node onward.

#### When to re-run from scratch

Re-run when ANY of these are true:
- The fix is to the entry node or an early node
- No checkpoints exist (e.g., agent was run via `run_tests`)
- The agent is fast (2-3 nodes, completes in seconds)
- You changed the graph structure (added/removed nodes/edges)

```bash
uv run hive run exports/{agent_name} --input '{"query": "test topic"}'
```

#### Inspecting a checkpoint before resuming

```python
get_agent_checkpoint(
    agent_work_dir="~/.hive/agents/{agent_name}",
    session_id="session_20260209_143022_abc12345",
    checkpoint_id="cp_node_complete_research_143030"
)
```

Returns the full checkpoint: shared_memory snapshot, execution_path, current_node, next_node, is_clean.

#### Loop back to Phase 2

After resuming or re-running, check if the fix worked. If not, go back to Phase 3.

---

### Phase 6: Final Verification

Once the iterative fix loop converges (the agent produces correct output), run the full automated test suite:

```python
run_tests(
    goal_id="your-goal-id",
    agent_path="exports/{agent_name}"
)
```

All tests should pass. If not, repeat the loop for remaining failures.

---

## Credential Requirements

**CRITICAL: Testing requires ALL credentials the agent depends on.** This includes both the LLM API key AND any tool-specific credentials (HubSpot, Brave Search, etc.).

### Prerequisites

Before running agent tests, you MUST collect ALL required credentials from the user.

**Step 1: LLM API Key (always required)**
```bash
export ANTHROPIC_API_KEY="your-key-here"
```

**Step 2: Tool-specific credentials (depends on agent's tools)**

Inspect the agent's `mcp_servers.json` and tool configuration to determine which tools the agent uses, then check for all required credentials:

```python
from aden_tools.credentials import CredentialManager, CREDENTIAL_SPECS

creds = CredentialManager()

# Determine which tools the agent uses (from agent.json or mcp_servers.json)
agent_tools = [...]  # e.g., ["hubspot_search_contacts", "web_search", ...]

# Find all missing credentials for those tools
missing = creds.get_missing_for_tools(agent_tools)
```

Common tool credentials:
| Tool | Env Var | Help URL |
|------|---------|----------|
| HubSpot CRM | `HUBSPOT_ACCESS_TOKEN` | https://developers.hubspot.com/docs/api/private-apps |
| Brave Search | `BRAVE_SEARCH_API_KEY` | https://brave.com/search/api/ |
| Google Search | `GOOGLE_SEARCH_API_KEY` + `GOOGLE_SEARCH_CX` | https://developers.google.com/custom-search |
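
Missing tool credentials are exported the same way as the LLM key; for example (values are placeholders, obtained from the help URLs above):

```bash
# Placeholder values - substitute real keys from the help URLs above
export HUBSPOT_ACCESS_TOKEN="your-hubspot-token"
export BRAVE_SEARCH_API_KEY="your-brave-key"
export GOOGLE_SEARCH_API_KEY="your-google-key"
export GOOGLE_SEARCH_CX="your-search-engine-id"
```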

**Why ALL credentials are required:**
- Tests need to execute the agent's LLM nodes to validate behavior
- Tools with missing credentials will return error dicts instead of real data
- Mock mode bypasses everything, providing no confidence in real-world performance

### Mock Mode Limitations

Mock mode (`--mock` flag or `MOCK_MODE=1`) is **ONLY for structure validation**:

- Validates graph structure (nodes, edges, connections)
- Validates that `AgentRunner.load()` succeeds and the agent is importable
- Does NOT execute event_loop agents — MockLLMProvider never calls `set_output`, so event_loop nodes loop forever
- Does NOT test LLM reasoning, content quality, or constraint validation
- Does NOT test real API integrations or tool use

**Bottom line:** If you're testing whether an agent achieves its goal, you MUST use real credentials.

### Enforcing Credentials in Tests

When writing tests, **ALWAYS include credential checks**:

```python
import os
import pytest
from aden_tools.credentials import CredentialManager

pytestmark = pytest.mark.skipif(
    not CredentialManager().is_available("anthropic") and not os.environ.get("MOCK_MODE"),
    reason="API key required for real testing. Set ANTHROPIC_API_KEY or use MOCK_MODE=1."
)


@pytest.fixture(scope="session", autouse=True)
def check_credentials():
    """Ensure ALL required credentials are set for real testing."""
    creds = CredentialManager()
    mock_mode = os.environ.get("MOCK_MODE")

    if not creds.is_available("anthropic"):
        if mock_mode:
            print("\nRunning in MOCK MODE - structure validation only")
        else:
            pytest.fail(
                "\nANTHROPIC_API_KEY not set!\n"
                "Set API key: export ANTHROPIC_API_KEY='your-key-here'\n"
                "Or run structure validation: MOCK_MODE=1 pytest exports/{agent}/tests/"
            )

    if not mock_mode:
        agent_tools = []  # Update per agent
        missing = creds.get_missing_for_tools(agent_tools)
        if missing:
            lines = ["\nMissing tool credentials!"]
            for name in missing:
                spec = creds.specs.get(name)
                if spec:
                    lines.append(f" {spec.env_var} - {spec.description}")
            pytest.fail("\n".join(lines))
```

### User Communication

When the user asks to test an agent, **ALWAYS check for ALL credentials first**:

1. **Identify the agent's tools** from `mcp_servers.json`
2. **Check ALL required credentials** using `CredentialManager`
3. **Ask the user to provide any missing credentials** before proceeding
4. Collect ALL missing credentials in a single prompt — not one at a time (see the sketch below)
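
A minimal sketch of step 4, assuming the `CredentialManager` API shown above; the prompt wording and the example tool list are illustrative:

```python
from aden_tools.credentials import CredentialManager

creds = CredentialManager()
agent_tools = ["web_search"]  # read the real list from the agent's mcp_servers.json

missing = creds.get_missing_for_tools(agent_tools)
if missing:
    # Present every missing credential at once, not one at a time
    lines = ["Please provide the following credentials before testing:"]
    for name in missing:
        spec = creds.specs.get(name)
        if spec:
            lines.append(f"  {spec.env_var} - {spec.description}")
    print("\n".join(lines))
```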

---

## Safe Test Patterns

### OutputCleaner

The framework automatically validates and cleans node outputs using a fast LLM at edge traversal time. Tests should still use safe patterns because OutputCleaner may not catch all issues.

### Safe Access (REQUIRED)

```python
# UNSAFE - will crash on missing keys
approval = result.output["approval_decision"]
category = result.output["analysis"]["category"]

# SAFE - use .get() with defaults
output = result.output or {}
approval = output.get("approval_decision", "UNKNOWN")

# SAFE - type check before operations
analysis = output.get("analysis", {})
if isinstance(analysis, dict):
    category = analysis.get("category", "unknown")

# SAFE - handle JSON parsing trap (LLM response as string)
import json
recommendation = output.get("recommendation", "{}")
if isinstance(recommendation, str):
    try:
        parsed = json.loads(recommendation)
        if isinstance(parsed, dict):
            approval = parsed.get("approval_decision", "UNKNOWN")
    except json.JSONDecodeError:
        approval = "UNKNOWN"
elif isinstance(recommendation, dict):
    approval = recommendation.get("approval_decision", "UNKNOWN")

# SAFE - type check before iteration
items = output.get("items", [])
if isinstance(items, list):
    for item in items:
        ...
```

### Helper Functions for conftest.py

```python
import json
import re

import pytest


def _parse_json_from_output(result, key):
    """Parse JSON from agent output (framework may store full LLM response as string)."""
    response_text = result.output.get(key, "")
    json_text = re.sub(r'```json\s*|\s*```', '', response_text).strip()
    try:
        return json.loads(json_text)
    except (json.JSONDecodeError, AttributeError, TypeError):
        return result.output.get(key)


def safe_get_nested(result, key_path, default=None):
    """Safely get nested value from result.output."""
    output = result.output or {}
    current = output
    for key in key_path:
        if isinstance(current, dict):
            current = current.get(key)
        elif isinstance(current, str):
            try:
                json_text = re.sub(r'```json\s*|\s*```', '', current).strip()
                parsed = json.loads(json_text)
                if isinstance(parsed, dict):
                    current = parsed.get(key)
                else:
                    return default
            except json.JSONDecodeError:
                return default
        else:
            return default
    return current if current is not None else default


# Make available in tests
pytest.parse_json_from_output = _parse_json_from_output
pytest.safe_get_nested = safe_get_nested
```
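
With these helpers attached to the `pytest` module, a test can read nested or string-wrapped output in one call. A hypothetical usage, assuming `result` is the `ExecutionResult` returned by `runner.run(...)`:

```python
# Hypothetical usage inside a test body
approval = pytest.safe_get_nested(
    result, ["recommendation", "approval_decision"], default="UNKNOWN"
)
assert approval in ("APPROVED", "REJECTED", "UNKNOWN"), f"Unexpected decision: {approval}"

# Parse a JSON payload the framework may have stored as a raw LLM string
analysis = pytest.parse_json_from_output(result, "analysis")
```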

### ExecutionResult Fields

**`result.success=True` means NO exception, NOT goal achieved**

```python
# WRONG
assert result.success

# RIGHT
assert result.success, f"Agent failed: {result.error}"
output = result.output or {}
approval = output.get("approval_decision")
assert approval == "APPROVED", f"Expected APPROVED, got {approval}"
```

All fields:
- `success: bool` — Completed without exception (NOT goal achieved!)
- `output: dict` — Complete memory snapshot (may contain raw strings)
- `error: str | None` — Error message if failed
- `steps_executed: int` — Number of nodes executed
- `total_tokens: int` — Cumulative token usage
- `total_latency_ms: int` — Total execution time
- `path: list[str]` — Node IDs traversed (may repeat in feedback loops)
- `paused_at: str | None` — Node ID if paused
- `session_state: dict` — State for resuming
- `node_visit_counts: dict[str, int]` — Visit counts per node (feedback loop testing)
- `execution_quality: str` — "clean", "degraded", or "failed"
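
A short sketch of how these fields combine in a post-run check; the thresholds are arbitrary examples, not framework defaults:

```python
def summarize_run(result):
    """Print a defensive summary of an ExecutionResult (fields from the list above)."""
    status = "OK" if result.success and result.execution_quality == "clean" else "NEEDS ATTENTION"
    print(f"{status}: {result.steps_executed} nodes, "
          f"{result.total_tokens} tokens, {result.total_latency_ms} ms")
    if result.error:
        print(f"error: {result.error}")
    # Repeated node IDs in path show up here as visit counts > 1
    for node_id, count in (result.node_visit_counts or {}).items():
        if count > 3:  # arbitrary example threshold for suspicious feedback loops
            print(f"node {node_id} visited {count} times")
```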

### Test Count Guidance

**Write 8-15 tests, not 30+**

- 2-3 tests per success criterion
- 1 happy path test
- 1 boundary/edge case test
- 1 error handling test (optional)

Each real test costs ~3 seconds + LLM tokens. 12 tests = ~36 seconds, ~$0.12.

---

## Test Patterns

### Happy Path
```python
@pytest.mark.asyncio
async def test_happy_path(runner, auto_responder, mock_mode):
    """Test normal successful execution."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "python tutorials"})
    finally:
        await auto_responder.stop()
    assert result.success, f"Agent failed: {result.error}"
    output = result.output or {}
    assert output.get("report"), "No report produced"
```

### Boundary Condition
```python
@pytest.mark.asyncio
async def test_minimum_sources(runner, auto_responder, mock_mode):
    """Test at minimum source threshold."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "niche topic"})
    finally:
        await auto_responder.stop()
    assert result.success, f"Agent failed: {result.error}"
    output = result.output or {}
    sources = output.get("sources", [])
    if isinstance(sources, list):
        assert len(sources) >= 3, f"Expected >= 3 sources, got {len(sources)}"
```

### Error Handling
```python
@pytest.mark.asyncio
async def test_empty_input(runner, auto_responder, mock_mode):
    """Test graceful handling of empty input."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": ""})
    finally:
        await auto_responder.stop()
    # Agent should either fail gracefully or produce an error message
    output = result.output or {}
    assert not result.success or output.get("error"), "Should handle empty input"
```

### Feedback Loop
```python
@pytest.mark.asyncio
async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
    """Test that feedback loops don't run forever."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "test"})
    finally:
        await auto_responder.stop()
    visits = result.node_visit_counts or {}
    for node_id, count in visits.items():
        assert count <= 5, f"Node {node_id} visited {count} times — possible infinite loop"
```
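
These patterns assume `runner`, `auto_responder`, and `mock_mode` fixtures from the generated conftest.py. Their real definitions ship with hive-test; purely as a sketch of their shape (the import path is an assumption, and the framework-provided `auto_responder` fixture is not sketched here):

```python
import os

import pytest


@pytest.fixture
def mock_mode():
    # True when running structure-validation-only tests
    return bool(os.environ.get("MOCK_MODE"))


@pytest.fixture
def runner():
    # AgentRunner.load() is referenced under "Mock Mode Limitations" above;
    # the import path and agent path here are assumptions for illustration.
    from framework.runner import AgentRunner
    return AgentRunner.load("exports/my_agent")
```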

---

## MCP Tool Reference

### Phase 1: Test Generation

```python
# Check existing tests
list_tests(goal_id, agent_path)

# Get constraint test guidelines (returns templates, NOT generated tests)
generate_constraint_tests(goal_id, goal_json, agent_path)
# Returns: output_file, file_header, test_template, constraints_formatted, test_guidelines

# Get success criteria test guidelines
generate_success_tests(goal_id, goal_json, node_names, tool_names, agent_path)
# Returns: output_file, file_header, test_template, success_criteria_formatted, test_guidelines
```

### Phase 2: Execution

```python
# Automated regression (no checkpoints, fresh runs)
run_tests(goal_id, agent_path, test_types='["all"]', parallel=-1, fail_fast=False)

# Run only specific test types
run_tests(goal_id, agent_path, test_types='["constraint"]')
run_tests(goal_id, agent_path, test_types='["success"]')
```

```bash
# Iterative debugging with checkpoints (via CLI)
uv run hive run exports/{agent_name} --input '{"query": "test"}'
```

### Phase 3: Analysis

```python
# Debug a specific failed test
debug_test(goal_id, test_name, agent_path)

# Find failed sessions
list_agent_sessions(agent_work_dir, status="failed", limit=5)

# Inspect session state (excludes memory values)
get_agent_session_state(agent_work_dir, session_id)

# Inspect memory data
get_agent_session_memory(agent_work_dir, session_id, key="research_results")

# Runtime logs: L1 summaries
query_runtime_logs(agent_work_dir, status="needs_attention")

# Runtime logs: L2 per-node details
query_runtime_log_details(agent_work_dir, run_id, needs_attention_only=True)

# Runtime logs: L3 tool/LLM raw data
query_runtime_log_raw(agent_work_dir, run_id, node_id="research")

# Find clean checkpoints
list_agent_checkpoints(agent_work_dir, session_id, is_clean="true")

# Compare checkpoints (memory diff)
compare_agent_checkpoints(agent_work_dir, session_id, cp_before, cp_after)
```

### Phase 5: Recovery

```python
# Inspect checkpoint before resuming
get_agent_checkpoint(agent_work_dir, session_id, checkpoint_id)
# Empty checkpoint_id = latest checkpoint
```

```bash
# Resume from checkpoint via CLI (headless)
uv run hive run exports/{agent_name} \
  --resume-session {session_id} --checkpoint {checkpoint_id}
```

---

## Anti-Patterns

| Don't | Do Instead |
|-------|-----------|
| Use `default_agent.run()` in tests | Use `runner.run()` with `auto_responder` fixtures (goes through AgentRuntime) |
| Re-run entire agent when a late node fails | Resume from last clean checkpoint |
| Treat `result.success` as goal achieved | Check `result.output` for actual criteria |
| Access `result.output["key"]` directly | Use `result.output.get("key")` |
| Fix random things hoping tests pass | Analyze L2/L3 logs to find root cause first |
| Write 30+ tests | Write 8-15 focused tests |
| Skip credential check | Use `/hive-credentials` before testing |
| Confuse `exports/` with `~/.hive/agents/` | Code in `exports/`, runtime data in `~/.hive/` |
| Use `run_tests` for iterative debugging | Use headless CLI with checkpoints for iterative debugging |
| Use headless CLI for final regression | Use `run_tests` for automated regression |
| Use `--tui` from Claude Code | Use headless `run` command — TUI hangs in non-interactive shells |
| Test client-facing nodes from Claude Code | Use mock mode, or have the user run the agent in their terminal |
| Run tests without reading goal first | Always understand the goal before writing tests |
| Skip Phase 3 analysis and guess | Use session + log tools to identify root cause |

---

## Example Walkthrough: Deep Research Agent

A complete iteration showing the test loop for an agent with nodes: `intake → research → review → report`.

### Phase 1: Generate tests

```python
# Read the goal
Read(file_path="exports/deep_research_agent/agent.py")

# Get success criteria test guidelines
result = generate_success_tests(
    goal_id="rigorous-interactive-research",
    goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "target": ">=5"}, {"id": "citation-coverage", "target": "100%"}, {"id": "report-completeness", "target": "90%"}]}',
    node_names="intake,research,review,report",
    tool_names="web_search,web_scrape",
    agent_path="exports/deep_research_agent"
)

# Write tests
Write(
    file_path=result["output_file"],
    content=result["file_header"] + "\n\n" + test_code
)
```

### Phase 2: First execution

```python
run_tests(
    goal_id="rigorous-interactive-research",
    agent_path="exports/deep_research_agent",
    fail_fast=True
)
```

Result: `test_success_source_diversity` fails — agent only found 2 sources instead of 5.

### Phase 3: Analyze

```python
# Debug the failing test
debug_test(
    goal_id="rigorous-interactive-research",
    test_name="test_success_source_diversity",
    agent_path="exports/deep_research_agent"
)
# → ASSERTION_FAILURE: Expected >= 5 sources, got 2

# Find the session
list_agent_sessions(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    status="completed",
    limit=1
)
# → session_20260209_150000_abc12345

# See what the research node produced
get_agent_session_memory(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    session_id="session_20260209_150000_abc12345",
    key="research_results"
)
# → Only 2 web_search calls made, each returned 1 source

# Check the LLM's behavior in the research node
query_runtime_log_raw(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    run_id="session_20260209_150000_abc12345",
    node_id="research"
)
# → LLM called web_search only twice, then called set_output
```

Root cause: The research node's prompt doesn't tell the LLM to search for at least 5 diverse sources. It stops after the first couple of searches.

### Phase 4: Fix the prompt

```python
Read(file_path="exports/deep_research_agent/nodes/__init__.py")

Edit(
    file_path="exports/deep_research_agent/nodes/__init__.py",
    old_string='system_prompt="Search for information on the user\'s topic."',
    new_string='system_prompt="Search for information on the user\'s topic. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries to ensure source diversity. Do not stop searching until you have at least 5 distinct sources."'
)
```

### Phase 5: Resume from checkpoint

For this example, the fix is to the `research` node. If we had run via CLI with checkpointing, we could resume from the checkpoint after `intake` to skip re-running intake:

```bash
# Check if clean checkpoint exists after intake
list_agent_checkpoints(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    session_id="session_20260209_150000_abc12345",
    is_clean="true"
)
# → cp_node_complete_intake_150005

# Resume from after intake, re-run research with fixed prompt
uv run hive run exports/deep_research_agent \
  --resume-session session_20260209_150000_abc12345 \
  --checkpoint cp_node_complete_intake_150005
```

Or for this simple case (intake is fast), just re-run:

```bash
uv run hive run exports/deep_research_agent --input '{"topic": "test"}'
```

### Phase 6: Final verification

```python
run_tests(
    goal_id="rigorous-interactive-research",
    agent_path="exports/deep_research_agent"
)
# → All 12 tests pass
```

---

## Test File Structure

```
exports/{agent_name}/
├── agent.py                     ← Agent to test (goal, nodes, edges)
├── nodes/__init__.py            ← Node implementations (prompts, config)
├── config.py                    ← Agent configuration
├── mcp_servers.json             ← Tool server config
└── tests/
    ├── conftest.py              ← Shared fixtures + safe access helpers
    ├── test_constraints.py      ← Constraint tests
    ├── test_success_criteria.py ← Success criteria tests
    └── test_edge_cases.py       ← Edge case tests
```

## Integration with Other Skills

| Scenario | From | To | Action |
|----------|------|----|--------|
| Agent built, ready to test | `/hive-create` | `/hive-test` | Generate tests, start loop |
| Prompt fix needed | `/hive-test` Phase 4 | Direct edit | Edit `nodes/__init__.py`, resume |
| Goal definition wrong | `/hive-test` Phase 4 | `/hive-create` | Update goal, may need rebuild |
| Missing credentials | `/hive-test` Phase 3 | `/hive-credentials` | Set up credentials |
| Complex runtime failure | `/hive-test` Phase 3 | `/hive-debugger` | Deep L1/L2/L3 analysis |
| All tests pass | `/hive-test` Phase 6 | Done | Agent validated |
@@ -1,333 +0,0 @@

# Example: Iterative Testing of a Research Agent

This example walks through the full iterative test loop for a research agent that searches the web, reviews findings, and produces a cited report.

## Agent Structure

```
exports/deep_research_agent/
├── agent.py            # Goal + graph: intake → research → review → report
├── nodes/__init__.py   # Node definitions (system_prompt, input/output keys)
├── config.py           # Model config
├── mcp_servers.json    # Tools: web_search, web_scrape
└── tests/              # Test files (we'll create these)
```

**Goal:** "Rigorous Interactive Research" — find 5+ diverse sources, cite every claim, produce a complete report.

---

## Phase 1: Generate Tests

### Read the goal

```python
Read(file_path="exports/deep_research_agent/agent.py")
# Extract: goal_id="rigorous-interactive-research"
# success_criteria: source-diversity (>=5), citation-coverage (100%), report-completeness (90%)
# constraints: no-hallucination, source-attribution
```

### Get test guidelines

```python
result = generate_success_tests(
    goal_id="rigorous-interactive-research",
    goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "description": "Use multiple diverse sources", "target": ">=5"}, {"id": "citation-coverage", "description": "Every claim cites its source", "target": "100%"}, {"id": "report-completeness", "description": "Report answers the research questions", "target": "90%"}]}',
    node_names="intake,research,review,report",
    tool_names="web_search,web_scrape",
    agent_path="exports/deep_research_agent"
)
```

### Write tests

```python
Write(
    file_path="exports/deep_research_agent/tests/test_success_criteria.py",
    content=result["file_header"] + '''

@pytest.mark.asyncio
async def test_success_source_diversity(runner, auto_responder, mock_mode):
    """At least 5 diverse sources are found."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "impact of remote work on productivity"})
    finally:
        await auto_responder.stop()
    assert result.success, f"Agent failed: {result.error}"
    output = result.output or {}
    sources = output.get("sources", [])
    if isinstance(sources, list):
        assert len(sources) >= 5, f"Expected >= 5 sources, got {len(sources)}"


@pytest.mark.asyncio
async def test_success_citation_coverage(runner, auto_responder, mock_mode):
    """Every factual claim in the report cites its source."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "climate change effects on agriculture"})
    finally:
        await auto_responder.stop()
    assert result.success, f"Agent failed: {result.error}"
    output = result.output or {}
    report = output.get("report", "")
    # Check that report contains numbered references
    assert "[1]" in str(report) or "[source" in str(report).lower(), "Report lacks citations"


@pytest.mark.asyncio
async def test_success_report_completeness(runner, auto_responder, mock_mode):
    """Report addresses the original research question."""
    query = "pros and cons of nuclear energy"
    await auto_responder.start()
    try:
        result = await runner.run({"query": query})
    finally:
        await auto_responder.stop()
    assert result.success, f"Agent failed: {result.error}"
    output = result.output or {}
    report = output.get("report", "")
    assert len(str(report)) > 200, f"Report too short: {len(str(report))} chars"


@pytest.mark.asyncio
async def test_empty_query_handling(runner, auto_responder, mock_mode):
    """Agent handles empty input gracefully."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": ""})
    finally:
        await auto_responder.stop()
    output = result.output or {}
    assert not result.success or output.get("error"), "Should handle empty query"


@pytest.mark.asyncio
async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
    """Feedback loop between review and research terminates."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "quantum computing basics"})
    finally:
        await auto_responder.stop()
    visits = result.node_visit_counts or {}
    for node_id, count in visits.items():
        assert count <= 5, f"Node {node_id} visited {count} times"
'''
)
```

---

## Phase 2: First Execution

```python
run_tests(
    goal_id="rigorous-interactive-research",
    agent_path="exports/deep_research_agent",
    fail_fast=True
)
```

**Result:**
```json
{
  "overall_passed": false,
  "summary": {"total": 5, "passed": 3, "failed": 2, "pass_rate": "60.0%"},
  "failures": [
    {"test_name": "test_success_source_diversity", "details": "AssertionError: Expected >= 5 sources, got 2"},
    {"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
  ]
}
```

---

## Phase 3: Analyze (Iteration 1)

### Debug the first failure

```python
debug_test(
    goal_id="rigorous-interactive-research",
    test_name="test_success_source_diversity",
    agent_path="exports/deep_research_agent"
)
# Category: ASSERTION_FAILURE — Expected >= 5 sources, got 2
```

### Find the session and inspect memory

```python
list_agent_sessions(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    status="completed",
    limit=1
)
# → session_20260209_150000_abc12345

get_agent_session_memory(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    session_id="session_20260209_150000_abc12345",
    key="research_results"
)
# → Only 2 sources found. LLM stopped searching after 2 queries.
```

### Check LLM behavior in the research node

```python
query_runtime_log_raw(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    run_id="session_20260209_150000_abc12345",
    node_id="research"
)
# → LLM called web_search twice, got results, immediately called set_output.
# → Prompt doesn't instruct it to find at least 5 sources.
```

**Root cause:** The research node's system_prompt doesn't specify minimum source requirements.

---

## Phase 4: Fix (Iteration 1)

```python
Read(file_path="exports/deep_research_agent/nodes/__init__.py")

# Fix the research node prompt
Edit(
    file_path="exports/deep_research_agent/nodes/__init__.py",
    old_string='system_prompt="Search for information on the user\'s topic using web search."',
    new_string='system_prompt="Search for information on the user\'s topic using web search. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries with varied keywords. Do NOT call set_output until you have gathered at least 5 distinct sources from different domains."'
)
```

---

## Phase 5: Recover & Resume (Iteration 1)

The fix is to the `research` node. Since this was a `run_tests` execution (no checkpoints), we re-run from scratch:

```python
run_tests(
    goal_id="rigorous-interactive-research",
    agent_path="exports/deep_research_agent",
    fail_fast=True
)
```

**Result:**
```json
{
  "overall_passed": false,
  "summary": {"total": 5, "passed": 4, "failed": 1, "pass_rate": "80.0%"},
  "failures": [
    {"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
  ]
}
```

Source diversity now passes. Citation coverage still fails.

---

## Phase 3: Analyze (Iteration 2)

```python
debug_test(
    goal_id="rigorous-interactive-research",
    test_name="test_success_citation_coverage",
    agent_path="exports/deep_research_agent"
)
# Category: ASSERTION_FAILURE — Report lacks citations

# Check what the report node produced
list_agent_sessions(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    status="completed",
    limit=1
)
# → session_20260209_151500_def67890

get_agent_session_memory(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    session_id="session_20260209_151500_def67890",
    key="report"
)
# → Report text exists but uses no numbered references.
# → Sources are in memory but report node doesn't cite them.
```

**Root cause:** The report node's prompt doesn't instruct the LLM to include numbered citations.

---

## Phase 4: Fix (Iteration 2)

```python
Edit(
    file_path="exports/deep_research_agent/nodes/__init__.py",
    old_string='system_prompt="Write a comprehensive report based on the research findings."',
    new_string='system_prompt="Write a comprehensive report based on the research findings. You MUST include numbered citations [1], [2], etc. for every factual claim. At the end, include a References section listing all sources with their URLs. Every claim must be traceable to a specific source."'
)
```

---

## Phase 5: Resume (Iteration 2)

The fix is to the `report` node (the last node). To demonstrate checkpoint recovery, run via CLI:

```bash
# Run via CLI to get checkpoints
uv run hive run exports/deep_research_agent --input '{"topic": "climate change effects"}'

# After it runs, find the clean checkpoint before report
list_agent_checkpoints(
    agent_work_dir="~/.hive/agents/deep_research_agent",
    session_id="session_20260209_152000_ghi34567",
    is_clean="true"
)
# → cp_node_complete_review_152100 (after review, before report)

# Resume — skips intake, research, review entirely
uv run hive run exports/deep_research_agent \
  --resume-session session_20260209_152000_ghi34567 \
  --checkpoint cp_node_complete_review_152100
```

Only the `report` node re-runs with the fixed prompt, using research data from the checkpoint.

---

## Phase 6: Final Verification

```python
run_tests(
    goal_id="rigorous-interactive-research",
    agent_path="exports/deep_research_agent"
)
```

**Result:**
```json
{
  "overall_passed": true,
  "summary": {"total": 5, "passed": 5, "failed": 0, "pass_rate": "100.0%"}
}
```

All tests pass.

---

## Summary

| Iteration | Failure | Root Cause | Fix | Recovery |
|-----------|---------|------------|-----|----------|
| 1 | Source diversity (2 < 5) | Research prompt too vague | Added "at least 5 sources" to prompt | Re-run (no checkpoints) |
| 2 | No citations in report | Report prompt lacks citation instructions | Added citation requirements | Checkpoint resume (skipped 3 nodes) |

**Key takeaways:**
- Phase 3 analysis (session memory + L3 logs) identified root causes without guessing
- Checkpoint recovery in iteration 2 saved time by skipping 3 expensive nodes
- Final `run_tests` confirms all scenarios pass end-to-end
@@ -1,526 +0,0 @@

---
name: hive
description: Complete workflow for building, implementing, and testing goal-driven agents. Orchestrates hive-* skills. Use when starting a new agent project, unsure which skill to use, or need end-to-end guidance.
license: Apache-2.0
metadata:
  author: hive
  version: "2.0"
  type: workflow-orchestrator
  orchestrates:
    - hive-concepts
    - hive-create
    - hive-patterns
    - hive-test
    - hive-credentials
    - hive-debugger
---

# Agent Development Workflow

**THIS IS AN EXECUTABLE WORKFLOW. DO NOT explore the codebase or read source files. ROUTE to the correct skill IMMEDIATELY.**

When this skill is loaded, **ALWAYS use the AskUserQuestion tool** to present options:

```
Use AskUserQuestion with these options:
- "Build a new agent" → Then invoke /hive-create
- "Test an existing agent" → Then invoke /hive-test
- "Learn agent concepts" → Then invoke /hive-concepts
- "Optimize agent design" → Then invoke /hive-patterns
- "Set up credentials" → Then invoke /hive-credentials
- "Debug a failing agent" → Then invoke /hive-debugger
- "Other" (please describe what you want to achieve)
```

**DO NOT:** Read source files, explore the codebase, search for code, or do any investigation before routing. The sub-skills handle all of that.

---

Complete Standard Operating Procedure (SOP) for building production-ready goal-driven agents.

## Overview

This workflow orchestrates specialized skills to take you from initial concept to production-ready agent:

1. **Understand Concepts** → `/hive-concepts` (optional)
2. **Build Structure** → `/hive-create`
3. **Optimize Design** → `/hive-patterns` (optional)
4. **Setup Credentials** → `/hive-credentials` (if agent uses tools requiring API keys)
5. **Test & Validate** → `/hive-test`
6. **Debug Issues** → `/hive-debugger` (if agent fails at runtime)

## When to Use This Workflow

Use this meta-skill when:
- Starting a new agent from scratch
- Unclear which skill to use first
- Need end-to-end guidance for agent development
- Want consistent, repeatable agent builds

**Skip this workflow** if:
- You only need to test an existing agent → use `/hive-test` directly
- You know exactly which phase you're in → use the specific skill directly

## Quick Decision Tree

```
"Need to understand agent concepts" → hive-concepts
"Build a new agent" → hive-create
"Optimize my agent design" → hive-patterns
"Need client-facing nodes or feedback loops" → hive-patterns
"Set up API keys for my agent" → hive-credentials
"Test my agent" → hive-test
"My agent is failing/stuck/has errors" → hive-debugger
"Not sure what I need" → Read phases below, then decide
"Agent has structure but needs implementation" → See agent directory STATUS.md
```

## Phase 0: Understand Concepts (Optional)

**Skill**: `/hive-concepts`
**Input**: Questions about agent architecture

### When to Use

- First time building an agent
- Need to understand node types, edges, goals
- Want to validate tool availability
- Learning about event loop architecture and client-facing nodes

### What This Phase Provides

- Architecture overview (Python packages, not JSON)
- Core concepts (Goal, Node, Edge, Event Loop, Judges)
- Tool discovery and validation procedures
- Workflow overview

**Skip this phase** if you already understand agent fundamentals.

## Phase 1: Build Agent Structure

**Skill**: `/hive-create`
**Input**: User requirements ("Build an agent that...") or a template to start from

### What This Phase Does

Creates the complete agent architecture:
- Package structure (`exports/agent_name/`)
- Goal with success criteria and constraints
- Workflow graph (nodes and edges)
- Node specifications
- CLI interface
- Documentation

### Process

1. **Create package** - Directory structure with skeleton files
2. **Define goal** - Success criteria and constraints written to agent.py
3. **Design nodes** - Each node approved and written incrementally
4. **Connect edges** - Workflow graph with conditional routing
5. **Finalize** - Agent class, exports, and documentation

### Outputs

- ✅ `exports/agent_name/` package created
- ✅ Goal defined in agent.py
- ✅ 3-5 success criteria defined
- ✅ 1-5 constraints defined
- ✅ 5-10 nodes specified in nodes/__init__.py
- ✅ 8-15 edges connecting workflow
- ✅ Validated structure (passes `uv run python -m agent_name validate`)
- ✅ README.md with usage instructions
- ✅ CLI commands (info, validate, run, shell)

### Success Criteria

You're ready for Phase 2 when:
- Agent structure validates without errors
- All nodes and edges are defined
- CLI commands work (info, validate)
- You see: "Agent complete: exports/agent_name/"

### Common Outputs

The hive-create skill produces:
```
exports/agent_name/
├── __init__.py (package exports)
├── __main__.py (CLI interface)
├── agent.py (goal, graph, agent class)
├── nodes/__init__.py (node specifications)
├── config.py (configuration)
├── implementations.py (may be created for Python functions)
└── README.md (documentation)
```

### Next Steps

**If structure complete and validated:**
→ Check `exports/agent_name/STATUS.md` or `IMPLEMENTATION_GUIDE.md`
→ These files explain implementation options
→ You may need to add Python functions or MCP tools (not covered by current skills)

**If you want to optimize the design:**
→ Proceed to Phase 1.5 (hive-patterns)

**If ready to test:**
→ Proceed to Phase 2

## Phase 1.5: Optimize Design (Optional)

**Skill**: `/hive-patterns`
**Input**: Completed agent structure

### When to Use

- Want to add client-facing blocking or feedback edges
- Need judge patterns for output validation
- Want fan-out/fan-in (parallel execution)
- Need error handling patterns
- Want best practices guidance

### What This Phase Provides

- Client-facing interaction patterns
- Feedback edge routing with nullable output keys
- Judge patterns (implicit, SchemaJudge)
- Fan-out/fan-in parallel execution
- Context management and spillover patterns
- Anti-patterns to avoid

**Skip this phase** if your agent design is straightforward.

## Phase 2: Test & Validate

**Skill**: `/hive-test`
**Input**: Working agent from Phase 1

### What This Phase Does

Guides the creation and execution of a comprehensive test suite:
- Constraint tests
- Success criteria tests
- Edge case tests
- Integration tests

### Process

1. **Analyze agent** - Read goal, constraints, success criteria
2. **Generate tests** - The calling agent writes pytest files in `exports/agent_name/tests/` using hive-test guidelines and templates
3. **User approval** - Review and approve each test
4. **Run evaluation** - Execute tests and collect results
5. **Debug failures** - Identify and fix issues
6. **Iterate** - Repeat until all tests pass

### Outputs

- ✅ Test files in `exports/agent_name/tests/`
- ✅ Test report with pass/fail metrics
- ✅ Coverage of all success criteria
- ✅ Coverage of all constraints
- ✅ Edge case handling verified

### Success Criteria

You're done when:
- All tests pass
- All success criteria validated
- All constraints verified
- Agent handles edge cases
- Test coverage is comprehensive

### Next Steps

**Agent ready for:**
- Production deployment
- Integration into larger systems
- Documentation and handoff
- Continuous monitoring

## Phase Transitions

### From Phase 1 to Phase 2

**Trigger signals:**
- "Agent complete: exports/..."
- Structure validation passes
- README indicates implementation complete

**Before proceeding:**
- Verify agent can be imported: `from exports.agent_name import default_agent`
- Check if implementation is needed (see STATUS.md or IMPLEMENTATION_GUIDE.md)
- Confirm agent executes without import errors

### Skipping Phases

**When to skip Phase 1:**
- Agent structure already exists
- Only need to add tests
- Modifying existing agent

**When to skip Phase 2:**
- Prototyping or exploring
- Agent not production-bound
- Manual testing sufficient

## Common Patterns

### Pattern 1: Complete New Build (Simple)

```
User: "Build an agent that monitors files"
→ Use /hive-create
→ Agent structure created
→ Use /hive-test
→ Tests created and passing
→ Done: Production-ready agent
```

### Pattern 1b: Complete New Build (With Learning)

```
User: "Build an agent (first time)"
→ Use /hive-concepts (understand concepts)
→ Use /hive-create (build structure)
→ Use /hive-patterns (optimize design)
→ Use /hive-test (validate)
→ Done: Production-ready agent
```

### Pattern 1c: Build from Template

```
User: "Build an agent based on the deep research template"
→ Use /hive-create
→ Select "From a template" path
→ Pick template, name new agent
→ Review/modify goal, nodes, graph
→ Agent exported with customizations
→ Use /hive-test
→ Done: Customized agent
```

### Pattern 2: Test Existing Agent

```
User: "Test my agent at exports/my_agent"
→ Skip Phase 1
→ Use /hive-test directly
→ Tests created
→ Done: Validated agent
```

### Pattern 3: Iterative Development

```
User: "Build an agent"
→ Use /hive-create (Phase 1)
→ Implementation needed (see STATUS.md)
→ [User implements functions]
→ Use /hive-test (Phase 2)
→ Tests reveal bugs
→ [Fix bugs manually]
→ Re-run tests
→ Done: Working agent
```

### Pattern 4: Agent with Review Loops and HITL Checkpoints

```
User: "Build an agent with human review and feedback loops"
→ Use /hive-concepts (learn event loop, client-facing nodes)
→ Use /hive-create (build structure with feedback edges)
→ Use /hive-patterns (implement client-facing + feedback patterns)
→ Use /hive-test (validate review flows and edge routing)
→ Done: Agent with HITL checkpoints and review loops
```

## Skill Dependencies

```
hive (meta-skill)
│
├── hive-concepts (foundational)
│   ├── Architecture concepts (event loop, judges)
│   ├── Node types (event_loop, function)
│   ├── Edge routing and priority
│   ├── Tool discovery procedures
│   └── Workflow overview
│
├── hive-create (procedural)
│   ├── Creates package structure
│   ├── Defines goal
│   ├── Adds nodes (event_loop, function)
│   ├── Connects edges with priority routing
│   ├── Finalizes agent class
│   └── Requires: hive-concepts
│
├── hive-patterns (reference)
│   ├── Client-facing interaction patterns
│   ├── Feedback edges and review loops
│   ├── Judge patterns (implicit, SchemaJudge)
│   ├── Fan-out/fan-in parallel execution
│   └── Context management and anti-patterns
│
├── hive-credentials (utility)
│   ├── Detects missing credentials
│   ├── Offers auth method choices (Aden OAuth, direct API key)
│   ├── Stores securely in ~/.hive/credentials
│   └── Validates with health checks
│
├── hive-test (validation)
│   ├── Reads agent goal
│   ├── Generates tests
│   ├── Runs evaluation
│   └── Reports results
│
└── hive-debugger (troubleshooting)
    ├── Monitors runtime logs (L1/L2/L3)
    ├── Identifies retry loops, tool failures
    ├── Categorizes issues (10 categories)
    └── Provides fix recommendations
```

## Troubleshooting

### "Agent structure won't validate"

- Check node IDs match between nodes/__init__.py and agent.py
- Verify all edges reference valid node IDs
- Ensure entry_node exists in the nodes list
- Run: `PYTHONPATH=exports uv run python -m agent_name validate`

### "Agent has structure but won't run"

- Check for STATUS.md or IMPLEMENTATION_GUIDE.md in the agent directory
- Implementation may be needed (Python functions or MCP tools)
- This is expected - hive-create creates structure, not implementation
- See the implementation guide for completion options

### "Tests are failing"

- Review test output for specific failures
- Check agent goal and success criteria
- Verify constraints are met
- Use `/hive-test` to debug and iterate
- Fix agent code and re-run tests

### "Agent is failing at runtime"

- Use `/hive-debugger` to analyze runtime logs
- The debugger identifies retry loops, tool failures, and stalled execution
- Get actionable fix recommendations with code changes
- Monitor the agent in real-time during TUI sessions

### "Not sure which phase I'm in"

Run these checks:

```bash
# Check if agent structure exists
ls exports/my_agent/agent.py

# Check if it validates
PYTHONPATH=exports uv run python -m my_agent validate

# Check if tests exist
ls exports/my_agent/tests/

# If structure exists and validates → Phase 2 (testing)
# If structure doesn't exist → Phase 1 (building)
# If tests exist but failing → Debug phase
```
|
|
||||||
|
|
||||||
## Best Practices

### For Phase 1 (Building)

1. **Start with clear requirements** - Know what the agent should do
2. **Define success criteria early** - Measurable goals drive design
3. **Keep nodes focused** - One responsibility per node
4. **Use descriptive names** - Node IDs should explain purpose
5. **Validate incrementally** - Check structure after each major addition

### For Phase 2 (Testing)

1. **Test constraints first** - Hard requirements must pass
2. **Mock external dependencies** - Use mock mode for LLMs/APIs (sketched below)
3. **Cover edge cases** - Test failures, not just success paths
4. **Iterate quickly** - Fix one test at a time
5. **Document test patterns** - Future tests follow same structure

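For item 2, the simplest form of mocking is to make the external call injectable so tests never touch a live LLM or API. A minimal sketch with hypothetical names (the framework's built-in mock mode may work differently):

```python
from typing import Callable

# Hypothetical node implementation: the LLM call is a parameter,
# so production code passes the real client and tests pass a stub.
def summarize_node(text: str, llm_call: Callable[[str], str]) -> str:
    return llm_call(f"Summarize: {text}")

def test_summarize_node_with_stub_llm():
    stub = lambda prompt: "stub summary"  # no network, deterministic
    assert summarize_node("long document", stub) == "stub summary"
```
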
### General Workflow

1. **Use version control** - Git commit after each phase
2. **Document decisions** - Update README with changes
3. **Keep iterations small** - Build → Test → Fix → Repeat
4. **Preserve working states** - Tag successful iterations
5. **Learn from failures** - Failed tests reveal design issues

## Exit Criteria

You're done with the workflow when:

✅ Agent structure validates
✅ All tests pass
✅ Success criteria met
✅ Constraints verified
✅ Documentation complete
✅ Agent ready for deployment

## Additional Resources

- **hive-concepts**: See `.claude/skills/hive-concepts/SKILL.md`
- **hive-create**: See `.claude/skills/hive-create/SKILL.md`
- **hive-patterns**: See `.claude/skills/hive-patterns/SKILL.md`
- **hive-test**: See `.claude/skills/hive-test/SKILL.md`
- **Agent framework docs**: See `core/README.md`
- **Example agents**: See `exports/` directory

## Summary

This workflow provides a proven path from concept to production-ready agent:

1. **Learn** with `/hive-concepts` → Understand fundamentals (optional)
2. **Build** with `/hive-create` → Get validated structure
3. **Optimize** with `/hive-patterns` → Apply best practices (optional)
4. **Configure** with `/hive-credentials` → Set up API keys (if needed)
5. **Test** with `/hive-test` → Get verified functionality
6. **Debug** with `/hive-debugger` → Fix runtime issues (if needed)

The workflow is **flexible** - skip phases as needed, iterate freely, and adapt to your specific requirements. The goal is **production-ready agents** built with **consistent, repeatable processes**.

## Skill Selection Guide

**Choose hive-concepts when:**
- First time building agents
- Need to understand event loop architecture
- Validating tool availability
- Learning about node types, edges, and judges

**Choose hive-create when:**
- Actually building an agent
- Have clear requirements
- Ready to write code
- Want step-by-step guidance
- Want to start from an existing template and customize it

**Choose hive-patterns when:**
- Agent structure complete
- Need client-facing nodes or feedback edges
- Implementing review loops or fan-out/fan-in
- Want judge patterns or context management
- Want best practices

**Choose hive-test when:**
- Agent structure complete
- Ready to validate functionality
- Need comprehensive test coverage
- Testing feedback loops, output keys, or fan-out

**Choose hive-debugger when:**
- Agent is failing or stuck at runtime
- Seeing retry loops or escalations
- Tool calls are failing
- Need to understand why a node isn't completing
- Want real-time monitoring of agent execution

@@ -1,199 +0,0 @@
# Example: File Monitor Agent

This example shows the complete /hive workflow in action for building a file monitoring agent.

## Initial Request

```
User: "Build an agent that monitors ~/Downloads and copies new files to ~/Documents"
```

## Phase 1: Building (20 minutes)

### Step 1: Create Structure

Agent invokes `/hive-create` skill and:

1. Creates `exports/file_monitor_agent/` package
2. Writes skeleton files (`__init__.py`, `__main__.py`, `agent.py`, etc.)

**Output**: Package structure visible immediately

### Step 2: Define Goal

```python
goal = Goal(
    id="file-monitor-copy",
    name="Automated File Monitor & Copy",
    success_criteria=[
        # 100% detection rate
        # 100% copy success
        # 100% conflict resolution
        # >99% uptime
    ],
    constraints=[
        # Preserve originals
        # Handle errors gracefully
        # Track state
        # Respect permissions
    ]
)
```

**Output**: Goal written to agent.py

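In the generated agent.py those placeholder comments become concrete entries. A hypothetical filled-in version, assuming `success_criteria` and `constraints` accept plain strings (the framework may use richer criterion types):

```python
# Goal comes from the agent framework, as in the skeleton above.
goal = Goal(
    id="file-monitor-copy",
    name="Automated File Monitor & Copy",
    success_criteria=[
        "Detect 100% of new files appearing in ~/Downloads",
        "Copy 100% of detected files to ~/Documents",
        "Resolve 100% of filename conflicts without data loss",
        "Maintain >99% uptime while running",
    ],
    constraints=[
        "Never modify or delete original files",
        "Handle errors gracefully and keep running",
        "Track processed files across cycles",
        "Respect filesystem permissions",
    ],
)
```
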
### Step 3: Design Nodes

7 nodes approved and written incrementally:

1. `initialize-state` - Set up tracking
2. `list-downloads` - Scan directory
3. `identify-new-files` - Find new files
4. `check-for-new-files` - Router
5. `copy-files` - Copy with conflict resolution (see the sketch below)
6. `update-state` - Mark as processed
7. `wait-interval` - Sleep between cycles

**Output**: All nodes in nodes/__init__.py

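Node 5 carries the only non-trivial logic. A standalone sketch of timestamp-based conflict resolution in plain Python (independent of the framework; the generated implementation may differ):

```python
import shutil
import time
from pathlib import Path

def copy_with_conflict_resolution(src: Path, dest_dir: Path) -> Path:
    """Copy src into dest_dir; on a name clash, append a timestamp."""
    dest = dest_dir / src.name
    if dest.exists():
        stamp = time.strftime("%Y%m%d-%H%M%S")
        dest = dest_dir / f"{src.stem}-{stamp}{src.suffix}"
    shutil.copy2(src, dest)  # copy2 preserves metadata; original untouched
    return dest
```
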
### Step 4: Connect Edges

8 edges connecting the workflow loop:

```
initialize → list → identify → check
check ─(new files)→ copy → update → wait
check ─(no new files)──────────────→ wait
wait → list  (loop)
```

**Output**: Edges written to agent.py

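Rendered as code, the same loop might look like this (assuming edges are plain (source, target) ID pairs; the framework's edge type may carry more fields):

```python
edges = [
    ("initialize-state", "list-downloads"),
    ("list-downloads", "identify-new-files"),
    ("identify-new-files", "check-for-new-files"),
    ("check-for-new-files", "copy-files"),     # new files found
    ("check-for-new-files", "wait-interval"),  # nothing new this cycle
    ("copy-files", "update-state"),
    ("update-state", "wait-interval"),
    ("wait-interval", "list-downloads"),       # loop
]
```
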
### Step 5: Finalize

```bash
$ PYTHONPATH=exports uv run python -m file_monitor_agent validate
✓ Agent is valid

$ PYTHONPATH=exports uv run python -m file_monitor_agent info
Agent: File Monitor & Copy Agent
Nodes: 7
Edges: 8
```

**Phase 1 Complete**: Structure validated ✅

### Status After Phase 1

```
exports/file_monitor_agent/
├── __init__.py              ✅ (exports)
├── __main__.py              ✅ (CLI)
├── agent.py                 ✅ (goal, graph, agent class)
├── nodes/__init__.py        ✅ (7 nodes)
├── config.py                ✅ (configuration)
├── implementations.py       ✅ (Python functions)
├── README.md                ✅ (documentation)
├── IMPLEMENTATION_GUIDE.md  ✅ (next steps)
└── STATUS.md                ✅ (current state)
```

**Note**: Implementation gap exists - data flow needs connection (covered in STATUS.md)

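Most of that gap is plumbing. For example, `identify-new-files` needs the set of already-processed paths that `update-state` maintains. A hypothetical pair of helpers (the state file's location and format are assumptions):

```python
import json
from pathlib import Path

STATE_FILE = Path.home() / ".file_monitor_state.json"  # hypothetical location

def load_processed() -> set[str]:
    """Return the set of paths the agent has already copied."""
    if STATE_FILE.exists():
        return set(json.loads(STATE_FILE.read_text()))
    return set()

def mark_processed(paths: set[str]) -> None:
    """Persist the processed set so restarts don't re-copy files."""
    STATE_FILE.write_text(json.dumps(sorted(paths)))

def identify_new_files(listing: list[str]) -> list[str]:
    processed = load_processed()
    return [p for p in listing if p not in processed]
```
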
## Phase 2: Testing (25 minutes)

### Step 1: Analyze Agent

Agent invokes `/hive-test` skill and:

1. Reads goal from `exports/file_monitor_agent/agent.py`
2. Identifies 4 success criteria to test
3. Identifies 4 constraints to verify
4. Plans test coverage

### Step 2: Generate Tests

Creates test files:

```
exports/file_monitor_agent/tests/
├── conftest.py              (fixtures)
├── test_constraints.py      (4 constraint tests)
├── test_success_criteria.py (4 success tests)
└── test_edge_cases.py       (error handling)
```

Tests approved incrementally by user.

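A sketch of what one constraint test might look like, using pytest's `tmp_path` fixture and the hypothetical copy helper sketched in Phase 1 (the generated tests may be structured differently):

```python
# test_constraints.py (sketch); the import path is an assumption.
from file_monitor_agent.implementations import copy_with_conflict_resolution

def test_preserves_originals(tmp_path):
    downloads = tmp_path / "Downloads"
    documents = tmp_path / "Documents"
    downloads.mkdir()
    documents.mkdir()
    src = downloads / "report.pdf"
    src.write_text("original contents")

    copy_with_conflict_resolution(src, documents)

    # The original must still exist, unmodified.
    assert src.exists()
    assert src.read_text() == "original contents"
    assert (documents / "report.pdf").read_text() == "original contents"
```
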
### Step 3: Run Tests

```bash
$ PYTHONPATH=exports uv run pytest exports/file_monitor_agent/tests/

test_constraints.py::test_preserves_originals PASSED
test_constraints.py::test_handles_errors PASSED
test_constraints.py::test_tracks_state PASSED
test_constraints.py::test_respects_permissions PASSED

test_success_criteria.py::test_detects_all_files PASSED
test_success_criteria.py::test_copies_all_files PASSED
test_success_criteria.py::test_resolves_conflicts PASSED
test_success_criteria.py::test_continuous_run PASSED

test_edge_cases.py::test_empty_directory PASSED
test_edge_cases.py::test_permission_denied PASSED
test_edge_cases.py::test_disk_full PASSED
test_edge_cases.py::test_large_files PASSED

========================== 12 passed in 3.42s ==========================
```

**Phase 2 Complete**: All tests pass ✅

## Final Output

**Production-Ready Agent:**

```bash
# Run the agent
./RUN_AGENT.sh

# Or manually
PYTHONPATH=exports uv run python -m file_monitor_agent run
```

**Capabilities:**
- Monitors ~/Downloads continuously
- Copies new files to ~/Documents
- Resolves conflicts with timestamps
- Handles errors gracefully
- Tracks processed files
- Runs as background service

**Total Time**: ~45 minutes from concept to production

## Key Learnings

1. **Incremental building** - Files written immediately, visible throughout
2. **Early validation** - Structure validated before moving to implementation
3. **Test-driven** - Tests reveal real behavior
4. **Documentation included** - README, STATUS, and guides auto-generated
5. **Repeatable process** - Same workflow for any agent type

## Variations

**For simpler agents:**
- Fewer nodes (3-5 instead of 7)
- Simpler workflow (linear instead of looping)
- Faster build time (10-15 minutes)

**For complex agents:**
- More nodes (10-15+)
- Multiple subgraphs
- Pause/resume points for human-in-the-loop
- Longer build time (45-60 minutes)

The workflow scales to your needs!

@@ -1,7 +0,0 @@
# Project-level Codex config for Hive.
# Keep this file minimal: MCP connectivity + skill discovery.

[mcp_servers.agent-builder]
command = "uv"
args = ["run", "--directory", "core", "-m", "framework.mcp.agent_builder_server"]
cwd = "."
@@ -1,20 +0,0 @@
{
  "mcpServers": {
    "agent-builder": {
      "command": "python",
      "args": ["-m", "framework.mcp.agent_builder_server"],
      "cwd": "core",
      "env": {
        "PYTHONPATH": "../tools/src"
      }
    },
    "tools": {
      "command": "python",
      "args": ["mcp_server.py", "--stdio"],
      "cwd": "tools",
      "env": {
        "PYTHONPATH": "src"
      }
    }
  }
}
@@ -1 +0,0 @@
../../.claude/skills/hive
@@ -1 +0,0 @@
../../.claude/skills/hive-concepts
@@ -1 +0,0 @@
../../.claude/skills/hive-create
@@ -1 +0,0 @@
../../.claude/skills/hive-credentials
@@ -1 +0,0 @@
../../.claude/skills/hive-patterns
@@ -1 +0,0 @@
../../.claude/skills/hive-test
@@ -1,30 +0,0 @@
{
  "mcpServers": {
    "agent-builder": {
      "command": "uv",
      "args": [
        "run",
        "python",
        "-m",
        "framework.mcp.agent_builder_server"
      ],
      "cwd": "core",
      "env": {
        "PYTHONPATH": "../tools/src"
      }
    },
    "tools": {
      "command": "uv",
      "args": [
        "run",
        "python",
        "mcp_server.py",
        "--stdio"
      ],
      "cwd": "tools",
      "env": {
        "PYTHONPATH": "src"
      }
    }
  }
}
@@ -1 +0,0 @@
../../.claude/skills/hive
@@ -1 +0,0 @@
../../.claude/skills/hive-concepts
@@ -1 +0,0 @@
../../.claude/skills/hive-create
@@ -1 +0,0 @@
../../.claude/skills/hive-credentials
@@ -1 +0,0 @@
../../.claude/skills/hive-debugger
@@ -1 +0,0 @@
../../.claude/skills/hive-patterns
@@ -1 +0,0 @@
../../.claude/skills/hive-test
@@ -1 +0,0 @@
../../.claude/skills/triage-issue
@@ -1,7 +0,0 @@
{
  "recommendations": [
    "charliermarsh.ruff",
    "editorconfig.editorconfig",
    "ms-python.python"
  ]
}