diff --git a/.claude/settings.local.json.example b/.claude/settings.local.json.example new file mode 100644 index 00000000..f4f34ec4 --- /dev/null +++ b/.claude/settings.local.json.example @@ -0,0 +1,34 @@ +{ + "permissions": { + "allow": [ + "mcp__agent-builder__create_session", + "mcp__agent-builder__set_goal", + "mcp__agent-builder__add_node", + "mcp__agent-builder__add_edge", + "mcp__agent-builder__configure_loop", + "mcp__agent-builder__add_mcp_server", + "mcp__agent-builder__validate_graph", + "mcp__agent-builder__export_graph", + "mcp__agent-builder__load_session_by_id", + "Bash(git status:*)", + "Bash(gh run view:*)", + "Bash(uv run:*)", + "Bash(env:*)", + "mcp__agent-builder__test_node", + "mcp__agent-builder__list_mcp_tools", + "Bash(python -m py_compile:*)", + "Bash(python -m pytest:*)", + "Bash(source:*)", + "mcp__agent-builder__update_node", + "mcp__agent-builder__check_missing_credentials", + "mcp__agent-builder__list_stored_credentials", + "Bash(find:*)", + "mcp__agent-builder__run_tests", + "Bash(PYTHONPATH=core:exports:tools/src uv run pytest:*)", + "mcp__agent-builder__list_agent_sessions", + "mcp__agent-builder__generate_constraint_tests", + "mcp__agent-builder__generate_success_tests" + ] + }, + "enabledMcpjsonServers": ["agent-builder", "tools"] +} diff --git a/.claude/skills/hive-debugger/SKILL.md b/.claude/skills/hive-debugger/SKILL.md index 9f783764..cc46b4f2 100644 --- a/.claude/skills/hive-debugger/SKILL.md +++ b/.claude/skills/hive-debugger/SKILL.md @@ -562,15 +562,33 @@ PYTHONPATH=core:exports python -m {agent_name} --tui ### Find Available Checkpoints: -```bash -# In TUI: -/sessions {session_id} +Use MCP tools to programmatically find and inspect checkpoints: -# This shows all checkpoints with timestamps: -Available Checkpoints: (3) - 1. cp_node_complete_intake_143030 - 2. cp_node_complete_research_143115 - 3. cp_pause_research_143130 +``` +# List all sessions to find the failed one +list_agent_sessions(agent_work_dir="~/.hive/agents/{agent_name}", status="failed") + +# Inspect session state +get_agent_session_state(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}") + +# Find clean checkpoints to resume from +list_agent_checkpoints(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}", is_clean="true") + +# Compare checkpoints to understand what changed +compare_agent_checkpoints( + agent_work_dir="~/.hive/agents/{agent_name}", + session_id="{session_id}", + checkpoint_id_before="cp_node_complete_intake_143030", + checkpoint_id_after="cp_node_complete_research_143115" +) + +# Inspect memory at a specific checkpoint +get_agent_checkpoint(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}", checkpoint_id="cp_node_complete_intake_143030") +``` + +Or in TUI: +```bash +/sessions {session_id} ``` **Verification:** @@ -717,6 +735,80 @@ Let me know when you've run it and I'll help check the logs!" 
) ``` +### Session & Checkpoint Tools + +**list_agent_sessions** - Browse sessions with filtering +- **When to use:** Finding resumable sessions, identifying failed sessions, Stage 3 triage +- **Returns:** Session list with status, timestamps, is_resumable, current_node, quality +- **Example:** + ``` + list_agent_sessions( + agent_work_dir="/home/user/.hive/agents/twitter_outreach", + status="failed", + limit=10 + ) + ``` + +**get_agent_session_state** - Load full session state (excludes memory values) +- **When to use:** Inspecting session progress, checking is_resumable, examining path +- **Returns:** Full state with memory_keys/memory_size instead of memory values +- **Example:** + ``` + get_agent_session_state( + agent_work_dir="/home/user/.hive/agents/twitter_outreach", + session_id="session_20260208_143022_abc12345" + ) + ``` + +**get_agent_session_memory** - Get memory contents from a session +- **When to use:** Stage 5 root cause analysis, inspecting produced data +- **Returns:** All memory keys+values, or a single key's value +- **Example:** + ``` + get_agent_session_memory( + agent_work_dir="/home/user/.hive/agents/twitter_outreach", + session_id="session_20260208_143022_abc12345", + key="twitter_handles" + ) + ``` + +**list_agent_checkpoints** - List checkpoints for a session +- **When to use:** Stage 6 recovery, finding clean checkpoints to resume from +- **Returns:** Checkpoint summaries with type, node, clean status +- **Example:** + ``` + list_agent_checkpoints( + agent_work_dir="/home/user/.hive/agents/twitter_outreach", + session_id="session_20260208_143022_abc12345", + is_clean="true" + ) + ``` + +**get_agent_checkpoint** - Load a specific checkpoint with full state +- **When to use:** Inspecting exact state at a checkpoint, comparing to current state +- **Returns:** Full checkpoint: memory snapshot, execution path, metrics +- **Example:** + ``` + get_agent_checkpoint( + agent_work_dir="/home/user/.hive/agents/twitter_outreach", + session_id="session_20260208_143022_abc12345", + checkpoint_id="cp_node_complete_intake_143030" + ) + ``` + +**compare_agent_checkpoints** - Diff memory between two checkpoints +- **When to use:** Understanding data flow, finding where state diverged +- **Returns:** Memory diff (added/removed/changed keys) + execution path diff +- **Example:** + ``` + compare_agent_checkpoints( + agent_work_dir="/home/user/.hive/agents/twitter_outreach", + session_id="session_20260208_143022_abc12345", + checkpoint_id_before="cp_node_complete_intake_143030", + checkpoint_id_after="cp_node_complete_research_143115" + ) + ``` + ### Query Patterns **Pattern 1: Top-Down Investigation** (Most common) @@ -739,6 +831,16 @@ Loop every 10 seconds: 2. If found: Alert and drill into L2 ``` +**Pattern 4: Session State + Checkpoint Recovery** +``` +1. list_agent_sessions: Find failed/paused sessions +2. get_agent_session_state: Check is_resumable, see execution path +3. get_agent_session_memory: Inspect what data was produced +4. list_agent_checkpoints: Find clean checkpoints before failure +5. compare_agent_checkpoints: Understand what changed between checkpoints +6. Recommend resume command with specific checkpoint +``` + --- ## Complete Example Walkthrough diff --git a/.claude/skills/hive-test/SKILL.md b/.claude/skills/hive-test/SKILL.md index 5c94be88..4e8cfe69 100644 --- a/.claude/skills/hive-test/SKILL.md +++ b/.claude/skills/hive-test/SKILL.md @@ -1,123 +1,396 @@ --- name: hive-test -description: Run goal-based evaluation tests for agents. 
Use when you need to verify an agent meets its goals, debug failing tests, or iterate on agent improvements based on test results. +description: Iterative agent testing with session recovery. Execute, analyze, fix, resume from checkpoints. Use when testing an agent, debugging test failures, or verifying fixes without re-running from scratch. --- -# Testing Workflow +# Agent Testing -This skill provides tools for testing agents built with the hive-create skill. +Test agents iteratively: execute, analyze failures, fix, resume from checkpoint, repeat. -## Workflow Overview +## When to Use -1. `mcp__agent-builder__list_tests` - Check what tests exist -2. `mcp__agent-builder__generate_constraint_tests` or `mcp__agent-builder__generate_success_tests` - Get test guidelines -3. **Write tests directly** using the Write tool with the guidelines provided -4. `mcp__agent-builder__run_tests` - Execute tests -5. `mcp__agent-builder__debug_test` - Debug failures +- Testing a newly built agent against its goal +- Debugging a failing agent iteratively +- Verifying fixes without re-running expensive early nodes +- Running final regression tests before deployment -## How Test Generation Works +## Prerequisites -The `generate_*_tests` MCP tools return **guidelines and templates** - they do NOT generate test code via LLM. -You (Claude) write the tests directly using the Write tool based on the guidelines. +1. Agent package at `exports/{agent_name}/` (built with `/hive-create`) +2. Credentials configured (`/hive-credentials`) +3. `ANTHROPIC_API_KEY` set (or appropriate LLM provider key) -### Example Workflow +**Path distinction** (critical — don't confuse these): +- `exports/{agent_name}/` — agent source code (edit here) +- `~/.hive/agents/{agent_name}/` — runtime data: sessions, checkpoints, logs (read here) + +--- + +## The Iterative Test Loop + +This is the core workflow. Don't re-run the entire agent when a late node fails — analyze, fix, and resume from the last clean checkpoint. + +``` +┌──────────────────────────────────────┐ +│ PHASE 1: Generate Test Scenarios │ +│ Goal → synthetic test inputs + tests │ +└──────────────┬───────────────────────┘ + ↓ +┌──────────────────────────────────────┐ +│ PHASE 2: Execute │◄────────────────┐ +│ Run agent (CLI or pytest) │ │ +└──────────────┬───────────────────────┘ │ + ↓ │ + Pass? ──yes──► PHASE 6: Final Verification │ + │ │ + no │ + ↓ │ +┌──────────────────────────────────────┐ │ +│ PHASE 3: Analyze │ │ +│ Session + runtime logs + checkpoints │ │ +└──────────────┬───────────────────────┘ │ + ↓ │ +┌──────────────────────────────────────┐ │ +│ PHASE 4: Fix │ │ +│ Prompt / code / graph / goal │ │ +└──────────────┬───────────────────────┘ │ + ↓ │ +┌──────────────────────────────────────┐ │ +│ PHASE 5: Recover & Resume │─────────────────┘ +│ Checkpoint resume OR fresh re-run │ +└──────────────────────────────────────┘ +``` + +--- + +### Phase 1: Generate Test Scenarios + +Create synthetic tests from the agent's goal, constraints, and success criteria. 
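Both generators below consume the goal as a JSON string (`goal_json`). A minimal sketch of that shape, for orientation only: the field names inside `constraints` and `success_criteria` are illustrative assumptions, so copy the real values from the `Goal` in `agent.py` (Step 1a).

```python
import json

# Hypothetical goal shape. Only "id", "constraints", and "success_criteria"
# are referenced by this skill; the inner fields are illustrative.
goal = {
    "id": "research-report-goal",
    "name": "Research Report Agent",
    "constraints": [
        {"id": "source_diversity", "description": "Use at least 3 distinct sources"},
    ],
    "success_criteria": [
        {"id": "report_produced", "description": "A markdown report is written to output"},
    ],
}
goal_json = json.dumps(goal)  # pass this string as the goal_json argument below
```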
+ +#### Step 1a: Read the goal ```python -# Step 1: Get test guidelines -result = mcp__agent-builder__generate_constraint_tests( - goal_id="my-goal", +# Read goal from agent.py +Read(file_path="exports/{agent_name}/agent.py") +# Extract the Goal definition and convert to JSON string +``` + +#### Step 1b: Get test guidelines + +```python +# Get constraint test guidelines +generate_constraint_tests( + goal_id="your-goal-id", goal_json='{"id": "...", "constraints": [...]}', - agent_path="exports/my_agent" + agent_path="exports/{agent_name}" ) -# Step 2: The result contains: -# - output_file: where to write tests -# - file_header: imports and fixtures to use -# - test_template: format for test functions -# - constraints_formatted: the constraints to test -# - test_guidelines: rules for writing tests +# Get success criteria test guidelines +generate_success_tests( + goal_id="your-goal-id", + goal_json='{"id": "...", "success_criteria": [...]}', + node_names="intake,research,review,report", + tool_names="web_search,web_scrape", + agent_path="exports/{agent_name}" +) +``` -# Step 3: Write tests directly using the Write tool +These return `file_header`, `test_template`, `constraints_formatted`/`success_criteria_formatted`, and `test_guidelines`. They do NOT generate test code — you write the tests. + +#### Step 1c: Write tests + +```python Write( file_path=result["output_file"], - content=result["file_header"] + test_code_you_write + content=result["file_header"] + "\n\n" + your_test_code ) +``` -# Step 4: Run tests via MCP tool -mcp__agent-builder__run_tests( - goal_id="my-goal", - agent_path="exports/my_agent" -) +#### Test writing rules -# Step 5: Debug failures via MCP tool -mcp__agent-builder__debug_test( - goal_id="my-goal", - test_name="test_constraint_foo", - agent_path="exports/my_agent" +- Every test MUST be `async` with `@pytest.mark.asyncio` +- Every test MUST accept `runner, auto_responder, mock_mode` fixtures +- Use `await auto_responder.start()` before running, `await auto_responder.stop()` in `finally` +- Use `await runner.run(input_dict)` — this goes through AgentRunner → AgentRuntime → ExecutionStream +- Access output via `result.output.get("key")` — NEVER `result.output["key"]` +- `result.success=True` means no exception, NOT goal achieved — always check output +- Write 8-15 tests total, not 30+ +- Each real test costs ~3 seconds + LLM tokens +- NEVER use `default_agent.run()` — it bypasses the runtime (no sessions, no logs, client-facing nodes hang) + +#### Step 1d: Check existing tests + +Before generating, check if tests already exist: + +```python +list_tests( + goal_id="your-goal-id", + agent_path="exports/{agent_name}" ) ``` --- -# Testing Agents with MCP Tools +### Phase 2: Execute -Run goal-based evaluation tests for agents built with the hive-create skill. +Two execution paths, use the right one for your situation. -**Key Principle: MCP tools provide guidelines, Claude writes tests directly** -- ✅ Get guidelines: `generate_constraint_tests`, `generate_success_tests` → returns templates and guidelines -- ✅ Write tests: Use the Write tool with the provided file_header and test_template -- ✅ Run tests: `run_tests` (runs pytest via subprocess) -- ✅ Debug failures: `debug_test` (re-runs single test with verbose output) -- ✅ List tests: `list_tests` (scans Python test files) -- ✅ Tests stored in `exports/{agent}/tests/test_*.py` +#### Iterative debugging (for complex agents) -## Architecture: Python Test Files +Run the agent via CLI. 
This creates sessions with checkpoints at `~/.hive/agents/{agent_name}/sessions/`: -``` -exports/my_agent/ -├── __init__.py -├── agent.py ← Agent to test -├── nodes/__init__.py -├── config.py -├── __main__.py -└── tests/ ← Test files written by MCP tools - ├── conftest.py # Shared fixtures (auto-created) - ├── test_constraints.py - ├── test_success_criteria.py - └── test_edge_cases.py +```bash +uv run hive run exports/{agent_name} --input '{"query": "test topic"}' ``` -**Tests import the agent directly:** +Sessions and checkpoints are saved automatically. + +**Client-facing nodes**: Agents with `client_facing=True` nodes (interactive conversation) work in headless mode when run from a real terminal — the agent streams output to stdout and reads user input from stdin via a `>>> ` prompt. In non-interactive shells (like Claude Code's Bash tool), client-facing nodes will hang because there is no stdin. For testing interactive agents from Claude Code, use `run_tests` with mock mode or have the user run the agent manually in their terminal. + +#### Automated regression (for CI or final verification) + +Use the `run_tests` MCP tool to run all pytest tests: + ```python -import pytest -from exports.my_agent import default_agent - - -@pytest.mark.asyncio -async def test_happy_path(mock_mode): - result = await default_agent.run({"query": "test"}, mock_mode=mock_mode) - assert result.success - assert len(result.output) > 0 +run_tests( + goal_id="your-goal-id", + agent_path="exports/{agent_name}" +) ``` -## Why This Approach +Returns structured results: +```json +{ + "overall_passed": false, + "summary": {"total": 12, "passed": 10, "failed": 2, "pass_rate": "83.3%"}, + "test_results": [{"test_name": "test_success_source_diversity", "status": "failed"}], + "failures": [{"test_name": "test_success_source_diversity", "details": "..."}] +} +``` -- MCP tools provide consistent test guidelines with proper imports, fixtures, and API key enforcement -- Claude writes tests directly, eliminating circular LLM dependencies in the MCP server -- `run_tests` parses pytest output into structured results for iteration -- `debug_test` provides formatted output with actionable debugging info -- File headers include conftest.py setup with proper fixtures +**Options:** +```python +# Run only constraint tests +run_tests(goal_id, agent_path, test_types='["constraint"]') -## Quick Start +# Stop on first failure +run_tests(goal_id, agent_path, fail_fast=True) -1. **Check existing tests** - `list_tests(goal_id, agent_path)` -2. **Get test guidelines** - `generate_constraint_tests` or `generate_success_tests` -3. **Write tests** - Use the Write tool with the provided file_header and guidelines -4. **Run tests** - `run_tests(goal_id, agent_path)` -5. **Debug failures** - `debug_test(goal_id, test_name, agent_path)` -6. **Iterate** - Repeat steps 4-5 until all pass +# Parallel execution +run_tests(goal_id, agent_path, parallel=4) +``` -## ⚠️ Credential Requirements for Testing +**Note:** `run_tests` uses `AgentRunner` with `tmp_path` storage, so sessions are isolated per test run. For checkpoint-based recovery with persistent sessions, use CLI execution. Use `run_tests` for quick regression checks and final verification. + +--- + +### Phase 3: Analyze Failures + +When a test fails, drill down systematically. Don't guess — use the tools. 
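If the failure came from `run_tests` in Phase 2, the structured result already names the failing tests, and each entry in `failures` carries the `test_name` that Step 3a needs. A small sketch over that documented shape:

```python
# `results` mirrors the structured output run_tests returns (see Phase 2)
results = {
    "overall_passed": False,
    "summary": {"total": 12, "passed": 10, "failed": 2, "pass_rate": "83.3%"},
    "failures": [{"test_name": "test_success_source_diversity", "details": "..."}],
}

# Names to feed into debug_test one at a time
failing = [f.get("test_name") for f in results.get("failures", [])]
print("Failing tests:", failing)
```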
+ +#### Step 3a: Get error category + +```python +debug_test( + goal_id="your-goal-id", + test_name="test_success_source_diversity", + agent_path="exports/{agent_name}" +) +``` + +Returns error category (`IMPLEMENTATION_ERROR`, `ASSERTION_FAILURE`, `TIMEOUT`, `IMPORT_ERROR`, `API_ERROR`) plus full traceback and suggestions. + +#### Step 3b: Find the failed session + +```python +list_agent_sessions( + agent_work_dir="~/.hive/agents/{agent_name}", + status="failed", + limit=5 +) +``` + +Returns session list with IDs, timestamps, current_node (where it failed), execution_quality. + +#### Step 3c: Inspect session state + +```python +get_agent_session_state( + agent_work_dir="~/.hive/agents/{agent_name}", + session_id="session_20260209_143022_abc12345" +) +``` + +Returns execution path, which node was current, step count, timestamps — but excludes memory values (to avoid context bloat). Shows `memory_keys` and `memory_size` instead. + +#### Step 3d: Examine runtime logs (L2/L3) + +```python +# L2: Per-node success/failure, retry counts +query_runtime_log_details( + agent_work_dir="~/.hive/agents/{agent_name}", + run_id="session_20260209_143022_abc12345", + needs_attention_only=True +) + +# L3: Exact LLM responses, tool call inputs/outputs +query_runtime_log_raw( + agent_work_dir="~/.hive/agents/{agent_name}", + run_id="session_20260209_143022_abc12345", + node_id="research" +) +``` + +#### Step 3e: Inspect memory data + +```python +# See what data a node actually produced +get_agent_session_memory( + agent_work_dir="~/.hive/agents/{agent_name}", + session_id="session_20260209_143022_abc12345", + key="research_results" +) +``` + +#### Step 3f: Find recovery points + +```python +list_agent_checkpoints( + agent_work_dir="~/.hive/agents/{agent_name}", + session_id="session_20260209_143022_abc12345", + is_clean="true" +) +``` + +Returns checkpoint summaries with IDs, types (`node_start`, `node_complete`), which node, and `is_clean` flag. Clean checkpoints are safe resume points. + +#### Step 3g: Compare checkpoints (optional) + +To understand what changed between two points in execution: + +```python +compare_agent_checkpoints( + agent_work_dir="~/.hive/agents/{agent_name}", + session_id="session_20260209_143022_abc12345", + checkpoint_id_before="cp_node_complete_research_143030", + checkpoint_id_after="cp_node_complete_review_143115" +) +``` + +Returns memory diff (added/removed/changed keys) and execution path diff. + +--- + +### Phase 4: Fix Based on Root Cause + +Use the analysis from Phase 3 to determine what to fix and where. 
+ +| Root Cause | What to Fix | Where to Edit | +|------------|------------|---------------| +| **Prompt issue** — LLM produces wrong output format, misses instructions | Node `system_prompt` | `exports/{agent}/nodes/__init__.py` | +| **Code bug** — TypeError, KeyError, logic error in Python | Agent code | `exports/{agent}/agent.py`, `nodes/__init__.py` | +| **Graph issue** — wrong routing, missing edge, bad condition_expr | Edges, node config | `exports/{agent}/agent.py` | +| **Tool issue** — MCP tool fails, wrong config, missing credential | Tool config | `exports/{agent}/mcp_servers.json`, `/hive-credentials` | +| **Goal issue** — success criteria too strict/vague, wrong constraints | Goal definition | `exports/{agent}/agent.py` (goal section) | +| **Test issue** — test expectations don't match actual agent behavior | Test code | `exports/{agent}/tests/test_*.py` | + +#### Fix strategies by error category + +**IMPLEMENTATION_ERROR** (TypeError, AttributeError, KeyError): +```python +# Read the failing code +Read(file_path="exports/{agent_name}/nodes/__init__.py") + +# Fix the bug +Edit( + file_path="exports/{agent_name}/nodes/__init__.py", + old_string="results.get('videos')", + new_string="(results or {}).get('videos', [])" +) +``` + +**ASSERTION_FAILURE** (test assertions fail but agent ran successfully): +- Check if the agent's output is actually wrong → fix the prompt +- Check if the test's expectations are unrealistic → fix the test +- Use `get_agent_session_memory` to see what the agent actually produced + +**TIMEOUT / STALL** (agent runs too long): +- Check `node_visit_counts` for feedback loops hitting max_node_visits +- Check L3 logs for tool calls that hang +- Reduce `max_iterations` in loop_config or fix the prompt to converge faster + +**API_ERROR** (connection, rate limit, auth): +- Verify credentials with `/hive-credentials` +- Check MCP server configuration + +--- + +### Phase 5: Recover & Resume + +After fixing the agent, decide whether to resume or re-run. + +#### When to resume from checkpoint + +Resume when ALL of these are true: +- The fix is to a node that comes AFTER existing clean checkpoints +- Clean checkpoints exist (from a CLI execution with checkpointing) +- The early nodes are expensive (web scraping, API calls, long LLM chains) + +```bash +# Resume from the last clean checkpoint before the failing node +uv run hive run exports/{agent_name} \ + --resume-session session_20260209_143022_abc12345 \ + --checkpoint cp_node_complete_research_143030 +``` + +This skips all nodes before the checkpoint and only re-runs the fixed node onward. + +#### When to re-run from scratch + +Re-run when ANY of these are true: +- The fix is to the entry node or an early node +- No checkpoints exist (e.g., agent was run via `run_tests`) +- The agent is fast (2-3 nodes, completes in seconds) +- You changed the graph structure (added/removed nodes/edges) + +```bash +uv run hive run exports/{agent_name} --input '{"query": "test topic"}' +``` + +#### Inspecting a checkpoint before resuming + +```python +get_agent_checkpoint( + agent_work_dir="~/.hive/agents/{agent_name}", + session_id="session_20260209_143022_abc12345", + checkpoint_id="cp_node_complete_research_143030" +) +``` + +Returns the full checkpoint: shared_memory snapshot, execution_path, current_node, next_node, is_clean. + +#### Loop back to Phase 2 + +After resuming or re-running, check if the fix worked. If not, go back to Phase 3. 
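One quick way to confirm the outcome is to inspect the session that the resume (or re-run) just produced, using the same session tools from Phase 3 (shown in the call notation used throughout this skill). The memory key here is a hypothetical example and depends on your agent:

```python
# Most recent session for this agent: check its status and current_node
list_agent_sessions(agent_work_dir="~/.hive/agents/{agent_name}", limit=1)

# Confirm the execution path now covers the fixed node onward
get_agent_session_state(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{new_session_id}")

# Spot-check the data the fixed node produced (key name is agent-specific)
get_agent_session_memory(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{new_session_id}", key="report")
```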
+ +--- + +### Phase 6: Final Verification + +Once the iterative fix loop converges (the agent produces correct output), run the full automated test suite: + +```python +run_tests( + goal_id="your-goal-id", + agent_path="exports/{agent_name}" +) +``` + +All tests should pass. If not, repeat the loop for remaining failures. + +--- + +## Credential Requirements **CRITICAL: Testing requires ALL credentials the agent depends on.** This includes both the LLM API key AND any tool-specific credentials (HubSpot, Brave Search, etc.). @@ -157,35 +430,31 @@ Common tool credentials: - Tests need to execute the agent's LLM nodes to validate behavior - Tools with missing credentials will return error dicts instead of real data - Mock mode bypasses everything, providing no confidence in real-world performance -- The `AgentRunner.run()` method validates credentials at startup and will fail fast if any are missing ### Mock Mode Limitations -Mock mode (`--mock` flag or `mock_mode=True`) is **ONLY for structure validation**: +Mock mode (`--mock` flag or `MOCK_MODE=1`) is **ONLY for structure validation**: -✓ Validates graph structure (nodes, edges, connections) -✓ Tests that code doesn't crash on execution -✗ Does NOT test LLM message generation -✗ Does NOT test reasoning or decision-making quality -✗ Does NOT test constraint validation (length limits, format rules) -✗ Does NOT test real API integrations or tool use -✗ Does NOT test personalization or content quality +- Validates graph structure (nodes, edges, connections) +- Validates that `AgentRunner.load()` succeeds and the agent is importable +- Does NOT execute event_loop agents — MockLLMProvider never calls `set_output`, so event_loop nodes loop forever +- Does NOT test LLM reasoning, content quality, or constraint validation +- Does NOT test real API integrations or tool use -**Bottom line:** If you're testing whether an agent achieves its goal, you MUST use real credentials for ALL services. +**Bottom line:** If you're testing whether an agent achieves its goal, you MUST use real credentials. ### Enforcing Credentials in Tests -When generating tests, **ALWAYS include credential checks for ALL required services**: +When writing tests, **ALWAYS include credential checks**: ```python import os import pytest from aden_tools.credentials import CredentialManager -# At the top of every test file pytestmark = pytest.mark.skipif( not CredentialManager().is_available("anthropic") and not os.environ.get("MOCK_MODE"), - reason="API key required for real testing. Set ANTHROPIC_API_KEY or use MOCK_MODE=1 for structure validation only." + reason="API key required for real testing. Set ANTHROPIC_API_KEY or use MOCK_MODE=1." ) @@ -195,628 +464,62 @@ def check_credentials(): creds = CredentialManager() mock_mode = os.environ.get("MOCK_MODE") - # Always check LLM key if not creds.is_available("anthropic"): if mock_mode: - print("\n⚠️ Running in MOCK MODE - structure validation only") - print(" This does NOT test LLM behavior or agent quality") - print(" Set ANTHROPIC_API_KEY for real testing\n") + print("\nRunning in MOCK MODE - structure validation only") else: pytest.fail( - "\n❌ ANTHROPIC_API_KEY not set!\n\n" - "Real testing requires an API key. Choose one:\n" - "1. Set API key (RECOMMENDED):\n" - " export ANTHROPIC_API_KEY='your-key-here'\n" - "2. Run structure validation only:\n" - " MOCK_MODE=1 pytest exports/{agent}/tests/\n\n" - "Note: Mock mode does NOT validate agent behavior or quality." 
+ "\nANTHROPIC_API_KEY not set!\n" + "Set API key: export ANTHROPIC_API_KEY='your-key-here'\n" + "Or run structure validation: MOCK_MODE=1 pytest exports/{agent}/tests/" ) - # Check tool-specific credentials (skip in mock mode) if not mock_mode: - # List the tools this agent uses - update per agent - agent_tools = [] # e.g., ["hubspot_search_contacts", "hubspot_get_contact"] + agent_tools = [] # Update per agent missing = creds.get_missing_for_tools(agent_tools) if missing: - lines = ["\n❌ Missing tool credentials!\n"] + lines = ["\nMissing tool credentials!"] for name in missing: spec = creds.specs.get(name) if spec: lines.append(f" {spec.env_var} - {spec.description}") - if spec.help_url: - lines.append(f" Setup: {spec.help_url}") - lines.append("\nSet the required environment variables and re-run.") pytest.fail("\n".join(lines)) ``` ### User Communication -When the user asks to test an agent, **ALWAYS check for ALL credentials first** — not just the LLM key: +When the user asks to test an agent, **ALWAYS check for ALL credentials first**: -1. **Identify the agent's tools** from `agent.json` or `mcp_servers.json` +1. **Identify the agent's tools** from `mcp_servers.json` 2. **Check ALL required credentials** using `CredentialManager` 3. **Ask the user to provide any missing credentials** before proceeding +4. Collect ALL missing credentials in a single prompt — not one at a time + +--- + +## Safe Test Patterns + +### OutputCleaner + +The framework automatically validates and cleans node outputs using a fast LLM at edge traversal time. Tests should still use safe patterns because OutputCleaner may not catch all issues. + +### Safe Access (REQUIRED) ```python -from aden_tools.credentials import CredentialManager, CREDENTIAL_SPECS - -creds = CredentialManager() - -# 1. Check LLM key -missing_creds = [] -if not creds.is_available("anthropic"): - missing_creds.append(("ANTHROPIC_API_KEY", "Anthropic API key for LLM calls")) - -# 2. Check tool-specific credentials -agent_tools = [...] # Determined from agent config -missing_tools = creds.get_missing_for_tools(agent_tools) -for name in missing_tools: - spec = CREDENTIAL_SPECS.get(name) - if spec: - missing_creds.append((spec.env_var, spec.description)) - -# 3. Present ALL missing credentials to the user at once -if missing_creds: - print("⚠️ Missing credentials required by this agent:\n") - for env_var, description in missing_creds: - print(f" • {env_var} — {description}") - print() - print("Please set the missing environment variables:") - for env_var, _ in missing_creds: - print(f" export {env_var}='your-value-here'") - print() - print("Or run in mock mode (structure validation only):") - print(" MOCK_MODE=1 pytest exports/{agent}/tests/") - - # Ask user to provide credentials or choose mock mode - AskUserQuestion(...) -``` - -**IMPORTANT:** Do NOT skip credential collection. If an agent uses HubSpot tools, the user MUST provide `HUBSPOT_ACCESS_TOKEN`. If it uses web search, the user MUST provide the appropriate search API key. Collect ALL missing credentials in a single prompt rather than discovering them one at a time during test failures. - -## The Three-Stage Flow - -``` -┌─────────────────────────────────────────────────────────────────────────┐ -│ GOAL STAGE │ -│ (hive-create skill) │ -│ │ -│ 1. User defines goal with success_criteria and constraints │ -│ 2. Goal written to agent.py immediately │ -│ 3. 
Generate CONSTRAINT TESTS → Write to tests/ → USER APPROVAL │ -│ Files created: exports/{agent}/tests/test_constraints.py │ -└─────────────────────────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────────────────────────┐ -│ AGENT STAGE │ -│ (hive-create skill) │ -│ │ -│ Build nodes + edges, written immediately to files │ -│ Constraint tests can run during development: │ -│ run_tests(goal_id, agent_path, test_types='["constraint"]') │ -└─────────────────────────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────────────────────────┐ -│ EVAL STAGE (this skill) │ -│ │ -│ 1. Generate SUCCESS_CRITERIA TESTS → Write to tests/ → USER APPROVAL │ -│ Files created: exports/{agent}/tests/test_success_criteria.py │ -│ 2. Run all tests: run_tests(goal_id, agent_path) │ -│ 3. On failure → debug_test(goal_id, test_name, agent_path) │ -│ 4. Iterate: Edit agent code → Re-run run_tests (instant feedback) │ -└─────────────────────────────────────────────────────────────────────────┘ -``` - -## Step-by-Step: Testing an Agent - -### Step 1: Check Existing Tests - -**ALWAYS check first** before generating new tests: - -```python -mcp__agent-builder__list_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent" -) -``` - -This shows what test files already exist. If tests exist: -- Review the list to see what's covered -- Ask user if they want to add more or run existing tests - -### Step 2: Get Constraint Test Guidelines (Goal Stage) - -After goal is defined, get test guidelines using the MCP tool: - -```python -# First, read the goal from agent.py to get the goal JSON -goal_code = Read(file_path="exports/your_agent/agent.py") -# Extract the goal definition and convert to JSON - -# Get constraint test guidelines via MCP tool -result = mcp__agent-builder__generate_constraint_tests( - goal_id="your-goal-id", - goal_json='{"id": "goal-id", "name": "...", "constraints": [...]}', - agent_path="exports/your_agent" -) -``` - -**Response includes:** -- `output_file`: Where to write tests (e.g., `exports/your_agent/tests/test_constraints.py`) -- `file_header`: Imports, fixtures, and pytest setup to use at the top of the file -- `test_template`: Format for test functions -- `constraints_formatted`: The constraints to test -- `test_guidelines`: Rules and best practices for writing tests -- `instruction`: How to proceed - -**Write tests directly** using the provided guidelines: - -```python -# Write tests using the Write tool -Write( - file_path=result["output_file"], - content=result["file_header"] + "\n\n" + your_test_code -) -``` - -### Step 3: Get Success Criteria Test Guidelines (Eval Stage) - -After agent is fully built, get success criteria test guidelines: - -```python -# Get success criteria test guidelines via MCP tool -result = mcp__agent-builder__generate_success_tests( - goal_id="your-goal-id", - goal_json='{"id": "goal-id", "name": "...", "success_criteria": [...]}', - node_names="analyze_request,search_web,format_results", - tool_names="web_search,web_scrape", - agent_path="exports/your_agent" -) -``` - -**Write tests directly** using the provided guidelines: - -```python -# Write tests using the Write tool -Write( - file_path=result["output_file"], - content=result["file_header"] + "\n\n" + your_test_code -) -``` - -### Step 4: Test Fixtures (conftest.py) - -The `file_header` returned by the MCP tools includes proper imports and fixtures. 
-You should also create a conftest.py file in the tests directory with shared fixtures: - -```python -# Create conftest.py with the conftest template -Write( - file_path="exports/your_agent/tests/conftest.py", - content=conftest_content # Use PYTEST_CONFTEST_TEMPLATE format -) -``` - -### Step 5: Run Tests - -**Use the MCP tool to run tests** (not pytest directly): - -```python -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent" -) - -**Response includes structured results:** -```json -{ - "goal_id": "your-goal-id", - "overall_passed": false, - "summary": { - "total": 12, - "passed": 10, - "failed": 2, - "skipped": 0, - "errors": 0, - "pass_rate": "83.3%" - }, - "test_results": [ - {"file": "test_constraints.py", "test_name": "test_constraint_api_rate_limits", "status": "passed"}, - {"file": "test_success_criteria.py", "test_name": "test_success_find_relevant_results", "status": "failed"} - ], - "failures": [ - {"test_name": "test_success_find_relevant_results", "details": "AssertionError: Expected 3-5 results..."} - ] -} -``` - -**Options for `run_tests`:** -```python -# Run only constraint tests -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent", - test_types='["constraint"]' -) - -# Run with parallel workers -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent", - parallel=4 -) - -# Stop on first failure -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent", - fail_fast=True -) -``` - -### Step 6: Debug Failed Tests - -**Use the MCP tool to debug** (not Bash/pytest directly): - -```python -mcp__agent-builder__debug_test( - goal_id="your-goal-id", - test_name="test_success_find_relevant_results", - agent_path="exports/your_agent" -) -``` - -**Response includes:** -- Full verbose output from the test -- Stack trace with exact line numbers -- Captured logs and prints -- Suggestions for fixing the issue - -### Step 7: Categorize Errors - -When a test fails, categorize the error to guide iteration: - -```python -def categorize_test_failure(test_output, agent_code): - """Categorize test failure to guide iteration.""" - - # Read test output and agent code - failure_info = { - "test_name": "...", - "error_message": "...", - "stack_trace": "...", - } - - # Pattern-based categorization - if any(pattern in failure_info["error_message"].lower() for pattern in [ - "typeerror", "attributeerror", "keyerror", "valueerror", - "null", "none", "undefined", "tool call failed" - ]): - category = "IMPLEMENTATION_ERROR" - guidance = { - "stage": "Agent", - "action": "Fix the bug in agent code", - "files_to_edit": ["agent.py", "nodes/__init__.py"], - "restart_required": False, - "description": "Code bug - fix and re-run tests" - } - - elif any(pattern in failure_info["error_message"].lower() for pattern in [ - "assertion", "expected", "got", "should be", "success criteria" - ]): - category = "LOGIC_ERROR" - guidance = { - "stage": "Goal", - "action": "Update goal definition", - "files_to_edit": ["agent.py (goal section)"], - "restart_required": True, - "description": "Goal definition is wrong - update and rebuild" - } - - elif any(pattern in failure_info["error_message"].lower() for pattern in [ - "timeout", "rate limit", "empty", "boundary", "edge case" - ]): - category = "EDGE_CASE" - guidance = { - "stage": "Eval", - "action": "Add edge case test and fix handling", - "files_to_edit": ["agent.py", "tests/test_edge_cases.py"], - "restart_required": False, 
- "description": "New scenario - add test and handle it" - } - - else: - category = "UNKNOWN" - guidance = { - "stage": "Unknown", - "action": "Manual investigation required", - "restart_required": False - } - - return { - "category": category, - "guidance": guidance, - "failure_info": failure_info - } -``` - -**Show categorization to user:** - -```python -AskUserQuestion( - questions=[{ - "question": f"Test failed with {category}. How would you like to proceed?", - "header": "Test Failure", - "options": [ - { - "label": "Fix code directly (Recommended)" if category == "IMPLEMENTATION_ERROR" else "Update goal", - "description": guidance["description"] - }, - { - "label": "Show detailed error info", - "description": "View full stack trace and logs" - }, - { - "label": "Skip for now", - "description": "Continue with other tests" - } - ], - "multiSelect": false - }] -) -``` - -### Step 8: Iterate Based on Error Category - -#### IMPLEMENTATION_ERROR → Fix Agent Code - -```python -# 1. Show user the exact file and line that failed -print(f"Error in: exports/{agent_name}/nodes/__init__.py:42") -print(f"Issue: 'NoneType' object has no attribute 'get'") - -# 2. Read the problematic code -code = Read(file_path=f"exports/{agent_name}/nodes/__init__.py") - -# 3. User can fix directly, or you suggest a fix: -Edit( - file_path=f"exports/{agent_name}/nodes/__init__.py", - old_string="if results.get('videos'):", - new_string="if results and results.get('videos'):" -) - -# 4. Re-run tests immediately (instant feedback!) -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path=f"exports/{agent_name}" -) -``` - -#### LOGIC_ERROR → Update Goal - -```python -# 1. Show user the goal definition -goal_code = Read(file_path=f"exports/{agent_name}/agent.py") - -# 2. Discuss what needs to change in success_criteria or constraints - -# 3. Edit the goal -Edit( - file_path=f"exports/{agent_name}/agent.py", - old_string='target="3-5 videos"', - new_string='target="1-5 videos"' # More realistic -) - -# 4. May need to regenerate agent nodes if goal changed significantly -# This requires going back to hive-create skill -``` - -#### EDGE_CASE → Add Test and Fix - -```python -# 1. Create new edge case test with API key enforcement -edge_case_test = ''' -@pytest.mark.asyncio -async def test_edge_case_empty_results(mock_mode): - """Test: Agent handles no results gracefully""" - result = await default_agent.run({{"query": "xyzabc123nonsense"}}, mock_mode=mock_mode) - - # Should succeed with empty results, not crash - assert result.success or result.error is not None - if result.success: - assert result.output.get("message") == "No results found" -''' - -# 2. Add to test file -Edit( - file_path=f"exports/{agent_name}/tests/test_edge_cases.py", - old_string="# Add edge case tests here", - new_string=edge_case_test -) - -# 3. Fix agent to handle edge case -# Edit agent code to handle empty results - -# 4. Re-run tests -``` - -## Test File Templates (Reference Only) - -**⚠️ Do NOT copy-paste these templates directly.** Use `generate_constraint_tests` and `generate_success_tests` MCP tools to create properly structured tests with correct imports and fixtures. - -These templates show the structure of generated tests for reference only. - -### Constraint Test Template - -```python -"""Constraint tests for {agent_name}. - -These tests validate that the agent respects its defined constraints. -Requires ANTHROPIC_API_KEY for real testing. 
-""" - -import os -import pytest -from exports.{agent_name} import default_agent -from aden_tools.credentials import CredentialManager - - -# Enforce API key for real testing -pytestmark = pytest.mark.skipif( - not CredentialManager().is_available("anthropic") and not os.environ.get("MOCK_MODE"), - reason="API key required. Set ANTHROPIC_API_KEY or use MOCK_MODE=1." -) - - -@pytest.mark.asyncio -async def test_constraint_{constraint_id}(): - """Test: {constraint_description}""" - # Test implementation based on constraint type - mock_mode = bool(os.environ.get("MOCK_MODE")) - result = await default_agent.run({{"test": "input"}}, mock_mode=mock_mode) - - # Assert constraint is respected - assert True # Replace with actual check -``` - -### Success Criteria Test Template - -```python -"""Success criteria tests for {agent_name}. - -These tests validate that the agent achieves its defined success criteria. -Requires ANTHROPIC_API_KEY for real testing - mock mode cannot validate success criteria. -""" - -import os -import pytest -from exports.{agent_name} import default_agent -from aden_tools.credentials import CredentialManager - - -# Enforce API key for real testing -pytestmark = pytest.mark.skipif( - not CredentialManager().is_available("anthropic") and not os.environ.get("MOCK_MODE"), - reason="API key required. Set ANTHROPIC_API_KEY or use MOCK_MODE=1." -) - - -@pytest.mark.asyncio -async def test_success_{criteria_id}(): - """Test: {criteria_description}""" - mock_mode = bool(os.environ.get("MOCK_MODE")) - result = await default_agent.run({{"test": "input"}}, mock_mode=mock_mode) - - assert result.success, f"Agent failed: {{result.error}}" - - # Verify success criterion met - # e.g., assert metric meets target - assert True # Replace with actual check -``` - -### Edge Case Test Template - -```python -"""Edge case tests for {agent_name}. - -These tests validate agent behavior in unusual or boundary conditions. -Requires ANTHROPIC_API_KEY for real testing. -""" - -import os -import pytest -from exports.{agent_name} import default_agent -from aden_tools.credentials import CredentialManager - - -# Enforce API key for real testing -pytestmark = pytest.mark.skipif( - not CredentialManager().is_available("anthropic") and not os.environ.get("MOCK_MODE"), - reason="API key required. Set ANTHROPIC_API_KEY or use MOCK_MODE=1." -) - - -@pytest.mark.asyncio -async def test_edge_case_{scenario_name}(): - """Test: Agent handles {scenario_description}""" - mock_mode = bool(os.environ.get("MOCK_MODE")) - result = await default_agent.run({{"edge": "case_input"}}, mock_mode=mock_mode) - - # Verify graceful handling - assert result.success or result.error is not None -``` - -## Interactive Build + Test Loop - -During agent construction (Agent stage), you can run constraint tests incrementally: - -```python -# After adding first node -print("Added search_node. Running relevant constraint tests...") -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path=f"exports/{agent_name}", - test_types='["constraint"]' -) - -# After adding second node -print("Added filter_node. Running all constraint tests...") -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path=f"exports/{agent_name}", - test_types='["constraint"]' -) -``` - -This provides **immediate feedback** during development, catching issues early. - -## Common Test Patterns - -**Note:** All test patterns should include API key enforcement via conftest.py. 
- -### ⚠️ CRITICAL: Framework Features You Must Know - -#### OutputCleaner - Automatic I/O Cleaning (NEW!) - -**The framework now automatically validates and cleans node outputs** using a fast LLM (Cerebras llama-3.3-70b) at edge traversal time. This prevents cascading failures from malformed output. - -**What OutputCleaner does**: -- ✅ Validates output matches next node's input schema -- ✅ Detects JSON parsing trap (entire response in one key) -- ✅ Cleans malformed output automatically (~200-500ms, ~$0.001 per cleaning) -- ✅ Boosts success rates by 1.8-2.2x - -**Impact on tests**: Tests should still use safe patterns because OutputCleaner may not catch all issues in test mode. - -#### Safe Test Patterns (REQUIRED) - -**❌ UNSAFE** (will cause test failures): -```python -# Direct key access - can crash! -approval_decision = result.output["approval_decision"] -assert approval_decision == "APPROVED" - -# Nested access without checks +# UNSAFE - will crash on missing keys +approval = result.output["approval_decision"] category = result.output["analysis"]["category"] -# Assuming parsed JSON structure -for issue in result.output["compliance_issues"]: - ... -``` - -**✅ SAFE** (correct patterns): -```python -# 1. Safe dict access with .get() +# SAFE - use .get() with defaults output = result.output or {} -approval_decision = output.get("approval_decision", "UNKNOWN") -assert "APPROVED" in approval_decision or approval_decision == "APPROVED" +approval = output.get("approval_decision", "UNKNOWN") -# 2. Type checking before operations +# SAFE - type check before operations analysis = output.get("analysis", {}) if isinstance(analysis, dict): category = analysis.get("category", "unknown") -# 3. Parse JSON from strings (the JSON parsing trap!) +# SAFE - handle JSON parsing trap (LLM response as string) import json recommendation = output.get("recommendation", "{}") if isinstance(recommendation, str): @@ -829,16 +532,15 @@ if isinstance(recommendation, str): elif isinstance(recommendation, dict): approval = recommendation.get("approval_decision", "UNKNOWN") -# 4. Safe iteration with type check -compliance_issues = output.get("compliance_issues", []) -if isinstance(compliance_issues, list): - for issue in compliance_issues: +# SAFE - type check before iteration +items = output.get("items", []) +if isinstance(items, list): + for item in items: ... 
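# SAFE - coerce numeric fields that may arrive as strings
# ("confidence" is a hypothetical key; LLMs often emit numbers as text)
raw_confidence = output.get("confidence", 0)
try:
    confidence = float(raw_confidence)
except (TypeError, ValueError):
    confidence = 0.0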
``` -#### Helper Functions for Safe Access +### Helper Functions for conftest.py -**Add to conftest.py**: ```python import json import re @@ -846,9 +548,7 @@ import re def _parse_json_from_output(result, key): """Parse JSON from agent output (framework may store full LLM response as string).""" response_text = result.output.get(key, "") - # Remove markdown code blocks if present json_text = re.sub(r'```json\s*|\s*```', '', response_text).strip() - try: return json.loads(json_text) except (json.JSONDecodeError, AttributeError, TypeError): @@ -858,7 +558,6 @@ def safe_get_nested(result, key_path, default=None): """Safely get nested value from result.output.""" output = result.output or {} current = output - for key in key_path: if isinstance(current, dict): current = current.get(key) @@ -874,7 +573,6 @@ def safe_get_nested(result, key_path, default=None): return default else: return default - return current if current is not None else default # Make available in tests @@ -882,313 +580,361 @@ pytest.parse_json_from_output = _parse_json_from_output pytest.safe_get_nested = safe_get_nested ``` -**Usage in tests**: -```python -# Use helper to parse JSON safely -parsed = pytest.parse_json_from_output(result, "recommendation") -if isinstance(parsed, dict): - approval = parsed.get("approval_decision", "UNKNOWN") - -# Safe nested access -risk_score = pytest.safe_get_nested(result, ["analysis", "risk_score"], default=0.0) -``` - -#### Test Count Guidance - -**Generate 8-15 tests total, NOT 30+** - -- ✅ 2-3 tests per success criterion -- ✅ 1 happy path test -- ✅ 1 boundary/edge case test -- ✅ 1 error handling test (optional) - -**Why fewer tests?**: -- Each test requires real LLM call (~3 seconds, costs money) -- 30 tests = 90 seconds, $0.30+ in costs -- 12 tests = 36 seconds, $0.12 in costs -- Focus on quality over quantity - -#### ExecutionResult Fields (Important!) +### ExecutionResult Fields **`result.success=True` means NO exception, NOT goal achieved** ```python -# ❌ WRONG - assumes goal achieved +# WRONG assert result.success -# ✅ RIGHT - check success AND output +# RIGHT assert result.success, f"Agent failed: {result.error}" output = result.output or {} approval = output.get("approval_decision") assert approval == "APPROVED", f"Expected APPROVED, got {approval}" ``` -**All ExecutionResult fields**: -- `success: bool` - Execution completed without exception (NOT goal achieved!) -- `output: dict` - Complete memory snapshot (may contain raw strings) -- `error: str | None` - Error message if failed -- `steps_executed: int` - Number of nodes executed -- `total_tokens: int` - Cumulative token usage -- `total_latency_ms: int` - Total execution time -- `path: list[str]` - Node IDs traversed (may contain repeated IDs from feedback loops) -- `paused_at: str | None` - Node ID if HITL pause occurred -- `session_state: dict` - State for resuming -- `node_visit_counts: dict[str, int]` - How many times each node executed (useful for feedback loop testing) +All fields: +- `success: bool` — Completed without exception (NOT goal achieved!) 
+- `output: dict` — Complete memory snapshot (may contain raw strings) +- `error: str | None` — Error message if failed +- `steps_executed: int` — Number of nodes executed +- `total_tokens: int` — Cumulative token usage +- `total_latency_ms: int` — Total execution time +- `path: list[str]` — Node IDs traversed (may repeat in feedback loops) +- `paused_at: str | None` — Node ID if paused +- `session_state: dict` — State for resuming +- `node_visit_counts: dict[str, int]` — Visit counts per node (feedback loop testing) +- `execution_quality: str` — "clean", "degraded", or "failed" -### Happy Path Test +### Test Count Guidance + +**Write 8-15 tests, not 30+** + +- 2-3 tests per success criterion +- 1 happy path test +- 1 boundary/edge case test +- 1 error handling test (optional) + +Each real test costs ~3 seconds + LLM tokens. 12 tests = ~36 seconds, $0.12. + +--- + +## Test Patterns + +### Happy Path ```python @pytest.mark.asyncio -async def test_happy_path(mock_mode): - """Test normal successful execution""" - result = await default_agent.run({{"query": "python tutorials"}}, mock_mode=mock_mode) - assert result.success - assert len(result.output) > 0 +async def test_happy_path(runner, auto_responder, mock_mode): + """Test normal successful execution.""" + await auto_responder.start() + try: + result = await runner.run({"query": "python tutorials"}) + finally: + await auto_responder.stop() + assert result.success, f"Agent failed: {result.error}" + output = result.output or {} + assert output.get("report"), "No report produced" ``` -### Boundary Condition Test +### Boundary Condition ```python @pytest.mark.asyncio -async def test_boundary_minimum(mock_mode): - """Test at minimum threshold""" - result = await default_agent.run({{"query": "very specific niche topic"}}, mock_mode=mock_mode) - assert result.success - assert len(result.output.get("results", [])) >= 1 +async def test_minimum_sources(runner, auto_responder, mock_mode): + """Test at minimum source threshold.""" + await auto_responder.start() + try: + result = await runner.run({"query": "niche topic"}) + finally: + await auto_responder.stop() + assert result.success, f"Agent failed: {result.error}" + output = result.output or {} + sources = output.get("sources", []) + if isinstance(sources, list): + assert len(sources) >= 3, f"Expected >= 3 sources, got {len(sources)}" ``` -### Error Handling Test +### Error Handling ```python @pytest.mark.asyncio -async def test_error_handling(mock_mode): - """Test graceful error handling""" - result = await default_agent.run({{"query": ""}}, mock_mode=mock_mode) # Invalid input - assert not result.success or result.output.get("error") is not None +async def test_empty_input(runner, auto_responder, mock_mode): + """Test graceful handling of empty input.""" + await auto_responder.start() + try: + result = await runner.run({"query": ""}) + finally: + await auto_responder.stop() + # Agent should either fail gracefully or produce an error message + output = result.output or {} + assert not result.success or output.get("error"), "Should handle empty input" ``` -### Performance Test +### Feedback Loop ```python @pytest.mark.asyncio -async def test_performance_latency(mock_mode): - """Test response time is acceptable""" - import time - start = time.time() - result = await default_agent.run({{"query": "test"}}, mock_mode=mock_mode) - duration = time.time() - start - assert duration < 5.0, f"Took {{duration}}s, expected <5s" -``` - -### Testing Event Loop Nodes - -Event loop nodes run multi-turn loops 
internally. Tests should verify: - -**Output Keys Test** — All required keys are set via `set_output`: -```python -@pytest.mark.asyncio -async def test_all_output_keys_set(mock_mode): - """Test that event_loop nodes set all required output keys.""" - result = await default_agent.run({{"query": "test"}}, mock_mode=mock_mode) - assert result.success, f"Agent failed: {{result.error}}" - output = result.output or {{}} - for key in ["expected_key_1", "expected_key_2"]: - assert key in output, f"Output key '{{key}}' not set by event_loop node" -``` - -**Feedback Loop Test** — Verify feedback loops terminate: -```python -@pytest.mark.asyncio -async def test_feedback_loop_respects_max_visits(mock_mode): - """Test that feedback loops terminate at max_node_visits.""" - result = await default_agent.run({{"input": "trigger_rejection"}}, mock_mode=mock_mode) - assert result.success or result.error is not None - visits = getattr(result, "node_visit_counts", {{}}) or {{}} +async def test_feedback_loop_terminates(runner, auto_responder, mock_mode): + """Test that feedback loops don't run forever.""" + await auto_responder.start() + try: + result = await runner.run({"query": "test"}) + finally: + await auto_responder.stop() + visits = result.node_visit_counts or {} for node_id, count in visits.items(): - assert count <= 5, f"Node {{node_id}} visited {{count}} times" -``` - -**Fan-Out Test** — Verify parallel branches both complete: -```python -@pytest.mark.asyncio -async def test_parallel_branches_complete(mock_mode): - """Test that fan-out branches all complete and produce outputs.""" - result = await default_agent.run({{"query": "test"}}, mock_mode=mock_mode) - assert result.success - output = result.output or {{}} - # Check outputs from both parallel branches - assert "branch_a_output" in output, "Branch A output missing" - assert "branch_b_output" in output, "Branch B output missing" -``` - -**Client-Facing Node Test** — In mock mode, client-facing nodes may not block: -```python -@pytest.mark.asyncio -async def test_client_facing_node(mock_mode): - """Test that client-facing nodes produce output.""" - result = await default_agent.run({{"query": "test"}}, mock_mode=mock_mode) - # In mock mode, client-facing blocking is typically bypassed - assert result.success or result.paused_at is not None -``` - -## Integration with hive-create - -### Handoff Points - -| Scenario | From | To | Action | -|----------|------|-----|--------| -| Agent built, ready to test | hive-create | hive-test | Generate success tests | -| LOGIC_ERROR found | hive-test | hive-create | Update goal, rebuild | -| IMPLEMENTATION_ERROR found | hive-test | Direct fix | Edit agent files, re-run tests | -| EDGE_CASE found | hive-test | hive-test | Add edge case test | -| All tests pass | hive-test | Done | Agent validated ✅ | - -### Iteration Speed Comparison - -| Scenario | Old Approach | New Approach | -|----------|--------------|--------------| -| **Bug Fix** | Rebuild via MCP tools (14 min) | Edit Python file, pytest (2 min) | -| **Add Test** | Generate via MCP, export (5 min) | Write test file directly (1 min) | -| **Debug** | Read subprocess logs | pdb, breakpoints, prints | -| **Inspect** | Limited visibility | Full Python introspection | - -## Anti-Patterns - -### Testing Best Practices - -| Don't | Do Instead | -|-------|------------| -| ❌ Write tests without getting guidelines first | ✅ Use `generate_*_tests` to get proper file_header and guidelines | -| ❌ Run pytest via Bash | ✅ Use `run_tests` MCP tool for structured results | 
-| ❌ Debug tests with Bash pytest -vvs | ✅ Use `debug_test` MCP tool for formatted output | -| ❌ Check for tests with Glob | ✅ Use `list_tests` MCP tool | -| ❌ Skip the file_header from guidelines | ✅ Always include the file_header for proper imports and fixtures | - -### General Testing - -| Don't | Do Instead | -|-------|------------| -| ❌ Treat all failures the same | ✅ Use debug_test to categorize and iterate appropriately | -| ❌ Rebuild entire agent for small bugs | ✅ Edit code directly, re-run tests | -| ❌ Run tests without API key | ✅ Always set ANTHROPIC_API_KEY first | -| ❌ Write tests without understanding the constraints/criteria | ✅ Read the formatted constraints/criteria from guidelines | - -## Workflow Summary - -``` -1. Check existing tests: list_tests(goal_id, agent_path) - → Scans exports/{agent}/tests/test_*.py - ↓ -2. Get test guidelines: generate_constraint_tests, generate_success_tests - → Returns file_header, test_template, constraints/criteria, guidelines - ↓ -3. Write tests: Use Write tool with the provided guidelines - → Write tests to exports/{agent}/tests/test_*.py - ↓ -4. Run tests: run_tests(goal_id, agent_path) - → Executes: pytest exports/{agent}/tests/ -v - ↓ -5. Debug failures: debug_test(goal_id, test_name, agent_path) - → Re-runs single test with verbose output - ↓ -6. Fix based on category: - - IMPLEMENTATION_ERROR → Edit agent code directly - - ASSERTION_FAILURE → Fix agent logic or update test - - IMPORT_ERROR → Check package structure - - API_ERROR → Check API keys and connectivity - ↓ -7. Re-run tests: run_tests(goal_id, agent_path) - ↓ -8. Repeat until all pass ✅ -``` - -## MCP Tools Reference - -```python -# Check existing tests (scans Python test files) -mcp__agent-builder__list_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent" -) - -# Get constraint test guidelines (returns templates and guidelines, NOT generated tests) -mcp__agent-builder__generate_constraint_tests( - goal_id="your-goal-id", - goal_json='{"id": "...", "constraints": [...]}', - agent_path="exports/your_agent" -) -# Returns: output_file, file_header, test_template, constraints_formatted, test_guidelines - -# Get success criteria test guidelines -mcp__agent-builder__generate_success_tests( - goal_id="your-goal-id", - goal_json='{"id": "...", "success_criteria": [...]}', - node_names="node1,node2", - tool_names="tool1,tool2", - agent_path="exports/your_agent" -) -# Returns: output_file, file_header, test_template, success_criteria_formatted, test_guidelines - -# Run tests via pytest subprocess -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent" -) - -# Debug a failed test (re-runs with verbose output) -mcp__agent-builder__debug_test( - goal_id="your-goal-id", - test_name="test_constraint_foo", - agent_path="exports/your_agent" -) -``` - -## run_tests Options - -```python -# Run only constraint tests -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent", - test_types='["constraint"]' -) - -# Run only success criteria tests -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent", - test_types='["success"]' -) - -# Run with pytest-xdist parallelism (requires pytest-xdist) -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent", - parallel=4 -) - -# Stop on first failure -mcp__agent-builder__run_tests( - goal_id="your-goal-id", - agent_path="exports/your_agent", - fail_fast=True -) -``` - -## Direct pytest Commands - -You 
can also run tests directly with pytest (the MCP tools use pytest internally): - -```bash -# Run all tests -pytest exports/your_agent/tests/ -v - -# Run specific test file -pytest exports/your_agent/tests/test_constraints.py -v - -# Run specific test -pytest exports/your_agent/tests/test_constraints.py::test_constraint_foo -vvs - -# Run in mock mode (structure validation only) -MOCK_MODE=1 pytest exports/your_agent/tests/ -v + assert count <= 5, f"Node {node_id} visited {count} times — possible infinite loop" ``` --- -**MCP tools generate tests, write them to Python files, and run them via pytest.** +## MCP Tool Reference + +### Phase 1: Test Generation + +```python +# Check existing tests +list_tests(goal_id, agent_path) + +# Get constraint test guidelines (returns templates, NOT generated tests) +generate_constraint_tests(goal_id, goal_json, agent_path) +# Returns: output_file, file_header, test_template, constraints_formatted, test_guidelines + +# Get success criteria test guidelines +generate_success_tests(goal_id, goal_json, node_names, tool_names, agent_path) +# Returns: output_file, file_header, test_template, success_criteria_formatted, test_guidelines +``` + +### Phase 2: Execution + +```python +# Automated regression (no checkpoints, fresh runs) +run_tests(goal_id, agent_path, test_types='["all"]', parallel=-1, fail_fast=False) + +# Run only specific test types +run_tests(goal_id, agent_path, test_types='["constraint"]') +run_tests(goal_id, agent_path, test_types='["success"]') +``` + +```bash +# Iterative debugging with checkpoints (via CLI) +uv run hive run exports/{agent_name} --input '{"query": "test"}' +``` + +### Phase 3: Analysis + +```python +# Debug a specific failed test +debug_test(goal_id, test_name, agent_path) + +# Find failed sessions +list_agent_sessions(agent_work_dir, status="failed", limit=5) + +# Inspect session state (excludes memory values) +get_agent_session_state(agent_work_dir, session_id) + +# Inspect memory data +get_agent_session_memory(agent_work_dir, session_id, key="research_results") + +# Runtime logs: L1 summaries +query_runtime_logs(agent_work_dir, status="needs_attention") + +# Runtime logs: L2 per-node details +query_runtime_log_details(agent_work_dir, run_id, needs_attention_only=True) + +# Runtime logs: L3 tool/LLM raw data +query_runtime_log_raw(agent_work_dir, run_id, node_id="research") + +# Find clean checkpoints +list_agent_checkpoints(agent_work_dir, session_id, is_clean="true") + +# Compare checkpoints (memory diff) +compare_agent_checkpoints(agent_work_dir, session_id, cp_before, cp_after) +``` + +### Phase 5: Recovery + +```python +# Inspect checkpoint before resuming +get_agent_checkpoint(agent_work_dir, session_id, checkpoint_id) +# Empty checkpoint_id = latest checkpoint +``` + +```bash +# Resume from checkpoint via CLI (headless) +uv run hive run exports/{agent_name} \ + --resume-session {session_id} --checkpoint {checkpoint_id} +``` + +--- + +## Anti-Patterns + +| Don't | Do Instead | +|-------|-----------| +| Use `default_agent.run()` in tests | Use `runner.run()` with `auto_responder` fixtures (goes through AgentRuntime) | +| Re-run entire agent when a late node fails | Resume from last clean checkpoint | +| Treat `result.success` as goal achieved | Check `result.output` for actual criteria | +| Access `result.output["key"]` directly | Use `result.output.get("key")` | +| Fix random things hoping tests pass | Analyze L2/L3 logs to find root cause first | +| Write 30+ tests | Write 8-15 focused tests | +| Skip credential check | 
Use `/hive-credentials` before testing | +| Confuse `exports/` with `~/.hive/agents/` | Code in `exports/`, runtime data in `~/.hive/` | +| Use `run_tests` for iterative debugging | Use headless CLI with checkpoints for iterative debugging | +| Use headless CLI for final regression | Use `run_tests` for automated regression | +| Use `--tui` from Claude Code | Use headless `run` command — TUI hangs in non-interactive shells | +| Test client-facing nodes from Claude Code | Use mock mode, or have the user run the agent in their terminal | +| Run tests without reading goal first | Always understand the goal before writing tests | +| Skip Phase 3 analysis and guess | Use session + log tools to identify root cause | + +--- + +## Example Walkthrough: Deep Research Agent + +A complete iteration showing the test loop for an agent with nodes: `intake → research → review → report`. + +### Phase 1: Generate tests + +```python +# Read the goal +Read(file_path="exports/deep_research_agent/agent.py") + +# Get success criteria test guidelines +result = generate_success_tests( + goal_id="rigorous-interactive-research", + goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "target": ">=5"}, {"id": "citation-coverage", "target": "100%"}, {"id": "report-completeness", "target": "90%"}]}', + node_names="intake,research,review,report", + tool_names="web_search,web_scrape", + agent_path="exports/deep_research_agent" +) + +# Write tests +Write( + file_path=result["output_file"], + content=result["file_header"] + "\n\n" + test_code +) +``` + +### Phase 2: First execution + +```python +run_tests( + goal_id="rigorous-interactive-research", + agent_path="exports/deep_research_agent", + fail_fast=True +) +``` + +Result: `test_success_source_diversity` fails — agent only found 2 sources instead of 5. + +### Phase 3: Analyze + +```python +# Debug the failing test +debug_test( + goal_id="rigorous-interactive-research", + test_name="test_success_source_diversity", + agent_path="exports/deep_research_agent" +) +# → ASSERTION_FAILURE: Expected >= 5 sources, got 2 + +# Find the session +list_agent_sessions( + agent_work_dir="~/.hive/agents/deep_research_agent", + status="completed", + limit=1 +) +# → session_20260209_150000_abc12345 + +# See what the research node produced +get_agent_session_memory( + agent_work_dir="~/.hive/agents/deep_research_agent", + session_id="session_20260209_150000_abc12345", + key="research_results" +) +# → Only 2 web_search calls made, each returned 1 source + +# Check the LLM's behavior in the research node +query_runtime_log_raw( + agent_work_dir="~/.hive/agents/deep_research_agent", + run_id="session_20260209_150000_abc12345", + node_id="research" +) +# → LLM called web_search only twice, then called set_output +``` + +Root cause: The research node's prompt doesn't tell the LLM to search for at least 5 diverse sources. It stops after the first couple of searches. + +### Phase 4: Fix the prompt + +```python +Read(file_path="exports/deep_research_agent/nodes/__init__.py") + +Edit( + file_path="exports/deep_research_agent/nodes/__init__.py", + old_string='system_prompt="Search for information on the user\'s topic."', + new_string='system_prompt="Search for information on the user\'s topic. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries to ensure source diversity. 
Do not stop searching until you have at least 5 distinct sources."' +) +``` + +### Phase 5: Resume from checkpoint + +For this example, the fix is to the `research` node. If we had run via CLI with checkpointing, we could resume from the checkpoint after `intake` to skip re-running intake: + +```bash +# Check if clean checkpoint exists after intake +list_agent_checkpoints( + agent_work_dir="~/.hive/agents/deep_research_agent", + session_id="session_20260209_150000_abc12345", + is_clean="true" +) +# → cp_node_complete_intake_150005 + +# Resume from after intake, re-run research with fixed prompt +uv run hive run exports/deep_research_agent \ + --resume-session session_20260209_150000_abc12345 \ + --checkpoint cp_node_complete_intake_150005 +``` + +Or for this simple case (intake is fast), just re-run: + +```bash +uv run hive run exports/deep_research_agent --input '{"topic": "test"}' +``` + +### Phase 6: Final verification + +```python +run_tests( + goal_id="rigorous-interactive-research", + agent_path="exports/deep_research_agent" +) +# → All 12 tests pass +``` + +--- + +## Test File Structure + +``` +exports/{agent_name}/ +├── agent.py ← Agent to test (goal, nodes, edges) +├── nodes/__init__.py ← Node implementations (prompts, config) +├── config.py ← Agent configuration +├── mcp_servers.json ← Tool server config +└── tests/ + ├── conftest.py ← Shared fixtures + safe access helpers + ├── test_constraints.py ← Constraint tests + ├── test_success_criteria.py ← Success criteria tests + └── test_edge_cases.py ← Edge case tests +``` + +## Integration with Other Skills + +| Scenario | From | To | Action | +|----------|------|----|--------| +| Agent built, ready to test | `/hive-create` | `/hive-test` | Generate tests, start loop | +| Prompt fix needed | `/hive-test` Phase 4 | Direct edit | Edit `nodes/__init__.py`, resume | +| Goal definition wrong | `/hive-test` Phase 4 | `/hive-create` | Update goal, may need rebuild | +| Missing credentials | `/hive-test` Phase 3 | `/hive-credentials` | Set up credentials | +| Complex runtime failure | `/hive-test` Phase 3 | `/hive-debugger` | Deep L1/L2/L3 analysis | +| All tests pass | `/hive-test` Phase 6 | Done | Agent validated | diff --git a/.claude/skills/hive-test/examples/testing-youtube-agent.md b/.claude/skills/hive-test/examples/testing-youtube-agent.md index 9d1f2d0c..92ed7bc2 100644 --- a/.claude/skills/hive-test/examples/testing-youtube-agent.md +++ b/.claude/skills/hive-test/examples/testing-youtube-agent.md @@ -1,351 +1,333 @@ -# Example: Testing a YouTube Research Agent +# Example: Iterative Testing of a Research Agent -This example walks through testing a YouTube research agent that finds relevant videos based on a topic. +This example walks through the full iterative test loop for a research agent that searches the web, reviews findings, and produces a cited report. 
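Before starting the loop, it helps to confirm the environment the tests expect (per the testing rules above: an API key for real runs, or MOCK_MODE for structure-only runs). A minimal pre-flight sketch, assuming only the standard environment variables already referenced in this skill:

```python
import os

# Real (non-mock) test runs need an Anthropic API key; MOCK_MODE=1 skips live calls.
if not os.environ.get("ANTHROPIC_API_KEY") and not os.environ.get("MOCK_MODE"):
    raise SystemExit("Set ANTHROPIC_API_KEY (or MOCK_MODE=1) before running the test loop")
```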
-## Prerequisites +## Agent Structure -- Agent built with hive-create skill at `exports/youtube-research/` -- Goal defined with success criteria and constraints - -## Step 1: Load the Goal - -First, load the goal that was defined during the Goal stage: - -```json -{ - "id": "youtube-research", - "name": "YouTube Research Agent", - "description": "Find relevant YouTube videos on a given topic", - "success_criteria": [ - { - "id": "find_videos", - "description": "Find 3-5 relevant videos", - "metric": "video_count", - "target": "3-5", - "weight": 1.0 - }, - { - "id": "relevance", - "description": "Videos must be relevant to the topic", - "metric": "relevance_score", - "target": ">0.8", - "weight": 0.8 - } - ], - "constraints": [ - { - "id": "api_limits", - "description": "Must not exceed YouTube API rate limits", - "constraint_type": "hard", - "category": "technical" - }, - { - "id": "content_safety", - "description": "Must filter out inappropriate content", - "constraint_type": "hard", - "category": "safety" - } - ] -} +``` +exports/deep_research_agent/ +├── agent.py # Goal + graph: intake → research → review → report +├── nodes/__init__.py # Node definitions (system_prompt, input/output keys) +├── config.py # Model config +├── mcp_servers.json # Tools: web_search, web_scrape +└── tests/ # Test files (we'll create these) ``` -## Step 2: Get Constraint Test Guidelines +**Goal:** "Rigorous Interactive Research" — find 5+ diverse sources, cite every claim, produce a complete report. -During the Goal stage (or early Eval), get test guidelines for constraints: +--- + +## Phase 1: Generate Tests + +### Read the goal ```python -result = generate_constraint_tests( - goal_id="youtube-research", - goal_json='', - agent_path="exports/youtube-research" -) +Read(file_path="exports/deep_research_agent/agent.py") +# Extract: goal_id="rigorous-interactive-research" +# success_criteria: source-diversity (>=5), citation-coverage (100%), report-completeness (90%) +# constraints: no-hallucination, source-attribution ``` -**The result contains guidelines (not generated tests):** -- `output_file`: Where to write tests -- `file_header`: Imports and fixtures to use -- `test_template`: Format for test functions -- `constraints_formatted`: The constraints to test -- `test_guidelines`: Rules for writing tests - -## Step 3: Write Constraint Tests - -Using the guidelines, write tests directly with the Write tool: - -```python -# Write constraint tests using the provided file_header and guidelines -Write( - file_path="exports/youtube-research/tests/test_constraints.py", - content=''' -"""Constraint tests for youtube-research agent.""" - -import os -import pytest -from exports.youtube_research import default_agent - - -pytestmark = pytest.mark.skipif( - not os.environ.get("ANTHROPIC_API_KEY") and not os.environ.get("MOCK_MODE"), - reason="API key required for real testing." 
-) - - -@pytest.mark.asyncio -async def test_constraint_api_limits_respected(): - """Verify API rate limits are not exceeded.""" - import time - mock_mode = bool(os.environ.get("MOCK_MODE")) - - for i in range(10): - result = await default_agent.run({"topic": f"test_{i}"}, mock_mode=mock_mode) - time.sleep(0.1) - - # Should complete without rate limit errors - assert "rate limit" not in str(result).lower() - - -@pytest.mark.asyncio -async def test_constraint_content_safety_filter(): - """Verify inappropriate content is filtered.""" - mock_mode = bool(os.environ.get("MOCK_MODE")) - result = await default_agent.run({"topic": "general topic"}, mock_mode=mock_mode) - - for video in result.videos: - assert video.safe_for_work is True - assert video.age_restricted is False -''' -) -``` - -## Step 4: Get Success Criteria Test Guidelines - -After the agent is built, get success criteria test guidelines: +### Get test guidelines ```python result = generate_success_tests( - goal_id="youtube-research", - goal_json='', - node_names="search_node,filter_node,rank_node,format_node", - tool_names="youtube_search,video_details,channel_info", - agent_path="exports/youtube-research" + goal_id="rigorous-interactive-research", + goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "description": "Use multiple diverse sources", "target": ">=5"}, {"id": "citation-coverage", "description": "Every claim cites its source", "target": "100%"}, {"id": "report-completeness", "description": "Report answers the research questions", "target": "90%"}]}', + node_names="intake,research,review,report", + tool_names="web_search,web_scrape", + agent_path="exports/deep_research_agent" ) ``` -## Step 5: Write Success Criteria Tests - -Using the guidelines, write success criteria tests: +### Write tests ```python Write( - file_path="exports/youtube-research/tests/test_success_criteria.py", - content=''' -"""Success criteria tests for youtube-research agent.""" - -import os -import pytest -from exports.youtube_research import default_agent - - -pytestmark = pytest.mark.skipif( - not os.environ.get("ANTHROPIC_API_KEY") and not os.environ.get("MOCK_MODE"), - reason="API key required for real testing." 
-) - + file_path="exports/deep_research_agent/tests/test_success_criteria.py", + content=result["file_header"] + ''' @pytest.mark.asyncio -async def test_find_videos_happy_path(): - """Test finding videos for a common topic.""" - mock_mode = bool(os.environ.get("MOCK_MODE")) - result = await default_agent.run({"topic": "machine learning"}, mock_mode=mock_mode) - - assert result.success - assert 3 <= len(result.videos) <= 5 - assert all(v.title for v in result.videos) - assert all(v.video_id for v in result.videos) - +async def test_success_source_diversity(runner, auto_responder, mock_mode): + """At least 5 diverse sources are found.""" + await auto_responder.start() + try: + result = await runner.run({"query": "impact of remote work on productivity"}) + finally: + await auto_responder.stop() + assert result.success, f"Agent failed: {result.error}" + output = result.output or {} + sources = output.get("sources", []) + if isinstance(sources, list): + assert len(sources) >= 5, f"Expected >= 5 sources, got {len(sources)}" @pytest.mark.asyncio -async def test_find_videos_minimum_boundary(): - """Test at minimum threshold (3 videos).""" - mock_mode = bool(os.environ.get("MOCK_MODE")) - result = await default_agent.run({"topic": "niche topic xyz"}, mock_mode=mock_mode) - - assert len(result.videos) >= 3 - +async def test_success_citation_coverage(runner, auto_responder, mock_mode): + """Every factual claim in the report cites its source.""" + await auto_responder.start() + try: + result = await runner.run({"query": "climate change effects on agriculture"}) + finally: + await auto_responder.stop() + assert result.success, f"Agent failed: {result.error}" + output = result.output or {} + report = output.get("report", "") + # Check that report contains numbered references + assert "[1]" in str(report) or "[source" in str(report).lower(), "Report lacks citations" @pytest.mark.asyncio -async def test_relevance_score_threshold(): - """Test relevance scoring meets threshold.""" - mock_mode = bool(os.environ.get("MOCK_MODE")) - result = await default_agent.run({"topic": "python programming"}, mock_mode=mock_mode) - - for video in result.videos: - assert video.relevance_score > 0.8 - +async def test_success_report_completeness(runner, auto_responder, mock_mode): + """Report addresses the original research question.""" + query = "pros and cons of nuclear energy" + await auto_responder.start() + try: + result = await runner.run({"query": query}) + finally: + await auto_responder.stop() + assert result.success, f"Agent failed: {result.error}" + output = result.output or {} + report = output.get("report", "") + assert len(str(report)) > 200, f"Report too short: {len(str(report))} chars" @pytest.mark.asyncio -async def test_find_videos_no_results_graceful(): - """Test graceful handling of no results.""" - mock_mode = bool(os.environ.get("MOCK_MODE")) - result = await default_agent.run({"topic": "xyznonexistent123"}, mock_mode=mock_mode) +async def test_empty_query_handling(runner, auto_responder, mock_mode): + """Agent handles empty input gracefully.""" + await auto_responder.start() + try: + result = await runner.run({"query": ""}) + finally: + await auto_responder.stop() + output = result.output or {} + assert not result.success or output.get("error"), "Should handle empty query" - # Should not crash, return empty or message - assert result.videos == [] or result.message +@pytest.mark.asyncio +async def test_feedback_loop_terminates(runner, auto_responder, mock_mode): + """Feedback loop between review and 
research terminates.""" + await auto_responder.start() + try: + result = await runner.run({"query": "quantum computing basics"}) + finally: + await auto_responder.stop() + visits = result.node_visit_counts or {} + for node_id, count in visits.items(): + assert count <= 5, f"Node {node_id} visited {count} times" ''' ) ``` -## Step 6: Run All Tests +--- -Execute all tests: +## Phase 2: First Execution ```python -result = run_tests( - goal_id="youtube-research", - agent_path="exports/youtube-research", - test_types='["all"]', - parallel=4 +run_tests( + goal_id="rigorous-interactive-research", + agent_path="exports/deep_research_agent", + fail_fast=True ) ``` -**Results:** - +**Result:** ```json { - "goal_id": "youtube-research", - "overall_passed": false, - "summary": { - "total": 6, - "passed": 5, - "failed": 1, - "pass_rate": "83.3%" - }, - "duration_ms": 4521, - "results": [ - {"test_id": "test_constraint_api_001", "passed": true, "duration_ms": 1234}, - {"test_id": "test_constraint_content_001", "passed": true, "duration_ms": 456}, - {"test_id": "test_success_001", "passed": true, "duration_ms": 789}, - {"test_id": "test_success_002", "passed": true, "duration_ms": 654}, - {"test_id": "test_success_003", "passed": true, "duration_ms": 543}, - {"test_id": "test_success_004", "passed": false, "duration_ms": 845, - "error_category": "IMPLEMENTATION_ERROR", - "error_message": "TypeError: 'NoneType' object has no attribute 'videos'"} - ] + "overall_passed": false, + "summary": {"total": 5, "passed": 3, "failed": 2, "pass_rate": "60.0%"}, + "failures": [ + {"test_name": "test_success_source_diversity", "details": "AssertionError: Expected >= 5 sources, got 2"}, + {"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"} + ] } ``` -## Step 7: Debug the Failed Test +--- + +## Phase 3: Analyze (Iteration 1) + +### Debug the first failure ```python -result = debug_test( - goal_id="youtube-research", - test_name="test_find_videos_no_results_graceful", - agent_path="exports/youtube-research" +debug_test( + goal_id="rigorous-interactive-research", + test_name="test_success_source_diversity", + agent_path="exports/deep_research_agent" +) +# Category: ASSERTION_FAILURE — Expected >= 5 sources, got 2 +``` + +### Find the session and inspect memory + +```python +list_agent_sessions( + agent_work_dir="~/.hive/agents/deep_research_agent", + status="completed", + limit=1 +) +# → session_20260209_150000_abc12345 + +get_agent_session_memory( + agent_work_dir="~/.hive/agents/deep_research_agent", + session_id="session_20260209_150000_abc12345", + key="research_results" +) +# → Only 2 sources found. LLM stopped searching after 2 queries. +``` + +### Check LLM behavior in the research node + +```python +query_runtime_log_raw( + agent_work_dir="~/.hive/agents/deep_research_agent", + run_id="session_20260209_150000_abc12345", + node_id="research" +) +# → LLM called web_search twice, got results, immediately called set_output. +# → Prompt doesn't instruct it to find at least 5 sources. +``` + +**Root cause:** The research node's system_prompt doesn't specify minimum source requirements. 
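Before editing, it can help to confirm the current prompt wording straight from the node definitions (a minimal sketch; the file path follows the Agent Structure tree above, and the search string is the prompt fragment identified in the log analysis):

```python
from pathlib import Path

# Print the research node's current system_prompt line to confirm the missing instruction
src = Path("exports/deep_research_agent/nodes/__init__.py").read_text()
for line in src.splitlines():
    if "system_prompt" in line and "Search for information" in line:
        print(line.strip())
```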
+ +--- + +## Phase 4: Fix (Iteration 1) + +```python +Read(file_path="exports/deep_research_agent/nodes/__init__.py") + +# Fix the research node prompt +Edit( + file_path="exports/deep_research_agent/nodes/__init__.py", + old_string='system_prompt="Search for information on the user\'s topic using web search."', + new_string='system_prompt="Search for information on the user\'s topic using web search. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries with varied keywords. Do NOT call set_output until you have gathered at least 5 distinct sources from different domains."' ) ``` -**Debug Output:** +--- +## Phase 5: Recover & Resume (Iteration 1) + +The fix is to the `research` node. Since this was a `run_tests` execution (no checkpoints), we re-run from scratch: + +```python +run_tests( + goal_id="rigorous-interactive-research", + agent_path="exports/deep_research_agent", + fail_fast=True +) +``` + +**Result:** ```json { - "test_id": "test_success_004", - "test_name": "test_find_videos_no_results_graceful", - "input": {"topic": "xyznonexistent123"}, - "expected": "Empty list or message", - "actual": {"error": "TypeError: 'NoneType' object has no attribute 'videos'"}, - "passed": false, - "error_message": "TypeError: 'NoneType' object has no attribute 'videos'", - "error_category": "IMPLEMENTATION_ERROR", - "stack_trace": "Traceback (most recent call last):\n File \"filter_node.py\", line 42\n for video in result.videos:\nTypeError: 'NoneType' object has no attribute 'videos'", - "logs": [ - {"timestamp": "2026-01-20T10:00:01", "node": "search_node", "level": "INFO", "msg": "Searching for: xyznonexistent123"}, - {"timestamp": "2026-01-20T10:00:02", "node": "search_node", "level": "WARNING", "msg": "No results found"}, - {"timestamp": "2026-01-20T10:00:02", "node": "filter_node", "level": "ERROR", "msg": "NoneType error"} - ], - "runtime_data": { - "execution_path": ["start", "search_node", "filter_node"], - "node_outputs": { - "search_node": null - } - }, - "suggested_fix": "Add null check in filter_node before accessing .videos attribute", - "iteration_guidance": { - "stage": "Agent", - "action": "Fix the code in nodes/edges", - "restart_required": false, - "description": "The goal is correct, but filter_node doesn't handle null results from search_node." - } + "overall_passed": false, + "summary": {"total": 5, "passed": 4, "failed": 1, "pass_rate": "80.0%"}, + "failures": [ + {"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"} + ] } ``` -## Step 8: Iterate Based on Category +Source diversity now passes. Citation coverage still fails. -Since this is an **IMPLEMENTATION_ERROR**, we: +--- -1. **Don't restart** the Goal → Agent → Eval flow -2. **Fix the agent** using hive-create skill: - - Modify `filter_node` to handle null results -3. **Re-run Eval** (tests only) - -### Fix in hive-create: +## Phase 3: Analyze (Iteration 2) ```python -# Update the filter_node to handle null -add_node( - node_id="filter_node", - name="Filter Node", - description="Filter and rank videos", - node_type="function", - input_keys=["search_results"], - output_keys=["filtered_videos"], - system_prompt=""" - Filter videos by relevance. - IMPORTANT: Handle case where search_results is None or empty. - Return empty list if no results. 
- """ +debug_test( + goal_id="rigorous-interactive-research", + test_name="test_success_citation_coverage", + agent_path="exports/deep_research_agent" +) +# Category: ASSERTION_FAILURE — Report lacks citations + +# Check what the report node produced +list_agent_sessions( + agent_work_dir="~/.hive/agents/deep_research_agent", + status="completed", + limit=1 +) +# → session_20260209_151500_def67890 + +get_agent_session_memory( + agent_work_dir="~/.hive/agents/deep_research_agent", + session_id="session_20260209_151500_def67890", + key="report" +) +# → Report text exists but uses no numbered references. +# → Sources are in memory but report node doesn't cite them. +``` + +**Root cause:** The report node's prompt doesn't instruct the LLM to include numbered citations. + +--- + +## Phase 4: Fix (Iteration 2) + +```python +Edit( + file_path="exports/deep_research_agent/nodes/__init__.py", + old_string='system_prompt="Write a comprehensive report based on the research findings."', + new_string='system_prompt="Write a comprehensive report based on the research findings. You MUST include numbered citations [1], [2], etc. for every factual claim. At the end, include a References section listing all sources with their URLs. Every claim must be traceable to a specific source."' ) ``` -### Re-export and re-test: +--- + +## Phase 5: Resume (Iteration 2) + +The fix is to the `report` node (the last node). To demonstrate checkpoint recovery, run via CLI: + +```bash +# Run via CLI to get checkpoints +uv run hive run exports/deep_research_agent --input '{"topic": "climate change effects"}' + +# After it runs, find the clean checkpoint before report +list_agent_checkpoints( + agent_work_dir="~/.hive/agents/deep_research_agent", + session_id="session_20260209_152000_ghi34567", + is_clean="true" +) +# → cp_node_complete_review_152100 (after review, before report) + +# Resume — skips intake, research, review entirely +uv run hive run exports/deep_research_agent \ + --resume-session session_20260209_152000_ghi34567 \ + --checkpoint cp_node_complete_review_152100 +``` + +Only the `report` node re-runs with the fixed prompt, using research data from the checkpoint. + +--- + +## Phase 6: Final Verification ```python -# Re-export the fixed agent -export_graph(path="exports/youtube-research") - -# Re-run tests -result = run_tests( - goal_id="youtube-research", - agent_path="exports/youtube-research", - test_types='["all"]' +run_tests( + goal_id="rigorous-interactive-research", + agent_path="exports/deep_research_agent" ) ``` -**Updated Results:** - +**Result:** ```json { - "goal_id": "youtube-research", - "overall_passed": true, - "summary": { - "total": 6, - "passed": 6, - "failed": 0, - "pass_rate": "100.0%" - } + "overall_passed": true, + "summary": {"total": 5, "passed": 5, "failed": 0, "pass_rate": "100.0%"} } ``` +All tests pass. + +--- + ## Summary -1. **Got guidelines** for constraint tests during Goal stage -2. **Wrote** constraint tests using Write tool -3. **Got guidelines** for success criteria tests during Eval stage -4. **Wrote** success criteria tests using Write tool -5. **Ran** tests in parallel -6. **Debugged** the one failure -7. **Categorized** as IMPLEMENTATION_ERROR -8. **Fixed** the agent (not the goal) -9. **Re-ran** Eval only (didn't restart full flow) -10. 
**Passed** all tests +| Iteration | Failure | Root Cause | Fix | Recovery | +|-----------|---------|------------|-----|----------| +| 1 | Source diversity (2 < 5) | Research prompt too vague | Added "at least 5 sources" to prompt | Re-run (no checkpoints) | +| 2 | No citations in report | Report prompt lacks citation instructions | Added citation requirements | Checkpoint resume (skipped 3 nodes) | -The agent is now validated and ready for production use. +**Key takeaways:** +- Phase 3 analysis (session memory + L3 logs) identified root causes without guessing +- Checkpoint recovery in iteration 2 saved time by skipping 3 expensive nodes +- Final `run_tests` confirms all scenarios pass end-to-end diff --git a/.opencode/agents/hive.md b/.opencode/agents/hive.md new file mode 100644 index 00000000..6c70024c --- /dev/null +++ b/.opencode/agents/hive.md @@ -0,0 +1,20 @@ +--- +name: hive +description: Hive Agent Builder & Manager +mode: primary +tools: + agent-builder: true + tools: true +--- + +# Hive Agent +You are the Hive Agent Builder. Your goal is to help the user construct, configure, and deploy AI agents using the Hive framework. + +## Capabilities +1. **Scaffold Agents:** Create new agent directories/configs. +2. **Manage Tools:** Add/remove tools via MCP. +3. **Debug:** Analyze agent workflows. + +## Context +- You are an expert in the Hive framework architecture. +- Always use the `agent-builder` MCP server for filesystem operations. \ No newline at end of file diff --git a/.opencode/mcp.json b/.opencode/mcp.json new file mode 100644 index 00000000..74cb5c27 --- /dev/null +++ b/.opencode/mcp.json @@ -0,0 +1,30 @@ +{ + "mcpServers": { + "agent-builder": { + "command": "uv", + "args": [ + "run", + "python", + "-m", + "framework.mcp.agent_builder_server" + ], + "cwd": "core", + "env": { + "PYTHONPATH": "../tools/src" + } + }, + "tools": { + "command": "uv", + "args": [ + "run", + "python", + "mcp_server.py", + "--stdio" + ], + "cwd": "tools", + "env": { + "PYTHONPATH": "src" + } + } + } +} \ No newline at end of file diff --git a/.opencode/skills/hive b/.opencode/skills/hive new file mode 120000 index 00000000..47ca6b8e --- /dev/null +++ b/.opencode/skills/hive @@ -0,0 +1 @@ +../../.claude/skills/hive \ No newline at end of file diff --git a/.opencode/skills/hive-concepts b/.opencode/skills/hive-concepts new file mode 120000 index 00000000..4f460b1d --- /dev/null +++ b/.opencode/skills/hive-concepts @@ -0,0 +1 @@ +../../.claude/skills/hive-concepts \ No newline at end of file diff --git a/.opencode/skills/hive-create b/.opencode/skills/hive-create new file mode 120000 index 00000000..9247f883 --- /dev/null +++ b/.opencode/skills/hive-create @@ -0,0 +1 @@ +../../.claude/skills/hive-create \ No newline at end of file diff --git a/.opencode/skills/hive-credentials b/.opencode/skills/hive-credentials new file mode 120000 index 00000000..610180f4 --- /dev/null +++ b/.opencode/skills/hive-credentials @@ -0,0 +1 @@ +../../.claude/skills/hive-credentials \ No newline at end of file diff --git a/.opencode/skills/hive-debugger b/.opencode/skills/hive-debugger new file mode 120000 index 00000000..48edc69e --- /dev/null +++ b/.opencode/skills/hive-debugger @@ -0,0 +1 @@ +../../.claude/skills/hive-debugger \ No newline at end of file diff --git a/.opencode/skills/hive-patterns b/.opencode/skills/hive-patterns new file mode 120000 index 00000000..c18612b5 --- /dev/null +++ b/.opencode/skills/hive-patterns @@ -0,0 +1 @@ +../../.claude/skills/hive-patterns \ No newline at end of file diff --git 
a/.opencode/skills/hive-test b/.opencode/skills/hive-test new file mode 120000 index 00000000..d2377d0e --- /dev/null +++ b/.opencode/skills/hive-test @@ -0,0 +1 @@ +../../.claude/skills/hive-test \ No newline at end of file diff --git a/.opencode/skills/triage-issue b/.opencode/skills/triage-issue new file mode 120000 index 00000000..41183c47 --- /dev/null +++ b/.opencode/skills/triage-issue @@ -0,0 +1 @@ +../../.claude/skills/triage-issue \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d299c5a7..310fb96c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -49,8 +49,8 @@ You may submit PRs without prior assignment for: make check # Lint and format checks (ruff check + ruff format --check on core/ and tools/) make test # Core tests (cd core && pytest tests/ -v) ``` -6. Commit your changes following our commit conventions -7. Push to your fork and submit a Pull Request +8. Commit your changes following our commit conventions +9. Push to your fork and submit a Pull Request ## Development Setup @@ -145,6 +145,9 @@ make test # Or run tests directly cd core && pytest tests/ -v +# Run tools package tests (when contributing to tools/) +cd tools && uv run pytest tests/ -v + # Run tests for a specific agent PYTHONPATH=exports uv run python -m agent_name test ``` diff --git a/README.md b/README.md index 70837797..32d9daff 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,16 @@ hive tui # Or run directly hive run exports/your_agent_name --input '{"key": "value"}' ``` +## Coding Agent Support +### Opencode +Hive includes native support for [Opencode](https://github.com/opencode-ai/opencode). + +1. **Setup:** Run the quickstart script +2. **Launch:** Open Opencode in the project root. +3. **Activate:** Type `/hive` in the chat to switch to the Hive Agent. +4. **Verify:** Ask the agent *"List your tools"* to confirm the connection. + +The agent has access to all Hive skills and can scaffold agents, add tools, and debug workflows directly from the chat. **[📖 Complete Setup Guide](docs/environment-setup.md)** - Detailed instructions for agent development diff --git a/core/framework/graph/event_loop_node.py b/core/framework/graph/event_loop_node.py index 1c97f158..79591057 100644 --- a/core/framework/graph/event_loop_node.py +++ b/core/framework/graph/event_loop_node.py @@ -274,6 +274,7 @@ class EventLoopNode(NodeProtocol): # 5. Stall detection state recent_responses: list[str] = [] + user_interaction_count = 0 # tracks how many times this node blocked for user input # 6. Main loop for iteration in range(start_iteration, self._config.max_iterations): @@ -485,13 +486,11 @@ class EventLoopNode(NodeProtocol): # 6h. Client-facing input blocking # - # For client_facing nodes, block for user input only when the - # LLM explicitly called ask_user(). Text-only turns without - # ask_user flow through without blocking, allowing progress - # updates and summaries to stream freely. - # - # After user input, always fall through to judge evaluation - # (6i). The judge handles all acceptance decisions. + # Block ONLY when the LLM explicitly calls ask_user(). + # Text-only turns and set_output-only turns flow through + # without blocking, allowing progress updates and summaries + # to stream freely. After user input arrives, fall through + # to judge evaluation (6i) — the judge handles acceptance. 
if ctx.node_spec.client_facing and user_input_requested: if self._shutdown: await self._publish_loop_completed(stream_id, node_id, iteration + 1) @@ -578,6 +577,7 @@ class EventLoopNode(NodeProtocol): latency_ms=latency_ms, ) + user_interaction_count += 1 recent_responses.clear() # Fall through to judge evaluation (6i) @@ -824,6 +824,12 @@ class EventLoopNode(NodeProtocol): Returns True if input arrived, False if shutdown was signaled. """ + # Clear BEFORE emitting so that synchronous handlers (e.g. the + # headless stdin handler) can call inject_event() during the emit + # and the signal won't be lost. TUI handlers return immediately + # without injecting, so the wait still blocks until the user types. + self._input_ready.clear() + if self._event_bus: await self._event_bus.emit_client_input_requested( stream_id=ctx.node_id, @@ -831,7 +837,6 @@ class EventLoopNode(NodeProtocol): prompt="", ) - self._input_ready.clear() await self._input_ready.wait() return not self._shutdown @@ -989,7 +994,7 @@ class EventLoopNode(NodeProtocol): is_error=result.is_error, ) if not result.is_error: - value = tc.tool_input["value"] + value = tc.tool_input.get("value", "") # Parse JSON strings into native types so downstream # consumers get lists/dicts instead of serialised JSON, # and the hallucination validator skips non-string values. @@ -1000,8 +1005,9 @@ class EventLoopNode(NodeProtocol): value = parsed except (json.JSONDecodeError, TypeError): pass - await accumulator.set(tc.tool_input["key"], value) - outputs_set_this_turn.append(tc.tool_input["key"]) + key = tc.tool_input.get("key", "") + await accumulator.set(key, value) + outputs_set_this_turn.append(key) logged_tool_calls.append( { "tool_use_id": tc.tool_use_id, @@ -1283,6 +1289,24 @@ class EventLoopNode(NodeProtocol): accumulator, ctx.node_spec.output_keys, ctx.node_spec.nullable_output_keys ) if not missing: + # Safety check: when ALL output keys are nullable and NONE + # have been set, the node produced nothing useful. Retry + # instead of accepting an empty result — this prevents + # client-facing nodes from terminating before the user + # ever interacts, and non-client-facing nodes from + # short-circuiting without doing their work. + output_keys = ctx.node_spec.output_keys or [] + nullable_keys = set(ctx.node_spec.nullable_output_keys or []) + all_nullable = output_keys and nullable_keys >= set(output_keys) + none_set = not any(accumulator.get(k) is not None for k in output_keys) + if all_nullable and none_set: + return JudgeVerdict( + action="RETRY", + feedback=( + f"No output keys have been set yet. " + f"Use set_output to set at least one of: {output_keys}" + ), + ) return JudgeVerdict(action="ACCEPT") else: return JudgeVerdict( diff --git a/core/framework/graph/executor.py b/core/framework/graph/executor.py index a75cea41..f90d9ec6 100644 --- a/core/framework/graph/executor.py +++ b/core/framework/graph/executor.py @@ -368,7 +368,7 @@ class GraphExecutor: # Check if resuming from paused_at (session state resume) paused_at = session_state.get("paused_at") if session_state else None node_ids = [n.id for n in graph.nodes] - self.logger.info(f"🔍 Debug: paused_at={paused_at}, available node IDs={node_ids}") + self.logger.debug(f"paused_at={paused_at}, available node IDs={node_ids}") if paused_at and graph.get_node(paused_at) is not None: # Resume from paused_at node directly (works for any node, not just pause_nodes) @@ -505,6 +505,21 @@ class GraphExecutor: path.append(current_node_id) + # Clear stale nullable outputs from previous visits. 
+ # When a node is re-visited (e.g. review → process-batch → review), + # nullable outputs from the PREVIOUS visit linger in shared memory. + # This causes stale edge conditions to fire (e.g. "feedback is not None" + # from visit 1 triggers even when visit 2 sets "final_summary" instead). + # Clearing them ensures only the CURRENT visit's outputs affect routing. + if node_visit_counts.get(current_node_id, 0) > 1: + nullable_keys = getattr(node_spec, "nullable_output_keys", None) or [] + for key in nullable_keys: + if memory.read(key) is not None: + memory.write(key, None, validate=False) + self.logger.info( + f" 🧹 Cleared stale nullable output '{key}' from previous visit" + ) + # Check if pause (HITL) before execution if current_node_id in graph.pause_nodes: self.logger.info(f"⏸ Paused at HITL node: {node_spec.name}") diff --git a/core/framework/graph/node.py b/core/framework/graph/node.py index dc3a659e..fd19e1ea 100644 --- a/core/framework/graph/node.py +++ b/core/framework/graph/node.py @@ -1134,7 +1134,7 @@ Keep the same JSON structure but with shorter content values. decision_id=decision_id, success=True, result=response.content, - tokens_used=response.input_tokens + response.output_tokens, + tokens_used=total_input_tokens + total_output_tokens, latency_ms=latency_ms, ) @@ -1233,7 +1233,7 @@ Keep the same JSON structure but with shorter content values. success=False, error=_extraction_error, output={}, - tokens_used=response.input_tokens + response.output_tokens, + tokens_used=total_input_tokens + total_output_tokens, latency_ms=latency_ms, ) # JSON extraction failed completely - still strip code blocks @@ -1275,7 +1275,7 @@ Keep the same JSON structure but with shorter content values. return NodeResult( success=True, output=output, - tokens_used=response.input_tokens + response.output_tokens, + tokens_used=total_input_tokens + total_output_tokens, latency_ms=latency_ms, ) diff --git a/core/framework/mcp/agent_builder_server.py b/core/framework/mcp/agent_builder_server.py index 0fd5418c..6210f345 100644 --- a/core/framework/mcp/agent_builder_server.py +++ b/core/framework/mcp/agent_builder_server.py @@ -14,13 +14,15 @@ from datetime import datetime from pathlib import Path from typing import Annotated +# Project root resolution. This file lives at core/framework/mcp/agent_builder_server.py, +# so the project root (where exports/ lives) is four parents up. +_PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent.parent + # Ensure exports/ is on sys.path so AgentRunner can import agent modules. -_framework_dir = Path(__file__).resolve().parent.parent # core/framework/ -> core/ -_project_root = _framework_dir.parent # core/ -> project root -_exports_dir = _project_root / "exports" +_exports_dir = _PROJECT_ROOT / "exports" if _exports_dir.is_dir() and str(_exports_dir) not in sys.path: sys.path.insert(0, str(_exports_dir)) -del _framework_dir, _project_root, _exports_dir +del _exports_dir from mcp.server import FastMCP # noqa: E402 from pydantic import ValidationError # noqa: E402 @@ -542,6 +544,9 @@ def _validate_agent_path(agent_path: str) -> tuple[Path | None, str | None]: """ Validate and normalize agent_path. + Resolves relative paths against _PROJECT_ROOT since the MCP server's + cwd (core/) differs from the user's cwd (project root). 
+ Returns: (Path, None) if valid (None, error_json) if invalid @@ -556,6 +561,12 @@ def _validate_agent_path(agent_path: str) -> tuple[Path | None, str | None]: path = Path(agent_path) + # Resolve relative paths against project root (not MCP server's cwd) + if not path.is_absolute() and not path.exists(): + resolved = _PROJECT_ROOT / path + if resolved.exists(): + path = resolved + if not path.exists(): return None, json.dumps( { @@ -3019,18 +3030,15 @@ def _format_success_criteria(criteria: list[SuccessCriterion]) -> str: # Test template for Claude to use when writing tests CONSTRAINT_TEST_TEMPLATE = '''@pytest.mark.asyncio -async def test_constraint_{constraint_id}_{scenario}(mock_mode): +async def test_constraint_{constraint_id}_{scenario}(runner, auto_responder, mock_mode): """Test: {description}""" - result = await default_agent.run({{"key": "value"}}, mock_mode=mock_mode) - - # IMPORTANT: result is an ExecutionResult object with these attributes: - # - result.success: bool - whether the agent succeeded - # - result.output: dict - the agent's output data (access data here!) - # - result.error: str or None - error message if failed + await auto_responder.start() + try: + result = await runner.run({{"key": "value"}}) + finally: + await auto_responder.stop() assert result.success, f"Agent failed: {{result.error}}" - - # Access output data via result.output output_data = result.output or {{}} # Add constraint-specific assertions here @@ -3038,18 +3046,15 @@ async def test_constraint_{constraint_id}_{scenario}(mock_mode): ''' SUCCESS_TEST_TEMPLATE = '''@pytest.mark.asyncio -async def test_success_{criteria_id}_{scenario}(mock_mode): +async def test_success_{criteria_id}_{scenario}(runner, auto_responder, mock_mode): """Test: {description}""" - result = await default_agent.run({{"key": "value"}}, mock_mode=mock_mode) - - # IMPORTANT: result is an ExecutionResult object with these attributes: - # - result.success: bool - whether the agent succeeded - # - result.output: dict - the agent's output data (access data here!) 
- # - result.error: str or None - error message if failed + await auto_responder.start() + try: + result = await runner.run({{"key": "value"}}) + finally: + await auto_responder.stop() assert result.success, f"Agent failed: {{result.error}}" - - # Access output data via result.output output_data = result.output or {{}} # Add success criteria-specific assertions here @@ -3105,7 +3110,6 @@ def generate_constraint_tests( test_type="Constraint", agent_name=agent_module, description=f"Tests for constraints defined in goal: {goal.name}", - agent_module=agent_module, ) # Return guidelines + data for Claude to write tests directly @@ -3121,14 +3125,22 @@ def generate_constraint_tests( "max_tests": 5, "naming_convention": "test_constraint__", "required_decorator": "@pytest.mark.asyncio", - "required_fixture": "mock_mode", - "agent_call_pattern": "await default_agent.run(input_dict, mock_mode=mock_mode)", + "required_fixtures": "runner, auto_responder, mock_mode", + "agent_call_pattern": "await runner.run(input_dict)", + "auto_responder_pattern": ( + "await auto_responder.start()\n" + "try:\n" + " result = await runner.run(input_dict)\n" + "finally:\n" + " await auto_responder.stop()" + ), "result_type": "ExecutionResult with .success, .output (dict), .error", "critical_rules": [ "Every test function MUST be async with @pytest.mark.asyncio", - "Every test MUST accept mock_mode as a parameter", - "Use await default_agent.run(input, mock_mode=mock_mode)", - "default_agent is already imported - do NOT add imports", + "Every test MUST accept runner, auto_responder, and mock_mode fixtures", + "Use await runner.run(input) -- NOT default_agent.run()", + "Start auto_responder before running, stop in finally block", + "runner and auto_responder are from conftest.py -- do NOT import them", "NEVER call result.get() - use result.output.get() instead", "Always check result.success before accessing result.output", ], @@ -3192,7 +3204,6 @@ def generate_success_tests( test_type="Success criteria", agent_name=agent_module, description=f"Tests for success criteria defined in goal: {goal.name}", - agent_module=agent_module, ) # Return guidelines + data for Claude to write tests directly @@ -3214,14 +3225,22 @@ def generate_success_tests( "max_tests": 12, "naming_convention": "test_success__", "required_decorator": "@pytest.mark.asyncio", - "required_fixture": "mock_mode", - "agent_call_pattern": "await default_agent.run(input_dict, mock_mode=mock_mode)", + "required_fixtures": "runner, auto_responder, mock_mode", + "agent_call_pattern": "await runner.run(input_dict)", + "auto_responder_pattern": ( + "await auto_responder.start()\n" + "try:\n" + " result = await runner.run(input_dict)\n" + "finally:\n" + " await auto_responder.stop()" + ), "result_type": "ExecutionResult with .success, .output (dict), .error", "critical_rules": [ "Every test function MUST be async with @pytest.mark.asyncio", - "Every test MUST accept mock_mode as a parameter", - "Use await default_agent.run(input, mock_mode=mock_mode)", - "default_agent is already imported - do NOT add imports", + "Every test MUST accept runner, auto_responder, and mock_mode fixtures", + "Use await runner.run(input) -- NOT default_agent.run()", + "Start auto_responder before running, stop in finally block", + "runner and auto_responder are from conftest.py -- do NOT import them", "NEVER call result.get() - use result.output.get() instead", "Always check result.success before accessing result.output", ], @@ -3318,11 +3337,13 @@ def run_tests( # Add short traceback and 
quiet summary cmd.append("--tb=short") - # Set PYTHONPATH to project root so agents can import from core.framework + # Set PYTHONPATH so framework and agent packages are importable env = os.environ.copy() pythonpath = env.get("PYTHONPATH", "") project_root = Path(__file__).parent.parent.parent.parent.resolve() - env["PYTHONPATH"] = f"{project_root}:{pythonpath}" + core_path = project_root / "core" + exports_path = project_root / "exports" + env["PYTHONPATH"] = f"{core_path}:{exports_path}:{project_root}:{pythonpath}" # Run pytest try: @@ -3792,7 +3813,11 @@ def check_missing_credentials( from framework.runner import AgentRunner - runner = AgentRunner.load(agent_path) + path, err = _validate_agent_path(agent_path) + if err: + return err + + runner = AgentRunner.load(str(path)) runner.validate() store = _get_credential_store() @@ -3992,7 +4017,11 @@ def verify_credentials( try: from framework.runner import AgentRunner - runner = AgentRunner.load(agent_path) + path, err = _validate_agent_path(agent_path) + if err: + return err + + runner = AgentRunner.load(str(path)) validation = runner.validate() return json.dumps( @@ -4009,6 +4038,382 @@ def verify_credentials( return json.dumps({"error": str(e)}) +# ============================================================================= +# SESSION & CHECKPOINT TOOLS (read-only, no build session required) +# ============================================================================= + +_MAX_DIFF_VALUE_LEN = 500 + + +def _read_session_json(path: Path) -> dict | None: + """Read a JSON file, returning None on failure.""" + if not path.exists(): + return None + try: + return json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + return None + + +def _scan_agent_sessions(agent_work_dir: Path) -> list[tuple[str, Path]]: + """Find session directories with state.json, sorted most-recent-first.""" + sessions: list[tuple[str, Path]] = [] + sessions_dir = agent_work_dir / "sessions" + if not sessions_dir.exists(): + return sessions + for session_dir in sessions_dir.iterdir(): + if session_dir.is_dir() and session_dir.name.startswith("session_"): + state_path = session_dir / "state.json" + if state_path.exists(): + sessions.append((session_dir.name, state_path)) + sessions.sort(key=lambda t: t[0], reverse=True) + return sessions + + +def _truncate_value(value: object, max_len: int = _MAX_DIFF_VALUE_LEN) -> object: + """Truncate a value's JSON representation if too long.""" + s = json.dumps(value, default=str) + if len(s) <= max_len: + return value + return {"_truncated": True, "_preview": s[:max_len] + "...", "_length": len(s)} + + +@mcp.tool() +def list_agent_sessions( + agent_work_dir: Annotated[ + str, + "Path to the agent's working directory (e.g., ~/.hive/agents/my_agent)", + ], + status: Annotated[ + str, + "Filter by status: 'active', 'paused', 'completed', 'failed', 'cancelled'. Empty for all.", + ] = "", + limit: Annotated[int, "Maximum number of results (default 20)"] = 20, + offset: Annotated[int, "Number of sessions to skip for pagination"] = 0, +) -> str: + """ + List sessions for an agent with optional status filter. + + Use this to discover which sessions exist, find resumable sessions, + or identify failed sessions for debugging. Combines well with + query_runtime_logs for correlating session state with log data. 
+ """ + work_dir = Path(agent_work_dir) + all_sessions = _scan_agent_sessions(work_dir) + + if not all_sessions: + return json.dumps({"sessions": [], "total": 0, "offset": offset, "limit": limit}) + + summaries = [] + for session_id, state_path in all_sessions: + data = _read_session_json(state_path) + if data is None: + continue + + session_status = data.get("status", "") + if status and session_status != status: + continue + + timestamps = data.get("timestamps", {}) + progress = data.get("progress", {}) + checkpoint_dir = state_path.parent / "checkpoints" + + summaries.append( + { + "session_id": session_id, + "status": session_status, + "goal_id": data.get("goal_id", ""), + "started_at": timestamps.get("started_at", ""), + "updated_at": timestamps.get("updated_at", ""), + "completed_at": timestamps.get("completed_at"), + "is_resumable": data.get("is_resumable", False), + "is_resumable_from_checkpoint": data.get("is_resumable_from_checkpoint", False), + "current_node": progress.get("current_node"), + "paused_at": progress.get("paused_at"), + "steps_executed": progress.get("steps_executed", 0), + "execution_quality": progress.get("execution_quality", ""), + "has_checkpoints": checkpoint_dir.exists() + and any(checkpoint_dir.glob("cp_*.json")), + } + ) + + total = len(summaries) + page = summaries[offset : offset + limit] + return json.dumps( + {"sessions": page, "total": total, "offset": offset, "limit": limit}, indent=2 + ) + + +@mcp.tool() +def get_agent_session_state( + agent_work_dir: Annotated[str, "Path to the agent's working directory"], + session_id: Annotated[str, "The session ID (e.g., 'session_20260208_143022_abc12345')"], +) -> str: + """ + Load full session state for a specific session. + + Returns complete session data including status, progress, result, + metrics, and checkpoint info. Memory values are excluded to prevent + context bloat -- use get_agent_session_memory to retrieve memory contents. + """ + state_path = Path(agent_work_dir) / "sessions" / session_id / "state.json" + data = _read_session_json(state_path) + if data is None: + return json.dumps({"error": f"Session not found: {session_id}"}) + + memory = data.get("memory", {}) + data["memory_keys"] = list(memory.keys()) if isinstance(memory, dict) else [] + data["memory_size"] = len(memory) if isinstance(memory, dict) else 0 + data.pop("memory", None) + + return json.dumps(data, indent=2, default=str) + + +@mcp.tool() +def get_agent_session_memory( + agent_work_dir: Annotated[str, "Path to the agent's working directory"], + session_id: Annotated[str, "The session ID"], + key: Annotated[str, "Specific memory key to retrieve. Empty for all."] = "", +) -> str: + """ + Get memory contents from a session. + + Memory stores intermediate results passed between nodes. Use this + to inspect what data was produced during execution. + + If key is provided, returns only that memory key's value. + If key is empty, returns all memory keys and their values. 
+ """ + state_path = Path(agent_work_dir) / "sessions" / session_id / "state.json" + data = _read_session_json(state_path) + if data is None: + return json.dumps({"error": f"Session not found: {session_id}"}) + + memory = data.get("memory", {}) + if not isinstance(memory, dict): + memory = {} + + if key: + if key not in memory: + return json.dumps( + { + "error": f"Memory key not found: '{key}'", + "available_keys": list(memory.keys()), + } + ) + value = memory[key] + return json.dumps( + { + "session_id": session_id, + "key": key, + "value": value, + "value_type": type(value).__name__, + }, + indent=2, + default=str, + ) + + return json.dumps( + {"session_id": session_id, "memory": memory, "total_keys": len(memory)}, + indent=2, + default=str, + ) + + +@mcp.tool() +def list_agent_checkpoints( + agent_work_dir: Annotated[str, "Path to the agent's working directory"], + session_id: Annotated[str, "The session ID to list checkpoints for"], + checkpoint_type: Annotated[ + str, + "Filter by type: 'node_start', 'node_complete', 'loop_iteration'. Empty for all.", + ] = "", + is_clean: Annotated[str, "Filter by clean status: 'true', 'false', or empty for all."] = "", +) -> str: + """ + List checkpoints for a specific session. + + Checkpoints capture execution state at node boundaries for + crash recovery and resume. Use with get_agent_checkpoint for + detailed checkpoint inspection. + """ + session_dir = Path(agent_work_dir) / "sessions" / session_id + checkpoint_dir = session_dir / "checkpoints" + + if not session_dir.exists(): + return json.dumps({"error": f"Session not found: {session_id}"}) + + if not checkpoint_dir.exists(): + return json.dumps( + { + "session_id": session_id, + "checkpoints": [], + "total": 0, + "latest_checkpoint_id": None, + } + ) + + # Try index.json first + index_data = _read_session_json(checkpoint_dir / "index.json") + if index_data and "checkpoints" in index_data: + checkpoints = index_data["checkpoints"] + else: + # Fallback: scan individual checkpoint files + checkpoints = [] + for cp_file in sorted(checkpoint_dir.glob("cp_*.json")): + cp_data = _read_session_json(cp_file) + if cp_data: + checkpoints.append( + { + "checkpoint_id": cp_data.get("checkpoint_id", cp_file.stem), + "checkpoint_type": cp_data.get("checkpoint_type", ""), + "created_at": cp_data.get("created_at", ""), + "current_node": cp_data.get("current_node"), + "next_node": cp_data.get("next_node"), + "is_clean": cp_data.get("is_clean", True), + "description": cp_data.get("description", ""), + } + ) + + # Apply filters + if checkpoint_type: + checkpoints = [c for c in checkpoints if c.get("checkpoint_type") == checkpoint_type] + if is_clean: + clean_val = is_clean.lower() == "true" + checkpoints = [c for c in checkpoints if c.get("is_clean") == clean_val] + + latest_id = None + if index_data: + latest_id = index_data.get("latest_checkpoint_id") + elif checkpoints: + latest_id = checkpoints[-1].get("checkpoint_id") + + return json.dumps( + { + "session_id": session_id, + "checkpoints": checkpoints, + "total": len(checkpoints), + "latest_checkpoint_id": latest_id, + }, + indent=2, + ) + + +@mcp.tool() +def get_agent_checkpoint( + agent_work_dir: Annotated[str, "Path to the agent's working directory"], + session_id: Annotated[str, "The session ID"], + checkpoint_id: Annotated[str, "Specific checkpoint ID, or empty for latest"] = "", +) -> str: + """ + Load a specific checkpoint with full state data. 
+ + Returns the complete checkpoint including shared memory snapshot, + execution path, accumulated outputs, and metrics. If checkpoint_id + is empty, loads the latest checkpoint. + """ + session_dir = Path(agent_work_dir) / "sessions" / session_id + checkpoint_dir = session_dir / "checkpoints" + + if not checkpoint_dir.exists(): + return json.dumps({"error": f"No checkpoints found for session: {session_id}"}) + + if not checkpoint_id: + index_data = _read_session_json(checkpoint_dir / "index.json") + if index_data and index_data.get("latest_checkpoint_id"): + checkpoint_id = index_data["latest_checkpoint_id"] + else: + cp_files = sorted(checkpoint_dir.glob("cp_*.json")) + if not cp_files: + return json.dumps({"error": f"No checkpoints found for session: {session_id}"}) + checkpoint_id = cp_files[-1].stem + + cp_path = checkpoint_dir / f"{checkpoint_id}.json" + data = _read_session_json(cp_path) + if data is None: + return json.dumps({"error": f"Checkpoint not found: {checkpoint_id}"}) + + return json.dumps(data, indent=2, default=str) + + +@mcp.tool() +def compare_agent_checkpoints( + agent_work_dir: Annotated[str, "Path to the agent's working directory"], + session_id: Annotated[str, "The session ID"], + checkpoint_id_before: Annotated[str, "The earlier checkpoint ID"], + checkpoint_id_after: Annotated[str, "The later checkpoint ID"], +) -> str: + """ + Compare memory state between two checkpoints. + + Shows what memory keys were added, removed, or changed between + two points in execution. Useful for understanding how data flows + through the agent graph. + """ + checkpoint_dir = Path(agent_work_dir) / "sessions" / session_id / "checkpoints" + + before = _read_session_json(checkpoint_dir / f"{checkpoint_id_before}.json") + if before is None: + return json.dumps({"error": f"Checkpoint not found: {checkpoint_id_before}"}) + + after = _read_session_json(checkpoint_dir / f"{checkpoint_id_after}.json") + if after is None: + return json.dumps({"error": f"Checkpoint not found: {checkpoint_id_after}"}) + + mem_before = before.get("shared_memory", {}) + mem_after = after.get("shared_memory", {}) + + keys_before = set(mem_before.keys()) + keys_after = set(mem_after.keys()) + + added = {k: _truncate_value(mem_after[k]) for k in keys_after - keys_before} + removed = list(keys_before - keys_after) + unchanged = [] + changed = {} + + for k in keys_before & keys_after: + if mem_before[k] == mem_after[k]: + unchanged.append(k) + else: + changed[k] = { + "before": _truncate_value(mem_before[k]), + "after": _truncate_value(mem_after[k]), + } + + path_before = before.get("execution_path", []) + path_after = after.get("execution_path", []) + new_nodes = path_after[len(path_before) :] + + return json.dumps( + { + "session_id": session_id, + "before": { + "checkpoint_id": checkpoint_id_before, + "current_node": before.get("current_node"), + "created_at": before.get("created_at", ""), + }, + "after": { + "checkpoint_id": checkpoint_id_after, + "current_node": after.get("current_node"), + "created_at": after.get("created_at", ""), + }, + "memory_diff": { + "added": added, + "removed": removed, + "changed": changed, + "unchanged": unchanged, + }, + "execution_path_diff": { + "new_nodes": new_nodes, + "path_before": path_before, + "path_after": path_after, + }, + }, + indent=2, + default=str, + ) + + # ============================================================================= # MAIN # ============================================================================= diff --git a/core/framework/runner/cli.py 
b/core/framework/runner/cli.py index 7362da94..82c4c9aa 100644 --- a/core/framework/runner/cli.py +++ b/core/framework/runner/cli.py @@ -332,6 +332,60 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None: resume_parser.set_defaults(func=cmd_resume) +def _load_resume_state( + agent_path: str, session_id: str, checkpoint_id: str | None = None +) -> dict | None: + """Load session or checkpoint state for headless resume. + + Args: + agent_path: Path to the agent folder (e.g., exports/my_agent) + session_id: Session ID to resume from + checkpoint_id: Optional checkpoint ID within the session + + Returns: + session_state dict for executor, or None if not found + """ + agent_name = Path(agent_path).name + agent_work_dir = Path.home() / ".hive" / "agents" / agent_name + session_dir = agent_work_dir / "sessions" / session_id + + if not session_dir.exists(): + return None + + if checkpoint_id: + # Checkpoint-based resume: load checkpoint and extract state + cp_path = session_dir / "checkpoints" / f"{checkpoint_id}.json" + if not cp_path.exists(): + return None + try: + cp_data = json.loads(cp_path.read_text()) + except (json.JSONDecodeError, OSError): + return None + return { + "memory": cp_data.get("shared_memory", {}), + "paused_at": cp_data.get("next_node") or cp_data.get("current_node"), + "execution_path": cp_data.get("execution_path", []), + "node_visit_counts": {}, + } + else: + # Session state resume: load state.json + state_path = session_dir / "state.json" + if not state_path.exists(): + return None + try: + state_data = json.loads(state_path.read_text()) + except (json.JSONDecodeError, OSError): + return None + progress = state_data.get("progress", {}) + paused_at = progress.get("paused_at") or progress.get("resume_from") + return { + "memory": state_data.get("memory", {}), + "paused_at": paused_at, + "execution_path": progress.get("path", []), + "node_visit_counts": progress.get("node_visit_counts", {}), + } + + def cmd_run(args: argparse.Namespace) -> int: """Run an exported agent.""" import logging @@ -375,7 +429,6 @@ def cmd_run(args: argparse.Namespace) -> int: runner = AgentRunner.load( args.agent_path, model=args.model, - enable_tui=True, ) except CredentialError as e: print(f"\n{e}", file=sys.stderr) @@ -419,7 +472,6 @@ def cmd_run(args: argparse.Namespace) -> int: runner = AgentRunner.load( args.agent_path, model=args.model, - enable_tui=False, ) except CredentialError as e: print(f"\n{e}", file=sys.stderr) @@ -428,6 +480,27 @@ def cmd_run(args: argparse.Namespace) -> int: print(f"Error: {e}", file=sys.stderr) return 1 + # Load session/checkpoint state for resume (headless mode) + session_state = None + resume_session = getattr(args, "resume_session", None) + checkpoint = getattr(args, "checkpoint", None) + if resume_session: + session_state = _load_resume_state(args.agent_path, resume_session, checkpoint) + if session_state is None: + print( + f"Error: Could not load session state for {resume_session}", + file=sys.stderr, + ) + return 1 + if not args.quiet: + resume_node = session_state.get("paused_at", "unknown") + if checkpoint: + print(f"Resuming from checkpoint: {checkpoint}") + else: + print(f"Resuming session: {resume_session}") + print(f"Resume point: {resume_node}") + print() + # Auto-inject user_id if the agent expects it but it's not provided entry_input_keys = runner.graph.nodes[0].input_keys if runner.graph.nodes else [] if "user_id" in entry_input_keys and context.get("user_id") is None: @@ -447,7 +520,7 @@ def cmd_run(args: argparse.Namespace) 
-> int: print("=" * 60) print() - result = asyncio.run(runner.run(context)) + result = asyncio.run(runner.run(context, session_state=session_state)) # Format output output = { @@ -1205,7 +1278,6 @@ def cmd_tui(args: argparse.Namespace) -> int: runner = AgentRunner.load( agent_path, model=args.model, - enable_tui=True, ) except CredentialError as e: print(f"\n{e}", file=sys.stderr) diff --git a/core/framework/runner/runner.py b/core/framework/runner/runner.py index 9df11bf1..ffc191a8 100644 --- a/core/framework/runner/runner.py +++ b/core/framework/runner/runner.py @@ -17,17 +17,13 @@ from framework.graph.edge import ( EdgeSpec, GraphSpec, ) -from framework.graph.executor import ExecutionResult, GraphExecutor +from framework.graph.executor import ExecutionResult from framework.graph.node import NodeSpec from framework.llm.provider import LLMProvider, Tool from framework.runner.tool_registry import ToolRegistry - -# Multi-entry-point runtime imports from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime -from framework.runtime.core import Runtime from framework.runtime.execution_stream import EntryPointSpec from framework.runtime.runtime_log_store import RuntimeLogStore -from framework.runtime.runtime_logger import RuntimeLogger if TYPE_CHECKING: from framework.runner.protocol import AgentMessage, CapabilityResponse @@ -271,7 +267,6 @@ class AgentRunner: mock_mode: bool = False, storage_path: Path | None = None, model: str | None = None, - enable_tui: bool = False, intro_message: str = "", ): """ @@ -284,7 +279,6 @@ class AgentRunner: mock_mode: If True, use mock LLM responses storage_path: Path for runtime storage (defaults to temp) model: Model to use (reads from agent config or ~/.hive/configuration.json if None) - enable_tui: If True, forces use of AgentRuntime with EventBus intro_message: Optional greeting shown to user on TUI load """ self.agent_path = agent_path @@ -292,7 +286,6 @@ class AgentRunner: self.goal = goal self.mock_mode = mock_mode self.model = model or self._resolve_default_model() - self.enable_tui = enable_tui self.intro_message = intro_message # Set up storage @@ -313,12 +306,10 @@ class AgentRunner: # Initialize components self._tool_registry = ToolRegistry() - self._runtime: Runtime | None = None self._llm: LLMProvider | None = None - self._executor: GraphExecutor | None = None self._approval_callback: Callable | None = None - # Multi-entry-point support (AgentRuntime) + # AgentRuntime — unified execution path for all agents self._agent_runtime: AgentRuntime | None = None self._uses_async_entry_points = self.graph.has_async_entry_points() @@ -466,7 +457,6 @@ class AgentRunner: mock_mode: bool = False, storage_path: Path | None = None, model: str | None = None, - enable_tui: bool = False, ) -> "AgentRunner": """ Load an agent from an export folder. 
@@ -480,7 +470,6 @@ class AgentRunner: mock_mode: If True, use mock LLM responses storage_path: Path for runtime storage (defaults to ~/.hive/agents/{name}) model: LLM model to use (reads from agent's default_config if None) - enable_tui: If True, forces use of AgentRuntime with EventBus Returns: AgentRunner instance ready to run @@ -541,7 +530,6 @@ class AgentRunner: mock_mode=mock_mode, storage_path=storage_path, model=model, - enable_tui=enable_tui, intro_message=intro_message, ) @@ -560,7 +548,6 @@ class AgentRunner: mock_mode=mock_mode, storage_path=storage_path, model=model, - enable_tui=enable_tui, ) def register_tool( @@ -650,9 +637,6 @@ class AgentRunner: callback: Function to call for approval (receives node info, returns bool) """ self._approval_callback = callback - # If executor already exists, update it - if self._executor is not None: - self._executor.approval_callback = callback def _setup(self) -> None: """Set up runtime, LLM, and executor.""" @@ -717,16 +701,11 @@ class AgentRunner: print(f"Warning: {api_key_env} not set. LLM calls will fail.") print(f"Set it with: export {api_key_env}=your-api-key") - # Get tools for executor/runtime + # Get tools for runtime tools = list(self._tool_registry.get_tools().values()) tool_executor = self._tool_registry.get_executor() - if self._uses_async_entry_points or self.enable_tui: - # Multi-entry-point mode or TUI mode: use AgentRuntime - self._setup_agent_runtime(tools, tool_executor) - else: - # Single-entry-point mode: use legacy GraphExecutor - self._setup_legacy_executor(tools, tool_executor) + self._setup_agent_runtime(tools, tool_executor) def _get_api_key_env_var(self, model: str) -> str | None: """Get the environment variable name for the API key based on model name.""" @@ -741,7 +720,7 @@ class AgentRunner: elif model_lower.startswith("anthropic/") or model_lower.startswith("claude"): return "ANTHROPIC_API_KEY" elif model_lower.startswith("gemini/") or model_lower.startswith("google/"): - return "GOOGLE_API_KEY" + return "GEMINI_API_KEY" elif model_lower.startswith("mistral/"): return "MISTRAL_API_KEY" elif model_lower.startswith("groq/"): @@ -787,26 +766,6 @@ class AgentRunner: except Exception: return None - def _setup_legacy_executor(self, tools: list, tool_executor: Callable | None) -> None: - """Set up legacy single-entry-point execution using GraphExecutor.""" - # Create runtime - self._runtime = Runtime(storage_path=self._storage_path) - - # Create runtime logger - log_store = RuntimeLogStore(base_path=self._storage_path / "runtime_logs") - runtime_logger = RuntimeLogger(store=log_store, agent_id=self.graph.id) - - # Create executor - self._executor = GraphExecutor( - runtime=self._runtime, - llm=self._llm, - tools=tools, - tool_executor=tool_executor, - approval_callback=self._approval_callback, - runtime_logger=runtime_logger, - loop_config=self.graph.loop_config, - ) - def _setup_agent_runtime(self, tools: list, tool_executor: Callable | None) -> None: """Set up multi-entry-point execution using AgentRuntime.""" # Convert AsyncEntryPointSpec to EntryPointSpec for AgentRuntime @@ -824,9 +783,9 @@ class AgentRunner: ) entry_points.append(ep) - # If TUI enabled but no entry points (single-entry agent), create default - if not entry_points and self.enable_tui and self.graph.entry_node: - logger.info("Creating default entry point for TUI") + # Single-entry agent with no async entry points: create a default entry point + if not entry_points and self.graph.entry_node: + logger.info("Creating default entry point for 
single-entry agent") entry_points.append( EntryPointSpec( id="default", @@ -905,32 +864,9 @@ class AgentRunner: error=error_msg, ) - if self._uses_async_entry_points or self.enable_tui: - # Multi-entry-point mode: use AgentRuntime - return await self._run_with_agent_runtime( - input_data=input_data or {}, - entry_point_id=entry_point_id, - ) - else: - # Legacy single-entry-point mode - return await self._run_with_executor( - input_data=input_data or {}, - session_state=session_state, - ) - - async def _run_with_executor( - self, - input_data: dict, - session_state: dict | None = None, - ) -> ExecutionResult: - """Run using legacy GraphExecutor (single entry point).""" - if self._executor is None: - self._setup() - - return await self._executor.execute( - graph=self.graph, - goal=self.goal, - input_data=input_data, + return await self._run_with_agent_runtime( + input_data=input_data or {}, + entry_point_id=entry_point_id, session_state=session_state, ) @@ -938,8 +874,11 @@ class AgentRunner: self, input_data: dict, entry_point_id: str | None = None, + session_state: dict | None = None, ) -> ExecutionResult: - """Run using AgentRuntime (multi-entry-point).""" + """Run using AgentRuntime.""" + import sys + if self._agent_runtime is None: self._setup() @@ -947,6 +886,52 @@ class AgentRunner: if not self._agent_runtime.is_running: await self._agent_runtime.start() + # Set up stdin-based I/O for client-facing nodes in headless mode. + # When a client_facing EventLoopNode calls ask_user(), it emits + # CLIENT_INPUT_REQUESTED on the event bus and blocks. We subscribe + # a handler that prints the prompt and reads from stdin, then injects + # the user's response back into the node to unblock it. + has_client_facing = any(n.client_facing for n in self.graph.nodes) + sub_ids: list[str] = [] + + if has_client_facing and sys.stdin.isatty(): + from framework.runtime.event_bus import EventType + + runtime = self._agent_runtime + + async def _handle_client_output(event): + """Print agent output to stdout as it streams.""" + content = event.data.get("content", "") + if content: + print(content, end="", flush=True) + + async def _handle_input_requested(event): + """Read user input from stdin and inject it into the node.""" + import asyncio + + node_id = event.node_id + try: + loop = asyncio.get_event_loop() + user_input = await loop.run_in_executor(None, input, "\n>>> ") + except EOFError: + user_input = "" + + # Inject into the waiting EventLoopNode via runtime + await runtime.inject_input(node_id, user_input) + + sub_ids.append( + runtime.subscribe_to_events( + event_types=[EventType.CLIENT_OUTPUT_DELTA], + handler=_handle_client_output, + ) + ) + sub_ids.append( + runtime.subscribe_to_events( + event_types=[EventType.CLIENT_INPUT_REQUESTED], + handler=_handle_input_requested, + ) + ) + # Determine entry point if entry_point_id is None: # Use first entry point or "default" if no entry points defined @@ -956,44 +941,38 @@ class AgentRunner: else: entry_point_id = "default" - # Trigger and wait for result - result = await self._agent_runtime.trigger_and_wait( - entry_point_id=entry_point_id, - input_data=input_data, - ) - - # Return result or create error result - if result is not None: - return result - else: - return ExecutionResult( - success=False, - error="Execution timed out or failed to complete", + try: + # Trigger and wait for result + result = await self._agent_runtime.trigger_and_wait( + entry_point_id=entry_point_id, + input_data=input_data, + session_state=session_state, ) - # === 
Multi-Entry-Point API (for agents with async_entry_points) === + # Return result or create error result + if result is not None: + return result + else: + return ExecutionResult( + success=False, + error="Execution timed out or failed to complete", + ) + finally: + # Clean up subscriptions + for sub_id in sub_ids: + self._agent_runtime.unsubscribe_from_events(sub_id) + + # === Runtime API === async def start(self) -> None: - """ - Start the agent runtime (for multi-entry-point agents). - - This starts all registered entry points and allows concurrent execution. - For single-entry-point agents, this is a no-op. - """ - if not self._uses_async_entry_points: - return - + """Start the agent runtime.""" if self._agent_runtime is None: self._setup() await self._agent_runtime.start() async def stop(self) -> None: - """ - Stop the agent runtime (for multi-entry-point agents). - - For single-entry-point agents, this is a no-op. - """ + """Stop the agent runtime.""" if self._agent_runtime is not None: await self._agent_runtime.stop() @@ -1006,7 +985,7 @@ class AgentRunner: """ Trigger execution at a specific entry point (non-blocking). - For multi-entry-point agents only. Returns execution ID for tracking. + Returns execution ID for tracking. Args: entry_point_id: Which entry point to trigger @@ -1015,16 +994,7 @@ class AgentRunner: Returns: Execution ID for tracking - - Raises: - RuntimeError: If agent doesn't use async entry points """ - if not self._uses_async_entry_points: - raise RuntimeError( - "trigger() is only available for multi-entry-point agents. " - "Use run() for single-entry-point agents." - ) - if self._agent_runtime is None: self._setup() @@ -1041,19 +1011,9 @@ class AgentRunner: """ Get goal progress across all execution streams. - For multi-entry-point agents only. - Returns: Dict with overall_progress, criteria_status, constraint_violations, etc. - - Raises: - RuntimeError: If agent doesn't use async entry points """ - if not self._uses_async_entry_points: - raise RuntimeError( - "get_goal_progress() is only available for multi-entry-point agents." - ) - if self._agent_runtime is None: self._setup() @@ -1061,14 +1021,11 @@ class AgentRunner: def get_entry_points(self) -> list[EntryPointSpec]: """ - Get all registered entry points (for multi-entry-point agents). + Get all registered entry points. Returns: List of EntryPointSpec objects """ - if not self._uses_async_entry_points: - return [] - if self._agent_runtime is None: self._setup() @@ -1492,7 +1449,7 @@ Respond with JSON only: self._temp_dir = None async def cleanup_async(self) -> None: - """Clean up resources (asynchronous - for multi-entry-point agents).""" + """Clean up resources (asynchronous).""" # Stop agent runtime if running if self._agent_runtime is not None and self._agent_runtime.is_running: await self._agent_runtime.stop() @@ -1503,8 +1460,7 @@ Respond with JSON only: async def __aenter__(self) -> "AgentRunner": """Context manager entry.""" self._setup() - # Start runtime for multi-entry-point agents - if self._uses_async_entry_points and self._agent_runtime is not None: + if self._agent_runtime is not None: await self._agent_runtime.start() return self diff --git a/core/framework/runtime/README.md b/core/framework/runtime/README.md new file mode 100644 index 00000000..cc71e48c --- /dev/null +++ b/core/framework/runtime/README.md @@ -0,0 +1,172 @@ +# Agent Runtime + +Unified execution system for all Hive agents. Every agent — single-entry or multi-entry, headless or TUI — runs through the same runtime stack. 
+ +## Topology + +``` + AgentRunner.load(agent_path) + | + AgentRunner + (factory + public API) + | + _setup_agent_runtime() + | + AgentRuntime + (lifecycle + orchestration) + / | \ + Stream A Stream B Stream C ← one per entry point + | | | + GraphExecutor GraphExecutor GraphExecutor + | | | + Node → Node → Node (graph traversal) +``` + +Single-entry agents get a `"default"` entry point automatically. There is no separate code path. + +## Components + +| Component | File | Role | +|---|---|---| +| `AgentRunner` | `runner/runner.py` | Load agents, configure tools/LLM, expose high-level API | +| `AgentRuntime` | `runtime/agent_runtime.py` | Lifecycle management, entry point routing, event bus | +| `ExecutionStream` | `runtime/execution_stream.py` | Per-entry-point execution queue, session persistence | +| `GraphExecutor` | `graph/executor.py` | Node traversal, tool dispatch, checkpointing | +| `EventBus` | `runtime/event_bus.py` | Pub/sub for execution events (streaming, I/O) | +| `SharedStateManager` | `runtime/shared_state.py` | Cross-stream state with isolation levels | +| `OutcomeAggregator` | `runtime/outcome_aggregator.py` | Goal progress tracking across streams | +| `SessionStore` | `storage/session_store.py` | Session state persistence (`sessions/{id}/state.json`) | + +## Programming Interface + +### AgentRunner (high-level) + +```python +from framework.runner import AgentRunner + +# Load and run +runner = AgentRunner.load("exports/my_agent", model="anthropic/claude-sonnet-4-20250514") +result = await runner.run({"query": "hello"}) + +# Resume from paused session +result = await runner.run({"query": "continue"}, session_state=saved_state) + +# Lifecycle +await runner.start() # Start the runtime +await runner.stop() # Stop the runtime +exec_id = await runner.trigger("default", {}) # Non-blocking trigger +progress = await runner.get_goal_progress() # Goal evaluation +entry_points = runner.get_entry_points() # List entry points + +# Context manager +async with AgentRunner.load("exports/my_agent") as runner: + result = await runner.run({"query": "hello"}) + +# Cleanup +runner.cleanup() # Synchronous +await runner.cleanup_async() # Asynchronous +``` + +### AgentRuntime (lower-level) + +```python +from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime +from framework.runtime.execution_stream import EntryPointSpec + +# Create runtime with entry points +runtime = create_agent_runtime( + graph=graph, + goal=goal, + storage_path=Path("~/.hive/agents/my_agent"), + entry_points=[ + EntryPointSpec(id="default", name="Default", entry_node="start", trigger_type="manual"), + ], + llm=llm, + tools=tools, + tool_executor=tool_executor, + checkpoint_config=checkpoint_config, +) + +# Lifecycle +await runtime.start() +await runtime.stop() + +# Execution +exec_id = await runtime.trigger("default", {"query": "hello"}) # Non-blocking +result = await runtime.trigger_and_wait("default", {"query": "hello"}) # Blocking +result = await runtime.trigger_and_wait("default", {}, session_state=state) # Resume + +# Client-facing node I/O +await runtime.inject_input(node_id="chat", content="user response") + +# Events +sub_id = runtime.subscribe_to_events( + event_types=[EventType.CLIENT_OUTPUT_DELTA], + handler=my_handler, +) +runtime.unsubscribe_from_events(sub_id) + +# Inspection +runtime.is_running # bool +runtime.event_bus # EventBus +runtime.state_manager # SharedStateManager +runtime.get_stats() # Runtime statistics +``` + +## Execution Flow + +1. 
`AgentRunner.run()` calls `AgentRuntime.trigger_and_wait()` +2. `AgentRuntime` routes to the `ExecutionStream` for the entry point +3. `ExecutionStream` creates a `GraphExecutor` and calls `execute()` +4. `GraphExecutor` traverses nodes, dispatches tools, manages checkpoints +5. `ExecutionResult` flows back up through the stack +6. `ExecutionStream` writes session state to disk + +## Session Resume + +All execution paths support session resume: + +```python +# First run (agent pauses at a client-facing node) +result = await runner.run({"query": "start task"}) +# result.paused_at = "review-node" +# result.session_state = {"memory": {...}, "paused_at": "review-node", ...} + +# Resume +result = await runner.run({"input": "approved"}, session_state=result.session_state) +``` + +Session state flows: `AgentRunner.run()` → `AgentRuntime.trigger_and_wait()` → `ExecutionStream.execute()` → `GraphExecutor.execute()`. + +Checkpoints are saved at node boundaries (`sessions/{id}/checkpoints/`) for crash recovery. + +## Event Bus + +The `EventBus` provides real-time execution visibility: + +| Event | When | +|---|---| +| `NODE_STARTED` | Node begins execution | +| `NODE_COMPLETED` | Node finishes | +| `TOOL_CALL_STARTED` | Tool invocation begins | +| `TOOL_CALL_COMPLETED` | Tool invocation finishes | +| `CLIENT_OUTPUT_DELTA` | Agent streams text to user | +| `CLIENT_INPUT_REQUESTED` | Agent needs user input | +| `EXECUTION_COMPLETED` | Full execution finishes | + +In headless mode, `AgentRunner` subscribes to `CLIENT_OUTPUT_DELTA` and `CLIENT_INPUT_REQUESTED` to print output and read stdin. In TUI mode, `AdenTUI` subscribes to route events to UI widgets. + +## Storage Layout + +``` +~/.hive/agents/{agent_name}/ + sessions/ + session_YYYYMMDD_HHMMSS_{uuid}/ + state.json # Session state (status, memory, progress) + checkpoints/ # Node-boundary snapshots + logs/ + summary.json # Execution summary + details.jsonl # Detailed event log + tool_logs.jsonl # Tool call log + runtime_logs/ # Cross-session runtime logs +``` diff --git a/core/framework/testing/prompts.py b/core/framework/testing/prompts.py index 3bbe8898..08df7625 100644 --- a/core/framework/testing/prompts.py +++ b/core/framework/testing/prompts.py @@ -3,6 +3,10 @@ Pytest templates for test file generation. These templates provide headers and fixtures for pytest-compatible async tests. Tests are written to exports/{agent}/tests/ as Python files and run with pytest. + +Tests use AgentRunner.load() — the canonical runtime path — which creates +AgentRuntime, ExecutionStream, and proper session/log storage. For agents +with client-facing nodes, an auto_responder fixture handles input injection. """ # Template for the test file header (imports and fixtures) @@ -11,17 +15,19 @@ PYTEST_TEST_FILE_HEADER = '''""" {description} -REQUIRES: API_KEY (OpenAI or Anthropic) for real testing. +REQUIRES: API_KEY for execution tests. Structure tests run without keys. """ import os import pytest -from {agent_module} import default_agent +from pathlib import Path + +# Agent path resolved from this test file's location +AGENT_PATH = Path(__file__).resolve().parents[1] def _get_api_key(): """Get API key from CredentialStoreAdapter or environment.""" - # 1. Try CredentialStoreAdapter for Anthropic try: from aden_tools.credentials import CredentialStoreAdapter creds = CredentialStoreAdapter.default() @@ -29,28 +35,43 @@ def _get_api_key(): return creds.get("anthropic") except (ImportError, KeyError): pass - - # 2. 
Fallback to standard environment variables for OpenAI and others return ( os.environ.get("OPENAI_API_KEY") or os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("CEREBRAS_API_KEY") or - os.environ.get("GROQ_API_KEY") + os.environ.get("GROQ_API_KEY") or + os.environ.get("GEMINI_API_KEY") ) # Skip all tests if no API key and not in mock mode pytestmark = pytest.mark.skipif( not _get_api_key() and not os.environ.get("MOCK_MODE"), - reason="API key required. Please set OPENAI_API_KEY, ANTHROPIC_API_KEY, or use MOCK_MODE=1." + reason="API key required. Set ANTHROPIC_API_KEY or use MOCK_MODE=1 for structure tests." ) ''' # Template for conftest.py with shared fixtures PYTEST_CONFTEST_TEMPLATE = '''"""Shared test fixtures for {agent_name} tests.""" +import json import os +import re +import sys +from pathlib import Path + +# Add exports/ and core/ to sys.path so the agent package and framework are importable +_repo_root = Path(__file__).resolve().parents[3] +for _p in ["exports", "core"]: + _path = str(_repo_root / _p) + if _path not in sys.path: + sys.path.insert(0, _path) + import pytest +from framework.runner.runner import AgentRunner +from framework.runtime.event_bus import EventType + +AGENT_PATH = Path(__file__).resolve().parents[1] def _get_api_key(): @@ -62,19 +83,80 @@ def _get_api_key(): return creds.get("anthropic") except (ImportError, KeyError): pass - return ( os.environ.get("OPENAI_API_KEY") or os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("CEREBRAS_API_KEY") or - os.environ.get("GROQ_API_KEY") + os.environ.get("GROQ_API_KEY") or + os.environ.get("GEMINI_API_KEY") ) -@pytest.fixture +@pytest.fixture(scope="session") def mock_mode(): - """Check if running in mock mode.""" - return bool(os.environ.get("MOCK_MODE")) + """Return True if running in mock mode (no API key or MOCK_MODE=1).""" + if os.environ.get("MOCK_MODE"): + return True + return not bool(_get_api_key()) + + +@pytest.fixture(scope="session") +async def runner(tmp_path_factory, mock_mode): + """Create an AgentRunner using the canonical runtime path. + + Uses tmp_path_factory for storage so tests don't pollute ~/.hive/agents/. + Goes through AgentRunner.load() -> _setup() -> AgentRuntime, the same + path as ``hive run``. + """ + storage = tmp_path_factory.mktemp("agent_storage") + r = AgentRunner.load( + AGENT_PATH, + mock_mode=mock_mode, + storage_path=storage, + ) + r._setup() + yield r + await r.cleanup_async() + + +@pytest.fixture +def auto_responder(runner): + """Auto-respond to client-facing node input requests. + + Subscribes to CLIENT_INPUT_REQUESTED events and injects a response + to unblock the node. 
Customize the response before calling start(): + + auto_responder.response = "approve the report" + await auto_responder.start() + """ + class AutoResponder: + def __init__(self, runner_instance): + self._runner = runner_instance + self.response = "yes, proceed" + self.interactions = [] + self._sub_id = None + + async def start(self): + runtime = self._runner._agent_runtime + if runtime is None: + return + + async def _handle(event): + self.interactions.append(event.node_id) + await runtime.inject_input(event.node_id, self.response) + + self._sub_id = runtime.subscribe_to_events( + event_types=[EventType.CLIENT_INPUT_REQUESTED], + handler=_handle, + ) + + async def stop(self): + runtime = self._runner._agent_runtime + if self._sub_id and runtime: + runtime.unsubscribe_from_events(self._sub_id) + self._sub_id = None + + return AutoResponder(runner) @pytest.fixture(scope="session", autouse=True) @@ -82,19 +164,51 @@ def check_api_key(): """Ensure API key is set for real testing.""" if not _get_api_key(): if os.environ.get("MOCK_MODE"): - print("\\n⚠️ Running in MOCK MODE - structure validation only") - print(" This does NOT test LLM behavior or agent quality") - print(" Set OPENAI_API_KEY or ANTHROPIC_API_KEY for real testing\\n") + print("\\n Running in MOCK MODE - structure validation only") + print(" Set ANTHROPIC_API_KEY for real testing\\n") else: pytest.fail( - "\\n❌ No API key found!\\n\\n" - "Real testing requires an API key. Choose one:\\n" - "1. Set OpenAI key:\\n" - " export OPENAI_API_KEY='your-key-here'\\n" - "2. Set Anthropic key:\\n" - " export ANTHROPIC_API_KEY='your-key-here'\\n" - "3. Run structure validation only:\\n" - " MOCK_MODE=1 pytest exports/{agent_name}/tests/\\n\\n" - "Note: Mock mode does NOT validate agent behavior or quality." + "\\nNo API key found!\\n" + "Set ANTHROPIC_API_KEY or use MOCK_MODE=1 for structure tests.\\n" ) + + +def parse_json_from_output(result, key): + """Parse JSON from agent output (framework may store full LLM response as string).""" + val = result.output.get(key, "") + if isinstance(val, (dict, list)): + return val + if isinstance(val, str): + json_text = re.sub(r"```json\\s*|\\s*```", "", val).strip() + try: + return json.loads(json_text) + except (json.JSONDecodeError, TypeError): + return val + return val + + +def safe_get_nested(result, key_path, default=None): + """Safely get nested value from result.output.""" + output = result.output or {{}} + current = output + for key in key_path: + if isinstance(current, dict): + current = current.get(key) + elif isinstance(current, str): + try: + json_text = re.sub(r"```json\\s*|\\s*```", "", current).strip() + parsed = json.loads(json_text) + if isinstance(parsed, dict): + current = parsed.get(key) + else: + return default + except json.JSONDecodeError: + return default + else: + return default + return current if current is not None else default + + +pytest.parse_json_from_output = parse_json_from_output +pytest.safe_get_nested = safe_get_nested ''' diff --git a/core/tests/test_event_loop_integration.py b/core/tests/test_event_loop_integration.py index 90e91b34..d19f008a 100644 --- a/core/tests/test_event_loop_integration.py +++ b/core/tests/test_event_loop_integration.py @@ -951,7 +951,7 @@ async def test_client_facing_node_streams_output(): config=LoopConfig(max_iterations=5), ) - # Text-only on client_facing no longer blocks (no ask_user called), + # Text-only on client_facing does not block (no ask_user called), # so the node completes without needing a shutdown workaround. 
result = await node.execute(ctx) diff --git a/docs/developer-guide.md b/docs/developer-guide.md index 648de682..94cfebeb 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -116,6 +116,16 @@ Skills are also available in Cursor. To enable: 3. Restart Cursor to load the MCP servers from `.cursor/mcp.json` 4. Type `/` in Agent chat and search for skills (e.g., `/hive-create`) + +### Opencode Support +To enable Opencode integration: + +1. Create/Ensure `.opencode/` directory exists +2. Configure MCP servers in `.opencode/mcp.json` +3. Restart Opencode to load the MCP servers +4. Switch to the Hive agent +* **Tools:** Accesses `agent-builder` and standard `tools` via standard MCP protocols over stdio. + ### Verify Setup ```bash diff --git a/docs/environment-setup.md b/docs/environment-setup.md index c37cf9b7..48911abc 100644 --- a/docs/environment-setup.md +++ b/docs/environment-setup.md @@ -65,28 +65,26 @@ source .venv/bin/activate If you prefer to set up manually or the script fails: -### 1. Install Core Framework +### 1. Sync Workspace Dependencies ```bash -cd core -uv pip install -e . +# From repository root - this creates a single .venv at the root +uv sync ``` -### 2. Install Tools Package +> **Note:** The `uv sync` command uses the workspace configuration in `pyproject.toml` to install both `core` (framework) and `tools` (aden_tools) packages together. This is the recommended approach over individual `pip install -e` commands which may fail due to circular dependencies. + +### 2. Activate the Virtual Environment ```bash -cd tools -uv pip install -e . +# Linux/macOS +source .venv/bin/activate + +# Windows (PowerShell) +.venv\Scripts\Activate.ps1 ``` -### 3. Upgrade OpenAI Package - -```bash -# litellm requires openai >= 1.0.0 -uv pip install --upgrade "openai>=1.0.0" -``` - -### 4. Verify Installation +### 3. Verify Installation ```bash uv run python -c "import framework; print('✓ framework OK')" @@ -281,18 +279,20 @@ Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass ### "ModuleNotFoundError: No module named 'framework'" -**Solution:** Install the core package: +**Solution:** Sync the workspace dependencies: ```bash -cd core && uv pip install -e . +# From repository root +uv sync ``` ### "ModuleNotFoundError: No module named 'aden_tools'" -**Solution:** Install the tools package: +**Solution:** Sync the workspace dependencies: ```bash -cd tools && uv pip install -e . +# From repository root +uv sync ``` Or run the setup script: @@ -350,15 +350,14 @@ The Hive framework consists of three Python packages: ``` hive/ +├── .venv/ # Single workspace venv (created by uv sync) ├── core/ # Core framework (runtime, graph executor, LLM providers) │ ├── framework/ -│ ├── .venv/ # Created by quickstart.sh │ └── pyproject.toml │ ├── tools/ # Tools and MCP servers │ ├── src/ │ │ └── aden_tools/ # Actual package location -│ ├── .venv/ # Created by quickstart.sh │ └── pyproject.toml │ ├── exports/ # Agent packages (user-created, gitignored) @@ -368,28 +367,29 @@ hive/ └── templates/ # Pre-built template agents ``` -## Separate Virtual Environments +## Virtual Environment Setup -Hive primarily uses **uv** to create and manage separate virtual environments for `core` and `tools`. +Hive uses **uv workspaces** to manage dependencies. When you run `uv sync` from the repository root, a **single `.venv`** is created at the root containing both packages. 
-The project uses separate virtual environments to: +### Benefits of Workspace Mode -- Isolate dependencies and avoid conflicts -- Allow independent development and testing of each package -- Enable MCP servers to run with their specific dependencies +- **Single environment** - No need to switch between multiple venvs +- **Unified dependencies** - Consistent package versions across core and tools +- **Simpler development** - One activation, access to everything ### How It Works -When you run `./quickstart.sh`, `uv` sets up: +When you run `./quickstart.sh` or `uv sync`: -1. **core/.venv/** - Contains the `framework` package and its dependencies (anthropic, litellm, mcp, etc.) -2. **tools/.venv/** - Contains the `aden_tools` package and its dependencies (beautifulsoup4, pandas, etc.) +1. **/.venv/** - Single root virtual environment is created +2. Both `framework` (from core/) and `aden_tools` (from tools/) are installed +3. All dependencies (anthropic, litellm, beautifulsoup4, pandas, etc.) are resolved together -If you need to refresh environments manually, use `uv`: +If you need to refresh the environment: ```bash -cd core && uv sync -cd ../tools && uv sync +# From repository root +uv sync ``` ### Cross-Package Imports @@ -521,7 +521,15 @@ export ADEN_CREDENTIALS_PATH="/custom/path" # Agent storage location (default: /tmp) export AGENT_STORAGE_PATH="/custom/storage" ``` +## Opencode Setup +[Opencode](https://github.com/opencode-ai/opencode) is fully supported as a coding agent. + +### Automatic Setup +Run the quickstart script in the root directory: +```bash +./quickstart.sh +``` ## Additional Resources - **Framework Documentation:** [core/README.md](../core/README.md) diff --git a/docs/quizzes/README.md b/docs/quizzes/README.md index 1ec206e1..de81d2e6 100644 --- a/docs/quizzes/README.md +++ b/docs/quizzes/README.md @@ -40,7 +40,7 @@ Welcome to the Aden Engineering Challenges! These quizzes are designed for stude After completing challenges, submit your work by: 1. Creating a GitHub Gist with your answers -2. Emailing the link to `careers@adenhq.com` with subject: `[Engineering Challenge] Your Name - Track Name` +2. Emailing the link to `contact@adenhq.com` with subject: `[Engineering Challenge] Your Name - Track Name` 3. Include your GitHub username in the email ## Getting Help diff --git a/docs/why-conditional-edge-priority.md b/docs/why-conditional-edge-priority.md new file mode 100644 index 00000000..a664fe0b --- /dev/null +++ b/docs/why-conditional-edge-priority.md @@ -0,0 +1,42 @@ +# Why Conditional Edges Need Priority (Function Nodes) + +## The problem + +Function nodes return everything they computed. They don't pick one output key — they return all of them. + +```python +def score_lead(inputs): + score = compute_score(inputs["profile"]) + return { + "score": score, + "is_high_value": score > 80, + "needs_enrichment": score > 50 and not inputs["profile"].get("company"), + } +``` + +Lead comes in: score 92, no company on file. Output: `{"score": 92, "is_high_value": True, "needs_enrichment": True}`. + +Two conditional edges leaving this node: + +``` +Edge A: needs_enrichment == True → enrichment node +Edge B: is_high_value == True → outreach node +``` + +Both are true. Without priority, the graph either fans out to both (wrong — you'd email someone while still enriching their data) or picks one randomly (wrong — non-deterministic).
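To make the ambiguity concrete, here is a minimal sketch that checks both edge conditions against that output. The conditions are written as plain Python predicates purely for illustration; this is not the framework's actual edge specification.

```python
# Output of score_lead for the lead described above
output = {"score": 92, "is_high_value": True, "needs_enrichment": True}

# Edge conditions modeled as plain predicates (illustration only)
edge_a_matches = output["needs_enrichment"] is True  # -> enrichment node
edge_b_matches = output["is_high_value"] is True     # -> outreach node

# Both edges match the same output, so without a tie-breaking rule the
# executor has no principled way to choose between them.
assert edge_a_matches and edge_b_matches
```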
+ +## Priority fixes it + +``` +Edge A: needs_enrichment == True priority=2 (higher = checked first) +Edge B: is_high_value == True priority=1 +Edge C: is_high_value == False priority=0 +``` + +Executor keeps only the highest-priority matching group. A wins. Lead gets enriched first, loops back, gets re-scored — now `needs_enrichment` is false, B wins, outreach happens. + +## Why event loop nodes don't need this + +The LLM understands "if/else." You tell it in the prompt: "if needs enrichment, set `needs_enrichment`. Otherwise if high value, set `approved`." It picks one. Only one conditional edge matches. + +A function just returns a dict. It doesn't do "otherwise." Priority is the "otherwise" for function nodes. diff --git a/quickstart.sh b/quickstart.sh index 3d44fd86..39a06ad4 100755 --- a/quickstart.sh +++ b/quickstart.sh @@ -303,8 +303,8 @@ if [ "$USE_ASSOC_ARRAYS" = true ]; then ) declare -A DEFAULT_MODELS=( - ["anthropic"]="claude-opus-4-6" - ["openai"]="gpt-5.2" + ["anthropic"]="claude-haiku-4-5" + ["openai"]="gpt-5-mini" ["gemini"]="gemini-3-flash-preview" ["groq"]="moonshotai/kimi-k2-instruct-0905" ["cerebras"]="zai-glm-4.7" @@ -945,6 +945,16 @@ else echo -e "${YELLOW}--${NC}" fi +echo -n " ⬡ local settings... " +if [ -f "$SCRIPT_DIR/.claude/settings.local.json" ]; then + echo -e "${GREEN}ok${NC}" +elif [ -f "$SCRIPT_DIR/.claude/settings.local.json.example" ]; then + cp "$SCRIPT_DIR/.claude/settings.local.json.example" "$SCRIPT_DIR/.claude/settings.local.json" + echo -e "${GREEN}copied from example${NC}" +else + echo -e "${YELLOW}--${NC}" +fi + echo -n " ⬡ credential store... " if [ -n "$HIVE_CREDENTIAL_KEY" ] && [ -d "$HOME/.hive/credentials/credentials" ]; then echo -e "${GREEN}ok${NC}" @@ -1050,4 +1060,4 @@ if [ -n "$SELECTED_PROVIDER_ID" ] || [ -n "$HIVE_CREDENTIAL_KEY" ]; then fi echo -e "${DIM}Run ./quickstart.sh again to reconfigure.${NC}" -echo "" \ No newline at end of file +echo ""
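As a closing illustration, the sketch below strings the session and checkpoint tools defined above into one recovery flow. The agent name is a placeholder, the tools are shown as direct Python calls for readability (in practice they are invoked over MCP and each returns a JSON string), and the checkpoint fields read at the end assume the same layout that compare_agent_checkpoints reads.

```python
import json
from pathlib import Path

# Placeholder agent name; any agent under ~/.hive/agents/ follows the same layout
work_dir = str(Path.home() / ".hive" / "agents" / "my_agent")

# 1. Find failed sessions
failed = json.loads(list_agent_sessions(agent_work_dir=work_dir, status="failed"))
session_id = failed["sessions"][0]["session_id"]

# 2. Check resumability and where execution stopped (memory values are excluded here)
state = json.loads(get_agent_session_state(agent_work_dir=work_dir, session_id=session_id))
print(state["is_resumable"], state.get("memory_keys"))

# 3. Find clean checkpoints taken before the failure
cps = json.loads(
    list_agent_checkpoints(agent_work_dir=work_dir, session_id=session_id, is_clean="true")
)
checkpoint_id = cps["latest_checkpoint_id"]

# 4. Inspect the snapshot at that checkpoint before recommending a resume
cp = json.loads(
    get_agent_checkpoint(
        agent_work_dir=work_dir, session_id=session_id, checkpoint_id=checkpoint_id
    )
)
print(cp.get("current_node"), list(cp.get("shared_memory", {}).keys()))
```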