Compare commits

..

33 Commits

Author SHA1 Message Date
bryan fb203b5bdf update oauth to refresh token 2026-02-06 19:43:30 -08:00
Timothy @aden 7e40d6950a Merge pull request #3871 from TimothyZhang7/main
fix(micro-fix): uv paths in templates
2026-02-06 17:07:19 -08:00
Timothy 590bfa92cb chore: fix mcp server default config 2026-02-06 17:04:03 -08:00
Timothy f0e89a1720 fix: mcp server config with uv 2026-02-06 17:01:42 -08:00
Timothy @aden 575563b1e8 Merge pull request #3870 from adenhq/feat/multi-level-logging
fix: hardening hive cli setup
2026-02-06 16:37:37 -08:00
Timothy 82ea0e47ce fix: hardening hive cli setup 2026-02-06 16:31:31 -08:00
RichardTang-Aden 2f57ca10f7 Merge pull request #3862 from adenhq/feat/hive-tui
(micro-fix): documentation update
2026-02-06 16:19:46 -08:00
RichardTang-Aden 75c2d541c4 Merge branch 'main' into feat/hive-tui 2026-02-06 16:19:30 -08:00
Richard Tang b666f8b50b docs: minor doc update 2026-02-06 16:16:56 -08:00
RichardTang-Aden 09f9322676 Merge pull request #3863 from RichardTang-Aden/fix-remove-old-mock-mode
Fix remove old mock mode
2026-02-06 16:02:01 -08:00
Richard Tang f9a864ef93 fix: remove mock mode in the template 2026-02-06 15:59:48 -08:00
Richard Tang 27f28afe9c fix: remove --mock in the codebase + documentation 2026-02-06 15:59:22 -08:00
Timothy @aden 8f85722fef Merge pull request #3715 from adenhq/feat/multi-level-logging
Feat/multi level logging
2026-02-06 15:59:16 -08:00
bryan 5588445a01 documentation update 2026-02-06 15:59:01 -08:00
Timothy 40529b5722 fix: debugger to instruct on hive tui 2026-02-06 15:56:13 -08:00
Timothy @aden cee632f50c Merge pull request #3855 from adenhq/feat/hive-tui
update tui to support menu, highlight/copy, update quickstart
2026-02-06 15:24:10 -08:00
bryan 3453e3aa05 Merge branch 'feat/hive-tui' into feat/multi-level-logging 2026-02-06 15:21:52 -08:00
Timothy 8de637c421 fix: deprecated tests 2026-02-06 14:00:31 -08:00
Timothy 6c75de862c fix: skip outdated tests 2026-02-06 13:46:12 -08:00
Timothy 2971134882 docs: runtime logging structure 2026-02-06 13:26:53 -08:00
Timothy 6e79860b43 feat: hive debugger skill 2026-02-06 13:22:25 -08:00
bryan 74d0287ec5 update tui to support menu, highlight/copy, update quickstart to include hive tui 2026-02-06 13:10:04 -08:00
RichardTang-Aden 51e81d80fc Merge pull request #3853 from adenhq/docs-key-concepts
Docs key concepts
2026-02-06 12:45:16 -08:00
Timothy a73239dd98 feat: runtime log tools 2026-02-06 12:37:18 -08:00
Timothy d68783a612 refactor: unify storage layer for agent runtime 2026-02-06 12:20:46 -08:00
Timothy a28ea40a7d fix: execution log details in error trace 2026-02-06 11:03:19 -08:00
bryan 5b00445c05 Merge branch 'main' into feat/multi-level-logging 2026-02-05 19:09:18 -08:00
bryan 8b828dd139 Merge branch 'main' into feat/multi-level-logging 2026-02-05 17:19:17 -08:00
bryan 221712128d bug fix for crashing agent 2026-02-05 11:59:57 -08:00
bryan e9fc36f2d3 Merge branch 'main' into feat/multi-level-logging 2026-02-05 09:10:56 -08:00
bryan 305b880b1d including missing tool log inputs 2026-02-05 09:08:42 -08:00
bryan 7519c73f2a Merge branch 'main' into feat/multi-level-logging 2026-02-04 19:34:01 -08:00
bryan bf402aaa18 initial multi-level logging 2026-02-04 17:26:58 -08:00
72 changed files with 7950 additions and 799 deletions
+33 -9
View File
@@ -28,8 +28,8 @@ metadata:
mcp__agent-builder__add_mcp_server(
name="hive-tools",
transport="stdio",
command="python",
args='["mcp_server.py", "--stdio"]',
command="uv",
args='["run", "python", "mcp_server.py", "--stdio"]',
cwd="tools",
description="Hive tools MCP server"
)
@@ -369,8 +369,8 @@ mcp__agent-builder__export_graph()
{
"hive-tools": {
"transport": "stdio",
"command": "python",
"args": ["mcp_server.py", "--stdio"],
"command": "uv",
"args": ["run", "python", "mcp_server.py", "--stdio"],
"cwd": "../../tools",
"description": "Hive tools MCP server"
}
@@ -379,6 +379,7 @@ mcp__agent-builder__export_graph()
- NO `"mcpServers"` wrapper (that's Claude Desktop format, NOT hive format)
- `cwd` MUST be `"../../tools"` (relative from `exports/AGENT_NAME/` to `tools/`)
- `command` MUST be `"uv"` with `"args": ["run", "python", ...]` (NOT bare `"python"` which fails on Mac)
**Use the example agent** at `.claude/skills/hive-create/examples/deep_research_agent/` as a template for file structure and patterns. It demonstrates: STEP 1/STEP 2 prompts, client-facing nodes, feedback loops, nullable_output_keys, and data tools.
@@ -409,11 +410,34 @@ cd /home/timothy/oss/hive && PYTHONPATH=exports uv run python -m AGENT_NAME vali
- If valid: Agent is complete!
- If errors: Fix the issues and re-run
**TELL the user the agent is ready** and suggest next steps:
**TELL the user the agent is ready** and display the next steps box:
- Run with mock mode to test without API calls
- Use `/hive-test` skill for comprehensive testing
- Use `/hive-credentials` if the agent needs API keys
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ ✅ AGENT BUILD COMPLETE │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ NEXT STEPS: │
│ │
│ 1. SET UP CREDENTIALS (if agent uses tools like web_search, send_email): │
│ │
│ /hive-credentials --agent AGENT_NAME │
│ │
│ 2. RUN YOUR AGENT: │
│ │
│ hive tui │
│ │
│ Then select your agent from the list and press Enter. │
│ │
│ 3. DEBUG ANY ISSUES: │
│ │
│ /hive-debugger │
│ │
│ The debugger monitors runtime logs, identifies retry loops, │
│ tool failures, and missing outputs, and provides fix recommendations. │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
---
@@ -513,4 +537,4 @@ result = await executor.execute(graph=graph, goal=goal, input_data=input_data)
8. **Forgetting nullable_output_keys** - Mark input_keys that only arrive on certain edges (e.g., feedback) as nullable on the receiving node
9. **Adding framework gating for LLM behavior** - Fix prompts or use judges, not ad-hoc code
10. **Writing code before user approves the graph** - Always get approval on goal, nodes, and graph BEFORE writing any agent code
11. **Wrong mcp_servers.json format** - Use flat format (no `"mcpServers"` wrapper), and `cwd` must be `"../../tools"` not `"tools"`
11. **Wrong mcp_servers.json format** - Use flat format (no `"mcpServers"` wrapper), `cwd` must be `"../../tools"`, and `command` must be `"uv"` with args `["run", "python", ...]`
@@ -1,8 +1,8 @@
{
"hive-tools": {
"transport": "stdio",
"command": "python",
"args": ["mcp_server.py", "--stdio"],
"command": "uv",
"args": ["run", "python", "mcp_server.py", "--stdio"],
"cwd": "../../tools",
"description": "Hive tools MCP server providing web_search, web_scrape, and write_to_file"
}
+19 -1
View File
@@ -596,5 +596,23 @@ All credentials are now configured:
✓ brave_search (BRAVE_SEARCH_API_KEY) — already in encrypted store
✓ google_search (GOOGLE_API_KEY) — stored in encrypted store
✓ google_cse (GOOGLE_CSE_ID) — stored in encrypted store
Your agent is ready to run!
┌─────────────────────────────────────────────────────────────────────────────┐
│ ✅ CREDENTIALS CONFIGURED │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ NEXT STEPS: │
│ │
│ 1. RUN YOUR AGENT: │
│ │
│ PYTHONPATH=core:exports python -m research-agent tui │
│ │
│ 2. IF YOU ENCOUNTER ISSUES, USE THE DEBUGGER: │
│ │
│ /hive-debugger │
│ │
│ The debugger analyzes runtime logs, identifies retry loops, tool │
│ failures, stalled execution, and provides actionable fix suggestions. │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
+848
View File
@@ -0,0 +1,848 @@
---
name: hive-debugger
type: utility
description: Interactive debugging companion for Hive agents - identifies runtime issues and proposes solutions
version: 1.0.0
requires:
- hive-concepts
tags:
- debugging
- runtime-logs
- agent-development
---
# Hive Debugger
An interactive debugging companion that helps developers identify and fix runtime issues in Hive agents. The debugger analyzes runtime logs at three levels (L1/L2/L3), categorizes issues, and provides actionable fix recommendations.
## When to Use This Skill
Use `/hive-debugger` when:
- Your agent is failing or producing unexpected results
- You need to understand why a specific node is retrying repeatedly
- Tool calls are failing and you need to identify the root cause
- Agent execution is stalled or taking too long
- You want to monitor agent behavior in real-time during development
This skill works alongside agents running in TUI mode and provides supervisor-level insights into execution behavior.
---
## Prerequisites
Before using this skill, ensure:
1. You have an exported agent in `exports/{agent_name}/`
2. The agent has been run at least once (logs exist)
3. Runtime logging is enabled (default in Hive framework)
4. You have access to the agent's working directory at `~/.hive/{agent_name}/`
---
## Workflow
### Stage 1: Setup & Context Gathering
**Objective:** Understand the agent being debugged
**What to do:**
1. **Ask the developer which agent needs debugging:**
- Get agent name (e.g., "twitter_outreach", "deep_research_agent")
- Confirm the agent exists in `exports/{agent_name}/`
2. **Determine agent working directory:**
- Calculate: `~/.hive/{agent_name}/`
- Verify this directory exists and contains session logs
3. **Read agent configuration:**
- Read file: `exports/{agent_name}/agent.json`
- Extract goal information from the JSON:
- `goal.id` - The goal identifier
- `goal.success_criteria` - What success looks like
- `goal.constraints` - Rules the agent must follow
- Extract graph information:
- List of node IDs from `graph.nodes`
- List of edges from `graph.edges`
4. **Store context for the debugging session:**
- agent_name
- agent_work_dir (e.g., `/home/user/.hive/twitter_outreach`)
- goal_id
- success_criteria
- constraints
- node_ids
**Example:**
```
Developer: "My twitter_outreach agent keeps failing"
You: "I'll help debug the twitter_outreach agent. Let me gather context..."
[Read exports/twitter_outreach/agent.json]
Context gathered:
- Agent: twitter_outreach
- Goal: twitter-outreach-multi-loop
- Working Directory: /home/user/.hive/twitter_outreach
- Success Criteria: ["Successfully send 5 personalized outreach messages"]
- Constraints: ["Must verify handle exists", "Must personalize message"]
- Nodes: ["intake-collector", "profile-analyzer", "message-composer", "outreach-sender"]
```
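The context gathered in this stage can be captured with a small helper. A minimal sketch, assuming `agent.json` uses the `goal`/`graph` field names listed above (the exported schema may differ between agent versions):

```python
import json
from pathlib import Path


def gather_debug_context(agent_name: str) -> dict:
    """Collect the Stage 1 debugging context for an exported agent."""
    agent_json = Path("exports") / agent_name / "agent.json"
    config = json.loads(agent_json.read_text())

    goal = config.get("goal", {})
    graph = config.get("graph", {})
    return {
        "agent_name": agent_name,
        "agent_work_dir": str(Path.home() / ".hive" / agent_name),
        "goal_id": goal.get("id"),
        "success_criteria": goal.get("success_criteria", []),
        "constraints": goal.get("constraints", []),
        "node_ids": [
            n["id"] if isinstance(n, dict) else n  # nodes may be dicts or bare IDs
            for n in graph.get("nodes", [])
        ],
    }
```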
---
### Stage 2: Mode Selection
**Objective:** Choose the debugging approach that best fits the situation
**What to do:**
Ask the developer which debugging mode they want to use. Use AskUserQuestion with these options:
1. **Real-time Monitoring Mode**
- Description: Monitor active TUI session continuously, poll logs every 5-10 seconds, alert on new issues immediately
- Best for: Live debugging sessions where you want to catch issues as they happen
- Note: Requires agent to be currently running
2. **Post-Mortem Analysis Mode**
- Description: Analyze completed or failed runs in detail, deep dive into specific session
- Best for: Understanding why a past execution failed
- Note: Most common mode for debugging
3. **Historical Trends Mode**
- Description: Analyze patterns across multiple runs, identify recurring issues
- Best for: Finding systemic problems that happen repeatedly
- Note: Useful for agents that have run many times
**Implementation:**
```
Use AskUserQuestion to present these options and let the developer choose.
Store the selected mode for the session.
```
---
### Stage 3: Triage (L1 Analysis)
**Objective:** Identify which sessions need attention
**What to do:**
1. **Query high-level run summaries** using the MCP tool:
```
query_runtime_logs(
agent_work_dir="{agent_work_dir}",
status="needs_attention",
limit=20
)
```
2. **Analyze the results:**
- Look for runs with `needs_attention: true`
- Check `attention_summary.categories` for issue types
- Note the `run_id` of problematic sessions
- Check `status` field: "degraded", "failure", "in_progress"
3. **Understand the attention flag triggers** (a sketch of this check follows the example output below):
From runtime_logger.py, runs are flagged when:
- retry_count > 3
- escalate_count > 2
- latency_ms > 60000
- tokens_used > 100000
- total_steps > 20
4. **Present findings to developer:**
- Summarize how many runs need attention
- List the most recent problematic runs
- Show attention categories for each
- Ask which run they want to investigate (if multiple)
**Example Output:**
```
Found 2 runs needing attention:
1. session_20260206_115718_e22339c5 (30 minutes ago)
Status: degraded
Categories: missing_outputs, retry_loops
2. session_20260206_103422_9f8d1b2a (2 hours ago)
Status: failure
Categories: tool_failures, high_latency
Which run would you like to investigate?
```
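To make the attention thresholds from step 3 concrete, here is a minimal sketch of the flagging check. It assumes each L1 run summary is a plain dict carrying the counters named above; the actual `runtime_logger.py` implementation may differ:

```python
ATTENTION_THRESHOLDS = {
    "retry_count": 3,
    "escalate_count": 2,
    "latency_ms": 60_000,
    "tokens_used": 100_000,
    "total_steps": 20,
}


def attention_reasons(run_summary: dict) -> list[str]:
    """Return the thresholds a run exceeds (an empty list means healthy)."""
    return [
        field
        for field, limit in ATTENTION_THRESHOLDS.items()
        if run_summary.get(field, 0) > limit
    ]


def needs_attention(run_summary: dict) -> bool:
    return bool(attention_reasons(run_summary))
```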
---
### Stage 4: Diagnosis (L2 Analysis)
**Objective:** Identify which nodes failed and what patterns exist
**What to do:**
1. **Query per-node details** using the MCP tool:
```
query_runtime_log_details(
agent_work_dir="{agent_work_dir}",
run_id="{selected_run_id}",
needs_attention_only=True
)
```
2. **Categorize issues** using the Issue Taxonomy:
**10 Issue Categories:**
| Category | Detection Pattern | Meaning |
|----------|------------------|---------|
| **Missing Outputs** | `exit_status != "success"`, `attention_reasons` contains "missing_outputs" | Node didn't call set_output with required keys |
| **Tool Errors** | `tool_error_count > 0`, `attention_reasons` contains "tool_failures" | Tool calls failed (API errors, timeouts, auth issues) |
| **Retry Loops** | `retry_count > 3`, `verdict_counts.RETRY > 5` | Judge repeatedly rejecting outputs |
| **Guard Failures** | `guard_reject_count > 0` | Output validation failed (wrong types, missing keys) |
| **Stalled Execution** | `total_steps > 20`, `verdict_counts.CONTINUE > 10` | EventLoopNode not making progress |
| **High Latency** | `latency_ms > 60000`, `avg_step_latency > 5000` | Slow tool calls or LLM responses |
| **Client-Facing Issues** | `client_input_requested` but no `user_input_received` | Premature set_output before user input |
| **Edge Routing Errors** | `exit_status == "no_valid_edge"`, `attention_reasons` contains "routing_issue" | No edges match current state |
| **Memory/Context Issues** | `tokens_used > 100000`, `context_overflow_count > 0` | Conversation history too long |
| **Constraint Violations** | Compare output against goal constraints | Agent violated goal-level rules |
3. **Analyze each flagged node:**
- Node ID and name
- Exit status
- Retry count
- Verdict distribution (ACCEPT/RETRY/ESCALATE/CONTINUE)
- Attention reasons
- Total steps executed
4. **Present diagnosis to developer:**
- List problematic nodes
- Categorize each issue
- Highlight the most severe problems
- Show evidence (retry counts, error types)
**Example Output:**
```
Diagnosis for session_20260206_115718_e22339c5:
Problem Node: intake-collector
├─ Exit Status: escalate
├─ Retry Count: 5 (HIGH)
├─ Verdict Counts: {RETRY: 5, ESCALATE: 1}
├─ Attention Reasons: ["high_retry_count", "missing_outputs"]
├─ Total Steps: 8
└─ Categories: Missing Outputs + Retry Loops
Root Issue: The intake-collector node is stuck in a retry loop because it's not setting required outputs.
```
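The taxonomy above maps naturally onto simple predicate checks over the L2 node details. A sketch, using the field names from the detection-pattern column (these are assumptions taken from the table and may not match the logger output exactly); the client-facing and constraint-violation categories need L3 or goal context and are omitted:

```python
def categorize_node(detail: dict) -> list[str]:
    """Map one L2 node-detail record to issue categories from the taxonomy."""
    verdicts = detail.get("verdict_counts", {})
    reasons = detail.get("attention_reasons", [])
    categories = []

    if "missing_outputs" in reasons:
        categories.append("Missing Outputs")
    if detail.get("tool_error_count", 0) > 0 or "tool_failures" in reasons:
        categories.append("Tool Errors")
    if detail.get("retry_count", 0) > 3 or verdicts.get("RETRY", 0) > 5:
        categories.append("Retry Loops")
    if detail.get("guard_reject_count", 0) > 0:
        categories.append("Guard Failures")
    if detail.get("total_steps", 0) > 20 or verdicts.get("CONTINUE", 0) > 10:
        categories.append("Stalled Execution")
    if detail.get("latency_ms", 0) > 60_000 or detail.get("avg_step_latency", 0) > 5_000:
        categories.append("High Latency")
    if detail.get("exit_status") == "no_valid_edge" or "routing_issue" in reasons:
        categories.append("Edge Routing Errors")
    if detail.get("tokens_used", 0) > 100_000 or detail.get("context_overflow_count", 0) > 0:
        categories.append("Memory/Context Issues")
    return categories
```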
---
### Stage 5: Root Cause Analysis (L3 Analysis)
**Objective:** Understand exactly what went wrong by examining detailed logs
**What to do:**
1. **Query detailed tool/LLM logs** using the MCP tool:
```
query_runtime_log_raw(
agent_work_dir="{agent_work_dir}",
run_id="{run_id}",
node_id="{problem_node_id}"
)
```
2. **Analyze based on issue category:**
**For Missing Outputs:**
- Check `step.tool_calls` for set_output usage
- Look for conditional logic that skipped set_output
- Check if LLM is calling other tools instead
**For Tool Errors:**
- Check `step.tool_results` for error messages
- Identify error types: rate limits, auth failures, timeouts, network errors
- Note which specific tool is failing
**For Retry Loops:**
- Check `step.verdict_feedback` from judge
- Look for repeated failure reasons
- Identify if it's the same issue every time
**For Guard Failures:**
- Check `step.guard_results` for validation errors
- Identify missing keys or type mismatches
- Compare actual output to expected schema
**For Stalled Execution:**
- Check `step.llm_response_text` for repetition
- Look for LLM stuck in same action loop
- Check if tool calls are succeeding but not progressing
3. **Extract evidence:**
- Specific error messages
- Tool call arguments and results
- LLM response text
- Judge feedback
- Step-by-step progression
4. **Formulate root cause explanation:**
- Clearly state what is happening
- Explain why it's happening
- Show evidence from logs
**Example Output:**
```
Root Cause Analysis for intake-collector:
Step-by-step breakdown:
Step 3:
- Tool Call: web_search(query="@RomuloNevesOf")
- Result: Found Twitter profile information
- Verdict: RETRY
- Feedback: "Missing required output 'twitter_handles'. You found the handle but didn't call set_output."
Step 4:
- Tool Call: web_search(query="@RomuloNevesOf twitter")
- Result: Found additional Twitter information
- Verdict: RETRY
- Feedback: "Still missing 'twitter_handles'. Use set_output to save your findings."
Steps 5-7: Similar pattern continues...
ROOT CAUSE: The node is successfully finding Twitter handles via web_search, but the LLM is not calling set_output to save the results. It keeps searching for more information instead of completing the task.
```
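One recurring L3 signature, a retry loop with near-identical judge feedback, can be surfaced mechanically. A sketch, assuming the raw log yields a list of step dicts with `verdict` and `verdict_feedback` fields as in the breakdown above:

```python
from collections import Counter


def retry_loop_signature(steps: list[dict], min_repeats: int = 3) -> str | None:
    """Return the dominant RETRY feedback if it repeats, else None.

    Repeated, near-identical feedback usually means the node keeps making
    the same mistake, such as never calling set_output.
    """
    feedback = [
        (s.get("verdict_feedback") or "").strip()
        for s in steps
        if s.get("verdict") == "RETRY"
    ]
    if not feedback:
        return None
    text, count = Counter(feedback).most_common(1)[0]
    return text if count >= min_repeats else None
```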
---
### Stage 6: Fix Recommendations
**Objective:** Provide actionable solutions the developer can implement
**What to do:**
Based on the issue category identified, provide specific fix recommendations using these templates:
#### Template 1: Missing Outputs (Client-Facing Nodes)
```markdown
## Issue: Premature set_output in Client-Facing Node
**Root Cause:** Node called set_output before receiving user input
**Fix:** Use STEP 1/STEP 2 prompt pattern
**File to edit:** `exports/{agent_name}/nodes/{node_name}.py`
**Changes:**
1. Update the system_prompt to include explicit step guidance:
```python
system_prompt = """
STEP 1: Analyze the user input and decide what action to take.
DO NOT call set_output in this step.
STEP 2: After receiving feedback or completing analysis,
ONLY THEN call set_output with your results.
"""
```
2. If some inputs are optional (like feedback on retry edges), add nullable_output_keys:
```python
nullable_output_keys=["feedback"]
```
**Verification:**
- Run the agent with test input
- Verify the client-facing node waits for user input before calling set_output
```
#### Template 2: Retry Loops
```markdown
## Issue: Judge Repeatedly Rejecting Outputs
**Root Cause:** {Insert specific reason from verdict_feedback}
**Fix Options:**
**Option A - If outputs are actually correct:** Adjust judge evaluation rules
- File: `exports/{agent_name}/agent.json`
- Update `evaluation_rules` section to accept the current output format
- Example: If judge expects list but gets string, update rule to accept both
**Option B - If prompt is ambiguous:** Clarify node instructions
- File: `exports/{agent_name}/nodes/{node_name}.py`
- Make system_prompt more explicit about output format and requirements
- Add examples of correct outputs
**Option C - If tool is unreliable:** Add retry logic with fallback
- Consider using alternative tools
- Add manual fallback option
- Update prompt to handle tool failures gracefully
**Verification:**
- Run the node with test input
- Confirm judge accepts output on first try
- Check that retry_count stays at 0
```
#### Template 3: Tool Errors
```markdown
## Issue: {tool_name} Failing with {error_type}
**Root Cause:** {Insert specific error message from logs}
**Fix Strategy:**
**If API rate limit:**
1. Add exponential backoff in tool retry logic
2. Reduce API call frequency
3. Consider caching results
**If auth failure:**
1. Check credentials using:
```bash
/hive-credentials --agent {agent_name}
```
2. Verify API key environment variables
3. Update `mcp_servers.json` if needed
**If timeout:**
1. Increase timeout in `mcp_servers.json`:
```json
{
"timeout_ms": 60000
}
```
2. Consider using faster alternative tools
3. Break large requests into smaller chunks
**Verification:**
- Test tool call manually
- Confirm successful response
- Monitor for recurring errors
```
#### Template 4: Edge Routing Errors
```markdown
## Issue: No Valid Edge from Node {node_id}
**Root Cause:** No edge condition matched the current state
**File to edit:** `exports/{agent_name}/agent.json`
**Analysis:**
- Current node output: {show actual output keys}
- Existing edge conditions: {list edge conditions}
- Why no match: {explain the mismatch}
**Fix:**
Add the missing edge to the graph:
```json
{
"edge_id": "{node_id}_to_{target_node}",
"source": "{node_id}",
"target": "{target_node}",
"condition": "on_success"
}
```
**Alternative:** Update existing edge condition to cover this case
**Verification:**
- Run agent with same input
- Verify edge is traversed successfully
- Check that execution continues to next node
```
#### Template 5: Stalled Execution
```markdown
## Issue: EventLoopNode Not Making Progress
**Root Cause:** {Insert analysis - e.g., "LLM repeating same failed action"}
**File to edit:** `exports/{agent_name}/nodes/{node_name}.py`
**Fix:** Update system_prompt to guide LLM out of loops
**Add this guidance:**
```python
system_prompt = """
{existing prompt}
IMPORTANT: If a tool call fails multiple times:
1. Try an alternative approach or different tool
2. If no alternatives work, call set_output with partial results
3. DO NOT retry the same failed action more than 3 times
Progress is more important than perfection. Move forward even with incomplete data.
"""
```
**Additional fix:** Lower max_iterations to prevent infinite loops
```python
# In node configuration
max_node_visits=3 # Prevent getting stuck
```
**Verification:**
- Run node with same input that caused stall
- Verify it exits after reasonable attempts (< 10 steps)
- Confirm it calls set_output eventually
```
**Selecting the right template:**
- Match the issue category from Stage 4
- Customize with specific details from Stage 5
- Include actual error messages and code snippets
- Provide file paths and line numbers when possible
---
### Stage 7: Verification Support
**Objective:** Help the developer confirm their fixes work
**What to do:**
1. **Suggest appropriate tests based on fix type:**
**For node-level fixes:**
```bash
# Use hive-test to run goal-based tests
/hive-test --agent {agent_name} --goal {goal_id}
# Or run specific test scenarios
/hive-test --agent {agent_name} --scenario {specific_input}
```
**For quick manual tests:**
```bash
# Launch the interactive TUI dashboard
hive tui
```
Then use arrow keys to select the agent from the list and press Enter to run it.
2. **Provide MCP tool queries to validate the fix:**
**Check if issue is resolved:**
```
query_runtime_logs(
agent_work_dir="~/.hive/{agent_name}",
status="needs_attention",
limit=5
)
# Should show 0 results if fully fixed
```
**Verify specific node behavior:**
```
query_runtime_log_details(
agent_work_dir="~/.hive/{agent_name}",
run_id="{new_run_id}",
node_id="{fixed_node_id}"
)
# Should show exit_status="success", retry_count=0
```
3. **Monitor for regression:**
- Run the agent multiple times
- Check for similar issues reappearing
- Verify fix works across different inputs
4. **Provide verification checklist:**
```
Verification Checklist:
□ Applied recommended fix to code
□ Ran agent with test input
□ Checked runtime logs show no attention flags
□ Verified specific node completes successfully
□ Tested with multiple inputs
□ No regression of original issue
□ Agent meets success criteria
```
**Example interaction:**
```
Developer: "I applied the fix to intake-collector. How do I verify it works?"
You: "Great! Let's verify the fix with these steps:
1. Launch the TUI dashboard:
hive tui
Then select your agent from the list and press Enter to run it.
2. After it completes, check the logs:
[Use query_runtime_logs to check for attention flags]
3. Verify the specific node:
[Use query_runtime_log_details for intake-collector]
Expected results:
- No 'needs_attention' flags
- intake-collector shows exit_status='success'
- retry_count should be 0
Let me know when you've run it and I'll help check the logs!"
```
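The log checks in the verification checklist can also be scripted. A sketch, assuming `query_runtime_log_details` is available as a callable in your environment (for example via an MCP client wrapper) and returns node-detail dicts with the fields shown above:

```python
def fix_verified(agent_work_dir: str, run_id: str, node_id: str,
                 query_runtime_log_details) -> bool:
    """Return True if the fixed node completed cleanly in the given run."""
    details = query_runtime_log_details(
        agent_work_dir=agent_work_dir,
        run_id=run_id,
        node_id=node_id,
    )
    # Tolerate either a single dict or a list of node-detail dicts.
    nodes = details if isinstance(details, list) else [details]
    return all(
        n.get("exit_status") == "success" and n.get("retry_count", 0) == 0
        for n in nodes
    )
```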
---
## MCP Tool Usage Guide
### Three Levels of Observability
**L1: query_runtime_logs** - Session-level summaries
- **When to use:** Initial triage, identifying problematic runs, monitoring trends
- **Returns:** List of runs with status, attention flags, timestamps
- **Example:**
```
query_runtime_logs(
agent_work_dir="/home/user/.hive/twitter_outreach",
status="needs_attention",
limit=20
)
```
**L2: query_runtime_log_details** - Node-level details
- **When to use:** Diagnosing which nodes failed, understanding retry patterns
- **Returns:** Per-node completion details, retry counts, verdicts
- **Example:**
```
query_runtime_log_details(
agent_work_dir="/home/user/.hive/twitter_outreach",
run_id="session_20260206_115718_e22339c5",
needs_attention_only=True
)
```
**L3: query_runtime_log_raw** - Step-level details
- **When to use:** Root cause analysis, understanding exact failures
- **Returns:** Full tool calls, LLM responses, judge feedback
- **Example:**
```
query_runtime_log_raw(
agent_work_dir="/home/user/.hive/twitter_outreach",
run_id="session_20260206_115718_e22339c5",
node_id="intake-collector"
)
```
### Query Patterns
**Pattern 1: Top-Down Investigation** (Most common)
```
1. L1: Find problematic runs
2. L2: Identify failing nodes
3. L3: Analyze specific failures
```
**Pattern 2: Node-Specific Debugging**
```
1. L2: Get details for specific node across all runs
2. L3: Deep dive into worst failures
```
**Pattern 3: Real-time Monitoring**
```
Loop every 10 seconds:
1. L1: Check for new needs_attention runs
2. If found: Alert and drill into L2
```
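Pattern 3 amounts to a small polling loop. A sketch, assuming `query_runtime_logs` is available as a callable (for example via an MCP client wrapper), returns a list of run-summary dicts, and that each summary carries a `run_id`:

```python
import time


def monitor(agent_work_dir: str, query_runtime_logs, interval_s: float = 10.0):
    """Poll L1 summaries and report newly flagged runs (Pattern 3)."""
    seen: set[str] = set()
    while True:
        runs = query_runtime_logs(
            agent_work_dir=agent_work_dir,
            status="needs_attention",
            limit=20,
        )
        for run in runs:
            run_id = run.get("run_id")
            if run_id and run_id not in seen:
                seen.add(run_id)
                categories = run.get("attention_summary", {}).get("categories")
                # New problem run: surface it, then drill into L2 details.
                print(f"ALERT: {run_id} needs attention ({categories})")
        time.sleep(interval_s)
```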
---
## Complete Example Walkthrough
### Scenario: twitter_outreach Agent Failing
**Developer says:** "My twitter_outreach agent keeps failing. It's not finding Twitter handles correctly."
#### Stage 1: Setup
```
You: "I'll help debug the twitter_outreach agent. Let me gather context..."
[Read exports/twitter_outreach/agent.json]
Context:
- Agent: twitter_outreach
- Goal: twitter-outreach-multi-loop
- Working Dir: ~/.hive/twitter_outreach
- Success Criteria: ["Successfully send 5 personalized outreach messages"]
- Constraints: ["Must verify handle exists", "Must personalize message"]
- Nodes: intake-collector, profile-analyzer, message-composer, outreach-sender
```
#### Stage 2: Mode Selection
```
You: "Which debugging approach would you prefer?"
[Present 3 mode options using AskUserQuestion]
Developer selects: "Post-Mortem Analysis Mode"
```
#### Stage 3: Triage (L1)
```
You: "Querying runtime logs for problematic runs..."
[Call query_runtime_logs with status="needs_attention"]
Results:
- Found 1 run needing attention
- session_20260206_115718_e22339c5 (30 min ago)
- Status: degraded
- Categories: missing_outputs, retry_loops
```
#### Stage 4: Diagnosis (L2)
```
You: "Analyzing node-level details..."
[Call query_runtime_log_details for the session]
Results:
- Problem Node: intake-collector
- Exit Status: escalate
- Retry Count: 5
- Verdict Counts: {RETRY: 5, ESCALATE: 1}
- Attention Reasons: ["high_retry_count", "missing_outputs"]
- Total Steps: 8
Diagnosis: intake-collector has retry loop + missing outputs issue
Category: Missing Outputs + Retry Loops
```
#### Stage 5: Root Cause (L3)
```
You: "Drilling into detailed logs for intake-collector..."
[Call query_runtime_log_raw for node intake-collector]
Evidence:
Step 3: web_search("@RomuloNevesOf") → Found profile
Verdict: RETRY
Feedback: "Missing required output 'twitter_handles'. You found the handle but didn't call set_output."
Step 4: web_search("@RomuloNevesOf twitter") → Found more info
Verdict: RETRY
Feedback: "Still missing 'twitter_handles'. Use set_output to save your findings."
Steps 5-7: Same pattern repeats...
ROOT CAUSE: Node successfully finds handles but LLM doesn't call set_output. It keeps searching instead of completing.
```
#### Stage 6: Fix Recommendation
```
You: "Here's the fix for this issue:
## Issue: intake-collector Not Setting Outputs
**Root Cause:** Node successfully finds Twitter handles but doesn't call set_output
**Category:** Missing Outputs
**File to edit:** exports/twitter_outreach/nodes/intake_collector.py
**Changes:**
Update the system_prompt to emphasize the output requirement:
```python
system_prompt = """
You are the Intake Collector for the Twitter outreach agent.
Your task:
1. Extract Twitter handle from user input (URL or @mention)
2. Verify it's a valid Twitter handle format
3. **CRITICAL: Call set_output(twitter_handles=[handle]) immediately after finding valid handle**
IMPORTANT: You MUST call set_output even if you found the handle on the first try.
Do not continue searching once you have a valid handle - set output and finish.
Output format:
{
"twitter_handles": ["@handle1", "@handle2", ...]
}
"""
```
**Why this works:** The explicit instruction to call set_output immediately prevents the LLM from continuing to search after finding the handle.
```
#### Stage 7: Verification
```
Developer: "I updated the prompt. How do I test it?"
You: "Let's verify the fix:
1. Launch the TUI dashboard:
hive tui
Then select your agent and press Enter to run it.
2. I'll check the logs after it runs..."
[Developer runs agent]
Developer: "It completed! Can you check if it worked?"
You: "Checking now..."
[Call query_runtime_log_details for the new run]
Results:
- Node: intake-collector
- Exit Status: success ✓
- Retry Count: 0 ✓
- Total Steps: 2 ✓
- No attention flags ✓
"Perfect! The fix worked. The intake-collector now:
- Completes successfully on first try
- No retries needed
- Calls set_output properly
Your agent should now work correctly!"
```
---
## Tips for Effective Debugging
1. **Always start with L1 logs** - Don't jump straight to detailed logs
2. **Focus on attention flags** - They highlight the real issues
3. **Compare verdict_feedback across steps** - Patterns reveal root causes
4. **Check tool error messages carefully** - They often contain the exact problem
5. **Consider the agent's goal** - Fixes should align with success criteria
6. **Test fixes immediately** - Quick verification prevents wasted effort
7. **Look for patterns across multiple runs** - One-time failures might be transient
## Common Pitfalls to Avoid
1. **Don't recommend code you haven't verified exists** - Always read files first
2. **Don't assume tool capabilities** - Check MCP server configs
3. **Don't ignore edge conditions** - Missing edges cause routing failures
4. **Don't overlook judge configuration** - Mismatched expectations cause retry loops
5. **Don't forget nullable_output_keys** - Optional inputs need explicit marking
---
## Storage Locations Reference
**New unified storage (default):**
- Logs: `~/.hive/{agent_name}/sessions/session_YYYYMMDD_HHMMSS_{uuid}/logs/`
- State: `~/.hive/{agent_name}/sessions/{session_id}/state.json`
- Conversations: `~/.hive/{agent_name}/sessions/{session_id}/conversations/`
**Old storage (deprecated, still supported):**
- Logs: `~/.hive/{agent_name}/runtime_logs/runs/{run_id}/`
The MCP tools automatically check both locations.
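When scripting against the logs directly rather than through the MCP tools, both layouts can be resolved with a small helper. A sketch based on the paths listed above:

```python
from pathlib import Path


def session_log_dirs(agent_name: str) -> list[Path]:
    """Return log directories for an agent, covering both storage layouts."""
    work_dir = Path.home() / ".hive" / agent_name

    # New unified storage: ~/.hive/{agent}/sessions/{session_id}/logs/
    new_style = sorted((work_dir / "sessions").glob("session_*/logs"))

    # Old storage (deprecated): ~/.hive/{agent}/runtime_logs/runs/{run_id}/
    old_style = sorted((work_dir / "runtime_logs" / "runs").glob("*"))

    return [p for p in (*new_style, *old_style) if p.is_dir()]
```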
---
**Remember:** Your role is to be a debugging companion and thought partner. Guide the developer through the investigation, explain what you find, and provide actionable fixes. Don't just report errors - help understand and solve them.
+38 -6
View File
@@ -12,6 +12,7 @@ metadata:
- hive-patterns
- hive-test
- hive-credentials
- hive-debugger
---
# Agent Development Workflow
@@ -24,6 +25,7 @@ When this skill is loaded, determine what the user needs and invoke the appropri
- **User wants to learn concepts** → Invoke `/hive-concepts` immediately
- **User wants patterns/optimization** → Invoke `/hive-patterns` immediately
- **User wants to set up credentials** → Invoke `/hive-credentials` immediately
- **User has a failing/broken agent** → Invoke `/hive-debugger` immediately
- **Unclear what user needs** → Ask the user (do NOT explore the codebase to figure it out)
**DO NOT:** Read source files, explore the codebase, search for code, or do any investigation before routing. The sub-skills handle all of that.
@@ -41,6 +43,7 @@ This workflow orchestrates specialized skills to take you from initial concept t
3. **Optimize Design** → `/hive-patterns` (optional)
4. **Setup Credentials** → `/hive-credentials` (if agent uses tools requiring API keys)
5. **Test & Validate** → `/hive-test`
6. **Debug Issues** → `/hive-debugger` (if agent fails at runtime)
## When to Use This Workflow
@@ -63,6 +66,7 @@ Use this meta-skill when:
"Need client-facing nodes or feedback loops" → hive-patterns
"Set up API keys for my agent" → hive-credentials
"Test my agent" → hive-test
"My agent is failing/stuck/has errors" → hive-debugger
"Not sure what I need" → Read phases below, then decide
"Agent has structure but needs implementation" → See agent directory STATUS.md
```
@@ -345,11 +349,23 @@ hive (meta-skill)
│ ├── Fan-out/fan-in parallel execution
│ └── Context management and anti-patterns
├── hive-test
├── Reads agent goal
├── Generates tests
├── Runs evaluation
└── Reports results
└── hive-credentials (utility)
├── Detects missing credentials
├── Offers auth method choices (Aden OAuth, direct API key)
├── Stores securely in ~/.hive/credentials
└── Validates with health checks
├── hive-test (validation)
│ ├── Reads agent goal
│ ├── Generates tests
│ ├── Runs evaluation
│ └── Reports results
└── hive-debugger (troubleshooting)
├── Monitors runtime logs (L1/L2/L3)
├── Identifies retry loops, tool failures
├── Categorizes issues (10 categories)
└── Provides fix recommendations
```
## Troubleshooting
@@ -376,6 +392,13 @@ hive (meta-skill)
- Use `/hive-test` to debug and iterate
- Fix agent code and re-run tests
### "Agent is failing at runtime"
- Use `/hive-debugger` to analyze runtime logs
- The debugger identifies retry loops, tool failures, and stalled execution
- Get actionable fix recommendations with code changes
- Monitor the agent in real-time during TUI sessions
### "Not sure which phase I'm in"
Run these checks:
@@ -448,7 +471,9 @@ This workflow provides a proven path from concept to production-ready agent:
1. **Learn** with `/hive-concepts` → Understand fundamentals (optional)
2. **Build** with `/hive-create` → Get validated structure
3. **Optimize** with `/hive-patterns` → Apply best practices (optional)
4. **Test** with `/hive-test` → Get verified functionality
4. **Configure** with `/hive-credentials` → Set up API keys (if needed)
5. **Test** with `/hive-test` → Get verified functionality
6. **Debug** with `/hive-debugger` → Fix runtime issues (if needed)
The workflow is **flexible** - skip phases as needed, iterate freely, and adapt to your specific requirements. The goal is **production-ready agents** built with **consistent, repeatable processes**.
@@ -478,3 +503,10 @@ The workflow is **flexible** - skip phases as needed, iterate freely, and adapt
- Ready to validate functionality
- Need comprehensive test coverage
- Testing feedback loops, output keys, or fan-out
**Choose hive-debugger when:**
- Agent is failing or stuck at runtime
- Seeing retry loops or escalations
- Tool calls are failing
- Need to understand why a node isn't completing
- Want real-time monitoring of agent execution
+1
View File
@@ -74,3 +74,4 @@ exports/*
docs/github-issues/*
core/tests/*dumps/*
screenshots/*
+24 -23
View File
@@ -109,6 +109,8 @@ This sets up:
- **framework** - Core agent runtime and graph executor (in `core/.venv`)
- **aden_tools** - MCP tools for agent capabilities (in `tools/.venv`)
- **credential store** - Encrypted API key storage (`~/.hive/credentials`)
- **LLM provider** - Interactive default model configuration
- All required Python dependencies
### Build Your First Agent
@@ -118,10 +120,13 @@ This sets up:
claude> /hive
# Test your agent
claude> /hive-test
claude> /hive-debugger
# Run your agent
PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
# (at separate terminal) Launch the interactive dashboard
hive tui
# Or run directly
hive run exports/your_agent_name --input '{"key": "value"}'
```
**[📖 Complete Setup Guide](docs/environment-setup.md)** - Detailed instructions for agent development
@@ -143,6 +148,7 @@ Skills are also available in Cursor. To enable:
- **SDK-Wrapped Nodes** - Every node gets shared memory, local RLM memory, monitoring, tools, and LLM access out of the box
- **[Human-in-the-Loop](docs/key_concepts/graph.md#human-in-the-loop)** - Intervention nodes that pause execution for human input with configurable timeouts and escalation
- **Real-time Observability** - WebSocket streaming for live monitoring of agent execution, decisions, and node-to-node communication
- **Interactive TUI Dashboard** - Terminal-based dashboard with live graph view, event log, and chat interface for agent interaction
- **Cost & Budget Control** - Set spending limits, throttles, and automatic model degradation policies
- **Production-Ready** - Self-hostable, built for scale and reliability
@@ -201,40 +207,35 @@ flowchart LR
4. **Control Plane Monitors** → Real-time metrics, budget enforcement, policy management
5. **[Adaptiveness](docs/key_concepts/evolution.md)** → On failure, the system evolves the graph and redeploys automatically
## Run pre-built Agents (Coming Soon)
## Run Agents
### Run a sample agent
Aden Hive provides a list of featured agents that you can use and build on top of.
### Run an agent shared by others
Put the agent in `exports/` and run `PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'`
For building and running goal-driven agents with the framework:
The `hive` CLI is the primary interface for running agents.
```bash
# One-time setup
./quickstart.sh
# Browse and run agents interactively (Recommended)
hive tui
# This sets up:
# - framework package (core runtime)
# - aden_tools package (MCP tools)
# - All Python dependencies
# Run a specific agent directly
hive run exports/my_agent --input '{"task": "Your input here"}'
# Build new agents using Agent Skills
claude> /hive
# Run a specific agent with the TUI dashboard
hive run exports/my_agent --tui
# Run agents
PYTHONPATH=exports uv run python -m agent_name run --input '{...}'
# Interactive REPL
hive shell
```
The TUI scans both `exports/` and `examples/templates/` for available agents.
> **Using Python directly (alternative):** You can also run agents with `PYTHONPATH=exports uv run python -m agent_name run --input '{...}'`
See [environment-setup.md](docs/environment-setup.md) for complete setup instructions.
## Documentation
- **[Developer Guide](docs/developer-guide.md)** - Comprehensive guide for developers
- [Getting Started](docs/getting-started.md) - Quick setup instructions
- [TUI Guide](docs/tui-selection-guide.md) - Interactive dashboard usage
- [Configuration Guide](docs/configuration.md) - All configuration options
- [Architecture Overview](docs/architecture/README.md) - System design and structure
+2 -2
View File
@@ -4,8 +4,8 @@
"name": "tools",
"description": "Aden tools including web search, file operations, and PDF reading",
"transport": "stdio",
"command": "python",
"args": ["mcp_server.py", "--stdio"],
"command": "uv",
"args": ["run", "python", "mcp_server.py", "--stdio"],
"cwd": "../tools",
"env": {
"BRAVE_SEARCH_API_KEY": "${BRAVE_SEARCH_API_KEY}"
+7
View File
@@ -44,6 +44,13 @@ def _configure_paths():
if exports_str not in sys.path:
sys.path.insert(0, exports_str)
# Add examples/templates/ to sys.path so template agents are importable
templates_dir = project_root / "examples" / "templates"
if templates_dir.is_dir():
templates_str = str(templates_dir)
if templates_str not in sys.path:
sys.path.insert(0, templates_str)
# Ensure core/ is also in sys.path (for non-editable-install scenarios)
core_str = str(project_root / "core")
if (project_root / "core").is_dir() and core_str not in sys.path:
+512 -111
View File
@@ -149,7 +149,7 @@ class EventLoopNode(NodeProtocol):
1. Try to restore from durable state (crash recovery)
2. If no prior state, init from NodeSpec.system_prompt + input_keys
3. Loop: drain injection queue -> stream LLM -> execute tools
-> if client_facing + no real tools: block for user input
-> if client_facing + ask_user called: block for user input
-> judge evaluates (acceptance criteria)
(each add_* and set_output writes through to store immediately)
4. Publish events to EventBus at each stage
@@ -157,11 +157,11 @@ class EventLoopNode(NodeProtocol):
6. Terminate when judge returns ACCEPT, shutdown signaled, or max iterations
7. Build output dict from OutputAccumulator
Client-facing blocking: When ``client_facing=True`` and the LLM finishes
without real tool calls (stop_reason != tool_call), the node blocks via
``_await_user_input()`` until ``inject_event()`` or ``signal_shutdown()``
is called. After user input, the judge evaluates; the judge is the
sole mechanism for acceptance decisions.
Client-facing blocking: When ``client_facing=True``, a synthetic
``ask_user`` tool is injected. The node blocks for user input ONLY
when the LLM explicitly calls ``ask_user()``. Text-only turns
without ``ask_user`` flow through without blocking, allowing the LLM
to stream progress updates and summaries freely.
Always returns NodeResult with retryable=False semantics. The executor
must NOT retry event loop nodes -- retry is handled internally by the
@@ -210,9 +210,28 @@ class EventLoopNode(NodeProtocol):
stream_id = ctx.node_id
node_id = ctx.node_id
# Verdict counters for runtime logging
_accept_count = _retry_count = _escalate_count = _continue_count = 0
# 1. Guard: LLM required
if ctx.llm is None:
return NodeResult(success=False, error="LLM provider not available")
error_msg = "LLM provider not available"
# Log guard failure
if ctx.runtime_logger:
ctx.runtime_logger.log_node_complete(
node_id=node_id,
node_name=ctx.node_spec.name,
node_type="event_loop",
success=False,
error=error_msg,
exit_status="guard_failure",
total_steps=0,
tokens_used=0,
input_tokens=0,
output_tokens=0,
latency_ms=0,
)
return NodeResult(success=False, error=error_msg)
# 2. Restore or create new conversation + accumulator
conversation, accumulator, start_iteration = await self._restore(ctx)
@@ -233,11 +252,13 @@ class EventLoopNode(NodeProtocol):
if initial_message:
await conversation.add_user_message(initial_message)
# 3. Build tool list: node tools + synthetic set_output tool
# 3. Build tool list: node tools + synthetic set_output + ask_user tools
tools = list(ctx.available_tools)
set_output_tool = self._build_set_output_tool(ctx.node_spec.output_keys)
if set_output_tool:
tools.append(set_output_tool)
if ctx.node_spec.client_facing:
tools.append(self._build_ask_user_tool())
logger.info(
"[%s] Tools available (%d): %s | client_facing=%s | judge=%s",
@@ -256,9 +277,28 @@ class EventLoopNode(NodeProtocol):
# 6. Main loop
for iteration in range(start_iteration, self._config.max_iterations):
# 6a. Check pause
iter_start = time.time()
# 6a. Check pause (no current-iteration data yet — only log_node_complete needed)
if await self._check_pause(ctx, conversation, iteration):
latency_ms = int((time.time() - start_time) * 1000)
if ctx.runtime_logger:
ctx.runtime_logger.log_node_complete(
node_id=node_id,
node_name=ctx.node_spec.name,
node_type="event_loop",
success=True,
total_steps=iteration,
tokens_used=total_input_tokens + total_output_tokens,
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
latency_ms=latency_ms,
exit_status="paused",
accept_count=_accept_count,
retry_count=_retry_count,
escalate_count=_escalate_count,
continue_count=_continue_count,
)
return NodeResult(
success=True,
output=accumulator.to_dict(),
@@ -283,25 +323,73 @@ class EventLoopNode(NodeProtocol):
iteration,
len(conversation.messages),
)
(
assistant_text,
real_tool_results,
outputs_set,
turn_tokens,
) = await self._run_single_turn(ctx, conversation, tools, iteration, accumulator)
logger.info(
"[%s] iter=%d: LLM done — text=%d chars, real_tools=%d, "
"outputs_set=%s, tokens=%s, accumulator=%s",
node_id,
iteration,
len(assistant_text),
len(real_tool_results),
outputs_set or "[]",
turn_tokens,
{k: ("set" if v is not None else "None") for k, v in accumulator.to_dict().items()},
)
total_input_tokens += turn_tokens.get("input", 0)
total_output_tokens += turn_tokens.get("output", 0)
try:
(
assistant_text,
real_tool_results,
outputs_set,
turn_tokens,
logged_tool_calls,
user_input_requested,
) = await self._run_single_turn(ctx, conversation, tools, iteration, accumulator)
logger.info(
"[%s] iter=%d: LLM done — text=%d chars, real_tools=%d, "
"outputs_set=%s, tokens=%s, accumulator=%s",
node_id,
iteration,
len(assistant_text),
len(real_tool_results),
outputs_set or "[]",
turn_tokens,
{
k: ("set" if v is not None else "None")
for k, v in accumulator.to_dict().items()
},
)
total_input_tokens += turn_tokens.get("input", 0)
total_output_tokens += turn_tokens.get("output", 0)
except Exception as e:
# LLM call crashed - log partial step with error
import traceback
iter_latency_ms = int((time.time() - iter_start) * 1000)
latency_ms = int((time.time() - start_time) * 1000)
error_msg = f"LLM call failed: {e}"
stack_trace = traceback.format_exc()
if ctx.runtime_logger:
ctx.runtime_logger.log_step(
node_id=node_id,
node_type="event_loop",
step_index=iteration,
error=error_msg,
stacktrace=stack_trace,
is_partial=True,
input_tokens=0,
output_tokens=0,
latency_ms=iter_latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=node_id,
node_name=ctx.node_spec.name,
node_type="event_loop",
success=False,
error=error_msg,
stacktrace=stack_trace,
total_steps=iteration + 1,
tokens_used=total_input_tokens + total_output_tokens,
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
latency_ms=latency_ms,
exit_status="failure",
accept_count=_accept_count,
retry_count=_retry_count,
escalate_count=_escalate_count,
continue_count=_continue_count,
)
# Re-raise to maintain existing error handling
raise
# 6e'. Feed actual API token count back for accurate estimation
turn_input = turn_tokens.get("input", 0)
@@ -317,7 +405,12 @@ class EventLoopNode(NodeProtocol):
# outputs are already set, accept immediately. This prevents
# wasted iterations when the LLM has genuinely finished its
# work (e.g. after calling set_output in a previous turn).
truly_empty = not assistant_text and not real_tool_results and not outputs_set
truly_empty = (
not assistant_text
and not real_tool_results
and not outputs_set
and not user_input_requested
)
if truly_empty and accumulator is not None:
missing = self._get_missing_output_keys(
accumulator, ctx.node_spec.output_keys, ctx.node_spec.nullable_output_keys
@@ -344,6 +437,38 @@ class EventLoopNode(NodeProtocol):
if self._is_stalled(recent_responses):
await self._publish_stalled(stream_id, node_id)
latency_ms = int((time.time() - start_time) * 1000)
_continue_count += 1
if ctx.runtime_logger:
iter_latency_ms = int((time.time() - iter_start) * 1000)
ctx.runtime_logger.log_step(
node_id=node_id,
node_type="event_loop",
step_index=iteration,
verdict="CONTINUE",
verdict_feedback="Stall detected before judge evaluation",
tool_calls=logged_tool_calls,
llm_text=assistant_text,
input_tokens=turn_tokens.get("input", 0),
output_tokens=turn_tokens.get("output", 0),
latency_ms=iter_latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=node_id,
node_name=ctx.node_spec.name,
node_type="event_loop",
success=False,
error="Node stalled",
total_steps=iteration + 1,
tokens_used=total_input_tokens + total_output_tokens,
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
latency_ms=latency_ms,
exit_status="stalled",
accept_count=_accept_count,
retry_count=_retry_count,
escalate_count=_escalate_count,
continue_count=_continue_count,
)
return NodeResult(
success=False,
error=(
@@ -360,18 +485,48 @@ class EventLoopNode(NodeProtocol):
# 6h. Client-facing input blocking
#
# For client_facing nodes, block for user input whenever the
# LLM finishes without making real tool calls (i.e. the LLM's
# stop_reason is not tool_call). set_output is separated from
# real tools by _run_single_turn, so this correctly treats
# set_output-only turns as conversational boundaries.
# For client_facing nodes, block for user input only when the
# LLM explicitly called ask_user(). Text-only turns without
# ask_user flow through without blocking, allowing progress
# updates and summaries to stream freely.
#
# After user input, always fall through to judge evaluation
# (6i). The judge handles all acceptance decisions.
if ctx.node_spec.client_facing and not real_tool_results:
if ctx.node_spec.client_facing and user_input_requested:
if self._shutdown:
await self._publish_loop_completed(stream_id, node_id, iteration + 1)
latency_ms = int((time.time() - start_time) * 1000)
_continue_count += 1
if ctx.runtime_logger:
iter_latency_ms = int((time.time() - iter_start) * 1000)
ctx.runtime_logger.log_step(
node_id=node_id,
node_type="event_loop",
step_index=iteration,
verdict="CONTINUE",
verdict_feedback="Shutdown signaled (client-facing)",
tool_calls=logged_tool_calls,
llm_text=assistant_text,
input_tokens=turn_tokens.get("input", 0),
output_tokens=turn_tokens.get("output", 0),
latency_ms=iter_latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=node_id,
node_name=ctx.node_spec.name,
node_type="event_loop",
success=True,
total_steps=iteration + 1,
tokens_used=total_input_tokens + total_output_tokens,
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
latency_ms=latency_ms,
exit_status="success",
accept_count=_accept_count,
retry_count=_retry_count,
escalate_count=_escalate_count,
continue_count=_continue_count,
)
return NodeResult(
success=True,
output=accumulator.to_dict(),
@@ -385,6 +540,37 @@ class EventLoopNode(NodeProtocol):
if not got_input:
await self._publish_loop_completed(stream_id, node_id, iteration + 1)
latency_ms = int((time.time() - start_time) * 1000)
_continue_count += 1
if ctx.runtime_logger:
iter_latency_ms = int((time.time() - iter_start) * 1000)
ctx.runtime_logger.log_step(
node_id=node_id,
node_type="event_loop",
step_index=iteration,
verdict="CONTINUE",
verdict_feedback="No input received (shutdown during wait)",
tool_calls=logged_tool_calls,
llm_text=assistant_text,
input_tokens=turn_tokens.get("input", 0),
output_tokens=turn_tokens.get("output", 0),
latency_ms=iter_latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=node_id,
node_name=ctx.node_spec.name,
node_type="event_loop",
success=True,
total_steps=iteration + 1,
tokens_used=total_input_tokens + total_output_tokens,
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
latency_ms=latency_ms,
exit_status="success",
accept_count=_accept_count,
retry_count=_retry_count,
escalate_count=_escalate_count,
continue_count=_continue_count,
)
return NodeResult(
success=True,
output=accumulator.to_dict(),
@@ -402,75 +588,207 @@ class EventLoopNode(NodeProtocol):
)
logger.info("[%s] iter=%d: 6i should_judge=%s", node_id, iteration, should_judge)
if should_judge:
verdict = await self._evaluate(
ctx,
conversation,
accumulator,
assistant_text,
real_tool_results,
iteration,
)
fb_preview = (verdict.feedback or "")[:200]
logger.info(
"[%s] iter=%d: judge verdict=%s feedback=%r",
node_id,
iteration,
verdict.action,
fb_preview,
)
if verdict.action == "ACCEPT":
# Check for missing output keys
missing = self._get_missing_output_keys(
accumulator, ctx.node_spec.output_keys, ctx.node_spec.nullable_output_keys
if not should_judge:
# Gap C: unjudged iteration — log as CONTINUE
_continue_count += 1
if ctx.runtime_logger:
iter_latency_ms = int((time.time() - iter_start) * 1000)
ctx.runtime_logger.log_step(
node_id=node_id,
node_type="event_loop",
step_index=iteration,
verdict="CONTINUE",
verdict_feedback="Unjudged (judge_every_n_turns skip)",
tool_calls=logged_tool_calls,
llm_text=assistant_text,
input_tokens=turn_tokens.get("input", 0),
output_tokens=turn_tokens.get("output", 0),
latency_ms=iter_latency_ms,
)
if missing and self._judge is not None:
hint = (
f"Missing required output keys: {missing}. "
"Use set_output to provide them."
)
logger.info(
"[%s] iter=%d: ACCEPT but missing keys %s",
node_id,
iteration,
missing,
)
await conversation.add_user_message(hint)
continue
continue
# Write outputs to shared memory
for key, value in accumulator.to_dict().items():
ctx.memory.write(key, value, validate=False)
# Judge evaluation (should_judge is always True here)
verdict = await self._evaluate(
ctx,
conversation,
accumulator,
assistant_text,
real_tool_results,
iteration,
)
fb_preview = (verdict.feedback or "")[:200]
logger.info(
"[%s] iter=%d: judge verdict=%s feedback=%r",
node_id,
iteration,
verdict.action,
fb_preview,
)
await self._publish_loop_completed(stream_id, node_id, iteration + 1)
latency_ms = int((time.time() - start_time) * 1000)
return NodeResult(
if verdict.action == "ACCEPT":
# Check for missing output keys
missing = self._get_missing_output_keys(
accumulator, ctx.node_spec.output_keys, ctx.node_spec.nullable_output_keys
)
if missing and self._judge is not None:
hint = (
f"Missing required output keys: {missing}. Use set_output to provide them."
)
logger.info(
"[%s] iter=%d: ACCEPT but missing keys %s",
node_id,
iteration,
missing,
)
await conversation.add_user_message(hint)
# Gap D: log ACCEPT-with-missing-keys as RETRY
_retry_count += 1
if ctx.runtime_logger:
iter_latency_ms = int((time.time() - iter_start) * 1000)
ctx.runtime_logger.log_step(
node_id=node_id,
node_type="event_loop",
step_index=iteration,
verdict="RETRY",
verdict_feedback=(f"Judge accepted but missing output keys: {missing}"),
tool_calls=logged_tool_calls,
llm_text=assistant_text,
input_tokens=turn_tokens.get("input", 0),
output_tokens=turn_tokens.get("output", 0),
latency_ms=iter_latency_ms,
)
continue
# Exit point 5: Judge ACCEPT — log step + log_node_complete
# Write outputs to shared memory
for key, value in accumulator.to_dict().items():
ctx.memory.write(key, value, validate=False)
await self._publish_loop_completed(stream_id, node_id, iteration + 1)
latency_ms = int((time.time() - start_time) * 1000)
_accept_count += 1
if ctx.runtime_logger:
iter_latency_ms = int((time.time() - iter_start) * 1000)
ctx.runtime_logger.log_step(
node_id=node_id,
node_type="event_loop",
step_index=iteration,
verdict="ACCEPT",
verdict_feedback=verdict.feedback,
tool_calls=logged_tool_calls,
llm_text=assistant_text,
input_tokens=turn_tokens.get("input", 0),
output_tokens=turn_tokens.get("output", 0),
latency_ms=iter_latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=node_id,
node_name=ctx.node_spec.name,
node_type="event_loop",
success=True,
output=accumulator.to_dict(),
total_steps=iteration + 1,
tokens_used=total_input_tokens + total_output_tokens,
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
latency_ms=latency_ms,
exit_status="success",
accept_count=_accept_count,
retry_count=_retry_count,
escalate_count=_escalate_count,
continue_count=_continue_count,
)
return NodeResult(
success=True,
output=accumulator.to_dict(),
tokens_used=total_input_tokens + total_output_tokens,
latency_ms=latency_ms,
)
elif verdict.action == "ESCALATE":
await self._publish_loop_completed(stream_id, node_id, iteration + 1)
latency_ms = int((time.time() - start_time) * 1000)
return NodeResult(
elif verdict.action == "ESCALATE":
# Exit point 6: Judge ESCALATE — log step + log_node_complete
await self._publish_loop_completed(stream_id, node_id, iteration + 1)
latency_ms = int((time.time() - start_time) * 1000)
_escalate_count += 1
if ctx.runtime_logger:
iter_latency_ms = int((time.time() - iter_start) * 1000)
ctx.runtime_logger.log_step(
node_id=node_id,
node_type="event_loop",
step_index=iteration,
verdict="ESCALATE",
verdict_feedback=verdict.feedback,
tool_calls=logged_tool_calls,
llm_text=assistant_text,
input_tokens=turn_tokens.get("input", 0),
output_tokens=turn_tokens.get("output", 0),
latency_ms=iter_latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=node_id,
node_name=ctx.node_spec.name,
node_type="event_loop",
success=False,
error=f"Judge escalated: {verdict.feedback}",
output=accumulator.to_dict(),
total_steps=iteration + 1,
tokens_used=total_input_tokens + total_output_tokens,
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
latency_ms=latency_ms,
exit_status="escalated",
accept_count=_accept_count,
retry_count=_retry_count,
escalate_count=_escalate_count,
continue_count=_continue_count,
)
return NodeResult(
success=False,
error=f"Judge escalated: {verdict.feedback}",
output=accumulator.to_dict(),
tokens_used=total_input_tokens + total_output_tokens,
latency_ms=latency_ms,
)
elif verdict.action == "RETRY":
if verdict.feedback:
await conversation.add_user_message(f"[Judge feedback]: {verdict.feedback}")
continue
elif verdict.action == "RETRY":
_retry_count += 1
if ctx.runtime_logger:
iter_latency_ms = int((time.time() - iter_start) * 1000)
ctx.runtime_logger.log_step(
node_id=node_id,
node_type="event_loop",
step_index=iteration,
verdict="RETRY",
verdict_feedback=verdict.feedback,
tool_calls=logged_tool_calls,
llm_text=assistant_text,
input_tokens=turn_tokens.get("input", 0),
output_tokens=turn_tokens.get("output", 0),
latency_ms=iter_latency_ms,
)
if verdict.feedback:
await conversation.add_user_message(f"[Judge feedback]: {verdict.feedback}")
continue
# 7. Max iterations exhausted
await self._publish_loop_completed(stream_id, node_id, self._config.max_iterations)
latency_ms = int((time.time() - start_time) * 1000)
if ctx.runtime_logger:
ctx.runtime_logger.log_node_complete(
node_id=node_id,
node_name=ctx.node_spec.name,
node_type="event_loop",
success=False,
error=f"Max iterations ({self._config.max_iterations}) reached without acceptance",
total_steps=self._config.max_iterations,
tokens_used=total_input_tokens + total_output_tokens,
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
latency_ms=latency_ms,
exit_status="failure",
accept_count=_accept_count,
retry_count=_retry_count,
escalate_count=_escalate_count,
continue_count=_continue_count,
)
return NodeResult(
success=False,
error=(f"Max iterations ({self._config.max_iterations}) reached without acceptance"),
@@ -501,8 +819,8 @@ class EventLoopNode(NodeProtocol):
async def _await_user_input(self, ctx: NodeContext) -> bool:
"""Block until user input arrives or shutdown is signaled.
Called when a client_facing node produces text without tool calls --
a natural conversational turn boundary.
Called when a client_facing node explicitly calls ask_user() --
an intentional conversational turn boundary.
Returns True if input arrived, False if shutdown was signaled.
"""
@@ -528,16 +846,23 @@ class EventLoopNode(NodeProtocol):
tools: list[Tool],
iteration: int,
accumulator: OutputAccumulator,
) -> tuple[str, list[dict], list[str], dict[str, int]]:
) -> tuple[str, list[dict], list[str], dict[str, int], list[dict], bool]:
"""Run a single LLM turn with streaming and tool execution.
Returns (assistant_text, real_tool_results, outputs_set, token_counts).
Returns (assistant_text, real_tool_results, outputs_set, token_counts, logged_tool_calls,
user_input_requested).
``real_tool_results`` contains only results from actual tools (web_search,
etc.), NOT from the synthetic ``set_output`` tool. ``outputs_set`` lists
the output keys written via ``set_output`` during this turn. This
separation lets the caller treat set_output as a framework concern
rather than a tool-execution concern.
etc.), NOT from the synthetic ``set_output`` or ``ask_user`` tools.
``outputs_set`` lists the output keys written via ``set_output`` during
this turn. ``user_input_requested`` is True if the LLM called
``ask_user`` during this turn. This separation lets the caller treat
synthetic tools as framework concerns rather than tool-execution concerns.
``logged_tool_calls`` accumulates ALL tool calls across inner iterations
(real tools, set_output, and discarded calls) for L3 logging. Unlike
``real_tool_results`` which resets each inner iteration, this list grows
across the entire turn.
"""
stream_id = ctx.node_id
node_id = ctx.node_id
@@ -546,6 +871,10 @@ class EventLoopNode(NodeProtocol):
final_text = ""
# Track output keys set via set_output across all inner iterations
outputs_set_this_turn: list[str] = []
user_input_requested = False
# Accumulate ALL tool calls across inner iterations for L3 logging.
# Unlike real_tool_results (reset each inner iteration), this persists.
logged_tool_calls: list[dict] = []
# Inner tool loop: stream may produce tool calls requiring re-invocation
while True:
@@ -616,7 +945,14 @@ class EventLoopNode(NodeProtocol):
# If no tool calls, turn is complete
if not tool_calls:
return final_text, [], outputs_set_this_turn, token_counts
return (
final_text,
[],
outputs_set_this_turn,
token_counts,
logged_tool_calls,
user_input_requested,
)
# Execute tool calls — separate real tools from set_output
real_tool_results: list[dict] = []
@@ -666,18 +1002,36 @@ class EventLoopNode(NodeProtocol):
pass
await accumulator.set(tc.tool_input["key"], value)
outputs_set_this_turn.append(tc.tool_input["key"])
else:
# --- Real tool execution ---
result = await self._execute_tool(tc)
result = self._truncate_tool_result(result, tc.tool_name)
real_tool_results.append(
logged_tool_calls.append(
{
"tool_use_id": tc.tool_use_id,
"tool_name": tc.tool_name,
"tool_name": "set_output",
"tool_input": tc.tool_input,
"content": result.content,
"is_error": result.is_error,
}
)
elif tc.tool_name == "ask_user":
# --- Framework-level ask_user handling ---
user_input_requested = True
result = ToolResult(
tool_use_id=tc.tool_use_id,
content="Waiting for user input...",
is_error=False,
)
else:
# --- Real tool execution ---
result = await self._execute_tool(tc)
result = self._truncate_tool_result(result, tc.tool_name)
tool_entry = {
"tool_use_id": tc.tool_use_id,
"tool_name": tc.tool_name,
"tool_input": tc.tool_input,
"content": result.content,
"is_error": result.is_error,
}
real_tool_results.append(tool_entry)
logged_tool_calls.append(tool_entry)
# Record tool result in conversation (both real and set_output
# go into the conversation for LLM context continuity)
@@ -723,14 +1077,15 @@ class EventLoopNode(NodeProtocol):
)
# Discarded calls go into real_tool_results so the
# caller sees they were attempted (for judge context).
real_tool_results.append(
{
"tool_use_id": tc.tool_use_id,
"tool_name": tc.tool_name,
"content": discard_msg,
"is_error": True,
}
)
discard_entry = {
"tool_use_id": tc.tool_use_id,
"tool_name": tc.tool_name,
"tool_input": tc.tool_input,
"content": discard_msg,
"is_error": True,
}
real_tool_results.append(discard_entry)
logged_tool_calls.append(discard_entry)
# Prune old tool results NOW to prevent context bloat on the
# next turn. The char-based token estimator underestimates
# actual API tokens, so the standard compaction check in the
@@ -748,7 +1103,14 @@ class EventLoopNode(NodeProtocol):
)
# Limit hit — return from this turn so the judge can
# evaluate instead of looping back for another stream.
return final_text, real_tool_results, outputs_set_this_turn, token_counts
return (
final_text,
real_tool_results,
outputs_set_this_turn,
token_counts,
logged_tool_calls,
user_input_requested,
)
# --- Mid-turn pruning: prevent context blowup within a single turn ---
if conversation.usage_ratio() >= 0.6:
@@ -764,12 +1126,51 @@ class EventLoopNode(NodeProtocol):
conversation.usage_ratio() * 100,
)
# If ask_user was called, return immediately so the outer loop
# can block for user input instead of re-invoking the LLM.
if user_input_requested:
return (
final_text,
real_tool_results,
outputs_set_this_turn,
token_counts,
logged_tool_calls,
user_input_requested,
)
# Tool calls processed -- loop back to stream with updated conversation
# -------------------------------------------------------------------
# set_output synthetic tool
# Synthetic tools: set_output, ask_user
# -------------------------------------------------------------------
def _build_ask_user_tool(self) -> Tool:
"""Build the synthetic ask_user tool for explicit user-input requests.
Client-facing nodes call ask_user() when they need to pause and wait
for user input. Text-only turns WITHOUT ask_user flow through without
blocking, allowing progress updates and summaries to stream freely.
"""
return Tool(
name="ask_user",
description=(
"Call this tool when you need to wait for the user's response. "
"Use it after greeting the user, asking a question, or requesting "
"approval. Do NOT call it when you are just providing a status "
"update or summary that doesn't require a response."
),
parameters={
"type": "object",
"properties": {
"question": {
"type": "string",
"description": "Optional: the question or prompt shown to the user.",
},
},
"required": [],
},
)
def _build_set_output_tool(self, output_keys: list[str] | None) -> Tool | None:
"""Build the synthetic set_output tool for explicit output declaration."""
if not output_keys:
+98
View File
@@ -131,6 +131,7 @@ class GraphExecutor:
parallel_config: ParallelExecutionConfig | None = None,
event_bus: Any | None = None,
stream_id: str = "",
runtime_logger: Any = None,
storage_path: str | Path | None = None,
loop_config: dict[str, Any] | None = None,
):
@@ -149,6 +150,7 @@ class GraphExecutor:
parallel_config: Configuration for parallel execution behavior
event_bus: Optional event bus for emitting node lifecycle events
stream_id: Stream ID for event correlation
runtime_logger: Optional RuntimeLogger for per-graph-run logging
storage_path: Optional base path for conversation persistence
loop_config: Optional EventLoopNode configuration (max_iterations, etc.)
"""
@@ -162,6 +164,7 @@ class GraphExecutor:
self.logger = logging.getLogger(__name__)
self._event_bus = event_bus
self._stream_id = stream_id
self.runtime_logger = runtime_logger
self._storage_path = Path(storage_path) if storage_path else None
self._loop_config = loop_config or {}
@@ -284,6 +287,14 @@ class GraphExecutor:
input_data=input_data or {},
)
if self.runtime_logger:
# Extract session_id from storage_path if available (for unified sessions)
# storage_path format: base_path/sessions/{session_id}/
session_id = ""
if self._storage_path and self._storage_path.name.startswith("session_"):
session_id = self._storage_path.name
self.runtime_logger.start_run(goal_id=goal.id, session_id=session_id)
self.logger.info(f"🚀 Starting execution: {goal.name}")
self.logger.info(f" Goal: {goal.description}")
self.logger.info(f" Entry node: {graph.entry_node}")
@@ -396,6 +407,18 @@ class GraphExecutor:
stream_id=self._stream_id, node_id=current_node_id, iterations=1
)
# Ensure runtime logging has an L2 entry for this node
if self.runtime_logger:
self.runtime_logger.ensure_node_logged(
node_id=node_spec.id,
node_name=node_spec.name,
node_type=node_spec.node_type,
success=result.success,
error=result.error,
tokens_used=result.tokens_used,
latency_ms=result.latency_ms,
)
if result.success:
# Validate output before accepting it.
# Skip for event_loop nodes — their judge system is
@@ -526,6 +549,14 @@ class GraphExecutor:
total_retries_count = sum(node_retry_counts.values())
nodes_failed = list(node_retry_counts.keys())
if self.runtime_logger:
await self.runtime_logger.end_run(
status="failure",
duration_ms=total_latency,
node_path=path,
execution_quality="failed",
)
return ExecutionResult(
success=False,
error=(
@@ -568,6 +599,14 @@ class GraphExecutor:
nodes_failed = [nid for nid, count in node_retry_counts.items() if count > 0]
exec_quality = "degraded" if total_retries_count > 0 else "clean"
if self.runtime_logger:
await self.runtime_logger.end_run(
status="success",
duration_ms=total_latency,
node_path=path,
execution_quality=exec_quality,
)
return ExecutionResult(
success=True,
output=saved_memory,
@@ -691,6 +730,14 @@ class GraphExecutor:
),
)
if self.runtime_logger:
await self.runtime_logger.end_run(
status="success" if exec_quality != "failed" else "failure",
duration_ms=total_latency,
node_path=path,
execution_quality=exec_quality,
)
return ExecutionResult(
success=True,
output=output,
@@ -707,6 +754,10 @@ class GraphExecutor:
)
except Exception as e:
import traceback
stack_trace = traceback.format_exc()
self.runtime.report_problem(
severity="critical",
description=str(e),
@@ -716,10 +767,29 @@ class GraphExecutor:
narrative=f"Failed at step {steps}: {e}",
)
# Log the crashing node to L2 with full stack trace
if self.runtime_logger and node_spec is not None:
self.runtime_logger.ensure_node_logged(
node_id=node_spec.id,
node_name=node_spec.name,
node_type=node_spec.node_type,
success=False,
error=str(e),
stacktrace=stack_trace,
)
# Calculate quality metrics even for exceptions
total_retries_count = sum(node_retry_counts.values())
nodes_failed = list(node_retry_counts.keys())
if self.runtime_logger:
await self.runtime_logger.end_run(
status="failure",
duration_ms=total_latency,
node_path=path,
execution_quality="failed",
)
return ExecutionResult(
success=False,
error=str(e),
@@ -770,6 +840,7 @@ class GraphExecutor:
goal_context=goal.to_prompt_context(),
goal=goal, # Pass Goal object for LLM-powered routers
max_tokens=max_tokens,
runtime_logger=self.runtime_logger,
)
# Valid node types - no ambiguous "llm" type allowed
@@ -1171,6 +1242,18 @@ class GraphExecutor:
result = await node_impl.execute(ctx)
last_result = result
# Ensure L2 entry for this branch node
if self.runtime_logger:
self.runtime_logger.ensure_node_logged(
node_id=node_spec.id,
node_name=node_spec.name,
node_type=node_spec.node_type,
success=result.success,
error=result.error,
tokens_used=result.tokens_used,
latency_ms=result.latency_ms,
)
# Emit node-completed event (skip event_loop nodes)
if self._event_bus and node_spec.node_type != "event_loop":
await self._event_bus.emit_node_loop_completed(
@@ -1206,9 +1289,24 @@ class GraphExecutor:
return branch, last_result
except Exception as e:
import traceback
stack_trace = traceback.format_exc()
branch.status = "failed"
branch.error = str(e)
self.logger.error(f" ✗ Branch {branch.node_id}: exception - {e}")
# Log the crashing branch node to L2 with full stack trace
if self.runtime_logger and node_spec is not None:
self.runtime_logger.ensure_node_logged(
node_id=node_spec.id,
node_name=node_spec.name,
node_type=node_spec.node_type,
success=False,
error=str(e),
stacktrace=stack_trace,
)
return branch, e
# Execute all branches concurrently
+155 -4
View File
@@ -477,6 +477,9 @@ class NodeContext:
attempt: int = 1
max_attempts: int = 3
# Runtime logging (optional)
runtime_logger: Any = None # RuntimeLogger | None — uses Any to avoid import
@dataclass
class NodeResult:
@@ -854,6 +857,8 @@ Keep the same JSON structure but with shorter content values.
)
start = time.time()
_step_index = 0
_captured_tool_calls: list[dict] = []
try:
# Build messages
@@ -893,6 +898,16 @@ Keep the same JSON structure but with shorter content values.
if len(str(result.content)) > 150:
result_str += "..."
logger.info(f" ✓ Tool result: {result_str}")
# Capture for runtime logging
_captured_tool_calls.append(
{
"tool_use_id": tool_use.id,
"tool_name": tool_use.name,
"tool_input": tool_use.input,
"content": result.content,
"is_error": result.is_error,
}
)
return result
response = ctx.llm.complete_with_tools(
@@ -1072,6 +1087,29 @@ Keep the same JSON structure but with shorter content values.
f"Pydantic validation failed after "
f"{max_validation_retries} retries: {err}"
)
if ctx.runtime_logger:
ctx.runtime_logger.log_step(
node_id=ctx.node_id,
node_type=ctx.node_spec.node_type,
step_index=_step_index,
llm_text=response.content,
tool_calls=_captured_tool_calls,
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
latency_ms=latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=ctx.node_id,
node_name=ctx.node_spec.name,
node_type=ctx.node_spec.node_type,
success=False,
error=error_msg,
total_steps=_step_index + 1,
tokens_used=total_input_tokens + total_output_tokens,
input_tokens=total_input_tokens,
output_tokens=total_output_tokens,
latency_ms=latency_ms,
)
return NodeResult(
success=False,
error=error_msg,
@@ -1161,12 +1199,36 @@ Keep the same JSON structure but with shorter content values.
)
# Return failure instead of writing garbage to all keys
_extraction_error = (
f"Output extraction failed: {e}. LLM returned non-JSON response. "
f"Expected keys: {ctx.node_spec.output_keys}"
)
if ctx.runtime_logger:
ctx.runtime_logger.log_step(
node_id=ctx.node_id,
node_type=ctx.node_spec.node_type,
step_index=_step_index,
llm_text=response.content,
tool_calls=_captured_tool_calls,
input_tokens=response.input_tokens,
output_tokens=response.output_tokens,
latency_ms=latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=ctx.node_id,
node_name=ctx.node_spec.name,
node_type=ctx.node_spec.node_type,
success=False,
error=_extraction_error,
total_steps=_step_index + 1,
tokens_used=response.input_tokens + response.output_tokens,
input_tokens=response.input_tokens,
output_tokens=response.output_tokens,
latency_ms=latency_ms,
)
return NodeResult(
success=False,
error=(
f"Output extraction failed: {e}. LLM returned non-JSON response. "
f"Expected keys: {ctx.node_spec.output_keys}"
),
error=_extraction_error,
output={},
tokens_used=response.input_tokens + response.output_tokens,
latency_ms=latency_ms,
@@ -1184,6 +1246,29 @@ Keep the same JSON structure but with shorter content values.
ctx.memory.write(key, stripped_content, validate=False)
output[key] = stripped_content
if ctx.runtime_logger:
ctx.runtime_logger.log_step(
node_id=ctx.node_id,
node_type=ctx.node_spec.node_type,
step_index=_step_index,
llm_text=response.content,
tool_calls=_captured_tool_calls,
input_tokens=response.input_tokens,
output_tokens=response.output_tokens,
latency_ms=latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=ctx.node_id,
node_name=ctx.node_spec.name,
node_type=ctx.node_spec.node_type,
success=True,
total_steps=_step_index + 1,
tokens_used=response.input_tokens + response.output_tokens,
input_tokens=response.input_tokens,
output_tokens=response.output_tokens,
latency_ms=latency_ms,
)
return NodeResult(
success=True,
output=output,
@@ -1199,6 +1284,15 @@ Keep the same JSON structure but with shorter content values.
error=str(e),
latency_ms=latency_ms,
)
if ctx.runtime_logger:
ctx.runtime_logger.log_node_complete(
node_id=ctx.node_id,
node_name=ctx.node_spec.name,
node_type=ctx.node_spec.node_type,
success=False,
error=str(e),
latency_ms=latency_ms,
)
return NodeResult(success=False, error=str(e), latency_ms=latency_ms)
def _parse_output(self, content: str, node_spec: NodeSpec) -> dict[str, Any]:
@@ -1591,6 +1685,9 @@ class RouterNode(NodeProtocol):
async def execute(self, ctx: NodeContext) -> NodeResult:
"""Execute routing logic."""
import time as _time
start = _time.time()
ctx.runtime.set_node(ctx.node_id)
# Build options from routes
@@ -1635,10 +1732,30 @@ class RouterNode(NodeProtocol):
summary=f"Routing to {chosen_route[1]}",
)
latency_ms = int((_time.time() - start) * 1000)
if ctx.runtime_logger:
ctx.runtime_logger.log_step(
node_id=ctx.node_id,
node_type="router",
step_index=0,
llm_text=f"Route: {chosen_route[0]} -> {chosen_route[1]}",
latency_ms=latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=ctx.node_id,
node_name=ctx.node_spec.name,
node_type="router",
success=True,
total_steps=1,
latency_ms=latency_ms,
)
return NodeResult(
success=True,
next_node=chosen_route[1],
route_reason=f"Chose route: {chosen_route[0]}",
latency_ms=latency_ms,
)
async def _llm_route(
@@ -1800,6 +1917,22 @@ class FunctionNode(NodeProtocol):
else:
output = {"result": result}
if ctx.runtime_logger:
ctx.runtime_logger.log_step(
node_id=ctx.node_id,
node_type="function",
step_index=0,
latency_ms=latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=ctx.node_id,
node_name=ctx.node_spec.name,
node_type="function",
success=True,
total_steps=1,
latency_ms=latency_ms,
)
return NodeResult(success=True, output=output, latency_ms=latency_ms)
except Exception as e:
@@ -1810,4 +1943,22 @@ class FunctionNode(NodeProtocol):
error=str(e),
latency_ms=latency_ms,
)
if ctx.runtime_logger:
ctx.runtime_logger.log_step(
node_id=ctx.node_id,
node_type="function",
step_index=0,
latency_ms=latency_ms,
)
ctx.runtime_logger.log_node_complete(
node_id=ctx.node_id,
node_name=ctx.node_spec.name,
node_type="function",
success=False,
error=str(e),
total_steps=1,
latency_ms=latency_ms,
)
return NodeResult(success=False, error=str(e), latency_ms=latency_ms)
+25 -2
View File
@@ -585,7 +585,11 @@ def add_node(
str, "JSON object mapping conditions to target node IDs for router nodes"
] = "{}",
client_facing: Annotated[
bool, "If True, node streams output to user and blocks for input between turns"
bool,
"If True, an ask_user() tool is injected so the LLM can explicitly request user input. "
"The node blocks ONLY when ask_user() is called — text-only turns stream freely. "
"Set True for nodes that interact with users (intake, review, approval). "
"Nodes that do autonomous work (research, data processing, API calls) MUST be False.",
] = False,
nullable_output_keys: Annotated[
str, "JSON array of output keys that may remain unset (for mutually exclusive outputs)"
@@ -666,6 +670,14 @@ def add_node(
"EventLoopNode supports tool use, streaming, and judge-based evaluation."
)
# Warn about client_facing on nodes with tools (likely autonomous work)
if node_type == "event_loop" and client_facing and tools_list:
warnings.append(
f"Node '{node_id}' is client_facing=True but has tools {tools_list}. "
"Nodes with tools typically do autonomous work and should be "
"client_facing=False. Only set True if this node needs user approval."
)
# nullable_output_keys must be a subset of output_keys
if nullable_output_keys_list:
invalid_nullable = [k for k in nullable_output_keys_list if k not in output_keys_list]
@@ -1376,6 +1388,17 @@ def validate_graph() -> str:
f"Node '{dn['node_id']}' uses deprecated type '{dn['type']}'. Use 'event_loop' instead."
)
# Warn if all event_loop nodes are client_facing (common misconfiguration)
el_nodes = [n for n in session.nodes if n.node_type == "event_loop"]
cf_el_nodes = [n for n in el_nodes if n.client_facing]
if len(el_nodes) > 1 and len(cf_el_nodes) == len(el_nodes):
warnings.append(
f"ALL {len(el_nodes)} event_loop nodes are client_facing=True. "
"This injects ask_user() on every node. Only nodes that need user "
"interaction (intake, review, approval) should be client_facing. Set "
"client_facing=False on autonomous processing nodes."
)
# Collect summary info
event_loop_nodes = [n.id for n in session.nodes if n.node_type == "event_loop"]
client_facing_nodes = [n.id for n in session.nodes if n.client_facing]
@@ -2213,7 +2236,7 @@ def test_node(
)
else:
cf_note = (
"Node is client-facing: will block for user input between turns. "
"Node is client-facing: has ask_user() tool, blocks when LLM calls it. "
if node_spec.client_facing
else ""
)
+335 -31
View File
@@ -33,11 +33,6 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
type=str,
help="Input context from JSON file",
)
run_parser.add_argument(
"--mock",
action="store_true",
help="Run in mock mode (no real LLM calls)",
)
run_parser.add_argument(
"--output",
"-o",
@@ -186,6 +181,21 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
)
shell_parser.set_defaults(func=cmd_shell)
# tui command (interactive agent dashboard)
tui_parser = subparsers.add_parser(
"tui",
help="Launch interactive TUI dashboard",
description="Browse available agents and launch the terminal dashboard.",
)
tui_parser.add_argument(
"--model",
"-m",
type=str,
default=None,
help="LLM model to use (any LiteLLM-compatible name)",
)
tui_parser.set_defaults(func=cmd_tui)
def cmd_run(args: argparse.Namespace) -> int:
"""Run an exported agent."""
@@ -228,7 +238,6 @@ def cmd_run(args: argparse.Namespace) -> int:
try:
runner = AgentRunner.load(
args.agent_path,
mock_mode=args.mock,
model=args.model,
enable_tui=True,
)
@@ -266,7 +275,6 @@ def cmd_run(args: argparse.Namespace) -> int:
try:
runner = AgentRunner.load(
args.agent_path,
mock_mode=args.mock,
model=args.model,
enable_tui=False,
)
@@ -985,8 +993,215 @@ def cmd_shell(args: argparse.Namespace) -> int:
return 0
def cmd_tui(args: argparse.Namespace) -> int:
"""Browse agents and launch the interactive TUI dashboard."""
import logging
from framework.runner import AgentRunner
from framework.tui.app import AdenTUI
logging.basicConfig(level=logging.WARNING, format="%(message)s")
exports_dir = Path("exports")
examples_dir = Path("examples/templates")
has_exports = _has_agents(exports_dir)
has_examples = _has_agents(examples_dir)
if not has_exports and not has_examples:
print("No agents found in exports/ or examples/templates/", file=sys.stderr)
return 1
# Determine which directory to browse
if has_exports and has_examples:
print("\nAgent sources:\n")
print(" 1. Your Agents (exports/)")
print(" 2. Sample Agents (examples/templates/)")
print()
try:
choice = input("Select source (number): ").strip()
if choice == "1":
agents_dir = exports_dir
elif choice == "2":
agents_dir = examples_dir
else:
print("Invalid selection")
return 1
except (EOFError, KeyboardInterrupt):
print()
return 1
elif has_exports:
agents_dir = exports_dir
else:
agents_dir = examples_dir
# Let user pick an agent
agent_path = _select_agent(agents_dir)
if not agent_path:
return 1
# Launch TUI (same pattern as cmd_run --tui)
async def run_with_tui():
try:
runner = AgentRunner.load(
agent_path,
model=args.model,
enable_tui=True,
)
except Exception as e:
print(f"Error loading agent: {e}")
return
if runner._agent_runtime is None:
runner._setup()
if runner._agent_runtime and not runner._agent_runtime.is_running:
await runner._agent_runtime.start()
app = AdenTUI(runner._agent_runtime)
try:
await app.run_async()
except Exception as e:
import traceback
traceback.print_exc()
print(f"TUI error: {e}")
await runner.cleanup_async()
asyncio.run(run_with_tui())
print("TUI session ended.")
return 0
def _extract_python_agent_metadata(agent_path: Path) -> tuple[str, str]:
"""Extract name and description from a Python-based agent's config.py.
Uses AST parsing to safely extract values without executing code.
Returns (name, description) tuple, with fallbacks if parsing fails.
"""
import ast
config_path = agent_path / "config.py"
fallback_name = agent_path.name.replace("_", " ").title()
fallback_desc = "(Python-based agent)"
if not config_path.exists():
return fallback_name, fallback_desc
try:
with open(config_path) as f:
tree = ast.parse(f.read())
# Find AgentMetadata class definition
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef) and node.name == "AgentMetadata":
name = fallback_name
desc = fallback_desc
# Extract default values from class body
for item in node.body:
if isinstance(item, ast.AnnAssign) and isinstance(item.target, ast.Name):
field_name = item.target.id
if item.value:
# Handle simple string constants
if isinstance(item.value, ast.Constant):
if field_name == "name":
name = item.value.value
elif field_name == "description":
desc = item.value.value
# Handle parenthesized multi-line strings (concatenated)
elif isinstance(item.value, ast.JoinedStr):
# f-strings - skip, use fallback
pass
elif isinstance(item.value, ast.BinOp):
# String concatenation with + - try to evaluate
try:
result = _eval_string_binop(item.value)
if result and field_name == "name":
name = result
elif result and field_name == "description":
desc = result
except Exception:
pass
return name, desc
return fallback_name, fallback_desc
except Exception:
return fallback_name, fallback_desc
def _eval_string_binop(node) -> str | None:
"""Recursively evaluate a BinOp of string constants."""
import ast
if isinstance(node, ast.Constant) and isinstance(node.value, str):
return node.value
elif isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
left = _eval_string_binop(node.left)
right = _eval_string_binop(node.right)
if left is not None and right is not None:
return left + right
return None
def _is_valid_agent_dir(path: Path) -> bool:
"""Check if a directory contains a valid agent (agent.json or agent.py)."""
if not path.is_dir():
return False
return (path / "agent.json").exists() or (path / "agent.py").exists()
def _has_agents(directory: Path) -> bool:
"""Check if a directory contains any valid agents (folders with agent.json or agent.py)."""
if not directory.exists():
return False
return any(_is_valid_agent_dir(p) for p in directory.iterdir())
def _getch() -> str:
"""Read a single character from stdin without waiting for Enter."""
try:
if sys.platform == "win32":
import msvcrt
ch = msvcrt.getch()
return ch.decode("utf-8", errors="ignore")
else:
import termios
import tty
fd = sys.stdin.fileno()
old_settings = termios.tcgetattr(fd)
try:
tty.setraw(fd)
ch = sys.stdin.read(1)
finally:
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
return ch
except Exception:
return ""
def _read_key() -> str:
"""Read a key, handling arrow key escape sequences."""
ch = _getch()
if ch == "\x1b": # Escape sequence start
ch2 = _getch()
if ch2 == "[":
ch3 = _getch()
if ch3 == "C": # Right arrow
return "RIGHT"
elif ch3 == "D": # Left arrow
return "LEFT"
return ch
def _select_agent(agents_dir: Path) -> str | None:
"""Let user select an agent from available agents."""
"""Let user select an agent from available agents with pagination."""
AGENTS_PER_PAGE = 10
if not agents_dir.exists():
print(f"Directory not found: {agents_dir}", file=sys.stderr)
# fixes issue #696, creates an exports folder if it does not exist
@@ -996,37 +1211,126 @@ def _select_agent(agents_dir: Path) -> str | None:
agents = []
for path in agents_dir.iterdir():
if path.is_dir() and (path / "agent.json").exists():
if _is_valid_agent_dir(path):
agents.append(path)
if not agents:
print(f"No agents found in {agents_dir}", file=sys.stderr)
return None
print(f"\nAvailable agents in {agents_dir}:\n")
for i, agent_path in enumerate(agents, 1):
# Pagination setup
page = 0
total_pages = (len(agents) + AGENTS_PER_PAGE - 1) // AGENTS_PER_PAGE
while True:
start_idx = page * AGENTS_PER_PAGE
end_idx = min(start_idx + AGENTS_PER_PAGE, len(agents))
page_agents = agents[start_idx:end_idx]
# Show page header with indicator
if total_pages > 1:
print(f"\nAvailable agents in {agents_dir} (Page {page + 1}/{total_pages}):\n")
else:
print(f"\nAvailable agents in {agents_dir}:\n")
# Display agents for current page (with global numbering)
for i, agent_path in enumerate(page_agents, start_idx + 1):
try:
agent_json = agent_path / "agent.json"
if agent_json.exists():
with open(agent_json) as f:
data = json.load(f)
agent_meta = data.get("agent", {})
name = agent_meta.get("name", agent_path.name)
desc = agent_meta.get("description", "")
else:
# Python-based agent - extract from config.py
name, desc = _extract_python_agent_metadata(agent_path)
desc = desc[:50] + "..." if len(desc) > 50 else desc
print(f" {i}. {name}")
print(f" {desc}")
except Exception as e:
print(f" {i}. {agent_path.name} (error: {e})")
# Build navigation options
nav_options = []
if total_pages > 1:
nav_options.append("←/→ or p/n=navigate")
nav_options.append("q=quit")
print()
if total_pages > 1:
print(f" [{', '.join(nav_options)}]")
print()
# Show prompt
print("Select agent (number), use arrows to navigate, or q to quit: ", end="", flush=True)
try:
from framework.runner import AgentRunner
key = _read_key()
runner = AgentRunner.load(agent_path)
info = runner.info()
desc = info.description[:50] + "..." if len(info.description) > 50 else info.description
print(f" {i}. {info.name}")
print(f" {desc}")
runner.cleanup()
except Exception as e:
print(f" {i}. {agent_path.name} (error: {e})")
if key == "RIGHT" and page < total_pages - 1:
page += 1
print() # Newline before redrawing
elif key == "LEFT" and page > 0:
page -= 1
print()
elif key == "q":
print()
return None
elif key in ("n", ">") and page < total_pages - 1:
page += 1
print()
elif key in ("p", "<") and page > 0:
page -= 1
print()
elif key.isdigit():
# Build number with support for backspace
buffer = key
print(key, end="", flush=True)
print()
try:
choice = input("Select agent (number): ").strip()
idx = int(choice) - 1
if 0 <= idx < len(agents):
return str(agents[idx])
print("Invalid selection")
return None
except (ValueError, EOFError, KeyboardInterrupt):
return None
while True:
ch = _getch()
if ch in ("\r", "\n"):
# Enter pressed - submit
print()
break
elif ch in ("\x7f", "\x08"):
# Backspace (DEL or BS)
if buffer:
buffer = buffer[:-1]
# Erase character: move back, print space, move back
print("\b \b", end="", flush=True)
elif ch.isdigit():
buffer += ch
print(ch, end="", flush=True)
elif ch == "\x1b":
# Escape - cancel input
print()
buffer = ""
break
elif ch == "\x03":
# Ctrl+C
print()
return None
# Ignore other characters
if buffer:
try:
idx = int(buffer) - 1
if 0 <= idx < len(agents):
return str(agents[idx])
print("Invalid selection")
except ValueError:
print("Invalid input")
elif key == "\r" or key == "\n":
print() # Just pressed enter, redraw
else:
print()
print("Invalid input")
except (EOFError, KeyboardInterrupt):
print()
return None
def _interactive_multi(agents_dir: Path) -> int:
@@ -1042,7 +1346,7 @@ def _interactive_multi(agents_dir: Path) -> int:
# Register all agents
for path in agents_dir.iterdir():
if path.is_dir() and (path / "agent.json").exists():
if _is_valid_agent_dir(path):
try:
orchestrator.register(path.name, path)
agent_count += 1
+10
View File
@@ -19,6 +19,8 @@ from framework.runner.tool_registry import ToolRegistry
from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
from framework.runtime.core import Runtime
from framework.runtime.execution_stream import EntryPointSpec
from framework.runtime.runtime_log_store import RuntimeLogStore
from framework.runtime.runtime_logger import RuntimeLogger
if TYPE_CHECKING:
from framework.runner.protocol import AgentMessage, CapabilityResponse
@@ -691,6 +693,10 @@ class AgentRunner:
# Create runtime
self._runtime = Runtime(storage_path=self._storage_path)
# Create runtime logger
log_store = RuntimeLogStore(base_path=self._storage_path / "runtime_logs")
runtime_logger = RuntimeLogger(store=log_store, agent_id=self.graph.id)
# Create executor
self._executor = GraphExecutor(
runtime=self._runtime,
@@ -698,6 +704,7 @@ class AgentRunner:
tools=tools,
tool_executor=tool_executor,
approval_callback=self._approval_callback,
runtime_logger=runtime_logger,
loop_config=self.graph.loop_config,
)
@@ -732,6 +739,8 @@ class AgentRunner:
)
# Create AgentRuntime with all entry points
log_store = RuntimeLogStore(base_path=self._storage_path / "runtime_logs")
self._agent_runtime = create_agent_runtime(
graph=self.graph,
goal=self.goal,
@@ -740,6 +749,7 @@ class AgentRunner:
llm=self._llm,
tools=tools,
tool_executor=tool_executor,
runtime_log_store=log_store,
)
async def run(
+688
View File
@@ -0,0 +1,688 @@
# Runtime Logging System
## Overview
The Hive framework uses a **three-level observability system** for tracking agent execution at different granularities:
- **L1 (Summary)**: High-level run outcomes - success/failure, execution quality, attention flags
- **L2 (Details)**: Per-node completion details - retries, verdicts, latency, attention reasons
- **L3 (Tool Logs)**: Step-by-step execution - tool calls, LLM responses, judge feedback
This layered approach enables efficient debugging: start with L1 to identify problematic runs, drill into L2 to find failing nodes, and analyze L3 for root cause details.
---
## Storage Architecture
### Current Structure (Unified Sessions)
**Default since 2026-02-06**
```
~/.hive/{agent_name}/
└── sessions/
└── session_YYYYMMDD_HHMMSS_{uuid}/
├── state.json # Session state and metadata
├── logs/ # Runtime logs (L1/L2/L3)
│ ├── summary.json # L1: Run outcome
│ ├── details.jsonl # L2: Per-node results
│ └── tool_logs.jsonl # L3: Step-by-step execution
├── conversations/ # Per-node EventLoop state
└── data/ # Spillover artifacts
```
**Key characteristics:**
- All session data colocated in one directory
- Consistent ID format: `session_YYYYMMDD_HHMMSS_{short_uuid}`
- Logs written incrementally (JSONL for L2/L3)
- Single source of truth: `state.json`
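As a quick orientation, the following minimal sketch (not framework code; the agent name is a placeholder and the paths follow the layout above) lists sessions and prints each run's L1 status:
```python
import json
from pathlib import Path

# Placeholder agent directory -- substitute your agent's name.
agent_dir = Path.home() / ".hive" / "my_agent"

for session_dir in sorted((agent_dir / "sessions").glob("session_*")):
    summary_path = session_dir / "logs" / "summary.json"
    if not summary_path.exists():
        continue  # run still in progress, or logging disabled
    summary = json.loads(summary_path.read_text())
    print(session_dir.name, summary.get("status"), summary.get("execution_quality"))
```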
### Legacy Structure (Deprecated)
**Read-only for backward compatibility**
```
~/.hive/{agent_name}/
├── runtime_logs/
│ └── runs/
│ └── {run_id}/
│ ├── summary.json # L1
│ ├── details.jsonl # L2
│ └── tool_logs.jsonl # L3
├── sessions/
│ └── exec_{stream_id}_{uuid}/
│ ├── conversations/
│ └── data/
├── runs/ # Deprecated
│ └── run_start_*.json
└── summaries/ # Deprecated
└── run_start_*.json
```
**Migration status:**
- ✅ New sessions write to unified structure only
- ✅ Old sessions remain readable
- ❌ No new writes to `runs/`, `summaries/`, `runtime_logs/runs/`
- ⚠️ Deprecation warnings emitted when reading old locations
---
## Components
### RuntimeLogger
**Location:** `core/framework/runtime/runtime_logger.py`
**Responsibilities:**
- Receives execution events from GraphExecutor
- Tracks per-node execution details
- Aggregates attention flags
- Coordinates with RuntimeLogStore
**Key methods:**
```python
def start_run(goal_id: str, session_id: str = "") -> str:
"""Initialize a new run. Uses session_id as run_id if provided."""
def log_step(node_id: str, step_index: int, tool_calls: list, ...):
"""Record one LLM step (L3). Appends to tool_logs.jsonl immediately."""
def log_node_complete(node_id: str, exit_status: str, ...):
"""Record node completion (L2). Appends to details.jsonl immediately."""
async def end_run(status: str):
"""Finalize run, aggregate L2→L1, write summary.json."""
```
**Attention flag triggers:**
```python
# From runtime_logger.py:190-203
needs_attention = any([
retry_count > 3,
escalate_count > 2,
latency_ms > 60000,
tokens_used > 100000,
total_steps > 20,
])
```
### RuntimeLogStore
**Location:** `core/framework/runtime/runtime_log_store.py`
**Responsibilities:**
- Manages log file I/O
- Handles both old and new storage paths
- Provides incremental append for L2/L3 (crash-safe)
- Atomic writes for L1
**Storage path resolution:**
```python
def _get_run_dir(run_id: str) -> Path:
"""Determine log directory based on run_id format.
- session_* → {storage_root}/sessions/{run_id}/logs/
- Other → {base_path}/runtime_logs/runs/{run_id}/ (deprecated)
"""
```
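A minimal sketch of that resolution rule as described above (illustrative only; the real method lives on `RuntimeLogStore` and its attribute names may differ):
```python
from pathlib import Path

def resolve_run_dir(storage_root: Path, base_path: Path, run_id: str) -> Path:
    # Unified sessions keep logs next to the session's other data.
    if run_id.startswith("session_"):
        return storage_root / "sessions" / run_id / "logs"
    # Anything else falls back to the deprecated per-run layout.
    return base_path / "runtime_logs" / "runs" / run_id
```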
**Key methods:**
```python
def ensure_run_dir(run_id: str):
"""Create log directory immediately at start_run()."""
def append_step(run_id: str, step: NodeStepLog):
"""Append L3 entry to tool_logs.jsonl. Thread-safe sync write."""
def append_node_detail(run_id: str, detail: NodeDetail):
"""Append L2 entry to details.jsonl. Thread-safe sync write."""
async def save_summary(run_id: str, summary: RunSummaryLog):
"""Write L1 summary.json atomically at end_run()."""
```
**File format:**
- **L1 (summary.json)**: Standard JSON, written once at end
- **L2 (details.jsonl)**: JSONL (one object per line), appended per node
- **L3 (tool_logs.jsonl)**: JSONL (one object per line), appended per step
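The two write styles can be illustrated with a short sketch (not the store's actual code): JSONL entries are appended one line at a time, while the L1 summary is written to a temporary file and renamed so readers never observe a partial file.
```python
import json
import os
from pathlib import Path

def append_jsonl(path: Path, record: dict) -> None:
    # One JSON object per line; each append survives a crash mid-run.
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record) + "\n")

def write_json_atomic(path: Path, data: dict) -> None:
    # Write to a sibling temp file, then rename atomically.
    tmp = path.with_suffix(".tmp")
    tmp.write_text(json.dumps(data, indent=2), encoding="utf-8")
    os.replace(tmp, path)
```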
### Runtime Log Schemas
**Location:** `core/framework/runtime/runtime_log_schemas.py`
**L1: RunSummaryLog**
```python
@dataclass
class RunSummaryLog:
run_id: str
goal_id: str
status: str # "success", "failure", "degraded", "in_progress"
started_at: str # ISO 8601
ended_at: str | None
needs_attention: bool
attention_summary: AttentionSummary
total_nodes_executed: int
nodes_with_failures: list[str]
execution_quality: str # "clean", "degraded", "failed"
total_latency_ms: int
# ... additional metrics
```
**L2: NodeDetail**
```python
@dataclass
class NodeDetail:
node_id: str
exit_status: str # "success", "escalate", "no_valid_edge"
retry_count: int
verdict_counts: dict[str, int] # {ACCEPT: 1, RETRY: 3, ...}
total_steps: int
latency_ms: int
needs_attention: bool
attention_reasons: list[str]
# ... tool error tracking, token counts
```
**L3: NodeStepLog**
```python
@dataclass
class NodeStepLog:
node_id: str
step_index: int
tool_calls: list[dict]
tool_results: list[dict]
verdict: str # "ACCEPT", "RETRY", "ESCALATE", "CONTINUE"
verdict_feedback: str
llm_response_text: str
tokens_used: int
latency_ms: int
# ... detailed execution state
```
---
## Querying Logs (MCP Tools)
### Tools Location
**MCP Server:** `tools/src/aden_tools/tools/runtime_logs_tool/runtime_logs_tool.py`
Three MCP tools provide access to the logging system:
### L1: query_runtime_logs
**Purpose:** Find problematic runs
```python
query_runtime_logs(
agent_work_dir: str, # e.g., "~/.hive/twitter_outreach"
status: str = "", # "needs_attention", "success", "failure", "degraded"
limit: int = 20
) -> dict # {"runs": [...], "total": int}
```
**Returns:**
```json
{
"runs": [
{
"run_id": "session_20260206_115718_e22339c5",
"status": "degraded",
"needs_attention": true,
"attention_summary": {
"total_attention_flags": 3,
"categories": ["missing_outputs", "retry_loops"]
},
"started_at": "2026-02-06T11:57:18Z"
}
],
"total": 1
}
```
**Common queries:**
```python
# Find all problematic runs
query_runtime_logs(agent_work_dir, status="needs_attention")
# Get recent runs regardless of status
query_runtime_logs(agent_work_dir, limit=10)
# Check for failures
query_runtime_logs(agent_work_dir, status="failure")
```
### L2: query_runtime_log_details
**Purpose:** Identify which nodes failed
```python
query_runtime_log_details(
agent_work_dir: str,
run_id: str, # From L1 query
needs_attention_only: bool = False,
node_id: str = "" # Filter to specific node
) -> dict # {"run_id": str, "nodes": [...]}
```
**Returns:**
```json
{
"run_id": "session_20260206_115718_e22339c5",
"nodes": [
{
"node_id": "intake-collector",
"exit_status": "escalate",
"retry_count": 5,
"verdict_counts": {"RETRY": 5, "ESCALATE": 1},
"attention_reasons": ["high_retry_count", "missing_outputs"],
"total_steps": 8,
"latency_ms": 12500,
"needs_attention": true
}
]
}
```
**Common queries:**
```python
# Get all problematic nodes
query_runtime_log_details(agent_work_dir, run_id, needs_attention_only=True)
# Analyze specific node across run
query_runtime_log_details(agent_work_dir, run_id, node_id="intake-collector")
# Full node breakdown
query_runtime_log_details(agent_work_dir, run_id)
```
### L3: query_runtime_log_raw
**Purpose:** Root cause analysis
```python
query_runtime_log_raw(
agent_work_dir: str,
run_id: str,
step_index: int = -1, # Specific step or -1 for all
node_id: str = "" # Filter to specific node
) -> dict # {"run_id": str, "steps": [...]}
```
**Returns:**
```json
{
"run_id": "session_20260206_115718_e22339c5",
"steps": [
{
"node_id": "intake-collector",
"step_index": 3,
"tool_calls": [
{
"tool": "web_search",
"args": {"query": "@RomuloNevesOf"}
}
],
"tool_results": [
{
"status": "success",
"data": "..."
}
],
"verdict": "RETRY",
"verdict_feedback": "Missing required output 'twitter_handles'. You found the handle but didn't call set_output.",
"llm_response_text": "I found the Twitter profile...",
"tokens_used": 1234,
"latency_ms": 2500
}
]
}
```
**Common queries:**
```python
# All steps for a problematic node
query_runtime_log_raw(agent_work_dir, run_id, node_id="intake-collector")
# Specific step analysis
query_runtime_log_raw(agent_work_dir, run_id, step_index=5)
# Full execution trace
query_runtime_log_raw(agent_work_dir, run_id)
```
---
## Usage Patterns
### Pattern 1: Top-Down Investigation
**Use case:** Debug a failing agent
```python
# 1. Find problematic runs (L1)
result = query_runtime_logs(
agent_work_dir="~/.hive/twitter_outreach",
status="needs_attention"
)
run_id = result["runs"][0]["run_id"]
# 2. Identify failing nodes (L2)
details = query_runtime_log_details(
agent_work_dir="~/.hive/twitter_outreach",
run_id=run_id,
needs_attention_only=True
)
problem_node = details["nodes"][0]["node_id"]
# 3. Analyze root cause (L3)
raw = query_runtime_log_raw(
agent_work_dir="~/.hive/twitter_outreach",
run_id=run_id,
node_id=problem_node
)
# Examine verdict_feedback, tool_results, etc.
```
### Pattern 2: Node-Specific Debugging
**Use case:** Investigate why a specific node keeps failing
```python
# Get recent runs
runs = query_runtime_logs("~/.hive/my_agent", limit=10)
# For each run, check specific node
for run in runs["runs"]:
node_details = query_runtime_log_details(
"~/.hive/my_agent",
run["run_id"],
node_id="problematic-node"
)
# Analyze retry patterns, error types
```
### Pattern 3: Real-Time Monitoring
**Use case:** Watch for issues during development
```python
import time
while True:
result = query_runtime_logs(
agent_work_dir="~/.hive/my_agent",
status="needs_attention",
limit=1
)
if result["total"] > 0:
new_issue = result["runs"][0]
print(f"⚠️ New issue detected: {new_issue['run_id']}")
# Alert or drill into L2/L3
time.sleep(10) # Poll every 10 seconds
```
---
## Integration Points
### GraphExecutor → RuntimeLogger
**Location:** `core/framework/graph/executor.py`
```python
# Executor creates logger and passes session_id
logger = RuntimeLogger(store, agent_id)
run_id = logger.start_run(goal_id, session_id=execution_id)
# During execution
logger.log_step(node_id, step_index, tool_calls, ...)
logger.log_node_complete(node_id, exit_status, ...)
# At completion
await logger.end_run(status="success")
```
### EventLoopNode → RuntimeLogger
**Location:** `core/framework/graph/event_loop_node.py`
```python
# EventLoopNode logs each step
self._logger.log_step(
node_id=self.id,
step_index=step_count,
tool_calls=current_tool_calls,
tool_results=current_tool_results,
verdict=verdict,
verdict_feedback=feedback,
...
)
```
### AgentRuntime → RuntimeLogger
**Location:** `core/framework/runtime/agent_runtime.py`
```python
# Runtime initializes logger with storage path
log_store = RuntimeLogStore(base_path / "runtime_logs")
logger = RuntimeLogger(log_store, agent_id)
# Passes session_id from ExecutionStream
logger.start_run(goal_id, session_id=execution_id)
```
---
## File Format Details
### L1: summary.json
**Written:** Once at end_run()
**Format:** Standard JSON
```json
{
"run_id": "session_20260206_115718_e22339c5",
"goal_id": "twitter-outreach-multi-loop",
"status": "degraded",
"started_at": "2026-02-06T11:57:18.593081",
"ended_at": "2026-02-06T11:58:45.123456",
"needs_attention": true,
"attention_summary": {
"total_attention_flags": 3,
"categories": ["missing_outputs", "retry_loops"],
"nodes_with_attention": ["intake-collector"]
},
"total_nodes_executed": 4,
"nodes_with_failures": ["intake-collector"],
"execution_quality": "degraded",
"total_latency_ms": 86530,
"total_retries": 5
}
```
### L2: details.jsonl
**Written:** Incrementally (append per node completion)
**Format:** JSONL (one JSON object per line)
```jsonl
{"node_id":"intake-collector","exit_status":"escalate","retry_count":5,"verdict_counts":{"RETRY":5,"ESCALATE":1},"total_steps":8,"latency_ms":12500,"needs_attention":true,"attention_reasons":["high_retry_count","missing_outputs"],"tool_error_count":0,"tokens_used":9876}
{"node_id":"profile-analyzer","exit_status":"success","retry_count":0,"verdict_counts":{"ACCEPT":1},"total_steps":2,"latency_ms":5432,"needs_attention":false,"attention_reasons":[],"tool_error_count":0,"tokens_used":3456}
```
### L3: tool_logs.jsonl
**Written:** Incrementally (append per step)
**Format:** JSONL (one JSON object per line)
```jsonl
{"node_id":"intake-collector","step_index":3,"tool_calls":[{"tool":"web_search","args":{"query":"@RomuloNevesOf"}}],"tool_results":[{"status":"success","data":"..."}],"verdict":"RETRY","verdict_feedback":"Missing required output 'twitter_handles'. You found the handle but didn't call set_output.","llm_response_text":"I found the profile...","tokens_used":1234,"latency_ms":2500}
{"node_id":"intake-collector","step_index":4,"tool_calls":[{"tool":"web_search","args":{"query":"@RomuloNevesOf twitter"}}],"tool_results":[{"status":"success","data":"..."}],"verdict":"RETRY","verdict_feedback":"Still missing 'twitter_handles'.","llm_response_text":"Found more info...","tokens_used":1456,"latency_ms":2300}
```
**Why JSONL?**
- Incremental append during execution (crash-safe)
- No need to parse entire file to add one line
- Data persisted immediately, not buffered
- Easy to stream/process line-by-line
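For example, a tolerant reader (an illustrative sketch, not the MCP tools' implementation) can stream `details.jsonl` or `tool_logs.jsonl` and simply skip a truncated final line left by an interrupted write:
```python
import json
from pathlib import Path

def read_jsonl(path: Path) -> list[dict]:
    records = []
    with path.open(encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Likely a partial line from an interrupted write; skip it.
                continue
    return records
```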
---
## Attention Flags System
### Automatic Detection
The runtime logger automatically flags issues based on execution metrics:
| Trigger | Threshold | Attention Reason | Category |
|---------|-----------|------------------|----------|
| High retries | `retry_count > 3` | `high_retry_count` | Retry Loops |
| Escalations | `escalate_count > 2` | `escalation_pattern` | Guard Failures |
| High latency | `latency_ms > 60000` | `high_latency` | High Latency |
| Token usage | `tokens_used > 100000` | `high_token_usage` | Memory/Context |
| Stalled steps | `total_steps > 20` | `excessive_steps` | Stalled Execution |
| Tool errors | `tool_error_count > 0` | `tool_failures` | Tool Errors |
| Missing outputs | `exit_status != "success"` | `missing_outputs` | Missing Outputs |
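A sketch of how these thresholds map to flags (field names follow the `NodeDetail` schema above; escalations are read from `verdict_counts`, and the exact logic in `runtime_logger.py` may differ):
```python
def attention_reasons(detail: dict) -> list[str]:
    reasons = []
    if detail.get("retry_count", 0) > 3:
        reasons.append("high_retry_count")
    if detail.get("verdict_counts", {}).get("ESCALATE", 0) > 2:
        reasons.append("escalation_pattern")
    if detail.get("latency_ms", 0) > 60000:
        reasons.append("high_latency")
    if detail.get("tokens_used", 0) > 100000:
        reasons.append("high_token_usage")
    if detail.get("total_steps", 0) > 20:
        reasons.append("excessive_steps")
    if detail.get("tool_error_count", 0) > 0:
        reasons.append("tool_failures")
    if detail.get("exit_status") != "success":
        reasons.append("missing_outputs")
    return reasons
```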
### Attention Categories
Used by `/hive-debugger` skill for issue categorization:
1. **Missing Outputs**: Node didn't set required output keys
2. **Tool Errors**: Tool calls failed (API errors, timeouts)
3. **Retry Loops**: Judge repeatedly rejecting outputs
4. **Guard Failures**: Output validation failed
5. **Stalled Execution**: EventLoopNode not making progress
6. **High Latency**: Slow tool calls or LLM responses
7. **Client-Facing Issues**: Premature set_output before user input
8. **Edge Routing Errors**: No edges match current state
9. **Memory/Context Issues**: Conversation history too long
10. **Constraint Violations**: Agent violated goal-level rules
---
## Migration Guide
### Reading Old Logs
The system automatically handles both old and new formats:
```python
# MCP tools check both locations automatically
result = query_runtime_logs("~/.hive/old_agent")
# Returns logs from both:
# - ~/.hive/old_agent/runtime_logs/runs/*/
# - ~/.hive/old_agent/sessions/session_*/logs/
```
### Deprecation Warnings
When reading from old locations, deprecation warnings are emitted:
```
DeprecationWarning: Reading logs from deprecated location for run_id=20260101T120000_abc12345.
New sessions use unified storage at sessions/session_*/logs/
```
### Migration Script (Optional)
For migrating existing old logs to new format, see:
- `EXECUTION_STORAGE_REDESIGN.md` - Migration strategy
- Future: `scripts/migrate_to_unified_sessions.py`
---
## Performance Characteristics
### Write Performance
- **L3 append**: ~1-2ms per step (sync I/O, thread-safe)
- **L2 append**: ~1-2ms per node (sync I/O, thread-safe)
- **L1 write**: ~5-10ms at end_run (atomic, async)
**Overhead:** < 5% of total execution time for typical agents
### Read Performance
- **L1 summary**: ~1-5ms (single JSON file)
- **L2 details**: ~10-50ms (JSONL, depends on node count)
- **L3 raw logs**: ~50-500ms (JSONL, depends on step count)
**Optimization:** Use filters (`node_id`, `step_index`) to limit how much data is read
### Storage Size
Typical session with 5 nodes, 20 steps:
- **L1 (summary.json)**: ~2-5 KB
- **L2 (details.jsonl)**: ~5-10 KB (1-2 KB per node)
- **L3 (tool_logs.jsonl)**: ~50-200 KB (2-10 KB per step)
**Total per session:** ~60-215 KB
**Compression:** Consider archiving old sessions after 90 days
---
## Troubleshooting
### Issue: Logs not appearing
**Symptom:** MCP tools return empty results
**Check:**
1. Verify storage path exists: `~/.hive/{agent_name}/`
2. Check session directories: `ls ~/.hive/{agent_name}/sessions/`
3. Verify logs directory exists: `ls ~/.hive/{agent_name}/sessions/session_*/logs/`
4. Check file permissions
### Issue: Corrupt JSONL files
**Symptom:** Partial data or JSON decode errors
**Cause:** Process crash during write (rare, but possible)
**Recovery:**
```python
# MCP tools skip corrupt lines automatically
query_runtime_log_details(agent_work_dir, run_id)
# Logs warning but continues with valid lines
```
### Issue: High disk usage
**Symptom:** Storage growing too large
**Solution:**
```bash
# Archive old sessions
cd ~/.hive/{agent_name}/sessions/
find . -name "session_2025*" -type d -exec tar -czf archive.tar.gz {} +
rm -rf session_2025*
# Or set up automatic cleanup (future feature)
```
---
## References
**Implementation:**
- `core/framework/runtime/runtime_logger.py` - Logger implementation
- `core/framework/runtime/runtime_log_store.py` - Storage layer
- `core/framework/runtime/runtime_log_schemas.py` - Data schemas
- `tools/src/aden_tools/tools/runtime_logs_tool/runtime_logs_tool.py` - MCP query tools
**Documentation:**
- `EXECUTION_STORAGE_REDESIGN.md` - Unified session storage design
- `/.claude/skills/hive-debugger/SKILL.md` - Interactive debugging skill
**Related:**
- `core/framework/schemas/session_state.py` - Session state schema
- `core/framework/storage/session_store.py` - Session state storage
- `core/framework/graph/executor.py` - GraphExecutor integration
+26 -1
View File
@@ -18,6 +18,7 @@ from framework.runtime.execution_stream import EntryPointSpec, ExecutionStream
from framework.runtime.outcome_aggregator import OutcomeAggregator
from framework.runtime.shared_state import SharedStateManager
from framework.storage.concurrent import ConcurrentStorage
from framework.storage.session_store import SessionStore
if TYPE_CHECKING:
from framework.graph.edge import GraphSpec
@@ -100,6 +101,7 @@ class AgentRuntime:
tools: list["Tool"] | None = None,
tool_executor: Callable | None = None,
config: AgentRuntimeConfig | None = None,
runtime_log_store: Any = None,
):
"""
Initialize agent runtime.
@@ -112,18 +114,24 @@ class AgentRuntime:
tools: Available tools
tool_executor: Function to execute tools
config: Optional runtime configuration
runtime_log_store: Optional RuntimeLogStore for per-execution logging
"""
self.graph = graph
self.goal = goal
self._config = config or AgentRuntimeConfig()
self._runtime_log_store = runtime_log_store
# Initialize storage
storage_path_obj = Path(storage_path) if isinstance(storage_path, str) else storage_path
self._storage = ConcurrentStorage(
base_path=storage_path,
base_path=storage_path_obj,
cache_ttl=self._config.cache_ttl,
batch_interval=self._config.batch_interval,
)
# Initialize SessionStore for unified sessions (always enabled)
self._session_store = SessionStore(storage_path_obj)
# Initialize shared components
self._state_manager = SharedStateManager()
self._event_bus = EventBus(max_history=self._config.max_history)
@@ -212,6 +220,8 @@ class AgentRuntime:
tool_executor=self._tool_executor,
result_retention_max=self._config.execution_result_max,
result_retention_ttl_seconds=self._config.execution_result_ttl_seconds,
runtime_log_store=self._runtime_log_store,
session_store=self._session_store,
)
await stream.start()
self._streams[ep_id] = stream
@@ -448,11 +458,14 @@ def create_agent_runtime(
tools: list["Tool"] | None = None,
tool_executor: Callable | None = None,
config: AgentRuntimeConfig | None = None,
runtime_log_store: Any = None,
enable_logging: bool = True,
) -> AgentRuntime:
"""
Create and configure an AgentRuntime with entry points.
Convenience factory that creates runtime and registers entry points.
Runtime logging is enabled by default for observability.
Args:
graph: Graph specification
@@ -463,10 +476,21 @@ def create_agent_runtime(
tools: Available tools
tool_executor: Tool executor function
config: Runtime configuration
runtime_log_store: Optional RuntimeLogStore for per-execution logging.
If None and enable_logging=True, creates one automatically.
enable_logging: Whether to enable runtime logging (default: True).
Set to False to disable logging entirely.
Returns:
Configured AgentRuntime (not yet started)
"""
# Auto-create runtime log store if logging is enabled and not provided
if enable_logging and runtime_log_store is None:
from framework.runtime.runtime_log_store import RuntimeLogStore
storage_path_obj = Path(storage_path) if isinstance(storage_path, str) else storage_path
runtime_log_store = RuntimeLogStore(storage_path_obj / "runtime_logs")
runtime = AgentRuntime(
graph=graph,
goal=goal,
@@ -475,6 +499,7 @@ def create_agent_runtime(
tools=tools,
tool_executor=tool_executor,
config=config,
runtime_log_store=runtime_log_store,
)
for spec in entry_points:
+130 -2
View File
@@ -28,6 +28,7 @@ if TYPE_CHECKING:
from framework.runtime.event_bus import EventBus
from framework.runtime.outcome_aggregator import OutcomeAggregator
from framework.storage.concurrent import ConcurrentStorage
from framework.storage.session_store import SessionStore
logger = logging.getLogger(__name__)
@@ -112,6 +113,8 @@ class ExecutionStream:
tool_executor: Callable | None = None,
result_retention_max: int | None = 1000,
result_retention_ttl_seconds: float | None = None,
runtime_log_store: Any = None,
session_store: "SessionStore | None" = None,
):
"""
Initialize execution stream.
@@ -128,6 +131,8 @@ class ExecutionStream:
llm: LLM provider for nodes
tools: Available tools
tool_executor: Function to execute tools
runtime_log_store: Optional RuntimeLogStore for per-execution logging
session_store: Optional SessionStore for unified session storage
"""
self.stream_id = stream_id
self.entry_spec = entry_spec
@@ -142,6 +147,8 @@ class ExecutionStream:
self._tool_executor = tool_executor
self._result_retention_max = result_retention_max
self._result_retention_ttl_seconds = result_retention_ttl_seconds
self._runtime_log_store = runtime_log_store
self._session_store = session_store
# Create stream-scoped runtime
self._runtime = StreamRuntime(
@@ -221,6 +228,13 @@ class ExecutionStream:
await task
except asyncio.CancelledError:
pass
except RuntimeError as e:
# Task may be attached to a different event loop (e.g., when TUI
# uses a separate loop). Log and continue cleanup.
if "attached to a different loop" in str(e):
logger.warning(f"Task cleanup skipped (different event loop): {e}")
else:
raise
self._execution_tasks.clear()
self._active_executions.clear()
@@ -275,8 +289,21 @@ class ExecutionStream:
if not self._running:
raise RuntimeError(f"ExecutionStream '{self.stream_id}' is not running")
# Generate execution ID
execution_id = f"exec_{self.stream_id}_{uuid.uuid4().hex[:8]}"
# Generate execution ID using unified session format
if self._session_store:
execution_id = self._session_store.generate_session_id()
else:
# Fallback to old format if SessionStore not available (shouldn't happen)
import warnings
warnings.warn(
"SessionStore not available, using deprecated exec_* ID format. "
"Please ensure AgentRuntime is properly initialized.",
DeprecationWarning,
stacklevel=2,
)
execution_id = f"exec_{self.stream_id}_{uuid.uuid4().hex[:8]}"
if correlation_id is None:
correlation_id = execution_id
@@ -330,6 +357,15 @@ class ExecutionStream:
# Create runtime adapter for this execution
runtime_adapter = StreamRuntimeAdapter(self._runtime, execution_id)
# Create per-execution runtime logger
runtime_logger = None
if self._runtime_log_store:
from framework.runtime.runtime_logger import RuntimeLogger
runtime_logger = RuntimeLogger(
store=self._runtime_log_store, agent_id=self.graph.id
)
# Create executor for this execution.
# Each execution gets its own storage under sessions/{exec_id}/
# so conversations, spillover, and data files are all scoped
@@ -345,11 +381,15 @@ class ExecutionStream:
event_bus=self._event_bus,
stream_id=self.stream_id,
storage_path=exec_storage,
runtime_logger=runtime_logger,
loop_config=self.graph.loop_config,
)
# Track executor so inject_input() can reach EventLoopNode instances
self._active_executors[execution_id] = executor
# Write initial session state
await self._write_session_state(execution_id, ctx)
# Create modified graph with entry point
# We need to override the entry_node to use our entry point
modified_graph = self._create_modified_graph()
@@ -374,6 +414,9 @@ class ExecutionStream:
if result.paused_at:
ctx.status = "paused"
# Write final session state
await self._write_session_state(execution_id, ctx, result=result)
# Emit completion/failure event
if self._event_bus:
if result.success:
@@ -410,6 +453,9 @@ class ExecutionStream:
),
)
# Write error session state
await self._write_session_state(execution_id, ctx, error=str(e))
# Emit failure event
if self._event_bus:
await self._event_bus.emit_execution_failed(
@@ -433,6 +479,88 @@ class ExecutionStream:
self._completion_events.pop(execution_id, None)
self._execution_tasks.pop(execution_id, None)
async def _write_session_state(
self,
execution_id: str,
ctx: ExecutionContext,
result: ExecutionResult | None = None,
error: str | None = None,
) -> None:
"""
Write state.json for a session.
Args:
execution_id: Session/execution ID
ctx: Execution context
result: Optional execution result (if completed)
error: Optional error message (if failed)
"""
# Only write if session_store is available
if not self._session_store:
return
from framework.schemas.session_state import SessionState, SessionStatus
try:
# Determine status
if result:
if result.paused_at:
status = SessionStatus.PAUSED
elif result.success:
status = SessionStatus.COMPLETED
else:
status = SessionStatus.FAILED
elif error:
status = SessionStatus.FAILED
else:
status = SessionStatus.ACTIVE
# Create SessionState
if result:
# Create from execution result
state = SessionState.from_execution_result(
session_id=execution_id,
goal_id=self.goal.id,
result=result,
stream_id=self.stream_id,
correlation_id=ctx.correlation_id,
started_at=ctx.started_at.isoformat(),
input_data=ctx.input_data,
agent_id=self.graph.id,
entry_point=self.entry_spec.id,
)
else:
# Create initial state
from framework.schemas.session_state import SessionTimestamps
now = datetime.now().isoformat()
state = SessionState(
session_id=execution_id,
stream_id=self.stream_id,
correlation_id=ctx.correlation_id,
goal_id=self.goal.id,
agent_id=self.graph.id,
entry_point=self.entry_spec.id,
status=status,
timestamps=SessionTimestamps(
started_at=ctx.started_at.isoformat(),
updated_at=now,
),
input_data=ctx.input_data,
)
# Handle error case
if error:
state.result.error = error
# Write state.json
await self._session_store.write_state(execution_id, state)
logger.debug(f"Wrote state.json for session {execution_id} (status={status})")
except Exception as e:
# Log but don't fail the execution
logger.error(f"Failed to write state.json for {execution_id}: {e}")
def _create_modified_graph(self) -> "GraphSpec":
"""Create a graph with the entry point overridden."""
# Use the existing graph but override entry_node
@@ -0,0 +1,122 @@
"""Pydantic models for the three-level runtime logging system.
Level 1 - SUMMARY: Per graph run pass/fail, token counts, timing
Level 2 - DETAILS: Per node completion results and attention flags
Level 3 - TOOL LOGS: Per step within any node (tool calls, LLM text, tokens)
"""
from __future__ import annotations
from typing import Any
from pydantic import BaseModel, Field
# ---------------------------------------------------------------------------
# Level 3: Tool logs (most granular) — per step within any node
# ---------------------------------------------------------------------------
class ToolCallLog(BaseModel):
"""A single tool call within a step."""
tool_use_id: str
tool_name: str
tool_input: dict[str, Any] = Field(default_factory=dict)
result: str = ""
is_error: bool = False
class NodeStepLog(BaseModel):
"""Full tool and LLM details for one step within a node.
For EventLoopNode, each iteration is a step. For single-step nodes
(LLMNode, FunctionNode, RouterNode), step_index is 0.
"""
node_id: str
node_type: str = "" # "event_loop"|"llm_tool_use"|"llm_generate"|"function"|"router"
step_index: int = 0 # iteration number for event_loop, 0 for single-step nodes
llm_text: str = ""
tool_calls: list[ToolCallLog] = Field(default_factory=list)
input_tokens: int = 0
output_tokens: int = 0
latency_ms: int = 0
# EventLoopNode only:
verdict: str = "" # "ACCEPT"|"RETRY"|"ESCALATE"|"CONTINUE"
verdict_feedback: str = ""
# Error tracking:
error: str = "" # Error message if step failed
stacktrace: str = "" # Full stack trace if exception occurred
is_partial: bool = False # True if step didn't complete normally
# ---------------------------------------------------------------------------
# Level 2: Per-node completion details
# ---------------------------------------------------------------------------
class NodeDetail(BaseModel):
"""Per-node completion result and attention flags."""
node_id: str
node_name: str = ""
node_type: str = ""
success: bool = True
error: str | None = None
stacktrace: str = "" # Full stack trace if exception occurred
total_steps: int = 0
tokens_used: int = 0 # combined input+output from NodeResult
input_tokens: int = 0
output_tokens: int = 0
latency_ms: int = 0
attempt: int = 1 # retry attempt number
# EventLoopNode-specific:
exit_status: str = "" # "success"|"failure"|"stalled"|"escalated"|"paused"|"guard_failure"
accept_count: int = 0
retry_count: int = 0
escalate_count: int = 0
continue_count: int = 0
needs_attention: bool = False
attention_reasons: list[str] = Field(default_factory=list)
# ---------------------------------------------------------------------------
# Level 1: Run summary — one per full graph execution
# ---------------------------------------------------------------------------
class RunSummaryLog(BaseModel):
"""Run-level summary for a full graph execution."""
run_id: str
agent_id: str = ""
goal_id: str = ""
status: str = "" # "success"|"failure"|"degraded"
total_nodes_executed: int = 0
node_path: list[str] = Field(default_factory=list)
total_input_tokens: int = 0
total_output_tokens: int = 0
needs_attention: bool = False
attention_reasons: list[str] = Field(default_factory=list)
started_at: str = "" # ISO timestamp
duration_ms: int = 0
execution_quality: str = "" # "clean"|"degraded"|"failed"
# ---------------------------------------------------------------------------
# Container models for file serialization
# ---------------------------------------------------------------------------
class RunDetailsLog(BaseModel):
"""Level 2 container: all node details for a run."""
run_id: str
nodes: list[NodeDetail] = Field(default_factory=list)
class RunToolLogs(BaseModel):
"""Level 3 container: all step logs for a run."""
run_id: str
steps: list[NodeStepLog] = Field(default_factory=list)
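To make the three levels concrete, a small illustrative construction of the Level 3 and Level 2 models and how each serializes to one JSONL line (not part of the diff; the field values are invented):

# Illustrative only: values are invented; the import path matches this new module.
import json
from framework.runtime.runtime_log_schemas import NodeDetail, NodeStepLog, ToolCallLog

step = NodeStepLog(
    node_id="research",
    node_type="event_loop",
    step_index=2,
    llm_text="Searching for recent posts...",
    tool_calls=[
        ToolCallLog(
            tool_use_id="tu_01",
            tool_name="web_search",
            tool_input={"query": "hive agents"},
            result="3 results",
        )
    ],
    input_tokens=812,
    output_tokens=145,
    verdict="CONTINUE",
)

detail = NodeDetail(
    node_id="research",
    node_name="Research",
    node_type="event_loop",
    success=True,
    total_steps=3,
    tokens_used=2900,
    exit_status="success",
    accept_count=1,
    continue_count=2,
)

# Each model becomes one line in tool_logs.jsonl / details.jsonl respectively.
print(json.dumps(step.model_dump(), ensure_ascii=False))
print(json.dumps(detail.model_dump(), ensure_ascii=False))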
+306
View File
@@ -0,0 +1,306 @@
"""File-based storage for runtime logs.
Each run gets its own directory under ``runs/``. No shared mutable index is kept;
``list_runs()`` scans the directory and loads summary.json from each run.
This eliminates concurrency issues when parallel EventLoopNodes write
simultaneously.
L2 (details) and L3 (tool logs) use JSONL (one JSON object per line) for
incremental append-on-write. This provides crash resilience: data is on
disk as soon as it's logged, not only at end_run(). L1 (summary) is still
written once at end as a regular JSON file since it aggregates L2.
Storage layout (current)::
{base_path}/
sessions/
{session_id}/
logs/
summary.json # Level 1 — written once at end
details.jsonl # Level 2 — appended per node completion
tool_logs.jsonl # Level 3 — appended per step
"""
from __future__ import annotations
import asyncio
import json
import logging
from datetime import UTC, datetime
from pathlib import Path
from framework.runtime.runtime_log_schemas import (
NodeDetail,
NodeStepLog,
RunDetailsLog,
RunSummaryLog,
RunToolLogs,
)
logger = logging.getLogger(__name__)
class RuntimeLogStore:
"""Persists runtime logs at three levels. Thread-safe via per-run directories."""
def __init__(self, base_path: Path) -> None:
self._base_path = base_path
# Note: _runs_dir is determined per-run_id by _get_run_dir()
def _get_run_dir(self, run_id: str) -> Path:
"""Determine run directory path based on run_id format.
- New format (session_*): {storage_root}/sessions/{run_id}/logs/
- Old format (anything else): {base_path}/runs/{run_id}/ (deprecated)
When base_path ends with 'runtime_logs', we use the parent directory
to avoid nesting under runtime_logs/.
This allows backward compatibility for reading old logs.
"""
if run_id.startswith("session_"):
# New: sessions/{session_id}/logs/
# If base_path ends with runtime_logs, use parent (storage root)
is_runtime_logs = self._base_path.name == "runtime_logs"
root = self._base_path.parent if is_runtime_logs else self._base_path
return root / "sessions" / run_id / "logs"
else:
# Old: runs/{run_id}/ (deprecated, backward compatibility only)
import warnings
warnings.warn(
f"Reading logs from deprecated location for run_id={run_id}. "
"New sessions use unified storage at sessions/session_*/logs/",
DeprecationWarning,
stacklevel=3,
)
return self._base_path / "runs" / run_id
# -------------------------------------------------------------------
# Incremental write (sync — called from locked sections)
# -------------------------------------------------------------------
def ensure_run_dir(self, run_id: str) -> None:
"""Create the run directory immediately. Called by start_run()."""
run_dir = self._get_run_dir(run_id)
run_dir.mkdir(parents=True, exist_ok=True)
def append_step(self, run_id: str, step: NodeStepLog) -> None:
"""Append one JSONL line to tool_logs.jsonl. Sync."""
path = self._get_run_dir(run_id) / "tool_logs.jsonl"
line = json.dumps(step.model_dump(), ensure_ascii=False) + "\n"
with open(path, "a", encoding="utf-8") as f:
f.write(line)
def append_node_detail(self, run_id: str, detail: NodeDetail) -> None:
"""Append one JSONL line to details.jsonl. Sync."""
path = self._get_run_dir(run_id) / "details.jsonl"
line = json.dumps(detail.model_dump(), ensure_ascii=False) + "\n"
with open(path, "a", encoding="utf-8") as f:
f.write(line)
def read_node_details_sync(self, run_id: str) -> list[NodeDetail]:
"""Read details.jsonl back into a list of NodeDetail. Sync.
Used by end_run() to aggregate L2 into L1. Skips corrupt lines.
"""
path = self._get_run_dir(run_id) / "details.jsonl"
return _read_jsonl_as_models(path, NodeDetail)
# -------------------------------------------------------------------
# Summary write (async — called from end_run)
# -------------------------------------------------------------------
async def save_summary(self, run_id: str, summary: RunSummaryLog) -> None:
"""Write summary.json atomically. Called once at end_run()."""
run_dir = self._get_run_dir(run_id)
await asyncio.to_thread(run_dir.mkdir, parents=True, exist_ok=True)
await self._write_json(run_dir / "summary.json", summary.model_dump())
# -------------------------------------------------------------------
# Read
# -------------------------------------------------------------------
async def load_summary(self, run_id: str) -> RunSummaryLog | None:
"""Load Level 1 summary for a specific run."""
data = await self._read_json(self._get_run_dir(run_id) / "summary.json")
return RunSummaryLog(**data) if data is not None else None
async def load_details(self, run_id: str) -> RunDetailsLog | None:
"""Load Level 2 details from details.jsonl for a specific run."""
path = self._get_run_dir(run_id) / "details.jsonl"
def _read() -> RunDetailsLog | None:
if not path.exists():
return None
nodes = _read_jsonl_as_models(path, NodeDetail)
return RunDetailsLog(run_id=run_id, nodes=nodes)
return await asyncio.to_thread(_read)
async def load_tool_logs(self, run_id: str) -> RunToolLogs | None:
"""Load Level 3 tool logs from tool_logs.jsonl for a specific run."""
path = self._get_run_dir(run_id) / "tool_logs.jsonl"
def _read() -> RunToolLogs | None:
if not path.exists():
return None
steps = _read_jsonl_as_models(path, NodeStepLog)
return RunToolLogs(run_id=run_id, steps=steps)
return await asyncio.to_thread(_read)
async def list_runs(
self,
status: str = "",
needs_attention: bool | None = None,
limit: int = 20,
) -> list[RunSummaryLog]:
"""Scan both old and new directory structures, load summaries, filter, and sort.
Scans:
- Old: base_path/runs/{run_id}/
- New: base_path/sessions/{session_id}/logs/
Directories without summary.json are treated as in-progress runs and
get a synthetic summary with status="in_progress".
"""
entries = await asyncio.to_thread(self._scan_run_dirs)
summaries: list[RunSummaryLog] = []
for run_id in entries:
summary = await self.load_summary(run_id)
if summary is None:
# In-progress run: no summary.json yet. Synthesize one.
run_dir = self._get_run_dir(run_id)
if not run_dir.is_dir():
continue
summary = RunSummaryLog(
run_id=run_id,
status="in_progress",
started_at=_infer_started_at(run_id),
)
if status and status != "needs_attention" and summary.status != status:
continue
if status == "needs_attention" and not summary.needs_attention:
continue
if needs_attention is not None and summary.needs_attention != needs_attention:
continue
summaries.append(summary)
# Sort by started_at descending (most recent first)
summaries.sort(key=lambda s: s.started_at, reverse=True)
return summaries[:limit]
# -------------------------------------------------------------------
# Internal helpers
# -------------------------------------------------------------------
def _scan_run_dirs(self) -> list[str]:
"""Return list of run_id directory names from both old and new locations.
Scans:
- New: base_path/sessions/{session_id}/logs/ (preferred)
- Old: base_path/runs/{run_id}/ (deprecated, backward compatibility)
Returns run_ids/session_ids. Includes all directories, not just those
with summary.json, so in-progress runs are visible.
"""
run_ids = []
# Scan new location: base_path/sessions/{session_id}/logs/
# Determine the correct base path for sessions
is_runtime_logs = self._base_path.name == "runtime_logs"
root = self._base_path.parent if is_runtime_logs else self._base_path
sessions_dir = root / "sessions"
if sessions_dir.exists():
for session_dir in sessions_dir.iterdir():
if session_dir.is_dir() and session_dir.name.startswith("session_"):
logs_dir = session_dir / "logs"
if logs_dir.exists() and logs_dir.is_dir():
run_ids.append(session_dir.name)
# Scan old location: base_path/runs/ (deprecated)
old_runs_dir = self._base_path / "runs"
if old_runs_dir.exists():
old_ids = [d.name for d in old_runs_dir.iterdir() if d.is_dir()]
if old_ids:
import warnings
warnings.warn(
f"Found {len(old_ids)} runs in deprecated location. "
"Consider migrating to unified session storage.",
DeprecationWarning,
stacklevel=3,
)
run_ids.extend(old_ids)
return run_ids
@staticmethod
async def _write_json(path: Path, data: dict) -> None:
"""Write JSON atomically: write to .tmp then rename."""
tmp = path.with_suffix(".tmp")
content = json.dumps(data, indent=2, ensure_ascii=False)
def _write() -> None:
tmp.write_text(content, encoding="utf-8")
tmp.rename(path)
await asyncio.to_thread(_write)
@staticmethod
async def _read_json(path: Path) -> dict | None:
"""Read and parse a JSON file. Returns None if missing or corrupt."""
def _read() -> dict | None:
if not path.exists():
return None
try:
return json.loads(path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError) as e:
logger.warning("Failed to read %s: %s", path, e)
return None
return await asyncio.to_thread(_read)
# -------------------------------------------------------------------
# Module-level helpers
# -------------------------------------------------------------------
def _read_jsonl_as_models(path: Path, model_cls: type) -> list:
"""Parse a JSONL file into a list of Pydantic model instances.
Skips blank lines and corrupt JSON lines (partial writes from crashes).
"""
results = []
if not path.exists():
return results
try:
with open(path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
try:
data = json.loads(line)
results.append(model_cls(**data))
except Exception as e:  # JSON decode errors or model validation failures
logger.warning("Skipping corrupt JSONL line in %s: %s", path, e)
continue
except OSError as e:
logger.warning("Failed to read %s: %s", path, e)
return results
def _infer_started_at(run_id: str) -> str:
"""Best-effort ISO timestamp from a run_id like '20250101T120000_abc12345'."""
try:
ts_part = run_id.split("_")[0] # '20250101T120000'
dt = datetime.strptime(ts_part, "%Y%m%dT%H%M%S").replace(tzinfo=UTC)
return dt.isoformat()
except (ValueError, IndexError):
return ""
+304
View File
@@ -0,0 +1,304 @@
"""RuntimeLogger: captures runtime data during graph execution.
Injected into GraphExecutor as an optional parameter. Each log_step() and
log_node_complete() call writes immediately to disk (JSONL append). Only
the L1 summary is written at end_run() since it aggregates L2 data.
This provides crash resilience: L2 and L3 data survives process death
without needing end_run() to complete.
Usage::
store = RuntimeLogStore(Path(work_dir) / "runtime_logs")
runtime_logger = RuntimeLogger(store=store, agent_id="my-agent")
executor = GraphExecutor(..., runtime_logger=runtime_logger)
# After execution, logger has persisted all data to store
Safety: ``end_run()`` catches all exceptions internally and logs them via
the Python logger. Logging failure must never kill a successful run.
"""
from __future__ import annotations
import logging
import threading
import uuid
from datetime import UTC, datetime
from typing import Any
from framework.runtime.runtime_log_schemas import (
NodeDetail,
NodeStepLog,
RunSummaryLog,
ToolCallLog,
)
from framework.runtime.runtime_log_store import RuntimeLogStore
logger = logging.getLogger(__name__)
class RuntimeLogger:
"""Captures runtime data during graph execution.
Thread-safe: uses a lock around file appends for parallel node safety.
"""
def __init__(self, store: RuntimeLogStore, agent_id: str = "") -> None:
self._store = store
self._agent_id = agent_id
self._run_id = ""
self._goal_id = ""
self._started_at = ""
self._logged_node_ids: set[str] = set()
self._lock = threading.Lock()
def start_run(self, goal_id: str = "", session_id: str = "") -> str:
"""Start a new run. Called by GraphExecutor at graph start. Returns run_id.
Args:
goal_id: Goal ID for this run
session_id: Optional session ID. If provided, uses it as run_id (for unified sessions).
Otherwise generates a new run_id in old format.
Returns:
The run_id (same as session_id if provided)
"""
if session_id:
# Use provided session_id as run_id (unified sessions)
self._run_id = session_id
else:
# Generate run_id in old format (backward compatibility)
ts = datetime.now(UTC).strftime("%Y%m%dT%H%M%S")
short_uuid = uuid.uuid4().hex[:8]
self._run_id = f"{ts}_{short_uuid}"
self._goal_id = goal_id
self._started_at = datetime.now(UTC).isoformat()
self._logged_node_ids = set()
self._store.ensure_run_dir(self._run_id)
return self._run_id
def log_step(
self,
node_id: str,
node_type: str,
step_index: int,
llm_text: str = "",
tool_calls: list[dict[str, Any]] | None = None,
input_tokens: int = 0,
output_tokens: int = 0,
latency_ms: int = 0,
verdict: str = "",
verdict_feedback: str = "",
error: str = "",
stacktrace: str = "",
is_partial: bool = False,
) -> None:
"""Record data for one step within a node.
Called by any node during execution. Synchronous, appends to JSONL file.
Args:
error: Error message if step failed
stacktrace: Full stack trace if exception occurred
is_partial: True if step didn't complete normally (e.g., LLM call crashed)
"""
if tool_calls is None:
tool_calls = []
call_logs = []
for tc in tool_calls:
call_logs.append(
ToolCallLog(
tool_use_id=tc.get("tool_use_id", ""),
tool_name=tc.get("tool_name", ""),
tool_input=tc.get("tool_input", {}),
result=tc.get("content", ""),
is_error=tc.get("is_error", False),
)
)
step_log = NodeStepLog(
node_id=node_id,
node_type=node_type,
step_index=step_index,
llm_text=llm_text,
tool_calls=call_logs,
input_tokens=input_tokens,
output_tokens=output_tokens,
latency_ms=latency_ms,
verdict=verdict,
verdict_feedback=verdict_feedback,
error=error,
stacktrace=stacktrace,
is_partial=is_partial,
)
with self._lock:
self._store.append_step(self._run_id, step_log)
def log_node_complete(
self,
node_id: str,
node_name: str,
node_type: str,
success: bool,
error: str | None = None,
stacktrace: str = "",
total_steps: int = 0,
tokens_used: int = 0,
input_tokens: int = 0,
output_tokens: int = 0,
latency_ms: int = 0,
attempt: int = 1,
# EventLoopNode-specific kwargs:
exit_status: str = "",
accept_count: int = 0,
retry_count: int = 0,
escalate_count: int = 0,
continue_count: int = 0,
) -> None:
"""Record completion of a node.
Called after each node completes. EventLoopNode calls this with
verdict counts and exit_status. Other nodes: executor calls this
from NodeResult data.
"""
needs_attention = not success
attention_reasons: list[str] = []
if not success and error:
attention_reasons.append(f"Node {node_id} failed: {error}")
# Enhanced attention flags
if retry_count > 3:
needs_attention = True
attention_reasons.append(f"Excessive retries: {retry_count}")
if escalate_count > 2:
needs_attention = True
attention_reasons.append(f"Excessive escalations: {escalate_count}")
if latency_ms > 60000: # > 1 minute
needs_attention = True
attention_reasons.append(f"High latency: {latency_ms}ms")
if tokens_used > 100000: # High token usage
needs_attention = True
attention_reasons.append(f"High token usage: {tokens_used}")
if total_steps > 20: # Many iterations
needs_attention = True
attention_reasons.append(f"Many iterations: {total_steps}")
detail = NodeDetail(
node_id=node_id,
node_name=node_name,
node_type=node_type,
success=success,
error=error,
stacktrace=stacktrace,
total_steps=total_steps,
tokens_used=tokens_used,
input_tokens=input_tokens,
output_tokens=output_tokens,
latency_ms=latency_ms,
attempt=attempt,
exit_status=exit_status,
accept_count=accept_count,
retry_count=retry_count,
escalate_count=escalate_count,
continue_count=continue_count,
needs_attention=needs_attention,
attention_reasons=attention_reasons,
)
with self._lock:
self._store.append_node_detail(self._run_id, detail)
self._logged_node_ids.add(node_id)
def ensure_node_logged(
self,
node_id: str,
node_name: str,
node_type: str,
success: bool,
error: str | None = None,
stacktrace: str = "",
tokens_used: int = 0,
latency_ms: int = 0,
) -> None:
"""Fallback: ensure a node has an L2 entry.
Called by executor after each node returns. If node_id already
appears in _logged_node_ids (because the node called log_node_complete
itself), this is a no-op. Otherwise appends a basic NodeDetail.
"""
with self._lock:
if node_id in self._logged_node_ids:
return # Already logged by the node itself
# Not yet logged — create a basic entry
self.log_node_complete(
node_id=node_id,
node_name=node_name,
node_type=node_type,
success=success,
error=error,
stacktrace=stacktrace,
tokens_used=tokens_used,
latency_ms=latency_ms,
)
async def end_run(
self,
status: str,
duration_ms: int,
node_path: list[str] | None = None,
execution_quality: str = "",
) -> None:
"""Read L2 from disk, aggregate into L1, write summary.json.
Called by GraphExecutor when graph finishes. Async, writes 1 file.
Catches all exceptions internally -- logging failure must not
propagate to the caller.
"""
try:
# Read L2 back from disk to aggregate into L1
node_details = self._store.read_node_details_sync(self._run_id)
total_input = sum(nd.input_tokens for nd in node_details)
total_output = sum(nd.output_tokens for nd in node_details)
needs_attention = any(nd.needs_attention for nd in node_details)
attention_reasons: list[str] = []
for nd in node_details:
attention_reasons.extend(nd.attention_reasons)
summary = RunSummaryLog(
run_id=self._run_id,
agent_id=self._agent_id,
goal_id=self._goal_id,
status=status,
total_nodes_executed=len(node_details),
node_path=node_path or [],
total_input_tokens=total_input,
total_output_tokens=total_output,
needs_attention=needs_attention,
attention_reasons=attention_reasons,
started_at=self._started_at,
duration_ms=duration_ms,
execution_quality=execution_quality,
)
await self._store.save_summary(self._run_id, summary)
logger.info(
"Runtime logs saved: run_id=%s status=%s nodes=%d",
self._run_id,
status,
len(node_details),
)
except Exception:
logger.exception(
"Failed to save runtime logs for run_id=%s (non-fatal)",
self._run_id,
)
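Expanding the Usage note in the module docstring, a hedged end-to-end sketch of the logger lifecycle as an executor might drive it; ids, metrics, and paths are placeholders:

import asyncio
from pathlib import Path

from framework.runtime.runtime_log_store import RuntimeLogStore
from framework.runtime.runtime_logger import RuntimeLogger

async def main() -> None:
    store = RuntimeLogStore(Path("/tmp/demo_agent/runtime_logs"))
    rl = RuntimeLogger(store=store, agent_id="demo-agent")

    run_id = rl.start_run(goal_id="demo_goal", session_id="session_20260206_150000_deadbeef")

    # Level 3: one step inside a node (tool_calls use the dict shape log_step() expects).
    rl.log_step(
        node_id="draft",
        node_type="llm_tool_use",
        step_index=0,
        llm_text="Drafting reply...",
        tool_calls=[{"tool_use_id": "tu_1", "tool_name": "save_draft", "tool_input": {}, "content": "ok"}],
        input_tokens=500,
        output_tokens=120,
        latency_ms=900,
    )

    # Level 2: node completion (here the node reports itself).
    rl.log_node_complete(
        node_id="draft", node_name="Draft", node_type="llm_tool_use",
        success=True, total_steps=1, tokens_used=620, latency_ms=900,
    )

    # Level 1: aggregate L2 from disk and persist summary.json.
    await rl.end_run(status="success", duration_ms=950, node_path=["draft"], execution_quality="clean")
    print("run logged:", run_id)

asyncio.run(main())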
+274
View File
@@ -0,0 +1,274 @@
"""
Session State Schema - Unified state for session execution.
This schema consolidates data from Run, ExecutionResult, and runtime logs
into a single source of truth for session status and resumability.
"""
from datetime import datetime
from enum import StrEnum
from typing import TYPE_CHECKING, Any
from pydantic import BaseModel, Field, computed_field
if TYPE_CHECKING:
from framework.graph.executor import ExecutionResult
from framework.schemas.run import Run
class SessionStatus(StrEnum):
"""Status of a session execution."""
ACTIVE = "active" # Currently executing
PAUSED = "paused" # Waiting for resume (client input, pause node)
COMPLETED = "completed" # Finished successfully
FAILED = "failed" # Finished with error
CANCELLED = "cancelled" # User/system cancelled
class SessionTimestamps(BaseModel):
"""Timestamps tracking session lifecycle."""
started_at: str # ISO 8601 format
updated_at: str # ISO 8601 format (updated on every state write)
completed_at: str | None = None
paused_at_time: str | None = None # When it was paused
model_config = {"extra": "allow"}
class SessionProgress(BaseModel):
"""Execution progress tracking."""
current_node: str | None = None
paused_at: str | None = None # Node ID where paused
resume_from: str | None = None # Entry point or node ID to resume from
steps_executed: int = 0
total_tokens: int = 0
total_latency_ms: int = 0
path: list[str] = Field(default_factory=list) # Node IDs traversed
# Quality metrics (from ExecutionResult)
total_retries: int = 0
nodes_with_failures: list[str] = Field(default_factory=list)
retry_details: dict[str, int] = Field(default_factory=dict)
had_partial_failures: bool = False
execution_quality: str = "clean" # "clean", "degraded", or "failed"
node_visit_counts: dict[str, int] = Field(default_factory=dict)
model_config = {"extra": "allow"}
class SessionResult(BaseModel):
"""Final result of session execution."""
success: bool | None = None # None if still running
error: str | None = None
output: dict[str, Any] = Field(default_factory=dict)
model_config = {"extra": "allow"}
class SessionMetrics(BaseModel):
"""Execution metrics (from Run.metrics)."""
decision_count: int = 0
problem_count: int = 0
total_input_tokens: int = 0
total_output_tokens: int = 0
nodes_executed: list[str] = Field(default_factory=list)
edges_traversed: list[str] = Field(default_factory=list)
model_config = {"extra": "allow"}
class SessionState(BaseModel):
"""
Complete state for a session execution.
This is the single source of truth for session status and resumability.
Consolidates data from ExecutionResult, ExecutionContext, Run, and runtime logs.
Version History:
- v1.0: Initial schema (2026-02-06)
"""
# Schema version for forward/backward compatibility
schema_version: str = "1.0"
# Identity
session_id: str # Format: session_YYYYMMDD_HHMMSS_{uuid_8char}
stream_id: str = "" # Which ExecutionStream created this
correlation_id: str = "" # For correlating related executions
# Status
status: SessionStatus = SessionStatus.ACTIVE
# Goal/Agent context
goal_id: str
agent_id: str = ""
entry_point: str = "start"
# Timestamps
timestamps: SessionTimestamps
# Progress
progress: SessionProgress = Field(default_factory=SessionProgress)
# Result
result: SessionResult = Field(default_factory=SessionResult)
# Memory (for resumability)
memory: dict[str, Any] = Field(default_factory=dict)
# Metrics
metrics: SessionMetrics = Field(default_factory=SessionMetrics)
# Problems (from Run.problems)
problems: list[dict[str, Any]] = Field(default_factory=list)
# Decisions (from Run.decisions - can be large, so store references)
decisions: list[dict[str, Any]] = Field(default_factory=list)
# Input data (for debugging/replay)
input_data: dict[str, Any] = Field(default_factory=dict)
# Isolation level (from ExecutionContext)
isolation_level: str = "shared"
model_config = {"extra": "allow"}
@computed_field
@property
def duration_ms(self) -> int:
"""Duration of the session in milliseconds."""
if not self.timestamps.completed_at:
return 0
started = datetime.fromisoformat(self.timestamps.started_at)
completed = datetime.fromisoformat(self.timestamps.completed_at)
return int((completed - started).total_seconds() * 1000)
@computed_field
@property
def is_resumable(self) -> bool:
"""Can this session be resumed?"""
return self.status == SessionStatus.PAUSED and self.progress.resume_from is not None
@classmethod
def from_execution_result(
cls,
session_id: str,
goal_id: str,
result: "ExecutionResult",
stream_id: str = "",
correlation_id: str = "",
started_at: str = "",
input_data: dict[str, Any] | None = None,
agent_id: str = "",
entry_point: str = "start",
) -> "SessionState":
"""Create SessionState from ExecutionResult."""
now = datetime.now().isoformat()
# Determine status based on execution result
if result.paused_at:
status = SessionStatus.PAUSED
elif result.success:
status = SessionStatus.COMPLETED
else:
status = SessionStatus.FAILED
return cls(
session_id=session_id,
stream_id=stream_id,
correlation_id=correlation_id,
goal_id=goal_id,
agent_id=agent_id,
entry_point=entry_point,
status=status,
timestamps=SessionTimestamps(
started_at=started_at or now,
updated_at=now,
completed_at=now if not result.paused_at else None,
paused_at_time=now if result.paused_at else None,
),
progress=SessionProgress(
current_node=result.paused_at or (result.path[-1] if result.path else None),
paused_at=result.paused_at,
resume_from=result.session_state.get("resume_from")
if result.session_state
else None,
steps_executed=result.steps_executed,
total_tokens=result.total_tokens,
total_latency_ms=result.total_latency_ms,
path=result.path,
total_retries=result.total_retries,
nodes_with_failures=result.nodes_with_failures,
retry_details=result.retry_details,
had_partial_failures=result.had_partial_failures,
execution_quality=result.execution_quality,
node_visit_counts=result.node_visit_counts,
),
result=SessionResult(
success=result.success,
error=result.error,
output=result.output,
),
memory=result.session_state.get("memory", {}) if result.session_state else {},
input_data=input_data or {},
)
@classmethod
def from_legacy_run(cls, run: "Run", session_id: str, stream_id: str = "") -> "SessionState":
"""Create SessionState from legacy Run object."""
from framework.schemas.run import RunStatus
now = datetime.now().isoformat()
# Map RunStatus to SessionStatus
status_mapping = {
RunStatus.RUNNING: SessionStatus.ACTIVE,
RunStatus.COMPLETED: SessionStatus.COMPLETED,
RunStatus.FAILED: SessionStatus.FAILED,
RunStatus.CANCELLED: SessionStatus.CANCELLED,
RunStatus.STUCK: SessionStatus.FAILED,
}
status = status_mapping.get(run.status, SessionStatus.FAILED)
return cls(
schema_version="1.0",
session_id=session_id,
stream_id=stream_id,
goal_id=run.goal_id,
status=status,
timestamps=SessionTimestamps(
started_at=run.started_at.isoformat(),
updated_at=now,
completed_at=run.completed_at.isoformat() if run.completed_at else None,
),
result=SessionResult(
success=run.status == RunStatus.COMPLETED,
output=run.output_data,
),
metrics=SessionMetrics(
decision_count=run.metrics.total_decisions,
problem_count=len(run.problems),
total_input_tokens=run.metrics.total_tokens, # Approximate
total_output_tokens=0, # Not tracked in old format
nodes_executed=run.metrics.nodes_executed,
edges_traversed=run.metrics.edges_traversed,
),
decisions=[d.model_dump() for d in run.decisions],
problems=[p.model_dump() for p in run.problems],
input_data=run.input_data,
)
def to_session_state_dict(self) -> dict[str, Any]:
"""Convert to session_state format for GraphExecutor.execute()."""
return {
"paused_at": self.progress.paused_at,
"resume_from": self.progress.resume_from,
"memory": self.memory,
"next_node": None,
}
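A self-contained sketch of building a paused SessionState by hand and reading its computed fields; values are invented, and from_execution_result() would normally populate this from a real ExecutionResult:

from framework.schemas.session_state import (
    SessionProgress, SessionState, SessionStatus, SessionTimestamps,
)

state = SessionState(
    session_id="session_20260206_143022_abc12345",  # invented example id
    goal_id="twitter_outreach",
    status=SessionStatus.PAUSED,
    timestamps=SessionTimestamps(
        started_at="2026-02-06T14:30:22+00:00",
        updated_at="2026-02-06T14:31:10+00:00",
    ),
    progress=SessionProgress(paused_at="await_reply", resume_from="await_reply"),
)

print(state.is_resumable)   # True: paused and resume_from is set
print(state.duration_ms)    # 0: completed_at is not set yet
print(state.to_session_state_dict()["resume_from"])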
+93 -40
View File
@@ -1,7 +1,10 @@
"""
File-based storage backend for runtime data.
Stores runs as JSON files with indexes for efficient querying.
DEPRECATED: This storage backend is deprecated for new sessions.
New sessions use unified storage at sessions/{session_id}/state.json.
This module is kept for backward compatibility with old run data only.
Uses Pydantic's built-in serialization.
"""
@@ -14,21 +17,24 @@ from framework.utils.io import atomic_write
class FileStorage:
"""
Simple file-based storage for runs.
DEPRECATED: File-based storage for old runs only.
Directory structure:
New sessions use unified storage at sessions/{session_id}/state.json.
This class is kept for backward compatibility with old run data.
Old directory structure (deprecated):
{base_path}/
runs/
{run_id}.json # Full run data
indexes/
runs/ # DEPRECATED - no longer written
{run_id}.json
summaries/ # DEPRECATED - no longer written
{run_id}.json
indexes/ # DEPRECATED - no longer written or read
by_goal/
{goal_id}.json # List of run IDs for this goal
{goal_id}.json
by_status/
{status}.json # List of run IDs with this status
{status}.json
by_node/
{node_id}.json # List of run IDs that used this node
summaries/
{run_id}.json # Run summary (for quick loading)
{node_id}.json
"""
def __init__(self, base_path: str | Path):
@@ -36,16 +42,14 @@ class FileStorage:
self._ensure_dirs()
def _ensure_dirs(self) -> None:
"""Create directory structure if it doesn't exist."""
dirs = [
self.base_path / "runs",
self.base_path / "indexes" / "by_goal",
self.base_path / "indexes" / "by_status",
self.base_path / "indexes" / "by_node",
self.base_path / "summaries",
]
for d in dirs:
d.mkdir(parents=True, exist_ok=True)
"""Create directory structure if it doesn't exist.
DEPRECATED: All directories (runs/, summaries/, indexes/) are deprecated.
New sessions use unified storage at sessions/{session_id}/state.json.
This method is now a no-op. Tests should not rely on this.
"""
# No-op: do not create deprecated directories
pass
def _validate_key(self, key: str) -> None:
"""
@@ -84,23 +88,22 @@ class FileStorage:
# === RUN OPERATIONS ===
def save_run(self, run: Run) -> None:
"""Save a run to storage."""
# Save full run using Pydantic's model_dump_json
run_path = self.base_path / "runs" / f"{run.id}.json"
with atomic_write(run_path) as f:
f.write(run.model_dump_json(indent=2))
"""Save a run to storage.
# Save summary
summary = RunSummary.from_run(run)
summary_path = self.base_path / "summaries" / f"{run.id}.json"
with atomic_write(summary_path) as f:
f.write(summary.model_dump_json(indent=2))
DEPRECATED: This method is now a no-op.
New sessions use unified storage at sessions/{session_id}/state.json.
Tests should not rely on FileStorage - use unified session storage instead.
"""
import warnings
# Update indexes
self._add_to_index("by_goal", run.goal_id, run.id)
self._add_to_index("by_status", run.status.value, run.id)
for node_id in run.metrics.nodes_executed:
self._add_to_index("by_node", node_id, run.id)
warnings.warn(
"FileStorage.save_run() is deprecated. "
"New sessions use unified storage at sessions/{session_id}/state.json. "
"This write has been skipped.",
DeprecationWarning,
stacklevel=2,
)
# No-op: do not write to deprecated locations
def load_run(self, run_id: str) -> Run | None:
"""Load a run from storage."""
@@ -148,17 +151,53 @@ class FileStorage:
# === QUERY OPERATIONS ===
def get_runs_by_goal(self, goal_id: str) -> list[str]:
"""Get all run IDs for a goal."""
"""Get all run IDs for a goal.
DEPRECATED: Indexes are deprecated. For new sessions, scan sessions/*/state.json instead.
This method only returns old run IDs from deprecated indexes.
"""
import warnings
warnings.warn(
"FileStorage.get_runs_by_goal() is deprecated. "
"For new sessions, scan sessions/*/state.json instead.",
DeprecationWarning,
stacklevel=2,
)
return self._get_index("by_goal", goal_id)
def get_runs_by_status(self, status: str | RunStatus) -> list[str]:
"""Get all run IDs with a status."""
"""Get all run IDs with a status.
DEPRECATED: Indexes are deprecated. For new sessions, scan sessions/*/state.json instead.
This method only returns old run IDs from deprecated indexes.
"""
import warnings
warnings.warn(
"FileStorage.get_runs_by_status() is deprecated. "
"For new sessions, scan sessions/*/state.json instead.",
DeprecationWarning,
stacklevel=2,
)
if isinstance(status, RunStatus):
status = status.value
return self._get_index("by_status", status)
def get_runs_by_node(self, node_id: str) -> list[str]:
"""Get all run IDs that executed a node."""
"""Get all run IDs that executed a node.
DEPRECATED: Indexes are deprecated. For new sessions, scan sessions/*/state.json instead.
This method only returns old run IDs from deprecated indexes.
"""
import warnings
warnings.warn(
"FileStorage.get_runs_by_node() is deprecated. "
"For new sessions, scan sessions/*/state.json instead.",
DeprecationWarning,
stacklevel=2,
)
return self._get_index("by_node", node_id)
def list_all_runs(self) -> list[str]:
@@ -167,8 +206,22 @@ class FileStorage:
return [f.stem for f in runs_dir.glob("*.json")]
def list_all_goals(self) -> list[str]:
"""List all goal IDs that have runs."""
"""List all goal IDs that have runs.
DEPRECATED: Indexes are deprecated. For new sessions, scan sessions/*/state.json instead.
This method only returns goals from old run IDs in deprecated indexes.
"""
import warnings
warnings.warn(
"FileStorage.list_all_goals() is deprecated. "
"For new sessions, scan sessions/*/state.json instead.",
DeprecationWarning,
stacklevel=2,
)
goals_dir = self.base_path / "indexes" / "by_goal"
if not goals_dir.exists():
return []
return [f.stem for f in goals_dir.glob("*.json")]
# === INDEX OPERATIONS ===
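For code that still has to read legacy run data, a sketch of querying the deprecated indexes while containing the DeprecationWarning; the FileStorage import path and base path are assumptions, since the file header is not shown in this diff:

import warnings
from pathlib import Path

from framework.storage.file_storage import FileStorage  # import path assumed

storage = FileStorage(Path("/data/legacy_agent"))  # placeholder path
with warnings.catch_warnings():
    warnings.simplefilter("once", DeprecationWarning)
    legacy_ids = storage.get_runs_by_goal("twitter_outreach")

for run_id in legacy_ids:
    run = storage.load_run(run_id)  # still reads old runs/{run_id}.json data
    if run is not None:
        print(run.id, run.status)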
+213
View File
@@ -0,0 +1,213 @@
"""
Session Store - Unified session storage with state.json.
Handles reading and writing session state to the new unified structure:
sessions/session_YYYYMMDD_HHMMSS_{uuid}/state.json
"""
import asyncio
import logging
import uuid
from datetime import datetime
from pathlib import Path
from framework.schemas.session_state import SessionState
from framework.utils.io import atomic_write
logger = logging.getLogger(__name__)
class SessionStore:
"""
Unified session storage with state.json.
Manages sessions in the new structure:
{base_path}/sessions/session_YYYYMMDD_HHMMSS_{uuid}/
state.json # Single source of truth
conversations/ # Per-node EventLoop state
artifacts/ # Spillover data
logs/ # L1/L2/L3 observability
summary.json
details.jsonl
tool_logs.jsonl
"""
def __init__(self, base_path: Path):
"""
Initialize session store.
Args:
base_path: Base path for storage (e.g., ~/.hive/twitter_outreach)
"""
self.base_path = Path(base_path)
self.sessions_dir = self.base_path / "sessions"
def generate_session_id(self) -> str:
"""
Generate session ID in format: session_YYYYMMDD_HHMMSS_{uuid}.
Returns:
Session ID string (e.g., "session_20260206_143022_abc12345")
"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
short_uuid = uuid.uuid4().hex[:8]
return f"session_{timestamp}_{short_uuid}"
def get_session_path(self, session_id: str) -> Path:
"""
Get path to session directory.
Args:
session_id: Session ID
Returns:
Path to session directory
"""
return self.sessions_dir / session_id
def get_state_path(self, session_id: str) -> Path:
"""
Get path to state.json file.
Args:
session_id: Session ID
Returns:
Path to state.json
"""
return self.get_session_path(session_id) / "state.json"
async def write_state(self, session_id: str, state: SessionState) -> None:
"""
Atomically write state.json for a session.
Uses temp file + rename for crash safety.
Args:
session_id: Session ID
state: SessionState to write
"""
def _write():
state_path = self.get_state_path(session_id)
state_path.parent.mkdir(parents=True, exist_ok=True)
with atomic_write(state_path) as f:
f.write(state.model_dump_json(indent=2))
await asyncio.to_thread(_write)
logger.debug(f"Wrote state.json for session {session_id}")
async def read_state(self, session_id: str) -> SessionState | None:
"""
Read state.json for a session.
Args:
session_id: Session ID
Returns:
SessionState or None if not found
"""
def _read():
state_path = self.get_state_path(session_id)
if not state_path.exists():
return None
return SessionState.model_validate_json(state_path.read_text())
return await asyncio.to_thread(_read)
async def list_sessions(
self,
status: str | None = None,
goal_id: str | None = None,
limit: int = 100,
) -> list[SessionState]:
"""
List sessions, optionally filtered by status or goal.
Args:
status: Optional status filter (e.g., "paused", "completed")
goal_id: Optional goal ID filter
limit: Maximum number of sessions to return
Returns:
List of SessionState objects
"""
def _scan():
sessions = []
if not self.sessions_dir.exists():
return sessions
for session_dir in self.sessions_dir.iterdir():
if not session_dir.is_dir():
continue
state_path = session_dir / "state.json"
if not state_path.exists():
continue
try:
state = SessionState.model_validate_json(state_path.read_text())
# Apply filters
if status and state.status != status:
continue
if goal_id and state.goal_id != goal_id:
continue
sessions.append(state)
except Exception as e:
logger.warning(f"Failed to load {state_path}: {e}")
continue
# Sort by updated_at descending (most recent first)
sessions.sort(key=lambda s: s.timestamps.updated_at, reverse=True)
return sessions[:limit]
return await asyncio.to_thread(_scan)
async def delete_session(self, session_id: str) -> bool:
"""
Delete a session and all its data.
Args:
session_id: Session ID to delete
Returns:
True if deleted, False if not found
"""
def _delete():
import shutil
session_path = self.get_session_path(session_id)
if not session_path.exists():
return False
shutil.rmtree(session_path)
logger.info(f"Deleted session {session_id}")
return True
return await asyncio.to_thread(_delete)
async def session_exists(self, session_id: str) -> bool:
"""
Check if a session exists.
Args:
session_id: Session ID
Returns:
True if session exists
"""
def _check():
return self.get_state_path(session_id).exists()
return await asyncio.to_thread(_check)
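A hedged round-trip sketch for the store, reusing the SessionState shape defined earlier; the base path is a placeholder:

import asyncio
from pathlib import Path

from framework.schemas.session_state import SessionState, SessionStatus, SessionTimestamps
from framework.storage.session_store import SessionStore

async def main() -> None:
    store = SessionStore(Path("/tmp/demo_agent"))
    session_id = store.generate_session_id()  # e.g. session_20260206_143022_abc12345

    state = SessionState(
        session_id=session_id,
        goal_id="demo_goal",
        status=SessionStatus.ACTIVE,
        timestamps=SessionTimestamps(
            started_at="2026-02-06T14:30:22", updated_at="2026-02-06T14:30:22",
        ),
    )
    await store.write_state(session_id, state)  # sessions/{session_id}/state.json

    loaded = await store.read_state(session_id)
    active = await store.list_sessions(status="active", limit=10)
    print(loaded.status if loaded else None, len(active))

asyncio.run(main())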
+179
View File
@@ -0,0 +1,179 @@
"""
State Writer - Dual-write adapter for migration period.
Writes execution state to both old (Run/RunSummary) and new (state.json) formats
to maintain backward compatibility during the transition period.
"""
import logging
import os
from datetime import datetime
from framework.schemas.run import Problem, Run, RunMetrics, RunStatus
from framework.schemas.session_state import SessionState, SessionStatus
from framework.storage.concurrent import ConcurrentStorage
from framework.storage.session_store import SessionStore
logger = logging.getLogger(__name__)
class StateWriter:
"""
Writes execution state to both old and new formats during migration.
During the dual-write phase:
- New format (state.json) is written when USE_UNIFIED_SESSIONS=true
- Old format (Run/RunSummary) is always written for backward compatibility
"""
def __init__(self, old_storage: ConcurrentStorage, session_store: SessionStore):
"""
Initialize state writer.
Args:
old_storage: ConcurrentStorage for old format (runs/, summaries/)
session_store: SessionStore for new format (sessions/*/state.json)
"""
self.old = old_storage
self.new = session_store
self.dual_write_enabled = os.getenv("USE_UNIFIED_SESSIONS", "false").lower() == "true"
async def write_execution_state(
self,
session_id: str,
state: SessionState,
) -> None:
"""
Write execution state to both old and new formats.
Args:
session_id: Session ID
state: SessionState to write
"""
# Write to new format if enabled
if self.dual_write_enabled:
try:
await self.new.write_state(session_id, state)
logger.debug(f"Wrote state.json for session {session_id}")
except Exception as e:
logger.error(f"Failed to write state.json for {session_id}: {e}")
# Don't fail - old format is still written
# Always write to old format for backward compatibility
try:
run = self._convert_to_run(state)
await self.old.save_run(run)
logger.debug(f"Wrote Run object for session {session_id}")
except Exception as e:
logger.error(f"Failed to write Run object for {session_id}: {e}")
# This is more critical - reraise if old format fails
raise
def _convert_to_run(self, state: SessionState) -> Run:
"""
Convert SessionState to legacy Run object.
Args:
state: SessionState to convert
Returns:
Run object
"""
# Map SessionStatus to RunStatus
status_mapping = {
SessionStatus.ACTIVE: RunStatus.RUNNING,
SessionStatus.PAUSED: RunStatus.RUNNING, # Paused is still "running" in old format
SessionStatus.COMPLETED: RunStatus.COMPLETED,
SessionStatus.FAILED: RunStatus.FAILED,
SessionStatus.CANCELLED: RunStatus.CANCELLED,
}
run_status = status_mapping.get(state.status, RunStatus.FAILED)
# Convert timestamps
started_at = datetime.fromisoformat(state.timestamps.started_at)
completed_at = (
datetime.fromisoformat(state.timestamps.completed_at)
if state.timestamps.completed_at
else None
)
# Build RunMetrics
metrics = RunMetrics(
total_decisions=state.metrics.decision_count,
successful_decisions=state.metrics.decision_count
- len(state.progress.nodes_with_failures), # Approximate
failed_decisions=len(state.progress.nodes_with_failures),
total_tokens=state.metrics.total_input_tokens + state.metrics.total_output_tokens,
total_latency_ms=state.progress.total_latency_ms,
nodes_executed=state.metrics.nodes_executed,
edges_traversed=state.metrics.edges_traversed,
)
# Convert problems (SessionState stores as dicts, Run expects Problem objects)
problems = []
for p_dict in state.problems:
# Handle both old Problem objects and new dict format
if isinstance(p_dict, dict):
problems.append(Problem(**p_dict))
else:
problems.append(p_dict)
# Convert decisions (SessionState stores as dicts, Run expects Decision objects)
from framework.schemas.decision import Decision
decisions = []
for d_dict in state.decisions:
# Handle both old Decision objects and new dict format
if isinstance(d_dict, dict):
try:
decisions.append(Decision(**d_dict))
except Exception:
# Skip invalid decisions
continue
else:
decisions.append(d_dict)
# Create Run object
run = Run(
id=state.session_id, # Use session_id as run_id
goal_id=state.goal_id,
started_at=started_at,
status=run_status,
completed_at=completed_at,
decisions=decisions,
problems=problems,
metrics=metrics,
goal_description="", # Not stored in SessionState
input_data=state.input_data,
output_data=state.result.output,
)
return run
async def read_state(
self,
session_id: str,
prefer_new: bool = True,
) -> SessionState | None:
"""
Read execution state from either format.
Args:
session_id: Session ID
prefer_new: If True, try new format first (default)
Returns:
SessionState or None if not found
"""
if prefer_new:
# Try new format first
state = await self.new.read_state(session_id)
if state:
return state
# Fall back to old format
run = await self.old.load_run(session_id)
if run:
return SessionState.from_legacy_run(run, session_id)
return None
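A sketch of how the dual-write flag is intended to be used; the StateWriter import path is an assumption, and old_storage stands in for an already-constructed ConcurrentStorage whose constructor is not shown in this diff:

import os
from pathlib import Path

from framework.storage.session_store import SessionStore
from framework.storage.state_writer import StateWriter  # import path assumed

# The flag is read once in __init__, so it must be set before constructing the writer.
os.environ["USE_UNIFIED_SESSIONS"] = "true"

# `old_storage` is an already-constructed ConcurrentStorage (construction not shown here).
writer = StateWriter(old_storage=old_storage, session_store=SessionStore(Path("/tmp/demo_agent")))

# Writes land in both formats while the flag is on; reads prefer state.json and
# fall back to the legacy Run object:
#     await writer.write_execution_state(session_id, state)
#     state = await writer.read_state(session_id, prefer_new=True)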
+25
View File
@@ -1,4 +1,6 @@
import logging
import platform
import subprocess
import time
from textual.app import App, ComposeResult
@@ -11,6 +13,7 @@ from framework.runtime.event_bus import AgentEvent, EventType
from framework.tui.widgets.chat_repl import ChatRepl
from framework.tui.widgets.graph_view import GraphOverview
from framework.tui.widgets.log_pane import LogPane
from framework.tui.widgets.selectable_rich_log import SelectableRichLog
class StatusBar(Container):
@@ -202,6 +205,8 @@ class AdenTUI(App):
BINDINGS = [
Binding("q", "quit", "Quit"),
Binding("ctrl+c", "ctrl_c", "Interrupt", show=False, priority=True),
Binding("super+c", "ctrl_c", "Copy", show=False, priority=True),
Binding("ctrl+s", "screenshot", "Screenshot (SVG)", show=True, priority=True),
Binding("tab", "focus_next", "Next Panel", show=True),
Binding("shift+tab", "focus_previous", "Previous Panel", show=False),
@@ -217,6 +222,26 @@ class AdenTUI(App):
self.status_bar = StatusBar(graph_id=runtime.graph.id)
self.is_ready = False
def open_url(self, url: str, *, new_tab: bool = True) -> None:
"""Override to use native `open` for file:// URLs on macOS."""
if url.startswith("file://") and platform.system() == "Darwin":
path = url.removeprefix("file://")
subprocess.Popen(["open", path])
else:
super().open_url(url, new_tab=new_tab)
def action_ctrl_c(self) -> None:
# Check if any SelectableRichLog has an active selection to copy
for widget in self.query(SelectableRichLog):
if widget.selection is not None:
text = widget.copy_selection()
if text:
widget.clear_selection()
self.notify("Copied to clipboard", severity="information", timeout=2)
return
self.notify("Press [b]q[/b] to quit", severity="warning", timeout=3)
def compose(self) -> ComposeResult:
yield self.status_bar
+19 -5
View File
@@ -21,9 +21,10 @@ from typing import Any
from textual.app import ComposeResult
from textual.containers import Vertical
from textual.widgets import Input, Label, RichLog
from textual.widgets import Input, Label
from framework.runtime.agent_runtime import AgentRuntime
from framework.tui.widgets.selectable_rich_log import SelectableRichLog as RichLog
class ChatRepl(Vertical):
@@ -88,16 +89,29 @@ class ChatRepl(Vertical):
self._agent_thread.start()
def compose(self) -> ComposeResult:
yield RichLog(id="chat-history", highlight=True, markup=True, auto_scroll=False, wrap=True)
yield RichLog(
id="chat-history",
highlight=True,
markup=True,
auto_scroll=False,
wrap=True,
min_width=0,
)
yield Label("Agent is processing...", id="processing-indicator")
yield Input(placeholder="Enter input for agent...", id="chat-input")
# Regex for file:// URIs that are NOT already inside Rich [link=...] markup
_FILE_URI_RE = re.compile(r"(?<!\[link=)(file://\S+)")
_FILE_URI_RE = re.compile(r"(?<!\[link=)(file://[^\s)\]>*]+)")
def _linkify(self, text: str) -> str:
"""Convert bare file:// URIs to clickable Rich [link=...] markup."""
return self._FILE_URI_RE.sub(r"[link=\1]\1[/link]", text)
"""Convert bare file:// URIs to clickable Rich [link=...] markup with short display text."""
def _shorten(match: re.Match) -> str:
uri = match.group(1)
filename = uri.rsplit("/", 1)[-1] if "/" in uri else uri
return f"[link={uri}]{filename}[/link]"
return self._FILE_URI_RE.sub(_shorten, text)
def _write_history(self, content: str) -> None:
"""Write to chat history, only auto-scrolling if user is at the bottom."""
+1 -1
View File
@@ -4,10 +4,10 @@ Graph/Tree Overview Widget - Displays real agent graph structure.
from textual.app import ComposeResult
from textual.containers import Vertical
from textual.widgets import RichLog
from framework.runtime.agent_runtime import AgentRuntime
from framework.runtime.event_bus import EventType
from framework.tui.widgets.selectable_rich_log import SelectableRichLog as RichLog
class GraphOverview(Vertical):
+1 -1
View File
@@ -7,9 +7,9 @@ from datetime import datetime
from textual.app import ComposeResult
from textual.containers import Container
from textual.widgets import RichLog
from framework.runtime.event_bus import AgentEvent, EventType
from framework.tui.widgets.selectable_rich_log import SelectableRichLog as RichLog
class LogPane(Container):
@@ -0,0 +1,206 @@
"""
SelectableRichLog - RichLog with mouse-driven text selection and clipboard copy.
Drop-in replacement for RichLog. Click-and-drag to select text, which is
visually highlighted. Press Ctrl+C to copy selection to clipboard (handled
by app.py). Press Escape or single-click to clear selection.
"""
from __future__ import annotations
import subprocess
import sys
from rich.segment import Segment as RichSegment
from rich.style import Style
from textual.geometry import Offset
from textual.selection import Selection
from textual.strip import Strip
from textual.widgets import RichLog
# Highlight style for selected text
_HIGHLIGHT_STYLE = Style(bgcolor="blue", color="white")
class SelectableRichLog(RichLog):
"""RichLog with mouse-driven text selection."""
DEFAULT_CSS = """
SelectableRichLog {
pointer: text;
}
"""
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self._sel_anchor: Offset | None = None
self._sel_end: Offset | None = None
self._selecting: bool = False
# -- Internal helpers --
def _apply_highlight(self, strip: Strip) -> Strip:
"""Apply highlight with correct precedence (highlight wins over base style)."""
segments = []
for text, style, control in strip._segments:
if control:
segments.append(RichSegment(text, style, control))
else:
new_style = (style + _HIGHLIGHT_STYLE) if style else _HIGHLIGHT_STYLE
segments.append(RichSegment(text, new_style, control))
return Strip(segments, strip.cell_length)
# -- Selection helpers --
@property
def selection(self) -> Selection | None:
"""Build a Selection from current anchor/end, or None if no selection."""
if self._sel_anchor is None or self._sel_end is None:
return None
if self._sel_anchor == self._sel_end:
return None
return Selection.from_offsets(self._sel_anchor, self._sel_end)
def _mouse_to_content(self, event_x: int, event_y: int) -> Offset:
"""Convert viewport mouse coords to content (line, col) coords."""
scroll_x, scroll_y = self.scroll_offset
return Offset(scroll_x + event_x, scroll_y + event_y)
def clear_selection(self) -> None:
"""Clear any active selection."""
had_selection = self._sel_anchor is not None
self._sel_anchor = None
self._sel_end = None
self._selecting = False
if had_selection:
self.refresh()
# -- Mouse handlers (left button only) --
def on_mouse_down(self, event) -> None:
"""Start selection on left mouse button."""
if event.button != 1:
return
self._sel_anchor = self._mouse_to_content(event.x, event.y)
self._sel_end = self._sel_anchor
self._selecting = True
self.capture_mouse()
self.refresh()
def on_mouse_move(self, event) -> None:
"""Extend selection while dragging."""
if not self._selecting:
return
self._sel_end = self._mouse_to_content(event.x, event.y)
self.refresh()
def on_mouse_up(self, event) -> None:
"""End selection on mouse release."""
if not self._selecting:
return
self._selecting = False
self.release_mouse()
# Single-click (no drag) clears selection
if self._sel_anchor == self._sel_end:
self.clear_selection()
# -- Keyboard handlers --
def on_key(self, event) -> None:
"""Clear selection on Escape."""
if event.key == "escape":
self.clear_selection()
# -- Rendering with highlight --
def render_line(self, y: int) -> Strip:
"""Override to apply selection highlight on top of the base strip."""
strip = super().render_line(y)
sel = self.selection
if sel is None:
return strip
# Determine which content line this viewport row corresponds to
_, scroll_y = self.scroll_offset
content_y = scroll_y + y
span = sel.get_span(content_y)
if span is None:
return strip
start_x, end_x = span
cell_len = strip.cell_length
if cell_len == 0:
return strip
scroll_x, _ = self.scroll_offset
# -1 means "to end of content line" — use viewport end
if end_x == -1:
end_x = cell_len
else:
# Convert content-space x to viewport-space x
end_x = end_x - scroll_x
# Convert content-space x to viewport-space x
start_x = start_x - scroll_x
# Clamp to viewport strip bounds
start_x = max(0, start_x)
end_x = min(end_x, cell_len)
if start_x >= end_x:
return strip
# Divide strip into [before, selected, after] and highlight the middle
parts = strip.divide([start_x, end_x])
if len(parts) < 2:
return strip
highlighted_parts: list[Strip] = []
for i, part in enumerate(parts):
if i == 1:
highlighted_parts.append(self._apply_highlight(part))
else:
highlighted_parts.append(part)
return Strip.join(highlighted_parts)
# -- Text extraction & clipboard --
def get_selected_text(self) -> str | None:
"""Extract the plain text of the current selection, or None."""
sel = self.selection
if sel is None:
return None
# Build full text from all lines
all_text = "\n".join(strip.text for strip in self.lines)
extracted = sel.extract(all_text)
return extracted if extracted else None
def copy_selection(self) -> str | None:
"""Copy selected text to system clipboard. Returns text or None."""
text = self.get_selected_text()
if not text:
return None
_copy_to_clipboard(text)
return text
def _copy_to_clipboard(text: str) -> None:
"""Copy text to system clipboard using platform-native tools."""
try:
if sys.platform == "darwin":
subprocess.run(["pbcopy"], input=text.encode(), check=True, timeout=5)
elif sys.platform.startswith("linux"):
subprocess.run(
["xclip", "-selection", "clipboard"],
input=text.encode(),
check=True,
timeout=5,
)
except (subprocess.SubprocessError, FileNotFoundError):
pass
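Below is a minimal sketch of how the selectable log widget above might be wired into a Textual app for manual testing. It assumes `SelectableRichLog` from this module is in scope; the key binding, widget id, and demo text are illustrative, not part of the actual TUI.

```python
from textual.app import App, ComposeResult


class LogDemoApp(App):
    """Tiny harness for the selectable log widget defined above (assumed import)."""

    BINDINGS = [("ctrl+shift+c", "copy_selection", "Copy selection")]

    def compose(self) -> ComposeResult:
        yield SelectableRichLog(id="log")

    def on_mount(self) -> None:
        log = self.query_one(SelectableRichLog)
        for i in range(50):
            log.write(f"line {i}: drag with the mouse to select me")

    def action_copy_selection(self) -> None:
        # Copies the current drag selection via the platform clipboard helper above.
        copied = self.query_one(SelectableRichLog).copy_selection()
        if copied:
            self.notify(f"Copied {len(copied)} characters")


if __name__ == "__main__":
    LogDemoApp().run()
```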
+11 -1
View File
@@ -1,10 +1,20 @@
"""Tests for the BuilderQuery interface - how Builder analyzes agent runs."""
"""Tests for the BuilderQuery interface - how Builder analyzes agent runs.
DEPRECATED: These tests rely on the deprecated FileStorage backend.
BuilderQuery and Runtime both use FileStorage which is deprecated.
New code should use unified session storage instead.
"""
from pathlib import Path
import pytest
from framework import BuilderQuery, Runtime
from framework.schemas.run import RunStatus
# Mark all tests in this module as skipped - they rely on deprecated FileStorage
pytestmark = pytest.mark.skip(reason="Tests rely on deprecated FileStorage backend")
def create_successful_run(runtime: Runtime, goal_id: str = "test_goal") -> str:
"""Helper to create a successful run with decisions."""
+20
View File
@@ -26,6 +26,11 @@ def create_test_run(
)
@pytest.mark.skip(
reason="FileStorage.save_run() is deprecated and now a no-op. "
"ConcurrentStorage wraps FileStorage, so these tests no longer work. "
"New sessions use unified storage at sessions/{session_id}/state.json"
)
@pytest.mark.asyncio
async def test_cache_invalidation_on_save(tmp_path: Path):
"""Test that summary cache is invalidated when a run is saved.
@@ -62,6 +67,11 @@ async def test_cache_invalidation_on_save(tmp_path: Path):
await storage.stop()
@pytest.mark.skip(
reason="FileStorage.save_run() is deprecated and now a no-op. "
"ConcurrentStorage wraps FileStorage, so these tests no longer work. "
"New sessions use unified storage at sessions/{session_id}/state.json"
)
@pytest.mark.asyncio
async def test_batched_write_cache_consistency(tmp_path: Path):
"""Test that cache is only updated after successful batched write.
@@ -104,6 +114,11 @@ async def test_batched_write_cache_consistency(tmp_path: Path):
await storage.stop()
@pytest.mark.skip(
reason="FileStorage.save_run() is deprecated and now a no-op. "
"ConcurrentStorage wraps FileStorage, so these tests no longer work. "
"New sessions use unified storage at sessions/{session_id}/state.json"
)
@pytest.mark.asyncio
async def test_immediate_write_updates_cache(tmp_path: Path):
"""Test that immediate writes still update cache correctly."""
@@ -129,6 +144,11 @@ async def test_immediate_write_updates_cache(tmp_path: Path):
await storage.stop()
@pytest.mark.skip(
reason="FileStorage.save_run() is deprecated and now a no-op. "
"ConcurrentStorage wraps FileStorage, so these tests no longer work. "
"New sessions use unified storage at sessions/{session_id}/state.json"
)
@pytest.mark.asyncio
async def test_summary_cache_invalidated_on_multiple_saves(tmp_path: Path):
"""Test that summary cache is invalidated on each save, not just the first."""
+2 -8
View File
@@ -8,7 +8,6 @@ Set HIVE_TEST_LLM_MODEL=<model> to override the real model.
from __future__ import annotations
import asyncio
import os
from collections.abc import AsyncIterator, Callable
from dataclasses import dataclass
@@ -952,14 +951,9 @@ async def test_client_facing_node_streams_output():
config=LoopConfig(max_iterations=5),
)
# client_facing + text-only blocks for user input; use shutdown to unblock
async def auto_shutdown():
await asyncio.sleep(0.05)
node.signal_shutdown()
task = asyncio.create_task(auto_shutdown())
# Text-only on client_facing no longer blocks (no ask_user called),
# so the node completes without needing a shutdown workaround.
result = await node.execute(ctx)
await task
assert result.success
+122 -34
View File
@@ -447,14 +447,9 @@ class TestEventBusLifecycle:
ctx = build_ctx(runtime, spec, memory, llm)
node = EventLoopNode(event_bus=bus, config=LoopConfig(max_iterations=5))
# client_facing + text-only blocks for user input; use shutdown to unblock
async def auto_shutdown():
await asyncio.sleep(0.05)
node.signal_shutdown()
task = asyncio.create_task(auto_shutdown())
# Text-only on client_facing no longer blocks (no ask_user), so
# the node completes without needing shutdown.
await node.execute(ctx)
await task
assert EventType.CLIENT_OUTPUT_DELTA in received_types
assert EventType.LLM_TEXT_DELTA not in received_types
@@ -480,11 +475,38 @@ class TestClientFacingBlocking:
)
@pytest.mark.asyncio
async def test_client_facing_blocks_on_text(self, runtime, memory, client_spec):
"""client_facing + text-only response blocks until inject_event."""
async def test_text_only_no_blocking(self, runtime, memory, client_spec):
"""client_facing + text-only (no ask_user) should NOT block."""
llm = MockStreamingLLM(
scenarios=[
text_scenario("Hello!"),
text_scenario("Hello! Here is your status update."),
]
)
bus = EventBus()
node = EventLoopNode(event_bus=bus, config=LoopConfig(max_iterations=5))
ctx = build_ctx(runtime, client_spec, memory, llm)
# Should complete without blocking — no ask_user called, no output_keys required
result = await node.execute(ctx)
assert result.success is True
assert llm._call_index >= 1
@pytest.mark.asyncio
async def test_ask_user_triggers_blocking(self, runtime, memory, client_spec):
"""client_facing + ask_user() blocks until inject_event."""
# Give the node an output key so the judge doesn't auto-accept
# after the user responds — it needs set_output first.
client_spec.output_keys = ["answer"]
llm = MockStreamingLLM(
scenarios=[
# Turn 1: LLM greets user and calls ask_user
tool_call_scenario(
"ask_user", {"question": "What do you need?"}, tool_use_id="ask_1"
),
# Turn 2: after user responds, LLM processes and sets output
tool_call_scenario("set_output", {"key": "answer", "value": "help provided"}),
# Turn 3: text finish (implicit judge accepts — output key set)
text_scenario("Got your message."),
]
)
@@ -495,21 +517,19 @@ class TestClientFacingBlocking:
async def user_responds():
await asyncio.sleep(0.05)
await node.inject_event("I need help")
await asyncio.sleep(0.05)
node.signal_shutdown()
user_task = asyncio.create_task(user_responds())
result = await node.execute(ctx)
await user_task
assert result.success is True
# LLM called once; after inject_event, implicit judge ACCEPTs
# (no required output_keys) before a second LLM turn occurs.
assert llm._call_index >= 1
# LLM called at least twice: once for ask_user turn, once after user responded
assert llm._call_index >= 2
assert result.output["answer"] == "help provided"
@pytest.mark.asyncio
async def test_client_facing_does_not_block_on_tools(self, runtime, memory):
"""client_facing + tool calls should NOT block — judge evaluates normally."""
"""client_facing + tool calls (no ask_user) should NOT block."""
spec = NodeSpec(
id="chat",
name="Chat",
@@ -518,10 +538,9 @@ class TestClientFacingBlocking:
output_keys=["result"],
client_facing=True,
)
# Scenario 1: LLM calls set_output (tool call present → no blocking, judge RETRYs)
# Scenario 2: LLM produces text (implicit judge sees output key set → ACCEPT)
# But scenario 2 is text-only on client_facing → would block.
# So we need shutdown to handle that case.
# Scenario 1: LLM calls set_output
# Scenario 2: LLM produces text → implicit judge ACCEPTs (output key set)
# No ask_user called, so no blocking occurs.
llm = MockStreamingLLM(
scenarios=[
tool_call_scenario("set_output", {"key": "result", "value": "done"}),
@@ -531,18 +550,8 @@ class TestClientFacingBlocking:
node = EventLoopNode(config=LoopConfig(max_iterations=5))
ctx = build_ctx(runtime, spec, memory, llm)
# After set_output, implicit judge RETRYs (tool calls present).
# Next turn: text-only on client_facing → blocks.
# But implicit judge should ACCEPT first (output key is set, no tools).
# Actually, client_facing check happens BEFORE judge, so it blocks.
# Use shutdown as safety net.
async def auto_shutdown():
await asyncio.sleep(0.1)
node.signal_shutdown()
task = asyncio.create_task(auto_shutdown())
# Should complete without blocking — no ask_user called
result = await node.execute(ctx)
await task
assert result.success is True
assert result.output["result"] == "done"
@@ -568,7 +577,11 @@ class TestClientFacingBlocking:
@pytest.mark.asyncio
async def test_signal_shutdown_unblocks(self, runtime, memory, client_spec):
"""signal_shutdown should unblock a waiting client_facing node."""
llm = MockStreamingLLM(scenarios=[text_scenario("Waiting...")])
llm = MockStreamingLLM(
scenarios=[
tool_call_scenario("ask_user", {"question": "Waiting..."}, tool_use_id="ask_1"),
]
)
bus = EventBus()
node = EventLoopNode(event_bus=bus, config=LoopConfig(max_iterations=10))
ctx = build_ctx(runtime, client_spec, memory, llm)
@@ -585,8 +598,12 @@ class TestClientFacingBlocking:
@pytest.mark.asyncio
async def test_client_input_requested_event_published(self, runtime, memory, client_spec):
"""CLIENT_INPUT_REQUESTED should be published when blocking."""
llm = MockStreamingLLM(scenarios=[text_scenario("Hello!")])
"""CLIENT_INPUT_REQUESTED should be published when ask_user blocks."""
llm = MockStreamingLLM(
scenarios=[
tool_call_scenario("ask_user", {"question": "Hello!"}, tool_use_id="ask_1"),
]
)
bus = EventBus()
received = []
@@ -612,6 +629,77 @@ class TestClientFacingBlocking:
assert len(received) >= 1
assert received[0].type == EventType.CLIENT_INPUT_REQUESTED
@pytest.mark.asyncio
async def test_ask_user_with_real_tools(self, runtime, memory):
"""ask_user alongside real tool calls still triggers blocking."""
spec = NodeSpec(
id="chat",
name="Chat",
description="chat node",
node_type="event_loop",
output_keys=[],
client_facing=True,
)
# LLM calls a real tool AND ask_user in the same turn
llm = MockStreamingLLM(
scenarios=[
[
ToolCallEvent(
tool_use_id="tool_1", tool_name="search", tool_input={"q": "test"}
),
ToolCallEvent(tool_use_id="ask_1", tool_name="ask_user", tool_input={}),
FinishEvent(
stop_reason="tool_calls", input_tokens=10, output_tokens=5, model="mock"
),
],
text_scenario("Done"),
]
)
def my_executor(tool_use: ToolUse) -> ToolResult:
return ToolResult(tool_use_id=tool_use.id, content="result", is_error=False)
node = EventLoopNode(
tool_executor=my_executor,
config=LoopConfig(max_iterations=5),
)
ctx = build_ctx(
runtime, spec, memory, llm, tools=[Tool(name="search", description="", parameters={})]
)
async def unblock():
await asyncio.sleep(0.05)
await node.inject_event("user input")
task = asyncio.create_task(unblock())
result = await node.execute(ctx)
await task
assert result.success is True
assert llm._call_index >= 2
@pytest.mark.asyncio
async def test_ask_user_not_available_non_client_facing(self, runtime, memory):
"""ask_user tool should NOT be injected for non-client-facing nodes."""
spec = NodeSpec(
id="internal",
name="Internal",
description="internal node",
node_type="event_loop",
output_keys=[],
)
llm = MockStreamingLLM(scenarios=[text_scenario("thinking...")])
node = EventLoopNode(config=LoopConfig(max_iterations=2))
ctx = build_ctx(runtime, spec, memory, llm)
await node.execute(ctx)
# Verify ask_user was NOT in the tools passed to the LLM
assert llm._call_index >= 1
for call in llm.stream_calls:
tool_names = [t.name for t in (call["tools"] or [])]
assert "ask_user" not in tool_names
# ===========================================================================
# Tool execution
+12
View File
@@ -37,6 +37,10 @@ class TestRuntimeBasics:
runtime.end_run(success=True)
assert runtime.current_run is None
@pytest.mark.skip(
reason="FileStorage.save_run() is deprecated and now a no-op. "
"New sessions use unified storage at sessions/{session_id}/state.json"
)
def test_run_saved_on_end(self, tmp_path: Path):
"""Run is saved to storage when ended."""
runtime = Runtime(tmp_path)
@@ -341,6 +345,10 @@ class TestConvenienceMethods:
class TestNarrativeGeneration:
"""Test automatic narrative generation."""
@pytest.mark.skip(
reason="FileStorage.save_run() and get_runs_by_goal() are deprecated. "
"New sessions use unified storage at sessions/{session_id}/state.json"
)
def test_default_narrative_success(self, tmp_path: Path):
"""Test default narrative for successful run."""
runtime = Runtime(tmp_path)
@@ -360,6 +368,10 @@ class TestNarrativeGeneration:
run = runtime.storage.load_run(runtime.storage.get_runs_by_goal("test_goal")[0])
assert "completed successfully" in run.narrative
@pytest.mark.skip(
reason="FileStorage.save_run() and get_runs_by_goal() are deprecated. "
"New sessions use unified storage at sessions/{session_id}/state.json"
)
def test_default_narrative_failure(self, tmp_path: Path):
"""Test default narrative for failed run."""
runtime = Runtime(tmp_path)
+942
View File
@@ -0,0 +1,942 @@
"""Tests for RuntimeLogger and RuntimeLogStore.
Tests incremental JSONL writes (L2/L3), crash resilience, and L1
summary aggregation at end_run().
"""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from framework.runtime.runtime_log_schemas import (
NodeDetail,
NodeStepLog,
RunSummaryLog,
ToolCallLog,
)
from framework.runtime.runtime_log_store import RuntimeLogStore
from framework.runtime.runtime_logger import RuntimeLogger
# ---------------------------------------------------------------------------
# RuntimeLogStore tests
# ---------------------------------------------------------------------------
class TestRuntimeLogStore:
@pytest.mark.asyncio
async def test_ensure_run_dir_creates_directory(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
store.ensure_run_dir("test_run_1")
assert (tmp_path / "logs" / "runs" / "test_run_1").is_dir()
@pytest.mark.asyncio
async def test_append_and_load_details(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
store.ensure_run_dir("test_run_2")
detail1 = NodeDetail(
node_id="node-1",
node_name="Search Node",
node_type="event_loop",
success=True,
total_steps=2,
exit_status="success",
accept_count=1,
retry_count=1,
)
detail2 = NodeDetail(
node_id="node-2",
node_name="Process Node",
node_type="function",
success=True,
total_steps=1,
)
store.append_node_detail("test_run_2", detail1)
store.append_node_detail("test_run_2", detail2)
loaded = await store.load_details("test_run_2")
assert loaded is not None
assert len(loaded.nodes) == 2
assert loaded.nodes[0].node_id == "node-1"
assert loaded.nodes[0].exit_status == "success"
assert loaded.nodes[1].node_type == "function"
@pytest.mark.asyncio
async def test_append_and_load_tool_logs(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
store.ensure_run_dir("test_run_3")
step = NodeStepLog(
node_id="node-1",
node_type="event_loop",
step_index=0,
llm_text="I will search for the data.",
tool_calls=[
ToolCallLog(
tool_use_id="tc_1",
tool_name="web_search",
tool_input={"query": "test"},
result="Found 3 results",
is_error=False,
)
],
input_tokens=100,
output_tokens=50,
latency_ms=1200,
verdict="CONTINUE",
)
store.append_step("test_run_3", step)
loaded = await store.load_tool_logs("test_run_3")
assert loaded is not None
assert len(loaded.steps) == 1
assert loaded.steps[0].tool_calls[0].tool_name == "web_search"
assert loaded.steps[0].input_tokens == 100
assert loaded.steps[0].node_id == "node-1"
@pytest.mark.asyncio
async def test_save_and_load_summary(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
summary = RunSummaryLog(
run_id="test_run_1",
agent_id="agent-a",
goal_id="goal-1",
status="success",
total_nodes_executed=3,
node_path=["node-1", "node-2", "node-3"],
started_at="2025-01-01T00:00:00",
duration_ms=5000,
execution_quality="clean",
)
await store.save_summary("test_run_1", summary)
loaded = await store.load_summary("test_run_1")
assert loaded is not None
assert loaded.run_id == "test_run_1"
assert loaded.status == "success"
assert loaded.total_nodes_executed == 3
assert loaded.goal_id == "goal-1"
assert loaded.execution_quality == "clean"
@pytest.mark.asyncio
async def test_load_missing_run_returns_none(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
assert await store.load_summary("nonexistent") is None
assert await store.load_details("nonexistent") is None
assert await store.load_tool_logs("nonexistent") is None
@pytest.mark.asyncio
async def test_list_runs_empty(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
runs = await store.list_runs()
assert runs == []
@pytest.mark.asyncio
async def test_list_runs_with_filter(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
# Save a success run
store.ensure_run_dir("run_ok")
await store.save_summary(
"run_ok",
RunSummaryLog(
run_id="run_ok",
status="success",
started_at="2025-01-01T00:00:01",
),
)
# Save a failure run
store.ensure_run_dir("run_fail")
await store.save_summary(
"run_fail",
RunSummaryLog(
run_id="run_fail",
status="failure",
needs_attention=True,
started_at="2025-01-01T00:00:02",
),
)
# All runs
all_runs = await store.list_runs()
assert len(all_runs) == 2
# Filter by status
success_runs = await store.list_runs(status="success")
assert len(success_runs) == 1
assert success_runs[0].run_id == "run_ok"
# Filter by needs_attention
attention_runs = await store.list_runs(status="needs_attention")
assert len(attention_runs) == 1
assert attention_runs[0].run_id == "run_fail"
@pytest.mark.asyncio
async def test_list_runs_sorted_by_timestamp_desc(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
for i in range(5):
run_id = f"run_{i}"
store.ensure_run_dir(run_id)
await store.save_summary(
run_id,
RunSummaryLog(
run_id=run_id,
status="success",
started_at=f"2025-01-01T00:00:{i:02d}",
),
)
runs = await store.list_runs()
# Most recent first
assert runs[0].run_id == "run_4"
assert runs[-1].run_id == "run_0"
@pytest.mark.asyncio
async def test_list_runs_limit(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
for i in range(10):
run_id = f"run_{i}"
store.ensure_run_dir(run_id)
await store.save_summary(
run_id,
RunSummaryLog(
run_id=run_id,
status="success",
started_at=f"2025-01-01T00:00:{i:02d}",
),
)
runs = await store.list_runs(limit=3)
assert len(runs) == 3
@pytest.mark.asyncio
async def test_list_runs_includes_in_progress(self, tmp_path: Path):
"""Directories without summary.json appear as in_progress."""
store = RuntimeLogStore(tmp_path / "logs")
# Completed run with summary
store.ensure_run_dir("run_done")
await store.save_summary(
"run_done",
RunSummaryLog(
run_id="run_done",
status="success",
started_at="2025-01-01T00:00:01",
),
)
# In-progress run: directory exists but no summary.json
store.ensure_run_dir("run_active")
all_runs = await store.list_runs()
assert len(all_runs) == 2
run_ids = {r.run_id for r in all_runs}
assert "run_done" in run_ids
assert "run_active" in run_ids
active = next(r for r in all_runs if r.run_id == "run_active")
assert active.status == "in_progress"
@pytest.mark.asyncio
async def test_read_node_details_sync(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
store.ensure_run_dir("test_run")
store.append_node_detail(
"test_run",
NodeDetail(
node_id="n1", node_name="A", success=True, input_tokens=100, output_tokens=50
),
)
store.append_node_detail(
"test_run",
NodeDetail(node_id="n2", node_name="B", success=False, error="oops"),
)
details = store.read_node_details_sync("test_run")
assert len(details) == 2
assert details[0].node_id == "n1"
assert details[1].error == "oops"
@pytest.mark.asyncio
async def test_corrupt_jsonl_line_skipped(self, tmp_path: Path):
"""A corrupt JSONL line should be skipped without breaking reads."""
store = RuntimeLogStore(tmp_path / "logs")
store.ensure_run_dir("test_run")
# Write a valid line, a corrupt line, then another valid line
jsonl_path = tmp_path / "logs" / "runs" / "test_run" / "details.jsonl"
valid1 = json.dumps(NodeDetail(node_id="n1", node_name="A", success=True).model_dump())
valid2 = json.dumps(NodeDetail(node_id="n2", node_name="B", success=True).model_dump())
jsonl_path.write_text(f"{valid1}\n{{corrupt line\n{valid2}\n")
details = store.read_node_details_sync("test_run")
assert len(details) == 2
assert details[0].node_id == "n1"
assert details[1].node_id == "n2"
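The corrupt-line test assumes the store reads JSONL tolerantly. The actual `read_node_details_sync` implementation is not part of this diff, so this is only a sketch of the skip-and-continue behaviour the test expects:

```python
import json
from pathlib import Path


def read_jsonl_tolerant(path: Path) -> list[dict]:
    """Parse a JSONL file, skipping corrupt lines instead of failing the whole read."""
    records: list[dict] = []
    for line in path.read_text().splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            continue  # corrupt line: skip it, keep the rest
    return records
```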
# ---------------------------------------------------------------------------
# RuntimeLogger tests
# ---------------------------------------------------------------------------
class TestRuntimeLogger:
@pytest.mark.asyncio
async def test_start_run_returns_run_id(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
rl = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rl.start_run("goal-1")
assert run_id
assert len(run_id) > 10 # timestamp + uuid
@pytest.mark.asyncio
async def test_start_run_creates_directory(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
rl = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rl.start_run("goal-1")
assert (tmp_path / "logs" / "runs" / run_id).is_dir()
@pytest.mark.asyncio
async def test_log_step_writes_to_disk_immediately(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
rl = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rl.start_run("goal-1")
rl.log_step(
node_id="node-1",
node_type="event_loop",
step_index=0,
llm_text="Searching.",
input_tokens=100,
output_tokens=50,
)
# Verify the file exists and has one line
jsonl_path = tmp_path / "logs" / "runs" / run_id / "tool_logs.jsonl"
assert jsonl_path.exists()
lines = [line for line in jsonl_path.read_text().strip().split("\n") if line]
assert len(lines) == 1
data = json.loads(lines[0])
assert data["node_id"] == "node-1"
assert data["input_tokens"] == 100
@pytest.mark.asyncio
async def test_log_node_complete_writes_to_disk_immediately(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
rl = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rl.start_run("goal-1")
rl.log_node_complete(
node_id="node-1",
node_name="Search",
node_type="event_loop",
success=True,
exit_status="success",
)
jsonl_path = tmp_path / "logs" / "runs" / run_id / "details.jsonl"
assert jsonl_path.exists()
lines = [line for line in jsonl_path.read_text().strip().split("\n") if line]
assert len(lines) == 1
data = json.loads(lines[0])
assert data["node_id"] == "node-1"
assert data["exit_status"] == "success"
@pytest.mark.asyncio
async def test_full_lifecycle(self, tmp_path: Path):
"""Test start_run -> log_step (x3) -> log_node_complete -> end_run."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Step 0: RETRY (event_loop iteration)
rt_logger.log_step(
node_id="node-1",
node_type="event_loop",
step_index=0,
verdict="RETRY",
verdict_feedback="Missing output keys: ['result']",
tool_calls=[
{
"tool_use_id": "tc_1",
"tool_name": "web_search",
"tool_input": {"query": "test"},
"content": "Found data",
"is_error": False,
}
],
llm_text="Let me search for that.",
input_tokens=100,
output_tokens=50,
latency_ms=1000,
)
# Step 1: CONTINUE (unjudged)
rt_logger.log_step(
node_id="node-1",
node_type="event_loop",
step_index=1,
verdict="CONTINUE",
verdict_feedback="Unjudged",
tool_calls=[],
llm_text="Processing...",
input_tokens=80,
output_tokens=30,
latency_ms=500,
)
# Step 2: ACCEPT
rt_logger.log_step(
node_id="node-1",
node_type="event_loop",
step_index=2,
verdict="ACCEPT",
verdict_feedback="All outputs set",
tool_calls=[],
llm_text="Here is your result.",
input_tokens=90,
output_tokens=40,
latency_ms=800,
)
# Log node completion
rt_logger.log_node_complete(
node_id="node-1",
node_name="Search Node",
node_type="event_loop",
success=True,
total_steps=3,
tokens_used=390,
input_tokens=270,
output_tokens=120,
latency_ms=2300,
exit_status="success",
accept_count=1,
retry_count=1,
continue_count=1,
)
await rt_logger.end_run(
status="success",
duration_ms=2300,
node_path=["node-1"],
execution_quality="clean",
)
# Verify Level 1: Summary
summary = await store.load_summary(run_id)
assert summary is not None
assert summary.status == "success"
assert summary.total_nodes_executed == 1
assert summary.total_input_tokens == 270
assert summary.total_output_tokens == 120
assert summary.needs_attention is False
assert summary.duration_ms == 2300
assert summary.execution_quality == "clean"
assert summary.node_path == ["node-1"]
# Verify Level 2: Details
details = await store.load_details(run_id)
assert details is not None
assert len(details.nodes) == 1
assert details.nodes[0].node_id == "node-1"
assert details.nodes[0].exit_status == "success"
assert details.nodes[0].accept_count == 1
assert details.nodes[0].retry_count == 1
# Verify Level 3: Tool logs
tool_logs = await store.load_tool_logs(run_id)
assert tool_logs is not None
assert len(tool_logs.steps) == 3
assert tool_logs.steps[0].tool_calls[0].tool_name == "web_search"
assert tool_logs.steps[0].input_tokens == 100
assert tool_logs.steps[0].verdict == "RETRY"
assert tool_logs.steps[2].verdict == "ACCEPT"
@pytest.mark.asyncio
async def test_multi_node_lifecycle(self, tmp_path: Path):
"""Test logging across multiple nodes in a graph run."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Node 1: event_loop
rt_logger.log_step(
node_id="node-1",
node_type="event_loop",
step_index=0,
verdict="ACCEPT",
llm_text="Done.",
input_tokens=100,
output_tokens=50,
)
rt_logger.log_node_complete(
node_id="node-1",
node_name="Search",
node_type="event_loop",
success=True,
total_steps=1,
tokens_used=150,
input_tokens=100,
output_tokens=50,
exit_status="success",
accept_count=1,
)
# Node 2: function
rt_logger.log_step(
node_id="node-2",
node_type="function",
step_index=0,
latency_ms=50,
)
rt_logger.log_node_complete(
node_id="node-2",
node_name="Process",
node_type="function",
success=True,
total_steps=1,
latency_ms=50,
)
await rt_logger.end_run(
status="success",
duration_ms=1000,
node_path=["node-1", "node-2"],
execution_quality="clean",
)
summary = await store.load_summary(run_id)
assert summary.total_nodes_executed == 2
assert summary.node_path == ["node-1", "node-2"]
assert summary.total_input_tokens == 100
assert summary.total_output_tokens == 50
details = await store.load_details(run_id)
assert len(details.nodes) == 2
@pytest.mark.asyncio
async def test_failed_node_needs_attention(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
rt_logger.log_step(
node_id="node-1",
node_type="event_loop",
step_index=0,
verdict="ESCALATE",
verdict_feedback="Cannot proceed, need human input",
tool_calls=[],
llm_text="I'm stuck.",
input_tokens=50,
output_tokens=20,
latency_ms=300,
)
rt_logger.log_node_complete(
node_id="node-1",
node_name="Search",
node_type="event_loop",
success=False,
error="Judge escalated: Cannot proceed",
total_steps=1,
tokens_used=70,
latency_ms=300,
exit_status="escalated",
escalate_count=1,
)
await rt_logger.end_run(
status="failure",
duration_ms=300,
node_path=["node-1"],
execution_quality="failed",
)
summary = await store.load_summary(run_id)
assert summary is not None
assert summary.needs_attention is True
assert any(
"failed" in r.lower() or "escalat" in r.lower() for r in summary.attention_reasons
)
@pytest.mark.asyncio
async def test_ensure_node_logged_no_op_if_already_logged(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Node logs itself
rt_logger.log_node_complete(
node_id="node-1",
node_name="Search",
node_type="event_loop",
success=True,
exit_status="success",
)
# Executor calls ensure_node_logged — should be no-op
rt_logger.ensure_node_logged(
node_id="node-1",
node_name="Search",
node_type="event_loop",
success=True,
)
# Only one entry on disk
details = store.read_node_details_sync(run_id)
assert len(details) == 1
@pytest.mark.asyncio
async def test_ensure_node_logged_creates_entry_if_missing(self, tmp_path: Path):
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Node didn't log itself — executor calls ensure
rt_logger.ensure_node_logged(
node_id="node-1",
node_name="Search",
node_type="event_loop",
success=False,
error="Crashed",
)
details = store.read_node_details_sync(run_id)
assert len(details) == 1
assert details[0].error == "Crashed"
assert details[0].needs_attention is True
@pytest.mark.asyncio
async def test_large_data_preserved(self, tmp_path: Path):
"""Large tool input/result/llm_text values should be stored in full."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
long_value = "x" * 2000
rt_logger.log_step(
node_id="node-1",
node_type="event_loop",
step_index=0,
verdict="ACCEPT",
tool_calls=[
{
"tool_use_id": "tc_1",
"tool_name": "write_file",
"tool_input": {"content": long_value},
"content": "y" * 5000,
"is_error": False,
}
],
llm_text="z" * 5000,
input_tokens=100,
output_tokens=50,
latency_ms=500,
)
rt_logger.log_node_complete(
node_id="node-1",
node_name="Writer",
node_type="event_loop",
success=True,
total_steps=1,
exit_status="success",
)
await rt_logger.end_run(
status="success",
duration_ms=500,
node_path=["node-1"],
)
tool_logs = await store.load_tool_logs(run_id)
assert tool_logs is not None
tc = tool_logs.steps[0].tool_calls[0]
# Full values preserved
assert len(tc.tool_input["content"]) == 2000
assert len(tc.result) == 5000
assert len(tool_logs.steps[0].llm_text) == 5000
@pytest.mark.asyncio
async def test_end_run_does_not_propagate_exceptions(self, tmp_path: Path):
"""end_run must catch all exceptions and never propagate."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
rt_logger.start_run("goal-1")
# Make the store path unwritable to force an error
import os
bad_path = tmp_path / "logs" / "runs"
bad_path.mkdir(parents=True, exist_ok=True)
# Create a file where directory should be
run_dir = bad_path / rt_logger._run_id
run_dir.mkdir(parents=True, exist_ok=True)
blocker = run_dir / "summary.json"
blocker.write_text("not json")
os.chmod(str(run_dir), 0o444)
try:
# This should NOT raise, even though writing will fail
await rt_logger.end_run("success", duration_ms=100)
finally:
# Restore permissions for cleanup
os.chmod(str(run_dir), 0o755)
@pytest.mark.asyncio
async def test_crash_resilience_l2_l3_survive(self, tmp_path: Path):
"""L2 and L3 data survives even if end_run() is never called (crash)."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Log some steps and a node
rt_logger.log_step(
node_id="node-1",
node_type="event_loop",
step_index=0,
llm_text="Working...",
input_tokens=100,
output_tokens=50,
)
rt_logger.log_step(
node_id="node-1",
node_type="event_loop",
step_index=1,
llm_text="Still working...",
input_tokens=80,
output_tokens=30,
)
rt_logger.log_node_complete(
node_id="node-1",
node_name="Search",
node_type="event_loop",
success=True,
total_steps=2,
input_tokens=180,
output_tokens=80,
)
# Simulate crash: do NOT call end_run()
# Verify L2 and L3 are recoverable from disk
details = await store.load_details(run_id)
assert details is not None
assert len(details.nodes) == 1
assert details.nodes[0].node_id == "node-1"
tool_logs = await store.load_tool_logs(run_id)
assert tool_logs is not None
assert len(tool_logs.steps) == 2
# But no L1 summary exists
summary = await store.load_summary(run_id)
assert summary is None
@pytest.mark.asyncio
async def test_in_progress_run_visible_in_list(self, tmp_path: Path):
"""An in-progress run (no summary.json) appears in list_runs."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Log a step but don't end
rt_logger.log_step(
node_id="node-1",
node_type="event_loop",
step_index=0,
llm_text="Working...",
)
runs = await store.list_runs()
assert len(runs) == 1
assert runs[0].run_id == run_id
assert runs[0].status == "in_progress"
@pytest.mark.asyncio
async def test_log_step_with_error_and_stacktrace(self, tmp_path: Path):
"""Test logging partial steps with errors and stack traces."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Log a partial step with error
rt_logger.log_step(
node_id="node-1",
node_type="event_loop",
step_index=0,
error="LLM call failed: Connection timeout",
stacktrace=(
"Traceback (most recent call last):\n"
" File test.py line 10\n"
" raise TimeoutError()"
),
is_partial=True,
)
# Verify the step was logged
loaded = await store.load_tool_logs(run_id)
assert loaded is not None
assert len(loaded.steps) == 1
step = loaded.steps[0]
assert step.error == "LLM call failed: Connection timeout"
assert "TimeoutError" in step.stacktrace
assert step.is_partial is True
@pytest.mark.asyncio
async def test_log_node_complete_with_stacktrace(self, tmp_path: Path):
"""Test logging node completion with stack traces."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Log node failure with stacktrace
rt_logger.log_node_complete(
node_id="node-1",
node_name="Test Node",
node_type="event_loop",
success=False,
error="Node crashed",
stacktrace=(
"Traceback (most recent call last):\n"
" File node.py line 42\n"
" raise RuntimeError('crash')"
),
)
# Verify the detail was logged with stacktrace
loaded = await store.load_details(run_id)
assert loaded is not None
assert len(loaded.nodes) == 1
node = loaded.nodes[0]
assert node.error == "Node crashed"
assert "RuntimeError" in node.stacktrace
@pytest.mark.asyncio
async def test_attention_flags_excessive_retries(self, tmp_path: Path):
"""Test that excessive retries trigger attention flags."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Log node with excessive retries
rt_logger.log_node_complete(
node_id="node-1",
node_name="Retry Node",
node_type="event_loop",
success=True,
retry_count=5, # > 3 threshold
)
# Verify attention flag is set
loaded = await store.load_details(run_id)
assert loaded is not None
node = loaded.nodes[0]
assert node.needs_attention is True
assert any("Excessive retries" in reason for reason in node.attention_reasons)
@pytest.mark.asyncio
async def test_attention_flags_high_latency(self, tmp_path: Path):
"""Test that high latency triggers attention flags."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Log node with high latency
rt_logger.log_node_complete(
node_id="node-1",
node_name="Slow Node",
node_type="event_loop",
success=True,
latency_ms=65000, # > 60000 threshold
)
# Verify attention flag is set
loaded = await store.load_details(run_id)
assert loaded is not None
node = loaded.nodes[0]
assert node.needs_attention is True
assert any("High latency" in reason for reason in node.attention_reasons)
@pytest.mark.asyncio
async def test_attention_flags_high_token_usage(self, tmp_path: Path):
"""Test that high token usage triggers attention flags."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Log node with high token usage
rt_logger.log_node_complete(
node_id="node-1",
node_name="Token Heavy Node",
node_type="event_loop",
success=True,
tokens_used=150000, # > 100000 threshold
)
# Verify attention flag is set
loaded = await store.load_details(run_id)
assert loaded is not None
node = loaded.nodes[0]
assert node.needs_attention is True
assert any("High token usage" in reason for reason in node.attention_reasons)
@pytest.mark.asyncio
async def test_attention_flags_many_iterations(self, tmp_path: Path):
"""Test that many iterations trigger attention flags."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Log node with many iterations
rt_logger.log_node_complete(
node_id="node-1",
node_name="Iterative Node",
node_type="event_loop",
success=True,
total_steps=25, # > 20 threshold
)
# Verify attention flag is set
loaded = await store.load_details(run_id)
assert loaded is not None
node = loaded.nodes[0]
assert node.needs_attention is True
assert any("Many iterations" in reason for reason in node.attention_reasons)
@pytest.mark.asyncio
async def test_guard_failure_exit_status(self, tmp_path: Path):
"""Test that guard failures use the correct exit status."""
store = RuntimeLogStore(tmp_path / "logs")
rt_logger = RuntimeLogger(store=store, agent_id="test-agent")
run_id = rt_logger.start_run("goal-1")
# Log a guard failure
rt_logger.log_node_complete(
node_id="node-1",
node_name="Guard Node",
node_type="event_loop",
success=False,
error="LLM provider not available",
exit_status="guard_failure",
)
# Verify exit status
loaded = await store.load_details(run_id)
assert loaded is not None
node = loaded.nodes[0]
assert node.exit_status == "guard_failure"
assert node.success is False
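Taken together, these tests imply a simple per-run layout on disk: `runs/<run_id>/summary.json` (L1, written at `end_run()`), `details.jsonl` (L2, one node per line), and `tool_logs.jsonl` (L3, one step per line). A minimal sketch of reading those artifacts back, assuming only that layout:

```python
import json
from pathlib import Path


def read_run_artifacts(logs_dir: Path, run_id: str) -> dict:
    """Load all three log levels for a run; missing files mean the run is still in progress."""
    run_dir = logs_dir / "runs" / run_id

    def read_jsonl(name: str) -> list[dict]:
        path = run_dir / name
        if not path.exists():
            return []
        return [json.loads(line) for line in path.read_text().splitlines() if line.strip()]

    summary_path = run_dir / "summary.json"
    return {
        "summary": json.loads(summary_path.read_text()) if summary_path.exists() else None,
        "details": read_jsonl("details.jsonl"),      # L2: NodeDetail records
        "tool_logs": read_jsonl("tool_logs.jsonl"),  # L3: NodeStepLog records
    }
```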
+16 -1
View File
@@ -1,4 +1,9 @@
"""Tests for the storage module - FileStorage and ConcurrentStorage backends."""
"""Tests for the storage module - FileStorage and ConcurrentStorage backends.
DEPRECATED: FileStorage and ConcurrentStorage are deprecated.
New sessions use unified storage at sessions/{session_id}/state.json.
These tests are kept for backward compatibility verification only.
"""
import json
import time
@@ -38,6 +43,7 @@ def create_test_run(
# === FILESTORAGE TESTS ===
@pytest.mark.skip(reason="FileStorage is deprecated - use unified session storage")
class TestFileStorageBasics:
"""Test basic FileStorage operations."""
@@ -57,6 +63,7 @@ class TestFileStorageBasics:
assert storage.base_path == tmp_path
@pytest.mark.skip(reason="FileStorage is deprecated - use unified session storage")
class TestFileStorageRunOperations:
"""Test FileStorage run CRUD operations."""
@@ -155,6 +162,7 @@ class TestFileStorageRunOperations:
assert result is False
@pytest.mark.skip(reason="FileStorage is deprecated - use unified session storage")
class TestFileStorageIndexing:
"""Test FileStorage index operations."""
@@ -259,6 +267,7 @@ class TestFileStorageIndexing:
assert storage.get_runs_by_node("nonexistent") == []
@pytest.mark.skip(reason="FileStorage is deprecated - use unified session storage")
class TestFileStorageListOperations:
"""Test FileStorage list operations."""
@@ -323,6 +332,7 @@ class TestCacheEntry:
# === CONCURRENTSTORAGE TESTS ===
@pytest.mark.skip(reason="ConcurrentStorage is deprecated - wraps deprecated FileStorage")
class TestConcurrentStorageBasics:
"""Test basic ConcurrentStorage operations."""
@@ -367,6 +377,7 @@ class TestConcurrentStorageBasics:
assert storage._running is False
@pytest.mark.skip(reason="ConcurrentStorage is deprecated - wraps deprecated FileStorage")
class TestConcurrentStorageRunOperations:
"""Test ConcurrentStorage run operations."""
@@ -471,6 +482,7 @@ class TestConcurrentStorageRunOperations:
await storage.stop()
@pytest.mark.skip(reason="ConcurrentStorage is deprecated - wraps deprecated FileStorage")
class TestConcurrentStorageQueryOperations:
"""Test ConcurrentStorage query operations."""
@@ -526,6 +538,7 @@ class TestConcurrentStorageQueryOperations:
await storage.stop()
@pytest.mark.skip(reason="ConcurrentStorage is deprecated - wraps deprecated FileStorage")
class TestConcurrentStorageCacheManagement:
"""Test ConcurrentStorage cache management."""
@@ -565,6 +578,7 @@ class TestConcurrentStorageCacheManagement:
assert stats["valid_entries"] == 1
@pytest.mark.skip(reason="ConcurrentStorage is deprecated - wraps deprecated FileStorage")
class TestConcurrentStorageSyncAPI:
"""Test ConcurrentStorage synchronous API for backward compatibility."""
@@ -598,6 +612,7 @@ class TestConcurrentStorageSyncAPI:
assert loaded is None
@pytest.mark.skip(reason="ConcurrentStorage is deprecated - wraps deprecated FileStorage")
class TestConcurrentStorageStats:
"""Test ConcurrentStorage statistics."""
+1 -1
View File
@@ -152,7 +152,7 @@ Add to `.vscode/settings.json`:
1. **Never commit API keys** - Use environment variables or `.env` files
2. **`.env` is git-ignored** - Copy `.env.example` to `.env` at the project root and fill in your values
3. **Mock mode for testing** - Set `MOCK_MODE=1` to avoid LLM calls during development
3. **Use real provider keys in non-production environments** - Validate configuration with low-risk inputs before production rollout
4. **Credential isolation** - Each tool validates its own credentials at runtime (see the sketch below)
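As a sketch of point 4, a tool can fail fast when its credential is missing; the helper and the `SEARCH_API_KEY` variable name below are illustrative, not part of the framework.

```python
import os


def require_credential(var_name: str) -> str:
    """Fail fast with a clear message when a tool's credential is missing."""
    value = os.environ.get(var_name)
    if not value:
        raise RuntimeError(f"{var_name} is not set; add it to your .env file")
    return value


def web_search(query: str) -> str:
    # Hypothetical tool: validates its own credential at call time.
    api_key = require_credential("SEARCH_API_KEY")  # illustrative variable name
    return f"searching {query!r} with key ending in ...{api_key[-4:]}"
```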
## Troubleshooting
+14 -19
View File
@@ -158,6 +158,7 @@ hive/ # Repository root
│ │ ├── schemas/ # Data schemas
│ │ ├── storage/ # File-based persistence
│ │ ├── testing/ # Testing utilities
│ │ ├── tui/ # Terminal UI dashboard
│ │ └── __init__.py
│ ├── pyproject.toml # Package metadata and dependencies
│ ├── README.md # Framework documentation
@@ -180,6 +181,9 @@ hive/ # Repository root
├── exports/ # AGENT PACKAGES (user-created, gitignored)
│ └── your_agent_name/ # Created via /hive-create
├── examples/ # Example agents
│ └── templates/ # Pre-built template agents
├── docs/ # Documentation
│ ├── getting-started.md # Quick start guide
│ ├── configuration.md # Configuration reference
@@ -287,22 +291,19 @@ If you prefer to build agents manually:
### Running Agents
```bash
# Validate agent structure
PYTHONPATH=exports uv run python -m agent_name validate
# Browse and run agents interactively (Recommended)
hive tui
# Show agent information
PYTHONPATH=exports uv run python -m agent_name info
# Run a specific agent
hive run exports/my_agent --input '{"ticket_content": "My login is broken", "customer_id": "CUST-123"}'
# Run agent with input
PYTHONPATH=exports uv run python -m agent_name run --input '{
"ticket_content": "My login is broken",
"customer_id": "CUST-123"
}'
# Run with TUI dashboard
hive run exports/my_agent --tui
# Run in mock mode (no LLM calls)
PYTHONPATH=exports uv run python -m agent_name run --mock --input '{...}'
```
> **Using Python directly:** `PYTHONPATH=exports uv run python -m agent_name run --input '{...}'`
---
## Testing Agents
@@ -615,16 +616,10 @@ echo 'ANTHROPIC_API_KEY=your-key-here' >> .env
### Debugging Agent Execution
```python
# Add debug logging to your agent
import logging
logging.basicConfig(level=logging.DEBUG)
```bash
# Run with verbose output
PYTHONPATH=exports uv run python -m agent_name run --input '{...}' --verbose
hive run exports/my_agent --verbose --input '{"task": "..."}'
# Use mock mode to test without LLM calls
PYTHONPATH=exports uv run python -m agent_name run --mock --input '{...}'
```
---
+38 -22
View File
@@ -18,6 +18,8 @@ This will:
- Check Python version (requires 3.11+)
- Install the core framework package (`framework`)
- Install the tools package (`aden_tools`)
- Initialize encrypted credential store (`~/.hive/credentials`)
- Configure default LLM provider
- Fix package compatibility issues (openai + litellm)
- Verify all installations
@@ -126,7 +128,32 @@ $env:ANTHROPIC_API_KEY="your-key-here"
## Running Agents
All agent commands must be run from the project root with `PYTHONPATH` set:
The `hive` CLI is the primary interface for running agents:
```bash
# Browse and run agents interactively (Recommended)
hive tui
# Run a specific agent
hive run exports/my_agent --input '{"task": "Your input here"}'
# Run with TUI dashboard
hive run exports/my_agent --tui
```
### CLI Command Reference
| Command | Description |
|---------|-------------|
| `hive tui` | Browse agents and launch TUI dashboard |
| `hive run <path>` | Execute an agent (`--tui`, `--model`, `--mock`, `--quiet`, `--verbose`) |
| `hive shell [path]` | Interactive REPL (`--multi`, `--no-approve`) |
| `hive info <path>` | Show agent details |
| `hive validate <path>` | Validate agent structure |
| `hive list [dir]` | List available agents |
| `hive dispatch [dir]` | Multi-agent orchestration |
### Using Python directly (alternative)
```bash
# From /hive/ directory
@@ -140,24 +167,6 @@ $env:PYTHONPATH="core;exports"
python -m agent_name COMMAND
```
### Example: Support Ticket Agent
```bash
# Validate agent structure
PYTHONPATH=exports uv run python -m your_agent_name validate
# Show agent information
PYTHONPATH=exports uv run python -m your_agent_name info
# Run agent with input
PYTHONPATH=exports uv run python -m your_agent_name run --input '{
"task": "Your input here"
}'
# Run in mock mode (no LLM calls)
PYTHONPATH=exports uv run python -m your_agent_name run --mock --input '{...}'
```
## Building New Agents and Run Flow
Build and run an agent using Claude Code CLI with the agent building skills:
@@ -353,8 +362,11 @@ hive/
│ ├── .venv/ # Created by quickstart.sh
│ └── pyproject.toml
└── exports/ # Agent packages (user-created, gitignored)
 └── your_agent_name/ # Created via /hive-create
├── exports/ # Agent packages (user-created, gitignored)
│ └── your_agent_name/ # Created via /hive-create
└── examples/
 └── templates/ # Pre-built template agents
```
## Separate Virtual Environments
@@ -456,7 +468,11 @@ claude> /hive-test
### 5. Run Agent
```bash
PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
# Interactive dashboard
hive tui
# Or run directly
hive run exports/your_agent_name --input '{"task": "..."}'
```
## IDE Setup
+17 -18
View File
@@ -88,7 +88,8 @@ hive/
│ │ ├── runtime/ # Runtime environment
│ │ ├── schemas/ # Data schemas
│ │ ├── storage/ # File-based persistence
│ │ └── testing/ # Testing utilities
│ │ ├── testing/ # Testing utilities
│ │ └── tui/ # Terminal UI dashboard
│ └── pyproject.toml # Package metadata
├── tools/ # MCP Tools Package
@@ -102,6 +103,9 @@ hive/
├── exports/ # Agent Packages (user-generated, not in repo)
│ └── your_agent/ # Your agents created via /hive
├── examples/
│ └── templates/ # Pre-built template agents
├── .claude/ # Claude Code Skills
│ └── skills/
│ ├── hive/
@@ -116,19 +120,15 @@ hive/
## Running an Agent
```bash
# Validate agent structure
PYTHONPATH=exports uv run python -m my_agent validate
# Browse and run agents interactively (Recommended)
hive tui
# Show agent information
PYTHONPATH=exports uv run python -m my_agent info
# Run a specific agent
hive run exports/my_agent --input '{"task": "Your input here"}'
# Run agent with input
PYTHONPATH=exports uv run python -m my_agent run --input '{
"task": "Your input here"
}'
# Run with TUI dashboard
hive run exports/my_agent --tui
# Run in mock mode (no LLM calls)
PYTHONPATH=exports uv run python -m my_agent run --mock --input '{...}'
```
## API Keys Setup
@@ -164,11 +164,12 @@ PYTHONPATH=exports uv run python -m my_agent test --type success
## Next Steps
1. **Detailed Setup**: See [environment-setup.md](./environment-setup.md)
2. **Developer Guide**: See [developer-guide.md](./developer-guide.md)
3. **Build Agents**: Use `/hive` skill in Claude Code
4. **Custom Tools**: Learn to integrate MCP servers
5. **Join Community**: [Discord](https://discord.com/invite/MXE49hrKDk)
1. **TUI Dashboard**: Run `hive tui` to explore agents interactively
2. **Detailed Setup**: See [environment-setup.md](./environment-setup.md)
3. **Developer Guide**: See [developer-guide.md](./developer-guide.md)
4. **Build Agents**: Use `/hive` skill in Claude Code
5. **Custom Tools**: Learn to integrate MCP servers
6. **Join Community**: [Discord](https://discord.com/invite/MXE49hrKDk)
## Troubleshooting
@@ -194,8 +195,6 @@ uv pip install -e .
# Verify API key is set
echo $ANTHROPIC_API_KEY
# Run in mock mode to test without API
PYTHONPATH=exports uv run python -m my_agent run --mock --input '{...}'
```
### Package Installation Issues
+30 -1
View File
@@ -1,4 +1,27 @@
# TUI Text Selection and Copy Guide
# TUI Dashboard Guide
## Launching the TUI
There are two ways to launch the TUI dashboard:
```bash
# Browse and select an agent interactively
hive tui
# Launch the TUI for a specific agent
hive run exports/my_agent --tui
```
`hive tui` scans both `exports/` and `examples/templates/` for available agents, then presents a selection menu.
## Dashboard Panels
The TUI dashboard is divided into four areas:
- **Status Bar** - Shows the current agent name, execution state, and model in use
- **Graph Overview** - Live visualization of the agent's node graph with the active node highlighted
- **Log Pane** - Scrollable event log streaming node transitions, LLM calls, and tool outputs
- **Chat REPL** - Input area for interacting with client-facing nodes (`ask_user()` prompts appear here)
## Keybindings
@@ -28,3 +51,9 @@ The log pane uses `auto_scroll=False`. New output only scrolls to the bottom whe
## Screenshots
`Ctrl+S` saves an SVG screenshot to the `screenshots/` directory with a timestamped filename. Open the SVG in any browser to view it.
## Tips
- Use `--mock` mode to explore agent execution without spending API credits: `hive run exports/my_agent --tui --mock`
- Override the default model with `--model`: `hive run exports/my_agent --model gpt-4o`
- Screenshots are saved as SVG files to `screenshots/` and can be opened in any browser
@@ -34,18 +34,17 @@ def cli():
@cli.command()
@click.option("--topic", "-t", type=str, required=True, help="Research topic")
@click.option("--mock", is_flag=True, help="Run in mock mode")
@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def run(topic, mock, quiet, verbose, debug):
def run(topic, quiet, verbose, debug):
"""Execute research on a topic."""
if not quiet:
setup_logging(verbose=verbose, debug=debug)
context = {"topic": topic}
result = asyncio.run(default_agent.run(context, mock_mode=mock))
result = asyncio.run(default_agent.run(context))
output_data = {
"success": result.success,
@@ -60,10 +59,9 @@ def run(topic, mock, quiet, verbose, debug):
@cli.command()
@click.option("--mock", is_flag=True, help="Run in mock mode")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def tui(mock, verbose, debug):
def tui(verbose, debug):
"""Launch the TUI dashboard for interactive research."""
setup_logging(verbose=verbose, debug=debug)
@@ -97,13 +95,11 @@ def tui(mock, verbose, debug):
if mcp_config_path.exists():
agent._tool_registry.load_mcp_config(mcp_config_path)
llm = None
if not mock:
llm = LiteLLMProvider(
model=agent.config.model,
api_key=agent.config.api_key,
api_base=agent.config.api_base,
)
llm = LiteLLMProvider(
model=agent.config.model,
api_key=agent.config.api_key,
api_base=agent.config.api_base,
)
tools = list(agent._tool_registry.get_tools().values())
tool_executor = agent._tool_registry.get_executor()
+10 -12
View File
@@ -173,7 +173,7 @@ class DeepResearchAgent:
},
)
def _setup(self, mock_mode=False) -> GraphExecutor:
def _setup(self) -> GraphExecutor:
"""Set up the executor with all components."""
from pathlib import Path
@@ -187,13 +187,11 @@ class DeepResearchAgent:
if mcp_config_path.exists():
self._tool_registry.load_mcp_config(mcp_config_path)
llm = None
if not mock_mode:
llm = LiteLLMProvider(
model=self.config.model,
api_key=self.config.api_key,
api_base=self.config.api_base,
)
llm = LiteLLMProvider(
model=self.config.model,
api_key=self.config.api_key,
api_base=self.config.api_base,
)
tool_executor = self._tool_registry.get_executor()
tools = list(self._tool_registry.get_tools().values())
@@ -213,10 +211,10 @@ class DeepResearchAgent:
return self._executor
async def start(self, mock_mode=False) -> None:
async def start(self) -> None:
"""Set up the agent (initialize executor and tools)."""
if self._executor is None:
self._setup(mock_mode=mock_mode)
self._setup()
async def stop(self) -> None:
"""Clean up resources."""
@@ -244,10 +242,10 @@ class DeepResearchAgent:
)
async def run(
self, context: dict, mock_mode=False, session_state=None
self, context: dict, session_state=None
) -> ExecutionResult:
"""Run the agent (convenience method for single execution)."""
await self.start(mock_mode=mock_mode)
await self.start()
try:
result = await self.trigger_and_wait(
"start", context, session_state=session_state
@@ -1,9 +1,9 @@
{
"hive-tools": {
"transport": "stdio",
"command": "python",
"args": ["mcp_server.py", "--stdio"],
"cwd": "../../tools",
"command": "uv",
"args": ["run", "python", "mcp_server.py", "--stdio"],
"cwd": "../../../tools",
"description": "Hive tools MCP server providing web_search, web_scrape, and write_to_file"
}
}
@@ -1,57 +0,0 @@
# Template: Marketing Content Agent
A multi-channel marketing content generator. Given a product and audience, this agent analyzes the audience, generates tailored copy for multiple channels with A/B variants, and reviews the output for quality.
## Workflow
```
[analyze-audience] → [generate-content] → [review-and-refine]
|
(conditional)
|
needs_revision == True → [generate-content]
needs_revision == False → (done)
```
## Nodes
| Node | Type | Description |
|------|------|-------------|
| `analyze-audience` | `llm_generate` | Produces structured audience analysis |
| `generate-content` | `llm_generate` | Creates per-channel copy with A/B variants |
| `review-and-refine` | `llm_generate` | Reviews and optionally revises content |
## Usage
```bash
# From the repo root
uv run python -m examples.templates.marketing_agent
# With custom input
uv run python -m examples.templates.marketing_agent --input '{
"product_description": "A fitness tracking app",
"target_audience": "Health-conscious millennials",
"brand_voice": "Energetic and motivational",
"channels": ["instagram", "email"]
}'
```
## Customization ideas
- Add a `function` node to call an analytics API and inform audience analysis with real data
- Add a `human_input` pause node before final output for editorial approval
- Swap `llm_generate` nodes to `llm_tool_use` and add web search tools for competitive research
- Add an image generation tool to produce visual assets alongside copy
## File structure
```
marketing_agent/
├── __init__.py # Package exports
├── __main__.py # CLI entry point
├── agent.py # Goal, edges, graph spec, MarketingAgent class
├── config.py # RuntimeConfig and AgentMetadata
├── nodes/
│ └── __init__.py # NodeSpec definitions
└── README.md # This file
```
@@ -1,6 +0,0 @@
"""Marketing Content Agent — template example."""
from .agent import MarketingAgent, goal, edges, nodes
from .config import default_config
__all__ = ["MarketingAgent", "goal", "edges", "nodes", "default_config"]
@@ -1,31 +0,0 @@
"""CLI entry point for Marketing Content Agent."""
import asyncio
import json
import sys
def main():
from .agent import MarketingAgent
from .config import default_config
# Simple CLI — replace with Click for production use
input_data = {
"product_description": "An AI-powered project management tool for remote teams",
"target_audience": "Engineering managers at mid-size tech companies",
"brand_voice": "Professional but approachable, concise, data-driven",
"channels": ["email", "twitter", "linkedin"],
}
# Accept JSON input from command line
if len(sys.argv) > 1 and sys.argv[1] == "--input":
input_data = json.loads(sys.argv[2])
agent = MarketingAgent(config=default_config)
result = asyncio.run(agent.run(input_data))
print(json.dumps(result, indent=2))
if __name__ == "__main__":
main()
-161
View File
@@ -1,161 +0,0 @@
"""Marketing Content Agent — goal, edges, graph spec, and agent class."""
from pathlib import Path
from framework.graph import EdgeCondition, EdgeSpec, Goal, SuccessCriterion, Constraint
from framework.graph.edge import GraphSpec
from framework.graph.executor import GraphExecutor
from framework.runtime.core import Runtime
from framework.llm.anthropic import AnthropicProvider
from .config import default_config, RuntimeConfig
from .nodes import all_nodes
# ---------------------------------------------------------------------------
# Goal
# ---------------------------------------------------------------------------
goal = Goal(
id="marketing-content",
name="Marketing Content Generator",
description=(
"Generate targeted marketing content across multiple channels "
"for a given product and audience."
),
success_criteria=[
SuccessCriterion(
id="audience-analyzed",
description="Audience analysis is produced with demographics and pain points",
metric="output_contains",
target="audience_analysis",
),
SuccessCriterion(
id="content-generated",
description="At least 2 channel-specific content pieces are generated",
metric="custom",
target="len(content) >= 2",
),
SuccessCriterion(
id="variants-provided",
description="A/B variants are provided for each content piece",
metric="custom",
target="all variants present",
),
],
constraints=[
Constraint(
id="no-competitor-names",
description="No competitor brand names in generated content",
constraint_type="hard",
category="safety",
),
Constraint(
id="social-length",
description="Social media content should be under 280 characters",
constraint_type="soft",
category="quality",
),
],
input_schema={
"product_description": {"type": "string"},
"target_audience": {"type": "string"},
"brand_voice": {"type": "string"},
"channels": {"type": "array", "items": {"type": "string"}},
},
output_schema={
"audience_analysis": {"type": "object"},
"content": {"type": "array"},
},
)
# ---------------------------------------------------------------------------
# Edges
# ---------------------------------------------------------------------------
edges = [
EdgeSpec(
id="analyze-to-generate",
source="analyze-audience",
target="generate-content",
condition=EdgeCondition.ON_SUCCESS,
description="After audience analysis, generate content",
),
EdgeSpec(
id="generate-to-review",
source="generate-content",
target="review-and-refine",
condition=EdgeCondition.ON_SUCCESS,
description="After content generation, review and refine",
),
EdgeSpec(
id="review-to-regenerate",
source="review-and-refine",
target="generate-content",
condition=EdgeCondition.CONDITIONAL,
condition_expr="needs_revision == True",
priority=10,
description="If revision needed, loop back to content generation",
),
]
# ---------------------------------------------------------------------------
# Graph structure
# ---------------------------------------------------------------------------
entry_node = "analyze-audience"
entry_points = {"start": "analyze-audience"}
terminal_nodes = ["review-and-refine"]
pause_nodes = []
nodes = all_nodes
# ---------------------------------------------------------------------------
# Agent class
# ---------------------------------------------------------------------------
class MarketingAgent:
"""Multi-channel marketing content generator agent."""
def __init__(self, config: RuntimeConfig | None = None):
self.config = config or default_config
self.goal = goal
self.nodes = nodes
self.edges = edges
self.entry_node = entry_node
self.terminal_nodes = terminal_nodes
self.executor = None
def _build_graph(self) -> GraphSpec:
return GraphSpec(
id="marketing-content-graph",
goal_id=self.goal.id,
entry_node=self.entry_node,
entry_points=entry_points,
terminal_nodes=self.terminal_nodes,
pause_nodes=pause_nodes,
nodes=self.nodes,
edges=self.edges,
default_model=self.config.model,
max_tokens=self.config.max_tokens,
description="Marketing content generation workflow",
)
def _create_executor(self):
runtime = Runtime(storage_path=Path(self.config.storage_path).expanduser())
llm = AnthropicProvider(model=self.config.model)
self.executor = GraphExecutor(runtime=runtime, llm=llm)
return self.executor
async def run(self, context: dict, mock_mode: bool = False) -> dict:
graph = self._build_graph()
executor = self._create_executor()
result = await executor.execute(
graph=graph,
goal=self.goal,
input_data=context,
)
return {
"success": result.success,
"output": result.output,
"steps": result.steps_executed,
"path": result.path,
}
default_agent = MarketingAgent()
@@ -1,26 +0,0 @@
"""Runtime configuration for Marketing Content Agent."""
from dataclasses import dataclass, field
@dataclass
class RuntimeConfig:
model: str = "claude-haiku-4-5-20251001"
max_tokens: int = 2048
storage_path: str = "~/.hive/storage"
mock_mode: bool = False
@dataclass
class AgentMetadata:
name: str = "marketing_agent"
version: str = "0.1.0"
description: str = "Multi-channel marketing content generator"
author: str = ""
tags: list[str] = field(
default_factory=lambda: ["marketing", "content", "template"]
)
default_config = RuntimeConfig()
metadata = AgentMetadata()
@@ -1,106 +0,0 @@
"""Node definitions for Marketing Content Agent."""
from framework.graph import NodeSpec
# ---------------------------------------------------------------------------
# Node 1: Analyze the target audience
# ---------------------------------------------------------------------------
analyze_audience_node = NodeSpec(
id="analyze-audience",
name="Analyze Audience",
description="Produce a structured audience analysis from the product and target audience description.",
node_type="llm_generate",
input_keys=["product_description", "target_audience"],
output_keys=["audience_analysis"],
system_prompt="""\
You are a marketing strategist. Analyze the target audience for a product.
Product: {product_description}
Target audience: {target_audience}
Produce a structured analysis as raw JSON (no markdown):
{{
"audience_analysis": {{
"demographics": "...",
"pain_points": ["..."],
"motivations": ["..."],
"preferred_channels": ["..."],
"messaging_angle": "..."
}}
}}
""",
tools=[],
max_retries=2,
)
# ---------------------------------------------------------------------------
# Node 2: Generate channel-specific content with A/B variants
# ---------------------------------------------------------------------------
generate_content_node = NodeSpec(
id="generate-content",
name="Generate Content",
description="Create marketing copy for each requested channel with two variants per channel.",
node_type="llm_generate",
input_keys=["product_description", "audience_analysis", "brand_voice", "channels"],
output_keys=["content"],
system_prompt="""\
You are a marketing copywriter. Generate content for each channel.
Product: {product_description}
Audience analysis: {audience_analysis}
Brand voice: {brand_voice}
Channels: {channels}
For each channel, produce two variants (A and B).
Output as raw JSON (no markdown):
{{
"content": [
{{
"channel": "twitter",
"variant_a": "...",
"variant_b": "..."
}}
]
}}
""",
tools=[],
max_retries=2,
)
# ---------------------------------------------------------------------------
# Node 3: Review and refine content
# ---------------------------------------------------------------------------
review_and_refine_node = NodeSpec(
id="review-and-refine",
name="Review and Refine",
description="Review generated content for brand voice alignment and channel fit. Revise if needed.",
node_type="llm_generate",
input_keys=["content", "brand_voice"],
output_keys=["content", "needs_revision"],
system_prompt="""\
You are a senior marketing editor. Review the following content for brand
voice alignment, clarity, and channel appropriateness.
Content: {content}
Brand voice: {brand_voice}
If any piece needs revision, fix it and set needs_revision to true.
If everything looks good, return the content unchanged with needs_revision false.
Output as raw JSON (no markdown):
{{
"content": [...],
"needs_revision": false
}}
""",
tools=[],
max_retries=2,
)
# All nodes for easy import
all_nodes = [
analyze_audience_node,
generate_content_node,
review_and_refine_node,
]
@@ -0,0 +1,116 @@
# Tech & AI News Reporter
**Version**: 1.0.0
**Type**: Multi-node agent
**Created**: 2026-02-06
## Overview
Research the latest technology and AI news from the web, summarize key stories, and produce a well-organized report for the user to read.
## Architecture
### Execution Flow
```
intake → research → compile-report
```
### Nodes (3 total)
1. **intake** (event_loop)
- Greet the user and ask if they have specific tech/AI topics to focus on, or if they want a general news roundup.
- Writes: `research_brief`
- Client-facing: Yes (blocks for user input)
2. **research** (event_loop)
- Search the web for recent tech/AI news articles, scrape the top results, and extract key information including titles, summaries, sources, and topics.
- Reads: `research_brief`
- Writes: `articles_data`
- Tools: `web_search, web_scrape`
3. **compile-report** (event_loop)
- Organize the researched articles into a structured HTML report, save it, and deliver a clickable link to the user.
- Reads: `articles_data`
- Writes: `report_file`
- Tools: `save_data, serve_file_to_user`
- Client-facing: No (runs autonomously; delivers the report link when finished)
### Edges (2 total)
- `intake` → `research` (condition: on_success, priority=1)
- `research` → `compile-report` (condition: on_success, priority=1)
## Goal Criteria
### Success Criteria
**Finds recent, relevant tech/AI news articles** (weight 0.25)
- Metric: Number of articles sourced
- Target: 5+ articles
**Covers diverse topics, not just one story** (weight 0.2)
- Metric: Distinct topics covered
- Target: 3+ topics
**Produces a structured, readable report with sections, summaries, and links** (weight 0.25)
- Metric: Report has clear sections and summaries
- Target: Yes
**Includes source attribution with URLs for every story** (weight 0.15)
- Metric: Stories with source URLs
- Target: 100%
**Delivers the report to the user in a viewable format** (weight 0.15)
- Metric: User receives a viewable report
- Target: Yes
### Constraints
**Never fabricate news stories or URLs** (hard)
- Category: quality
**Always attribute sources with links** (hard)
- Category: quality
**Only include news from the past week** (hard)
- Category: quality
## Required Tools
- `save_data`
- `serve_file_to_user`
- `web_scrape`
- `web_search`
## Usage
### Basic Usage
```python
from framework.runner import AgentRunner
# Load the agent
runner = AgentRunner.load("examples/templates/tech_news_reporter")
# Run with input
result = await runner.run({"input_key": "value"})
# Access results
print(result.output)
print(result.status)
```
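Since the entry node `intake` is client-facing and gathers topic preferences interactively, a minimal run can pass an empty input dict. A sketch assuming the same `AgentRunner` API as above:
```python
runner = AgentRunner.load("examples/templates/tech_news_reporter")

# intake declares no required input keys; it asks the user what to cover
result = await runner.run({})

# on success, the terminal compile-report node sets "report_file"
print(result.output)
```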
### Input Schema
The agent's entry node `intake` declares no required input keys; it gathers topic preferences from the user interactively.
### Output Schema
Terminal nodes: `compile-report`
## Version History
- **1.0.0** (2026-02-06): Initial release
- 3 nodes, 2 edges
- Goal: Tech & AI News Reporter
@@ -0,0 +1,23 @@
"""
Tech & AI News Reporter - Research latest tech/AI news and produce reports.
Searches for recent technology and AI news, summarizes key stories,
and delivers a well-organized HTML report for the user to read.
"""
from .agent import TechNewsReporterAgent, default_agent, goal, nodes, edges
from .config import RuntimeConfig, AgentMetadata, default_config, metadata
__version__ = "1.0.0"
__all__ = [
"TechNewsReporterAgent",
"default_agent",
"goal",
"nodes",
"edges",
"RuntimeConfig",
"AgentMetadata",
"default_config",
"metadata",
]
@@ -0,0 +1,223 @@
"""
CLI entry point for Tech & AI News Reporter.
Uses AgentRuntime for multi-entrypoint support with HITL pause/resume.
"""
import asyncio
import json
import logging
import sys
import click
from .agent import default_agent, TechNewsReporterAgent
def setup_logging(verbose=False, debug=False):
"""Configure logging for execution visibility."""
if debug:
level, fmt = logging.DEBUG, "%(asctime)s %(name)s: %(message)s"
elif verbose:
level, fmt = logging.INFO, "%(message)s"
else:
level, fmt = logging.WARNING, "%(levelname)s: %(message)s"
logging.basicConfig(level=level, format=fmt, stream=sys.stderr)
logging.getLogger("framework").setLevel(level)
@click.group()
@click.version_option(version="1.0.0")
def cli():
"""Tech & AI News Reporter - Research and report on latest tech/AI news."""
pass
@cli.command()
@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def run(quiet, verbose, debug):
"""Execute the news reporter agent."""
if not quiet:
setup_logging(verbose=verbose, debug=debug)
context = {}
result = asyncio.run(default_agent.run(context))
output_data = {
"success": result.success,
"steps_executed": result.steps_executed,
"output": result.output,
}
if result.error:
output_data["error"] = result.error
click.echo(json.dumps(output_data, indent=2, default=str))
sys.exit(0 if result.success else 1)
@cli.command()
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def tui(verbose, debug):
"""Launch the TUI dashboard for interactive news reporting."""
setup_logging(verbose=verbose, debug=debug)
try:
from framework.tui.app import AdenTUI
except ImportError:
click.echo(
"TUI requires the 'textual' package. Install with: pip install textual"
)
sys.exit(1)
from pathlib import Path
from framework.llm import LiteLLMProvider
from framework.runner.tool_registry import ToolRegistry
from framework.runtime.agent_runtime import create_agent_runtime
from framework.runtime.event_bus import EventBus
from framework.runtime.execution_stream import EntryPointSpec
async def run_with_tui():
agent = TechNewsReporterAgent()
agent._event_bus = EventBus()
agent._tool_registry = ToolRegistry()
storage_path = Path.home() / ".hive" / "tech_news_reporter"
storage_path.mkdir(parents=True, exist_ok=True)
mcp_config_path = Path(__file__).parent / "mcp_servers.json"
if mcp_config_path.exists():
agent._tool_registry.load_mcp_config(mcp_config_path)
llm = LiteLLMProvider(
model=agent.config.model,
api_key=agent.config.api_key,
api_base=agent.config.api_base,
)
tools = list(agent._tool_registry.get_tools().values())
tool_executor = agent._tool_registry.get_executor()
graph = agent._build_graph()
runtime = create_agent_runtime(
graph=graph,
goal=agent.goal,
storage_path=storage_path,
entry_points=[
EntryPointSpec(
id="start",
name="Start News Report",
entry_node="intake",
trigger_type="manual",
isolation_level="isolated",
),
],
llm=llm,
tools=tools,
tool_executor=tool_executor,
)
await runtime.start()
try:
app = AdenTUI(runtime)
await app.run_async()
finally:
await runtime.stop()
asyncio.run(run_with_tui())
@cli.command()
@click.option("--json", "output_json", is_flag=True)
def info(output_json):
"""Show agent information."""
info_data = default_agent.info()
if output_json:
click.echo(json.dumps(info_data, indent=2))
else:
click.echo(f"Agent: {info_data['name']}")
click.echo(f"Version: {info_data['version']}")
click.echo(f"Description: {info_data['description']}")
click.echo(f"\nNodes: {', '.join(info_data['nodes'])}")
click.echo(f"Client-facing: {', '.join(info_data['client_facing_nodes'])}")
click.echo(f"Entry: {info_data['entry_node']}")
click.echo(f"Terminal: {', '.join(info_data['terminal_nodes'])}")
@cli.command()
def validate():
"""Validate agent structure."""
validation = default_agent.validate()
if validation["valid"]:
click.echo("Agent is valid")
if validation["warnings"]:
for warning in validation["warnings"]:
click.echo(f" WARNING: {warning}")
else:
click.echo("Agent has errors:")
for error in validation["errors"]:
click.echo(f" ERROR: {error}")
sys.exit(0 if validation["valid"] else 1)
@cli.command()
@click.option("--verbose", "-v", is_flag=True)
def shell(verbose):
"""Interactive news reporter session (CLI, no TUI)."""
asyncio.run(_interactive_shell(verbose))
async def _interactive_shell(verbose=False):
"""Async interactive shell."""
setup_logging(verbose=verbose)
click.echo("=== Tech & AI News Reporter ===")
click.echo("Press Enter to get the latest news report (or 'quit' to exit):\n")
agent = TechNewsReporterAgent()
await agent.start()
try:
while True:
try:
user_input = await asyncio.get_event_loop().run_in_executor(
None, input, "News> "
)
if user_input.lower() in ["quit", "exit", "q"]:
click.echo("Goodbye!")
break
click.echo("\nSearching for latest news...\n")
result = await agent.trigger_and_wait("start", {})
if result is None:
click.echo("\n[Execution timed out]\n")
continue
if result.success:
output = result.output
if "report_file" in output:
click.echo(f"\nReport saved: {output['report_file']}\n")
else:
click.echo(f"\nFailed: {result.error}\n")
except KeyboardInterrupt:
click.echo("\nGoodbye!")
break
except Exception as e:
click.echo(f"Error: {e}", err=True)
import traceback
traceback.print_exc()
finally:
await agent.stop()
if __name__ == "__main__":
cli()
@@ -0,0 +1,220 @@
{
"agent": {
"id": "tech_news_reporter",
"name": "Tech & AI News Reporter",
"version": "1.0.0",
"description": "Research the latest technology and AI news from the web, summarize key stories, and produce a well-organized report for the user to read."
},
"graph": {
"id": "tech_news_reporter-graph",
"goal_id": "tech-news-report",
"version": "1.0.0",
"entry_node": "intake",
"entry_points": {
"start": "intake"
},
"pause_nodes": [],
"terminal_nodes": [
"compile-report"
],
"nodes": [
{
"id": "intake",
"name": "Intake",
"description": "Greet the user and ask if they have specific tech/AI topics to focus on, or if they want a general news roundup.",
"node_type": "event_loop",
"input_keys": [],
"output_keys": [
"research_brief"
],
"nullable_output_keys": [],
"input_schema": {},
"output_schema": {},
"system_prompt": "You are the intake assistant for a Tech & AI News Reporter agent.\n\n**STEP 1 — Greet and ask the user:**\nGreet the user and ask what kind of tech/AI news they're interested in today. Offer options like:\n- General tech & AI roundup (covers everything notable)\n- Specific topics (e.g., LLMs, robotics, startups, cybersecurity, semiconductors)\n- A particular company or product\n\nKeep it brief and friendly. If the user already stated a preference in their initial message, acknowledge it.\n\nAfter your greeting, call ask_user() to wait for the user's response.\n\n**STEP 2 — After the user responds, call set_output:**\n- set_output(\"research_brief\", \"<a clear, concise description of what to search for based on the user's preferences>\")\n\nIf the user just wants a general roundup, set: \"General tech and AI news roundup covering the most notable stories from the past week\"",
"tools": [],
"model": null,
"function": null,
"routes": {},
"max_retries": 3,
"retry_on": [],
"max_node_visits": 1,
"output_model": null,
"max_validation_retries": 2,
"client_facing": true
},
{
"id": "research",
"name": "Research",
"description": "Search the web for recent tech/AI news articles, scrape the top results, and extract key information including titles, summaries, sources, and topics.",
"node_type": "event_loop",
"input_keys": [
"research_brief"
],
"output_keys": [
"articles_data"
],
"nullable_output_keys": [],
"input_schema": {},
"output_schema": {},
"system_prompt": "You are a news researcher for a Tech & AI News Reporter agent.\n\nYour task: Find and summarize recent tech/AI news based on the research_brief.\n\n**Instructions:**\n1. Use web_search to find recent tech and AI news articles. Run multiple searches with different queries to get diverse coverage (e.g., \"latest AI news this week\", \"tech industry news today\", topic-specific queries from the brief).\n2. Pick the 5-10 most interesting and significant articles from the search results.\n3. Use web_scrape on each selected article to get the full content.\n4. For each article, extract: title, source name, URL, publication date, a 2-3 sentence summary, and the main topic category.\n\n**Output format:**\nUse set_output(\"articles_data\", <JSON string>) with this structure:\n```json\n{\n \"articles\": [\n {\n \"title\": \"Article Title\",\n \"source\": \"Source Name\",\n \"url\": \"https://...\",\n \"date\": \"2026-02-05\",\n \"summary\": \"2-3 sentence summary of the key points.\",\n \"topic\": \"AI / Semiconductors / Startups / etc.\"\n }\n ],\n \"search_date\": \"2026-02-06\",\n \"topics_covered\": [\"AI\", \"Semiconductors\", \"...\"]\n}\n```\n\n**Rules:**\n- Only include REAL articles with REAL URLs you found via search. Never fabricate.\n- Focus on news from the past week.\n- Aim for at least 3 distinct topic categories.\n- Keep summaries factual and concise.",
"tools": [
"web_search",
"web_scrape"
],
"model": null,
"function": null,
"routes": {},
"max_retries": 3,
"retry_on": [],
"max_node_visits": 1,
"output_model": null,
"max_validation_retries": 2,
"client_facing": false
},
{
"id": "compile-report",
"name": "Compile Report",
"description": "Organize the researched articles into a structured HTML report, save it, and deliver a clickable link to the user.",
"node_type": "event_loop",
"input_keys": [
"articles_data"
],
"output_keys": [
"report_file"
],
"nullable_output_keys": [],
"input_schema": {},
"output_schema": {},
"system_prompt": "You are the report compiler for a Tech & AI News Reporter agent.\n\nYour task: Turn the articles_data into a polished, readable HTML report and deliver it to the user.\n\n**Instructions:**\n1. Parse the articles_data JSON to get the list of articles.\n2. Generate a well-structured HTML report with:\n - A header with the report title and date\n - A table of contents / summary section listing topics covered\n - Articles grouped by topic category\n - For each article: title (linked to source URL), source name, date, and summary\n - Clean, readable styling (inline CSS)\n3. Use save_data to save the HTML report as \"tech_news_report.html\".\n4. Use serve_file_to_user to get a clickable link for the user.\n\n**STEP 1 — Respond to the user (text only, NO tool calls):**\nPresent a brief text summary of the report highlights — how many articles, what topics are covered, and a few headline highlights. Tell the user you're generating their full report now.\n\n**STEP 2 — After presenting the summary, save and serve the report:**\n- save_data(filename=\"tech_news_report.html\", data=<html_content>, data_dir=<data_dir>)\n- serve_file_to_user(filename=\"tech_news_report.html\", data_dir=<data_dir>, label=\"Tech & AI News Report\", open_in_browser=True)\n- set_output(\"report_file\", \"tech_news_report.html\")\n\nThe report will auto-open in the user's default browser. Let them know the report has been opened.",
"tools": [
"save_data",
"serve_file_to_user"
],
"model": null,
"function": null,
"routes": {},
"max_retries": 3,
"retry_on": [],
"max_node_visits": 1,
"output_model": null,
"max_validation_retries": 2,
"client_facing": false
}
],
"edges": [
{
"id": "intake-to-research",
"source": "intake",
"target": "research",
"condition": "on_success",
"condition_expr": null,
"priority": 1,
"input_mapping": {}
},
{
"id": "research-to-compile-report",
"source": "research",
"target": "compile-report",
"condition": "on_success",
"condition_expr": null,
"priority": 1,
"input_mapping": {}
}
],
"max_steps": 100,
"max_retries_per_node": 3,
"description": "Research the latest technology and AI news from the web, summarize key stories, and produce a well-organized report for the user to read.",
"created_at": "2026-02-06T08:42:51.476802"
},
"goal": {
"id": "tech-news-report",
"name": "Tech & AI News Reporter",
"description": "Research the latest technology and AI news from the web, summarize key stories, and produce a well-organized report for the user to read.",
"status": "draft",
"success_criteria": [
{
"id": "sc-find-articles",
"description": "Finds recent, relevant tech/AI news articles",
"metric": "Number of articles sourced",
"target": "5+ articles",
"weight": 0.25,
"met": false
},
{
"id": "sc-diverse-topics",
"description": "Covers diverse topics, not just one story",
"metric": "Distinct topics covered",
"target": "3+ topics",
"weight": 0.2,
"met": false
},
{
"id": "sc-structured-report",
"description": "Produces a structured, readable report with sections, summaries, and links",
"metric": "Report has clear sections and summaries",
"target": "Yes",
"weight": 0.25,
"met": false
},
{
"id": "sc-source-attribution",
"description": "Includes source attribution with URLs for every story",
"metric": "Stories with source URLs",
"target": "100%",
"weight": 0.15,
"met": false
},
{
"id": "sc-deliver-report",
"description": "Delivers the report to the user in a viewable format",
"metric": "User receives a viewable report",
"target": "Yes",
"weight": 0.15,
"met": false
}
],
"constraints": [
{
"id": "c-no-fabrication",
"description": "Never fabricate news stories or URLs",
"constraint_type": "hard",
"category": "quality",
"check": ""
},
{
"id": "c-source-attribution",
"description": "Always attribute sources with links",
"constraint_type": "hard",
"category": "quality",
"check": ""
},
{
"id": "c-recent-news",
"description": "Only include news from the past week",
"constraint_type": "hard",
"category": "quality",
"check": ""
}
],
"context": {},
"required_capabilities": [],
"input_schema": {},
"output_schema": {},
"version": "1.0.0",
"parent_version": null,
"evolution_reason": null,
"created_at": "2026-02-06 08:39:00.123362",
"updated_at": "2026-02-06 08:39:00.123364"
},
"required_tools": [
"web_scrape",
"save_data",
"serve_file_to_user",
"web_search"
],
"metadata": {
"created_at": "2026-02-06T08:42:51.476862",
"node_count": 3,
"edge_count": 2
}
}
@@ -0,0 +1,293 @@
"""Agent graph construction for Tech & AI News Reporter."""
from framework.graph import EdgeSpec, EdgeCondition, Goal, SuccessCriterion, Constraint
from framework.graph.edge import GraphSpec
from framework.graph.executor import ExecutionResult, GraphExecutor
from framework.runtime.event_bus import EventBus
from framework.runtime.core import Runtime
from framework.llm import LiteLLMProvider
from framework.runner.tool_registry import ToolRegistry
from .config import default_config, metadata
from .nodes import (
intake_node,
research_node,
compile_report_node,
)
# Goal definition
goal = Goal(
id="tech-news-report",
name="Tech & AI News Reporter",
description=(
"Research the latest technology and AI news from the web, "
"summarize key stories, and produce a well-organized report "
"for the user to read."
),
success_criteria=[
SuccessCriterion(
id="sc-find-articles",
description="Finds recent, relevant tech/AI news articles",
metric="articles_sourced",
target=">=5",
weight=0.25,
),
SuccessCriterion(
id="sc-diverse-topics",
description="Covers diverse topics, not just one story",
metric="topics_covered",
target=">=3",
weight=0.2,
),
SuccessCriterion(
id="sc-structured-report",
description="Produces a structured, readable report with sections, summaries, and links",
metric="report_structured",
target="true",
weight=0.25,
),
SuccessCriterion(
id="sc-source-attribution",
description="Includes source attribution with URLs for every story",
metric="source_attribution",
target="100%",
weight=0.15,
),
SuccessCriterion(
id="sc-deliver-report",
description="Delivers the report to the user in a viewable format",
metric="report_delivered",
target="true",
weight=0.15,
),
],
constraints=[
Constraint(
id="c-no-fabrication",
description="Never fabricate news stories or URLs",
constraint_type="hard",
category="quality",
),
Constraint(
id="c-source-attribution",
description="Always attribute sources with links",
constraint_type="hard",
category="quality",
),
Constraint(
id="c-recent-news",
description="Only include news from the past week",
constraint_type="hard",
category="quality",
),
],
)
# Node list
nodes = [
intake_node,
research_node,
compile_report_node,
]
# Edge definitions
edges = [
EdgeSpec(
id="intake-to-research",
source="intake",
target="research",
condition=EdgeCondition.ON_SUCCESS,
priority=1,
),
EdgeSpec(
id="research-to-compile-report",
source="research",
target="compile-report",
condition=EdgeCondition.ON_SUCCESS,
priority=1,
),
]
# Graph configuration
entry_node = "intake"
entry_points = {"start": "intake"}
pause_nodes = []
terminal_nodes = ["compile-report"]
class TechNewsReporterAgent:
"""
Tech & AI News Reporter: 3-node pipeline.
Flow: intake -> research -> compile-report
"""
def __init__(self, config=None):
self.config = config or default_config
self.goal = goal
self.nodes = nodes
self.edges = edges
self.entry_node = entry_node
self.entry_points = entry_points
self.pause_nodes = pause_nodes
self.terminal_nodes = terminal_nodes
self._executor: GraphExecutor | None = None
self._graph: GraphSpec | None = None
self._event_bus: EventBus | None = None
self._tool_registry: ToolRegistry | None = None
def _build_graph(self) -> GraphSpec:
"""Build the GraphSpec."""
return GraphSpec(
id="tech-news-reporter-graph",
goal_id=self.goal.id,
version="1.0.0",
entry_node=self.entry_node,
entry_points=self.entry_points,
terminal_nodes=self.terminal_nodes,
pause_nodes=self.pause_nodes,
nodes=self.nodes,
edges=self.edges,
default_model=self.config.model,
max_tokens=self.config.max_tokens,
loop_config={
"max_iterations": 50,
"max_tool_calls_per_turn": 10,
"max_history_tokens": 32000,
},
)
def _setup(self) -> GraphExecutor:
"""Set up the executor with all components."""
from pathlib import Path
storage_path = Path.home() / ".hive" / "tech_news_reporter"
storage_path.mkdir(parents=True, exist_ok=True)
self._event_bus = EventBus()
self._tool_registry = ToolRegistry()
mcp_config_path = Path(__file__).parent / "mcp_servers.json"
if mcp_config_path.exists():
self._tool_registry.load_mcp_config(mcp_config_path)
llm = LiteLLMProvider(
model=self.config.model,
api_key=self.config.api_key,
api_base=self.config.api_base,
)
tool_executor = self._tool_registry.get_executor()
tools = list(self._tool_registry.get_tools().values())
self._graph = self._build_graph()
runtime = Runtime(storage_path)
self._executor = GraphExecutor(
runtime=runtime,
llm=llm,
tools=tools,
tool_executor=tool_executor,
event_bus=self._event_bus,
storage_path=storage_path,
loop_config=self._graph.loop_config,
)
return self._executor
async def start(self) -> None:
"""Set up the agent (initialize executor and tools)."""
if self._executor is None:
self._setup()
async def stop(self) -> None:
"""Clean up resources."""
self._executor = None
self._event_bus = None
async def trigger_and_wait(
self,
entry_point: str,
input_data: dict,
timeout: float | None = None,
session_state: dict | None = None,
) -> ExecutionResult | None:
"""Execute the graph and wait for completion."""
if self._executor is None:
raise RuntimeError("Agent not started. Call start() first.")
if self._graph is None:
raise RuntimeError("Graph not built. Call start() first.")
return await self._executor.execute(
graph=self._graph,
goal=self.goal,
input_data=input_data,
session_state=session_state,
)
async def run(
self, context: dict, session_state=None
) -> ExecutionResult:
"""Run the agent (convenience method for single execution)."""
await self.start()
try:
result = await self.trigger_and_wait(
"start", context, session_state=session_state
)
return result or ExecutionResult(success=False, error="Execution timeout")
finally:
await self.stop()
def info(self):
"""Get agent information."""
return {
"name": metadata.name,
"version": metadata.version,
"description": metadata.description,
"goal": {
"name": self.goal.name,
"description": self.goal.description,
},
"nodes": [n.id for n in self.nodes],
"edges": [e.id for e in self.edges],
"entry_node": self.entry_node,
"entry_points": self.entry_points,
"pause_nodes": self.pause_nodes,
"terminal_nodes": self.terminal_nodes,
"client_facing_nodes": [n.id for n in self.nodes if n.client_facing],
}
def validate(self):
"""Validate agent structure."""
errors = []
warnings = []
node_ids = {node.id for node in self.nodes}
for edge in self.edges:
if edge.source not in node_ids:
errors.append(f"Edge {edge.id}: source '{edge.source}' not found")
if edge.target not in node_ids:
errors.append(f"Edge {edge.id}: target '{edge.target}' not found")
if self.entry_node not in node_ids:
errors.append(f"Entry node '{self.entry_node}' not found")
for terminal in self.terminal_nodes:
if terminal not in node_ids:
errors.append(f"Terminal node '{terminal}' not found")
for ep_id, node_id in self.entry_points.items():
if node_id not in node_ids:
errors.append(
f"Entry point '{ep_id}' references unknown node '{node_id}'"
)
return {
"valid": len(errors) == 0,
"errors": errors,
"warnings": warnings,
}
# Create default instance
default_agent = TechNewsReporterAgent()
@@ -0,0 +1,46 @@
"""Runtime configuration."""
import json
from dataclasses import dataclass, field
from pathlib import Path
def _load_preferred_model() -> str:
"""Load preferred model from ~/.hive/configuration.json."""
config_path = Path.home() / ".hive" / "configuration.json"
if config_path.exists():
try:
with open(config_path) as f:
config = json.load(f)
llm = config.get("llm", {})
if llm.get("provider") and llm.get("model"):
return f"{llm['provider']}/{llm['model']}"
except Exception:
pass
return "anthropic/claude-sonnet-4-20250514"
@dataclass
class RuntimeConfig:
model: str = field(default_factory=_load_preferred_model)
temperature: float = 0.7
max_tokens: int = 40000
api_key: str | None = None
api_base: str | None = None
default_config = RuntimeConfig()
@dataclass
class AgentMetadata:
name: str = "Tech & AI News Reporter"
version: str = "1.0.0"
description: str = (
"Research the latest technology and AI news from the web, "
"summarize key stories, and produce a well-organized report "
"for the user to read."
)
metadata = AgentMetadata()
@@ -0,0 +1,9 @@
{
"hive-tools": {
"transport": "stdio",
"command": "uv",
"args": ["run", "python", "mcp_server.py", "--stdio"],
"cwd": "../../../tools",
"description": "Hive tools MCP server providing web_search, web_scrape, save_data, and serve_file_to_user"
}
}
@@ -0,0 +1,151 @@
"""Node definitions for Tech & AI News Reporter."""
from framework.graph import NodeSpec
# Node 1: Intake (client-facing)
# Brief conversation to understand what topics the user cares about.
intake_node = NodeSpec(
id="intake",
name="Intake",
description="Greet the user and ask if they have specific tech/AI topics to focus on, or if they want a general news roundup.",
node_type="event_loop",
client_facing=True,
input_keys=[],
output_keys=["research_brief"],
system_prompt="""\
You are the intake assistant for a Tech & AI News Reporter agent.
**STEP 1 — Greet and ask the user:**
Greet the user and ask what kind of tech/AI news they're interested in today. Offer options like:
- General tech & AI roundup (covers everything notable)
- Specific topics (e.g., LLMs, robotics, startups, cybersecurity, semiconductors)
- A particular company or product
Keep it brief and friendly. If the user already stated a preference in their initial message, acknowledge it.
After your greeting, call ask_user() to wait for the user's response.
**STEP 2 — After the user responds, call set_output:**
- set_output("research_brief", "<a clear, concise description of what to search for based on the user's preferences>")
If the user just wants a general roundup, set: "General tech and AI news roundup covering the most notable stories from the past week"
""",
tools=[],
)
# Node 2: Research
# Scrapes known tech news sites directly — no API keys needed.
research_node = NodeSpec(
id="research",
name="Research",
description="Scrape well-known tech news sites for recent articles and extract key information including titles, summaries, sources, and topics.",
node_type="event_loop",
input_keys=["research_brief"],
output_keys=["articles_data"],
system_prompt="""\
You are a news researcher for a Tech & AI News Reporter agent.
Your task: Find and summarize recent tech/AI news based on the research_brief.
You do NOT have web search — instead, scrape news directly from known sites.
**Instructions:**
1. Use web_scrape to fetch the front/latest pages of these tech news sources.
IMPORTANT: Always set max_length=5000 and include_links=true for front pages
so you get headlines and links without blowing up context.
Scrape these (pick 3-4, not all 5, to stay efficient):
- https://news.ycombinator.com (Hacker News tech community picks)
- https://techcrunch.com (startups, AI, tech industry)
- https://www.theverge.com/tech (consumer tech, AI, policy)
- https://arstechnica.com (in-depth tech, science, AI)
- https://www.technologyreview.com (MIT AI, emerging tech)
If the research_brief requests specific topics, also try relevant category pages
(e.g., https://techcrunch.com/category/artificial-intelligence/).
2. From the scraped front pages, identify the most interesting and recent headlines.
Pick 5-8 article URLs total across all sources, prioritizing:
- Relevance to the research_brief
- Recency (past week)
- Significance and diversity of topics
3. For each selected article, use web_scrape with max_length=3000 on the
individual article URL to get the content. Extract: title, source name,
URL, publication date, a 2-3 sentence summary, and the main topic category.
**Output format:**
Use set_output("articles_data", <JSON string>) with this structure:
```json
{
"articles": [
{
"title": "Article Title",
"source": "Source Name",
"url": "https://...",
"date": "2026-02-05",
"summary": "2-3 sentence summary of the key points.",
"topic": "AI / Semiconductors / Startups / etc."
}
],
"search_date": "2026-02-06",
"topics_covered": ["AI", "Semiconductors", "..."]
}
```
**Rules:**
- Only include REAL articles with REAL URLs you scraped. Never fabricate.
- Focus on news from the past week.
- Aim for at least 3 distinct topic categories.
- Keep summaries factual and concise.
- If a site fails to load, skip it and move on to the next.
- Always use max_length to limit scraped content (5000 for front pages, 3000 for articles).
- Work in batches: scrape front pages first, then articles. Don't scrape everything at once.
""",
tools=["web_scrape"],
)
# Node 3: Compile Report
# Turns research into a polished HTML report and delivers it.
# Not client-facing: it does autonomous work (no user interaction needed).
compile_report_node = NodeSpec(
id="compile-report",
name="Compile Report",
description="Organize the researched articles into a structured HTML report, save it, and deliver a clickable link to the user.",
node_type="event_loop",
client_facing=False,
input_keys=["articles_data"],
output_keys=["report_file"],
system_prompt="""\
You are the report compiler for a Tech & AI News Reporter agent.
Your task: Turn the articles_data into a polished, readable HTML report and deliver it to the user.
**Instructions:**
1. Parse the articles_data JSON to get the list of articles.
2. Generate a well-structured HTML report with:
- A header with the report title and date
- A table of contents / summary section listing topics covered
- Articles grouped by topic category
- For each article: title (linked to source URL), source name, date, and summary
- Clean, readable styling (inline CSS)
3. Use save_data to save the HTML report as "tech_news_report.html".
4. Use serve_file_to_user to get a clickable link for the user.
**STEP 1 — Respond to the user (text only, NO tool calls):**
Present a brief text summary of the report highlights — how many articles, what topics are covered, and a few headline highlights. Tell the user you're generating their full report now.
**STEP 2 — After presenting the summary, save and serve the report:**
- save_data(filename="tech_news_report.html", data=<html_content>, data_dir=<data_dir>)
- serve_file_to_user(filename="tech_news_report.html", data_dir=<data_dir>, label="Tech & AI News Report", open_in_browser=True)
- set_output("report_file", "tech_news_report.html")
The report will auto-open in the user's default browser. Let them know the report has been opened.
""",
tools=["save_data", "serve_file_to_user"],
)
__all__ = [
"intake_node",
"research_node",
"compile_report_node",
]
@@ -18,8 +18,8 @@ PYTHONPATH=core:exports uv run python -m twitter_outreach validate
# Show agent info
PYTHONPATH=core:exports uv run python -m twitter_outreach info
# Run in mock mode (no API calls)
PYTHONPATH=core:exports uv run python -m twitter_outreach run --mock
# Run the workflow
PYTHONPATH=core:exports uv run python -m twitter_outreach run
# Launch the TUI
PYTHONPATH=core:exports uv run python -m twitter_outreach tui
@@ -33,16 +33,15 @@ def cli():
@cli.command()
@click.option("--mock", is_flag=True, help="Run in mock mode")
@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def run(mock, quiet, verbose, debug):
def run(quiet, verbose, debug):
"""Execute the outreach workflow."""
if not quiet:
setup_logging(verbose=verbose, debug=debug)
result = asyncio.run(default_agent.run({}, mock_mode=mock))
result = asyncio.run(default_agent.run({}))
output_data = {
"success": result.success,
@@ -57,10 +56,9 @@ def run(mock, quiet, verbose, debug):
@cli.command()
@click.option("--mock", is_flag=True, help="Run in mock mode")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def tui(mock, verbose, debug):
def tui(verbose, debug):
"""Launch the TUI dashboard for interactive outreach."""
setup_logging(verbose=verbose, debug=debug)
@@ -93,13 +91,11 @@ def tui(mock, verbose, debug):
if mcp_config_path.exists():
agent._tool_registry.load_mcp_config(mcp_config_path)
llm = None
if not mock:
llm = LiteLLMProvider(
model=agent.config.model,
api_key=agent.config.api_key,
api_base=agent.config.api_base,
)
llm = LiteLLMProvider(
model=agent.config.model,
api_key=agent.config.api_key,
api_base=agent.config.api_base,
)
tools = list(agent._tool_registry.get_tools().values())
tool_executor = agent._tool_registry.get_executor()
+10 -12
View File
@@ -172,7 +172,7 @@ class TwitterOutreachAgent:
},
)
def _setup(self, mock_mode=False) -> GraphExecutor:
def _setup(self) -> GraphExecutor:
"""Set up the executor with all components."""
from pathlib import Path
@@ -186,13 +186,11 @@ class TwitterOutreachAgent:
if mcp_config_path.exists():
self._tool_registry.load_mcp_config(mcp_config_path)
llm = None
if not mock_mode:
llm = LiteLLMProvider(
model=self.config.model,
api_key=self.config.api_key,
api_base=self.config.api_base,
)
llm = LiteLLMProvider(
model=self.config.model,
api_key=self.config.api_key,
api_base=self.config.api_base,
)
tool_executor = self._tool_registry.get_executor()
tools = list(self._tool_registry.get_tools().values())
@@ -212,10 +210,10 @@ class TwitterOutreachAgent:
return self._executor
async def start(self, mock_mode=False) -> None:
async def start(self) -> None:
"""Set up the agent (initialize executor and tools)."""
if self._executor is None:
self._setup(mock_mode=mock_mode)
self._setup()
async def stop(self) -> None:
"""Clean up resources."""
@@ -243,10 +241,10 @@ class TwitterOutreachAgent:
)
async def run(
self, context: dict, mock_mode=False, session_state=None
self, context: dict, session_state=None
) -> ExecutionResult:
"""Run the agent (convenience method for single execution)."""
await self.start(mock_mode=mock_mode)
await self.start()
try:
result = await self.trigger_and_wait(
"start", context, session_state=session_state
@@ -1,9 +1,9 @@
{
"hive-tools": {
"transport": "stdio",
"command": "python",
"args": ["mcp_server.py", "--stdio"],
"cwd": "../../tools",
"command": "uv",
"args": ["run", "python", "mcp_server.py", "--stdio"],
"cwd": "../../../tools",
"description": "Hive tools MCP server providing web_search, web_scrape, and send_email"
}
}
Executable
+62
View File
@@ -0,0 +1,62 @@
#!/usr/bin/env bash
#
# Wrapper script for the Hive CLI.
# Uses uv to run the hive command in the project's virtual environment.
#
# Usage:
# ./hive tui - Launch interactive agent dashboard
# ./hive run <agent> - Run an agent
# ./hive --help - Show all commands
#
set -e
# Resolve symlinks to find the real script location
SOURCE="${BASH_SOURCE[0]}"
while [ -L "$SOURCE" ]; do
DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
SOURCE="$(readlink "$SOURCE")"
# Handle relative symlinks
[[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE"
done
SCRIPT_DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
# Verify user is running from the hive project directory
USER_CWD="$(pwd)"
if [ "$USER_CWD" != "$SCRIPT_DIR" ]; then
echo "Error: hive must be run from the project directory." >&2
echo "" >&2
echo " Current directory: $USER_CWD" >&2
echo " Expected directory: $SCRIPT_DIR" >&2
echo "" >&2
echo "Run: cd $SCRIPT_DIR" >&2
exit 1
fi
cd "$SCRIPT_DIR"
# Verify this is a valid Hive project directory
if [ ! -f "$SCRIPT_DIR/pyproject.toml" ] || [ ! -d "$SCRIPT_DIR/core" ]; then
echo "Error: Not a valid Hive project directory: $SCRIPT_DIR" >&2
echo "" >&2
echo "The hive CLI must be run from a Hive project root." >&2
echo "Expected files: pyproject.toml, core/" >&2
exit 1
fi
if [ ! -d "$SCRIPT_DIR/.venv" ]; then
echo "Error: Virtual environment not found." >&2
echo "" >&2
echo "Run ./quickstart.sh first to set up the project." >&2
exit 1
fi
# Ensure uv is in PATH (common install locations)
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
if ! command -v uv &> /dev/null; then
echo "Error: uv is not installed. Run ./quickstart.sh first." >&2
exit 1
fi
exec uv run hive "$@"
+38 -1
View File
@@ -709,6 +709,38 @@ if [ $ERRORS -gt 0 ]; then
exit 1
fi
# ============================================================
# Step 7: Install hive CLI globally
# ============================================================
echo -e "${YELLOW}${NC} ${BLUE}${BOLD}Step 7: Installing hive CLI...${NC}"
echo ""
# Ensure ~/.local/bin exists and is in PATH
mkdir -p "$HOME/.local/bin"
# Create/update symlink
HIVE_SCRIPT="$SCRIPT_DIR/hive"
HIVE_LINK="$HOME/.local/bin/hive"
if [ -L "$HIVE_LINK" ] || [ -e "$HIVE_LINK" ]; then
rm -f "$HIVE_LINK"
fi
ln -s "$HIVE_SCRIPT" "$HIVE_LINK"
echo -e "${GREEN} ✓ hive CLI installed to ~/.local/bin/hive${NC}"
# Check if ~/.local/bin is in PATH
if echo "$PATH" | grep -q "$HOME/.local/bin"; then
echo -e "${GREEN} ✓ ~/.local/bin is in PATH${NC}"
else
echo -e "${YELLOW} ⚠ Add ~/.local/bin to your PATH:${NC}"
echo -e " ${DIM}echo 'export PATH=\"\$HOME/.local/bin:\$PATH\"' >> ~/.bashrc${NC}"
echo -e " ${DIM}source ~/.bashrc${NC}"
fi
echo ""
# ============================================================
# Success!
# ============================================================
@@ -740,7 +772,12 @@ if [ -n "$HIVE_CREDENTIAL_KEY" ]; then
echo ""
fi
echo -e "${BOLD}Quick Start:${NC}"
echo -e "${BOLD}Run an Agent:${NC}"
echo ""
echo -e " Launch the interactive dashboard to browse and run agents:"
echo -e " ${CYAN}hive tui${NC}"
echo ""
echo -e "${BOLD}Build a New Agent:${NC}"
echo ""
echo -e " 1. Open Claude Code in this directory:"
echo -e " ${CYAN}claude${NC}"
@@ -353,7 +353,18 @@ class CredentialStoreAdapter:
cls,
specs: dict[str, CredentialSpec] | None = None,
) -> CredentialStoreAdapter:
"""Create adapter with encrypted storage primary and env var fallback."""
"""Create adapter with encrypted storage primary and env var fallback.
When ADEN_API_KEY is set, builds the store with AdenSyncProvider and
AdenCachedStorage so that OAuth credentials (Google, HubSpot, Slack)
auto-refresh via the Aden server. Non-Aden credentials (brave_search,
anthropic, resend) still resolve from environment variables.
When ADEN_API_KEY is not set, behaves identically to before.
"""
import logging
import os
from framework.credentials import CredentialStore
from framework.credentials.storage import (
CompositeStorage,
@@ -361,6 +372,8 @@ class CredentialStoreAdapter:
EnvVarStorage,
)
log = logging.getLogger(__name__)
if specs is None:
from . import CREDENTIAL_SPECS
@@ -368,17 +381,69 @@ class CredentialStoreAdapter:
env_mapping = {name: spec.env_var for name, spec in specs.items()}
# --- Aden sync branch ---
# Note: we don't use CredentialStore.with_aden_sync() here because it
# only wraps EncryptedFileStorage. We need CompositeStorage (encrypted
# + env var fallback) so non-Aden credentials like brave_search still
# resolve from environment variables.
aden_api_key = os.environ.get("ADEN_API_KEY")
if aden_api_key:
try:
from framework.credentials.aden import (
AdenCachedStorage,
AdenClientConfig,
AdenCredentialClient,
AdenSyncProvider,
)
# Local storage: encrypted primary + env var fallback
encrypted = EncryptedFileStorage()
env = EnvVarStorage(env_mapping)
local_composite = CompositeStorage(primary=encrypted, fallbacks=[env])
# Aden components
client = AdenCredentialClient(
AdenClientConfig(
base_url=os.environ.get("ADEN_API_URL", "https://api.adenhq.com"),
)
)
provider = AdenSyncProvider(client=client)
# AdenCachedStorage wraps composite, giving Aden priority
cached_storage = AdenCachedStorage(
local_storage=local_composite,
aden_provider=provider,
cache_ttl_seconds=300,
)
store = CredentialStore(
storage=cached_storage,
providers=[provider],
auto_refresh=True,
)
# Initial sync: populate local cache from Aden
try:
synced = provider.sync_all(store)
log.info("Aden credential sync complete: %d credentials synced", synced)
except Exception as e:
log.warning("Aden initial sync failed (will retry on access): %s", e)
return cls(store=store, specs=specs)
except Exception as e:
log.warning(
"Aden credential sync unavailable, falling back to default storage: %s", e
)
# --- Default branch (no ADEN_API_KEY or Aden setup failed) ---
try:
encrypted = EncryptedFileStorage()
env = EnvVarStorage(env_mapping)
composite = CompositeStorage(primary=encrypted, fallbacks=[env])
store = CredentialStore(storage=composite)
except Exception as e:
import logging
logging.getLogger(__name__).warning(
"Encrypted credential storage unavailable, falling back to env vars: %s", e
)
log.warning("Encrypted credential storage unavailable, falling back to env vars: %s", e)
store = CredentialStore.with_env_storage(env_mapping)
return cls(store=store, specs=specs)
+5
View File
@@ -42,6 +42,7 @@ from .file_system_toolkits.write_to_file import register_tools as register_write
from .github_tool import register_tools as register_github
from .hubspot_tool import register_tools as register_hubspot
from .pdf_read_tool import register_tools as register_pdf_read
from .runtime_logs_tool import register_tools as register_runtime_logs
from .slack_tool import register_tools as register_slack
from .web_scrape_tool import register_tools as register_web_scrape
from .web_search_tool import register_tools as register_web_search
@@ -66,6 +67,7 @@ def register_all_tools(
register_example(mcp)
register_web_scrape(mcp)
register_pdf_read(mcp)
register_runtime_logs(mcp)
# Tools that need credentials (pass credentials if provided)
# web_search supports multiple providers (Google, Brave) with auto-detection
@@ -140,6 +142,9 @@ def register_all_tools(
"hubspot_get_deal",
"hubspot_create_deal",
"hubspot_update_deal",
"query_runtime_logs",
"query_runtime_log_details",
"query_runtime_log_raw",
"slack_send_message",
"slack_list_channels",
"slack_get_channel_history",
@@ -15,6 +15,8 @@ from pathlib import Path
from mcp.server.fastmcp import FastMCP
from aden_tools.credentials.browser import open_browser
def register_tools(mcp: FastMCP) -> None:
"""Register data management tools with the MCP server."""
@@ -142,7 +144,9 @@ def register_tools(mcp: FastMCP) -> None:
return {"error": f"Failed to load data: {str(e)}"}
@mcp.tool()
def serve_file_to_user(filename: str, data_dir: str, label: str = "") -> dict:
def serve_file_to_user(
filename: str, data_dir: str, label: str = "", open_in_browser: bool = False
) -> dict:
"""
Purpose
Resolve a sandboxed file path to a fully qualified file URI
@@ -152,6 +156,8 @@ def register_tools(mcp: FastMCP) -> None:
After saving a file (HTML report, CSV export, etc.) with save_data,
call this to give the user a clickable link to open it.
The TUI will render the file:// URI as a clickable link.
Set open_in_browser=True to also auto-open the file in the
user's default browser.
Rules & Constraints
filename must be a simple name — no paths or '..'
@@ -162,9 +168,10 @@ def register_tools(mcp: FastMCP) -> None:
filename: The filename to serve (must exist in data_dir).
data_dir: Absolute path to the data directory.
label: Optional display label (defaults to filename).
open_in_browser: If True, auto-open the file in the default browser.
Returns:
Dict with file_uri, file_path, and label
Dict with file_uri, file_path, label, and optionally browser_opened
"""
if not filename or ".." in filename or "/" in filename or "\\" in filename:
return {"error": "Invalid filename. Use simple names like 'report.html'"}
@@ -178,12 +185,19 @@ def register_tools(mcp: FastMCP) -> None:
full_path = str(path.resolve())
file_uri = f"file://{full_path}"
return {
result = {
"success": True,
"file_uri": file_uri,
"file_path": full_path,
"label": label or filename,
}
if open_in_browser:
opened, msg = open_browser(file_uri)
result["browser_opened"] = opened
result["browser_message"] = msg
return result
except Exception as e:
return {"error": f"Failed to serve file: {str(e)}"}
@@ -0,0 +1,5 @@
"""Runtime Logs Tool package."""
from .runtime_logs_tool import register_tools
__all__ = ["register_tools"]
@@ -0,0 +1,232 @@
"""MCP tools for querying runtime logs.
Three tools provide access to the three-level runtime logging system:
- query_runtime_logs: Level 1 summaries (did the graph run succeed?)
- query_runtime_log_details: Level 2 per-node results (which node failed?)
- query_runtime_log_raw: Level 3 full step data (what exactly happened?)
Implementation uses pure sync file I/O -- no imports from the core runtime
logger/store classes. L2 and L3 use JSONL format (one JSON object per line).
L1 uses standard JSON. The file format is the interface between writer
(RuntimeLogger -> RuntimeLogStore) and reader (these MCP tools).
"""
from __future__ import annotations
import json
import logging
from pathlib import Path
from fastmcp import FastMCP
logger = logging.getLogger(__name__)
def _read_jsonl(path: Path) -> list[dict]:
"""Parse a JSONL file into a list of dicts.
Skips blank lines and corrupt JSON lines (partial writes from crashes).
"""
results = []
if not path.exists():
return results
try:
with open(path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
try:
results.append(json.loads(line))
except json.JSONDecodeError:
logger.warning("Skipping corrupt JSONL line in %s", path)
continue
except OSError as e:
logger.warning("Failed to read %s: %s", path, e)
return results
def _get_run_dirs(agent_work_dir: Path) -> list[tuple[str, Path]]:
"""Scan both old and new storage locations for run directories.
Returns list of (run_id, log_dir_path) tuples.
Scans:
- New: {agent_work_dir}/sessions/{session_id}/logs/
- Old: {agent_work_dir}/runtime_logs/runs/{run_id}/ (deprecated)
"""
run_dirs = []
# Scan new location: sessions/{session_id}/logs/
sessions_dir = agent_work_dir / "sessions"
if sessions_dir.exists():
for session_dir in sessions_dir.iterdir():
if session_dir.is_dir() and session_dir.name.startswith("session_"):
logs_dir = session_dir / "logs"
if logs_dir.exists() and logs_dir.is_dir():
run_dirs.append((session_dir.name, logs_dir))
# Scan old location: runtime_logs/runs/ (deprecated)
old_runs_dir = agent_work_dir / "runtime_logs" / "runs"
if old_runs_dir.exists():
for run_dir in old_runs_dir.iterdir():
if run_dir.is_dir():
run_dirs.append((run_dir.name, run_dir))
return run_dirs
def register_tools(mcp: FastMCP) -> None:
"""Register runtime log query tools with the MCP server."""
@mcp.tool()
def query_runtime_logs(
agent_work_dir: str,
status: str = "",
limit: int = 20,
) -> dict:
"""Query runtime log summaries. Returns high-level pass/fail for recent graph runs.
Scans both old (runtime_logs/runs/) and new (sessions/*/logs/) locations.
Use status='needs_attention' to find runs that need debugging.
Other status values: 'success', 'failure', 'degraded', 'in_progress'.
Leave status empty to see all runs.
Args:
agent_work_dir: Path to the agent's working directory
status: Filter by status (empty string for all)
limit: Maximum number of results to return (default 20)
Returns:
Dict with 'runs' list of summary objects and 'total' count
"""
work_dir = Path(agent_work_dir)
run_dirs = _get_run_dirs(work_dir)
if not run_dirs:
return {"runs": [], "total": 0, "message": "No runtime logs found"}
summaries = []
for run_id, log_dir in run_dirs:
summary_path = log_dir / "summary.json"
if summary_path.exists():
try:
data = json.loads(summary_path.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError):
continue
else:
# In-progress run: no summary.json yet
data = {
"run_id": run_id,
"status": "in_progress",
"started_at": "",
"needs_attention": False,
}
# Apply status filter
if status == "needs_attention":
if not data.get("needs_attention", False):
continue
elif status and data.get("status") != status:
continue
summaries.append(data)
# Sort by started_at descending
summaries.sort(key=lambda s: s.get("started_at", ""), reverse=True)
total = len(summaries)
summaries = summaries[:limit]
return {"runs": summaries, "total": total}
@mcp.tool()
def query_runtime_log_details(
agent_work_dir: str,
run_id: str,
needs_attention_only: bool = False,
node_id: str = "",
) -> dict:
"""Get per-node completion details for a specific graph run.
Shows per-node success/failure, exit status, verdict counts,
and attention flags. Use after query_runtime_logs identifies
a run to investigate.
Supports both old (runtime_logs/runs/) and new (sessions/*/logs/) locations.
Args:
agent_work_dir: Path to the agent's working directory
run_id: The run ID from query_runtime_logs results
needs_attention_only: If True, only return flagged nodes
node_id: If set, only return details for this node
Returns:
Dict with run_id and nodes list of per-node details
"""
work_dir = Path(agent_work_dir)
# Session-prefixed run IDs live in the new location: sessions/{session_id}/logs/
if run_id.startswith("session_"):
details_path = work_dir / "sessions" / run_id / "logs" / "details.jsonl"
else:
# Old location: runtime_logs/runs/{run_id}/
details_path = work_dir / "runtime_logs" / "runs" / run_id / "details.jsonl"
if not details_path.exists():
return {"error": f"No details found for run {run_id}"}
nodes = _read_jsonl(details_path)
if node_id:
nodes = [n for n in nodes if n.get("node_id") == node_id]
if needs_attention_only:
nodes = [n for n in nodes if n.get("needs_attention")]
return {"run_id": run_id, "nodes": nodes}
@mcp.tool()
def query_runtime_log_raw(
agent_work_dir: str,
run_id: str,
step_index: int = -1,
node_id: str = "",
) -> dict:
"""Get full tool call and LLM details for a graph run.
Use after identifying a problematic node via
query_runtime_log_details. Returns tool inputs/outputs,
LLM text, and token counts per step.
Supports both old (runtime_logs/runs/) and new (sessions/*/logs/) locations.
Args:
agent_work_dir: Path to the agent's working directory
run_id: The run ID from query_runtime_logs results
step_index: Specific step index, or -1 for all steps
node_id: If set, only return steps for this node
Returns:
Dict with run_id and steps list of tool/LLM details
"""
work_dir = Path(agent_work_dir)
# Session-prefixed run IDs live in the new location: sessions/{session_id}/logs/
if run_id.startswith("session_"):
tool_logs_path = work_dir / "sessions" / run_id / "logs" / "tool_logs.jsonl"
else:
# Old location: runtime_logs/runs/{run_id}/
tool_logs_path = work_dir / "runtime_logs" / "runs" / run_id / "tool_logs.jsonl"
if not tool_logs_path.exists():
return {"error": f"No tool logs found for run {run_id}"}
steps = _read_jsonl(tool_logs_path)
if node_id:
steps = [s for s in steps if s.get("node_id") == node_id]
if step_index >= 0:
steps = [s for s in steps if s.get("step_index") == step_index]
return {"run_id": run_id, "steps": steps}
+129
View File
@@ -1,5 +1,7 @@
"""Tests for CredentialStoreAdapter."""
from unittest.mock import MagicMock, patch
import pytest
from aden_tools.credentials import (
@@ -484,3 +486,130 @@ class TestSpecCompleteness:
assert spec.credential_group == "", (
f"Credential '{name}' has unexpected credential_group='{spec.credential_group}'"
)
class TestCredentialStoreAdapterAdenSync:
"""Tests for Aden sync branch in CredentialStoreAdapter.default()."""
def _patch_encrypted_storage(self, tmp_path):
"""Patch EncryptedFileStorage to use a temp directory."""
from framework.credentials.storage import EncryptedFileStorage
original_init = EncryptedFileStorage.__init__
def patched_init(self_inner, base_path=None, **kwargs):
original_init(self_inner, base_path=str(tmp_path / "creds"), **kwargs)
return patch.object(EncryptedFileStorage, "__init__", patched_init)
def test_default_with_aden_key_creates_aden_store(self, monkeypatch, tmp_path):
"""When ADEN_API_KEY is set, default() wires up AdenSyncProvider."""
monkeypatch.setenv("ADEN_API_KEY", "test-aden-key")
monkeypatch.setenv("ADEN_API_URL", "https://test.adenhq.com")
mock_client = MagicMock()
mock_client.list_integrations.return_value = []
with (
self._patch_encrypted_storage(tmp_path),
patch(
"framework.credentials.aden.AdenCredentialClient",
return_value=mock_client,
),
patch(
"framework.credentials.aden.AdenClientConfig",
),
):
adapter = CredentialStoreAdapter.default()
# Verify AdenSyncProvider is registered
provider = adapter.store.get_provider("aden_sync")
assert provider is not None
def test_default_without_aden_key_uses_env_fallback(self, monkeypatch, tmp_path):
"""When ADEN_API_KEY is not set, default() uses env-only storage."""
monkeypatch.delenv("ADEN_API_KEY", raising=False)
monkeypatch.setenv("BRAVE_SEARCH_API_KEY", "test-brave-key")
with self._patch_encrypted_storage(tmp_path):
adapter = CredentialStoreAdapter.default()
# No Aden provider should be registered
assert adapter.store.get_provider("aden_sync") is None
# Env vars still work
assert adapter.get("brave_search") == "test-brave-key"
def test_default_aden_non_aden_cred_falls_through_to_env(self, monkeypatch, tmp_path):
"""Non-Aden credentials (e.g. brave_search) resolve from env vars even with Aden."""
monkeypatch.setenv("ADEN_API_KEY", "test-aden-key")
monkeypatch.setenv("ADEN_API_URL", "https://test.adenhq.com")
monkeypatch.setenv("BRAVE_SEARCH_API_KEY", "brave-from-env")
mock_client = MagicMock()
mock_client.list_integrations.return_value = []
# Aden returns None for brave_search (404 → None)
mock_client.get_credential.return_value = None
with (
self._patch_encrypted_storage(tmp_path),
patch(
"framework.credentials.aden.AdenCredentialClient",
return_value=mock_client,
),
patch(
"framework.credentials.aden.AdenClientConfig",
),
):
adapter = CredentialStoreAdapter.default()
assert adapter.get("brave_search") == "brave-from-env"
def test_default_aden_sync_failure_falls_back_gracefully(self, monkeypatch, tmp_path):
"""If Aden initial sync fails, adapter is still created and env vars work."""
monkeypatch.setenv("ADEN_API_KEY", "test-aden-key")
monkeypatch.setenv("ADEN_API_URL", "https://test.adenhq.com")
monkeypatch.setenv("BRAVE_SEARCH_API_KEY", "brave-fallback")
mock_client = MagicMock()
mock_client.list_integrations.side_effect = Exception("Connection refused")
mock_client.get_credential.return_value = None
with (
self._patch_encrypted_storage(tmp_path),
patch(
"framework.credentials.aden.AdenCredentialClient",
return_value=mock_client,
),
patch(
"framework.credentials.aden.AdenClientConfig",
),
):
adapter = CredentialStoreAdapter.default()
# Adapter was created despite sync failure
assert adapter is not None
assert adapter.get("brave_search") == "brave-fallback"
def test_default_aden_import_error_falls_back(self, monkeypatch, tmp_path):
"""If Aden imports fail (e.g. missing httpx), fall back to default storage."""
monkeypatch.setenv("ADEN_API_KEY", "test-aden-key")
monkeypatch.setenv("BRAVE_SEARCH_API_KEY", "brave-fallback")
import builtins
real_import = builtins.__import__
def mock_import(name, *args, **kwargs):
if name == "framework.credentials.aden":
raise ImportError(f"No module named '{name}'")
return real_import(name, *args, **kwargs)
with (
self._patch_encrypted_storage(tmp_path),
patch.object(builtins, "__import__", side_effect=mock_import),
):
adapter = CredentialStoreAdapter.default()
# Fell back to default — env vars still work, no Aden provider
assert adapter.store.get_provider("aden_sync") is None
assert adapter.get("brave_search") == "brave-fallback"
+345
View File
@@ -0,0 +1,345 @@
"""Tests for MCP runtime_logs_tool.
Uses fixture data written to tmp_path, verifying the three query tools
return correct results. L2/L3 use JSONL format; L1 uses standard JSON.
"""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from fastmcp import FastMCP
from aden_tools.tools.runtime_logs_tool import register_tools
def _write_jsonl(path: Path, items: list[dict]) -> None:
"""Write a list of dicts as JSONL (one JSON object per line)."""
with open(path, "w", encoding="utf-8") as f:
for item in items:
f.write(json.dumps(item) + "\n")
@pytest.fixture
def runtime_logs_dir(tmp_path: Path) -> Path:
"""Create fixture runtime log data in JSONL format."""
runs_dir = tmp_path / "runtime_logs" / "runs"
# Run 1: success (2 nodes)
run1_dir = runs_dir / "20250101T000001_abc12345"
run1_dir.mkdir(parents=True)
(run1_dir / "summary.json").write_text(
json.dumps(
{
"run_id": "20250101T000001_abc12345",
"agent_id": "agent-a",
"goal_id": "goal-1",
"status": "success",
"total_nodes_executed": 2,
"node_path": ["node-1", "node-2"],
"total_input_tokens": 200,
"total_output_tokens": 100,
"needs_attention": False,
"attention_reasons": [],
"started_at": "2025-01-01T00:00:01",
"duration_ms": 3000,
"execution_quality": "clean",
}
)
)
_write_jsonl(
run1_dir / "details.jsonl",
[
{
"node_id": "node-1",
"node_name": "Search",
"node_type": "event_loop",
"success": True,
"total_steps": 2,
"tokens_used": 250,
"exit_status": "success",
"accept_count": 1,
"retry_count": 1,
"needs_attention": False,
"attention_reasons": [],
},
{
"node_id": "node-2",
"node_name": "Format",
"node_type": "function",
"success": True,
"total_steps": 1,
"tokens_used": 0,
"needs_attention": False,
"attention_reasons": [],
},
],
)
_write_jsonl(
run1_dir / "tool_logs.jsonl",
[
{
"node_id": "node-1",
"node_type": "event_loop",
"step_index": 0,
"llm_text": "Let me search.",
"tool_calls": [
{
"tool_use_id": "tc_1",
"tool_name": "web_search",
"tool_input": {"query": "test"},
"result": "Found data",
"is_error": False,
}
],
"input_tokens": 100,
"output_tokens": 50,
"latency_ms": 1000,
"verdict": "RETRY",
},
{
"node_id": "node-1",
"node_type": "event_loop",
"step_index": 1,
"llm_text": "Here is your result.",
"tool_calls": [],
"input_tokens": 100,
"output_tokens": 50,
"latency_ms": 800,
"verdict": "ACCEPT",
},
{
"node_id": "node-2",
"node_type": "function",
"step_index": 0,
"llm_text": "",
"tool_calls": [],
"input_tokens": 0,
"output_tokens": 0,
"latency_ms": 50,
},
],
)
# Run 2: failure with needs_attention
run2_dir = runs_dir / "20250101T000002_def67890"
run2_dir.mkdir(parents=True)
(run2_dir / "summary.json").write_text(
json.dumps(
{
"run_id": "20250101T000002_def67890",
"agent_id": "agent-a",
"goal_id": "goal-2",
"status": "failure",
"total_nodes_executed": 1,
"node_path": ["node-1"],
"total_input_tokens": 10000,
"total_output_tokens": 5000,
"needs_attention": True,
"attention_reasons": ["Node node-1 failed: Max iterations exhausted"],
"started_at": "2025-01-01T00:00:02",
"duration_ms": 60000,
"execution_quality": "failed",
}
)
)
_write_jsonl(
run2_dir / "details.jsonl",
[
{
"node_id": "node-1",
"node_name": "Search",
"node_type": "event_loop",
"success": False,
"error": "Max iterations exhausted",
"total_steps": 50,
"exit_status": "failure",
"retry_count": 50,
"needs_attention": True,
"attention_reasons": ["Node node-1 failed: Max iterations exhausted"],
},
],
)
_write_jsonl(
run2_dir / "tool_logs.jsonl",
[],
)
return tmp_path
@pytest.fixture
def runtime_logs_dir_with_in_progress(runtime_logs_dir: Path) -> Path:
"""Extend the fixture with an in-progress run (no summary.json)."""
runs_dir = runtime_logs_dir / "runtime_logs" / "runs"
run3_dir = runs_dir / "20250101T000003_fff00000"
run3_dir.mkdir(parents=True)
# Only L2/L3 files, no summary.json
_write_jsonl(
run3_dir / "details.jsonl",
[
{
"node_id": "node-1",
"node_name": "Active",
"node_type": "event_loop",
"success": True,
},
],
)
_write_jsonl(
run3_dir / "tool_logs.jsonl",
[
{
"node_id": "node-1",
"node_type": "event_loop",
"step_index": 0,
"llm_text": "Working...",
},
],
)
return runtime_logs_dir
@pytest.fixture
def query_logs_fn(mcp: FastMCP):
register_tools(mcp)
return mcp._tool_manager._tools["query_runtime_logs"].fn
@pytest.fixture
def query_details_fn(mcp: FastMCP):
register_tools(mcp)
return mcp._tool_manager._tools["query_runtime_log_details"].fn
@pytest.fixture
def query_raw_fn(mcp: FastMCP):
register_tools(mcp)
return mcp._tool_manager._tools["query_runtime_log_raw"].fn
class TestQueryRuntimeLogs:
def test_list_all_runs(self, query_logs_fn, runtime_logs_dir: Path):
result = query_logs_fn(agent_work_dir=str(runtime_logs_dir))
assert result["total"] == 2
assert len(result["runs"]) == 2
# Sorted by started_at desc
assert result["runs"][0]["run_id"] == "20250101T000002_def67890"
def test_filter_by_status(self, query_logs_fn, runtime_logs_dir: Path):
result = query_logs_fn(agent_work_dir=str(runtime_logs_dir), status="success")
assert result["total"] == 1
assert result["runs"][0]["status"] == "success"
def test_filter_needs_attention(self, query_logs_fn, runtime_logs_dir: Path):
result = query_logs_fn(agent_work_dir=str(runtime_logs_dir), status="needs_attention")
assert result["total"] == 1
assert result["runs"][0]["needs_attention"] is True
def test_empty_directory(self, query_logs_fn, tmp_path: Path):
result = query_logs_fn(agent_work_dir=str(tmp_path))
assert result["total"] == 0
assert result["runs"] == []
def test_limit(self, query_logs_fn, runtime_logs_dir: Path):
result = query_logs_fn(agent_work_dir=str(runtime_logs_dir), limit=1)
assert len(result["runs"]) == 1
def test_in_progress_runs_visible(self, query_logs_fn, runtime_logs_dir_with_in_progress: Path):
result = query_logs_fn(agent_work_dir=str(runtime_logs_dir_with_in_progress))
assert result["total"] == 3
run_ids = {r["run_id"] for r in result["runs"]}
assert "20250101T000003_fff00000" in run_ids
# Filter in_progress only
result_ip = query_logs_fn(
agent_work_dir=str(runtime_logs_dir_with_in_progress),
status="in_progress",
)
assert result_ip["total"] == 1
assert result_ip["runs"][0]["status"] == "in_progress"
class TestQueryRuntimeLogDetails:
def test_load_details(self, query_details_fn, runtime_logs_dir: Path):
result = query_details_fn(
agent_work_dir=str(runtime_logs_dir),
run_id="20250101T000001_abc12345",
)
assert result["run_id"] == "20250101T000001_abc12345"
assert len(result["nodes"]) == 2
assert result["nodes"][0]["node_id"] == "node-1"
def test_filter_by_node_id(self, query_details_fn, runtime_logs_dir: Path):
result = query_details_fn(
agent_work_dir=str(runtime_logs_dir),
run_id="20250101T000001_abc12345",
node_id="node-2",
)
assert len(result["nodes"]) == 1
assert result["nodes"][0]["node_id"] == "node-2"
def test_needs_attention_only(self, query_details_fn, runtime_logs_dir: Path):
result = query_details_fn(
agent_work_dir=str(runtime_logs_dir),
run_id="20250101T000002_def67890",
needs_attention_only=True,
)
assert len(result["nodes"]) == 1
assert result["nodes"][0]["needs_attention"] is True
def test_missing_run(self, query_details_fn, runtime_logs_dir: Path):
result = query_details_fn(
agent_work_dir=str(runtime_logs_dir),
run_id="nonexistent",
)
assert "error" in result
class TestQueryRuntimeLogRaw:
def test_load_all_steps(self, query_raw_fn, runtime_logs_dir: Path):
result = query_raw_fn(
agent_work_dir=str(runtime_logs_dir),
run_id="20250101T000001_abc12345",
)
assert result["run_id"] == "20250101T000001_abc12345"
assert len(result["steps"]) == 3
def test_filter_by_step_index(self, query_raw_fn, runtime_logs_dir: Path):
result = query_raw_fn(
agent_work_dir=str(runtime_logs_dir),
run_id="20250101T000001_abc12345",
step_index=0,
)
assert len(result["steps"]) == 2 # step_index=0 for both node-1 and node-2
assert all(s["step_index"] == 0 for s in result["steps"])
def test_filter_by_node_id(self, query_raw_fn, runtime_logs_dir: Path):
result = query_raw_fn(
agent_work_dir=str(runtime_logs_dir),
run_id="20250101T000001_abc12345",
node_id="node-1",
)
assert len(result["steps"]) == 2 # 2 steps for node-1
assert all(s["node_id"] == "node-1" for s in result["steps"])
assert result["steps"][0]["tool_calls"][0]["tool_name"] == "web_search"
def test_filter_by_node_id_and_step_index(self, query_raw_fn, runtime_logs_dir: Path):
result = query_raw_fn(
agent_work_dir=str(runtime_logs_dir),
run_id="20250101T000001_abc12345",
node_id="node-1",
step_index=0,
)
assert len(result["steps"]) == 1
assert result["steps"][0]["node_id"] == "node-1"
assert result["steps"][0]["step_index"] == 0
def test_missing_run(self, query_raw_fn, runtime_logs_dir: Path):
result = query_raw_fn(
agent_work_dir=str(runtime_logs_dir),
run_id="nonexistent",
)
assert "error" in result