Compare commits
114 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9da91b5319 | |||
| 2493beaf5a | |||
| b6c65ab5d5 | |||
| 162f9a55ad | |||
| e484fdfa51 | |||
| 77d9ccf2e4 | |||
| 94e39ee09e | |||
| 661b0c0038 | |||
| 8ed38bf0e2 | |||
| 4d675dfff7 | |||
| b42a3293f1 | |||
| 87e9bf853d | |||
| c56f78422a | |||
| ac311e10ba | |||
| 0297520263 | |||
| 4803552a7a | |||
| b8d85ff723 | |||
| 7d571dfaec | |||
| ba02e53bdd | |||
| 153e6142ff | |||
| 228449c9d8 | |||
| c65eed8802 | |||
| 40d32f2e01 | |||
| c83aac5e12 | |||
| 48b9241247 | |||
| 7779bc5336 | |||
| beec549f74 | |||
| 310698ecc0 | |||
| 4f719c4778 | |||
| 4cc00f3bdc | |||
| 1f9c47fef1 | |||
| 80a4980640 | |||
| 8dbe424f5a | |||
| ec9bf033e6 | |||
| a2d21ec7bc | |||
| 06ccc853ee | |||
| 4847332161 | |||
| 8c1ee54725 | |||
| 5e537d9d55 | |||
| d6b95067a1 | |||
| 32cae75ef5 | |||
| 21e7554cdb | |||
| 374442e900 | |||
| a1a0ec5ddb | |||
| 1fd56b079c | |||
| a12163d63f | |||
| 0cd6f21980 | |||
| a88fc1d75c | |||
| 87b0037fcd | |||
| 767d32d420 | |||
| e9bde26611 | |||
| c02f40622c | |||
| 929dc24e93 | |||
| 8cfb533fef | |||
| 3328a388b3 | |||
| 8f632eb005 | |||
| c8ee961436 | |||
| 6fd7efece6 | |||
| bc9f6b0af8 | |||
| 7d48f17867 | |||
| 776583b3ad | |||
| 9c28dae583 | |||
| 59a315b90b | |||
| 866518f188 | |||
| 736ae65a1d | |||
| 76c9f7c9a9 | |||
| 32ad225d7f | |||
| e5428bec5c | |||
| 7ae6f67470 | |||
| faf534511b | |||
| 594bceb8f5 | |||
| 9dc0f48ec9 | |||
| 9d11f834b8 | |||
| 131b72cd0c | |||
| ce5a2d4a81 | |||
| 7f489cee46 | |||
| 3c2d669a2f | |||
| ec36e96499 | |||
| 9ecd4980e4 | |||
| 64446ff9b6 | |||
| e3d2262292 | |||
| 891cfa387a | |||
| f0243fddf2 | |||
| 85ff8e364b | |||
| 75f1afe8e3 | |||
| 7b660311e5 | |||
| 98a493296d | |||
| bc2a42aed2 | |||
| 8b501d9091 | |||
| 0304b392b2 | |||
| ae9b4e82fe | |||
| 4bac5e4c46 | |||
| c4d3400ec4 | |||
| 1da9bb0c0f | |||
| 760ed51ad3 | |||
| 6d0a3b952a | |||
| 873fcd5822 | |||
| a08f3a8925 | |||
| 2a98d3a489 | |||
| b681ba03b1 | |||
| 47cd55052f | |||
| fb203b5bdf | |||
| 3f6bdda2a0 | |||
| f2492bd4d4 | |||
| 9d156325e0 | |||
| 4310852ee6 | |||
| 853f1e9873 | |||
| ae5fe84fb2 | |||
| 92b538d5ae | |||
| 5351703949 | |||
| 7ba8169444 | |||
| d090c954ae | |||
| 9bee1666f1 | |||
| fb94637339 |
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"mcp__agent-builder__create_session",
|
||||
"mcp__agent-builder__set_goal",
|
||||
"mcp__agent-builder__add_node",
|
||||
"mcp__agent-builder__add_edge",
|
||||
"mcp__agent-builder__configure_loop",
|
||||
"mcp__agent-builder__add_mcp_server",
|
||||
"mcp__agent-builder__validate_graph",
|
||||
"mcp__agent-builder__export_graph",
|
||||
"mcp__agent-builder__load_session_by_id",
|
||||
"Bash(git status:*)",
|
||||
"Bash(gh run view:*)",
|
||||
"Bash(uv run:*)",
|
||||
"Bash(env:*)",
|
||||
"mcp__agent-builder__test_node",
|
||||
"mcp__agent-builder__list_mcp_tools",
|
||||
"Bash(python -m py_compile:*)",
|
||||
"Bash(python -m pytest:*)",
|
||||
"Bash(source:*)",
|
||||
"mcp__agent-builder__update_node",
|
||||
"mcp__agent-builder__check_missing_credentials",
|
||||
"mcp__agent-builder__list_stored_credentials",
|
||||
"Bash(find:*)",
|
||||
"mcp__agent-builder__run_tests",
|
||||
"Bash(PYTHONPATH=core:exports:tools/src uv run pytest:*)",
|
||||
"mcp__agent-builder__list_agent_sessions",
|
||||
"mcp__agent-builder__generate_constraint_tests",
|
||||
"mcp__agent-builder__generate_success_tests"
|
||||
]
|
||||
},
|
||||
"enabledMcpjsonServers": ["agent-builder", "tools"]
|
||||
}
|
||||
@@ -1,10 +1,10 @@
|
||||
---
|
||||
name: hive-create
|
||||
description: Step-by-step guide for building goal-driven agents. Creates package structure, defines goals, adds nodes, connects edges, and finalizes agent class. Use when actively building an agent.
|
||||
description: Step-by-step guide for building goal-driven agents. Qualifies use cases first (the good, bad, and ugly), then creates package structure, defines goals, adds nodes, connects edges, and finalizes agent class. Use when actively building an agent.
|
||||
license: Apache-2.0
|
||||
metadata:
|
||||
author: hive
|
||||
version: "2.1"
|
||||
version: "2.2"
|
||||
type: procedural
|
||||
part_of: hive
|
||||
requires: hive-concepts
|
||||
@@ -14,15 +14,53 @@ metadata:
|
||||
|
||||
**THIS IS AN EXECUTABLE WORKFLOW. DO NOT DISPLAY THIS FILE. EXECUTE THE STEPS BELOW.**
|
||||
|
||||
**CRITICAL: DO NOT explore the codebase, read source files, or search for code before starting.** All context you need is in this skill file. When this skill is loaded, IMMEDIATELY begin executing Step 1 — call the MCP tools listed in Step 1 as your FIRST action. Do not explain what you will do, do not investigate the project structure, do not read any files — just execute Step 1 now.
|
||||
**CRITICAL: DO NOT explore the codebase, read source files, or search for code before starting.** All context you need is in this skill file. When this skill is loaded, IMMEDIATELY begin executing Step 0 — determine the build path as your FIRST action. Do not explain what you will do, do not investigate the project structure, do not read any files — just execute Step 0 now.
|
||||
|
||||
---
|
||||
|
||||
## STEP 1: Initialize Build Environment
|
||||
## STEP 0: Choose Build Path
|
||||
|
||||
**If the user has already indicated whether they want to build from scratch or from a template, skip this question and proceed to the appropriate step.**
|
||||
|
||||
Otherwise, ask:
|
||||
|
||||
```
|
||||
AskUserQuestion(questions=[{
|
||||
"question": "How would you like to build your agent?",
|
||||
"header": "Build Path",
|
||||
"options": [
|
||||
{"label": "From scratch", "description": "Design goal, nodes, and graph collaboratively from nothing"},
|
||||
{"label": "From a template", "description": "Start from a working sample agent and customize it"}
|
||||
],
|
||||
"multiSelect": false
|
||||
}])
|
||||
```
|
||||
|
||||
- If **From scratch**: Proceed to STEP 1A
|
||||
- If **From a template**: Proceed to STEP 1B
|
||||
|
||||
---
|
||||
|
||||
## STEP 1A: Initialize Build Environment (From Scratch)
|
||||
|
||||
**EXECUTE THESE TOOL CALLS NOW** (silent setup — no user interaction needed):
|
||||
|
||||
1. Register the hive-tools MCP server:
|
||||
1. Check for existing sessions:
|
||||
|
||||
```
|
||||
mcp__agent-builder__list_sessions()
|
||||
```
|
||||
|
||||
- If a session with this agent name already exists, load it with `mcp__agent-builder__load_session_by_id(session_id="...")` and skip to step 3.
|
||||
- If no matching session exists, proceed to step 2.
|
||||
|
||||
2. Create a build session (replace AGENT_NAME with the user's requested agent name in snake_case):
|
||||
|
||||
```
|
||||
mcp__agent-builder__create_session(name="AGENT_NAME")
|
||||
```
|
||||
|
||||
3. Register the hive-tools MCP server:
|
||||
|
||||
```
|
||||
mcp__agent-builder__add_mcp_server(
|
||||
@@ -35,45 +73,368 @@ mcp__agent-builder__add_mcp_server(
|
||||
)
|
||||
```
|
||||
|
||||
2. Create a build session (replace AGENT_NAME with the user's requested agent name in snake_case):
|
||||
|
||||
```
|
||||
mcp__agent-builder__create_session(name="AGENT_NAME")
|
||||
```
|
||||
|
||||
3. Discover available tools:
|
||||
4. Discover available tools:
|
||||
|
||||
```
|
||||
mcp__agent-builder__list_mcp_tools()
|
||||
```
|
||||
|
||||
4. Create the package directory:
|
||||
5. Create the package directory:
|
||||
|
||||
```bash
|
||||
mkdir -p exports/AGENT_NAME/nodes
|
||||
```
|
||||
|
||||
**Save the tool list for step 3** — you will need it for node design in STEP 3.
|
||||
**Save the tool list for STEP 4** — you will need it for node design.
|
||||
|
||||
**THEN immediately proceed to STEP 2** (do NOT display setup results to the user — just move on).
|
||||
|
||||
---
|
||||
|
||||
## STEP 1B: Initialize Build Environment (From Template)
|
||||
|
||||
**EXECUTE THESE STEPS NOW:**
|
||||
|
||||
### 1B.1: Discover available templates
|
||||
|
||||
List the template directories and read each template's `agent.json` to get its name and description:
|
||||
|
||||
```bash
|
||||
ls examples/templates/
|
||||
```
|
||||
|
||||
For each directory found, read `examples/templates/TEMPLATE_DIR/agent.json` with the Read tool and extract:
|
||||
- `agent.name` — the template's display name
|
||||
- `agent.description` — what the template does
|
||||
|
||||
### 1B.2: Present templates to user
|
||||
|
||||
Show the user a table of available templates:
|
||||
|
||||
> **Available Templates:**
|
||||
>
|
||||
> | # | Template | Description |
|
||||
> |---|----------|-------------|
|
||||
> | 1 | [name from agent.json] | [description from agent.json] |
|
||||
> | 2 | ... | ... |
|
||||
|
||||
Then ask the user to pick a template and provide a name for their new agent:
|
||||
|
||||
```
|
||||
AskUserQuestion(questions=[{
|
||||
"question": "Which template would you like to start from?",
|
||||
"header": "Template",
|
||||
"options": [
|
||||
{"label": "[template 1 name]", "description": "[template 1 description]"},
|
||||
{"label": "[template 2 name]", "description": "[template 2 description]"},
|
||||
...
|
||||
],
|
||||
"multiSelect": false
|
||||
}, {
|
||||
"question": "What should the new agent be named? (snake_case)",
|
||||
"header": "Agent Name",
|
||||
"options": [
|
||||
{"label": "Use template name", "description": "Keep the original template name as-is"},
|
||||
{"label": "Custom name", "description": "I'll provide a new snake_case name"}
|
||||
],
|
||||
"multiSelect": false
|
||||
}])
|
||||
```
|
||||
|
||||
### 1B.3: Copy template to exports
|
||||
|
||||
```bash
|
||||
cp -r examples/templates/TEMPLATE_DIR exports/NEW_AGENT_NAME
|
||||
```
|
||||
|
||||
### 1B.4: Create session and register MCP (same logic as STEP 1A)
|
||||
|
||||
First, check for existing sessions:
|
||||
|
||||
```
|
||||
mcp__agent-builder__list_sessions()
|
||||
```
|
||||
|
||||
- If a session with this agent name already exists, load it with `mcp__agent-builder__load_session_by_id(session_id="...")` and skip to `list_mcp_tools`.
|
||||
- If no matching session exists, create one:
|
||||
|
||||
```
|
||||
mcp__agent-builder__create_session(name="NEW_AGENT_NAME")
|
||||
```
|
||||
|
||||
Then register MCP and discover tools:
|
||||
|
||||
```
|
||||
mcp__agent-builder__add_mcp_server(
|
||||
name="hive-tools",
|
||||
transport="stdio",
|
||||
command="uv",
|
||||
args='["run", "python", "mcp_server.py", "--stdio"]',
|
||||
cwd="tools",
|
||||
description="Hive tools MCP server"
|
||||
)
|
||||
```
|
||||
|
||||
```
|
||||
mcp__agent-builder__list_mcp_tools()
|
||||
```
|
||||
|
||||
### 1B.5: Load template into builder session
|
||||
|
||||
Import the entire agent definition in one call:
|
||||
|
||||
```
|
||||
mcp__agent-builder__import_from_export(agent_json_path="exports/NEW_AGENT_NAME/agent.json")
|
||||
```
|
||||
|
||||
This reads the agent.json and populates the builder session with the goal, all nodes, and all edges.
|
||||
|
||||
**THEN immediately proceed to STEP 2.**
|
||||
|
||||
---
|
||||
|
||||
## STEP 2: Define Goal Together with User
|
||||
**A responsible engineer doesn't jump into building. First, understand the problem and be transparent about what the framework can and cannot do.**
|
||||
|
||||
**If starting from a template**, the goal is already loaded in the builder session. Present the existing goal to the user using the format below and ask for approval. Skip the collaborative drafting questions — go straight to presenting and asking "Do you approve this goal, or would you like to modify it?"
|
||||
|
||||
**If the user has NOT already described what they want to build**, start by asking what kind of agent they have in mind:
|
||||
|
||||
```
|
||||
AskUserQuestion(questions=[{
|
||||
"question": "What kind of agent do you want to build? Select an option below, or choose 'Other' to describe your own.",
|
||||
"header": "Agent type",
|
||||
"options": [
|
||||
{"label": "Data collection", "description": "Gathers information from the web, analyzes it, and produces a report or sends outreach (e.g. market research, news digest, email campaigns, competitive analysis)"},
|
||||
{"label": "Workflow automation", "description": "Automates a multi-step business process end-to-end (e.g. lead qualification, content publishing pipeline, data entry)"},
|
||||
{"label": "Personal assistant", "description": "Handles recurring tasks or monitors for events and acts on them (e.g. daily briefings, meeting prep, file organization)"}
|
||||
],
|
||||
"multiSelect": false
|
||||
}])
|
||||
```
|
||||
|
||||
Use the user's selection (or their custom description if they chose "Other") as context when shaping the goal below. If the user already described what they want before this step, skip the question and proceed directly.
|
||||
|
||||
**DO NOT propose a complete goal on your own.** Instead, collaborate with the user to define it.
|
||||
|
||||
**START by asking the user to help shape the goal:**
|
||||
### 2a: Fast Discovery (3-8 Turns)
|
||||
|
||||
> I've set up the build environment and discovered [N] available tools. Let's define the goal for your agent together.
|
||||
>
|
||||
> To get started, can you help me understand:
|
||||
>
|
||||
> 1. **What should this agent accomplish?** (the core purpose)
|
||||
> 2. **How will we know it succeeded?** (what does "done" look like)
|
||||
> 3. **Are there any hard constraints?** (things it must never do, quality bars, etc.)
|
||||
**The core principle**: Discovery should feel like progress, not paperwork. The stakeholder should walk away feeling like you understood them faster than anyone else would have.
|
||||
|
||||
**WAIT for the user to respond.** Use their input to draft:
|
||||
**Communication style**: Be concise. Say less. Mean more. Impatient stakeholders don't want a wall of text — they want to know you get it. Every sentence you say should either move the conversation forward or prove you understood something. If it does neither, cut it.
|
||||
|
||||
**Ask Question Rules: Respect Their Time.** Every question must earn its place by:
|
||||
1. **Preventing a costly wrong turn** — you're about to build the wrong thing
|
||||
2. **Unlocking a shortcut** — their answer lets you simplify the design
|
||||
3. **Surfacing a dealbreaker** — there's a constraint that changes everything
|
||||
4. **Providing options** — Offer options with your questions where possible, but always allow the user to type something beyond the listed options.
|
||||
|
||||
If a question doesn't do one of these, don't ask it. Make an assumption, state it, and move on.
|
||||
|
||||
---
|
||||
|
||||
#### 2a.1: Let Them Talk, But Listen Like an Architect
|
||||
|
||||
When the stakeholder describes what they want, don't just hear the words — listen for the architecture underneath. While they talk, mentally construct:
|
||||
|
||||
- **The actors**: Who are the people/systems involved?
|
||||
- **The trigger**: What kicks off the workflow?
|
||||
- **The core loop**: What's the main thing that happens repeatedly?
|
||||
- **The output**: What's the valuable thing produced at the end?
|
||||
- **The pain**: What about today's situation is broken, slow, or missing?
|
||||
|
||||
You are extracting a **domain model** from natural language in real time. Most stakeholders won't give you this structure explicitly — they'll give you a story. Your job is to hear the structure inside the story.
|
||||
|
||||
| They say... | You're hearing... |
|
||||
|-------------|-------------------|
|
||||
| Nouns they repeat | Your entities |
|
||||
| Verbs they emphasize | Your core operations |
|
||||
| Frustrations they mention | Your design constraints |
|
||||
| Workarounds they describe | What the system must replace |
|
||||
| People they name | Your user types |
|
||||
|
||||
---
|
||||
|
||||
#### 2a.2: Use Domain Knowledge to Fill In the Blanks
|
||||
|
||||
You have broad knowledge of how systems work. Use it aggressively.
|
||||
|
||||
If they say "I need a research agent," you already know it probably involves: search, summarization, source tracking, and iteration. Don't ask about each — use them as your starting mental model and let their specifics override your defaults.
|
||||
|
||||
If they say "I need to monitor files and alert me," you know this probably involves: watch patterns, triggers, notifications, and state tracking.
|
||||
|
||||
**The key move**: Take your general knowledge of the domain and merge it with the specifics they've given you. The result is a draft understanding that's 60-80% right before you've asked a single question. Your questions close the remaining 20-40%.
|
||||
|
||||
---
|
||||
|
||||
#### 2a.3: Play Back a Proposed Model (Not a List of Questions)
|
||||
|
||||
After listening, present a **concrete picture** of what you think they need. Make it specific enough that they can spot what's wrong.
|
||||
|
||||
**Pattern: "Here's what I heard — tell me where I'm off"**
|
||||
|
||||
> "OK here's how I'm picturing this: [User type] needs to [core action]. Right now they're [current painful workflow]. What you want is [proposed solution that replaces the pain].
|
||||
>
|
||||
> The way I'd structure this: [key entities] connected by [key relationships], with the main flow being [trigger → steps → outcome].
|
||||
>
|
||||
> For the MVP, I'd focus on [the one thing that delivers the most value] and hold off on [things that can wait].
|
||||
>
|
||||
> Before I start — [1-2 specific questions you genuinely can't infer]."
|
||||
|
||||
Why this works:
|
||||
- **Proves you were listening** — they don't feel like they have to repeat themselves
|
||||
- **Shows competence** — you're already thinking in systems
|
||||
- **Fast to correct** — "no, it's more like X" takes 10 seconds vs. answering 15 questions
|
||||
- **Creates momentum** — heading toward building, not more talking
|
||||
|
||||
---
|
||||
|
||||
#### 2a.4: Ask Only What You Cannot Infer
|
||||
|
||||
Your questions should be **narrow, specific, and consequential**. Never ask what you could answer yourself.
|
||||
|
||||
**Good questions** (high-stakes, can't infer):
|
||||
- "Who's the primary user — you or your end customers?"
|
||||
- "Is this replacing a spreadsheet, or is there literally nothing today?"
|
||||
- "Does this need to integrate with anything, or standalone?"
|
||||
- "Is there existing data to migrate, or starting fresh?"
|
||||
|
||||
**Bad questions** (low-stakes, inferable):
|
||||
- "What should happen if there's an error?" *(handle gracefully, obviously)*
|
||||
- "Should it have search?" *(if there's a list, yes)*
|
||||
- "How should we handle permissions?" *(follow standard patterns)*
|
||||
- "What tools should I use?" *(your call, not theirs)*
|
||||
|
||||
---
|
||||
|
||||
#### Conversation Flow (3-5 Turns)
|
||||
|
||||
| Turn | Who | What |
|
||||
|------|-----|------|
|
||||
| 1 | User | Describes what they need |
|
||||
| 2 | Agent | Plays back understanding as a proposed model. Asks 1-2 critical questions max. |
|
||||
| 3 | User | Corrects, confirms, or adds detail |
|
||||
| 4 | Agent | Adjusts model, confirms MVP scope, states assumptions, declares starting point |
|
||||
| *(5)* | *(Only if Turn 3 revealed something that fundamentally changes the approach)* |
|
||||
|
||||
**AFTER the conversation, IMMEDIATELY proceed to 2b. DO NOT skip to building.**
|
||||
|
||||
---
|
||||
|
||||
#### Anti-Patterns
|
||||
|
||||
| Don't | Do Instead |
|
||||
|-------|------------|
|
||||
| Open with a list of questions | Open with what you understood from their request |
|
||||
| "What are your requirements?" | "Here's what I think you need — am I right?" |
|
||||
| Ask about every edge case | Handle with smart defaults, flag in summary |
|
||||
| 10+ turn discovery conversation | 3-8 turns. Start building, iterate with real software. |
|
||||
| Being lazy and not understanding what the user wants to achieve | Understand the "what" and the "why" |
|
||||
| Ask for permission to start | State your plan and start |
|
||||
| Wait for certainty | Start at 80% confidence, iterate the rest |
|
||||
| Ask what tech/tools to use | That's your job. Decide, disclose, move on. |
|
||||
|
||||
---
|
||||
|
||||
|
||||
|
||||
### 2b: Capability Assessment
|
||||
|
||||
**After the user responds, analyze the fit.** Present this assessment honestly:
|
||||
|
||||
> **Framework Fit Assessment**
|
||||
>
|
||||
> Based on what you've described, here's my honest assessment of how well this framework fits your use case:
|
||||
>
|
||||
> **What Works Well (The Good):**
|
||||
> - [List 2-4 things the framework handles well for this use case]
|
||||
> - Examples: multi-turn conversations, human-in-the-loop review, tool orchestration, structured outputs
|
||||
>
|
||||
> **Limitations to Be Aware Of (The Bad):**
|
||||
> - [List 2-3 limitations that apply but are workable]
|
||||
> - Examples: LLM latency means not suitable for sub-second responses, context window limits for very large documents, cost per run for heavy tool usage
|
||||
>
|
||||
> **Potential Deal-Breakers (The Ugly):**
|
||||
> - [List any significant challenges or missing capabilities — be honest]
|
||||
> - Examples: no tool available for X, would require custom MCP server, framework not designed for Y
|
||||
|
||||
**Be specific.** Reference the actual tools discovered in Step 1. If the user needs `send_email` but it's not available, say so. If they need real-time streaming from a database, explain that's not how the framework works.
|
||||
|
||||
### 2c: Gap Analysis
|
||||
|
||||
**Identify specific gaps** between what the user wants and what you can deliver:
|
||||
|
||||
| Requirement | Framework Support | Gap/Workaround |
|
||||
|-------------|-------------------|----------------|
|
||||
| [User need] | [✅ Supported / ⚠️ Partial / ❌ Not supported] | [How to handle or why it's a problem] |
|
||||
|
||||
**Examples of gaps to identify:**
|
||||
- Missing tools (user needs X, but only Y and Z are available)
|
||||
- Scope issues (user wants to process 10,000 items, but LLM rate limits apply)
|
||||
- Interaction mismatches (user wants CLI-only, but agent is designed for TUI)
|
||||
- Data flow issues (user needs to persist state across runs, but sessions are isolated)
|
||||
- Latency requirements (user needs instant responses, but LLM calls take seconds)
|
||||
|
||||
### 2d: Recommendation
|
||||
|
||||
**Give a clear recommendation:**
|
||||
|
||||
> **My Recommendation:**
|
||||
>
|
||||
> [One of these three:]
|
||||
>
|
||||
> **✅ PROCEED** — This is a good fit. The framework handles your core needs well. [List any minor caveats.]
|
||||
>
|
||||
> **⚠️ PROCEED WITH SCOPE ADJUSTMENT** — This can work, but we should adjust: [specific changes]. Without these adjustments, you'll hit [specific problems].
|
||||
>
|
||||
> **🛑 RECONSIDER** — This framework may not be the right tool for this job because [specific reasons]. Consider instead: [alternatives — simpler script, different framework, custom solution].
|
||||
|
||||
### 2e: Get Explicit Acknowledgment
|
||||
|
||||
**CALL AskUserQuestion:**
|
||||
|
||||
```
|
||||
AskUserQuestion(questions=[{
|
||||
"question": "Based on this assessment, how would you like to proceed?",
|
||||
"header": "Proceed",
|
||||
"options": [
|
||||
{"label": "Proceed as described", "description": "I understand the limitations, let's build it"},
|
||||
{"label": "Adjust scope", "description": "Let's modify the requirements to fit better"},
|
||||
{"label": "More questions", "description": "I have questions about the assessment"},
|
||||
{"label": "Reconsider", "description": "Maybe this isn't the right approach"}
|
||||
],
|
||||
"multiSelect": false
|
||||
}])
|
||||
```
|
||||
|
||||
**WAIT for user response.**
|
||||
|
||||
- If **Proceed**: Move to STEP 3
|
||||
- If **Adjust scope**: Discuss what to change, update your notes, re-assess if needed
|
||||
- If **More questions**: Answer them honestly, then ask again
|
||||
- If **Reconsider**: Discuss alternatives. If they decide to proceed anyway, that's their informed choice
|
||||
|
||||
---
|
||||
|
||||
## STEP 3: Define Goal Together with User
|
||||
|
||||
**Now that the use case is qualified, collaborate on the goal definition.**
|
||||
|
||||
**START by synthesizing what you learned:**
|
||||
|
||||
> Based on our discussion, here's my understanding of the goal:
|
||||
>
|
||||
> **Core purpose:** [what you understood from 2a]
|
||||
> **Success looks like:** [what you inferred]
|
||||
> **Key constraints:** [what you inferred]
|
||||
>
|
||||
> Let me refine this with you:
|
||||
>
|
||||
> 1. **What should this agent accomplish?** (confirm or correct my understanding)
|
||||
> 2. **How will we know it succeeded?** (what specific outcomes matter)
|
||||
> 3. **Are there any hard constraints?** (things it must never do, quality bars)
|
||||
|
||||
**WAIT for the user to respond.** Use their input (and the agent type they selected) to draft:
|
||||
|
||||
- Goal ID (kebab-case)
|
||||
- Goal name
|
||||
@@ -115,12 +476,14 @@ AskUserQuestion(questions=[{
|
||||
|
||||
**WAIT for user response.**
|
||||
|
||||
- If **Approve**: Call `mcp__agent-builder__set_goal(...)` with the goal details, then proceed to STEP 3
|
||||
- If **Approve**: Call `mcp__agent-builder__set_goal(...)` with the goal details, then proceed to STEP 4
|
||||
- If **Modify**: Ask what they want to change, update the draft, ask again
|
||||
|
||||
---
|
||||
|
||||
## STEP 3: Design Conceptual Nodes
|
||||
## STEP 4: Design Conceptual Nodes
|
||||
|
||||
**If starting from a template**, the nodes are already loaded in the builder session. Present the existing nodes using the table format below and ask for approval. Skip the design phase.
|
||||
|
||||
**BEFORE designing nodes**, review the available tools from Step 1. Nodes can ONLY use tools that exist.
|
||||
|
||||
@@ -173,12 +536,14 @@ AskUserQuestion(questions=[{
|
||||
|
||||
**WAIT for user response.**
|
||||
|
||||
- If **Approve**: Proceed to STEP 4
|
||||
- If **Approve**: Proceed to STEP 5
|
||||
- If **Modify**: Ask what they want to change, update design, ask again
|
||||
|
||||
---
|
||||
|
||||
## STEP 4: Design Full Graph and Review
|
||||
## STEP 5: Design Full Graph and Review
|
||||
|
||||
**If starting from a template**, the edges are already loaded in the builder session. Render the existing graph as ASCII art and present it to the user for approval. Skip the edge design phase.
|
||||
|
||||
**DETERMINE the edges** connecting the approved nodes. For each edge:
|
||||
|
||||
@@ -288,16 +653,38 @@ AskUserQuestion(questions=[{
|
||||
|
||||
**WAIT for user response.**
|
||||
|
||||
- If **Approve**: Proceed to STEP 5
|
||||
- If **Approve**: Proceed to STEP 6
|
||||
- If **Modify**: Ask what they want to change, update the graph, re-render, ask again
|
||||
|
||||
---
|
||||
|
||||
## STEP 5: Build the Agent
|
||||
## STEP 6: Build the Agent
|
||||
|
||||
**NOW — and only now — write the actual code.** The user has approved the goal, nodes, and graph.
|
||||
|
||||
### 5a: Register nodes and edges with MCP
|
||||
### 6a: Register nodes and edges with MCP
|
||||
**If starting from a template**, the copied files will be overwritten with the approved design. You MUST replace every occurrence of the old template name with the new agent name. Here is the complete checklist — miss NONE of these:
|
||||
|
||||
| File | What to rename |
|
||||
|------|---------------|
|
||||
| `config.py` | `AgentMetadata.name` — the display name shown in TUI agent selection |
|
||||
| `config.py` | `AgentMetadata.description` — agent description |
|
||||
| `config.py` | `AgentMetadata.intro_message` — greeting shown to user when TUI loads |
|
||||
| `agent.py` | Module docstring (line 1) |
|
||||
| `agent.py` | `class OldNameAgent:` → `class NewNameAgent:` |
|
||||
| `agent.py` | `GraphSpec(id="old-name-graph")` → `GraphSpec(id="new-name-graph")` — shown in TUI status bar |
|
||||
| `agent.py` | Storage path: `Path.home() / ".hive" / "agents" / "old_name"` → `"new_name"` |
|
||||
| `__main__.py` | Module docstring (line 1) |
|
||||
| `__main__.py` | `from .agent import ... OldNameAgent` → `NewNameAgent` |
|
||||
| `__main__.py` | CLI help string in `def cli()` docstring |
|
||||
| `__main__.py` | All `OldNameAgent()` instantiations |
|
||||
| `__main__.py` | Storage path (duplicated from agent.py) |
|
||||
| `__main__.py` | Shell banner string (e.g. `"=== Old Name Agent ==="`) |
|
||||
| `__init__.py` | Package docstring |
|
||||
| `__init__.py` | `from .agent import OldNameAgent` import |
|
||||
| `__init__.py` | `__all__` list entry |
|
||||
|
||||
**If starting from a template and no modifications were made in Steps 2-5**, the nodes and edges are already registered. Skip to validation (`mcp__agent-builder__validate_graph()`). If modifications were made, re-register the changed nodes/edges (the MCP tools handle duplicates by overwriting).
|
||||
|
||||
**FOR EACH approved node**, call:
|
||||
|
||||
@@ -337,9 +724,9 @@ mcp__agent-builder__validate_graph()
|
||||
```
|
||||
|
||||
- If invalid: Fix the issues and re-validate
|
||||
- If valid: Continue to 5b
|
||||
- If valid: Continue to 6b
|
||||
|
||||
### 5b: Write Python package files
|
||||
### 6b: Write Python package files
|
||||
|
||||
**EXPORT the graph data:**
|
||||
|
||||
@@ -349,7 +736,7 @@ mcp__agent-builder__export_graph()
|
||||
|
||||
**THEN write the Python package files** using the exported data. Create these files in `exports/AGENT_NAME/`:
|
||||
|
||||
1. `config.py` - Runtime configuration with model settings
|
||||
1. `config.py` - Runtime configuration with model settings and `AgentMetadata` (including `intro_message` — the greeting shown when TUI loads)
|
||||
2. `nodes/__init__.py` - All NodeSpec definitions
|
||||
3. `agent.py` - Goal, edges, graph config, and agent class
|
||||
4. `__init__.py` - Package exports
|
||||
@@ -399,7 +786,7 @@ mcp__agent-builder__export_graph()
|
||||
|
||||
---
|
||||
|
||||
## STEP 6: Verify and Test
|
||||
## STEP 7: Verify and Test
|
||||
|
||||
**RUN validation:**
|
||||
|
||||
@@ -525,16 +912,70 @@ result = await executor.execute(graph=graph, goal=goal, input_data=input_data)
|
||||
|
||||
---
|
||||
|
||||
## REFERENCE: Framework Capabilities for Qualification
|
||||
|
||||
Use this reference during STEP 2 to give accurate, honest assessments.
|
||||
|
||||
### What the Framework Does Well (The Good)
|
||||
|
||||
| Capability | Description |
|
||||
|------------|-------------|
|
||||
| Multi-turn conversations | Client-facing nodes stream to users and block for input |
|
||||
| Human-in-the-loop review | Approval checkpoints with feedback loops back to earlier nodes |
|
||||
| Tool orchestration | LLM can call multiple tools, framework handles execution |
|
||||
| Structured outputs | `set_output` produces validated, typed outputs |
|
||||
| Parallel execution | Fan-out/fan-in for concurrent node execution |
|
||||
| Context management | Automatic compaction and spillover for large data |
|
||||
| Error recovery | Retry logic, judges, and feedback edges for self-correction |
|
||||
| Session persistence | State saved to disk, resumable sessions |
|
||||
|
||||
### Framework Limitations (The Bad)
|
||||
|
||||
| Limitation | Impact | Workaround |
|
||||
|------------|--------|------------|
|
||||
| LLM latency | 2-10+ seconds per turn | Not suitable for real-time/low-latency needs |
|
||||
| Context window limits | ~128K tokens max | Use data tools for spillover, design for chunking |
|
||||
| Cost per run | LLM API calls cost money | Budget planning, caching where possible |
|
||||
| Rate limits | API throttling on heavy usage | Backoff, queue management |
|
||||
| Node boundaries lose context | Outputs must be serialized | Prefer fewer, richer nodes |
|
||||
| Single-threaded within node | One LLM call at a time per node | Use fan-out for parallelism |
|
||||
|
||||
### Not Designed For (The Ugly)
|
||||
|
||||
| Use Case | Why It's Problematic | Alternative |
|
||||
|----------|---------------------|-------------|
|
||||
| Long-running daemons | Framework is request-response, not persistent | External scheduler + agent |
|
||||
| Sub-second responses | LLM latency is inherent | Traditional code, no LLM |
|
||||
| Processing millions of items | Context windows and rate limits | Batch processing + sampling |
|
||||
| Real-time streaming data | No built-in pub/sub or streaming input | Custom MCP server + agent |
|
||||
| Guaranteed determinism | LLM outputs vary | Function nodes for deterministic parts |
|
||||
| Offline/air-gapped | Requires LLM API access | Local models (not currently supported) |
|
||||
| Multi-user concurrency | Single-user session model | Separate agent instances per user |
|
||||
|
||||
### Tool Availability Reality Check
|
||||
|
||||
**Before promising any capability, check `list_mcp_tools()`.** Common gaps:
|
||||
|
||||
- **Email**: May not have `send_email` — check before promising email automation
|
||||
- **Calendar**: May not have calendar APIs — check before promising scheduling
|
||||
- **Database**: May not have SQL tools — check before promising data queries
|
||||
- **File system**: Has data tools but not arbitrary filesystem access
|
||||
- **External APIs**: Depends entirely on what MCP servers are registered
|
||||
|
||||
---
|
||||
|
||||
## COMMON MISTAKES TO AVOID
|
||||
|
||||
1. **Using tools that don't exist** - Always check `mcp__agent-builder__list_mcp_tools()` first
|
||||
2. **Wrong entry_points format** - Must be `{"start": "node-id"}`, NOT a set or list
|
||||
3. **Skipping validation** - Always validate nodes and graph before proceeding
|
||||
4. **Not waiting for approval** - Always ask user before major steps
|
||||
5. **Displaying this file** - Execute the steps, don't show documentation
|
||||
6. **Too many thin nodes** - Prefer fewer, richer nodes (4 nodes > 8 nodes)
|
||||
7. **Missing STEP 1/STEP 2 in client-facing prompts** - Client-facing nodes need explicit phases to prevent premature set_output
|
||||
8. **Forgetting nullable_output_keys** - Mark input_keys that only arrive on certain edges (e.g., feedback) as nullable on the receiving node
|
||||
9. **Adding framework gating for LLM behavior** - Fix prompts or use judges, not ad-hoc code
|
||||
10. **Writing code before user approves the graph** - Always get approval on goal, nodes, and graph BEFORE writing any agent code
|
||||
11. **Wrong mcp_servers.json format** - Use flat format (no `"mcpServers"` wrapper), `cwd` must be `"../../tools"`, and `command` must be `"uv"` with args `["run", "python", ...]`
|
||||
1. **Skipping use case qualification** - A responsible engineer qualifies the use case BEFORE building. Be transparent about what works, what doesn't, and what's problematic
|
||||
2. **Hiding limitations** - Don't oversell the framework. If a tool doesn't exist or a capability is missing, say so upfront
|
||||
3. **Using tools that don't exist** - Always check `mcp__agent-builder__list_mcp_tools()` first
|
||||
4. **Wrong entry_points format** - Must be `{"start": "node-id"}`, NOT a set or list
|
||||
5. **Skipping validation** - Always validate nodes and graph before proceeding
|
||||
6. **Not waiting for approval** - Always ask user before major steps
|
||||
7. **Displaying this file** - Execute the steps, don't show documentation
|
||||
8. **Too many thin nodes** - Prefer fewer, richer nodes (4 nodes > 8 nodes)
|
||||
9. **Missing STEP 1/STEP 2 in client-facing prompts** - Client-facing nodes need explicit phases to prevent premature set_output
|
||||
10. **Forgetting nullable_output_keys** - Mark input_keys that only arrive on certain edges (e.g., feedback) as nullable on the receiving node
|
||||
11. **Adding framework gating for LLM behavior** - Fix prompts or use judges, not ad-hoc code
|
||||
12. **Writing code before user approves the graph** - Always get approval on goal, nodes, and graph BEFORE writing any agent code
|
||||
13. **Wrong mcp_servers.json format** - Use flat format (no `"mcpServers"` wrapper), `cwd` must be `"../../tools"`, and `command` must be `"uv"` with args `["run", "python", ...]`
|
||||
|
||||
@@ -1,33 +1,8 @@
|
||||
"""Runtime configuration."""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _load_preferred_model() -> str:
|
||||
"""Load preferred model from ~/.hive/configuration.json."""
|
||||
config_path = Path.home() / ".hive" / "configuration.json"
|
||||
if config_path.exists():
|
||||
try:
|
||||
with open(config_path) as f:
|
||||
config = json.load(f)
|
||||
llm = config.get("llm", {})
|
||||
if llm.get("provider") and llm.get("model"):
|
||||
return f"{llm['provider']}/{llm['model']}"
|
||||
except Exception:
|
||||
pass
|
||||
return "anthropic/claude-sonnet-4-20250514"
|
||||
|
||||
|
||||
@dataclass
|
||||
class RuntimeConfig:
|
||||
model: str = field(default_factory=_load_preferred_model)
|
||||
temperature: float = 0.7
|
||||
max_tokens: int = 40000
|
||||
api_key: str | None = None
|
||||
api_base: str | None = None
|
||||
from dataclasses import dataclass
|
||||
|
||||
from framework.config import RuntimeConfig
|
||||
|
||||
default_config = RuntimeConfig()
|
||||
|
||||
@@ -41,6 +16,11 @@ class AgentMetadata:
|
||||
"multi-source search, quality evaluation, and synthesis - with TUI conversation "
|
||||
"at key checkpoints for user guidance and feedback."
|
||||
)
|
||||
intro_message: str = (
|
||||
"Hi! I'm your deep research assistant. Tell me a topic and I'll investigate it "
|
||||
"thoroughly — searching multiple sources, evaluating quality, and synthesizing "
|
||||
"a comprehensive report. What would you like me to research?"
|
||||
)
|
||||
|
||||
|
||||
metadata = AgentMetadata()
|
||||
|
||||
@@ -141,6 +141,12 @@ for f in ~/.zshrc ~/.bashrc ~/.profile; do [ -f "$f" ] && grep -q 'HIVE_CREDENTI
|
||||
- **In shell config but NOT in current session** — run `source ~/.zshrc` (or `~/.bashrc`) first, then proceed
|
||||
- **Not set anywhere** — `EncryptedFileStorage` will auto-generate one. After storing, tell the user to persist it: `export HIVE_CREDENTIAL_KEY="{generated_key}"` in their shell profile
|
||||
|
||||
> **⚠️ IMPORTANT: After adding `HIVE_CREDENTIAL_KEY` to the user's shell config, always display:**
|
||||
> ```
|
||||
> ⚠️ Environment variables were added to your shell config.
|
||||
> Open a NEW TERMINAL for them to take effect outside this session.
|
||||
> ```
|
||||
|
||||
#### Option 1: Aden Platform (OAuth)
|
||||
|
||||
This is the recommended flow for supported integrations (HubSpot, etc.).
|
||||
@@ -202,6 +208,12 @@ if success:
|
||||
print(f"Run: {source_cmd}")
|
||||
```
|
||||
|
||||
> **⚠️ IMPORTANT: After adding `ADEN_API_KEY` to the user's shell config, always display:**
|
||||
> ```
|
||||
> ⚠️ Environment variables were added to your shell config.
|
||||
> Open a NEW TERMINAL for them to take effect outside this session.
|
||||
> ```
|
||||
|
||||
Also save to `~/.hive/configuration.json` for the framework:
|
||||
|
||||
```python
|
||||
@@ -460,9 +472,14 @@ result: HealthCheckResult = check_credential_health("hubspot", token_value)
|
||||
The local encrypted store requires `HIVE_CREDENTIAL_KEY` to encrypt/decrypt credentials.
|
||||
|
||||
- If the user doesn't have one, `EncryptedFileStorage` will auto-generate one and log it
|
||||
- The user MUST persist this key (e.g., in `~/.bashrc` or a secrets manager)
|
||||
- The user MUST persist this key (e.g., in `~/.bashrc`/`~/.zshrc` or a secrets manager)
|
||||
- Without this key, stored credentials cannot be decrypted
|
||||
- This is the ONLY secret that should live in `~/.bashrc` or environment config
|
||||
|
||||
**Shell config rule:** Only TWO keys belong in shell config (`~/.zshrc`/`~/.bashrc`):
|
||||
- `HIVE_CREDENTIAL_KEY` — encryption key for the credential store
|
||||
- `ADEN_API_KEY` — Aden platform auth key (needed before the store can sync)
|
||||
|
||||
All other API keys (Brave, Google, HubSpot, etc.) must go in the encrypted store only. **Never offer to add them to shell config.**
|
||||
|
||||
If `HIVE_CREDENTIAL_KEY` is not set:
|
||||
|
||||
@@ -475,6 +492,7 @@ If `HIVE_CREDENTIAL_KEY` is not set:
|
||||
- **NEVER** log, print, or echo credential values in tool output
|
||||
- **NEVER** store credentials in plaintext files, git-tracked files, or agent configs
|
||||
- **NEVER** hardcode credentials in source code
|
||||
- **NEVER** offer to save API keys to shell config (`~/.zshrc`/`~/.bashrc`) — the **only** keys that belong in shell config are `HIVE_CREDENTIAL_KEY` and `ADEN_API_KEY`. All other credentials (Brave, Google, HubSpot, GitHub, Resend, etc.) go in the encrypted store only.
|
||||
- **ALWAYS** use `SecretStr` from Pydantic when handling credential values in Python
|
||||
- **ALWAYS** use the local encrypted store (`~/.hive/credentials`) for persistence
|
||||
- **ALWAYS** run health checks before storing credentials (when possible)
|
||||
@@ -601,18 +619,22 @@ All credentials are now configured:
|
||||
│ ✅ CREDENTIALS CONFIGURED │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ OPEN A NEW TERMINAL before running commands below. │
|
||||
│ Environment variables were saved to your shell config but │
|
||||
│ only take effect in new terminal sessions. │
|
||||
│ │
|
||||
│ NEXT STEPS: │
|
||||
│ │
|
||||
│ 1. RUN YOUR AGENT: │
|
||||
│ │
|
||||
│ PYTHONPATH=core:exports python -m research-agent tui │
|
||||
│ hive tui │
|
||||
│ │
|
||||
│ 2. IF YOU ENCOUNTER ISSUES, USE THE DEBUGGER: │
|
||||
│ │
|
||||
│ /hive-debugger │
|
||||
│ │
|
||||
│ The debugger analyzes runtime logs, identifies retry loops, tool │
|
||||
│ failures, stalled execution, and provides actionable fix suggestions. │
|
||||
│ failures, stalled execution, and provides actionable fix suggestions. │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
@@ -47,7 +47,7 @@ Before using this skill, ensure:
|
||||
**What to do:**
|
||||
|
||||
1. **Ask the developer which agent needs debugging:**
|
||||
- Get agent name (e.g., "twitter_outreach", "deep_research_agent")
|
||||
- Get agent name (e.g., "deep_research_agent", "email_outreach_agent")
|
||||
- Confirm the agent exists in `exports/{agent_name}/`
|
||||
|
||||
2. **Determine agent working directory:**
|
||||
@@ -66,7 +66,7 @@ Before using this skill, ensure:
|
||||
|
||||
4. **Store context for the debugging session:**
|
||||
- agent_name
|
||||
- agent_work_dir (e.g., `/home/user/.hive/twitter_outreach`)
|
||||
- agent_work_dir (e.g., `/home/user/.hive/deep_research_agent`)
|
||||
- goal_id
|
||||
- success_criteria
|
||||
- constraints
|
||||
@@ -74,19 +74,19 @@ Before using this skill, ensure:
|
||||
|
||||
**Example:**
|
||||
```
|
||||
Developer: "My twitter_outreach agent keeps failing"
|
||||
Developer: "My deep_research_agent keeps failing"
|
||||
|
||||
You: "I'll help debug the twitter_outreach agent. Let me gather context..."
|
||||
You: "I'll help debug the deep_research_agent. Let me gather context..."
|
||||
|
||||
[Read exports/twitter_outreach/agent.json]
|
||||
[Read exports/deep_research_agent/agent.json]
|
||||
|
||||
Context gathered:
|
||||
- Agent: twitter_outreach
|
||||
- Goal: twitter-outreach-multi-loop
|
||||
- Working Directory: /home/user/.hive/twitter_outreach
|
||||
- Success Criteria: ["Successfully send 5 personalized outreach messages"]
|
||||
- Constraints: ["Must verify handle exists", "Must personalize message"]
|
||||
- Nodes: ["intake-collector", "profile-analyzer", "message-composer", "outreach-sender"]
|
||||
- Agent: deep_research_agent
|
||||
- Goal: deep-research
|
||||
- Working Directory: /home/user/.hive/deep_research_agent
|
||||
- Success Criteria: ["Produce a comprehensive research report with cited sources"]
|
||||
- Constraints: ["Must cite all sources", "Must cover multiple perspectives"]
|
||||
- Nodes: ["intake", "research", "analysis", "report-writer"]
|
||||
```
|
||||
|
||||
---
|
||||
@@ -224,7 +224,7 @@ Which run would you like to investigate?
|
||||
```
|
||||
Diagnosis for session_20260206_115718_e22339c5:
|
||||
|
||||
Problem Node: intake-collector
|
||||
Problem Node: research
|
||||
├─ Exit Status: escalate
|
||||
├─ Retry Count: 5 (HIGH)
|
||||
├─ Verdict Counts: {RETRY: 5, ESCALATE: 1}
|
||||
@@ -232,7 +232,7 @@ Problem Node: intake-collector
|
||||
├─ Total Steps: 8
|
||||
└─ Categories: Missing Outputs + Retry Loops
|
||||
|
||||
Root Issue: The intake-collector node is stuck in a retry loop because it's not setting required outputs.
|
||||
Root Issue: The research node is stuck in a retry loop because it's not setting required outputs.
|
||||
```
|
||||
|
||||
---
|
||||
@@ -293,25 +293,25 @@ Root Issue: The intake-collector node is stuck in a retry loop because it's not
|
||||
|
||||
**Example Output:**
|
||||
```
|
||||
Root Cause Analysis for intake-collector:
|
||||
Root Cause Analysis for research:
|
||||
|
||||
Step-by-step breakdown:
|
||||
|
||||
Step 3:
|
||||
- Tool Call: web_search(query="@RomuloNevesOf")
|
||||
- Result: Found Twitter profile information
|
||||
- Tool Call: web_search(query="latest AI regulations 2026")
|
||||
- Result: Found relevant articles and sources
|
||||
- Verdict: RETRY
|
||||
- Feedback: "Missing required output 'twitter_handles'. You found the handle but didn't call set_output."
|
||||
- Feedback: "Missing required output 'research_findings'. You found sources but didn't call set_output."
|
||||
|
||||
Step 4:
|
||||
- Tool Call: web_search(query="@RomuloNevesOf twitter")
|
||||
- Result: Found additional Twitter information
|
||||
- Tool Call: web_search(query="AI regulation policy 2026")
|
||||
- Result: Found additional policy information
|
||||
- Verdict: RETRY
|
||||
- Feedback: "Still missing 'twitter_handles'. Use set_output to save your findings."
|
||||
- Feedback: "Still missing 'research_findings'. Use set_output to save your findings."
|
||||
|
||||
Steps 5-7: Similar pattern continues...
|
||||
|
||||
ROOT CAUSE: The node is successfully finding Twitter handles via web_search, but the LLM is not calling set_output to save the results. It keeps searching for more information instead of completing the task.
|
||||
ROOT CAUSE: The node is successfully finding research sources via web_search, but the LLM is not calling set_output to save the results. It keeps searching for more information instead of completing the task.
|
||||
```
|
||||
|
||||
---
|
||||
@@ -495,11 +495,114 @@ max_node_visits=3 # Prevent getting stuck
|
||||
- Confirm it calls set_output eventually
|
||||
```
|
||||
|
||||
#### Template 6: Checkpoint Recovery (Post-Fix Resume)
|
||||
|
||||
```markdown
|
||||
## Recovery Strategy: Resume from Last Clean Checkpoint
|
||||
|
||||
**Situation:** You've fixed the issue, but the failed session is stuck mid-execution
|
||||
|
||||
**Solution:** Resume execution from a checkpoint before the failure
|
||||
|
||||
### Option A: Auto-Resume from Latest Checkpoint (Recommended)
|
||||
|
||||
Use CLI arguments to auto-resume when launching TUI:
|
||||
|
||||
```bash
|
||||
PYTHONPATH=core:exports python -m {agent_name} --tui \
|
||||
--resume-session {session_id}
|
||||
```
|
||||
|
||||
This will:
|
||||
- Load session state from `state.json`
|
||||
- Continue from where it paused/failed
|
||||
- Apply your fixes immediately
|
||||
|
||||
### Option B: Resume from Specific Checkpoint (Time-Travel)
|
||||
|
||||
If you need to go back to an earlier point:
|
||||
|
||||
```bash
|
||||
PYTHONPATH=core:exports python -m {agent_name} --tui \
|
||||
--resume-session {session_id} \
|
||||
--checkpoint {checkpoint_id}
|
||||
```
|
||||
|
||||
Example:
|
||||
```bash
|
||||
PYTHONPATH=core:exports python -m deep_research_agent --tui \
|
||||
--resume-session session_20260208_143022_abc12345 \
|
||||
--checkpoint cp_node_complete_intake_143030
|
||||
```
|
||||
|
||||
### Option C: Use TUI Commands
|
||||
|
||||
Alternatively, launch TUI normally and use commands:
|
||||
|
||||
```bash
|
||||
# Launch TUI
|
||||
PYTHONPATH=core:exports python -m {agent_name} --tui
|
||||
|
||||
# In TUI, use commands:
|
||||
/resume {session_id} # Resume from session state
|
||||
/recover {session_id} {checkpoint_id} # Recover from specific checkpoint
|
||||
```
|
||||
|
||||
### When to Use Each Option:
|
||||
|
||||
**Use `/resume` (or --resume-session) when:**
|
||||
- You fixed credentials and want to retry
|
||||
- Agent paused and you want to continue
|
||||
- Agent failed and you want to retry from last state
|
||||
|
||||
**Use `/recover` (or --resume-session + --checkpoint) when:**
|
||||
- You need to go back to an earlier checkpoint
|
||||
- You want to try a different path from a specific point
|
||||
- Debugging requires time-travel to earlier state
|
||||
|
||||
### Find Available Checkpoints:
|
||||
|
||||
Use MCP tools to programmatically find and inspect checkpoints:
|
||||
|
||||
```
|
||||
# List all sessions to find the failed one
|
||||
list_agent_sessions(agent_work_dir="~/.hive/agents/{agent_name}", status="failed")
|
||||
|
||||
# Inspect session state
|
||||
get_agent_session_state(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}")
|
||||
|
||||
# Find clean checkpoints to resume from
|
||||
list_agent_checkpoints(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}", is_clean="true")
|
||||
|
||||
# Compare checkpoints to understand what changed
|
||||
compare_agent_checkpoints(
|
||||
agent_work_dir="~/.hive/agents/{agent_name}",
|
||||
session_id="{session_id}",
|
||||
checkpoint_id_before="cp_node_complete_intake_143030",
|
||||
checkpoint_id_after="cp_node_complete_research_143115"
|
||||
)
|
||||
|
||||
# Inspect memory at a specific checkpoint
|
||||
get_agent_checkpoint(agent_work_dir="~/.hive/agents/{agent_name}", session_id="{session_id}", checkpoint_id="cp_node_complete_intake_143030")
|
||||
```
|
||||
|
||||
Or in TUI:
|
||||
```bash
|
||||
/sessions {session_id}
|
||||
```
|
||||
|
||||
**Verification:**
|
||||
- Use `--resume-session` to test your fix immediately
|
||||
- No need to re-run from the beginning
|
||||
- Session continues with your code changes applied
|
||||
```
|
||||
|
||||
**Selecting the right template:**
|
||||
- Match the issue category from Stage 4
|
||||
- Customize with specific details from Stage 5
|
||||
- Include actual error messages and code snippets
|
||||
- Provide file paths and line numbers when possible
|
||||
- **Always include recovery commands** (Template 6) after providing fix recommendations
|
||||
|
||||
---
|
||||
|
||||
@@ -568,7 +671,7 @@ max_node_visits=3 # Prevent getting stuck
|
||||
|
||||
**Example interaction:**
|
||||
```
|
||||
Developer: "I applied the fix to intake-collector. How do I verify it works?"
|
||||
Developer: "I applied the fix to research. How do I verify it works?"
|
||||
|
||||
You: "Great! Let's verify the fix with these steps:
|
||||
|
||||
@@ -580,11 +683,11 @@ You: "Great! Let's verify the fix with these steps:
|
||||
[Use query_runtime_logs to check for attention flags]
|
||||
|
||||
3. Verify the specific node:
|
||||
[Use query_runtime_log_details for intake-collector]
|
||||
[Use query_runtime_log_details for research]
|
||||
|
||||
Expected results:
|
||||
- No 'needs_attention' flags
|
||||
- intake-collector shows exit_status='success'
|
||||
- research shows exit_status='success'
|
||||
- retry_count should be 0
|
||||
|
||||
Let me know when you've run it and I'll help check the logs!"
|
||||
@@ -602,7 +705,7 @@ Let me know when you've run it and I'll help check the logs!"
|
||||
- **Example:**
|
||||
```
|
||||
query_runtime_logs(
|
||||
agent_work_dir="/home/user/.hive/twitter_outreach",
|
||||
agent_work_dir="/home/user/.hive/deep_research_agent",
|
||||
status="needs_attention",
|
||||
limit=20
|
||||
)
|
||||
@@ -614,7 +717,7 @@ Let me know when you've run it and I'll help check the logs!"
|
||||
- **Example:**
|
||||
```
|
||||
query_runtime_log_details(
|
||||
agent_work_dir="/home/user/.hive/twitter_outreach",
|
||||
agent_work_dir="/home/user/.hive/deep_research_agent",
|
||||
run_id="session_20260206_115718_e22339c5",
|
||||
needs_attention_only=True
|
||||
)
|
||||
@@ -626,9 +729,83 @@ Let me know when you've run it and I'll help check the logs!"
|
||||
- **Example:**
|
||||
```
|
||||
query_runtime_log_raw(
|
||||
agent_work_dir="/home/user/.hive/twitter_outreach",
|
||||
agent_work_dir="/home/user/.hive/deep_research_agent",
|
||||
run_id="session_20260206_115718_e22339c5",
|
||||
node_id="intake-collector"
|
||||
node_id="research"
|
||||
)
|
||||
```
|
||||
|
||||
### Session & Checkpoint Tools
|
||||
|
||||
**list_agent_sessions** - Browse sessions with filtering
|
||||
- **When to use:** Finding resumable sessions, identifying failed sessions, Stage 3 triage
|
||||
- **Returns:** Session list with status, timestamps, is_resumable, current_node, quality
|
||||
- **Example:**
|
||||
```
|
||||
list_agent_sessions(
|
||||
agent_work_dir="/home/user/.hive/agents/twitter_outreach",
|
||||
status="failed",
|
||||
limit=10
|
||||
)
|
||||
```
|
||||
|
||||
**get_agent_session_state** - Load full session state (excludes memory values)
|
||||
- **When to use:** Inspecting session progress, checking is_resumable, examining path
|
||||
- **Returns:** Full state with memory_keys/memory_size instead of memory values
|
||||
- **Example:**
|
||||
```
|
||||
get_agent_session_state(
|
||||
agent_work_dir="/home/user/.hive/agents/twitter_outreach",
|
||||
session_id="session_20260208_143022_abc12345"
|
||||
)
|
||||
```
|
||||
|
||||
**get_agent_session_memory** - Get memory contents from a session
|
||||
- **When to use:** Stage 5 root cause analysis, inspecting produced data
|
||||
- **Returns:** All memory keys+values, or a single key's value
|
||||
- **Example:**
|
||||
```
|
||||
get_agent_session_memory(
|
||||
agent_work_dir="/home/user/.hive/agents/twitter_outreach",
|
||||
session_id="session_20260208_143022_abc12345",
|
||||
key="twitter_handles"
|
||||
)
|
||||
```
|
||||
|
||||
**list_agent_checkpoints** - List checkpoints for a session
|
||||
- **When to use:** Stage 6 recovery, finding clean checkpoints to resume from
|
||||
- **Returns:** Checkpoint summaries with type, node, clean status
|
||||
- **Example:**
|
||||
```
|
||||
list_agent_checkpoints(
|
||||
agent_work_dir="/home/user/.hive/agents/twitter_outreach",
|
||||
session_id="session_20260208_143022_abc12345",
|
||||
is_clean="true"
|
||||
)
|
||||
```
|
||||
|
||||
**get_agent_checkpoint** - Load a specific checkpoint with full state
|
||||
- **When to use:** Inspecting exact state at a checkpoint, comparing to current state
|
||||
- **Returns:** Full checkpoint: memory snapshot, execution path, metrics
|
||||
- **Example:**
|
||||
```
|
||||
get_agent_checkpoint(
|
||||
agent_work_dir="/home/user/.hive/agents/twitter_outreach",
|
||||
session_id="session_20260208_143022_abc12345",
|
||||
checkpoint_id="cp_node_complete_intake_143030"
|
||||
)
|
||||
```
|
||||
|
||||
**compare_agent_checkpoints** - Diff memory between two checkpoints
|
||||
- **When to use:** Understanding data flow, finding where state diverged
|
||||
- **Returns:** Memory diff (added/removed/changed keys) + execution path diff
|
||||
- **Example:**
|
||||
```
|
||||
compare_agent_checkpoints(
|
||||
agent_work_dir="/home/user/.hive/agents/twitter_outreach",
|
||||
session_id="session_20260208_143022_abc12345",
|
||||
checkpoint_id_before="cp_node_complete_intake_143030",
|
||||
checkpoint_id_after="cp_node_complete_research_143115"
|
||||
)
|
||||
```
|
||||
|
||||
@@ -654,27 +831,37 @@ Loop every 10 seconds:
|
||||
2. If found: Alert and drill into L2
|
||||
```
|
||||
|
||||
**Pattern 4: Session State + Checkpoint Recovery**
|
||||
```
|
||||
1. list_agent_sessions: Find failed/paused sessions
|
||||
2. get_agent_session_state: Check is_resumable, see execution path
|
||||
3. get_agent_session_memory: Inspect what data was produced
|
||||
4. list_agent_checkpoints: Find clean checkpoints before failure
|
||||
5. compare_agent_checkpoints: Understand what changed between checkpoints
|
||||
6. Recommend resume command with specific checkpoint
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Complete Example Walkthrough
|
||||
|
||||
### Scenario: twitter_outreach Agent Failing
|
||||
### Scenario: deep_research_agent Agent Failing
|
||||
|
||||
**Developer says:** "My twitter_outreach agent keeps failing. It's not finding Twitter handles correctly."
|
||||
**Developer says:** "My deep_research_agent keeps failing. It's not saving research findings correctly."
|
||||
|
||||
#### Stage 1: Setup
|
||||
```
|
||||
You: "I'll help debug the twitter_outreach agent. Let me gather context..."
|
||||
You: "I'll help debug the deep_research_agent. Let me gather context..."
|
||||
|
||||
[Read exports/twitter_outreach/agent.json]
|
||||
[Read exports/deep_research_agent/agent.json]
|
||||
|
||||
Context:
|
||||
- Agent: twitter_outreach
|
||||
- Goal: twitter-outreach-multi-loop
|
||||
- Working Dir: ~/.hive/agents/twitter_outreach
|
||||
- Agent: deep_research_agent
|
||||
- Goal: deep-research
|
||||
- Working Dir: ~/.hive/agents/deep_research_agent
|
||||
- Success Criteria: ["Produce a comprehensive research report with cited sources"]
|
||||
- Constraints: ["Must cite all sources", "Must cover multiple perspectives"]
|
||||
- Nodes: intake-collector, profile-analyzer, message-composer, outreach-sender
|
||||
- Nodes: intake, research, analysis, report-writer
|
||||
```
|
||||
|
||||
#### Stage 2: Mode Selection
|
||||
@@ -706,31 +893,31 @@ You: "Analyzing node-level details..."
|
||||
[Call query_runtime_log_details for the session]
|
||||
|
||||
Results:
|
||||
- Problem Node: intake-collector
|
||||
- Problem Node: research
|
||||
- Exit Status: escalate
|
||||
- Retry Count: 5
|
||||
- Verdict Counts: {RETRY: 5, ESCALATE: 1}
|
||||
- Attention Reasons: ["high_retry_count", "missing_outputs"]
|
||||
- Total Steps: 8
|
||||
|
||||
Diagnosis: intake-collector has retry loop + missing outputs issue
|
||||
Diagnosis: research has retry loop + missing outputs issue
|
||||
Category: Missing Outputs + Retry Loops
|
||||
```
|
||||
|
||||
#### Stage 5: Root Cause (L3)
|
||||
```
|
||||
You: "Drilling into detailed logs for intake-collector..."
|
||||
You: "Drilling into detailed logs for research..."
|
||||
|
||||
[Call query_runtime_log_raw for node intake-collector]
|
||||
[Call query_runtime_log_raw for node research]
|
||||
|
||||
Evidence:
|
||||
Step 3: web_search("@RomuloNevesOf") → Found profile
|
||||
Step 3: web_search("latest AI regulations 2026") → Found articles
|
||||
Verdict: RETRY
|
||||
Feedback: "Missing required output 'twitter_handles'. You found the handle but didn't call set_output."
|
||||
Feedback: "Missing required output 'research_findings'. You found sources but didn't call set_output."
|
||||
|
||||
Step 4: web_search("@RomuloNevesOf twitter") → Found more info
|
||||
Step 4: web_search("AI regulation policy 2026") → Found more info
|
||||
Verdict: RETRY
|
||||
Feedback: "Still missing 'twitter_handles'. Use set_output to save your findings."
|
||||
Feedback: "Still missing 'research_findings'. Use set_output to save your findings."
|
||||
|
||||
Steps 5-7: Same pattern repeats...
|
||||
|
||||
@@ -741,37 +928,37 @@ ROOT CAUSE: Node successfully finds handles but LLM doesn't call set_output. It
|
||||
```
|
||||
You: "Here's the fix for this issue:
|
||||
|
||||
## Issue: intake-collector Not Setting Outputs
|
||||
## Issue: research Node Not Setting Outputs
|
||||
|
||||
**Root Cause:** Node successfully finds Twitter handles but doesn't call set_output
|
||||
**Root Cause:** Node successfully finds research sources but doesn't call set_output
|
||||
|
||||
**Category:** Missing Outputs
|
||||
|
||||
**File to edit:** exports/twitter_outreach/nodes/intake_collector.py
|
||||
**File to edit:** exports/deep_research_agent/nodes/research.py
|
||||
|
||||
**Changes:**
|
||||
Update the system_prompt to emphasize the output requirement:
|
||||
|
||||
```python
|
||||
system_prompt = """
|
||||
You are the Intake Collector for the Twitter outreach agent.
|
||||
You are the Research node for the deep research agent.
|
||||
|
||||
Your task:
|
||||
1. Extract Twitter handle from user input (URL or @mention)
|
||||
2. Verify it's a valid Twitter handle format
|
||||
3. **CRITICAL: Call set_output(twitter_handles=[handle]) immediately after finding valid handle**
|
||||
1. Search for relevant sources on the research topic
|
||||
2. Extract key findings from each source
|
||||
3. **CRITICAL: Call set_output(research_findings=[...]) immediately after gathering sufficient findings**
|
||||
|
||||
IMPORTANT: You MUST call set_output even if you found the handle on the first try.
|
||||
Do not continue searching once you have a valid handle - set output and finish.
|
||||
IMPORTANT: You MUST call set_output even if you found findings on the first search.
|
||||
Do not continue searching indefinitely - set output and finish once you have enough data.
|
||||
|
||||
Output format:
|
||||
{
|
||||
"twitter_handles": ["@handle1", "@handle2", ...]
|
||||
"research_findings": [{"source": "...", "summary": "...", "url": "..."}, ...]
|
||||
}
|
||||
"""
|
||||
```
|
||||
|
||||
**Why this works:** The explicit instruction to call set_output immediately prevents the LLM from continuing to search after finding the handle.
|
||||
**Why this works:** The explicit instruction to call set_output immediately prevents the LLM from continuing to search after finding sufficient sources.
|
||||
```
|
||||
|
||||
#### Stage 7: Verification
|
||||
@@ -795,13 +982,13 @@ You: "Checking now..."
|
||||
[Call query_runtime_log_details for the new run]
|
||||
|
||||
Results:
|
||||
- Node: intake-collector
|
||||
- Node: research
|
||||
- Exit Status: success ✓
|
||||
- Retry Count: 0 ✓
|
||||
- Total Steps: 2 ✓
|
||||
- No attention flags ✓
|
||||
|
||||
"Perfect! The fix worked. The intake-collector now:
|
||||
"Perfect! The fix worked. The research now:
|
||||
- Completes successfully on first try
|
||||
- No retries needed
|
||||
- Calls set_output properly
|
||||
|
||||
+719
-973
File diff suppressed because it is too large
Load Diff
@@ -1,351 +1,333 @@
|
||||
# Example: Testing a YouTube Research Agent
|
||||
# Example: Iterative Testing of a Research Agent
|
||||
|
||||
This example walks through testing a YouTube research agent that finds relevant videos based on a topic.
|
||||
This example walks through the full iterative test loop for a research agent that searches the web, reviews findings, and produces a cited report.
|
||||
|
||||
## Prerequisites
|
||||
## Agent Structure
|
||||
|
||||
- Agent built with hive-create skill at `exports/youtube-research/`
|
||||
- Goal defined with success criteria and constraints
|
||||
|
||||
## Step 1: Load the Goal
|
||||
|
||||
First, load the goal that was defined during the Goal stage:
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "youtube-research",
|
||||
"name": "YouTube Research Agent",
|
||||
"description": "Find relevant YouTube videos on a given topic",
|
||||
"success_criteria": [
|
||||
{
|
||||
"id": "find_videos",
|
||||
"description": "Find 3-5 relevant videos",
|
||||
"metric": "video_count",
|
||||
"target": "3-5",
|
||||
"weight": 1.0
|
||||
},
|
||||
{
|
||||
"id": "relevance",
|
||||
"description": "Videos must be relevant to the topic",
|
||||
"metric": "relevance_score",
|
||||
"target": ">0.8",
|
||||
"weight": 0.8
|
||||
}
|
||||
],
|
||||
"constraints": [
|
||||
{
|
||||
"id": "api_limits",
|
||||
"description": "Must not exceed YouTube API rate limits",
|
||||
"constraint_type": "hard",
|
||||
"category": "technical"
|
||||
},
|
||||
{
|
||||
"id": "content_safety",
|
||||
"description": "Must filter out inappropriate content",
|
||||
"constraint_type": "hard",
|
||||
"category": "safety"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
exports/deep_research_agent/
|
||||
├── agent.py # Goal + graph: intake → research → review → report
|
||||
├── nodes/__init__.py # Node definitions (system_prompt, input/output keys)
|
||||
├── config.py # Model config
|
||||
├── mcp_servers.json # Tools: web_search, web_scrape
|
||||
└── tests/ # Test files (we'll create these)
|
||||
```
|
||||
|
||||
## Step 2: Get Constraint Test Guidelines
|
||||
**Goal:** "Rigorous Interactive Research" — find 5+ diverse sources, cite every claim, produce a complete report.
|
||||
|
||||
During the Goal stage (or early Eval), get test guidelines for constraints:
|
||||
---
|
||||
|
||||
## Phase 1: Generate Tests
|
||||
|
||||
### Read the goal
|
||||
|
||||
```python
|
||||
result = generate_constraint_tests(
|
||||
goal_id="youtube-research",
|
||||
goal_json='<goal JSON above>',
|
||||
agent_path="exports/youtube-research"
|
||||
)
|
||||
Read(file_path="exports/deep_research_agent/agent.py")
|
||||
# Extract: goal_id="rigorous-interactive-research"
|
||||
# success_criteria: source-diversity (>=5), citation-coverage (100%), report-completeness (90%)
|
||||
# constraints: no-hallucination, source-attribution
|
||||
```
|
||||
|
||||
**The result contains guidelines (not generated tests):**
|
||||
- `output_file`: Where to write tests
|
||||
- `file_header`: Imports and fixtures to use
|
||||
- `test_template`: Format for test functions
|
||||
- `constraints_formatted`: The constraints to test
|
||||
- `test_guidelines`: Rules for writing tests
|
||||
|
||||
## Step 3: Write Constraint Tests
|
||||
|
||||
Using the guidelines, write tests directly with the Write tool:
|
||||
|
||||
```python
|
||||
# Write constraint tests using the provided file_header and guidelines
|
||||
Write(
|
||||
file_path="exports/youtube-research/tests/test_constraints.py",
|
||||
content='''
|
||||
"""Constraint tests for youtube-research agent."""
|
||||
|
||||
import os
|
||||
import pytest
|
||||
from exports.youtube_research import default_agent
|
||||
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not os.environ.get("ANTHROPIC_API_KEY") and not os.environ.get("MOCK_MODE"),
|
||||
reason="API key required for real testing."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_constraint_api_limits_respected():
|
||||
"""Verify API rate limits are not exceeded."""
|
||||
import time
|
||||
mock_mode = bool(os.environ.get("MOCK_MODE"))
|
||||
|
||||
for i in range(10):
|
||||
result = await default_agent.run({"topic": f"test_{i}"}, mock_mode=mock_mode)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Should complete without rate limit errors
|
||||
assert "rate limit" not in str(result).lower()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_constraint_content_safety_filter():
|
||||
"""Verify inappropriate content is filtered."""
|
||||
mock_mode = bool(os.environ.get("MOCK_MODE"))
|
||||
result = await default_agent.run({"topic": "general topic"}, mock_mode=mock_mode)
|
||||
|
||||
for video in result.videos:
|
||||
assert video.safe_for_work is True
|
||||
assert video.age_restricted is False
|
||||
'''
|
||||
)
|
||||
```
|
||||
|
||||
## Step 4: Get Success Criteria Test Guidelines
|
||||
|
||||
After the agent is built, get success criteria test guidelines:
|
||||
### Get test guidelines
|
||||
|
||||
```python
|
||||
result = generate_success_tests(
|
||||
goal_id="youtube-research",
|
||||
goal_json='<goal JSON>',
|
||||
node_names="search_node,filter_node,rank_node,format_node",
|
||||
tool_names="youtube_search,video_details,channel_info",
|
||||
agent_path="exports/youtube-research"
|
||||
goal_id="rigorous-interactive-research",
|
||||
goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "description": "Use multiple diverse sources", "target": ">=5"}, {"id": "citation-coverage", "description": "Every claim cites its source", "target": "100%"}, {"id": "report-completeness", "description": "Report answers the research questions", "target": "90%"}]}',
|
||||
node_names="intake,research,review,report",
|
||||
tool_names="web_search,web_scrape",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
```
|
||||
|
||||
## Step 5: Write Success Criteria Tests
|
||||
|
||||
Using the guidelines, write success criteria tests:
|
||||
### Write tests
|
||||
|
||||
```python
|
||||
Write(
|
||||
file_path="exports/youtube-research/tests/test_success_criteria.py",
|
||||
content='''
|
||||
"""Success criteria tests for youtube-research agent."""
|
||||
|
||||
import os
|
||||
import pytest
|
||||
from exports.youtube_research import default_agent
|
||||
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not os.environ.get("ANTHROPIC_API_KEY") and not os.environ.get("MOCK_MODE"),
|
||||
reason="API key required for real testing."
|
||||
)
|
||||
|
||||
file_path="exports/deep_research_agent/tests/test_success_criteria.py",
|
||||
content=result["file_header"] + '''
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_find_videos_happy_path():
|
||||
"""Test finding videos for a common topic."""
|
||||
mock_mode = bool(os.environ.get("MOCK_MODE"))
|
||||
result = await default_agent.run({"topic": "machine learning"}, mock_mode=mock_mode)
|
||||
|
||||
assert result.success
|
||||
assert 3 <= len(result.videos) <= 5
|
||||
assert all(v.title for v in result.videos)
|
||||
assert all(v.video_id for v in result.videos)
|
||||
|
||||
async def test_success_source_diversity(runner, auto_responder, mock_mode):
|
||||
"""At least 5 diverse sources are found."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": "impact of remote work on productivity"})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
assert result.success, f"Agent failed: {result.error}"
|
||||
output = result.output or {}
|
||||
sources = output.get("sources", [])
|
||||
if isinstance(sources, list):
|
||||
assert len(sources) >= 5, f"Expected >= 5 sources, got {len(sources)}"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_find_videos_minimum_boundary():
|
||||
"""Test at minimum threshold (3 videos)."""
|
||||
mock_mode = bool(os.environ.get("MOCK_MODE"))
|
||||
result = await default_agent.run({"topic": "niche topic xyz"}, mock_mode=mock_mode)
|
||||
|
||||
assert len(result.videos) >= 3
|
||||
|
||||
async def test_success_citation_coverage(runner, auto_responder, mock_mode):
|
||||
"""Every factual claim in the report cites its source."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": "climate change effects on agriculture"})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
assert result.success, f"Agent failed: {result.error}"
|
||||
output = result.output or {}
|
||||
report = output.get("report", "")
|
||||
# Check that report contains numbered references
|
||||
assert "[1]" in str(report) or "[source" in str(report).lower(), "Report lacks citations"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_relevance_score_threshold():
|
||||
"""Test relevance scoring meets threshold."""
|
||||
mock_mode = bool(os.environ.get("MOCK_MODE"))
|
||||
result = await default_agent.run({"topic": "python programming"}, mock_mode=mock_mode)
|
||||
|
||||
for video in result.videos:
|
||||
assert video.relevance_score > 0.8
|
||||
|
||||
async def test_success_report_completeness(runner, auto_responder, mock_mode):
|
||||
"""Report addresses the original research question."""
|
||||
query = "pros and cons of nuclear energy"
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": query})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
assert result.success, f"Agent failed: {result.error}"
|
||||
output = result.output or {}
|
||||
report = output.get("report", "")
|
||||
assert len(str(report)) > 200, f"Report too short: {len(str(report))} chars"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_find_videos_no_results_graceful():
|
||||
"""Test graceful handling of no results."""
|
||||
mock_mode = bool(os.environ.get("MOCK_MODE"))
|
||||
result = await default_agent.run({"topic": "xyznonexistent123"}, mock_mode=mock_mode)
|
||||
async def test_empty_query_handling(runner, auto_responder, mock_mode):
|
||||
"""Agent handles empty input gracefully."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": ""})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
output = result.output or {}
|
||||
assert not result.success or output.get("error"), "Should handle empty query"
|
||||
|
||||
# Should not crash, return empty or message
|
||||
assert result.videos == [] or result.message
|
||||
@pytest.mark.asyncio
|
||||
async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
|
||||
"""Feedback loop between review and research terminates."""
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({"query": "quantum computing basics"})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
visits = result.node_visit_counts or {}
|
||||
for node_id, count in visits.items():
|
||||
assert count <= 5, f"Node {node_id} visited {count} times"
|
||||
'''
|
||||
)
|
||||
```
|
||||
|
||||
## Step 6: Run All Tests
|
||||
---
|
||||
|
||||
Execute all tests:
|
||||
## Phase 2: First Execution
|
||||
|
||||
```python
|
||||
result = run_tests(
|
||||
goal_id="youtube-research",
|
||||
agent_path="exports/youtube-research",
|
||||
test_types='["all"]',
|
||||
parallel=4
|
||||
run_tests(
|
||||
goal_id="rigorous-interactive-research",
|
||||
agent_path="exports/deep_research_agent",
|
||||
fail_fast=True
|
||||
)
|
||||
```
|
||||
|
||||
**Results:**
|
||||
|
||||
**Result:**
|
||||
```json
|
||||
{
|
||||
"goal_id": "youtube-research",
|
||||
"overall_passed": false,
|
||||
"summary": {
|
||||
"total": 6,
|
||||
"passed": 5,
|
||||
"failed": 1,
|
||||
"pass_rate": "83.3%"
|
||||
},
|
||||
"duration_ms": 4521,
|
||||
"results": [
|
||||
{"test_id": "test_constraint_api_001", "passed": true, "duration_ms": 1234},
|
||||
{"test_id": "test_constraint_content_001", "passed": true, "duration_ms": 456},
|
||||
{"test_id": "test_success_001", "passed": true, "duration_ms": 789},
|
||||
{"test_id": "test_success_002", "passed": true, "duration_ms": 654},
|
||||
{"test_id": "test_success_003", "passed": true, "duration_ms": 543},
|
||||
{"test_id": "test_success_004", "passed": false, "duration_ms": 845,
|
||||
"error_category": "IMPLEMENTATION_ERROR",
|
||||
"error_message": "TypeError: 'NoneType' object has no attribute 'videos'"}
|
||||
]
|
||||
"overall_passed": false,
|
||||
"summary": {"total": 5, "passed": 3, "failed": 2, "pass_rate": "60.0%"},
|
||||
"failures": [
|
||||
{"test_name": "test_success_source_diversity", "details": "AssertionError: Expected >= 5 sources, got 2"},
|
||||
{"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Step 7: Debug the Failed Test
|
||||
---
|
||||
|
||||
## Phase 3: Analyze (Iteration 1)
|
||||
|
||||
### Debug the first failure
|
||||
|
||||
```python
|
||||
result = debug_test(
|
||||
goal_id="youtube-research",
|
||||
test_name="test_find_videos_no_results_graceful",
|
||||
agent_path="exports/youtube-research"
|
||||
debug_test(
|
||||
goal_id="rigorous-interactive-research",
|
||||
test_name="test_success_source_diversity",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
# Category: ASSERTION_FAILURE — Expected >= 5 sources, got 2
|
||||
```
|
||||
|
||||
### Find the session and inspect memory
|
||||
|
||||
```python
|
||||
list_agent_sessions(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
status="completed",
|
||||
limit=1
|
||||
)
|
||||
# → session_20260209_150000_abc12345
|
||||
|
||||
get_agent_session_memory(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
session_id="session_20260209_150000_abc12345",
|
||||
key="research_results"
|
||||
)
|
||||
# → Only 2 sources found. LLM stopped searching after 2 queries.
|
||||
```
|
||||
|
||||
### Check LLM behavior in the research node
|
||||
|
||||
```python
|
||||
query_runtime_log_raw(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
run_id="session_20260209_150000_abc12345",
|
||||
node_id="research"
|
||||
)
|
||||
# → LLM called web_search twice, got results, immediately called set_output.
|
||||
# → Prompt doesn't instruct it to find at least 5 sources.
|
||||
```
|
||||
|
||||
**Root cause:** The research node's system_prompt doesn't specify minimum source requirements.
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: Fix (Iteration 1)
|
||||
|
||||
```python
|
||||
Read(file_path="exports/deep_research_agent/nodes/__init__.py")
|
||||
|
||||
# Fix the research node prompt
|
||||
Edit(
|
||||
file_path="exports/deep_research_agent/nodes/__init__.py",
|
||||
old_string='system_prompt="Search for information on the user\'s topic using web search."',
|
||||
new_string='system_prompt="Search for information on the user\'s topic using web search. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries with varied keywords. Do NOT call set_output until you have gathered at least 5 distinct sources from different domains."'
|
||||
)
|
||||
```
|
||||
|
||||
**Debug Output:**
|
||||
---
|
||||
|
||||
## Phase 5: Recover & Resume (Iteration 1)
|
||||
|
||||
The fix is to the `research` node. Since this was a `run_tests` execution (no checkpoints), we re-run from scratch:
|
||||
|
||||
```python
|
||||
run_tests(
|
||||
goal_id="rigorous-interactive-research",
|
||||
agent_path="exports/deep_research_agent",
|
||||
fail_fast=True
|
||||
)
|
||||
```
|
||||
|
||||
**Result:**
|
||||
```json
|
||||
{
|
||||
"test_id": "test_success_004",
|
||||
"test_name": "test_find_videos_no_results_graceful",
|
||||
"input": {"topic": "xyznonexistent123"},
|
||||
"expected": "Empty list or message",
|
||||
"actual": {"error": "TypeError: 'NoneType' object has no attribute 'videos'"},
|
||||
"passed": false,
|
||||
"error_message": "TypeError: 'NoneType' object has no attribute 'videos'",
|
||||
"error_category": "IMPLEMENTATION_ERROR",
|
||||
"stack_trace": "Traceback (most recent call last):\n File \"filter_node.py\", line 42\n for video in result.videos:\nTypeError: 'NoneType' object has no attribute 'videos'",
|
||||
"logs": [
|
||||
{"timestamp": "2026-01-20T10:00:01", "node": "search_node", "level": "INFO", "msg": "Searching for: xyznonexistent123"},
|
||||
{"timestamp": "2026-01-20T10:00:02", "node": "search_node", "level": "WARNING", "msg": "No results found"},
|
||||
{"timestamp": "2026-01-20T10:00:02", "node": "filter_node", "level": "ERROR", "msg": "NoneType error"}
|
||||
],
|
||||
"runtime_data": {
|
||||
"execution_path": ["start", "search_node", "filter_node"],
|
||||
"node_outputs": {
|
||||
"search_node": null
|
||||
}
|
||||
},
|
||||
"suggested_fix": "Add null check in filter_node before accessing .videos attribute",
|
||||
"iteration_guidance": {
|
||||
"stage": "Agent",
|
||||
"action": "Fix the code in nodes/edges",
|
||||
"restart_required": false,
|
||||
"description": "The goal is correct, but filter_node doesn't handle null results from search_node."
|
||||
}
|
||||
"overall_passed": false,
|
||||
"summary": {"total": 5, "passed": 4, "failed": 1, "pass_rate": "80.0%"},
|
||||
"failures": [
|
||||
{"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Step 8: Iterate Based on Category
|
||||
Source diversity now passes. Citation coverage still fails.
|
||||
|
||||
Since this is an **IMPLEMENTATION_ERROR**, we:
|
||||
---
|
||||
|
||||
1. **Don't restart** the Goal → Agent → Eval flow
|
||||
2. **Fix the agent** using hive-create skill:
|
||||
- Modify `filter_node` to handle null results
|
||||
3. **Re-run Eval** (tests only)
|
||||
|
||||
### Fix in hive-create:
|
||||
## Phase 3: Analyze (Iteration 2)
|
||||
|
||||
```python
|
||||
# Update the filter_node to handle null
|
||||
add_node(
|
||||
node_id="filter_node",
|
||||
name="Filter Node",
|
||||
description="Filter and rank videos",
|
||||
node_type="function",
|
||||
input_keys=["search_results"],
|
||||
output_keys=["filtered_videos"],
|
||||
system_prompt="""
|
||||
Filter videos by relevance.
|
||||
IMPORTANT: Handle case where search_results is None or empty.
|
||||
Return empty list if no results.
|
||||
"""
|
||||
debug_test(
|
||||
goal_id="rigorous-interactive-research",
|
||||
test_name="test_success_citation_coverage",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
# Category: ASSERTION_FAILURE — Report lacks citations
|
||||
|
||||
# Check what the report node produced
|
||||
list_agent_sessions(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
status="completed",
|
||||
limit=1
|
||||
)
|
||||
# → session_20260209_151500_def67890
|
||||
|
||||
get_agent_session_memory(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
session_id="session_20260209_151500_def67890",
|
||||
key="report"
|
||||
)
|
||||
# → Report text exists but uses no numbered references.
|
||||
# → Sources are in memory but report node doesn't cite them.
|
||||
```
|
||||
|
||||
**Root cause:** The report node's prompt doesn't instruct the LLM to include numbered citations.
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: Fix (Iteration 2)
|
||||
|
||||
```python
|
||||
Edit(
|
||||
file_path="exports/deep_research_agent/nodes/__init__.py",
|
||||
old_string='system_prompt="Write a comprehensive report based on the research findings."',
|
||||
new_string='system_prompt="Write a comprehensive report based on the research findings. You MUST include numbered citations [1], [2], etc. for every factual claim. At the end, include a References section listing all sources with their URLs. Every claim must be traceable to a specific source."'
|
||||
)
|
||||
```
|
||||
|
||||
### Re-export and re-test:
|
||||
---
|
||||
|
||||
## Phase 5: Resume (Iteration 2)
|
||||
|
||||
The fix is to the `report` node (the last node). To demonstrate checkpoint recovery, run via CLI:
|
||||
|
||||
```bash
|
||||
# Run via CLI to get checkpoints
|
||||
uv run hive run exports/deep_research_agent --input '{"topic": "climate change effects"}'
|
||||
|
||||
# After it runs, find the clean checkpoint before report
|
||||
list_agent_checkpoints(
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
session_id="session_20260209_152000_ghi34567",
|
||||
is_clean="true"
|
||||
)
|
||||
# → cp_node_complete_review_152100 (after review, before report)
|
||||
|
||||
# Resume — skips intake, research, review entirely
|
||||
uv run hive run exports/deep_research_agent \
|
||||
--resume-session session_20260209_152000_ghi34567 \
|
||||
--checkpoint cp_node_complete_review_152100
|
||||
```
|
||||
|
||||
Only the `report` node re-runs with the fixed prompt, using research data from the checkpoint.
|
||||
|
||||
---
|
||||
|
||||
## Phase 6: Final Verification
|
||||
|
||||
```python
|
||||
# Re-export the fixed agent
|
||||
export_graph(path="exports/youtube-research")
|
||||
|
||||
# Re-run tests
|
||||
result = run_tests(
|
||||
goal_id="youtube-research",
|
||||
agent_path="exports/youtube-research",
|
||||
test_types='["all"]'
|
||||
run_tests(
|
||||
goal_id="rigorous-interactive-research",
|
||||
agent_path="exports/deep_research_agent"
|
||||
)
|
||||
```
|
||||
|
||||
**Updated Results:**
|
||||
|
||||
**Result:**
|
||||
```json
|
||||
{
|
||||
"goal_id": "youtube-research",
|
||||
"overall_passed": true,
|
||||
"summary": {
|
||||
"total": 6,
|
||||
"passed": 6,
|
||||
"failed": 0,
|
||||
"pass_rate": "100.0%"
|
||||
}
|
||||
"overall_passed": true,
|
||||
"summary": {"total": 5, "passed": 5, "failed": 0, "pass_rate": "100.0%"}
|
||||
}
|
||||
```
|
||||
|
||||
All tests pass.
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
1. **Got guidelines** for constraint tests during Goal stage
|
||||
2. **Wrote** constraint tests using Write tool
|
||||
3. **Got guidelines** for success criteria tests during Eval stage
|
||||
4. **Wrote** success criteria tests using Write tool
|
||||
5. **Ran** tests in parallel
|
||||
6. **Debugged** the one failure
|
||||
7. **Categorized** as IMPLEMENTATION_ERROR
|
||||
8. **Fixed** the agent (not the goal)
|
||||
9. **Re-ran** Eval only (didn't restart full flow)
|
||||
10. **Passed** all tests
|
||||
| Iteration | Failure | Root Cause | Fix | Recovery |
|
||||
|-----------|---------|------------|-----|----------|
|
||||
| 1 | Source diversity (2 < 5) | Research prompt too vague | Added "at least 5 sources" to prompt | Re-run (no checkpoints) |
|
||||
| 2 | No citations in report | Report prompt lacks citation instructions | Added citation requirements | Checkpoint resume (skipped 3 nodes) |
|
||||
|
||||
The agent is now validated and ready for production use.
|
||||
**Key takeaways:**
|
||||
- Phase 3 analysis (session memory + L3 logs) identified root causes without guessing
|
||||
- Checkpoint recovery in iteration 2 saved time by skipping 3 expensive nodes
|
||||
- Final `run_tests` confirms all scenarios pass end-to-end
|
||||
|
||||
@@ -19,14 +19,18 @@ metadata:
|
||||
|
||||
**THIS IS AN EXECUTABLE WORKFLOW. DO NOT explore the codebase or read source files. ROUTE to the correct skill IMMEDIATELY.**
|
||||
|
||||
When this skill is loaded, determine what the user needs and invoke the appropriate skill NOW:
|
||||
- **User wants to build an agent** → Invoke `/hive-create` immediately
|
||||
- **User wants to test an agent** → Invoke `/hive-test` immediately
|
||||
- **User wants to learn concepts** → Invoke `/hive-concepts` immediately
|
||||
- **User wants patterns/optimization** → Invoke `/hive-patterns` immediately
|
||||
- **User wants to set up credentials** → Invoke `/hive-credentials` immediately
|
||||
- **User has a failing/broken agent** → Invoke `/hive-debugger` immediately
|
||||
- **Unclear what user needs** → Ask the user (do NOT explore the codebase to figure it out)
|
||||
When this skill is loaded, **ALWAYS use the AskUserQuestion tool** to present options:
|
||||
|
||||
```
|
||||
Use AskUserQuestion with these options:
|
||||
- "Build a new agent" → Then invoke /hive-create
|
||||
- "Test an existing agent" → Then invoke /hive-test
|
||||
- "Learn agent concepts" → Then invoke /hive-concepts
|
||||
- "Optimize agent design" → Then invoke /hive-patterns
|
||||
- "Set up credentials" → Then invoke /hive-credentials
|
||||
- "Debug a failing agent" → Then invoke /hive-debugger
|
||||
- "Other" (please describe what you want to achieve)
|
||||
```
|
||||
|
||||
**DO NOT:** Read source files, explore the codebase, search for code, or do any investigation before routing. The sub-skills handle all of that.
|
||||
|
||||
@@ -73,7 +77,6 @@ Use this meta-skill when:
|
||||
|
||||
## Phase 0: Understand Concepts (Optional)
|
||||
|
||||
**Duration**: 5-10 minutes
|
||||
**Skill**: `/hive-concepts`
|
||||
**Input**: Questions about agent architecture
|
||||
|
||||
@@ -95,9 +98,8 @@ Use this meta-skill when:
|
||||
|
||||
## Phase 1: Build Agent Structure
|
||||
|
||||
**Duration**: 15-30 minutes
|
||||
**Skill**: `/hive-create`
|
||||
**Input**: User requirements ("Build an agent that...")
|
||||
**Input**: User requirements ("Build an agent that...") or a template to start from
|
||||
|
||||
### What This Phase Does
|
||||
|
||||
@@ -166,7 +168,6 @@ exports/agent_name/
|
||||
|
||||
## Phase 1.5: Optimize Design (Optional)
|
||||
|
||||
**Duration**: 10-15 minutes
|
||||
**Skill**: `/hive-patterns`
|
||||
**Input**: Completed agent structure
|
||||
|
||||
@@ -191,22 +192,21 @@ exports/agent_name/
|
||||
|
||||
## Phase 2: Test & Validate
|
||||
|
||||
**Duration**: 20-40 minutes
|
||||
**Skill**: `/hive-test`
|
||||
**Input**: Working agent from Phase 1
|
||||
|
||||
### What This Phase Does
|
||||
|
||||
Creates comprehensive test suite:
|
||||
- Constraint tests (verify hard requirements)
|
||||
- Success criteria tests (measure goal achievement)
|
||||
- Edge case tests (handle failures gracefully)
|
||||
- Integration tests (end-to-end workflows)
|
||||
Guides the creation and execution of a comprehensive test suite:
|
||||
- Constraint tests
|
||||
- Success criteria tests
|
||||
- Edge case tests
|
||||
- Integration tests
|
||||
|
||||
### Process
|
||||
|
||||
1. **Analyze agent** - Read goal, constraints, success criteria
|
||||
2. **Generate tests** - Create pytest files in `exports/agent_name/tests/`
|
||||
2. **Generate tests** - The calling agent writes pytest files in `exports/agent_name/tests/` using hive-test guidelines and templates
|
||||
3. **User approval** - Review and approve each test
|
||||
4. **Run evaluation** - Execute tests and collect results
|
||||
5. **Debug failures** - Identify and fix issues
|
||||
@@ -287,6 +287,19 @@ User: "Build an agent (first time)"
|
||||
→ Done: Production-ready agent
|
||||
```
|
||||
|
||||
### Pattern 1c: Build from Template
|
||||
|
||||
```
|
||||
User: "Build an agent based on the deep research template"
|
||||
→ Use /hive-create
|
||||
→ Select "From a template" path
|
||||
→ Pick template, name new agent
|
||||
→ Review/modify goal, nodes, graph
|
||||
→ Agent exported with customizations
|
||||
→ Use /hive-test
|
||||
→ Done: Customized agent
|
||||
```
|
||||
|
||||
### Pattern 2: Test Existing Agent
|
||||
|
||||
```
|
||||
@@ -490,6 +503,7 @@ The workflow is **flexible** - skip phases as needed, iterate freely, and adapt
|
||||
- Have clear requirements
|
||||
- Ready to write code
|
||||
- Want step-by-step guidance
|
||||
- Want to start from an existing template and customize it
|
||||
|
||||
**Choose hive-patterns when:**
|
||||
- Agent structure complete
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
---
|
||||
name: hive
|
||||
description: Hive Agent Builder & Manager
|
||||
mode: primary
|
||||
tools:
|
||||
agent-builder: true
|
||||
tools: true
|
||||
---
|
||||
|
||||
# Hive Agent
|
||||
You are the Hive Agent Builder. Your goal is to help the user construct, configure, and deploy AI agents using the Hive framework.
|
||||
|
||||
## Capabilities
|
||||
1. **Scaffold Agents:** Create new agent directories/configs.
|
||||
2. **Manage Tools:** Add/remove tools via MCP.
|
||||
3. **Debug:** Analyze agent workflows.
|
||||
|
||||
## Context
|
||||
- You are an expert in the Hive framework architecture.
|
||||
- Always use the `agent-builder` MCP server for filesystem operations.
|
||||
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"mcpServers": {
|
||||
"agent-builder": {
|
||||
"command": "uv",
|
||||
"args": [
|
||||
"run",
|
||||
"python",
|
||||
"-m",
|
||||
"framework.mcp.agent_builder_server"
|
||||
],
|
||||
"cwd": "core",
|
||||
"env": {
|
||||
"PYTHONPATH": "../tools/src"
|
||||
}
|
||||
},
|
||||
"tools": {
|
||||
"command": "uv",
|
||||
"args": [
|
||||
"run",
|
||||
"python",
|
||||
"mcp_server.py",
|
||||
"--stdio"
|
||||
],
|
||||
"cwd": "tools",
|
||||
"env": {
|
||||
"PYTHONPATH": "src"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../../.claude/skills/hive
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../../.claude/skills/hive-concepts
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../../.claude/skills/hive-create
|
||||
+1
@@ -0,0 +1 @@
|
||||
../../.claude/skills/hive-credentials
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../../.claude/skills/hive-debugger
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../../.claude/skills/hive-patterns
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../../.claude/skills/hive-test
|
||||
Symlink
+1
@@ -0,0 +1 @@
|
||||
../../.claude/skills/triage-issue
|
||||
@@ -1,41 +0,0 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- Initial project structure
|
||||
- React frontend (honeycomb) with Vite and TypeScript
|
||||
- Node.js backend (hive) with Express and TypeScript
|
||||
- Docker Compose configuration for local development
|
||||
- Configuration system via `config.yaml`
|
||||
- GitHub Actions CI/CD workflows
|
||||
- Comprehensive documentation
|
||||
|
||||
### Changed
|
||||
- N/A
|
||||
|
||||
### Deprecated
|
||||
- N/A
|
||||
|
||||
### Removed
|
||||
- N/A
|
||||
|
||||
|
||||
### Fixed
|
||||
- tools: Fixed web_scrape tool attempting to parse non-HTML content (PDF, JSON) as HTML (#487)
|
||||
|
||||
### Security
|
||||
- N/A
|
||||
|
||||
## [0.1.0] - 2025-01-13
|
||||
|
||||
### Added
|
||||
- Initial release
|
||||
|
||||
[Unreleased]: https://github.com/adenhq/hive/compare/v0.1.0...HEAD
|
||||
[0.1.0]: https://github.com/adenhq/hive/releases/tag/v0.1.0
|
||||
+6
-3
@@ -1,6 +1,6 @@
|
||||
# Contributing to Aden Agent Framework
|
||||
|
||||
Thank you for your interest in contributing to the Aden Agent Framework! This document provides guidelines and information for contributors. We’re especially looking for help building tools, integrations([check #2805](https://github.com/adenhq/hive/issues/2805)), and example agents for the framework. If you’re interested in extending its functionality, this is the perfect place to start.
|
||||
Thank you for your interest in contributing to the Aden Agent Framework! This document provides guidelines and information for contributors. We’re especially looking for help building tools, integrations ([check #2805](https://github.com/adenhq/hive/issues/2805)), and example agents for the framework. If you’re interested in extending its functionality, this is the perfect place to start.
|
||||
|
||||
## Code of Conduct
|
||||
|
||||
@@ -49,8 +49,8 @@ You may submit PRs without prior assignment for:
|
||||
make check # Lint and format checks (ruff check + ruff format --check on core/ and tools/)
|
||||
make test # Core tests (cd core && pytest tests/ -v)
|
||||
```
|
||||
6. Commit your changes following our commit conventions
|
||||
7. Push to your fork and submit a Pull Request
|
||||
8. Commit your changes following our commit conventions
|
||||
9. Push to your fork and submit a Pull Request
|
||||
|
||||
## Development Setup
|
||||
|
||||
@@ -145,6 +145,9 @@ make test
|
||||
# Or run tests directly
|
||||
cd core && pytest tests/ -v
|
||||
|
||||
# Run tools package tests (when contributing to tools/)
|
||||
cd tools && uv run pytest tests/ -v
|
||||
|
||||
# Run tests for a specific agent
|
||||
PYTHONPATH=exports uv run python -m agent_name test
|
||||
```
|
||||
|
||||
@@ -73,8 +73,9 @@ Use Hive when you need:
|
||||
- **[Documentation](https://docs.adenhq.com/)** - Complete guides and API reference
|
||||
- **[Self-Hosting Guide](https://docs.adenhq.com/getting-started/quickstart)** - Deploy Hive on your infrastructure
|
||||
- **[Changelog](https://github.com/adenhq/hive/releases)** - Latest updates and releases
|
||||
- **[Roadmap](https://adenhq.com/roadmap)** - Upcoming features and plans
|
||||
- **[Roadmap](docs/roadmap.md)** - Upcoming features and plans
|
||||
- **[Report Issues](https://github.com/adenhq/hive/issues)** - Bug reports and feature requests
|
||||
- **[Contributing](CONTRIBUTING.md)** - How to contribute and submit PRs
|
||||
|
||||
## Quick Start
|
||||
|
||||
@@ -119,6 +120,16 @@ hive tui
|
||||
# Or run directly
|
||||
hive run exports/your_agent_name --input '{"key": "value"}'
|
||||
```
|
||||
## Coding Agent Support
|
||||
### Opencode
|
||||
Hive includes native support for [Opencode](https://github.com/opencode-ai/opencode).
|
||||
|
||||
1. **Setup:** Run the quickstart script
|
||||
2. **Launch:** Open Opencode in the project root.
|
||||
3. **Activate:** Type `/hive` in the chat to switch to the Hive Agent.
|
||||
4. **Verify:** Ask the agent *"List your tools"* to confirm the connection.
|
||||
|
||||
The agent has access to all Hive skills and can scaffold agents, add tools, and debug workflows directly from the chat.
|
||||
|
||||
**[📖 Complete Setup Guide](docs/environment-setup.md)** - Detailed instructions for agent development
|
||||
|
||||
@@ -136,7 +147,7 @@ hive run exports/your_agent_name --input '{"key": "value"}'
|
||||
|
||||
## Integration
|
||||
|
||||
<img width="100%" alt="Integration" src="https://github.com/user-attachments/assets/a1573f93-cf02-4bb8-b3d5-b305b05b1e51" />
|
||||
<a href="https://github.com/adenhq/hive/tree/main/tools/src/aden_tools/tools"><img width="100%" alt="Integration" src="https://github.com/user-attachments/assets/a1573f93-cf02-4bb8-b3d5-b305b05b1e51" /></a>
|
||||
|
||||
Hive is built to be model-agnostic and system-agnostic.
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
exports/
|
||||
docs/
|
||||
.agent-builder-sessions/
|
||||
.pytest_cache/
|
||||
**/__pycache__/
|
||||
@@ -0,0 +1,74 @@
|
||||
"""Shared Hive configuration utilities.
|
||||
|
||||
Centralises reading of ~/.hive/configuration.json so that the runner
|
||||
and every agent template share one implementation instead of copy-pasting
|
||||
helper functions.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from framework.graph.edge import DEFAULT_MAX_TOKENS
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Low-level config file access
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
HIVE_CONFIG_FILE = Path.home() / ".hive" / "configuration.json"
|
||||
|
||||
|
||||
def get_hive_config() -> dict[str, Any]:
|
||||
"""Load hive configuration from ~/.hive/configuration.json."""
|
||||
if not HIVE_CONFIG_FILE.exists():
|
||||
return {}
|
||||
try:
|
||||
with open(HIVE_CONFIG_FILE, encoding="utf-8-sig") as f:
|
||||
return json.load(f)
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Derived helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_preferred_model() -> str:
|
||||
"""Return the user's preferred LLM model string (e.g. 'anthropic/claude-sonnet-4-20250514')."""
|
||||
llm = get_hive_config().get("llm", {})
|
||||
if llm.get("provider") and llm.get("model"):
|
||||
return f"{llm['provider']}/{llm['model']}"
|
||||
return "anthropic/claude-sonnet-4-20250514"
|
||||
|
||||
|
||||
def get_max_tokens() -> int:
|
||||
"""Return the configured max_tokens, falling back to DEFAULT_MAX_TOKENS."""
|
||||
return get_hive_config().get("llm", {}).get("max_tokens", DEFAULT_MAX_TOKENS)
|
||||
|
||||
|
||||
def get_api_key() -> str | None:
|
||||
"""Return the API key from the environment variable specified in configuration."""
|
||||
llm = get_hive_config().get("llm", {})
|
||||
api_key_env_var = llm.get("api_key_env_var")
|
||||
if api_key_env_var:
|
||||
return os.environ.get(api_key_env_var)
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# RuntimeConfig – shared across agent templates
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
|
||||
class RuntimeConfig:
|
||||
"""Agent runtime configuration loaded from ~/.hive/configuration.json."""
|
||||
|
||||
model: str = field(default_factory=get_preferred_model)
|
||||
temperature: float = 0.7
|
||||
max_tokens: int = field(default_factory=get_max_tokens)
|
||||
api_key: str | None = field(default_factory=get_api_key)
|
||||
api_base: str | None = None
|
||||
@@ -143,19 +143,34 @@ class AdenCredentialResponse:
|
||||
def from_dict(
|
||||
cls, data: dict[str, Any], integration_id: str | None = None
|
||||
) -> AdenCredentialResponse:
|
||||
"""Create from API response dictionary."""
|
||||
"""Create from API response dictionary or normalized credential dict."""
|
||||
|
||||
expires_at = None
|
||||
if data.get("expires_at"):
|
||||
expires_at = datetime.fromisoformat(data["expires_at"].replace("Z", "+00:00"))
|
||||
|
||||
resolved_integration_id = (
|
||||
integration_id
|
||||
or data.get("integration_id")
|
||||
or data.get("alias")
|
||||
or data.get("provider", "")
|
||||
)
|
||||
|
||||
resolved_integration_type = data.get("integration_type") or data.get("provider", "")
|
||||
metadata = data.get("metadata")
|
||||
if metadata is None and data.get("email"):
|
||||
metadata = {"email": data.get("email")}
|
||||
if metadata is None:
|
||||
metadata = {}
|
||||
|
||||
return cls(
|
||||
integration_id=integration_id or data.get("alias", data.get("provider", "")),
|
||||
integration_type=data.get("provider", ""),
|
||||
integration_id=resolved_integration_id,
|
||||
integration_type=resolved_integration_type,
|
||||
access_token=data["access_token"],
|
||||
token_type=data.get("token_type", "Bearer"),
|
||||
expires_at=expires_at,
|
||||
scopes=data.get("scopes", []),
|
||||
metadata={"email": data.get("email")} if data.get("email") else {},
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from framework.graph.client_io import (
|
||||
from framework.graph.code_sandbox import CodeSandbox, safe_eval, safe_exec
|
||||
from framework.graph.context_handoff import ContextHandoff, HandoffContext
|
||||
from framework.graph.conversation import ConversationStore, Message, NodeConversation
|
||||
from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
|
||||
from framework.graph.edge import DEFAULT_MAX_TOKENS, EdgeCondition, EdgeSpec, GraphSpec
|
||||
from framework.graph.event_loop_node import (
|
||||
EventLoopNode,
|
||||
JudgeProtocol,
|
||||
@@ -58,6 +58,7 @@ __all__ = [
|
||||
"EdgeSpec",
|
||||
"EdgeCondition",
|
||||
"GraphSpec",
|
||||
"DEFAULT_MAX_TOKENS",
|
||||
# Executor (fixed graph)
|
||||
"GraphExecutor",
|
||||
# Plan (flexible execution)
|
||||
|
||||
@@ -0,0 +1,85 @@
|
||||
"""
|
||||
Checkpoint Configuration - Controls checkpoint behavior during execution.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class CheckpointConfig:
|
||||
"""
|
||||
Configuration for checkpoint behavior during graph execution.
|
||||
|
||||
Controls when checkpoints are created, how they're stored,
|
||||
and when they're pruned.
|
||||
"""
|
||||
|
||||
# Enable/disable checkpointing
|
||||
enabled: bool = True
|
||||
|
||||
# When to checkpoint
|
||||
checkpoint_on_node_start: bool = True
|
||||
checkpoint_on_node_complete: bool = True
|
||||
|
||||
# Pruning (time-based)
|
||||
checkpoint_max_age_days: int = 7 # Prune checkpoints older than 1 week
|
||||
prune_every_n_nodes: int = 10 # Check for pruning every N nodes
|
||||
|
||||
# Performance
|
||||
async_checkpoint: bool = True # Don't block execution on checkpoint writes
|
||||
|
||||
# What to include in checkpoints
|
||||
include_full_memory: bool = True
|
||||
include_metrics: bool = True
|
||||
|
||||
def should_checkpoint_node_start(self) -> bool:
|
||||
"""Check if should checkpoint before node execution."""
|
||||
return self.enabled and self.checkpoint_on_node_start
|
||||
|
||||
def should_checkpoint_node_complete(self) -> bool:
|
||||
"""Check if should checkpoint after node execution."""
|
||||
return self.enabled and self.checkpoint_on_node_complete
|
||||
|
||||
def should_prune_checkpoints(self, nodes_executed: int) -> bool:
|
||||
"""
|
||||
Check if should prune checkpoints based on execution progress.
|
||||
|
||||
Args:
|
||||
nodes_executed: Number of nodes executed so far
|
||||
|
||||
Returns:
|
||||
True if should check for old checkpoints and prune them
|
||||
"""
|
||||
return (
|
||||
self.enabled
|
||||
and self.prune_every_n_nodes > 0
|
||||
and nodes_executed % self.prune_every_n_nodes == 0
|
||||
)
|
||||
|
||||
|
||||
# Default configuration for most agents
|
||||
DEFAULT_CHECKPOINT_CONFIG = CheckpointConfig(
|
||||
enabled=True,
|
||||
checkpoint_on_node_start=True,
|
||||
checkpoint_on_node_complete=True,
|
||||
checkpoint_max_age_days=7,
|
||||
prune_every_n_nodes=10,
|
||||
async_checkpoint=True,
|
||||
)
|
||||
|
||||
|
||||
# Minimal configuration (only checkpoint at node completion)
|
||||
MINIMAL_CHECKPOINT_CONFIG = CheckpointConfig(
|
||||
enabled=True,
|
||||
checkpoint_on_node_start=False,
|
||||
checkpoint_on_node_complete=True,
|
||||
checkpoint_max_age_days=7,
|
||||
prune_every_n_nodes=20,
|
||||
async_checkpoint=True,
|
||||
)
|
||||
|
||||
|
||||
# Disabled configuration (no checkpointing)
|
||||
DISABLED_CHECKPOINT_CONFIG = CheckpointConfig(
|
||||
enabled=False,
|
||||
)
|
||||
@@ -24,10 +24,12 @@ given the current goal, context, and execution state.
|
||||
from enum import StrEnum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
from framework.graph.safe_eval import safe_eval
|
||||
|
||||
DEFAULT_MAX_TOKENS = 8192
|
||||
|
||||
|
||||
class EdgeCondition(StrEnum):
|
||||
"""When an edge should be traversed."""
|
||||
@@ -424,7 +426,7 @@ class GraphSpec(BaseModel):
|
||||
|
||||
# Default LLM settings
|
||||
default_model: str = "claude-haiku-4-5-20251001"
|
||||
max_tokens: int = 1024
|
||||
max_tokens: int = Field(default=None) # resolved by _resolve_max_tokens validator
|
||||
|
||||
# Cleanup LLM for JSON extraction fallback (fast/cheap model preferred)
|
||||
# If not set, uses CEREBRAS_API_KEY -> cerebras/llama-3.3-70b or
|
||||
@@ -447,6 +449,16 @@ class GraphSpec(BaseModel):
|
||||
|
||||
model_config = {"extra": "allow"}
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def _resolve_max_tokens(cls, values: Any) -> Any:
|
||||
"""Resolve max_tokens from the global config store when not explicitly set."""
|
||||
if isinstance(values, dict) and values.get("max_tokens") is None:
|
||||
from framework.config import get_max_tokens
|
||||
|
||||
values["max_tokens"] = get_max_tokens()
|
||||
return values
|
||||
|
||||
def get_node(self, node_id: str) -> Any | None:
|
||||
"""Get a node by ID."""
|
||||
for node in self.nodes:
|
||||
|
||||
@@ -274,6 +274,7 @@ class EventLoopNode(NodeProtocol):
|
||||
|
||||
# 5. Stall detection state
|
||||
recent_responses: list[str] = []
|
||||
user_interaction_count = 0 # tracks how many times this node blocked for user input
|
||||
|
||||
# 6. Main loop
|
||||
for iteration in range(start_iteration, self._config.max_iterations):
|
||||
@@ -485,13 +486,11 @@ class EventLoopNode(NodeProtocol):
|
||||
|
||||
# 6h. Client-facing input blocking
|
||||
#
|
||||
# For client_facing nodes, block for user input only when the
|
||||
# LLM explicitly called ask_user(). Text-only turns without
|
||||
# ask_user flow through without blocking, allowing progress
|
||||
# updates and summaries to stream freely.
|
||||
#
|
||||
# After user input, always fall through to judge evaluation
|
||||
# (6i). The judge handles all acceptance decisions.
|
||||
# Block ONLY when the LLM explicitly calls ask_user().
|
||||
# Text-only turns and set_output-only turns flow through
|
||||
# without blocking, allowing progress updates and summaries
|
||||
# to stream freely. After user input arrives, fall through
|
||||
# to judge evaluation (6i) — the judge handles acceptance.
|
||||
if ctx.node_spec.client_facing and user_input_requested:
|
||||
if self._shutdown:
|
||||
await self._publish_loop_completed(stream_id, node_id, iteration + 1)
|
||||
@@ -578,6 +577,7 @@ class EventLoopNode(NodeProtocol):
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
|
||||
user_interaction_count += 1
|
||||
recent_responses.clear()
|
||||
# Fall through to judge evaluation (6i)
|
||||
|
||||
@@ -824,6 +824,12 @@ class EventLoopNode(NodeProtocol):
|
||||
|
||||
Returns True if input arrived, False if shutdown was signaled.
|
||||
"""
|
||||
# Clear BEFORE emitting so that synchronous handlers (e.g. the
|
||||
# headless stdin handler) can call inject_event() during the emit
|
||||
# and the signal won't be lost. TUI handlers return immediately
|
||||
# without injecting, so the wait still blocks until the user types.
|
||||
self._input_ready.clear()
|
||||
|
||||
if self._event_bus:
|
||||
await self._event_bus.emit_client_input_requested(
|
||||
stream_id=ctx.node_id,
|
||||
@@ -831,7 +837,6 @@ class EventLoopNode(NodeProtocol):
|
||||
prompt="",
|
||||
)
|
||||
|
||||
self._input_ready.clear()
|
||||
await self._input_ready.wait()
|
||||
return not self._shutdown
|
||||
|
||||
@@ -989,7 +994,7 @@ class EventLoopNode(NodeProtocol):
|
||||
is_error=result.is_error,
|
||||
)
|
||||
if not result.is_error:
|
||||
value = tc.tool_input["value"]
|
||||
value = tc.tool_input.get("value", "")
|
||||
# Parse JSON strings into native types so downstream
|
||||
# consumers get lists/dicts instead of serialised JSON,
|
||||
# and the hallucination validator skips non-string values.
|
||||
@@ -1000,8 +1005,9 @@ class EventLoopNode(NodeProtocol):
|
||||
value = parsed
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
await accumulator.set(tc.tool_input["key"], value)
|
||||
outputs_set_this_turn.append(tc.tool_input["key"])
|
||||
key = tc.tool_input.get("key", "")
|
||||
await accumulator.set(key, value)
|
||||
outputs_set_this_turn.append(key)
|
||||
logged_tool_calls.append(
|
||||
{
|
||||
"tool_use_id": tc.tool_use_id,
|
||||
@@ -1283,6 +1289,24 @@ class EventLoopNode(NodeProtocol):
|
||||
accumulator, ctx.node_spec.output_keys, ctx.node_spec.nullable_output_keys
|
||||
)
|
||||
if not missing:
|
||||
# Safety check: when ALL output keys are nullable and NONE
|
||||
# have been set, the node produced nothing useful. Retry
|
||||
# instead of accepting an empty result — this prevents
|
||||
# client-facing nodes from terminating before the user
|
||||
# ever interacts, and non-client-facing nodes from
|
||||
# short-circuiting without doing their work.
|
||||
output_keys = ctx.node_spec.output_keys or []
|
||||
nullable_keys = set(ctx.node_spec.nullable_output_keys or [])
|
||||
all_nullable = output_keys and nullable_keys >= set(output_keys)
|
||||
none_set = not any(accumulator.get(k) is not None for k in output_keys)
|
||||
if all_nullable and none_set:
|
||||
return JudgeVerdict(
|
||||
action="RETRY",
|
||||
feedback=(
|
||||
f"No output keys have been set yet. "
|
||||
f"Use set_output to set at least one of: {output_keys}"
|
||||
),
|
||||
)
|
||||
return JudgeVerdict(action="ACCEPT")
|
||||
else:
|
||||
return JudgeVerdict(
|
||||
@@ -1763,7 +1787,19 @@ class EventLoopNode(NodeProtocol):
|
||||
conversation: NodeConversation,
|
||||
iteration: int,
|
||||
) -> bool:
|
||||
"""Check if pause has been requested. Returns True if paused."""
|
||||
"""
|
||||
Check if pause has been requested. Returns True if paused.
|
||||
|
||||
Note: This check happens BEFORE starting iteration N, after completing N-1.
|
||||
If paused, the node exits having completed {iteration} iterations (0 to iteration-1).
|
||||
"""
|
||||
# Check executor-level pause event (for /pause command, Ctrl+Z)
|
||||
if ctx.pause_event and ctx.pause_event.is_set():
|
||||
completed = iteration # 0-indexed: iteration=3 means 3 iterations completed (0,1,2)
|
||||
logger.info(f"⏸ Pausing after {completed} iteration(s) completed (executor-level)")
|
||||
return True
|
||||
|
||||
# Check context-level pause flags (legacy/alternative methods)
|
||||
pause_requested = ctx.input_data.get("pause_requested", False)
|
||||
if not pause_requested:
|
||||
try:
|
||||
@@ -1771,8 +1807,10 @@ class EventLoopNode(NodeProtocol):
|
||||
except (PermissionError, KeyError):
|
||||
pause_requested = False
|
||||
if pause_requested:
|
||||
logger.info(f"Pause requested at iteration {iteration}")
|
||||
completed = iteration
|
||||
logger.info(f"⏸ Pausing after {completed} iteration(s) completed (context-level)")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
|
||||
@@ -17,6 +17,7 @@ from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from framework.graph.checkpoint_config import CheckpointConfig
|
||||
from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
|
||||
from framework.graph.goal import Goal
|
||||
from framework.graph.node import (
|
||||
@@ -32,7 +33,10 @@ from framework.graph.node import (
|
||||
from framework.graph.output_cleaner import CleansingConfig, OutputCleaner
|
||||
from framework.graph.validator import OutputValidator
|
||||
from framework.llm.provider import LLMProvider, Tool
|
||||
from framework.observability import set_trace_context
|
||||
from framework.runtime.core import Runtime
|
||||
from framework.schemas.checkpoint import Checkpoint
|
||||
from framework.storage.checkpoint_store import CheckpointStore
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -179,6 +183,9 @@ class GraphExecutor:
|
||||
self.enable_parallel_execution = enable_parallel_execution
|
||||
self._parallel_config = parallel_config or ParallelExecutionConfig()
|
||||
|
||||
# Pause/resume control
|
||||
self._pause_requested = asyncio.Event()
|
||||
|
||||
def _validate_tools(self, graph: GraphSpec) -> list[str]:
|
||||
"""
|
||||
Validate that all tools declared by nodes are available.
|
||||
@@ -208,6 +215,7 @@ class GraphExecutor:
|
||||
goal: Goal,
|
||||
input_data: dict[str, Any] | None = None,
|
||||
session_state: dict[str, Any] | None = None,
|
||||
checkpoint_config: "CheckpointConfig | None" = None,
|
||||
) -> ExecutionResult:
|
||||
"""
|
||||
Execute a graph for a goal.
|
||||
@@ -221,6 +229,9 @@ class GraphExecutor:
|
||||
Returns:
|
||||
ExecutionResult with output and metrics
|
||||
"""
|
||||
# Add agent_id to trace context for correlation
|
||||
set_trace_context(agent_id=graph.id)
|
||||
|
||||
# Validate graph
|
||||
errors = graph.validate()
|
||||
if errors:
|
||||
@@ -246,6 +257,12 @@ class GraphExecutor:
|
||||
# Initialize execution state
|
||||
memory = SharedMemory()
|
||||
|
||||
# Initialize checkpoint store if checkpointing is enabled
|
||||
checkpoint_store: CheckpointStore | None = None
|
||||
if checkpoint_config and checkpoint_config.enabled and self._storage_path:
|
||||
checkpoint_store = CheckpointStore(self._storage_path)
|
||||
self.logger.info("✓ Checkpointing enabled")
|
||||
|
||||
# Restore session state if provided
|
||||
if session_state and "memory" in session_state:
|
||||
memory_data = session_state["memory"]
|
||||
@@ -273,8 +290,110 @@ class GraphExecutor:
|
||||
node_visit_counts: dict[str, int] = {} # Track visits for feedback loops
|
||||
_is_retry = False # True when looping back for a retry (not a new visit)
|
||||
|
||||
# Restore node_visit_counts from session state if available
|
||||
if session_state and "node_visit_counts" in session_state:
|
||||
node_visit_counts = dict(session_state["node_visit_counts"])
|
||||
if node_visit_counts:
|
||||
self.logger.info(f"📥 Restored node visit counts: {node_visit_counts}")
|
||||
|
||||
# If resuming at a specific node (paused_at), that node was counted
|
||||
# but never completed, so decrement its count
|
||||
paused_at = session_state.get("paused_at")
|
||||
if (
|
||||
paused_at
|
||||
and paused_at in node_visit_counts
|
||||
and node_visit_counts[paused_at] > 0
|
||||
):
|
||||
old_count = node_visit_counts[paused_at]
|
||||
node_visit_counts[paused_at] -= 1
|
||||
self.logger.info(
|
||||
f"📥 Decremented visit count for paused node '{paused_at}': "
|
||||
f"{old_count} -> {node_visit_counts[paused_at]}"
|
||||
)
|
||||
|
||||
# Determine entry point (may differ if resuming)
|
||||
current_node_id = graph.get_entry_point(session_state)
|
||||
# Check if resuming from checkpoint
|
||||
if session_state and session_state.get("resume_from_checkpoint") and checkpoint_store:
|
||||
checkpoint_id = session_state["resume_from_checkpoint"]
|
||||
try:
|
||||
checkpoint = await checkpoint_store.load_checkpoint(checkpoint_id)
|
||||
|
||||
if checkpoint:
|
||||
self.logger.info(
|
||||
f"🔄 Resuming from checkpoint: {checkpoint_id} "
|
||||
f"(node: {checkpoint.current_node})"
|
||||
)
|
||||
|
||||
# Restore memory from checkpoint
|
||||
for key, value in checkpoint.shared_memory.items():
|
||||
memory.write(key, value, validate=False)
|
||||
|
||||
# Start from checkpoint's next node or current node
|
||||
current_node_id = (
|
||||
checkpoint.next_node or checkpoint.current_node or graph.entry_node
|
||||
)
|
||||
|
||||
# Restore execution path
|
||||
path.extend(checkpoint.execution_path)
|
||||
|
||||
self.logger.info(
|
||||
f"📥 Restored memory with {len(checkpoint.shared_memory)} keys, "
|
||||
f"resuming at node: {current_node_id}"
|
||||
)
|
||||
else:
|
||||
self.logger.warning(
|
||||
f"Checkpoint {checkpoint_id} not found, resuming from normal entry point"
|
||||
)
|
||||
# Check if resuming from paused_at (fallback to session state)
|
||||
paused_at = session_state.get("paused_at") if session_state else None
|
||||
if paused_at and graph.get_node(paused_at) is not None:
|
||||
current_node_id = paused_at
|
||||
self.logger.info(f"🔄 Resuming from paused node: {paused_at}")
|
||||
else:
|
||||
current_node_id = graph.get_entry_point(session_state)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
f"Failed to load checkpoint {checkpoint_id}: {e}, "
|
||||
f"resuming from normal entry point"
|
||||
)
|
||||
# Check if resuming from paused_at (fallback to session state)
|
||||
paused_at = session_state.get("paused_at") if session_state else None
|
||||
if paused_at and graph.get_node(paused_at) is not None:
|
||||
current_node_id = paused_at
|
||||
self.logger.info(f"🔄 Resuming from paused node: {paused_at}")
|
||||
else:
|
||||
current_node_id = graph.get_entry_point(session_state)
|
||||
else:
|
||||
# Check if resuming from paused_at (session state resume)
|
||||
paused_at = session_state.get("paused_at") if session_state else None
|
||||
node_ids = [n.id for n in graph.nodes]
|
||||
self.logger.debug(f"paused_at={paused_at}, available node IDs={node_ids}")
|
||||
|
||||
if paused_at and graph.get_node(paused_at) is not None:
|
||||
# Resume from paused_at node directly (works for any node, not just pause_nodes)
|
||||
current_node_id = paused_at
|
||||
|
||||
# Restore execution path from session state if available
|
||||
if session_state:
|
||||
execution_path = session_state.get("execution_path", [])
|
||||
if execution_path:
|
||||
path.extend(execution_path)
|
||||
self.logger.info(
|
||||
f"🔄 Resuming from paused node: {paused_at} "
|
||||
f"(restored path: {execution_path})"
|
||||
)
|
||||
else:
|
||||
self.logger.info(f"🔄 Resuming from paused node: {paused_at}")
|
||||
else:
|
||||
self.logger.info(f"🔄 Resuming from paused node: {paused_at}")
|
||||
else:
|
||||
# Fall back to normal entry point logic
|
||||
self.logger.warning(
|
||||
f"⚠ paused_at={paused_at} is not a valid node, falling back to entry point"
|
||||
)
|
||||
current_node_id = graph.get_entry_point(session_state)
|
||||
|
||||
steps = 0
|
||||
|
||||
if session_state and current_node_id != graph.entry_node:
|
||||
@@ -289,7 +408,6 @@ class GraphExecutor:
|
||||
|
||||
if self.runtime_logger:
|
||||
# Extract session_id from storage_path if available (for unified sessions)
|
||||
# storage_path format: base_path/sessions/{session_id}/
|
||||
session_id = ""
|
||||
if self._storage_path and self._storage_path.name.startswith("session_"):
|
||||
session_id = self._storage_path.name
|
||||
@@ -313,6 +431,45 @@ class GraphExecutor:
|
||||
while steps < graph.max_steps:
|
||||
steps += 1
|
||||
|
||||
# Check for pause request
|
||||
if self._pause_requested.is_set():
|
||||
self.logger.info("⏸ Pause detected - stopping at node boundary")
|
||||
|
||||
# Create session state for pause
|
||||
saved_memory = memory.read_all()
|
||||
pause_session_state: dict[str, Any] = {
|
||||
"memory": saved_memory, # Include memory for resume
|
||||
"execution_path": list(path),
|
||||
"node_visit_counts": dict(node_visit_counts),
|
||||
}
|
||||
|
||||
# Create a pause checkpoint
|
||||
if checkpoint_store:
|
||||
pause_checkpoint = self._create_checkpoint(
|
||||
checkpoint_type="pause",
|
||||
current_node=current_node_id,
|
||||
execution_path=path,
|
||||
memory=memory,
|
||||
next_node=current_node_id,
|
||||
is_clean=True,
|
||||
)
|
||||
await checkpoint_store.save_checkpoint(pause_checkpoint)
|
||||
pause_session_state["latest_checkpoint_id"] = pause_checkpoint.checkpoint_id
|
||||
pause_session_state["resume_from_checkpoint"] = (
|
||||
pause_checkpoint.checkpoint_id
|
||||
)
|
||||
|
||||
# Return with paused status
|
||||
return ExecutionResult(
|
||||
success=False,
|
||||
output=saved_memory,
|
||||
path=path,
|
||||
paused_at=current_node_id,
|
||||
error="Execution paused by user request",
|
||||
session_state=pause_session_state,
|
||||
node_visit_counts=dict(node_visit_counts),
|
||||
)
|
||||
|
||||
# Get current node
|
||||
node_spec = graph.get_node(current_node_id)
|
||||
if node_spec is None:
|
||||
@@ -348,6 +505,21 @@ class GraphExecutor:
|
||||
|
||||
path.append(current_node_id)
|
||||
|
||||
# Clear stale nullable outputs from previous visits.
|
||||
# When a node is re-visited (e.g. review → process-batch → review),
|
||||
# nullable outputs from the PREVIOUS visit linger in shared memory.
|
||||
# This causes stale edge conditions to fire (e.g. "feedback is not None"
|
||||
# from visit 1 triggers even when visit 2 sets "final_summary" instead).
|
||||
# Clearing them ensures only the CURRENT visit's outputs affect routing.
|
||||
if node_visit_counts.get(current_node_id, 0) > 1:
|
||||
nullable_keys = getattr(node_spec, "nullable_output_keys", None) or []
|
||||
for key in nullable_keys:
|
||||
if memory.read(key) is not None:
|
||||
memory.write(key, None, validate=False)
|
||||
self.logger.info(
|
||||
f" 🧹 Cleared stale nullable output '{key}' from previous visit"
|
||||
)
|
||||
|
||||
# Check if pause (HITL) before execution
|
||||
if current_node_id in graph.pause_nodes:
|
||||
self.logger.info(f"⏸ Paused at HITL node: {node_spec.name}")
|
||||
@@ -391,6 +563,27 @@ class GraphExecutor:
|
||||
description=f"Validation errors for {current_node_id}: {validation_errors}",
|
||||
)
|
||||
|
||||
# CHECKPOINT: node_start
|
||||
if (
|
||||
checkpoint_store
|
||||
and checkpoint_config
|
||||
and checkpoint_config.should_checkpoint_node_start()
|
||||
):
|
||||
checkpoint = self._create_checkpoint(
|
||||
checkpoint_type="node_start",
|
||||
current_node=node_spec.id,
|
||||
execution_path=list(path),
|
||||
memory=memory,
|
||||
is_clean=(sum(node_retry_counts.values()) == 0),
|
||||
)
|
||||
|
||||
if checkpoint_config.async_checkpoint:
|
||||
# Non-blocking checkpoint save
|
||||
asyncio.create_task(checkpoint_store.save_checkpoint(checkpoint))
|
||||
else:
|
||||
# Blocking checkpoint save
|
||||
await checkpoint_store.save_checkpoint(checkpoint)
|
||||
|
||||
# Emit node-started event (skip event_loop nodes — they emit their own)
|
||||
if self._event_bus and node_spec.node_type != "event_loop":
|
||||
await self._event_bus.emit_node_loop_started(
|
||||
@@ -464,6 +657,13 @@ class GraphExecutor:
|
||||
if len(value_str) > 200:
|
||||
value_str = value_str[:200] + "..."
|
||||
self.logger.info(f" {key}: {value_str}")
|
||||
|
||||
# Write node outputs to memory BEFORE edge evaluation
|
||||
# This enables direct key access in conditional expressions (e.g., "score > 80")
|
||||
# Without this, conditional edges can only use output['key'] syntax
|
||||
if result.output:
|
||||
for key, value in result.output.items():
|
||||
memory.write(key, value, validate=False)
|
||||
else:
|
||||
self.logger.error(f" ✗ Failed: {result.error}")
|
||||
|
||||
@@ -557,13 +757,21 @@ class GraphExecutor:
|
||||
execution_quality="failed",
|
||||
)
|
||||
|
||||
# Save memory for potential resume
|
||||
saved_memory = memory.read_all()
|
||||
failure_session_state = {
|
||||
"memory": saved_memory,
|
||||
"execution_path": list(path),
|
||||
"node_visit_counts": dict(node_visit_counts),
|
||||
}
|
||||
|
||||
return ExecutionResult(
|
||||
success=False,
|
||||
error=(
|
||||
f"Node '{node_spec.name}' failed after "
|
||||
f"{max_retries} attempts: {result.error}"
|
||||
),
|
||||
output=memory.read_all(),
|
||||
output=saved_memory,
|
||||
steps_executed=steps,
|
||||
total_tokens=total_tokens,
|
||||
total_latency_ms=total_latency,
|
||||
@@ -574,6 +782,7 @@ class GraphExecutor:
|
||||
had_partial_failures=len(nodes_failed) > 0,
|
||||
execution_quality="failed",
|
||||
node_visit_counts=dict(node_visit_counts),
|
||||
session_state=failure_session_state,
|
||||
)
|
||||
|
||||
# Check if we just executed a pause node - if so, save state and return
|
||||
@@ -696,6 +905,39 @@ class GraphExecutor:
|
||||
break
|
||||
next_spec = graph.get_node(next_node)
|
||||
self.logger.info(f" → Next: {next_spec.name if next_spec else next_node}")
|
||||
|
||||
# CHECKPOINT: node_complete (after determining next node)
|
||||
if (
|
||||
checkpoint_store
|
||||
and checkpoint_config
|
||||
and checkpoint_config.should_checkpoint_node_complete()
|
||||
):
|
||||
checkpoint = self._create_checkpoint(
|
||||
checkpoint_type="node_complete",
|
||||
current_node=node_spec.id,
|
||||
execution_path=list(path),
|
||||
memory=memory,
|
||||
next_node=next_node,
|
||||
is_clean=(sum(node_retry_counts.values()) == 0),
|
||||
)
|
||||
|
||||
if checkpoint_config.async_checkpoint:
|
||||
asyncio.create_task(checkpoint_store.save_checkpoint(checkpoint))
|
||||
else:
|
||||
await checkpoint_store.save_checkpoint(checkpoint)
|
||||
|
||||
# Periodic checkpoint pruning
|
||||
if (
|
||||
checkpoint_store
|
||||
and checkpoint_config
|
||||
and checkpoint_config.should_prune_checkpoints(len(path))
|
||||
):
|
||||
asyncio.create_task(
|
||||
checkpoint_store.prune_checkpoints(
|
||||
max_age_days=checkpoint_config.checkpoint_max_age_days
|
||||
)
|
||||
)
|
||||
|
||||
current_node_id = next_node
|
||||
|
||||
# Update input_data for next node
|
||||
@@ -753,6 +995,50 @@ class GraphExecutor:
|
||||
node_visit_counts=dict(node_visit_counts),
|
||||
)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
# Handle cancellation (e.g., TUI quit) - save as paused instead of failed
|
||||
self.logger.info("⏸ Execution cancelled - saving state for resume")
|
||||
|
||||
# Save memory and state for resume
|
||||
saved_memory = memory.read_all()
|
||||
session_state_out: dict[str, Any] = {
|
||||
"memory": saved_memory,
|
||||
"execution_path": list(path),
|
||||
"node_visit_counts": dict(node_visit_counts),
|
||||
}
|
||||
|
||||
# Calculate quality metrics
|
||||
total_retries_count = sum(node_retry_counts.values())
|
||||
nodes_failed = [nid for nid, count in node_retry_counts.items() if count > 0]
|
||||
exec_quality = "degraded" if total_retries_count > 0 else "clean"
|
||||
|
||||
if self.runtime_logger:
|
||||
await self.runtime_logger.end_run(
|
||||
status="paused",
|
||||
duration_ms=total_latency,
|
||||
node_path=path,
|
||||
execution_quality=exec_quality,
|
||||
)
|
||||
|
||||
# Return with paused status
|
||||
return ExecutionResult(
|
||||
success=False,
|
||||
error="Execution paused by user",
|
||||
output=saved_memory,
|
||||
steps_executed=steps,
|
||||
total_tokens=total_tokens,
|
||||
total_latency_ms=total_latency,
|
||||
path=path,
|
||||
paused_at=current_node_id, # Save where we were
|
||||
session_state=session_state_out,
|
||||
total_retries=total_retries_count,
|
||||
nodes_with_failures=nodes_failed,
|
||||
retry_details=dict(node_retry_counts),
|
||||
had_partial_failures=len(nodes_failed) > 0,
|
||||
execution_quality=exec_quality,
|
||||
node_visit_counts=dict(node_visit_counts),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
@@ -790,9 +1076,40 @@ class GraphExecutor:
|
||||
execution_quality="failed",
|
||||
)
|
||||
|
||||
# Save memory and state for potential resume
|
||||
saved_memory = memory.read_all()
|
||||
session_state_out: dict[str, Any] = {
|
||||
"memory": saved_memory,
|
||||
"execution_path": list(path),
|
||||
"node_visit_counts": dict(node_visit_counts),
|
||||
}
|
||||
|
||||
# Mark latest checkpoint for resume on failure
|
||||
if checkpoint_store:
|
||||
try:
|
||||
checkpoints = await checkpoint_store.list_checkpoints()
|
||||
if checkpoints:
|
||||
# Find latest clean checkpoint
|
||||
index = await checkpoint_store.load_index()
|
||||
if index:
|
||||
latest_clean = index.get_latest_clean_checkpoint()
|
||||
if latest_clean:
|
||||
session_state_out["resume_from_checkpoint"] = (
|
||||
latest_clean.checkpoint_id
|
||||
)
|
||||
session_state_out["latest_checkpoint_id"] = (
|
||||
latest_clean.checkpoint_id
|
||||
)
|
||||
self.logger.info(
|
||||
f"💾 Marked checkpoint for resume: {latest_clean.checkpoint_id}"
|
||||
)
|
||||
except Exception as checkpoint_err:
|
||||
self.logger.warning(f"Failed to mark checkpoint for resume: {checkpoint_err}")
|
||||
|
||||
return ExecutionResult(
|
||||
success=False,
|
||||
error=str(e),
|
||||
output=saved_memory,
|
||||
steps_executed=steps,
|
||||
path=path,
|
||||
total_retries=total_retries_count,
|
||||
@@ -801,6 +1118,7 @@ class GraphExecutor:
|
||||
had_partial_failures=len(nodes_failed) > 0,
|
||||
execution_quality="failed",
|
||||
node_visit_counts=dict(node_visit_counts),
|
||||
session_state=session_state_out,
|
||||
)
|
||||
|
||||
finally:
|
||||
@@ -841,6 +1159,7 @@ class GraphExecutor:
|
||||
goal=goal, # Pass Goal object for LLM-powered routers
|
||||
max_tokens=max_tokens,
|
||||
runtime_logger=self.runtime_logger,
|
||||
pause_event=self._pause_requested, # Pass pause event for granular control
|
||||
)
|
||||
|
||||
# Valid node types - no ambiguous "llm" type allowed
|
||||
@@ -1353,3 +1672,50 @@ class GraphExecutor:
|
||||
def register_function(self, node_id: str, func: Callable) -> None:
|
||||
"""Register a function as a node."""
|
||||
self.node_registry[node_id] = FunctionNode(func)
|
||||
|
||||
def request_pause(self) -> None:
|
||||
"""
|
||||
Request graceful pause of the current execution.
|
||||
|
||||
The execution will pause at the next node boundary after the current
|
||||
node completes. A checkpoint will be saved at the pause point, allowing
|
||||
the execution to be resumed later.
|
||||
|
||||
This method is safe to call from any thread.
|
||||
"""
|
||||
self._pause_requested.set()
|
||||
self.logger.info("⏸ Pause requested - will pause at next node boundary")
|
||||
|
||||
def _create_checkpoint(
|
||||
self,
|
||||
checkpoint_type: str,
|
||||
current_node: str,
|
||||
execution_path: list[str],
|
||||
memory: SharedMemory,
|
||||
next_node: str | None = None,
|
||||
is_clean: bool = True,
|
||||
) -> Checkpoint:
|
||||
"""
|
||||
Create a checkpoint from current execution state.
|
||||
|
||||
Args:
|
||||
checkpoint_type: Type of checkpoint (node_start, node_complete)
|
||||
current_node: Current node ID
|
||||
execution_path: Nodes executed so far
|
||||
memory: SharedMemory instance
|
||||
next_node: Next node to execute (for node_complete checkpoints)
|
||||
is_clean: Whether execution was clean up to this point
|
||||
|
||||
Returns:
|
||||
New Checkpoint instance
|
||||
"""
|
||||
|
||||
return Checkpoint.create(
|
||||
checkpoint_type=checkpoint_type,
|
||||
session_id=self._storage_path.name if self._storage_path else "unknown",
|
||||
current_node=current_node,
|
||||
execution_path=execution_path,
|
||||
shared_memory=memory.read_all(),
|
||||
next_node=next_node,
|
||||
is_clean=is_clean,
|
||||
)
|
||||
|
||||
@@ -480,6 +480,9 @@ class NodeContext:
|
||||
# Runtime logging (optional)
|
||||
runtime_logger: Any = None # RuntimeLogger | None — uses Any to avoid import
|
||||
|
||||
# Pause control (optional) - asyncio.Event for pause requests
|
||||
pause_event: Any = None # asyncio.Event | None
|
||||
|
||||
|
||||
@dataclass
|
||||
class NodeResult:
|
||||
@@ -1131,7 +1134,7 @@ Keep the same JSON structure but with shorter content values.
|
||||
decision_id=decision_id,
|
||||
success=True,
|
||||
result=response.content,
|
||||
tokens_used=response.input_tokens + response.output_tokens,
|
||||
tokens_used=total_input_tokens + total_output_tokens,
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
|
||||
@@ -1230,7 +1233,7 @@ Keep the same JSON structure but with shorter content values.
|
||||
success=False,
|
||||
error=_extraction_error,
|
||||
output={},
|
||||
tokens_used=response.input_tokens + response.output_tokens,
|
||||
tokens_used=total_input_tokens + total_output_tokens,
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
# JSON extraction failed completely - still strip code blocks
|
||||
@@ -1272,7 +1275,7 @@ Keep the same JSON structure but with shorter content values.
|
||||
return NodeResult(
|
||||
success=True,
|
||||
output=output,
|
||||
tokens_used=response.input_tokens + response.output_tokens,
|
||||
tokens_used=total_input_tokens + total_output_tokens,
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
|
||||
|
||||
@@ -8,21 +8,25 @@ Usage:
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Annotated
|
||||
|
||||
# Project root resolution. This file lives at core/framework/mcp/agent_builder_server.py,
|
||||
# so the project root (where exports/ lives) is four parents up.
|
||||
_PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent.parent
|
||||
|
||||
# Ensure exports/ is on sys.path so AgentRunner can import agent modules.
|
||||
_framework_dir = Path(__file__).resolve().parent.parent # core/framework/ -> core/
|
||||
_project_root = _framework_dir.parent # core/ -> project root
|
||||
_exports_dir = _project_root / "exports"
|
||||
_exports_dir = _PROJECT_ROOT / "exports"
|
||||
if _exports_dir.is_dir() and str(_exports_dir) not in sys.path:
|
||||
sys.path.insert(0, str(_exports_dir))
|
||||
del _framework_dir, _project_root, _exports_dir
|
||||
del _exports_dir
|
||||
|
||||
from mcp.server import FastMCP # noqa: E402
|
||||
from pydantic import ValidationError # noqa: E402
|
||||
|
||||
from framework.graph import ( # noqa: E402
|
||||
Constraint,
|
||||
@@ -175,8 +179,8 @@ def _load_active_session() -> BuildSession | None:
|
||||
|
||||
if session_id:
|
||||
return _load_session(session_id)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
logging.warning("Failed to load active session: %s", e)
|
||||
|
||||
return None
|
||||
|
||||
@@ -541,6 +545,9 @@ def _validate_agent_path(agent_path: str) -> tuple[Path | None, str | None]:
|
||||
"""
|
||||
Validate and normalize agent_path.
|
||||
|
||||
Resolves relative paths against _PROJECT_ROOT since the MCP server's
|
||||
cwd (core/) differs from the user's cwd (project root).
|
||||
|
||||
Returns:
|
||||
(Path, None) if valid
|
||||
(None, error_json) if invalid
|
||||
@@ -555,6 +562,12 @@ def _validate_agent_path(agent_path: str) -> tuple[Path | None, str | None]:
|
||||
|
||||
path = Path(agent_path)
|
||||
|
||||
# Resolve relative paths against project root (not MCP server's cwd)
|
||||
if not path.is_absolute() and not path.exists():
|
||||
resolved = _PROJECT_ROOT / path
|
||||
if resolved.exists():
|
||||
path = resolved
|
||||
|
||||
if not path.exists():
|
||||
return None, json.dumps(
|
||||
{
|
||||
@@ -1856,6 +1869,85 @@ def export_graph() -> str:
|
||||
)
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def import_from_export(
|
||||
agent_json_path: Annotated[str, "Path to the agent.json file to import"],
|
||||
) -> str:
|
||||
"""
|
||||
Import an agent definition from an exported agent.json file into the current build session.
|
||||
|
||||
Reads the agent.json, parses goal/nodes/edges, and populates the current session.
|
||||
This is the reverse of export_graph().
|
||||
|
||||
Args:
|
||||
agent_json_path: Path to the agent.json file to import
|
||||
|
||||
Returns:
|
||||
JSON summary of what was imported (goal name, node count, edge count)
|
||||
"""
|
||||
session = get_session()
|
||||
|
||||
path = Path(agent_json_path)
|
||||
if not path.exists():
|
||||
return json.dumps({"success": False, "error": f"File not found: {agent_json_path}"})
|
||||
|
||||
try:
|
||||
data = json.loads(path.read_text())
|
||||
except json.JSONDecodeError as e:
|
||||
return json.dumps({"success": False, "error": f"Invalid JSON: {e}"})
|
||||
|
||||
try:
|
||||
# Parse goal (same pattern as BuildSession.from_dict lines 88-99)
|
||||
goal_data = data.get("goal")
|
||||
if goal_data:
|
||||
session.goal = Goal(
|
||||
id=goal_data["id"],
|
||||
name=goal_data["name"],
|
||||
description=goal_data["description"],
|
||||
success_criteria=[
|
||||
SuccessCriterion(**sc) for sc in goal_data.get("success_criteria", [])
|
||||
],
|
||||
constraints=[Constraint(**c) for c in goal_data.get("constraints", [])],
|
||||
)
|
||||
|
||||
# Parse nodes (same pattern as BuildSession.from_dict line 102)
|
||||
graph_data = data.get("graph", {})
|
||||
nodes_data = graph_data.get("nodes", [])
|
||||
session.nodes = [NodeSpec(**n) for n in nodes_data]
|
||||
|
||||
# Parse edges (same pattern as BuildSession.from_dict lines 105-118)
|
||||
edges_data = graph_data.get("edges", [])
|
||||
session.edges = []
|
||||
for e in edges_data:
|
||||
condition_str = e.get("condition")
|
||||
if isinstance(condition_str, str):
|
||||
condition_map = {
|
||||
"always": EdgeCondition.ALWAYS,
|
||||
"on_success": EdgeCondition.ON_SUCCESS,
|
||||
"on_failure": EdgeCondition.ON_FAILURE,
|
||||
"conditional": EdgeCondition.CONDITIONAL,
|
||||
"llm_decide": EdgeCondition.LLM_DECIDE,
|
||||
}
|
||||
e["condition"] = condition_map.get(condition_str, EdgeCondition.ON_SUCCESS)
|
||||
session.edges.append(EdgeSpec(**e))
|
||||
except (KeyError, TypeError, ValueError, ValidationError) as e:
|
||||
return json.dumps({"success": False, "error": f"Malformed agent.json: {e}"})
|
||||
|
||||
# Persist updated session
|
||||
_save_session(session)
|
||||
|
||||
return json.dumps(
|
||||
{
|
||||
"success": True,
|
||||
"goal": session.goal.name if session.goal else None,
|
||||
"nodes_count": len(session.nodes),
|
||||
"edges_count": len(session.edges),
|
||||
"node_ids": [n.id for n in session.nodes],
|
||||
"edge_ids": [e.id for e in session.edges],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def get_session_status() -> str:
|
||||
"""Get the current status of the build session."""
|
||||
@@ -2939,18 +3031,15 @@ def _format_success_criteria(criteria: list[SuccessCriterion]) -> str:
|
||||
|
||||
# Test template for Claude to use when writing tests
|
||||
CONSTRAINT_TEST_TEMPLATE = '''@pytest.mark.asyncio
|
||||
async def test_constraint_{constraint_id}_{scenario}(mock_mode):
|
||||
async def test_constraint_{constraint_id}_{scenario}(runner, auto_responder, mock_mode):
|
||||
"""Test: {description}"""
|
||||
result = await default_agent.run({{"key": "value"}}, mock_mode=mock_mode)
|
||||
|
||||
# IMPORTANT: result is an ExecutionResult object with these attributes:
|
||||
# - result.success: bool - whether the agent succeeded
|
||||
# - result.output: dict - the agent's output data (access data here!)
|
||||
# - result.error: str or None - error message if failed
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({{"key": "value"}})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
|
||||
assert result.success, f"Agent failed: {{result.error}}"
|
||||
|
||||
# Access output data via result.output
|
||||
output_data = result.output or {{}}
|
||||
|
||||
# Add constraint-specific assertions here
|
||||
@@ -2958,18 +3047,15 @@ async def test_constraint_{constraint_id}_{scenario}(mock_mode):
|
||||
'''
|
||||
|
||||
SUCCESS_TEST_TEMPLATE = '''@pytest.mark.asyncio
|
||||
async def test_success_{criteria_id}_{scenario}(mock_mode):
|
||||
async def test_success_{criteria_id}_{scenario}(runner, auto_responder, mock_mode):
|
||||
"""Test: {description}"""
|
||||
result = await default_agent.run({{"key": "value"}}, mock_mode=mock_mode)
|
||||
|
||||
# IMPORTANT: result is an ExecutionResult object with these attributes:
|
||||
# - result.success: bool - whether the agent succeeded
|
||||
# - result.output: dict - the agent's output data (access data here!)
|
||||
# - result.error: str or None - error message if failed
|
||||
await auto_responder.start()
|
||||
try:
|
||||
result = await runner.run({{"key": "value"}})
|
||||
finally:
|
||||
await auto_responder.stop()
|
||||
|
||||
assert result.success, f"Agent failed: {{result.error}}"
|
||||
|
||||
# Access output data via result.output
|
||||
output_data = result.output or {{}}
|
||||
|
||||
# Add success criteria-specific assertions here
|
||||
@@ -3025,7 +3111,6 @@ def generate_constraint_tests(
|
||||
test_type="Constraint",
|
||||
agent_name=agent_module,
|
||||
description=f"Tests for constraints defined in goal: {goal.name}",
|
||||
agent_module=agent_module,
|
||||
)
|
||||
|
||||
# Return guidelines + data for Claude to write tests directly
|
||||
@@ -3041,14 +3126,22 @@ def generate_constraint_tests(
|
||||
"max_tests": 5,
|
||||
"naming_convention": "test_constraint_<constraint_id>_<scenario>",
|
||||
"required_decorator": "@pytest.mark.asyncio",
|
||||
"required_fixture": "mock_mode",
|
||||
"agent_call_pattern": "await default_agent.run(input_dict, mock_mode=mock_mode)",
|
||||
"required_fixtures": "runner, auto_responder, mock_mode",
|
||||
"agent_call_pattern": "await runner.run(input_dict)",
|
||||
"auto_responder_pattern": (
|
||||
"await auto_responder.start()\n"
|
||||
"try:\n"
|
||||
" result = await runner.run(input_dict)\n"
|
||||
"finally:\n"
|
||||
" await auto_responder.stop()"
|
||||
),
|
||||
"result_type": "ExecutionResult with .success, .output (dict), .error",
|
||||
"critical_rules": [
|
||||
"Every test function MUST be async with @pytest.mark.asyncio",
|
||||
"Every test MUST accept mock_mode as a parameter",
|
||||
"Use await default_agent.run(input, mock_mode=mock_mode)",
|
||||
"default_agent is already imported - do NOT add imports",
|
||||
"Every test MUST accept runner, auto_responder, and mock_mode fixtures",
|
||||
"Use await runner.run(input) -- NOT default_agent.run()",
|
||||
"Start auto_responder before running, stop in finally block",
|
||||
"runner and auto_responder are from conftest.py -- do NOT import them",
|
||||
"NEVER call result.get() - use result.output.get() instead",
|
||||
"Always check result.success before accessing result.output",
|
||||
],
|
||||
@@ -3112,7 +3205,6 @@ def generate_success_tests(
|
||||
test_type="Success criteria",
|
||||
agent_name=agent_module,
|
||||
description=f"Tests for success criteria defined in goal: {goal.name}",
|
||||
agent_module=agent_module,
|
||||
)
|
||||
|
||||
# Return guidelines + data for Claude to write tests directly
|
||||
@@ -3134,14 +3226,22 @@ def generate_success_tests(
|
||||
"max_tests": 12,
|
||||
"naming_convention": "test_success_<criteria_id>_<scenario>",
|
||||
"required_decorator": "@pytest.mark.asyncio",
|
||||
"required_fixture": "mock_mode",
|
||||
"agent_call_pattern": "await default_agent.run(input_dict, mock_mode=mock_mode)",
|
||||
"required_fixtures": "runner, auto_responder, mock_mode",
|
||||
"agent_call_pattern": "await runner.run(input_dict)",
|
||||
"auto_responder_pattern": (
|
||||
"await auto_responder.start()\n"
|
||||
"try:\n"
|
||||
" result = await runner.run(input_dict)\n"
|
||||
"finally:\n"
|
||||
" await auto_responder.stop()"
|
||||
),
|
||||
"result_type": "ExecutionResult with .success, .output (dict), .error",
|
||||
"critical_rules": [
|
||||
"Every test function MUST be async with @pytest.mark.asyncio",
|
||||
"Every test MUST accept mock_mode as a parameter",
|
||||
"Use await default_agent.run(input, mock_mode=mock_mode)",
|
||||
"default_agent is already imported - do NOT add imports",
|
||||
"Every test MUST accept runner, auto_responder, and mock_mode fixtures",
|
||||
"Use await runner.run(input) -- NOT default_agent.run()",
|
||||
"Start auto_responder before running, stop in finally block",
|
||||
"runner and auto_responder are from conftest.py -- do NOT import them",
|
||||
"NEVER call result.get() - use result.output.get() instead",
|
||||
"Always check result.success before accessing result.output",
|
||||
],
|
||||
@@ -3238,11 +3338,13 @@ def run_tests(
|
||||
# Add short traceback and quiet summary
|
||||
cmd.append("--tb=short")
|
||||
|
||||
# Set PYTHONPATH to project root so agents can import from core.framework
|
||||
# Set PYTHONPATH so framework and agent packages are importable
|
||||
env = os.environ.copy()
|
||||
pythonpath = env.get("PYTHONPATH", "")
|
||||
project_root = Path(__file__).parent.parent.parent.parent.resolve()
|
||||
env["PYTHONPATH"] = f"{project_root}:{pythonpath}"
|
||||
core_path = project_root / "core"
|
||||
exports_path = project_root / "exports"
|
||||
env["PYTHONPATH"] = f"{core_path}:{exports_path}:{project_root}:{pythonpath}"
|
||||
|
||||
# Run pytest
|
||||
try:
|
||||
@@ -3712,7 +3814,11 @@ def check_missing_credentials(
|
||||
|
||||
from framework.runner import AgentRunner
|
||||
|
||||
runner = AgentRunner.load(agent_path)
|
||||
path, err = _validate_agent_path(agent_path)
|
||||
if err:
|
||||
return err
|
||||
|
||||
runner = AgentRunner.load(str(path))
|
||||
runner.validate()
|
||||
|
||||
store = _get_credential_store()
|
||||
@@ -3912,7 +4018,11 @@ def verify_credentials(
|
||||
try:
|
||||
from framework.runner import AgentRunner
|
||||
|
||||
runner = AgentRunner.load(agent_path)
|
||||
path, err = _validate_agent_path(agent_path)
|
||||
if err:
|
||||
return err
|
||||
|
||||
runner = AgentRunner.load(str(path))
|
||||
validation = runner.validate()
|
||||
|
||||
return json.dumps(
|
||||
@@ -3929,6 +4039,382 @@ def verify_credentials(
|
||||
return json.dumps({"error": str(e)})
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# SESSION & CHECKPOINT TOOLS (read-only, no build session required)
|
||||
# =============================================================================
|
||||
|
||||
_MAX_DIFF_VALUE_LEN = 500
|
||||
|
||||
|
||||
def _read_session_json(path: Path) -> dict | None:
|
||||
"""Read a JSON file, returning None on failure."""
|
||||
if not path.exists():
|
||||
return None
|
||||
try:
|
||||
return json.loads(path.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return None
|
||||
|
||||
|
||||
def _scan_agent_sessions(agent_work_dir: Path) -> list[tuple[str, Path]]:
|
||||
"""Find session directories with state.json, sorted most-recent-first."""
|
||||
sessions: list[tuple[str, Path]] = []
|
||||
sessions_dir = agent_work_dir / "sessions"
|
||||
if not sessions_dir.exists():
|
||||
return sessions
|
||||
for session_dir in sessions_dir.iterdir():
|
||||
if session_dir.is_dir() and session_dir.name.startswith("session_"):
|
||||
state_path = session_dir / "state.json"
|
||||
if state_path.exists():
|
||||
sessions.append((session_dir.name, state_path))
|
||||
sessions.sort(key=lambda t: t[0], reverse=True)
|
||||
return sessions
|
||||
|
||||
|
||||
def _truncate_value(value: object, max_len: int = _MAX_DIFF_VALUE_LEN) -> object:
|
||||
"""Truncate a value's JSON representation if too long."""
|
||||
s = json.dumps(value, default=str)
|
||||
if len(s) <= max_len:
|
||||
return value
|
||||
return {"_truncated": True, "_preview": s[:max_len] + "...", "_length": len(s)}
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def list_agent_sessions(
|
||||
agent_work_dir: Annotated[
|
||||
str,
|
||||
"Path to the agent's working directory (e.g., ~/.hive/agents/my_agent)",
|
||||
],
|
||||
status: Annotated[
|
||||
str,
|
||||
"Filter by status: 'active', 'paused', 'completed', 'failed', 'cancelled'. Empty for all.",
|
||||
] = "",
|
||||
limit: Annotated[int, "Maximum number of results (default 20)"] = 20,
|
||||
offset: Annotated[int, "Number of sessions to skip for pagination"] = 0,
|
||||
) -> str:
|
||||
"""
|
||||
List sessions for an agent with optional status filter.
|
||||
|
||||
Use this to discover which sessions exist, find resumable sessions,
|
||||
or identify failed sessions for debugging. Combines well with
|
||||
query_runtime_logs for correlating session state with log data.
|
||||
"""
|
||||
work_dir = Path(agent_work_dir)
|
||||
all_sessions = _scan_agent_sessions(work_dir)
|
||||
|
||||
if not all_sessions:
|
||||
return json.dumps({"sessions": [], "total": 0, "offset": offset, "limit": limit})
|
||||
|
||||
summaries = []
|
||||
for session_id, state_path in all_sessions:
|
||||
data = _read_session_json(state_path)
|
||||
if data is None:
|
||||
continue
|
||||
|
||||
session_status = data.get("status", "")
|
||||
if status and session_status != status:
|
||||
continue
|
||||
|
||||
timestamps = data.get("timestamps", {})
|
||||
progress = data.get("progress", {})
|
||||
checkpoint_dir = state_path.parent / "checkpoints"
|
||||
|
||||
summaries.append(
|
||||
{
|
||||
"session_id": session_id,
|
||||
"status": session_status,
|
||||
"goal_id": data.get("goal_id", ""),
|
||||
"started_at": timestamps.get("started_at", ""),
|
||||
"updated_at": timestamps.get("updated_at", ""),
|
||||
"completed_at": timestamps.get("completed_at"),
|
||||
"is_resumable": data.get("is_resumable", False),
|
||||
"is_resumable_from_checkpoint": data.get("is_resumable_from_checkpoint", False),
|
||||
"current_node": progress.get("current_node"),
|
||||
"paused_at": progress.get("paused_at"),
|
||||
"steps_executed": progress.get("steps_executed", 0),
|
||||
"execution_quality": progress.get("execution_quality", ""),
|
||||
"has_checkpoints": checkpoint_dir.exists()
|
||||
and any(checkpoint_dir.glob("cp_*.json")),
|
||||
}
|
||||
)
|
||||
|
||||
total = len(summaries)
|
||||
page = summaries[offset : offset + limit]
|
||||
return json.dumps(
|
||||
{"sessions": page, "total": total, "offset": offset, "limit": limit}, indent=2
|
||||
)
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def get_agent_session_state(
|
||||
agent_work_dir: Annotated[str, "Path to the agent's working directory"],
|
||||
session_id: Annotated[str, "The session ID (e.g., 'session_20260208_143022_abc12345')"],
|
||||
) -> str:
|
||||
"""
|
||||
Load full session state for a specific session.
|
||||
|
||||
Returns complete session data including status, progress, result,
|
||||
metrics, and checkpoint info. Memory values are excluded to prevent
|
||||
context bloat -- use get_agent_session_memory to retrieve memory contents.
|
||||
"""
|
||||
state_path = Path(agent_work_dir) / "sessions" / session_id / "state.json"
|
||||
data = _read_session_json(state_path)
|
||||
if data is None:
|
||||
return json.dumps({"error": f"Session not found: {session_id}"})
|
||||
|
||||
memory = data.get("memory", {})
|
||||
data["memory_keys"] = list(memory.keys()) if isinstance(memory, dict) else []
|
||||
data["memory_size"] = len(memory) if isinstance(memory, dict) else 0
|
||||
data.pop("memory", None)
|
||||
|
||||
return json.dumps(data, indent=2, default=str)
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def get_agent_session_memory(
|
||||
agent_work_dir: Annotated[str, "Path to the agent's working directory"],
|
||||
session_id: Annotated[str, "The session ID"],
|
||||
key: Annotated[str, "Specific memory key to retrieve. Empty for all."] = "",
|
||||
) -> str:
|
||||
"""
|
||||
Get memory contents from a session.
|
||||
|
||||
Memory stores intermediate results passed between nodes. Use this
|
||||
to inspect what data was produced during execution.
|
||||
|
||||
If key is provided, returns only that memory key's value.
|
||||
If key is empty, returns all memory keys and their values.
|
||||
"""
|
||||
state_path = Path(agent_work_dir) / "sessions" / session_id / "state.json"
|
||||
data = _read_session_json(state_path)
|
||||
if data is None:
|
||||
return json.dumps({"error": f"Session not found: {session_id}"})
|
||||
|
||||
memory = data.get("memory", {})
|
||||
if not isinstance(memory, dict):
|
||||
memory = {}
|
||||
|
||||
if key:
|
||||
if key not in memory:
|
||||
return json.dumps(
|
||||
{
|
||||
"error": f"Memory key not found: '{key}'",
|
||||
"available_keys": list(memory.keys()),
|
||||
}
|
||||
)
|
||||
value = memory[key]
|
||||
return json.dumps(
|
||||
{
|
||||
"session_id": session_id,
|
||||
"key": key,
|
||||
"value": value,
|
||||
"value_type": type(value).__name__,
|
||||
},
|
||||
indent=2,
|
||||
default=str,
|
||||
)
|
||||
|
||||
return json.dumps(
|
||||
{"session_id": session_id, "memory": memory, "total_keys": len(memory)},
|
||||
indent=2,
|
||||
default=str,
|
||||
)
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def list_agent_checkpoints(
|
||||
agent_work_dir: Annotated[str, "Path to the agent's working directory"],
|
||||
session_id: Annotated[str, "The session ID to list checkpoints for"],
|
||||
checkpoint_type: Annotated[
|
||||
str,
|
||||
"Filter by type: 'node_start', 'node_complete', 'loop_iteration'. Empty for all.",
|
||||
] = "",
|
||||
is_clean: Annotated[str, "Filter by clean status: 'true', 'false', or empty for all."] = "",
|
||||
) -> str:
|
||||
"""
|
||||
List checkpoints for a specific session.
|
||||
|
||||
Checkpoints capture execution state at node boundaries for
|
||||
crash recovery and resume. Use with get_agent_checkpoint for
|
||||
detailed checkpoint inspection.
|
||||
"""
|
||||
session_dir = Path(agent_work_dir) / "sessions" / session_id
|
||||
checkpoint_dir = session_dir / "checkpoints"
|
||||
|
||||
if not session_dir.exists():
|
||||
return json.dumps({"error": f"Session not found: {session_id}"})
|
||||
|
||||
if not checkpoint_dir.exists():
|
||||
return json.dumps(
|
||||
{
|
||||
"session_id": session_id,
|
||||
"checkpoints": [],
|
||||
"total": 0,
|
||||
"latest_checkpoint_id": None,
|
||||
}
|
||||
)
|
||||
|
||||
# Try index.json first
|
||||
index_data = _read_session_json(checkpoint_dir / "index.json")
|
||||
if index_data and "checkpoints" in index_data:
|
||||
checkpoints = index_data["checkpoints"]
|
||||
else:
|
||||
# Fallback: scan individual checkpoint files
|
||||
checkpoints = []
|
||||
for cp_file in sorted(checkpoint_dir.glob("cp_*.json")):
|
||||
cp_data = _read_session_json(cp_file)
|
||||
if cp_data:
|
||||
checkpoints.append(
|
||||
{
|
||||
"checkpoint_id": cp_data.get("checkpoint_id", cp_file.stem),
|
||||
"checkpoint_type": cp_data.get("checkpoint_type", ""),
|
||||
"created_at": cp_data.get("created_at", ""),
|
||||
"current_node": cp_data.get("current_node"),
|
||||
"next_node": cp_data.get("next_node"),
|
||||
"is_clean": cp_data.get("is_clean", True),
|
||||
"description": cp_data.get("description", ""),
|
||||
}
|
||||
)
|
||||
|
||||
# Apply filters
|
||||
if checkpoint_type:
|
||||
checkpoints = [c for c in checkpoints if c.get("checkpoint_type") == checkpoint_type]
|
||||
if is_clean:
|
||||
clean_val = is_clean.lower() == "true"
|
||||
checkpoints = [c for c in checkpoints if c.get("is_clean") == clean_val]
|
||||
|
||||
latest_id = None
|
||||
if index_data:
|
||||
latest_id = index_data.get("latest_checkpoint_id")
|
||||
elif checkpoints:
|
||||
latest_id = checkpoints[-1].get("checkpoint_id")
|
||||
|
||||
return json.dumps(
|
||||
{
|
||||
"session_id": session_id,
|
||||
"checkpoints": checkpoints,
|
||||
"total": len(checkpoints),
|
||||
"latest_checkpoint_id": latest_id,
|
||||
},
|
||||
indent=2,
|
||||
)
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def get_agent_checkpoint(
|
||||
agent_work_dir: Annotated[str, "Path to the agent's working directory"],
|
||||
session_id: Annotated[str, "The session ID"],
|
||||
checkpoint_id: Annotated[str, "Specific checkpoint ID, or empty for latest"] = "",
|
||||
) -> str:
|
||||
"""
|
||||
Load a specific checkpoint with full state data.
|
||||
|
||||
Returns the complete checkpoint including shared memory snapshot,
|
||||
execution path, accumulated outputs, and metrics. If checkpoint_id
|
||||
is empty, loads the latest checkpoint.
|
||||
"""
|
||||
session_dir = Path(agent_work_dir) / "sessions" / session_id
|
||||
checkpoint_dir = session_dir / "checkpoints"
|
||||
|
||||
if not checkpoint_dir.exists():
|
||||
return json.dumps({"error": f"No checkpoints found for session: {session_id}"})
|
||||
|
||||
if not checkpoint_id:
|
||||
index_data = _read_session_json(checkpoint_dir / "index.json")
|
||||
if index_data and index_data.get("latest_checkpoint_id"):
|
||||
checkpoint_id = index_data["latest_checkpoint_id"]
|
||||
else:
|
||||
cp_files = sorted(checkpoint_dir.glob("cp_*.json"))
|
||||
if not cp_files:
|
||||
return json.dumps({"error": f"No checkpoints found for session: {session_id}"})
|
||||
checkpoint_id = cp_files[-1].stem
|
||||
|
||||
cp_path = checkpoint_dir / f"{checkpoint_id}.json"
|
||||
data = _read_session_json(cp_path)
|
||||
if data is None:
|
||||
return json.dumps({"error": f"Checkpoint not found: {checkpoint_id}"})
|
||||
|
||||
return json.dumps(data, indent=2, default=str)
|
||||
|
||||
|
||||
@mcp.tool()
def compare_agent_checkpoints(
    agent_work_dir: Annotated[str, "Path to the agent's working directory"],
    session_id: Annotated[str, "The session ID"],
    checkpoint_id_before: Annotated[str, "The earlier checkpoint ID"],
    checkpoint_id_after: Annotated[str, "The later checkpoint ID"],
) -> str:
    """
    Compare memory state between two checkpoints.

    Shows what memory keys were added, removed, or changed between
    two points in execution. Useful for understanding how data flows
    through the agent graph.
    """
    checkpoint_dir = Path(agent_work_dir) / "sessions" / session_id / "checkpoints"

    before = _read_session_json(checkpoint_dir / f"{checkpoint_id_before}.json")
    if before is None:
        return json.dumps({"error": f"Checkpoint not found: {checkpoint_id_before}"})

    after = _read_session_json(checkpoint_dir / f"{checkpoint_id_after}.json")
    if after is None:
        return json.dumps({"error": f"Checkpoint not found: {checkpoint_id_after}"})

    mem_before = before.get("shared_memory", {})
    mem_after = after.get("shared_memory", {})

    # One pass over the "after" snapshot classifies each key as added,
    # changed, or unchanged; anything present only in "before" was removed.
    added = {}
    changed = {}
    unchanged = []
    for key, new_value in mem_after.items():
        if key not in mem_before:
            added[key] = _truncate_value(new_value)
        elif mem_before[key] == new_value:
            unchanged.append(key)
        else:
            changed[key] = {
                "before": _truncate_value(mem_before[key]),
                "after": _truncate_value(new_value),
            }
    removed = [key for key in mem_before if key not in mem_after]

    # Execution-path diff: nodes appended after the "before" snapshot.
    path_before = before.get("execution_path", [])
    path_after = after.get("execution_path", [])

    return json.dumps(
        {
            "session_id": session_id,
            "before": {
                "checkpoint_id": checkpoint_id_before,
                "current_node": before.get("current_node"),
                "created_at": before.get("created_at", ""),
            },
            "after": {
                "checkpoint_id": checkpoint_id_after,
                "current_node": after.get("current_node"),
                "created_at": after.get("created_at", ""),
            },
            "memory_diff": {
                "added": added,
                "removed": removed,
                "changed": changed,
                "unchanged": unchanged,
            },
            "execution_path_diff": {
                "new_nodes": path_after[len(path_before):],
                "path_before": path_before,
                "path_after": path_after,
            },
        },
        indent=2,
        default=str,
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN
|
||||
# =============================================================================
|
||||
|
||||
@@ -0,0 +1,236 @@
|
||||
# Observability - Structured Logging
|
||||
|
||||
## Configuration via Environment Variables
|
||||
|
||||
Control logging format using environment variables:
|
||||
|
||||
```bash
|
||||
# JSON logging (production) - Machine-parseable, one line per log
|
||||
export LOG_FORMAT=json
|
||||
python -m my_agent run
|
||||
|
||||
# Human-readable (development) - Color-coded, easy to read
|
||||
# Default if LOG_FORMAT is not set
|
||||
python -m my_agent run
|
||||
```
|
||||
|
||||
**Alternative:** Set `ENV=production` to automatically use JSON format:
|
||||
|
||||
```bash
|
||||
export ENV=production
|
||||
python -m my_agent run
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
The Hive framework provides automatic structured logging with trace context propagation. Logs include correlation IDs (`trace_id`, `execution_id`) that automatically follow your agent execution flow.
|
||||
|
||||
**Features:**
|
||||
- **Zero developer friction**: Standard `logger.info()` calls automatically get trace context
|
||||
- **ContextVar-based propagation**: Thread-safe and async-safe for concurrent executions
|
||||
- **Dual output modes**: JSON for production, human-readable for development
|
||||
- **Automatic correlation**: `trace_id` and `execution_id` propagate through all logs
|
||||
|
||||
## Quick Start
|
||||
|
||||
Logging is automatically configured when you use `AgentRunner`. No setup required:
|
||||
|
||||
```python
|
||||
from framework.runner import AgentRunner
|
||||
|
||||
runner = AgentRunner(graph=my_graph, goal=my_goal)
|
||||
result = await runner.run({"input": "data"})
|
||||
# Logs automatically include trace_id, execution_id, agent_id, etc.
|
||||
```
|
||||
|
||||
## Programmatic Configuration
|
||||
|
||||
Configure logging explicitly in your code:
|
||||
|
||||
```python
|
||||
from framework.observability import configure_logging
|
||||
|
||||
# Human-readable (development)
|
||||
configure_logging(level="DEBUG", format="human")
|
||||
|
||||
# JSON (production)
|
||||
configure_logging(level="INFO", format="json")
|
||||
|
||||
# Auto-detect from environment
|
||||
configure_logging(level="INFO", format="auto")
|
||||
```
|
||||
|
||||
### Configuration Options
|
||||
|
||||
- **level**: `"DEBUG"`, `"INFO"`, `"WARNING"`, `"ERROR"`, `"CRITICAL"`
|
||||
- **format**:
|
||||
- `"json"` - Machine-parseable JSON (one line per log entry)
|
||||
- `"human"` - Human-readable with colors
|
||||
- `"auto"` - Detects from `LOG_FORMAT` env var or `ENV=production`
|
||||
|
||||
## Log Format Examples
|
||||
|
||||
### JSON Format (Machine-parseable)
|
||||
|
||||
```json
|
||||
{"timestamp": "2026-01-28T15:01:02.671126+00:00", "level": "info", "logger": "framework.runtime", "message": "Starting agent execution", "trace_id": "54e80d7b5bd6409dbc3217e5cd16a4fd", "execution_id": "b4c348ec54e80d7b5bd6409dbc3217e5", "agent_id": "sales-agent", "goal_id": "qualify-leads"}
|
||||
```
|
||||
|
||||
**Features:**
|
||||
- `trace_id` and `execution_id` are 32 hex chars (W3C/OTel-aligned, no prefixes)
|
||||
- Compact single-line format (easy to stream/parse)
|
||||
- All trace context fields included automatically
|
||||
|
||||
### Human-Readable Format (Development)
|
||||
|
||||
```
|
||||
[INFO ] [trace:12345678 | exec:a1b2c3d4 | agent:sales-agent] Starting agent execution
|
||||
[INFO ] [trace:12345678 | exec:a1b2c3d4 | agent:sales-agent] Processing input data [node_id:input-processor]
|
||||
[INFO ] [trace:12345678 | exec:a1b2c3d4 | agent:sales-agent] LLM call completed [latency_ms:1250] [tokens_used:450]
|
||||
```
|
||||
|
||||
**Features:**
|
||||
- Color-coded log levels
|
||||
- Shortened IDs for readability (first 8 chars)
|
||||
- Context prefix shows trace correlation
|
||||
|
||||
## Trace Context Fields
|
||||
|
||||
When the framework sets trace context, these fields are included in all logs. IDs are 32 hex (W3C/OTel-aligned, no prefixes).
|
||||
|
||||
- **trace_id**: Trace identifier
|
||||
- **execution_id**: Run/session correlation
|
||||
- **agent_id**: Agent/graph identifier
|
||||
- **goal_id**: Goal being pursued
|
||||
- **node_id**: Current node (when set)
|
||||
|
||||
## Custom Log Fields
|
||||
|
||||
Add custom fields using the `extra` parameter:
|
||||
|
||||
```python
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger("my_module")
|
||||
|
||||
# Add custom fields
|
||||
logger.info("LLM call completed", extra={
|
||||
"latency_ms": 1250,
|
||||
"tokens_used": 450,
|
||||
"model": "claude-3-5-sonnet-20241022",
|
||||
"node_id": "web-search"
|
||||
})
|
||||
```
|
||||
|
||||
These fields appear in both JSON and human-readable formats.
|
||||
|
||||
## Usage in Your Code
|
||||
|
||||
### Standard Logging (Recommended)
|
||||
|
||||
Just use Python's standard logging - context is automatic:
|
||||
|
||||
```python
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def my_function():
|
||||
# This log automatically includes trace_id, execution_id, etc.
|
||||
logger.info("Processing data")
|
||||
|
||||
try:
|
||||
result = do_work()
|
||||
logger.info("Work completed", extra={"result_count": len(result)})
|
||||
except Exception as e:
|
||||
logger.error("Work failed", exc_info=True)
|
||||
```
|
||||
|
||||
### Framework-Managed Context
|
||||
|
||||
The framework automatically sets trace context at key points:
|
||||
|
||||
- **Runtime.start_run()**: Sets `trace_id`, `execution_id`, `goal_id`
|
||||
- **GraphExecutor.execute()**: Adds `agent_id`
|
||||
- **Node execution**: Adds `node_id`
|
||||
|
||||
Propagation is automatic via ContextVar.
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Manual Context Management
|
||||
|
||||
If you need to set trace context manually (rare):
|
||||
|
||||
```python
|
||||
from framework.observability import set_trace_context, get_trace_context
|
||||
|
||||
# Set context (32-hex, no prefixes)
|
||||
set_trace_context(
|
||||
trace_id="54e80d7b5bd6409dbc3217e5cd16a4fd",
|
||||
    execution_id="b4c348ec54e80d7b5bd6409dbc3217e5",
|
||||
agent_id="my-agent"
|
||||
)
|
||||
|
||||
# Get current context
|
||||
context = get_trace_context()
|
||||
print(context["execution_id"])
|
||||
|
||||
# Clear context (usually not needed)
|
||||
from framework.observability import clear_trace_context
|
||||
clear_trace_context()
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
For tests, you may want to configure logging explicitly:
|
||||
|
||||
```python
|
||||
import pytest
|
||||
from framework.observability import configure_logging
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup_logging():
|
||||
configure_logging(level="DEBUG", format="human")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Production**: Use JSON format (`LOG_FORMAT=json` or `ENV=production`)
|
||||
2. **Development**: Use human-readable format (default)
|
||||
3. **Don't manually set context**: Let the framework manage it
|
||||
4. **Use standard logging**: No special APIs needed - just `logger.info()`
|
||||
5. **Add custom fields**: Use `extra` dict for additional metadata
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Logs missing trace context
|
||||
|
||||
Ensure `configure_logging()` has been called (usually automatic via `AgentRunner._setup()`).
|
||||
|
||||
### JSON logs not appearing
|
||||
|
||||
Check environment variables:
|
||||
```bash
|
||||
echo $LOG_FORMAT
|
||||
echo $ENV
|
||||
```
|
||||
|
||||
Or explicitly set:
|
||||
```python
|
||||
configure_logging(format="json")
|
||||
```
|
||||
|
||||
### Context not propagating
|
||||
|
||||
ContextVar automatically propagates through async calls. If context seems lost, check:
|
||||
- Are you in the same async execution context?
|
||||
- Has `set_trace_context()` been called for this execution?
|
||||
|
||||
## See Also
|
||||
|
||||
- [Logging Implementation](../observability/logging.py) - Source code
|
||||
- [AgentRunner](../runner/runner.py) - Where logging is configured
|
||||
- [Runtime Core](../runtime/core.py) - Where trace context is set
|
||||
@@ -0,0 +1,23 @@
|
||||
"""
|
||||
Observability module for automatic trace correlation and structured logging.
|
||||
|
||||
This module provides zero-friction observability:
|
||||
- Automatic trace context propagation via ContextVar
|
||||
- Structured JSON logging for production
|
||||
- Human-readable logging for development
|
||||
- No manual ID passing required
|
||||
"""
|
||||
|
||||
from framework.observability.logging import (
|
||||
clear_trace_context,
|
||||
configure_logging,
|
||||
get_trace_context,
|
||||
set_trace_context,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"configure_logging",
|
||||
"get_trace_context",
|
||||
"set_trace_context",
|
||||
"clear_trace_context",
|
||||
]
|
||||
@@ -0,0 +1,302 @@
|
||||
"""
|
||||
Structured logging with automatic trace context propagation.
|
||||
|
||||
Key Features:
|
||||
- Zero developer friction: Standard logger.info() calls get automatic context
|
||||
- ContextVar-based propagation: Thread-safe and async-safe
|
||||
- Dual output modes: JSON for production, human-readable for development
|
||||
- Correlation IDs: trace_id follows entire request flow automatically
|
||||
|
||||
Architecture:
|
||||
Runtime.start_run() → Generates trace_id, sets context once
|
||||
↓ (automatic propagation via ContextVar)
|
||||
GraphExecutor.execute() → Adds agent_id to context
|
||||
↓ (automatic propagation)
|
||||
Node.execute() → Adds node_id to context
|
||||
↓ (automatic propagation)
|
||||
User code → logger.info("message") → Gets ALL context automatically!
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from contextvars import ContextVar
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
||||
# Context variable for trace propagation
|
||||
# ContextVar is thread-safe and async-safe - perfect for concurrent agent execution
|
||||
trace_context: ContextVar[dict[str, Any] | None] = ContextVar("trace_context", default=None)
|
||||
|
||||
# ANSI escape code pattern for SGR sequences such as "\x1b[31m" or "\x1b[0m".
# NOTE: "\033" and "\x1b" denote the same ESC character, so the original
# two-branch alternation (\x1b\[...m|\033\[...m) was redundant; one branch
# matches exactly the same set of strings.
ANSI_ESCAPE_PATTERN = re.compile(r"\x1b\[[0-9;]*m")


def strip_ansi_codes(text: str) -> str:
    """Remove ANSI escape codes from text for clean JSON logging."""
    return ANSI_ESCAPE_PATTERN.sub("", text)
|
||||
|
||||
|
||||
class StructuredFormatter(logging.Formatter):
    """
    JSON formatter for structured logging.

    Produces machine-parseable log entries with:
    - Standard fields (timestamp, level, logger, message)
    - Trace context (trace_id, execution_id, agent_id, etc.) - AUTOMATIC
    - Custom fields from extra dict
    """

    # Well-known optional fields supplied via logger.*(..., extra={...});
    # copied verbatim into the JSON entry when present. Replaces four
    # copy-pasted getattr/if-not-None stanzas with one loop.
    _EXTRA_FIELDS = ("latency_ms", "tokens_used", "node_id", "model")

    def format(self, record: logging.LogRecord) -> str:
        """Format log record as a single-line JSON string."""
        # Get trace context for correlation - AUTOMATIC!
        context = trace_context.get() or {}

        # Strip ANSI codes from message for clean JSON output
        message = strip_ansi_codes(record.getMessage())

        # Build base log entry
        log_entry = {
            "timestamp": datetime.now(UTC).isoformat(),
            "level": record.levelname.lower(),
            "logger": record.name,
            "message": message,
        }

        # Add trace context (trace_id, execution_id, agent_id, etc.) - AUTOMATIC!
        log_entry.update(context)

        # "event" is special-cased: string events get ANSI stripping,
        # non-string events pass through untouched.
        event = getattr(record, "event", None)
        if event is not None:
            log_entry["event"] = strip_ansi_codes(event) if isinstance(event, str) else event

        # Copy the remaining well-known extra fields, skipping absent ones.
        for field_name in self._EXTRA_FIELDS:
            value = getattr(record, field_name, None)
            if value is not None:
                log_entry[field_name] = value

        # Add exception info if present (strip ANSI codes from exception text too)
        if record.exc_info:
            log_entry["exception"] = strip_ansi_codes(self.formatException(record.exc_info))

        return json.dumps(log_entry)
|
||||
|
||||
|
||||
class HumanReadableFormatter(logging.Formatter):
    """
    Human-readable formatter for development.

    Provides colorized logs with trace context for local debugging.
    Includes trace_id prefix for correlation - AUTOMATIC!
    """

    # ANSI color code per level; RESET restores the terminal default.
    COLORS = {
        "DEBUG": "\033[36m",  # Cyan
        "INFO": "\033[32m",  # Green
        "WARNING": "\033[33m",  # Yellow
        "ERROR": "\033[31m",  # Red
        "CRITICAL": "\033[35m",  # Magenta
    }
    RESET = "\033[0m"

    def format(self, record: logging.LogRecord) -> str:
        """Format log record as a human-readable string.

        Output shape: ``[LEVEL   ] [trace:... | exec:... | agent:...] message [event]``.
        """
        # Get trace context - AUTOMATIC!
        context = trace_context.get() or {}
        trace_id = context.get("trace_id", "")
        execution_id = context.get("execution_id", "")
        agent_id = context.get("agent_id", "")

        # Build context prefix. IDs are shortened to their FIRST 8 chars for
        # readability, matching the documented human-readable format.
        prefix_parts = []
        if trace_id:
            prefix_parts.append(f"trace:{trace_id[:8]}")
        if execution_id:
            # Fixed: previously sliced the LAST 8 chars ([-8:]), inconsistent
            # with trace_id above and with the documented "first 8 chars".
            prefix_parts.append(f"exec:{execution_id[:8]}")
        if agent_id:
            prefix_parts.append(f"agent:{agent_id}")

        context_prefix = f"[{' | '.join(prefix_parts)}] " if prefix_parts else ""

        # Get color
        color = self.COLORS.get(record.levelname, "")
        reset = self.RESET

        # Pad the level name to 8 chars so messages align across levels.
        level = f"{record.levelname:<8}"

        # Append the optional structured "event" field from extra, if present.
        event = ""
        record_event = getattr(record, "event", None)
        if record_event is not None:
            event = f" [{record_event}]"

        # Format message: [LEVEL] [trace context] message
        return f"{color}[{level}]{reset} {context_prefix}{record.getMessage()}{event}"
|
||||
|
||||
|
||||
def configure_logging(
    level: str = "INFO",
    format: str = "auto",  # "json", "human", or "auto"
) -> None:
    """
    Configure structured logging for the application.

    Call this ONCE at application startup - typically from
    AgentRunner._setup(), a main entry point, or a test fixture.

    Args:
        level: Log level name (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        format: Output format:
            - "json": Machine-parseable JSON (for production)
            - "human": Human-readable with colors (for development)
            - "auto": JSON if LOG_FORMAT=json or ENV=production, else human

    Examples:
        # Development mode (human-readable)
        configure_logging(level="DEBUG", format="human")

        # Production mode (JSON)
        configure_logging(level="INFO", format="json")

        # Auto-detect from environment
        configure_logging(level="INFO", format="auto")
    """
    # Resolve "auto" from the environment: JSON when LOG_FORMAT=json
    # or ENV=production, human-readable otherwise.
    if format == "auto":
        wants_json = (
            os.getenv("LOG_FORMAT", "").lower() == "json"
            or os.getenv("ENV", "development").lower() == "production"
        )
        format = "json" if wants_json else "human"

    use_json = format == "json"
    if use_json:
        # JSON mode: also stop third-party libraries from emitting ANSI colors.
        _disable_third_party_colors()

    # Single stream handler carrying the selected formatter.
    handler = logging.StreamHandler()
    handler.setFormatter(StructuredFormatter() if use_json else HumanReadableFormatter())

    # Install as the sole root handler so every logger funnels through it.
    root_logger = logging.getLogger()
    root_logger.handlers.clear()
    root_logger.addHandler(handler)
    root_logger.setLevel(level.upper())

    if use_json:
        # Route known third-party loggers (LiteLLM, httpcore, httpx, openai)
        # through the root JSON handler: drop their own handlers and let
        # records propagate so they too come out as clean JSON.
        for logger_name in ("LiteLLM", "httpcore", "httpx", "openai"):
            third_party = logging.getLogger(logger_name)
            third_party.handlers.clear()
            third_party.propagate = True
|
||||
|
||||
|
||||
def _disable_third_party_colors() -> None:
|
||||
"""Disable color output in third-party libraries for clean JSON logging."""
|
||||
# Set NO_COLOR environment variable (common convention for disabling colors)
|
||||
os.environ["NO_COLOR"] = "1"
|
||||
os.environ["FORCE_COLOR"] = "0"
|
||||
|
||||
# Disable LiteLLM debug/verbose output colors if available
|
||||
try:
|
||||
import litellm
|
||||
|
||||
# LiteLLM respects NO_COLOR, but we can also suppress debug info
|
||||
if hasattr(litellm, "suppress_debug_info"):
|
||||
litellm.suppress_debug_info = True # type: ignore[attr-defined]
|
||||
except (ImportError, AttributeError):
|
||||
pass
|
||||
|
||||
|
||||
def set_trace_context(**kwargs: Any) -> None:
    """
    Merge trace context fields into the current execution's context.

    The context lives in a ContextVar and AUTOMATICALLY propagates
    through async calls within the same execution context.

    The framework calls this at key points:
    - Runtime.start_run(): sets trace_id, execution_id, goal_id
    - GraphExecutor.execute(): adds agent_id
    - Node execution: adds node_id

    Developers/agents NEVER call this directly - it's framework-managed.

    Args:
        **kwargs: Context fields (trace_id, execution_id, agent_id, etc.);
            merged over any fields already set for this execution.

    Example (framework code):
        # In Runtime.start_run()
        trace_id = uuid.uuid4().hex  # 32 hex, W3C Trace Context compliant
        execution_id = uuid.uuid4().hex  # 32 hex, OTel-aligned for correlation
        set_trace_context(
            trace_id=trace_id,
            execution_id=execution_id,
            goal_id=goal_id
        )
        # All subsequent logs in this execution get these fields automatically!
    """
    merged = dict(trace_context.get() or {})
    merged.update(kwargs)
    trace_context.set(merged)
|
||||
|
||||
|
||||
def get_trace_context() -> dict:
    """
    Return a copy of the current trace context.

    Returns:
        Dict with trace_id, execution_id, agent_id, etc.
        Empty dict if no context set.
    """
    # Copy so callers cannot mutate the stored context in place.
    return dict(trace_context.get() or {})
|
||||
|
||||
|
||||
def clear_trace_context() -> None:
    """
    Clear trace context.

    Useful for:
    - Cleanup between test runs
    - Starting a completely new execution context
    - Manual context management (rare)

    Note: Framework typically doesn't need to call this - ContextVar
    is execution-scoped and cleans itself up automatically.
    """
    # Reset to the ContextVar's default (None) so later reads see "no context".
    trace_context.set(None)
|
||||
@@ -63,6 +63,18 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
|
||||
default=None,
|
||||
help="LLM model to use (any LiteLLM-compatible name)",
|
||||
)
|
||||
run_parser.add_argument(
|
||||
"--resume-session",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Resume from a specific session ID",
|
||||
)
|
||||
run_parser.add_argument(
|
||||
"--checkpoint",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Resume from a specific checkpoint (requires --resume-session)",
|
||||
)
|
||||
run_parser.set_defaults(func=cmd_run)
|
||||
|
||||
# info command
|
||||
@@ -196,11 +208,189 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
|
||||
)
|
||||
tui_parser.set_defaults(func=cmd_tui)
|
||||
|
||||
# sessions command group (checkpoint/resume management)
|
||||
sessions_parser = subparsers.add_parser(
|
||||
"sessions",
|
||||
help="Manage agent sessions",
|
||||
description="List, inspect, and manage agent execution sessions.",
|
||||
)
|
||||
sessions_subparsers = sessions_parser.add_subparsers(
|
||||
dest="sessions_cmd",
|
||||
help="Session management commands",
|
||||
)
|
||||
|
||||
# sessions list
|
||||
sessions_list_parser = sessions_subparsers.add_parser(
|
||||
"list",
|
||||
help="List agent sessions",
|
||||
description="List all sessions for an agent.",
|
||||
)
|
||||
sessions_list_parser.add_argument(
|
||||
"agent_path",
|
||||
type=str,
|
||||
help="Path to agent folder",
|
||||
)
|
||||
sessions_list_parser.add_argument(
|
||||
"--status",
|
||||
choices=["all", "active", "failed", "completed", "paused"],
|
||||
default="all",
|
||||
help="Filter by session status (default: all)",
|
||||
)
|
||||
sessions_list_parser.add_argument(
|
||||
"--has-checkpoints",
|
||||
action="store_true",
|
||||
help="Show only sessions with checkpoints",
|
||||
)
|
||||
sessions_list_parser.set_defaults(func=cmd_sessions_list)
|
||||
|
||||
# sessions show
|
||||
sessions_show_parser = sessions_subparsers.add_parser(
|
||||
"show",
|
||||
help="Show session details",
|
||||
description="Display detailed information about a specific session.",
|
||||
)
|
||||
sessions_show_parser.add_argument(
|
||||
"agent_path",
|
||||
type=str,
|
||||
help="Path to agent folder",
|
||||
)
|
||||
sessions_show_parser.add_argument(
|
||||
"session_id",
|
||||
type=str,
|
||||
help="Session ID to inspect",
|
||||
)
|
||||
sessions_show_parser.add_argument(
|
||||
"--json",
|
||||
action="store_true",
|
||||
help="Output as JSON",
|
||||
)
|
||||
sessions_show_parser.set_defaults(func=cmd_sessions_show)
|
||||
|
||||
# sessions checkpoints
|
||||
sessions_checkpoints_parser = sessions_subparsers.add_parser(
|
||||
"checkpoints",
|
||||
help="List session checkpoints",
|
||||
description="List all checkpoints for a session.",
|
||||
)
|
||||
sessions_checkpoints_parser.add_argument(
|
||||
"agent_path",
|
||||
type=str,
|
||||
help="Path to agent folder",
|
||||
)
|
||||
sessions_checkpoints_parser.add_argument(
|
||||
"session_id",
|
||||
type=str,
|
||||
help="Session ID",
|
||||
)
|
||||
sessions_checkpoints_parser.set_defaults(func=cmd_sessions_checkpoints)
|
||||
|
||||
# pause command
|
||||
pause_parser = subparsers.add_parser(
|
||||
"pause",
|
||||
help="Pause running session",
|
||||
description="Request graceful pause of a running agent session.",
|
||||
)
|
||||
pause_parser.add_argument(
|
||||
"agent_path",
|
||||
type=str,
|
||||
help="Path to agent folder",
|
||||
)
|
||||
pause_parser.add_argument(
|
||||
"session_id",
|
||||
type=str,
|
||||
help="Session ID to pause",
|
||||
)
|
||||
pause_parser.set_defaults(func=cmd_pause)
|
||||
|
||||
# resume command
|
||||
resume_parser = subparsers.add_parser(
|
||||
"resume",
|
||||
help="Resume session from checkpoint",
|
||||
description="Resume a paused or failed session from a checkpoint.",
|
||||
)
|
||||
resume_parser.add_argument(
|
||||
"agent_path",
|
||||
type=str,
|
||||
help="Path to agent folder",
|
||||
)
|
||||
resume_parser.add_argument(
|
||||
"session_id",
|
||||
type=str,
|
||||
help="Session ID to resume",
|
||||
)
|
||||
resume_parser.add_argument(
|
||||
"--checkpoint",
|
||||
"-c",
|
||||
type=str,
|
||||
help="Specific checkpoint ID to resume from (default: latest)",
|
||||
)
|
||||
resume_parser.add_argument(
|
||||
"--tui",
|
||||
action="store_true",
|
||||
help="Resume in TUI dashboard mode",
|
||||
)
|
||||
resume_parser.set_defaults(func=cmd_resume)
|
||||
|
||||
|
||||
def _load_resume_state(
|
||||
agent_path: str, session_id: str, checkpoint_id: str | None = None
|
||||
) -> dict | None:
|
||||
"""Load session or checkpoint state for headless resume.
|
||||
|
||||
Args:
|
||||
agent_path: Path to the agent folder (e.g., exports/my_agent)
|
||||
session_id: Session ID to resume from
|
||||
checkpoint_id: Optional checkpoint ID within the session
|
||||
|
||||
Returns:
|
||||
session_state dict for executor, or None if not found
|
||||
"""
|
||||
agent_name = Path(agent_path).name
|
||||
agent_work_dir = Path.home() / ".hive" / "agents" / agent_name
|
||||
session_dir = agent_work_dir / "sessions" / session_id
|
||||
|
||||
if not session_dir.exists():
|
||||
return None
|
||||
|
||||
if checkpoint_id:
|
||||
# Checkpoint-based resume: load checkpoint and extract state
|
||||
cp_path = session_dir / "checkpoints" / f"{checkpoint_id}.json"
|
||||
if not cp_path.exists():
|
||||
return None
|
||||
try:
|
||||
cp_data = json.loads(cp_path.read_text())
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return None
|
||||
return {
|
||||
"memory": cp_data.get("shared_memory", {}),
|
||||
"paused_at": cp_data.get("next_node") or cp_data.get("current_node"),
|
||||
"execution_path": cp_data.get("execution_path", []),
|
||||
"node_visit_counts": {},
|
||||
}
|
||||
else:
|
||||
# Session state resume: load state.json
|
||||
state_path = session_dir / "state.json"
|
||||
if not state_path.exists():
|
||||
return None
|
||||
try:
|
||||
state_data = json.loads(state_path.read_text())
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return None
|
||||
progress = state_data.get("progress", {})
|
||||
paused_at = progress.get("paused_at") or progress.get("resume_from")
|
||||
return {
|
||||
"memory": state_data.get("memory", {}),
|
||||
"paused_at": paused_at,
|
||||
"execution_path": progress.get("path", []),
|
||||
"node_visit_counts": progress.get("node_visit_counts", {}),
|
||||
}
|
||||
|
||||
|
||||
def cmd_run(args: argparse.Namespace) -> int:
|
||||
"""Run an exported agent."""
|
||||
import logging
|
||||
|
||||
from framework.credentials.models import CredentialError
|
||||
from framework.runner import AgentRunner
|
||||
|
||||
# Set logging level (quiet by default for cleaner output)
|
||||
@@ -239,8 +429,10 @@ def cmd_run(args: argparse.Namespace) -> int:
|
||||
runner = AgentRunner.load(
|
||||
args.agent_path,
|
||||
model=args.model,
|
||||
enable_tui=True,
|
||||
)
|
||||
except CredentialError as e:
|
||||
print(f"\n{e}", file=sys.stderr)
|
||||
return
|
||||
except Exception as e:
|
||||
print(f"Error loading agent: {e}")
|
||||
return
|
||||
@@ -253,7 +445,11 @@ def cmd_run(args: argparse.Namespace) -> int:
|
||||
if runner._agent_runtime and not runner._agent_runtime.is_running:
|
||||
await runner._agent_runtime.start()
|
||||
|
||||
app = AdenTUI(runner._agent_runtime)
|
||||
app = AdenTUI(
|
||||
runner._agent_runtime,
|
||||
resume_session=getattr(args, "resume_session", None),
|
||||
resume_checkpoint=getattr(args, "checkpoint", None),
|
||||
)
|
||||
|
||||
# TUI handles execution via ChatRepl — user submits input,
|
||||
# ChatRepl calls runtime.trigger_and_wait(). No auto-launch.
|
||||
@@ -276,12 +472,35 @@ def cmd_run(args: argparse.Namespace) -> int:
|
||||
runner = AgentRunner.load(
|
||||
args.agent_path,
|
||||
model=args.model,
|
||||
enable_tui=False,
|
||||
)
|
||||
except CredentialError as e:
|
||||
print(f"\n{e}", file=sys.stderr)
|
||||
return 1
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Load session/checkpoint state for resume (headless mode)
|
||||
session_state = None
|
||||
resume_session = getattr(args, "resume_session", None)
|
||||
checkpoint = getattr(args, "checkpoint", None)
|
||||
if resume_session:
|
||||
session_state = _load_resume_state(args.agent_path, resume_session, checkpoint)
|
||||
if session_state is None:
|
||||
print(
|
||||
f"Error: Could not load session state for {resume_session}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
if not args.quiet:
|
||||
resume_node = session_state.get("paused_at", "unknown")
|
||||
if checkpoint:
|
||||
print(f"Resuming from checkpoint: {checkpoint}")
|
||||
else:
|
||||
print(f"Resuming session: {resume_session}")
|
||||
print(f"Resume point: {resume_node}")
|
||||
print()
|
||||
|
||||
# Auto-inject user_id if the agent expects it but it's not provided
|
||||
entry_input_keys = runner.graph.nodes[0].input_keys if runner.graph.nodes else []
|
||||
if "user_id" in entry_input_keys and context.get("user_id") is None:
|
||||
@@ -301,7 +520,7 @@ def cmd_run(args: argparse.Namespace) -> int:
|
||||
print("=" * 60)
|
||||
print()
|
||||
|
||||
result = asyncio.run(runner.run(context))
|
||||
result = asyncio.run(runner.run(context, session_state=session_state))
|
||||
|
||||
# Format output
|
||||
output = {
|
||||
@@ -381,10 +600,14 @@ def cmd_run(args: argparse.Namespace) -> int:
|
||||
|
||||
def cmd_info(args: argparse.Namespace) -> int:
|
||||
"""Show agent information."""
|
||||
from framework.credentials.models import CredentialError
|
||||
from framework.runner import AgentRunner
|
||||
|
||||
try:
|
||||
runner = AgentRunner.load(args.agent_path)
|
||||
except CredentialError as e:
|
||||
print(f"\n{e}", file=sys.stderr)
|
||||
return 1
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
@@ -444,10 +667,14 @@ def cmd_info(args: argparse.Namespace) -> int:
|
||||
|
||||
def cmd_validate(args: argparse.Namespace) -> int:
|
||||
"""Validate an exported agent."""
|
||||
from framework.credentials.models import CredentialError
|
||||
from framework.runner import AgentRunner
|
||||
|
||||
try:
|
||||
runner = AgentRunner.load(args.agent_path)
|
||||
except CredentialError as e:
|
||||
print(f"\n{e}", file=sys.stderr)
|
||||
return 1
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
@@ -764,6 +991,7 @@ def cmd_shell(args: argparse.Namespace) -> int:
|
||||
"""Start an interactive agent session."""
|
||||
import logging
|
||||
|
||||
from framework.credentials.models import CredentialError
|
||||
from framework.runner import AgentRunner
|
||||
|
||||
# Configure logging to show runtime visibility
|
||||
@@ -788,6 +1016,9 @@ def cmd_shell(args: argparse.Namespace) -> int:
|
||||
|
||||
try:
|
||||
runner = AgentRunner.load(agent_path)
|
||||
except CredentialError as e:
|
||||
print(f"\n{e}", file=sys.stderr)
|
||||
return 1
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
@@ -997,6 +1228,7 @@ def cmd_tui(args: argparse.Namespace) -> int:
|
||||
"""Browse agents and launch the interactive TUI dashboard."""
|
||||
import logging
|
||||
|
||||
from framework.credentials.models import CredentialError
|
||||
from framework.runner import AgentRunner
|
||||
from framework.tui.app import AdenTUI
|
||||
|
||||
@@ -1046,8 +1278,10 @@ def cmd_tui(args: argparse.Namespace) -> int:
|
||||
runner = AgentRunner.load(
|
||||
agent_path,
|
||||
model=args.model,
|
||||
enable_tui=True,
|
||||
)
|
||||
except CredentialError as e:
|
||||
print(f"\n{e}", file=sys.stderr)
|
||||
return
|
||||
except Exception as e:
|
||||
print(f"Error loading agent: {e}")
|
||||
return
|
||||
@@ -1432,3 +1666,53 @@ def _interactive_multi(agents_dir: Path) -> int:
|
||||
|
||||
orchestrator.cleanup()
|
||||
return 0
|
||||
|
||||
|
||||
def cmd_sessions_list(args: argparse.Namespace) -> int:
|
||||
"""List agent sessions."""
|
||||
print("⚠ Sessions list command not yet implemented")
|
||||
print("This will be available once checkpoint infrastructure is complete.")
|
||||
print(f"\nAgent: {args.agent_path}")
|
||||
print(f"Status filter: {args.status}")
|
||||
print(f"Has checkpoints: {args.has_checkpoints}")
|
||||
return 1
|
||||
|
||||
|
||||
def cmd_sessions_show(args: argparse.Namespace) -> int:
|
||||
"""Show detailed session information."""
|
||||
print("⚠ Session show command not yet implemented")
|
||||
print("This will be available once checkpoint infrastructure is complete.")
|
||||
print(f"\nAgent: {args.agent_path}")
|
||||
print(f"Session: {args.session_id}")
|
||||
return 1
|
||||
|
||||
|
||||
def cmd_sessions_checkpoints(args: argparse.Namespace) -> int:
|
||||
"""List checkpoints for a session."""
|
||||
print("⚠ Session checkpoints command not yet implemented")
|
||||
print("This will be available once checkpoint infrastructure is complete.")
|
||||
print(f"\nAgent: {args.agent_path}")
|
||||
print(f"Session: {args.session_id}")
|
||||
return 1
|
||||
|
||||
|
||||
def cmd_pause(args: argparse.Namespace) -> int:
|
||||
"""Pause a running session."""
|
||||
print("⚠ Pause command not yet implemented")
|
||||
print("This will be available once executor pause integration is complete.")
|
||||
print(f"\nAgent: {args.agent_path}")
|
||||
print(f"Session: {args.session_id}")
|
||||
return 1
|
||||
|
||||
|
||||
def cmd_resume(args: argparse.Namespace) -> int:
|
||||
"""Resume a session from checkpoint."""
|
||||
print("⚠ Resume command not yet implemented")
|
||||
print("This will be available once checkpoint resume integration is complete.")
|
||||
print(f"\nAgent: {args.agent_path}")
|
||||
print(f"Session: {args.session_id}")
|
||||
if args.checkpoint:
|
||||
print(f"Checkpoint: {args.checkpoint}")
|
||||
if args.tui:
|
||||
print("Mode: TUI")
|
||||
return 1
|
||||
|
||||
+224
-153
@@ -8,19 +8,22 @@ from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from framework.config import get_hive_config, get_preferred_model
|
||||
from framework.graph import Goal
|
||||
from framework.graph.edge import AsyncEntryPointSpec, EdgeCondition, EdgeSpec, GraphSpec
|
||||
from framework.graph.executor import ExecutionResult, GraphExecutor
|
||||
from framework.graph.edge import (
|
||||
DEFAULT_MAX_TOKENS,
|
||||
AsyncEntryPointSpec,
|
||||
EdgeCondition,
|
||||
EdgeSpec,
|
||||
GraphSpec,
|
||||
)
|
||||
from framework.graph.executor import ExecutionResult
|
||||
from framework.graph.node import NodeSpec
|
||||
from framework.llm.provider import LLMProvider, Tool
|
||||
from framework.runner.tool_registry import ToolRegistry
|
||||
|
||||
# Multi-entry-point runtime imports
|
||||
from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
|
||||
from framework.runtime.core import Runtime
|
||||
from framework.runtime.execution_stream import EntryPointSpec
|
||||
from framework.runtime.runtime_log_store import RuntimeLogStore
|
||||
from framework.runtime.runtime_logger import RuntimeLogger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from framework.runner.protocol import AgentMessage, CapabilityResponse
|
||||
@@ -28,9 +31,6 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration paths
|
||||
HIVE_CONFIG_FILE = Path.home() / ".hive" / "configuration.json"
|
||||
|
||||
|
||||
def _ensure_credential_key_env() -> None:
|
||||
"""Load HIVE_CREDENTIAL_KEY from shell config if not already in environment.
|
||||
@@ -60,17 +60,6 @@ def _ensure_credential_key_env() -> None:
|
||||
CLAUDE_CREDENTIALS_FILE = Path.home() / ".claude" / ".credentials.json"
|
||||
|
||||
|
||||
def get_hive_config() -> dict[str, Any]:
|
||||
"""Load hive configuration from ~/.hive/configuration.json."""
|
||||
if not HIVE_CONFIG_FILE.exists():
|
||||
return {}
|
||||
try:
|
||||
with open(HIVE_CONFIG_FILE) as f:
|
||||
return json.load(f)
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return {}
|
||||
|
||||
|
||||
def get_claude_code_token() -> str | None:
|
||||
"""
|
||||
Get the OAuth token from Claude Code subscription.
|
||||
@@ -268,11 +257,7 @@ class AgentRunner:
|
||||
@staticmethod
|
||||
def _resolve_default_model() -> str:
|
||||
"""Resolve the default model from ~/.hive/configuration.json."""
|
||||
config = get_hive_config()
|
||||
llm = config.get("llm", {})
|
||||
if llm.get("provider") and llm.get("model"):
|
||||
return f"{llm['provider']}/{llm['model']}"
|
||||
return "anthropic/claude-sonnet-4-20250514"
|
||||
return get_preferred_model()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -282,7 +267,7 @@ class AgentRunner:
|
||||
mock_mode: bool = False,
|
||||
storage_path: Path | None = None,
|
||||
model: str | None = None,
|
||||
enable_tui: bool = False,
|
||||
intro_message: str = "",
|
||||
):
|
||||
"""
|
||||
Initialize the runner (use AgentRunner.load() instead).
|
||||
@@ -294,14 +279,14 @@ class AgentRunner:
|
||||
mock_mode: If True, use mock LLM responses
|
||||
storage_path: Path for runtime storage (defaults to temp)
|
||||
model: Model to use (reads from agent config or ~/.hive/configuration.json if None)
|
||||
enable_tui: If True, forces use of AgentRuntime with EventBus
|
||||
intro_message: Optional greeting shown to user on TUI load
|
||||
"""
|
||||
self.agent_path = agent_path
|
||||
self.graph = graph
|
||||
self.goal = goal
|
||||
self.mock_mode = mock_mode
|
||||
self.model = model or self._resolve_default_model()
|
||||
self.enable_tui = enable_tui
|
||||
self.intro_message = intro_message
|
||||
|
||||
# Set up storage
|
||||
if storage_path:
|
||||
@@ -321,15 +306,17 @@ class AgentRunner:
|
||||
|
||||
# Initialize components
|
||||
self._tool_registry = ToolRegistry()
|
||||
self._runtime: Runtime | None = None
|
||||
self._llm: LLMProvider | None = None
|
||||
self._executor: GraphExecutor | None = None
|
||||
self._approval_callback: Callable | None = None
|
||||
|
||||
# Multi-entry-point support (AgentRuntime)
|
||||
# AgentRuntime — unified execution path for all agents
|
||||
self._agent_runtime: AgentRuntime | None = None
|
||||
self._uses_async_entry_points = self.graph.has_async_entry_points()
|
||||
|
||||
# Validate credentials before spawning MCP servers.
|
||||
# Fails fast with actionable guidance — no MCP noise on screen.
|
||||
self._validate_credentials()
|
||||
|
||||
# Auto-discover tools from tools.py
|
||||
tools_path = agent_path / "tools.py"
|
||||
if tools_path.exists():
|
||||
@@ -340,6 +327,93 @@ class AgentRunner:
|
||||
if mcp_config_path.exists():
|
||||
self._load_mcp_servers_from_config(mcp_config_path)
|
||||
|
||||
def _validate_credentials(self) -> None:
|
||||
"""Check that required credentials are available before spawning MCP servers.
|
||||
|
||||
Raises CredentialError with actionable guidance if any are missing.
|
||||
Uses graph node specs + CREDENTIAL_SPECS — no tool registry needed.
|
||||
"""
|
||||
required_tools: set[str] = set()
|
||||
for node in self.graph.nodes:
|
||||
if node.tools:
|
||||
required_tools.update(node.tools)
|
||||
node_types: set[str] = {node.node_type for node in self.graph.nodes}
|
||||
|
||||
try:
|
||||
from aden_tools.credentials import CREDENTIAL_SPECS
|
||||
|
||||
from framework.credentials import CredentialStore
|
||||
from framework.credentials.storage import (
|
||||
CompositeStorage,
|
||||
EncryptedFileStorage,
|
||||
EnvVarStorage,
|
||||
)
|
||||
except ImportError:
|
||||
return # aden_tools not installed, skip check
|
||||
|
||||
# Build credential store (same logic as validate())
|
||||
env_mapping = {
|
||||
(spec.credential_id or name): spec.env_var for name, spec in CREDENTIAL_SPECS.items()
|
||||
}
|
||||
storages: list = [EnvVarStorage(env_mapping=env_mapping)]
|
||||
if os.environ.get("HIVE_CREDENTIAL_KEY"):
|
||||
storages.insert(0, EncryptedFileStorage())
|
||||
if len(storages) == 1:
|
||||
storage = storages[0]
|
||||
else:
|
||||
storage = CompositeStorage(primary=storages[0], fallbacks=storages[1:])
|
||||
store = CredentialStore(storage=storage)
|
||||
|
||||
# Build reverse mappings
|
||||
tool_to_cred: dict[str, str] = {}
|
||||
node_type_to_cred: dict[str, str] = {}
|
||||
for cred_name, spec in CREDENTIAL_SPECS.items():
|
||||
for tool_name in spec.tools:
|
||||
tool_to_cred[tool_name] = cred_name
|
||||
for nt in spec.node_types:
|
||||
node_type_to_cred[nt] = cred_name
|
||||
|
||||
missing: list[str] = []
|
||||
checked: set[str] = set()
|
||||
|
||||
# Check tool credentials
|
||||
for tool_name in sorted(required_tools):
|
||||
cred_name = tool_to_cred.get(tool_name)
|
||||
if cred_name is None or cred_name in checked:
|
||||
continue
|
||||
checked.add(cred_name)
|
||||
spec = CREDENTIAL_SPECS[cred_name]
|
||||
cred_id = spec.credential_id or cred_name
|
||||
if spec.required and not store.is_available(cred_id):
|
||||
affected = sorted(t for t in required_tools if t in spec.tools)
|
||||
entry = f" {spec.env_var} for {', '.join(affected)}"
|
||||
if spec.help_url:
|
||||
entry += f"\n Get it at: {spec.help_url}"
|
||||
missing.append(entry)
|
||||
|
||||
# Check node type credentials (e.g., ANTHROPIC_API_KEY for LLM nodes)
|
||||
for nt in sorted(node_types):
|
||||
cred_name = node_type_to_cred.get(nt)
|
||||
if cred_name is None or cred_name in checked:
|
||||
continue
|
||||
checked.add(cred_name)
|
||||
spec = CREDENTIAL_SPECS[cred_name]
|
||||
cred_id = spec.credential_id or cred_name
|
||||
if spec.required and not store.is_available(cred_id):
|
||||
affected_types = sorted(t for t in node_types if t in spec.node_types)
|
||||
entry = f" {spec.env_var} for {', '.join(affected_types)} nodes"
|
||||
if spec.help_url:
|
||||
entry += f"\n Get it at: {spec.help_url}"
|
||||
missing.append(entry)
|
||||
|
||||
if missing:
|
||||
from framework.credentials.models import CredentialError
|
||||
|
||||
lines = ["Missing required credentials:\n"]
|
||||
lines.extend(missing)
|
||||
lines.append("\nTo fix: run /hive-credentials in Claude Code.")
|
||||
raise CredentialError("\n".join(lines))
|
||||
|
||||
@staticmethod
|
||||
def _import_agent_module(agent_path: Path):
|
||||
"""Import an agent package from its directory path.
|
||||
@@ -383,7 +457,6 @@ class AgentRunner:
|
||||
mock_mode: bool = False,
|
||||
storage_path: Path | None = None,
|
||||
model: str | None = None,
|
||||
enable_tui: bool = False,
|
||||
) -> "AgentRunner":
|
||||
"""
|
||||
Load an agent from an export folder.
|
||||
@@ -397,7 +470,6 @@ class AgentRunner:
|
||||
mock_mode: If True, use mock LLM responses
|
||||
storage_path: Path for runtime storage (defaults to ~/.hive/agents/{name})
|
||||
model: LLM model to use (reads from agent's default_config if None)
|
||||
enable_tui: If True, forces use of AgentRuntime with EventBus
|
||||
|
||||
Returns:
|
||||
AgentRunner instance ready to run
|
||||
@@ -425,7 +497,17 @@ class AgentRunner:
|
||||
if agent_config and hasattr(agent_config, "model"):
|
||||
model = agent_config.model
|
||||
|
||||
max_tokens = getattr(agent_config, "max_tokens", 1024) if agent_config else 1024
|
||||
if agent_config and hasattr(agent_config, "max_tokens"):
|
||||
max_tokens = agent_config.max_tokens
|
||||
else:
|
||||
hive_config = get_hive_config()
|
||||
max_tokens = hive_config.get("llm", {}).get("max_tokens", DEFAULT_MAX_TOKENS)
|
||||
|
||||
# Read intro_message from agent metadata (shown on TUI load)
|
||||
agent_metadata = getattr(agent_module, "metadata", None)
|
||||
intro_message = ""
|
||||
if agent_metadata and hasattr(agent_metadata, "intro_message"):
|
||||
intro_message = agent_metadata.intro_message
|
||||
|
||||
# Build GraphSpec from module-level variables
|
||||
graph = GraphSpec(
|
||||
@@ -448,7 +530,7 @@ class AgentRunner:
|
||||
mock_mode=mock_mode,
|
||||
storage_path=storage_path,
|
||||
model=model,
|
||||
enable_tui=enable_tui,
|
||||
intro_message=intro_message,
|
||||
)
|
||||
|
||||
# Fallback: load from agent.json (legacy JSON-based agents)
|
||||
@@ -466,7 +548,6 @@ class AgentRunner:
|
||||
mock_mode=mock_mode,
|
||||
storage_path=storage_path,
|
||||
model=model,
|
||||
enable_tui=enable_tui,
|
||||
)
|
||||
|
||||
def register_tool(
|
||||
@@ -556,12 +637,14 @@ class AgentRunner:
|
||||
callback: Function to call for approval (receives node info, returns bool)
|
||||
"""
|
||||
self._approval_callback = callback
|
||||
# If executor already exists, update it
|
||||
if self._executor is not None:
|
||||
self._executor.approval_callback = callback
|
||||
|
||||
def _setup(self) -> None:
|
||||
"""Set up runtime, LLM, and executor."""
|
||||
# Configure structured logging (auto-detects JSON vs human-readable)
|
||||
from framework.observability import configure_logging
|
||||
|
||||
configure_logging(level="INFO", format="auto")
|
||||
|
||||
# Set up session context for tools (workspace_id, agent_id, session_id)
|
||||
workspace_id = "default" # Could be derived from storage path
|
||||
agent_id = self.graph.id or "unknown"
|
||||
@@ -602,7 +685,8 @@ class AgentRunner:
|
||||
self._llm = LiteLLMProvider(model=self.model, api_key=api_key)
|
||||
else:
|
||||
# Fall back to environment variable
|
||||
api_key_env = self._get_api_key_env_var(self.model)
|
||||
# First check api_key_env_var from config (set by quickstart)
|
||||
api_key_env = llm_config.get("api_key_env_var") or self._get_api_key_env_var(self.model)
|
||||
if api_key_env and os.environ.get(api_key_env):
|
||||
self._llm = LiteLLMProvider(model=self.model)
|
||||
else:
|
||||
@@ -618,16 +702,11 @@ class AgentRunner:
|
||||
print(f"Warning: {api_key_env} not set. LLM calls will fail.")
|
||||
print(f"Set it with: export {api_key_env}=your-api-key")
|
||||
|
||||
# Get tools for executor/runtime
|
||||
# Get tools for runtime
|
||||
tools = list(self._tool_registry.get_tools().values())
|
||||
tool_executor = self._tool_registry.get_executor()
|
||||
|
||||
if self._uses_async_entry_points or self.enable_tui:
|
||||
# Multi-entry-point mode or TUI mode: use AgentRuntime
|
||||
self._setup_agent_runtime(tools, tool_executor)
|
||||
else:
|
||||
# Single-entry-point mode: use legacy GraphExecutor
|
||||
self._setup_legacy_executor(tools, tool_executor)
|
||||
self._setup_agent_runtime(tools, tool_executor)
|
||||
|
||||
def _get_api_key_env_var(self, model: str) -> str | None:
|
||||
"""Get the environment variable name for the API key based on model name."""
|
||||
@@ -642,7 +721,7 @@ class AgentRunner:
|
||||
elif model_lower.startswith("anthropic/") or model_lower.startswith("claude"):
|
||||
return "ANTHROPIC_API_KEY"
|
||||
elif model_lower.startswith("gemini/") or model_lower.startswith("google/"):
|
||||
return "GOOGLE_API_KEY"
|
||||
return "GEMINI_API_KEY"
|
||||
elif model_lower.startswith("mistral/"):
|
||||
return "MISTRAL_API_KEY"
|
||||
elif model_lower.startswith("groq/"):
|
||||
@@ -688,26 +767,6 @@ class AgentRunner:
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _setup_legacy_executor(self, tools: list, tool_executor: Callable | None) -> None:
|
||||
"""Set up legacy single-entry-point execution using GraphExecutor."""
|
||||
# Create runtime
|
||||
self._runtime = Runtime(storage_path=self._storage_path)
|
||||
|
||||
# Create runtime logger
|
||||
log_store = RuntimeLogStore(base_path=self._storage_path / "runtime_logs")
|
||||
runtime_logger = RuntimeLogger(store=log_store, agent_id=self.graph.id)
|
||||
|
||||
# Create executor
|
||||
self._executor = GraphExecutor(
|
||||
runtime=self._runtime,
|
||||
llm=self._llm,
|
||||
tools=tools,
|
||||
tool_executor=tool_executor,
|
||||
approval_callback=self._approval_callback,
|
||||
runtime_logger=runtime_logger,
|
||||
loop_config=self.graph.loop_config,
|
||||
)
|
||||
|
||||
def _setup_agent_runtime(self, tools: list, tool_executor: Callable | None) -> None:
|
||||
"""Set up multi-entry-point execution using AgentRuntime."""
|
||||
# Convert AsyncEntryPointSpec to EntryPointSpec for AgentRuntime
|
||||
@@ -725,9 +784,9 @@ class AgentRunner:
|
||||
)
|
||||
entry_points.append(ep)
|
||||
|
||||
# If TUI enabled but no entry points (single-entry agent), create default
|
||||
if not entry_points and self.enable_tui and self.graph.entry_node:
|
||||
logger.info("Creating default entry point for TUI")
|
||||
# Single-entry agent with no async entry points: create a default entry point
|
||||
if not entry_points and self.graph.entry_node:
|
||||
logger.info("Creating default entry point for single-entry agent")
|
||||
entry_points.append(
|
||||
EntryPointSpec(
|
||||
id="default",
|
||||
@@ -741,6 +800,17 @@ class AgentRunner:
|
||||
# Create AgentRuntime with all entry points
|
||||
log_store = RuntimeLogStore(base_path=self._storage_path / "runtime_logs")
|
||||
|
||||
# Enable checkpointing by default for resumable sessions
|
||||
from framework.graph.checkpoint_config import CheckpointConfig
|
||||
|
||||
checkpoint_config = CheckpointConfig(
|
||||
enabled=True,
|
||||
checkpoint_on_node_start=False, # Only checkpoint after nodes complete
|
||||
checkpoint_on_node_complete=True,
|
||||
checkpoint_max_age_days=7,
|
||||
async_checkpoint=True, # Non-blocking
|
||||
)
|
||||
|
||||
self._agent_runtime = create_agent_runtime(
|
||||
graph=self.graph,
|
||||
goal=self.goal,
|
||||
@@ -750,8 +820,12 @@ class AgentRunner:
|
||||
tools=tools,
|
||||
tool_executor=tool_executor,
|
||||
runtime_log_store=log_store,
|
||||
checkpoint_config=checkpoint_config,
|
||||
)
|
||||
|
||||
# Pass intro_message through for TUI display
|
||||
self._agent_runtime.intro_message = self.intro_message
|
||||
|
||||
async def run(
|
||||
self,
|
||||
input_data: dict | None = None,
|
||||
@@ -791,32 +865,9 @@ class AgentRunner:
|
||||
error=error_msg,
|
||||
)
|
||||
|
||||
if self._uses_async_entry_points or self.enable_tui:
|
||||
# Multi-entry-point mode: use AgentRuntime
|
||||
return await self._run_with_agent_runtime(
|
||||
input_data=input_data or {},
|
||||
entry_point_id=entry_point_id,
|
||||
)
|
||||
else:
|
||||
# Legacy single-entry-point mode
|
||||
return await self._run_with_executor(
|
||||
input_data=input_data or {},
|
||||
session_state=session_state,
|
||||
)
|
||||
|
||||
async def _run_with_executor(
|
||||
self,
|
||||
input_data: dict,
|
||||
session_state: dict | None = None,
|
||||
) -> ExecutionResult:
|
||||
"""Run using legacy GraphExecutor (single entry point)."""
|
||||
if self._executor is None:
|
||||
self._setup()
|
||||
|
||||
return await self._executor.execute(
|
||||
graph=self.graph,
|
||||
goal=self.goal,
|
||||
input_data=input_data,
|
||||
return await self._run_with_agent_runtime(
|
||||
input_data=input_data or {},
|
||||
entry_point_id=entry_point_id,
|
||||
session_state=session_state,
|
||||
)
|
||||
|
||||
@@ -824,8 +875,11 @@ class AgentRunner:
|
||||
self,
|
||||
input_data: dict,
|
||||
entry_point_id: str | None = None,
|
||||
session_state: dict | None = None,
|
||||
) -> ExecutionResult:
|
||||
"""Run using AgentRuntime (multi-entry-point)."""
|
||||
"""Run using AgentRuntime."""
|
||||
import sys
|
||||
|
||||
if self._agent_runtime is None:
|
||||
self._setup()
|
||||
|
||||
@@ -833,6 +887,52 @@ class AgentRunner:
|
||||
if not self._agent_runtime.is_running:
|
||||
await self._agent_runtime.start()
|
||||
|
||||
# Set up stdin-based I/O for client-facing nodes in headless mode.
|
||||
# When a client_facing EventLoopNode calls ask_user(), it emits
|
||||
# CLIENT_INPUT_REQUESTED on the event bus and blocks. We subscribe
|
||||
# a handler that prints the prompt and reads from stdin, then injects
|
||||
# the user's response back into the node to unblock it.
|
||||
has_client_facing = any(n.client_facing for n in self.graph.nodes)
|
||||
sub_ids: list[str] = []
|
||||
|
||||
if has_client_facing and sys.stdin.isatty():
|
||||
from framework.runtime.event_bus import EventType
|
||||
|
||||
runtime = self._agent_runtime
|
||||
|
||||
async def _handle_client_output(event):
|
||||
"""Print agent output to stdout as it streams."""
|
||||
content = event.data.get("content", "")
|
||||
if content:
|
||||
print(content, end="", flush=True)
|
||||
|
||||
async def _handle_input_requested(event):
|
||||
"""Read user input from stdin and inject it into the node."""
|
||||
import asyncio
|
||||
|
||||
node_id = event.node_id
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
user_input = await loop.run_in_executor(None, input, "\n>>> ")
|
||||
except EOFError:
|
||||
user_input = ""
|
||||
|
||||
# Inject into the waiting EventLoopNode via runtime
|
||||
await runtime.inject_input(node_id, user_input)
|
||||
|
||||
sub_ids.append(
|
||||
runtime.subscribe_to_events(
|
||||
event_types=[EventType.CLIENT_OUTPUT_DELTA],
|
||||
handler=_handle_client_output,
|
||||
)
|
||||
)
|
||||
sub_ids.append(
|
||||
runtime.subscribe_to_events(
|
||||
event_types=[EventType.CLIENT_INPUT_REQUESTED],
|
||||
handler=_handle_input_requested,
|
||||
)
|
||||
)
|
||||
|
||||
# Determine entry point
|
||||
if entry_point_id is None:
|
||||
# Use first entry point or "default" if no entry points defined
|
||||
@@ -842,44 +942,38 @@ class AgentRunner:
|
||||
else:
|
||||
entry_point_id = "default"
|
||||
|
||||
# Trigger and wait for result
|
||||
result = await self._agent_runtime.trigger_and_wait(
|
||||
entry_point_id=entry_point_id,
|
||||
input_data=input_data,
|
||||
)
|
||||
|
||||
# Return result or create error result
|
||||
if result is not None:
|
||||
return result
|
||||
else:
|
||||
return ExecutionResult(
|
||||
success=False,
|
||||
error="Execution timed out or failed to complete",
|
||||
try:
|
||||
# Trigger and wait for result
|
||||
result = await self._agent_runtime.trigger_and_wait(
|
||||
entry_point_id=entry_point_id,
|
||||
input_data=input_data,
|
||||
session_state=session_state,
|
||||
)
|
||||
|
||||
# === Multi-Entry-Point API (for agents with async_entry_points) ===
|
||||
# Return result or create error result
|
||||
if result is not None:
|
||||
return result
|
||||
else:
|
||||
return ExecutionResult(
|
||||
success=False,
|
||||
error="Execution timed out or failed to complete",
|
||||
)
|
||||
finally:
|
||||
# Clean up subscriptions
|
||||
for sub_id in sub_ids:
|
||||
self._agent_runtime.unsubscribe_from_events(sub_id)
|
||||
|
||||
# === Runtime API ===
|
||||
|
||||
async def start(self) -> None:
|
||||
"""
|
||||
Start the agent runtime (for multi-entry-point agents).
|
||||
|
||||
This starts all registered entry points and allows concurrent execution.
|
||||
For single-entry-point agents, this is a no-op.
|
||||
"""
|
||||
if not self._uses_async_entry_points:
|
||||
return
|
||||
|
||||
"""Start the agent runtime."""
|
||||
if self._agent_runtime is None:
|
||||
self._setup()
|
||||
|
||||
await self._agent_runtime.start()
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""
|
||||
Stop the agent runtime (for multi-entry-point agents).
|
||||
|
||||
For single-entry-point agents, this is a no-op.
|
||||
"""
|
||||
"""Stop the agent runtime."""
|
||||
if self._agent_runtime is not None:
|
||||
await self._agent_runtime.stop()
|
||||
|
||||
@@ -892,7 +986,7 @@ class AgentRunner:
|
||||
"""
|
||||
Trigger execution at a specific entry point (non-blocking).
|
||||
|
||||
For multi-entry-point agents only. Returns execution ID for tracking.
|
||||
Returns execution ID for tracking.
|
||||
|
||||
Args:
|
||||
entry_point_id: Which entry point to trigger
|
||||
@@ -901,16 +995,7 @@ class AgentRunner:
|
||||
|
||||
Returns:
|
||||
Execution ID for tracking
|
||||
|
||||
Raises:
|
||||
RuntimeError: If agent doesn't use async entry points
|
||||
"""
|
||||
if not self._uses_async_entry_points:
|
||||
raise RuntimeError(
|
||||
"trigger() is only available for multi-entry-point agents. "
|
||||
"Use run() for single-entry-point agents."
|
||||
)
|
||||
|
||||
if self._agent_runtime is None:
|
||||
self._setup()
|
||||
|
||||
@@ -927,19 +1012,9 @@ class AgentRunner:
|
||||
"""
|
||||
Get goal progress across all execution streams.
|
||||
|
||||
For multi-entry-point agents only.
|
||||
|
||||
Returns:
|
||||
Dict with overall_progress, criteria_status, constraint_violations, etc.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If agent doesn't use async entry points
|
||||
"""
|
||||
if not self._uses_async_entry_points:
|
||||
raise RuntimeError(
|
||||
"get_goal_progress() is only available for multi-entry-point agents."
|
||||
)
|
||||
|
||||
if self._agent_runtime is None:
|
||||
self._setup()
|
||||
|
||||
@@ -947,14 +1022,11 @@ class AgentRunner:
|
||||
|
||||
def get_entry_points(self) -> list[EntryPointSpec]:
|
||||
"""
|
||||
Get all registered entry points (for multi-entry-point agents).
|
||||
Get all registered entry points.
|
||||
|
||||
Returns:
|
||||
List of EntryPointSpec objects
|
||||
"""
|
||||
if not self._uses_async_entry_points:
|
||||
return []
|
||||
|
||||
if self._agent_runtime is None:
|
||||
self._setup()
|
||||
|
||||
@@ -1378,7 +1450,7 @@ Respond with JSON only:
|
||||
self._temp_dir = None
|
||||
|
||||
async def cleanup_async(self) -> None:
|
||||
"""Clean up resources (asynchronous - for multi-entry-point agents)."""
|
||||
"""Clean up resources (asynchronous)."""
|
||||
# Stop agent runtime if running
|
||||
if self._agent_runtime is not None and self._agent_runtime.is_running:
|
||||
await self._agent_runtime.stop()
|
||||
@@ -1389,8 +1461,7 @@ Respond with JSON only:
|
||||
async def __aenter__(self) -> "AgentRunner":
|
||||
"""Context manager entry."""
|
||||
self._setup()
|
||||
# Start runtime for multi-entry-point agents
|
||||
if self._uses_async_entry_points and self._agent_runtime is not None:
|
||||
if self._agent_runtime is not None:
|
||||
await self._agent_runtime.start()
|
||||
return self
|
||||
|
||||
|
||||
@@ -0,0 +1,172 @@
|
||||
# Agent Runtime
|
||||
|
||||
Unified execution system for all Hive agents. Every agent — single-entry or multi-entry, headless or TUI — runs through the same runtime stack.
|
||||
|
||||
## Topology
|
||||
|
||||
```
|
||||
AgentRunner.load(agent_path)
|
||||
|
|
||||
AgentRunner
|
||||
(factory + public API)
|
||||
|
|
||||
_setup_agent_runtime()
|
||||
|
|
||||
AgentRuntime
|
||||
(lifecycle + orchestration)
|
||||
/ | \
|
||||
Stream A Stream B Stream C ← one per entry point
|
||||
| | |
|
||||
GraphExecutor GraphExecutor GraphExecutor
|
||||
| | |
|
||||
Node → Node → Node (graph traversal)
|
||||
```
|
||||
|
||||
Single-entry agents get a `"default"` entry point automatically. There is no separate code path.
|
||||
|
||||
## Components
|
||||
|
||||
| Component | File | Role |
|
||||
|---|---|---|
|
||||
| `AgentRunner` | `runner/runner.py` | Load agents, configure tools/LLM, expose high-level API |
|
||||
| `AgentRuntime` | `runtime/agent_runtime.py` | Lifecycle management, entry point routing, event bus |
|
||||
| `ExecutionStream` | `runtime/execution_stream.py` | Per-entry-point execution queue, session persistence |
|
||||
| `GraphExecutor` | `graph/executor.py` | Node traversal, tool dispatch, checkpointing |
|
||||
| `EventBus` | `runtime/event_bus.py` | Pub/sub for execution events (streaming, I/O) |
|
||||
| `SharedStateManager` | `runtime/shared_state.py` | Cross-stream state with isolation levels |
|
||||
| `OutcomeAggregator` | `runtime/outcome_aggregator.py` | Goal progress tracking across streams |
|
||||
| `SessionStore` | `storage/session_store.py` | Session state persistence (`sessions/{id}/state.json`) |
|
||||
|
||||
## Programming Interface
|
||||
|
||||
### AgentRunner (high-level)
|
||||
|
||||
```python
|
||||
from framework.runner import AgentRunner
|
||||
|
||||
# Load and run
|
||||
runner = AgentRunner.load("exports/my_agent", model="anthropic/claude-sonnet-4-20250514")
|
||||
result = await runner.run({"query": "hello"})
|
||||
|
||||
# Resume from paused session
|
||||
result = await runner.run({"query": "continue"}, session_state=saved_state)
|
||||
|
||||
# Lifecycle
|
||||
await runner.start() # Start the runtime
|
||||
await runner.stop() # Stop the runtime
|
||||
exec_id = await runner.trigger("default", {}) # Non-blocking trigger
|
||||
progress = await runner.get_goal_progress() # Goal evaluation
|
||||
entry_points = runner.get_entry_points() # List entry points
|
||||
|
||||
# Context manager
|
||||
async with AgentRunner.load("exports/my_agent") as runner:
|
||||
result = await runner.run({"query": "hello"})
|
||||
|
||||
# Cleanup
|
||||
runner.cleanup() # Synchronous
|
||||
await runner.cleanup_async() # Asynchronous
|
||||
```
|
||||
|
||||
### AgentRuntime (lower-level)
|
||||
|
||||
```python
|
||||
from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
|
||||
from framework.runtime.execution_stream import EntryPointSpec
|
||||
|
||||
# Create runtime with entry points
|
||||
runtime = create_agent_runtime(
|
||||
graph=graph,
|
||||
goal=goal,
|
||||
storage_path=Path("~/.hive/agents/my_agent"),
|
||||
entry_points=[
|
||||
EntryPointSpec(id="default", name="Default", entry_node="start", trigger_type="manual"),
|
||||
],
|
||||
llm=llm,
|
||||
tools=tools,
|
||||
tool_executor=tool_executor,
|
||||
checkpoint_config=checkpoint_config,
|
||||
)
|
||||
|
||||
# Lifecycle
|
||||
await runtime.start()
|
||||
await runtime.stop()
|
||||
|
||||
# Execution
|
||||
exec_id = await runtime.trigger("default", {"query": "hello"}) # Non-blocking
|
||||
result = await runtime.trigger_and_wait("default", {"query": "hello"}) # Blocking
|
||||
result = await runtime.trigger_and_wait("default", {}, session_state=state) # Resume
|
||||
|
||||
# Client-facing node I/O
|
||||
await runtime.inject_input(node_id="chat", content="user response")
|
||||
|
||||
# Events
|
||||
sub_id = runtime.subscribe_to_events(
|
||||
event_types=[EventType.CLIENT_OUTPUT_DELTA],
|
||||
handler=my_handler,
|
||||
)
|
||||
runtime.unsubscribe_from_events(sub_id)
|
||||
|
||||
# Inspection
|
||||
runtime.is_running # bool
|
||||
runtime.event_bus # EventBus
|
||||
runtime.state_manager # SharedStateManager
|
||||
runtime.get_stats() # Runtime statistics
|
||||
```
|
||||
|
||||
## Execution Flow
|
||||
|
||||
1. `AgentRunner.run()` calls `AgentRuntime.trigger_and_wait()`
|
||||
2. `AgentRuntime` routes to the `ExecutionStream` for the entry point
|
||||
3. `ExecutionStream` creates a `GraphExecutor` and calls `execute()`
|
||||
4. `GraphExecutor` traverses nodes, dispatches tools, manages checkpoints
|
||||
5. `ExecutionResult` flows back up through the stack
|
||||
6. `ExecutionStream` writes session state to disk
|
||||
|
||||
## Session Resume
|
||||
|
||||
All execution paths support session resume:
|
||||
|
||||
```python
|
||||
# First run (agent pauses at a client-facing node)
|
||||
result = await runner.run({"query": "start task"})
|
||||
# result.paused_at = "review-node"
|
||||
# result.session_state = {"memory": {...}, "paused_at": "review-node", ...}
|
||||
|
||||
# Resume
|
||||
result = await runner.run({"input": "approved"}, session_state=result.session_state)
|
||||
```
|
||||
|
||||
Session state flows: `AgentRunner.run()` → `AgentRuntime.trigger_and_wait()` → `ExecutionStream.execute()` → `GraphExecutor.execute()`.
|
||||
|
||||
Checkpoints are saved at node boundaries (`sessions/{id}/checkpoints/`) for crash recovery.
|
||||
|
||||
## Event Bus
|
||||
|
||||
The `EventBus` provides real-time execution visibility:
|
||||
|
||||
| Event | When |
|
||||
|---|---|
|
||||
| `NODE_STARTED` | Node begins execution |
|
||||
| `NODE_COMPLETED` | Node finishes |
|
||||
| `TOOL_CALL_STARTED` | Tool invocation begins |
|
||||
| `TOOL_CALL_COMPLETED` | Tool invocation finishes |
|
||||
| `CLIENT_OUTPUT_DELTA` | Agent streams text to user |
|
||||
| `CLIENT_INPUT_REQUESTED` | Agent needs user input |
|
||||
| `EXECUTION_COMPLETED` | Full execution finishes |
|
||||
|
||||
In headless mode, `AgentRunner` subscribes to `CLIENT_OUTPUT_DELTA` and `CLIENT_INPUT_REQUESTED` to print output and read stdin. In TUI mode, `AdenTUI` subscribes to route events to UI widgets.
|
||||
|
||||
## Storage Layout
|
||||
|
||||
```
|
||||
~/.hive/agents/{agent_name}/
|
||||
sessions/
|
||||
session_YYYYMMDD_HHMMSS_{uuid}/
|
||||
state.json # Session state (status, memory, progress)
|
||||
checkpoints/ # Node-boundary snapshots
|
||||
logs/
|
||||
summary.json # Execution summary
|
||||
details.jsonl # Detailed event log
|
||||
tool_logs.jsonl # Tool call log
|
||||
runtime_logs/ # Cross-session runtime logs
|
||||
```
|
||||
@@ -0,0 +1,842 @@
|
||||
# Resumable Sessions Design
|
||||
|
||||
## Problem Statement
|
||||
|
||||
Currently, when an agent encounters a failure during execution (e.g., credential validation, API errors, tool failures), the entire session is lost. This creates a poor user experience, especially when:
|
||||
|
||||
1. The agent has completed significant work before the failure
|
||||
2. The failure is recoverable (e.g., adding missing credentials)
|
||||
3. The user wants to retry from the exact failure point without redoing work
|
||||
|
||||
## Design Goals
|
||||
|
||||
1. **Crash Recovery**: Sessions can resume after process crashes or errors
|
||||
2. **Partial Completion**: Preserve work done by nodes that completed successfully
|
||||
3. **Flexible Resume Points**: Resume from exact failure point or previous checkpoints
|
||||
4. **State Consistency**: Guarantee consistent SharedMemory and conversation state
|
||||
5. **Minimal Overhead**: Checkpointing shouldn't significantly impact performance
|
||||
6. **User Control**: Users can inspect, modify, and resume sessions explicitly
|
||||
|
||||
## Architecture
|
||||
|
||||
### 1. Checkpoint System
|
||||
|
||||
#### Checkpoint Types
|
||||
|
||||
**Automatic Checkpoints** (saved automatically by framework):
|
||||
- `node_start`: Before each node begins execution
|
||||
- `node_complete`: After each node successfully completes
|
||||
- `edge_transition`: Before traversing to next node
|
||||
- `loop_iteration`: At each iteration in EventLoopNode (optional)
|
||||
|
||||
**Manual Checkpoints** (triggered by agent designer):
|
||||
- `safe_point`: Explicitly marked safe points in graph
|
||||
- `user_checkpoint`: Before awaiting user input in client-facing nodes
|
||||
|
||||
#### Checkpoint Data Structure
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class Checkpoint:
|
||||
"""Single checkpoint in execution timeline."""
|
||||
|
||||
# Identity
|
||||
checkpoint_id: str # Format: checkpoint_{timestamp}_{uuid_short}
|
||||
session_id: str
|
||||
checkpoint_type: str # "node_start", "node_complete", etc.
|
||||
|
||||
# Timestamps
|
||||
created_at: str # ISO 8601
|
||||
|
||||
# Execution state
|
||||
current_node: str | None
|
||||
next_node: str | None # For edge_transition checkpoints
|
||||
execution_path: list[str] # Nodes executed so far
|
||||
|
||||
# Memory state (snapshot)
|
||||
shared_memory: dict[str, Any] # Full SharedMemory._data
|
||||
|
||||
# Per-node conversation state references
|
||||
# (actual conversations stored separately, reference by node_id)
|
||||
conversation_states: dict[str, str] # {node_id: conversation_checkpoint_id}
|
||||
|
||||
# Output accumulator state
|
||||
accumulated_outputs: dict[str, Any]
|
||||
|
||||
# Execution metrics (for resuming quality tracking)
|
||||
metrics_snapshot: dict[str, Any]
|
||||
|
||||
# Metadata
|
||||
is_clean: bool # True if no failures/retries before this checkpoint
|
||||
can_resume_from: bool # False if checkpoint is in unstable state
|
||||
description: str # Human-readable checkpoint description
|
||||
```
|
||||
|
||||
#### Storage Structure
|
||||
|
||||
```
|
||||
~/.hive/agents/{agent_name}/
|
||||
└── sessions/
|
||||
└── session_YYYYMMDD_HHMMSS_{uuid}/
|
||||
├── state.json # Session state (existing)
|
||||
├── checkpoints/
|
||||
│ ├── index.json # Checkpoint index/manifest
|
||||
│ ├── checkpoint_1.json # Individual checkpoints
|
||||
│ ├── checkpoint_2.json
|
||||
│ └── checkpoint_N.json
|
||||
├── conversations/ # Per-node conversation state (existing)
|
||||
│ ├── node_id_1/
|
||||
│ │ ├── parts/
|
||||
│ │ ├── meta.json
|
||||
│ │ └── cursor.json
|
||||
│ └── node_id_2/...
|
||||
├── data/ # Spillover artifacts (existing)
|
||||
└── logs/ # L1/L2/L3 logs (existing)
|
||||
```
|
||||
|
||||
**Checkpoint Index Format** (`checkpoints/index.json`):
|
||||
```json
|
||||
{
|
||||
"session_id": "session_20260208_143022_abc12345",
|
||||
"checkpoints": [
|
||||
{
|
||||
"checkpoint_id": "checkpoint_20260208_143030_xyz123",
|
||||
"type": "node_complete",
|
||||
"created_at": "2026-02-08T14:30:30.123Z",
|
||||
"current_node": "collector",
|
||||
"is_clean": true,
|
||||
"can_resume_from": true,
|
||||
"description": "Completed collector node successfully"
|
||||
},
|
||||
{
|
||||
"checkpoint_id": "checkpoint_20260208_143045_abc789",
|
||||
"type": "node_start",
|
||||
"created_at": "2026-02-08T14:30:45.456Z",
|
||||
"current_node": "analyzer",
|
||||
"is_clean": true,
|
||||
"can_resume_from": true,
|
||||
"description": "Starting analyzer node"
|
||||
}
|
||||
],
|
||||
"latest_checkpoint_id": "checkpoint_20260208_143045_abc789",
|
||||
"total_checkpoints": 2
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Resume Mechanism
|
||||
|
||||
#### Resume Flow
|
||||
|
||||
```python
|
||||
# High-level resume flow
|
||||
async def resume_session(
|
||||
session_id: str,
|
||||
checkpoint_id: str | None = None, # None = resume from latest
|
||||
modifications: dict[str, Any] | None = None, # Override memory values
|
||||
) -> ExecutionResult:
|
||||
"""
|
||||
Resume a session from a checkpoint.
|
||||
|
||||
Args:
|
||||
session_id: Session to resume
|
||||
checkpoint_id: Specific checkpoint (None = latest)
|
||||
modifications: Optional memory/state modifications before resume
|
||||
|
||||
Returns:
|
||||
ExecutionResult with resumed execution
|
||||
"""
|
||||
# 1. Load session state
|
||||
session_state = await session_store.read_state(session_id)
|
||||
|
||||
# 2. Verify session is resumable
|
||||
if not session_state.is_resumable:
|
||||
raise ValueError(f"Session {session_id} is not resumable")
|
||||
|
||||
# 3. Load checkpoint
|
||||
checkpoint = await checkpoint_store.load_checkpoint(
|
||||
session_id,
|
||||
checkpoint_id or session_state.progress.resume_from
|
||||
)
|
||||
|
||||
# 4. Restore state
|
||||
# - Restore SharedMemory from checkpoint.shared_memory
|
||||
# - Restore per-node conversations from checkpoint.conversation_states
|
||||
# - Restore output accumulator from checkpoint.accumulated_outputs
|
||||
# - Apply modifications if provided
|
||||
|
||||
# 5. Resume execution from checkpoint.next_node or checkpoint.current_node
|
||||
result = await executor.execute(
|
||||
graph=graph,
|
||||
goal=goal,
|
||||
memory=restored_memory,
|
||||
entry_point=checkpoint.next_node or checkpoint.current_node,
|
||||
session_state=restored_session_state,
|
||||
)
|
||||
|
||||
# 6. Update session state with resumed execution
|
||||
await session_store.write_state(session_id, updated_state)
|
||||
|
||||
return result
|
||||
```
|
||||
|
||||
#### Checkpoint Restoration
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class CheckpointStore:
|
||||
"""Manages checkpoint storage and retrieval."""
|
||||
|
||||
async def save_checkpoint(
|
||||
self,
|
||||
session_id: str,
|
||||
checkpoint: Checkpoint,
|
||||
) -> None:
|
||||
"""Save a checkpoint atomically."""
|
||||
# 1. Write checkpoint file: checkpoints/checkpoint_{id}.json
|
||||
# 2. Update index: checkpoints/index.json
|
||||
# 3. Use atomic write for crash safety
|
||||
|
||||
async def load_checkpoint(
|
||||
self,
|
||||
session_id: str,
|
||||
checkpoint_id: str | None = None,
|
||||
) -> Checkpoint | None:
|
||||
"""Load a checkpoint by ID or latest."""
|
||||
# 1. Read checkpoint index
|
||||
# 2. Find checkpoint by ID (or latest if None)
|
||||
# 3. Load and deserialize checkpoint file
|
||||
|
||||
async def list_checkpoints(
|
||||
self,
|
||||
session_id: str,
|
||||
checkpoint_type: str | None = None,
|
||||
is_clean: bool | None = None,
|
||||
) -> list[Checkpoint]:
|
||||
"""List all checkpoints for a session with optional filters."""
|
||||
|
||||
async def delete_checkpoint(
|
||||
self,
|
||||
session_id: str,
|
||||
checkpoint_id: str,
|
||||
) -> bool:
|
||||
"""Delete a specific checkpoint."""
|
||||
|
||||
async def prune_checkpoints(
|
||||
self,
|
||||
session_id: str,
|
||||
keep_count: int = 10,
|
||||
keep_clean_only: bool = False,
|
||||
) -> int:
|
||||
"""Prune old checkpoints, keeping most recent N."""
|
||||
```
|
||||
|
||||
### 3. GraphExecutor Integration
|
||||
|
||||
#### Modified Execution Loop
|
||||
|
||||
```python
|
||||
# In GraphExecutor.execute()
|
||||
|
||||
async def execute(
|
||||
self,
|
||||
graph: GraphSpec,
|
||||
goal: Goal,
|
||||
memory: SharedMemory | None = None,
|
||||
entry_point: str = "start",
|
||||
session_state: dict[str, Any] | None = None,
|
||||
checkpoint_config: CheckpointConfig | None = None,
|
||||
) -> ExecutionResult:
|
||||
"""
|
||||
Execute graph with checkpointing support.
|
||||
|
||||
New parameters:
|
||||
checkpoint_config: Configuration for checkpointing behavior
|
||||
"""
|
||||
|
||||
# Initialize checkpoint store
|
||||
checkpoint_store = CheckpointStore(storage_path / "checkpoints")
|
||||
|
||||
# Restore from checkpoint if session_state indicates resume
|
||||
if session_state and session_state.get("resume_from"):
|
||||
checkpoint = await checkpoint_store.load_checkpoint(
|
||||
session_id,
|
||||
session_state["resume_from"]
|
||||
)
|
||||
memory = self._restore_memory_from_checkpoint(checkpoint)
|
||||
entry_point = checkpoint.next_node or checkpoint.current_node
|
||||
|
||||
current_node = entry_point
|
||||
|
||||
while current_node:
|
||||
# CHECKPOINT: node_start
|
||||
if checkpoint_config and checkpoint_config.checkpoint_on_node_start:
|
||||
await self._save_checkpoint(
|
||||
checkpoint_store,
|
||||
checkpoint_type="node_start",
|
||||
current_node=current_node,
|
||||
memory=memory,
|
||||
# ... other state
|
||||
)
|
||||
|
||||
try:
|
||||
# Execute node
|
||||
result = await self._execute_node(current_node, memory, context)
|
||||
|
||||
# CHECKPOINT: node_complete
|
||||
if checkpoint_config and checkpoint_config.checkpoint_on_node_complete:
|
||||
await self._save_checkpoint(
|
||||
checkpoint_store,
|
||||
checkpoint_type="node_complete",
|
||||
current_node=current_node,
|
||||
memory=memory,
|
||||
# ... other state
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# On failure, mark current checkpoint as resume point
|
||||
await self._mark_failure_checkpoint(
|
||||
checkpoint_store,
|
||||
current_node=current_node,
|
||||
error=str(e),
|
||||
)
|
||||
raise
|
||||
|
||||
# Find next edge
|
||||
next_node = self._find_next_node(current_node, result, memory)
|
||||
|
||||
# CHECKPOINT: edge_transition
|
||||
if next_node and checkpoint_config and checkpoint_config.checkpoint_on_edge:
|
||||
await self._save_checkpoint(
|
||||
checkpoint_store,
|
||||
checkpoint_type="edge_transition",
|
||||
current_node=current_node,
|
||||
next_node=next_node,
|
||||
memory=memory,
|
||||
# ... other state
|
||||
)
|
||||
|
||||
current_node = next_node
|
||||
```
|
||||
|
||||
### 4. EventLoopNode Integration
|
||||
|
||||
#### Conversation State Checkpointing
|
||||
|
||||
EventLoopNode already has conversation persistence via `ConversationStore`. For resumability:
|
||||
|
||||
```python
|
||||
class EventLoopNode:
|
||||
async def execute(self, ctx: NodeContext) -> NodeResult:
|
||||
"""Execute with checkpoint support."""
|
||||
|
||||
# Try to restore from checkpoint
|
||||
if ctx.checkpoint_id:
|
||||
conversation = await self._restore_conversation(ctx.checkpoint_id)
|
||||
output_accumulator = await OutputAccumulator.restore(self.store)
|
||||
else:
|
||||
# Fresh start
|
||||
conversation = await self._initialize_conversation(ctx)
|
||||
output_accumulator = OutputAccumulator(store=self.store)
|
||||
|
||||
# Event loop with periodic checkpointing
|
||||
iteration = 0
|
||||
while iteration < self.config.max_iterations:
|
||||
|
||||
# Optional: checkpoint every N iterations
|
||||
if self.config.checkpoint_every_n_iterations:
|
||||
if iteration % self.config.checkpoint_every_n_iterations == 0:
|
||||
await self._save_loop_checkpoint(
|
||||
conversation,
|
||||
output_accumulator,
|
||||
iteration,
|
||||
)
|
||||
|
||||
# ... rest of event loop
|
||||
|
||||
iteration += 1
|
||||
```
|
||||
|
||||
**Note**: EventLoopNode conversation state is already persisted to disk after each turn via `ConversationStore`, so it's naturally resumable. We just need to:
|
||||
1. Track which conversation checkpoint to restore from
|
||||
2. Ensure output accumulator state is also restored
|
||||
|
||||
### 5. User-Facing API
|
||||
|
||||
#### MCP Tools for Resume
|
||||
|
||||
```python
|
||||
# In tools/src/aden_tools/tools/session_management/
|
||||
|
||||
@tool
|
||||
async def list_resumable_sessions(
|
||||
agent_work_dir: str,
|
||||
status: str = "failed", # "failed", "paused", "cancelled"
|
||||
limit: int = 20,
|
||||
) -> dict:
|
||||
"""
|
||||
List sessions that can be resumed.
|
||||
|
||||
Returns:
|
||||
{
|
||||
"sessions": [
|
||||
{
|
||||
"session_id": "session_20260208_143022_abc12345",
|
||||
"status": "failed",
|
||||
"error": "Missing API key: OPENAI_API_KEY",
|
||||
"failed_at_node": "analyzer",
|
||||
"last_checkpoint": "checkpoint_20260208_143045_abc789",
|
||||
"created_at": "2026-02-08T14:30:22Z",
|
||||
"updated_at": "2026-02-08T14:30:45Z"
|
||||
}
|
||||
],
|
||||
"total": 1
|
||||
}
|
||||
"""
|
||||
|
||||
@tool
|
||||
async def list_session_checkpoints(
|
||||
agent_work_dir: str,
|
||||
session_id: str,
|
||||
checkpoint_type: str = "", # Filter by type
|
||||
clean_only: bool = False, # Only show clean checkpoints
|
||||
) -> dict:
|
||||
"""
|
||||
List all checkpoints for a session.
|
||||
|
||||
Returns:
|
||||
{
|
||||
"session_id": "session_20260208_143022_abc12345",
|
||||
"checkpoints": [
|
||||
{
|
||||
"checkpoint_id": "checkpoint_20260208_143030_xyz123",
|
||||
"type": "node_complete",
|
||||
"created_at": "2026-02-08T14:30:30Z",
|
||||
"current_node": "collector",
|
||||
"is_clean": true,
|
||||
"can_resume_from": true,
|
||||
"description": "Completed collector node successfully"
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
"""
|
||||
|
||||
@tool
|
||||
async def inspect_checkpoint(
|
||||
agent_work_dir: str,
|
||||
session_id: str,
|
||||
checkpoint_id: str,
|
||||
include_memory: bool = False, # Include full memory state
|
||||
) -> dict:
|
||||
"""
|
||||
Inspect a checkpoint's detailed state.
|
||||
|
||||
Returns:
|
||||
{
|
||||
"checkpoint_id": "checkpoint_20260208_143030_xyz123",
|
||||
"type": "node_complete",
|
||||
"current_node": "collector",
|
||||
"execution_path": ["start", "collector"],
|
||||
"accumulated_outputs": {
|
||||
"twitter_handles": ["@user1", "@user2"]
|
||||
},
|
||||
"memory": {...}, # If include_memory=True
|
||||
"metrics_snapshot": {
|
||||
"total_retries": 2,
|
||||
"nodes_with_failures": []
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
@tool
|
||||
async def resume_session(
|
||||
agent_work_dir: str,
|
||||
session_id: str,
|
||||
checkpoint_id: str = "", # Empty = latest checkpoint
|
||||
memory_modifications: str = "{}", # JSON string of memory overrides
|
||||
) -> dict:
|
||||
"""
|
||||
Resume a session from a checkpoint.
|
||||
|
||||
Args:
|
||||
agent_work_dir: Path to agent workspace
|
||||
session_id: Session to resume
|
||||
checkpoint_id: Specific checkpoint (empty = latest)
|
||||
memory_modifications: JSON object with memory key overrides
|
||||
|
||||
Returns:
|
||||
{
|
||||
"session_id": "session_20260208_143022_abc12345",
|
||||
"resumed_from": "checkpoint_20260208_143045_abc789",
|
||||
"status": "active", # Now actively running
|
||||
"message": "Session resumed successfully from checkpoint_20260208_143045_abc789"
|
||||
}
|
||||
"""
|
||||
```
|
||||
|
||||
#### CLI Commands
|
||||
|
||||
```bash
|
||||
# List resumable sessions
|
||||
hive sessions list --agent deep_research_agent --status failed
|
||||
|
||||
# Show checkpoints for a session
|
||||
hive sessions checkpoints session_20260208_143022_abc12345
|
||||
|
||||
# Inspect a checkpoint
|
||||
hive sessions inspect session_20260208_143022_abc12345 checkpoint_20260208_143045_abc789
|
||||
|
||||
# Resume a session
|
||||
hive sessions resume session_20260208_143022_abc12345
|
||||
|
||||
# Resume from specific checkpoint
|
||||
hive sessions resume session_20260208_143022_abc12345 --checkpoint checkpoint_20260208_143030_xyz123
|
||||
|
||||
# Resume with memory modifications (e.g., after adding credentials)
|
||||
hive sessions resume session_20260208_143022_abc12345 --set api_key=sk-...
|
||||
```
|
||||
|
||||
### 6. Configuration
|
||||
|
||||
#### CheckpointConfig
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class CheckpointConfig:
|
||||
"""Configuration for checkpoint behavior."""
|
||||
|
||||
# When to checkpoint
|
||||
checkpoint_on_node_start: bool = True
|
||||
checkpoint_on_node_complete: bool = True
|
||||
checkpoint_on_edge: bool = False # Usually redundant with node_start
|
||||
checkpoint_on_loop_iteration: bool = False # Can be expensive
|
||||
checkpoint_every_n_iterations: int = 0 # 0 = disabled
|
||||
|
||||
# Pruning
|
||||
max_checkpoints_per_session: int = 100
|
||||
prune_after_node_count: int = 10 # Prune every N nodes
|
||||
keep_clean_checkpoints_only: bool = False
|
||||
|
||||
# Performance
|
||||
async_checkpoint: bool = True # Don't block execution on checkpoint writes
|
||||
|
||||
# What to include
|
||||
include_conversation_snapshots: bool = True
|
||||
include_full_memory: bool = True
|
||||
```
|
||||
|
||||
#### Agent-Level Configuration
|
||||
|
||||
```python
|
||||
# In agent.py or config.py
|
||||
|
||||
class MyAgent(Agent):
|
||||
def get_checkpoint_config(self) -> CheckpointConfig:
|
||||
"""Override to customize checkpoint behavior."""
|
||||
return CheckpointConfig(
|
||||
checkpoint_on_node_start=True,
|
||||
checkpoint_on_node_complete=True,
|
||||
checkpoint_every_n_iterations=5, # Checkpoint every 5 iterations in loops
|
||||
max_checkpoints_per_session=50,
|
||||
)
|
||||
```
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Core Checkpoint Infrastructure (Week 1)
|
||||
|
||||
1. **Create checkpoint schemas**
|
||||
- `Checkpoint` dataclass
|
||||
- `CheckpointIndex` for manifest
|
||||
- Serialization/deserialization
|
||||
|
||||
2. **Implement CheckpointStore**
|
||||
- `save_checkpoint()` with atomic writes
|
||||
- `load_checkpoint()` with deserialization
|
||||
- `list_checkpoints()` with filtering
|
||||
- `prune_checkpoints()` for cleanup
|
||||
|
||||
3. **Update SessionState schema**
|
||||
- Add `resume_from_checkpoint_id` field
|
||||
- Add `checkpoints_enabled` flag
|
||||
|
||||
### Phase 2: GraphExecutor Integration (Week 2)
|
||||
|
||||
1. **Modify GraphExecutor**
|
||||
- Add `CheckpointConfig` parameter
|
||||
- Implement checkpoint saving at node boundaries
|
||||
- Implement checkpoint restoration logic
|
||||
- Handle memory state snapshots
|
||||
|
||||
2. **Update execution loop**
|
||||
- Checkpoint before node execution
|
||||
- Checkpoint after successful completion
|
||||
- Mark failure checkpoints on errors
|
||||
|
||||
### Phase 3: EventLoopNode Integration (Week 3)
|
||||
|
||||
1. **Enhance conversation restoration**
|
||||
- Link checkpoints to conversation states
|
||||
- Ensure OutputAccumulator is checkpointed
|
||||
- Test loop resumption from middle of execution
|
||||
|
||||
2. **Add optional loop iteration checkpoints**
|
||||
- Configurable iteration frequency
|
||||
- Balance between granularity and performance
|
||||
|
||||
### Phase 4: User-Facing Features (Week 4)
|
||||
|
||||
1. **Implement MCP tools**
|
||||
- `list_resumable_sessions`
|
||||
- `list_session_checkpoints`
|
||||
- `inspect_checkpoint`
|
||||
- `resume_session`
|
||||
|
||||
2. **Add CLI commands**
|
||||
- `hive sessions list`
|
||||
- `hive sessions checkpoints`
|
||||
- `hive sessions inspect`
|
||||
- `hive sessions resume`
|
||||
|
||||
3. **Update TUI**
|
||||
- Show resumable sessions in UI
|
||||
- Allow resume from TUI interface
|
||||
|
||||
### Phase 5: Testing & Documentation (Week 5)
|
||||
|
||||
1. **Write comprehensive tests**
|
||||
- Unit tests for CheckpointStore
|
||||
- Integration tests for resume flow
|
||||
- Edge case testing (concurrent checkpoints, corruption, etc.)
|
||||
|
||||
2. **Performance testing**
|
||||
- Measure checkpoint overhead
|
||||
- Optimize async checkpoint writing
|
||||
- Test with large memory states
|
||||
|
||||
3. **Documentation**
|
||||
- Update skills with resume patterns
|
||||
- Document checkpoint configuration
|
||||
- Add troubleshooting guide
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Checkpoint Overhead
|
||||
|
||||
**Estimated overhead per checkpoint**:
|
||||
- Memory serialization: ~5-10ms for typical state (< 1MB)
|
||||
- File I/O: ~10-20ms for atomic write
|
||||
- Total: ~15-30ms per checkpoint
|
||||
|
||||
**Mitigation strategies**:
|
||||
1. **Async checkpointing**: Don't block execution on writes
|
||||
2. **Selective checkpointing**: Only checkpoint at important boundaries
|
||||
3. **Incremental checkpoints**: Store deltas instead of full state (future)
|
||||
4. **Compression**: Compress large memory states before writing
|
||||
|
||||
### Storage Size
|
||||
|
||||
**Typical checkpoint size**:
|
||||
- Small memory state (< 100KB): ~50-100KB per checkpoint
|
||||
- Medium memory state (< 1MB): ~500KB-1MB per checkpoint
|
||||
- Large memory state (> 1MB): ~1-5MB per checkpoint
|
||||
|
||||
**Mitigation strategies**:
|
||||
1. **Pruning**: Keep only N most recent checkpoints
|
||||
2. **Clean-only retention**: Only keep checkpoints from clean execution
|
||||
3. **Compression**: Use gzip for checkpoint files
|
||||
4. **Archiving**: Move old checkpoints to archive storage
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Checkpoint Save Failures
|
||||
|
||||
**Scenarios**:
|
||||
- Disk full
|
||||
- Permission errors
|
||||
- Serialization failures
|
||||
- Concurrent writes
|
||||
|
||||
**Handling**:
|
||||
```python
|
||||
try:
|
||||
await checkpoint_store.save_checkpoint(session_id, checkpoint)
|
||||
except CheckpointSaveError as e:
|
||||
# Log warning but don't fail execution
|
||||
logger.warning(f"Failed to save checkpoint: {e}")
|
||||
# Continue execution without checkpoint
|
||||
```
|
||||
|
||||
### Checkpoint Load Failures
|
||||
|
||||
**Scenarios**:
|
||||
- Checkpoint file corrupted
|
||||
- Checkpoint format incompatible
|
||||
- Referenced conversation state missing
|
||||
|
||||
**Handling**:
|
||||
```python
|
||||
try:
|
||||
checkpoint = await checkpoint_store.load_checkpoint(session_id, checkpoint_id)
|
||||
except CheckpointLoadError as e:
|
||||
# Try to find previous valid checkpoint
|
||||
checkpoints = await checkpoint_store.list_checkpoints(session_id)
|
||||
for cp in reversed(checkpoints):
|
||||
try:
|
||||
checkpoint = await checkpoint_store.load_checkpoint(session_id, cp.checkpoint_id)
|
||||
logger.info(f"Fell back to checkpoint {cp.checkpoint_id}")
|
||||
break
|
||||
except CheckpointLoadError:
|
||||
continue
|
||||
else:
|
||||
raise ValueError(f"No valid checkpoints found for session {session_id}")
|
||||
```
|
||||
|
||||
### Resume Failures
|
||||
|
||||
**Scenarios**:
|
||||
- Checkpoint state inconsistent with current graph
|
||||
- Node no longer exists in updated agent code
|
||||
- Memory keys missing required values
|
||||
|
||||
**Handling**:
|
||||
1. **Validation**: Verify checkpoint compatibility before resume
|
||||
2. **Graceful degradation**: Resume from earlier checkpoint if possible
|
||||
3. **User notification**: Clear error messages about why resume failed
|
||||
|
||||
## Migration Path
|
||||
|
||||
### Backward Compatibility
|
||||
|
||||
**Existing sessions** (without checkpoints):
|
||||
- Can still be executed normally
|
||||
- Checkpoint system is opt-in per agent
|
||||
- No breaking changes to existing APIs
|
||||
|
||||
**Enabling checkpoints**:
|
||||
```python
|
||||
# Option 1: Agent-level default
|
||||
class MyAgent(Agent):
|
||||
checkpoint_config = CheckpointConfig(
|
||||
checkpoint_on_node_complete=True,
|
||||
)
|
||||
|
||||
# Option 2: Runtime override
|
||||
runtime = create_agent_runtime(
|
||||
agent=my_agent,
|
||||
checkpoint_config=CheckpointConfig(...),
|
||||
)
|
||||
|
||||
# Option 3: Per-execution
|
||||
result = await executor.execute(
|
||||
graph=graph,
|
||||
goal=goal,
|
||||
checkpoint_config=CheckpointConfig(...),
|
||||
)
|
||||
```
|
||||
|
||||
### Gradual Rollout
|
||||
|
||||
1. **Phase 1**: Core infrastructure, no user-facing features
|
||||
2. **Phase 2**: Opt-in for specific agents via config
|
||||
3. **Phase 3**: User-facing MCP tools and CLI
|
||||
4. **Phase 4**: Enable by default for all new agents
|
||||
5. **Phase 5**: TUI integration
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
### 1. Incremental Checkpoints
|
||||
|
||||
Instead of full state snapshots, store only deltas:
|
||||
```python
|
||||
@dataclass
|
||||
class IncrementalCheckpoint:
|
||||
"""Checkpoint with only changed state."""
|
||||
base_checkpoint_id: str # Parent checkpoint
|
||||
memory_delta: dict[str, Any] # Only changed keys
|
||||
added_outputs: dict[str, Any] # Only new outputs
|
||||
```
|
||||
|
||||
### 2. Distributed Checkpointing
|
||||
|
||||
For long-running agents, checkpoint to cloud storage:
|
||||
```python
|
||||
checkpoint_config = CheckpointConfig(
|
||||
storage_backend="s3", # or "gcs", "azure"
|
||||
storage_url="s3://my-bucket/checkpoints/",
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Checkpoint Compression
|
||||
|
||||
Compress large memory states:
|
||||
```python
|
||||
checkpoint_config = CheckpointConfig(
|
||||
compress=True,
|
||||
compression_threshold_bytes=100_000, # Compress if > 100KB
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Smart Checkpoint Selection
|
||||
|
||||
Use heuristics to decide when to checkpoint:
|
||||
```python
|
||||
class SmartCheckpointStrategy:
|
||||
def should_checkpoint(self, context: ExecutionContext) -> bool:
|
||||
# Checkpoint after expensive nodes
|
||||
if context.node_latency_ms > 30_000:
|
||||
return True
|
||||
# Checkpoint before risky operations
|
||||
if context.node_id in ["api_call", "external_tool"]:
|
||||
return True
|
||||
# Checkpoint after significant memory changes
|
||||
if context.memory_delta_size > 10:
|
||||
return True
|
||||
return False
|
||||
```
|
||||
|
||||
## Security Considerations
|
||||
|
||||
### 1. Sensitive Data in Checkpoints
|
||||
|
||||
**Problem**: Checkpoints may contain sensitive data (API keys, credentials, PII)
|
||||
|
||||
**Mitigation**:
|
||||
```python
|
||||
@dataclass
|
||||
class CheckpointConfig:
|
||||
# Exclude sensitive keys from checkpoint
|
||||
exclude_memory_keys: list[str] = field(default_factory=lambda: [
|
||||
"api_key",
|
||||
"credentials",
|
||||
"access_token",
|
||||
])
|
||||
|
||||
# Encrypt checkpoint files
|
||||
encrypt_checkpoints: bool = True
|
||||
encryption_key_source: str = "keychain" # or "env_var", "file"
|
||||
```
|
||||
|
||||
### 2. Checkpoint Tampering
|
||||
|
||||
**Problem**: Malicious modification of checkpoint files
|
||||
|
||||
**Mitigation**:
|
||||
```python
|
||||
@dataclass
|
||||
class Checkpoint:
|
||||
# Add cryptographic signature
|
||||
signature: str # HMAC of checkpoint content
|
||||
|
||||
def verify_signature(self, secret_key: str) -> bool:
|
||||
"""Verify checkpoint hasn't been tampered with."""
|
||||
...
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
- [RUNTIME_LOGGING.md](./RUNTIME_LOGGING.md) - Current logging system
|
||||
- [session_state.py](../schemas/session_state.py) - Session state schema
|
||||
- [session_store.py](../storage/session_store.py) - Session storage
|
||||
- [executor.py](../graph/executor.py) - Graph executor
|
||||
- [event_loop_node.py](../graph/event_loop_node.py) - EventLoop implementation
|
||||
@@ -197,8 +197,17 @@ class NodeStepLog:
|
||||
tokens_used: int
|
||||
latency_ms: int
|
||||
# ... detailed execution state
|
||||
# Trace context (OTel-aligned; empty if observability context not set):
|
||||
trace_id: str # From set_trace_context (OTel trace)
|
||||
span_id: str # 16 hex chars per step (OTel span)
|
||||
parent_span_id: str # Optional; for nested span hierarchy
|
||||
execution_id: str # Session/run correlation id
|
||||
```
|
||||
|
||||
L3 entries include `trace_id`, `span_id`, and `execution_id` for correlation and **OpenTelemetry (OTel) compatibility**. When the framework sets trace context (e.g. via `Runtime.start_run()` or `StreamRuntime.start_run()`), these fields are populated automatically so L3 data can be exported to OTel backends without schema changes.
|
||||
|
||||
**L2: NodeDetail** also includes `trace_id` and `span_id`; **L1: RunSummaryLog** includes `trace_id` and `execution_id` for the same correlation.
|
||||
|
||||
---
|
||||
|
||||
## Querying Logs (MCP Tools)
|
||||
@@ -215,7 +224,7 @@ Three MCP tools provide access to the logging system:
|
||||
|
||||
```python
|
||||
query_runtime_logs(
|
||||
agent_work_dir: str, # e.g., "~/.hive/agents/twitter_outreach"
|
||||
agent_work_dir: str, # e.g., "~/.hive/agents/deep_research_agent"
|
||||
status: str = "", # "needs_attention", "success", "failure", "degraded"
|
||||
limit: int = 20
|
||||
) -> dict # {"runs": [...], "total": int}
|
||||
@@ -362,14 +371,14 @@ query_runtime_log_raw(agent_work_dir, run_id)
|
||||
```python
|
||||
# 1. Find problematic runs (L1)
|
||||
result = query_runtime_logs(
|
||||
agent_work_dir="~/.hive/agents/twitter_outreach",
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
status="needs_attention"
|
||||
)
|
||||
run_id = result["runs"][0]["run_id"]
|
||||
|
||||
# 2. Identify failing nodes (L2)
|
||||
details = query_runtime_log_details(
|
||||
agent_work_dir="~/.hive/agents/twitter_outreach",
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
run_id=run_id,
|
||||
needs_attention_only=True
|
||||
)
|
||||
@@ -377,7 +386,7 @@ problem_node = details["nodes"][0]["node_id"]
|
||||
|
||||
# 3. Analyze root cause (L3)
|
||||
raw = query_runtime_log_raw(
|
||||
agent_work_dir="~/.hive/agents/twitter_outreach",
|
||||
agent_work_dir="~/.hive/agents/deep_research_agent",
|
||||
run_id=run_id,
|
||||
node_id=problem_node
|
||||
)
|
||||
@@ -487,7 +496,7 @@ logger.start_run(goal_id, session_id=execution_id)
|
||||
```json
|
||||
{
|
||||
"run_id": "session_20260206_115718_e22339c5",
|
||||
"goal_id": "twitter-outreach-multi-loop",
|
||||
"goal_id": "deep-research",
|
||||
"status": "degraded",
|
||||
"started_at": "2026-02-06T11:57:18.593081",
|
||||
"ended_at": "2026-02-06T11:58:45.123456",
|
||||
@@ -520,9 +529,10 @@ logger.start_run(goal_id, session_id=execution_id)
|
||||
**Written:** Incrementally (append per step)
|
||||
**Format:** JSONL (one JSON object per line)
|
||||
|
||||
Each line includes **trace context** when the framework has set it (via the observability module): `trace_id`, `span_id`, `parent_span_id` (optional), and `execution_id`. These align with OpenTelemetry/W3C TraceContext so L3 data can be exported to OTel backends without schema changes.
|
||||
|
||||
```jsonl
|
||||
{"node_id":"intake-collector","step_index":3,"tool_calls":[{"tool":"web_search","args":{"query":"@RomuloNevesOf"}}],"tool_results":[{"status":"success","data":"..."}],"verdict":"RETRY","verdict_feedback":"Missing required output 'twitter_handles'. You found the handle but didn't call set_output.","llm_response_text":"I found the profile...","tokens_used":1234,"latency_ms":2500}
|
||||
{"node_id":"intake-collector","step_index":4,"tool_calls":[{"tool":"web_search","args":{"query":"@RomuloNevesOf twitter"}}],"tool_results":[{"status":"success","data":"..."}],"verdict":"RETRY","verdict_feedback":"Still missing 'twitter_handles'.","llm_response_text":"Found more info...","tokens_used":1456,"latency_ms":2300}
|
||||
{"node_id":"intake-collector","step_index":3,"trace_id":"54e80d7b5bd6409dbc3217e5cd16a4fd","span_id":"a1b2c3d4e5f67890","execution_id":"b4c348ec54e80d7b5bd6409dbc3217e50","tool_calls":[...],"verdict":"RETRY",...}
|
||||
```
|
||||
|
||||
**Why JSONL?**
|
||||
|
||||
@@ -12,6 +12,7 @@ from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from framework.graph.checkpoint_config import CheckpointConfig
|
||||
from framework.graph.executor import ExecutionResult
|
||||
from framework.runtime.event_bus import EventBus
|
||||
from framework.runtime.execution_stream import EntryPointSpec, ExecutionStream
|
||||
@@ -102,6 +103,7 @@ class AgentRuntime:
|
||||
tool_executor: Callable | None = None,
|
||||
config: AgentRuntimeConfig | None = None,
|
||||
runtime_log_store: Any = None,
|
||||
checkpoint_config: CheckpointConfig | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize agent runtime.
|
||||
@@ -115,11 +117,13 @@ class AgentRuntime:
|
||||
tool_executor: Function to execute tools
|
||||
config: Optional runtime configuration
|
||||
runtime_log_store: Optional RuntimeLogStore for per-execution logging
|
||||
checkpoint_config: Optional checkpoint configuration for resumable sessions
|
||||
"""
|
||||
self.graph = graph
|
||||
self.goal = goal
|
||||
self._config = config or AgentRuntimeConfig()
|
||||
self._runtime_log_store = runtime_log_store
|
||||
self._checkpoint_config = checkpoint_config
|
||||
|
||||
# Initialize storage
|
||||
storage_path_obj = Path(storage_path) if isinstance(storage_path, str) else storage_path
|
||||
@@ -150,6 +154,9 @@ class AgentRuntime:
|
||||
self._running = False
|
||||
self._lock = asyncio.Lock()
|
||||
|
||||
# Optional greeting shown to user on TUI load (set by AgentRunner)
|
||||
self.intro_message: str = ""
|
||||
|
||||
def register_entry_point(self, spec: EntryPointSpec) -> None:
|
||||
"""
|
||||
Register a named entry point for the agent.
|
||||
@@ -222,6 +229,7 @@ class AgentRuntime:
|
||||
result_retention_ttl_seconds=self._config.execution_result_ttl_seconds,
|
||||
runtime_log_store=self._runtime_log_store,
|
||||
session_store=self._session_store,
|
||||
checkpoint_config=self._checkpoint_config,
|
||||
)
|
||||
await stream.start()
|
||||
self._streams[ep_id] = stream
|
||||
@@ -460,6 +468,7 @@ def create_agent_runtime(
|
||||
config: AgentRuntimeConfig | None = None,
|
||||
runtime_log_store: Any = None,
|
||||
enable_logging: bool = True,
|
||||
checkpoint_config: CheckpointConfig | None = None,
|
||||
) -> AgentRuntime:
|
||||
"""
|
||||
Create and configure an AgentRuntime with entry points.
|
||||
@@ -480,6 +489,8 @@ def create_agent_runtime(
|
||||
If None and enable_logging=True, creates one automatically.
|
||||
enable_logging: Whether to enable runtime logging (default: True).
|
||||
Set to False to disable logging entirely.
|
||||
checkpoint_config: Optional checkpoint configuration for resumable sessions.
|
||||
If None, uses default checkpointing behavior.
|
||||
|
||||
Returns:
|
||||
Configured AgentRuntime (not yet started)
|
||||
@@ -500,6 +511,7 @@ def create_agent_runtime(
|
||||
tool_executor=tool_executor,
|
||||
config=config,
|
||||
runtime_log_store=runtime_log_store,
|
||||
checkpoint_config=checkpoint_config,
|
||||
)
|
||||
|
||||
for spec in entry_points:
|
||||
|
||||
@@ -13,6 +13,7 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from framework.observability import set_trace_context
|
||||
from framework.schemas.decision import Decision, DecisionType, Option, Outcome
|
||||
from framework.schemas.run import Run, RunStatus
|
||||
from framework.storage.backend import FileStorage
|
||||
@@ -79,6 +80,14 @@ class Runtime:
|
||||
The run ID
|
||||
"""
|
||||
run_id = f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
|
||||
trace_id = uuid.uuid4().hex
|
||||
execution_id = uuid.uuid4().hex # 32 hex, OTel/W3C-aligned for logs
|
||||
|
||||
set_trace_context(
|
||||
trace_id=trace_id,
|
||||
execution_id=execution_id,
|
||||
goal_id=goal_id,
|
||||
)
|
||||
|
||||
self._current_run = Run(
|
||||
id=run_id,
|
||||
|
||||
@@ -17,6 +17,7 @@ from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from framework.graph.checkpoint_config import CheckpointConfig
|
||||
from framework.graph.executor import ExecutionResult, GraphExecutor
|
||||
from framework.runtime.shared_state import IsolationLevel, SharedStateManager
|
||||
from framework.runtime.stream_runtime import StreamRuntime, StreamRuntimeAdapter
|
||||
@@ -115,6 +116,7 @@ class ExecutionStream:
|
||||
result_retention_ttl_seconds: float | None = None,
|
||||
runtime_log_store: Any = None,
|
||||
session_store: "SessionStore | None" = None,
|
||||
checkpoint_config: CheckpointConfig | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize execution stream.
|
||||
@@ -133,6 +135,7 @@ class ExecutionStream:
|
||||
tool_executor: Function to execute tools
|
||||
runtime_log_store: Optional RuntimeLogStore for per-execution logging
|
||||
session_store: Optional SessionStore for unified session storage
|
||||
checkpoint_config: Optional checkpoint configuration for resumable sessions
|
||||
"""
|
||||
self.stream_id = stream_id
|
||||
self.entry_spec = entry_spec
|
||||
@@ -148,6 +151,7 @@ class ExecutionStream:
|
||||
self._result_retention_max = result_retention_max
|
||||
self._result_retention_ttl_seconds = result_retention_ttl_seconds
|
||||
self._runtime_log_store = runtime_log_store
|
||||
self._checkpoint_config = checkpoint_config
|
||||
self._session_store = session_store
|
||||
|
||||
# Create stream-scoped runtime
|
||||
@@ -357,6 +361,13 @@ class ExecutionStream:
|
||||
# Create runtime adapter for this execution
|
||||
runtime_adapter = StreamRuntimeAdapter(self._runtime, execution_id)
|
||||
|
||||
# Start run to set trace context (CRITICAL for observability)
|
||||
runtime_adapter.start_run(
|
||||
goal_id=self.goal.id,
|
||||
goal_description=self.goal.description,
|
||||
input_data=ctx.input_data,
|
||||
)
|
||||
|
||||
# Create per-execution runtime logger
|
||||
runtime_logger = None
|
||||
if self._runtime_log_store:
|
||||
@@ -400,6 +411,7 @@ class ExecutionStream:
|
||||
goal=self.goal,
|
||||
input_data=ctx.input_data,
|
||||
session_state=ctx.session_state,
|
||||
checkpoint_config=self._checkpoint_config,
|
||||
)
|
||||
|
||||
# Clean up executor reference
|
||||
@@ -408,6 +420,13 @@ class ExecutionStream:
|
||||
# Store result with retention
|
||||
self._record_execution_result(execution_id, result)
|
||||
|
||||
# End run to complete trace (for observability)
|
||||
runtime_adapter.end_run(
|
||||
success=result.success,
|
||||
narrative=f"Execution {'succeeded' if result.success else 'failed'}",
|
||||
output_data=result.output,
|
||||
)
|
||||
|
||||
# Update context
|
||||
ctx.completed_at = datetime.now()
|
||||
ctx.status = "completed" if result.success else "failed"
|
||||
@@ -437,8 +456,42 @@ class ExecutionStream:
|
||||
logger.debug(f"Execution {execution_id} completed: success={result.success}")
|
||||
|
||||
except asyncio.CancelledError:
|
||||
ctx.status = "cancelled"
|
||||
raise
|
||||
# Execution was cancelled
|
||||
# The executor catches CancelledError and returns a paused result,
|
||||
# but if cancellation happened before executor started, we won't have a result
|
||||
logger.info(f"Execution {execution_id} cancelled")
|
||||
|
||||
# Check if we have a result (executor completed and returned)
|
||||
try:
|
||||
_ = result # Check if result variable exists
|
||||
has_result = True
|
||||
except NameError:
|
||||
has_result = False
|
||||
result = ExecutionResult(
|
||||
success=False,
|
||||
error="Execution cancelled",
|
||||
)
|
||||
|
||||
# Update context status based on result
|
||||
if has_result and result.paused_at:
|
||||
ctx.status = "paused"
|
||||
ctx.completed_at = datetime.now()
|
||||
else:
|
||||
ctx.status = "cancelled"
|
||||
|
||||
# Clean up executor reference
|
||||
self._active_executors.pop(execution_id, None)
|
||||
|
||||
# Store result with retention
|
||||
self._record_execution_result(execution_id, result)
|
||||
|
||||
# Write session state
|
||||
if has_result and result.paused_at:
|
||||
await self._write_session_state(execution_id, ctx, result=result)
|
||||
else:
|
||||
await self._write_session_state(execution_id, ctx, error="Execution cancelled")
|
||||
|
||||
# Don't re-raise - we've handled it and saved state
|
||||
|
||||
except Exception as e:
|
||||
ctx.status = "failed"
|
||||
@@ -456,6 +509,16 @@ class ExecutionStream:
|
||||
# Write error session state
|
||||
await self._write_session_state(execution_id, ctx, error=str(e))
|
||||
|
||||
# End run with failure (for observability)
|
||||
try:
|
||||
runtime_adapter.end_run(
|
||||
success=False,
|
||||
narrative=f"Execution failed: {str(e)}",
|
||||
output_data={},
|
||||
)
|
||||
except Exception:
|
||||
pass # Don't let end_run errors mask the original error
|
||||
|
||||
# Emit failure event
|
||||
if self._event_bus:
|
||||
await self._event_bus.emit_execution_failed(
|
||||
@@ -511,7 +574,11 @@ class ExecutionStream:
|
||||
else:
|
||||
status = SessionStatus.FAILED
|
||||
elif error:
|
||||
status = SessionStatus.FAILED
|
||||
# Check if this is a cancellation
|
||||
if ctx.status == "cancelled" or "cancelled" in error.lower():
|
||||
status = SessionStatus.CANCELLED
|
||||
else:
|
||||
status = SessionStatus.FAILED
|
||||
else:
|
||||
status = SessionStatus.ACTIVE
|
||||
|
||||
|
||||
@@ -31,6 +31,9 @@ class NodeStepLog(BaseModel):
|
||||
|
||||
For EventLoopNode, each iteration is a step. For single-step nodes
|
||||
(LLMNode, FunctionNode, RouterNode), step_index is 0.
|
||||
|
||||
OTel-aligned fields (trace_id, span_id, execution_id) enable correlation
|
||||
and future OpenTelemetry export without schema changes.
|
||||
"""
|
||||
|
||||
node_id: str
|
||||
@@ -48,6 +51,11 @@ class NodeStepLog(BaseModel):
|
||||
error: str = "" # Error message if step failed
|
||||
stacktrace: str = "" # Full stack trace if exception occurred
|
||||
is_partial: bool = False # True if step didn't complete normally
|
||||
# OTel / trace context (from observability; empty if not set):
|
||||
trace_id: str = "" # OTel trace id (e.g. from set_trace_context)
|
||||
span_id: str = "" # OTel span id (16 hex chars per step)
|
||||
parent_span_id: str = "" # Optional; for nested span hierarchy
|
||||
execution_id: str = "" # Session/run correlation id
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -56,7 +64,10 @@ class NodeStepLog(BaseModel):
|
||||
|
||||
|
||||
class NodeDetail(BaseModel):
|
||||
"""Per-node completion result and attention flags."""
|
||||
"""Per-node completion result and attention flags.
|
||||
|
||||
OTel-aligned fields (trace_id, span_id) tie L2 to the same trace as L3.
|
||||
"""
|
||||
|
||||
node_id: str
|
||||
node_name: str = ""
|
||||
@@ -78,6 +89,9 @@ class NodeDetail(BaseModel):
|
||||
continue_count: int = 0
|
||||
needs_attention: bool = False
|
||||
attention_reasons: list[str] = Field(default_factory=list)
|
||||
# OTel / trace context (from observability; empty if not set):
|
||||
trace_id: str = ""
|
||||
span_id: str = "" # Optional node-level span for hierarchy
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -86,7 +100,10 @@ class NodeDetail(BaseModel):
|
||||
|
||||
|
||||
class RunSummaryLog(BaseModel):
|
||||
"""Run-level summary for a full graph execution."""
|
||||
"""Run-level summary for a full graph execution.
|
||||
|
||||
OTel-aligned fields (trace_id, execution_id) tie L1 to the same trace as L2/L3.
|
||||
"""
|
||||
|
||||
run_id: str
|
||||
agent_id: str = ""
|
||||
@@ -101,6 +118,9 @@ class RunSummaryLog(BaseModel):
|
||||
started_at: str = "" # ISO timestamp
|
||||
duration_ms: int = 0
|
||||
execution_quality: str = "" # "clean"|"degraded"|"failed"
|
||||
# OTel / trace context (from observability; empty if not set):
|
||||
trace_id: str = ""
|
||||
execution_id: str = ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -52,29 +52,20 @@ class RuntimeLogStore:
|
||||
|
||||
- New format (session_*): {storage_root}/sessions/{run_id}/logs/
|
||||
- Old format (anything else): {base_path}/runs/{run_id}/ (deprecated)
|
||||
|
||||
When base_path ends with 'runtime_logs', we use the parent directory
|
||||
to avoid nesting under runtime_logs/.
|
||||
|
||||
This allows backward compatibility for reading old logs.
|
||||
"""
|
||||
if run_id.startswith("session_"):
|
||||
# New: sessions/{session_id}/logs/
|
||||
# If base_path ends with runtime_logs, use parent (storage root)
|
||||
is_runtime_logs = self._base_path.name == "runtime_logs"
|
||||
root = self._base_path.parent if is_runtime_logs else self._base_path
|
||||
return root / "sessions" / run_id / "logs"
|
||||
else:
|
||||
# Old: runs/{run_id}/ (deprecated, backward compatibility only)
|
||||
import warnings
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
f"Reading logs from deprecated location for run_id={run_id}. "
|
||||
"New sessions use unified storage at sessions/session_*/logs/",
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
return self._base_path / "runs" / run_id
|
||||
warnings.warn(
|
||||
f"Reading logs from deprecated location for run_id={run_id}. "
|
||||
"New sessions use unified storage at sessions/session_*/logs/",
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
return self._base_path / "runs" / run_id
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Incremental write (sync — called from locked sections)
|
||||
|
||||
@@ -26,6 +26,7 @@ import uuid
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
||||
from framework.observability import get_trace_context
|
||||
from framework.runtime.runtime_log_schemas import (
|
||||
NodeDetail,
|
||||
NodeStepLog,
|
||||
@@ -64,10 +65,8 @@ class RuntimeLogger:
|
||||
The run_id (same as session_id if provided)
|
||||
"""
|
||||
if session_id:
|
||||
# Use provided session_id as run_id (unified sessions)
|
||||
self._run_id = session_id
|
||||
else:
|
||||
# Generate run_id in old format (backward compatibility)
|
||||
ts = datetime.now(UTC).strftime("%Y%m%dT%H%M%S")
|
||||
short_uuid = uuid.uuid4().hex[:8]
|
||||
self._run_id = f"{ts}_{short_uuid}"
|
||||
@@ -118,6 +117,12 @@ class RuntimeLogger:
|
||||
)
|
||||
)
|
||||
|
||||
# OTel / trace context: from observability ContextVar (empty if not set)
|
||||
ctx = get_trace_context()
|
||||
trace_id = ctx.get("trace_id", "")
|
||||
execution_id = ctx.get("execution_id", "")
|
||||
span_id = uuid.uuid4().hex[:16] # OTel 16-hex span_id per step
|
||||
|
||||
step_log = NodeStepLog(
|
||||
node_id=node_id,
|
||||
node_type=node_type,
|
||||
@@ -132,6 +137,9 @@ class RuntimeLogger:
|
||||
error=error,
|
||||
stacktrace=stacktrace,
|
||||
is_partial=is_partial,
|
||||
trace_id=trace_id,
|
||||
span_id=span_id,
|
||||
execution_id=execution_id,
|
||||
)
|
||||
|
||||
with self._lock:
|
||||
@@ -190,6 +198,11 @@ class RuntimeLogger:
|
||||
needs_attention = True
|
||||
attention_reasons.append(f"Many iterations: {total_steps}")
|
||||
|
||||
# OTel / trace context for L2 correlation
|
||||
ctx = get_trace_context()
|
||||
trace_id = ctx.get("trace_id", "")
|
||||
span_id = uuid.uuid4().hex[:16] # Optional node-level span
|
||||
|
||||
detail = NodeDetail(
|
||||
node_id=node_id,
|
||||
node_name=node_name,
|
||||
@@ -210,6 +223,8 @@ class RuntimeLogger:
|
||||
continue_count=continue_count,
|
||||
needs_attention=needs_attention,
|
||||
attention_reasons=attention_reasons,
|
||||
trace_id=trace_id,
|
||||
span_id=span_id,
|
||||
)
|
||||
|
||||
with self._lock:
|
||||
@@ -274,6 +289,11 @@ class RuntimeLogger:
|
||||
for nd in node_details:
|
||||
attention_reasons.extend(nd.attention_reasons)
|
||||
|
||||
# OTel / trace context for L1 correlation
|
||||
ctx = get_trace_context()
|
||||
trace_id = ctx.get("trace_id", "")
|
||||
execution_id = ctx.get("execution_id", "")
|
||||
|
||||
summary = RunSummaryLog(
|
||||
run_id=self._run_id,
|
||||
agent_id=self._agent_id,
|
||||
@@ -288,6 +308,8 @@ class RuntimeLogger:
|
||||
started_at=self._started_at,
|
||||
duration_ms=duration_ms,
|
||||
execution_quality=execution_quality,
|
||||
trace_id=trace_id,
|
||||
execution_id=execution_id,
|
||||
)
|
||||
|
||||
await self._store.save_summary(self._run_id, summary)
|
||||
|
||||
@@ -12,6 +12,7 @@ import uuid
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from framework.observability import set_trace_context
|
||||
from framework.schemas.decision import Decision, DecisionType, Option, Outcome
|
||||
from framework.schemas.run import Run, RunStatus
|
||||
from framework.storage.concurrent import ConcurrentStorage
|
||||
@@ -119,6 +120,16 @@ class StreamRuntime:
|
||||
"""
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
run_id = f"run_{self.stream_id}_{timestamp}_{uuid.uuid4().hex[:8]}"
|
||||
trace_id = uuid.uuid4().hex
|
||||
otel_execution_id = uuid.uuid4().hex # 32 hex, OTel/W3C-aligned for logs
|
||||
|
||||
set_trace_context(
|
||||
trace_id=trace_id,
|
||||
execution_id=otel_execution_id,
|
||||
run_id=run_id,
|
||||
goal_id=goal_id,
|
||||
stream_id=self.stream_id,
|
||||
)
|
||||
|
||||
run = Run(
|
||||
id=run_id,
|
||||
|
||||
@@ -0,0 +1,178 @@
|
||||
"""
|
||||
Checkpoint Schema - Execution state snapshots for resumability.
|
||||
|
||||
Checkpoints capture the execution state at strategic points (node boundaries,
|
||||
iterations) to enable crash recovery and resume-from-failure scenarios.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class Checkpoint(BaseModel):
    """
    Single checkpoint in execution timeline.

    Captures complete execution state at a specific point to enable
    resuming from that exact point after failures or pauses.
    """

    # Identity
    checkpoint_id: str  # Format: cp_{type}_{node_id}_{timestamp}
    checkpoint_type: str  # "node_start" | "node_complete" | "loop_iteration"
    session_id: str

    # Timestamps
    created_at: str  # ISO 8601 format

    # Execution state
    current_node: str | None = None
    next_node: str | None = None  # For edge_transition checkpoints
    execution_path: list[str] = Field(default_factory=list)  # Nodes executed so far

    # State snapshots
    shared_memory: dict[str, Any] = Field(default_factory=dict)  # Full SharedMemory._data
    accumulated_outputs: dict[str, Any] = Field(default_factory=dict)  # Outputs accumulated so far

    # Execution metrics (for resuming quality tracking)
    metrics_snapshot: dict[str, Any] = Field(default_factory=dict)

    # Metadata
    is_clean: bool = True  # True if no failures/retries before this checkpoint
    description: str = ""  # Human-readable checkpoint description

    model_config = {"extra": "allow"}

    @classmethod
    def create(
        cls,
        checkpoint_type: str,
        session_id: str,
        current_node: str,
        execution_path: list[str],
        shared_memory: dict[str, Any],
        next_node: str | None = None,
        accumulated_outputs: dict[str, Any] | None = None,
        metrics_snapshot: dict[str, Any] | None = None,
        is_clean: bool = True,
        description: str = "",
    ) -> "Checkpoint":
        """
        Create a new checkpoint with generated ID and timestamp.

        Args:
            checkpoint_type: Type of checkpoint (node_start, node_complete, etc.)
            session_id: Session this checkpoint belongs to
            current_node: Node ID at checkpoint time
            execution_path: List of node IDs executed so far
            shared_memory: Full memory state snapshot
            next_node: Next node to execute (for node_complete checkpoints)
            accumulated_outputs: Outputs accumulated so far
            metrics_snapshot: Execution metrics at checkpoint time
            is_clean: Whether execution was clean up to this point
            description: Human-readable description

        Returns:
            New Checkpoint instance
        """
        # Capture the clock once so the timestamp embedded in checkpoint_id
        # and the created_at field always refer to the same instant.
        # (Calling datetime.now() twice could straddle a second boundary,
        # producing an ID whose timestamp disagrees with created_at.)
        now = datetime.now()
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        checkpoint_id = f"cp_{checkpoint_type}_{current_node}_{timestamp}"

        if not description:
            description = f"{checkpoint_type.replace('_', ' ').title()}: {current_node}"

        return cls(
            checkpoint_id=checkpoint_id,
            checkpoint_type=checkpoint_type,
            session_id=session_id,
            created_at=now.isoformat(),
            current_node=current_node,
            next_node=next_node,
            execution_path=execution_path,
            shared_memory=shared_memory,
            accumulated_outputs=accumulated_outputs or {},
            metrics_snapshot=metrics_snapshot or {},
            is_clean=is_clean,
            description=description,
        )
|
||||
|
||||
|
||||
class CheckpointSummary(BaseModel):
    """
    Lightweight checkpoint metadata for index listings.

    Lets the checkpoint index be scanned quickly without deserializing
    full checkpoint payloads (memory snapshots, outputs, metrics).
    """

    checkpoint_id: str
    checkpoint_type: str
    created_at: str
    current_node: str | None = None
    next_node: str | None = None
    is_clean: bool = True
    description: str = ""

    model_config = {"extra": "allow"}

    @classmethod
    def from_checkpoint(cls, checkpoint: Checkpoint) -> "CheckpointSummary":
        """Project the index-relevant fields out of a full checkpoint."""
        # Copy exactly the summary fields; heavy state (shared_memory,
        # accumulated_outputs, metrics_snapshot) is deliberately dropped.
        summary_fields = (
            "checkpoint_id",
            "checkpoint_type",
            "created_at",
            "current_node",
            "next_node",
            "is_clean",
            "description",
        )
        return cls(**{name: getattr(checkpoint, name) for name in summary_fields})
|
||||
|
||||
|
||||
class CheckpointIndex(BaseModel):
    """
    Manifest of all checkpoints for a session.

    Supports fast lookup and filtering over lightweight summaries,
    so callers never need to load full checkpoint files just to browse.
    """

    session_id: str
    checkpoints: list[CheckpointSummary] = Field(default_factory=list)
    latest_checkpoint_id: str | None = None
    total_checkpoints: int = 0

    model_config = {"extra": "allow"}

    def add_checkpoint(self, checkpoint: Checkpoint) -> None:
        """Append *checkpoint*'s summary and refresh the index bookkeeping."""
        self.checkpoints.append(CheckpointSummary.from_checkpoint(checkpoint))
        # The newest entry always becomes the "latest" pointer.
        self.latest_checkpoint_id = checkpoint.checkpoint_id
        self.total_checkpoints = len(self.checkpoints)

    def get_checkpoint_summary(self, checkpoint_id: str) -> CheckpointSummary | None:
        """Return the summary with the given ID, or None if absent."""
        return next(
            (cp for cp in self.checkpoints if cp.checkpoint_id == checkpoint_id),
            None,
        )

    def filter_by_type(self, checkpoint_type: str) -> list[CheckpointSummary]:
        """Return all summaries whose type equals *checkpoint_type*."""
        return [cp for cp in self.checkpoints if cp.checkpoint_type == checkpoint_type]

    def filter_by_node(self, node_id: str) -> list[CheckpointSummary]:
        """Return all summaries taken while *node_id* was the current node."""
        return [cp for cp in self.checkpoints if cp.current_node == node_id]

    def get_clean_checkpoints(self) -> list[CheckpointSummary]:
        """Return summaries recorded before any failure/retry occurred."""
        return [cp for cp in self.checkpoints if cp.is_clean]

    def get_latest_clean_checkpoint(self) -> CheckpointSummary | None:
        """Return the most recent clean checkpoint, or None if none are clean."""
        # Walk backwards so we stop at the first (i.e. newest) clean entry.
        for cp in reversed(self.checkpoints):
            if cp.is_clean:
                return cp
        return None
|
||||
@@ -91,10 +91,11 @@ class SessionState(BaseModel):
|
||||
|
||||
Version History:
|
||||
- v1.0: Initial schema (2026-02-06)
|
||||
- v1.1: Added checkpoint support (2026-02-08)
|
||||
"""
|
||||
|
||||
# Schema version for forward/backward compatibility
|
||||
schema_version: str = "1.0"
|
||||
schema_version: str = "1.1"
|
||||
|
||||
# Identity
|
||||
session_id: str # Format: session_YYYYMMDD_HHMMSS_{uuid_8char}
|
||||
@@ -136,6 +137,10 @@ class SessionState(BaseModel):
|
||||
# Isolation level (from ExecutionContext)
|
||||
isolation_level: str = "shared"
|
||||
|
||||
# Checkpointing (for crash recovery and resume-from-failure)
|
||||
checkpoint_enabled: bool = False
|
||||
latest_checkpoint_id: str | None = None
|
||||
|
||||
model_config = {"extra": "allow"}
|
||||
|
||||
@computed_field
|
||||
@@ -154,6 +159,14 @@ class SessionState(BaseModel):
|
||||
"""Can this session be resumed?"""
|
||||
return self.status == SessionStatus.PAUSED and self.progress.resume_from is not None
|
||||
|
||||
@computed_field
|
||||
@property
|
||||
def is_resumable_from_checkpoint(self) -> bool:
|
||||
"""Can this session be resumed from a checkpoint?"""
|
||||
# ANY session with checkpoints can be resumed (not just failed ones)
|
||||
# This enables: pause/resume, iterative execution, continuation after completion
|
||||
return self.checkpoint_enabled and self.latest_checkpoint_id is not None
|
||||
|
||||
@classmethod
|
||||
def from_execution_result(
|
||||
cls,
|
||||
|
||||
@@ -0,0 +1,325 @@
|
||||
"""
|
||||
Checkpoint Store - Manages checkpoint storage with atomic writes.
|
||||
|
||||
Handles saving, loading, listing, and pruning of execution checkpoints
|
||||
for session resumability.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
from framework.schemas.checkpoint import Checkpoint, CheckpointIndex, CheckpointSummary
|
||||
from framework.utils.io import atomic_write
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CheckpointStore:
|
||||
"""
|
||||
Manages checkpoint storage with atomic writes.
|
||||
|
||||
Stores checkpoints in a session's checkpoints/ directory with
|
||||
an index for fast lookup and filtering.
|
||||
|
||||
Directory structure:
|
||||
checkpoints/
|
||||
index.json # Checkpoint manifest
|
||||
cp_{type}_{node}_{timestamp}.json # Individual checkpoints
|
||||
"""
|
||||
|
||||
def __init__(self, base_path: Path):
|
||||
"""
|
||||
Initialize checkpoint store.
|
||||
|
||||
Args:
|
||||
base_path: Session directory (e.g., ~/.hive/agents/agent_name/sessions/session_ID/)
|
||||
"""
|
||||
self.base_path = Path(base_path)
|
||||
self.checkpoints_dir = self.base_path / "checkpoints"
|
||||
self.index_path = self.checkpoints_dir / "index.json"
|
||||
self._index_lock = asyncio.Lock()
|
||||
|
||||
async def save_checkpoint(self, checkpoint: Checkpoint) -> None:
|
||||
"""
|
||||
Atomically save checkpoint and update index.
|
||||
|
||||
Uses temp file + rename for crash safety. Updates index
|
||||
after checkpoint is persisted.
|
||||
|
||||
Args:
|
||||
checkpoint: Checkpoint to save
|
||||
|
||||
Raises:
|
||||
OSError: If file write fails
|
||||
"""
|
||||
|
||||
def _write():
|
||||
# Ensure directory exists
|
||||
self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Write checkpoint file atomically
|
||||
checkpoint_path = self.checkpoints_dir / f"{checkpoint.checkpoint_id}.json"
|
||||
with atomic_write(checkpoint_path) as f:
|
||||
f.write(checkpoint.model_dump_json(indent=2))
|
||||
|
||||
logger.debug(f"Saved checkpoint {checkpoint.checkpoint_id}")
|
||||
|
||||
# Write checkpoint file (blocking I/O in thread)
|
||||
await asyncio.to_thread(_write)
|
||||
|
||||
# Update index (with lock to prevent concurrent modifications)
|
||||
async with self._index_lock:
|
||||
await self._update_index_add(checkpoint)
|
||||
|
||||
async def load_checkpoint(
|
||||
self,
|
||||
checkpoint_id: str | None = None,
|
||||
) -> Checkpoint | None:
|
||||
"""
|
||||
Load checkpoint by ID or latest.
|
||||
|
||||
Args:
|
||||
checkpoint_id: Checkpoint ID to load, or None for latest
|
||||
|
||||
Returns:
|
||||
Checkpoint object, or None if not found
|
||||
"""
|
||||
|
||||
def _read(checkpoint_id: str) -> Checkpoint | None:
|
||||
checkpoint_path = self.checkpoints_dir / f"{checkpoint_id}.json"
|
||||
|
||||
if not checkpoint_path.exists():
|
||||
logger.warning(f"Checkpoint file not found: {checkpoint_path}")
|
||||
return None
|
||||
|
||||
try:
|
||||
return Checkpoint.model_validate_json(checkpoint_path.read_text())
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load checkpoint {checkpoint_id}: {e}")
|
||||
return None
|
||||
|
||||
# Load index to get checkpoint ID if not provided
|
||||
if checkpoint_id is None:
|
||||
index = await self.load_index()
|
||||
if not index or not index.latest_checkpoint_id:
|
||||
logger.warning("No checkpoints found in index")
|
||||
return None
|
||||
checkpoint_id = index.latest_checkpoint_id
|
||||
|
||||
return await asyncio.to_thread(_read, checkpoint_id)
|
||||
|
||||
async def load_index(self) -> CheckpointIndex | None:
|
||||
"""
|
||||
Load checkpoint index.
|
||||
|
||||
Returns:
|
||||
CheckpointIndex or None if not found
|
||||
"""
|
||||
|
||||
def _read() -> CheckpointIndex | None:
|
||||
if not self.index_path.exists():
|
||||
return None
|
||||
|
||||
try:
|
||||
return CheckpointIndex.model_validate_json(self.index_path.read_text())
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load checkpoint index: {e}")
|
||||
return None
|
||||
|
||||
return await asyncio.to_thread(_read)
|
||||
|
||||
async def list_checkpoints(
|
||||
self,
|
||||
checkpoint_type: str | None = None,
|
||||
is_clean: bool | None = None,
|
||||
) -> list[CheckpointSummary]:
|
||||
"""
|
||||
List checkpoints with optional filters.
|
||||
|
||||
Args:
|
||||
checkpoint_type: Filter by type (node_start, node_complete)
|
||||
is_clean: Filter by clean status
|
||||
|
||||
Returns:
|
||||
List of CheckpointSummary objects
|
||||
"""
|
||||
index = await self.load_index()
|
||||
if not index:
|
||||
return []
|
||||
|
||||
checkpoints = index.checkpoints
|
||||
|
||||
# Apply filters
|
||||
if checkpoint_type:
|
||||
checkpoints = [cp for cp in checkpoints if cp.checkpoint_type == checkpoint_type]
|
||||
|
||||
if is_clean is not None:
|
||||
checkpoints = [cp for cp in checkpoints if cp.is_clean == is_clean]
|
||||
|
||||
return checkpoints
|
||||
|
||||
async def delete_checkpoint(self, checkpoint_id: str) -> bool:
|
||||
"""
|
||||
Delete a specific checkpoint.
|
||||
|
||||
Args:
|
||||
checkpoint_id: Checkpoint ID to delete
|
||||
|
||||
Returns:
|
||||
True if deleted, False if not found
|
||||
"""
|
||||
|
||||
def _delete(checkpoint_id: str) -> bool:
|
||||
checkpoint_path = self.checkpoints_dir / f"{checkpoint_id}.json"
|
||||
|
||||
if not checkpoint_path.exists():
|
||||
logger.warning(f"Checkpoint file not found: {checkpoint_path}")
|
||||
return False
|
||||
|
||||
try:
|
||||
checkpoint_path.unlink()
|
||||
logger.info(f"Deleted checkpoint {checkpoint_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete checkpoint {checkpoint_id}: {e}")
|
||||
return False
|
||||
|
||||
# Delete checkpoint file
|
||||
deleted = await asyncio.to_thread(_delete, checkpoint_id)
|
||||
|
||||
if deleted:
|
||||
# Update index (with lock)
|
||||
async with self._index_lock:
|
||||
await self._update_index_remove(checkpoint_id)
|
||||
|
||||
return deleted
|
||||
|
||||
async def prune_checkpoints(
|
||||
self,
|
||||
max_age_days: int = 7,
|
||||
) -> int:
|
||||
"""
|
||||
Prune checkpoints older than max_age_days.
|
||||
|
||||
Args:
|
||||
max_age_days: Maximum age in days (default 7)
|
||||
|
||||
Returns:
|
||||
Number of checkpoints deleted
|
||||
"""
|
||||
index = await self.load_index()
|
||||
if not index or not index.checkpoints:
|
||||
return 0
|
||||
|
||||
# Calculate cutoff datetime
|
||||
cutoff = datetime.now() - timedelta(days=max_age_days)
|
||||
|
||||
# Find old checkpoints
|
||||
old_checkpoints = []
|
||||
for cp in index.checkpoints:
|
||||
try:
|
||||
created = datetime.fromisoformat(cp.created_at)
|
||||
if created < cutoff:
|
||||
old_checkpoints.append(cp.checkpoint_id)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse timestamp for {cp.checkpoint_id}: {e}")
|
||||
|
||||
# Delete old checkpoints
|
||||
deleted_count = 0
|
||||
for checkpoint_id in old_checkpoints:
|
||||
if await self.delete_checkpoint(checkpoint_id):
|
||||
deleted_count += 1
|
||||
|
||||
if deleted_count > 0:
|
||||
logger.info(f"Pruned {deleted_count} checkpoints older than {max_age_days} days")
|
||||
|
||||
return deleted_count
|
||||
|
||||
async def checkpoint_exists(self, checkpoint_id: str) -> bool:
|
||||
"""
|
||||
Check if a checkpoint exists.
|
||||
|
||||
Args:
|
||||
checkpoint_id: Checkpoint ID
|
||||
|
||||
Returns:
|
||||
True if checkpoint exists
|
||||
"""
|
||||
|
||||
def _check(checkpoint_id: str) -> bool:
|
||||
checkpoint_path = self.checkpoints_dir / f"{checkpoint_id}.json"
|
||||
return checkpoint_path.exists()
|
||||
|
||||
return await asyncio.to_thread(_check, checkpoint_id)
|
||||
|
||||
async def _update_index_add(self, checkpoint: Checkpoint) -> None:
|
||||
"""
|
||||
Update index after adding a checkpoint.
|
||||
|
||||
Should be called with _index_lock held.
|
||||
|
||||
Args:
|
||||
checkpoint: Checkpoint that was added
|
||||
"""
|
||||
|
||||
def _write(index: CheckpointIndex):
|
||||
# Ensure directory exists
|
||||
self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Write index atomically
|
||||
with atomic_write(self.index_path) as f:
|
||||
f.write(index.model_dump_json(indent=2))
|
||||
|
||||
# Load or create index
|
||||
index = await self.load_index()
|
||||
if not index:
|
||||
index = CheckpointIndex(
|
||||
session_id=checkpoint.session_id,
|
||||
checkpoints=[],
|
||||
)
|
||||
|
||||
# Add checkpoint to index
|
||||
index.add_checkpoint(checkpoint)
|
||||
|
||||
# Write updated index
|
||||
await asyncio.to_thread(_write, index)
|
||||
|
||||
logger.debug(f"Updated index with checkpoint {checkpoint.checkpoint_id}")
|
||||
|
||||
async def _update_index_remove(self, checkpoint_id: str) -> None:
|
||||
"""
|
||||
Update index after removing a checkpoint.
|
||||
|
||||
Should be called with _index_lock held.
|
||||
|
||||
Args:
|
||||
checkpoint_id: Checkpoint ID that was removed
|
||||
"""
|
||||
|
||||
def _write(index: CheckpointIndex):
|
||||
with atomic_write(self.index_path) as f:
|
||||
f.write(index.model_dump_json(indent=2))
|
||||
|
||||
# Load index
|
||||
index = await self.load_index()
|
||||
if not index:
|
||||
return
|
||||
|
||||
# Remove checkpoint from index
|
||||
index.checkpoints = [cp for cp in index.checkpoints if cp.checkpoint_id != checkpoint_id]
|
||||
|
||||
# Update totals
|
||||
index.total_checkpoints = len(index.checkpoints)
|
||||
|
||||
# Update latest_checkpoint_id if we removed the latest
|
||||
if index.latest_checkpoint_id == checkpoint_id:
|
||||
index.latest_checkpoint_id = (
|
||||
index.checkpoints[-1].checkpoint_id if index.checkpoints else None
|
||||
)
|
||||
|
||||
# Write updated index
|
||||
await asyncio.to_thread(_write, index)
|
||||
|
||||
logger.debug(f"Removed checkpoint {checkpoint_id} from index")
|
||||
@@ -37,7 +37,7 @@ class SessionStore:
|
||||
Initialize session store.
|
||||
|
||||
Args:
|
||||
base_path: Base path for storage (e.g., ~/.hive/agents/twitter_outreach)
|
||||
base_path: Base path for storage (e.g., ~/.hive/agents/deep_research_agent)
|
||||
"""
|
||||
self.base_path = Path(base_path)
|
||||
self.sessions_dir = self.base_path / "sessions"
|
||||
|
||||
@@ -3,6 +3,10 @@ Pytest templates for test file generation.
|
||||
|
||||
These templates provide headers and fixtures for pytest-compatible async tests.
|
||||
Tests are written to exports/{agent}/tests/ as Python files and run with pytest.
|
||||
|
||||
Tests use AgentRunner.load() — the canonical runtime path — which creates
|
||||
AgentRuntime, ExecutionStream, and proper session/log storage. For agents
|
||||
with client-facing nodes, an auto_responder fixture handles input injection.
|
||||
"""
|
||||
|
||||
# Template for the test file header (imports and fixtures)
|
||||
@@ -11,17 +15,19 @@ PYTEST_TEST_FILE_HEADER = '''"""
|
||||
|
||||
{description}
|
||||
|
||||
REQUIRES: API_KEY (OpenAI or Anthropic) for real testing.
|
||||
REQUIRES: API_KEY for execution tests. Structure tests run without keys.
|
||||
"""
|
||||
|
||||
import os
|
||||
import pytest
|
||||
from {agent_module} import default_agent
|
||||
from pathlib import Path
|
||||
|
||||
# Agent path resolved from this test file's location
|
||||
AGENT_PATH = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
def _get_api_key():
|
||||
"""Get API key from CredentialStoreAdapter or environment."""
|
||||
# 1. Try CredentialStoreAdapter for Anthropic
|
||||
try:
|
||||
from aden_tools.credentials import CredentialStoreAdapter
|
||||
creds = CredentialStoreAdapter.default()
|
||||
@@ -29,28 +35,43 @@ def _get_api_key():
|
||||
return creds.get("anthropic")
|
||||
except (ImportError, KeyError):
|
||||
pass
|
||||
|
||||
# 2. Fallback to standard environment variables for OpenAI and others
|
||||
return (
|
||||
os.environ.get("OPENAI_API_KEY") or
|
||||
os.environ.get("ANTHROPIC_API_KEY") or
|
||||
os.environ.get("CEREBRAS_API_KEY") or
|
||||
os.environ.get("GROQ_API_KEY")
|
||||
os.environ.get("GROQ_API_KEY") or
|
||||
os.environ.get("GEMINI_API_KEY")
|
||||
)
|
||||
|
||||
|
||||
# Skip all tests if no API key and not in mock mode
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not _get_api_key() and not os.environ.get("MOCK_MODE"),
|
||||
reason="API key required. Please set OPENAI_API_KEY, ANTHROPIC_API_KEY, or use MOCK_MODE=1."
|
||||
reason="API key required. Set ANTHROPIC_API_KEY or use MOCK_MODE=1 for structure tests."
|
||||
)
|
||||
'''
|
||||
|
||||
# Template for conftest.py with shared fixtures
|
||||
PYTEST_CONFTEST_TEMPLATE = '''"""Shared test fixtures for {agent_name} tests."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add exports/ and core/ to sys.path so the agent package and framework are importable
|
||||
_repo_root = Path(__file__).resolve().parents[3]
|
||||
for _p in ["exports", "core"]:
|
||||
_path = str(_repo_root / _p)
|
||||
if _path not in sys.path:
|
||||
sys.path.insert(0, _path)
|
||||
|
||||
import pytest
|
||||
from framework.runner.runner import AgentRunner
|
||||
from framework.runtime.event_bus import EventType
|
||||
|
||||
AGENT_PATH = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
def _get_api_key():
|
||||
@@ -62,19 +83,80 @@ def _get_api_key():
|
||||
return creds.get("anthropic")
|
||||
except (ImportError, KeyError):
|
||||
pass
|
||||
|
||||
return (
|
||||
os.environ.get("OPENAI_API_KEY") or
|
||||
os.environ.get("ANTHROPIC_API_KEY") or
|
||||
os.environ.get("CEREBRAS_API_KEY") or
|
||||
os.environ.get("GROQ_API_KEY")
|
||||
os.environ.get("GROQ_API_KEY") or
|
||||
os.environ.get("GEMINI_API_KEY")
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@pytest.fixture(scope="session")
|
||||
def mock_mode():
|
||||
"""Check if running in mock mode."""
|
||||
return bool(os.environ.get("MOCK_MODE"))
|
||||
"""Return True if running in mock mode (no API key or MOCK_MODE=1)."""
|
||||
if os.environ.get("MOCK_MODE"):
|
||||
return True
|
||||
return not bool(_get_api_key())
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
async def runner(tmp_path_factory, mock_mode):
|
||||
"""Create an AgentRunner using the canonical runtime path.
|
||||
|
||||
Uses tmp_path_factory for storage so tests don't pollute ~/.hive/agents/.
|
||||
Goes through AgentRunner.load() -> _setup() -> AgentRuntime, the same
|
||||
path as ``hive run``.
|
||||
"""
|
||||
storage = tmp_path_factory.mktemp("agent_storage")
|
||||
r = AgentRunner.load(
|
||||
AGENT_PATH,
|
||||
mock_mode=mock_mode,
|
||||
storage_path=storage,
|
||||
)
|
||||
r._setup()
|
||||
yield r
|
||||
await r.cleanup_async()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def auto_responder(runner):
|
||||
"""Auto-respond to client-facing node input requests.
|
||||
|
||||
Subscribes to CLIENT_INPUT_REQUESTED events and injects a response
|
||||
to unblock the node. Customize the response before calling start():
|
||||
|
||||
auto_responder.response = "approve the report"
|
||||
await auto_responder.start()
|
||||
"""
|
||||
class AutoResponder:
|
||||
def __init__(self, runner_instance):
|
||||
self._runner = runner_instance
|
||||
self.response = "yes, proceed"
|
||||
self.interactions = []
|
||||
self._sub_id = None
|
||||
|
||||
async def start(self):
|
||||
runtime = self._runner._agent_runtime
|
||||
if runtime is None:
|
||||
return
|
||||
|
||||
async def _handle(event):
|
||||
self.interactions.append(event.node_id)
|
||||
await runtime.inject_input(event.node_id, self.response)
|
||||
|
||||
self._sub_id = runtime.subscribe_to_events(
|
||||
event_types=[EventType.CLIENT_INPUT_REQUESTED],
|
||||
handler=_handle,
|
||||
)
|
||||
|
||||
async def stop(self):
|
||||
runtime = self._runner._agent_runtime
|
||||
if self._sub_id and runtime:
|
||||
runtime.unsubscribe_from_events(self._sub_id)
|
||||
self._sub_id = None
|
||||
|
||||
return AutoResponder(runner)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
@@ -82,19 +164,51 @@ def check_api_key():
|
||||
"""Ensure API key is set for real testing."""
|
||||
if not _get_api_key():
|
||||
if os.environ.get("MOCK_MODE"):
|
||||
print("\\n⚠️ Running in MOCK MODE - structure validation only")
|
||||
print(" This does NOT test LLM behavior or agent quality")
|
||||
print(" Set OPENAI_API_KEY or ANTHROPIC_API_KEY for real testing\\n")
|
||||
print("\\n Running in MOCK MODE - structure validation only")
|
||||
print(" Set ANTHROPIC_API_KEY for real testing\\n")
|
||||
else:
|
||||
pytest.fail(
|
||||
"\\n❌ No API key found!\\n\\n"
|
||||
"Real testing requires an API key. Choose one:\\n"
|
||||
"1. Set OpenAI key:\\n"
|
||||
" export OPENAI_API_KEY='your-key-here'\\n"
|
||||
"2. Set Anthropic key:\\n"
|
||||
" export ANTHROPIC_API_KEY='your-key-here'\\n"
|
||||
"3. Run structure validation only:\\n"
|
||||
" MOCK_MODE=1 pytest exports/{agent_name}/tests/\\n\\n"
|
||||
"Note: Mock mode does NOT validate agent behavior or quality."
|
||||
"\\nNo API key found!\\n"
|
||||
"Set ANTHROPIC_API_KEY or use MOCK_MODE=1 for structure tests.\\n"
|
||||
)
|
||||
|
||||
|
||||
def parse_json_from_output(result, key):
|
||||
"""Parse JSON from agent output (framework may store full LLM response as string)."""
|
||||
val = result.output.get(key, "")
|
||||
if isinstance(val, (dict, list)):
|
||||
return val
|
||||
if isinstance(val, str):
|
||||
json_text = re.sub(r"```json\\s*|\\s*```", "", val).strip()
|
||||
try:
|
||||
return json.loads(json_text)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return val
|
||||
return val
|
||||
|
||||
|
||||
def safe_get_nested(result, key_path, default=None):
|
||||
"""Safely get nested value from result.output."""
|
||||
output = result.output or {{}}
|
||||
current = output
|
||||
for key in key_path:
|
||||
if isinstance(current, dict):
|
||||
current = current.get(key)
|
||||
elif isinstance(current, str):
|
||||
try:
|
||||
json_text = re.sub(r"```json\\s*|\\s*```", "", current).strip()
|
||||
parsed = json.loads(json_text)
|
||||
if isinstance(parsed, dict):
|
||||
current = parsed.get(key)
|
||||
else:
|
||||
return default
|
||||
except json.JSONDecodeError:
|
||||
return default
|
||||
else:
|
||||
return default
|
||||
return current if current is not None else default
|
||||
|
||||
|
||||
pytest.parse_json_from_output = parse_json_from_output
|
||||
pytest.safe_get_nested = safe_get_nested
|
||||
'''
|
||||
|
||||
+101
-4
@@ -6,7 +6,7 @@ import time
|
||||
from textual.app import App, ComposeResult
|
||||
from textual.binding import Binding
|
||||
from textual.containers import Container, Horizontal, Vertical
|
||||
from textual.widgets import Footer, Label
|
||||
from textual.widgets import Footer, Input, Label
|
||||
|
||||
from framework.runtime.agent_runtime import AgentRuntime
|
||||
from framework.runtime.event_bus import AgentEvent, EventType
|
||||
@@ -208,17 +208,24 @@ class AdenTUI(App):
|
||||
Binding("ctrl+c", "ctrl_c", "Interrupt", show=False, priority=True),
|
||||
Binding("super+c", "ctrl_c", "Copy", show=False, priority=True),
|
||||
Binding("ctrl+s", "screenshot", "Screenshot (SVG)", show=True, priority=True),
|
||||
Binding("ctrl+z", "pause_execution", "Pause", show=True, priority=True),
|
||||
Binding("ctrl+r", "show_sessions", "Sessions", show=True, priority=True),
|
||||
Binding("tab", "focus_next", "Next Panel", show=True),
|
||||
Binding("shift+tab", "focus_previous", "Previous Panel", show=False),
|
||||
]
|
||||
|
||||
def __init__(self, runtime: AgentRuntime):
|
||||
def __init__(
|
||||
self,
|
||||
runtime: AgentRuntime,
|
||||
resume_session: str | None = None,
|
||||
resume_checkpoint: str | None = None,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.runtime = runtime
|
||||
self.log_pane = LogPane()
|
||||
self.graph_view = GraphOverview(runtime)
|
||||
self.chat_repl = ChatRepl(runtime)
|
||||
self.chat_repl = ChatRepl(runtime, resume_session, resume_checkpoint)
|
||||
self.status_bar = StatusBar(graph_id=runtime.graph.id)
|
||||
self.is_ready = False
|
||||
|
||||
@@ -528,9 +535,99 @@ class AdenTUI(App):
|
||||
except Exception as e:
|
||||
self.notify(f"Screenshot failed: {e}", severity="error", timeout=5)
|
||||
|
||||
def action_pause_execution(self) -> None:
|
||||
"""Immediately pause execution by cancelling task (bound to Ctrl+Z)."""
|
||||
try:
|
||||
chat_repl = self.query_one(ChatRepl)
|
||||
if not chat_repl._current_exec_id:
|
||||
self.notify(
|
||||
"No active execution to pause",
|
||||
severity="information",
|
||||
timeout=3,
|
||||
)
|
||||
return
|
||||
|
||||
# Find and cancel the execution task - executor will catch and save state
|
||||
task_cancelled = False
|
||||
for stream in self.runtime._streams.values():
|
||||
exec_id = chat_repl._current_exec_id
|
||||
task = stream._execution_tasks.get(exec_id)
|
||||
if task and not task.done():
|
||||
task.cancel()
|
||||
task_cancelled = True
|
||||
self.notify(
|
||||
"⏸ Execution paused - state saved",
|
||||
severity="information",
|
||||
timeout=3,
|
||||
)
|
||||
break
|
||||
|
||||
if not task_cancelled:
|
||||
self.notify(
|
||||
"Execution already completed",
|
||||
severity="information",
|
||||
timeout=2,
|
||||
)
|
||||
except Exception as e:
|
||||
self.notify(
|
||||
f"Error pausing execution: {e}",
|
||||
severity="error",
|
||||
timeout=5,
|
||||
)
|
||||
|
||||
def action_show_sessions(self) -> None:
|
||||
"""Show sessions list (bound to Ctrl+R)."""
|
||||
# Send /sessions command to chat input
|
||||
try:
|
||||
chat_repl = self.query_one(ChatRepl)
|
||||
chat_input = chat_repl.query_one("#chat-input", Input)
|
||||
chat_input.value = "/sessions"
|
||||
# Trigger submission
|
||||
self.notify(
|
||||
"💡 Type /sessions in the chat to see all sessions",
|
||||
severity="information",
|
||||
timeout=3,
|
||||
)
|
||||
except Exception:
|
||||
self.notify(
|
||||
"Use /sessions command to see all sessions",
|
||||
severity="information",
|
||||
timeout=3,
|
||||
)
|
||||
|
||||
async def on_unmount(self) -> None:
|
||||
"""Cleanup on app shutdown."""
|
||||
"""Cleanup on app shutdown - cancel execution which will save state."""
|
||||
self.is_ready = False
|
||||
|
||||
# Cancel any active execution - the executor will catch CancelledError
|
||||
# and save current state as paused (no waiting needed!)
|
||||
try:
|
||||
import asyncio
|
||||
|
||||
chat_repl = self.query_one(ChatRepl)
|
||||
if chat_repl._current_exec_id:
|
||||
# Find the stream with this execution
|
||||
for stream in self.runtime._streams.values():
|
||||
exec_id = chat_repl._current_exec_id
|
||||
task = stream._execution_tasks.get(exec_id)
|
||||
if task and not task.done():
|
||||
# Cancel the task - executor will catch and save state
|
||||
task.cancel()
|
||||
try:
|
||||
# Wait for executor to save state (may take a few seconds)
|
||||
# Longer timeout for quit to ensure state is properly saved
|
||||
await asyncio.wait_for(task, timeout=5.0)
|
||||
except (TimeoutError, asyncio.CancelledError):
|
||||
# Expected - task was cancelled
|
||||
# If timeout, state may not be fully saved
|
||||
pass
|
||||
except Exception:
|
||||
# Ignore other exceptions during cleanup
|
||||
pass
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
if hasattr(self, "_subscription_id"):
|
||||
self.runtime.unsubscribe_from_events(self._subscription_id)
|
||||
|
||||
@@ -17,6 +17,7 @@ Client-facing input:
|
||||
import asyncio
|
||||
import re
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from textual.app import ComposeResult
|
||||
@@ -69,13 +70,22 @@ class ChatRepl(Vertical):
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, runtime: AgentRuntime):
|
||||
def __init__(
|
||||
self,
|
||||
runtime: AgentRuntime,
|
||||
resume_session: str | None = None,
|
||||
resume_checkpoint: str | None = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.runtime = runtime
|
||||
self._current_exec_id: str | None = None
|
||||
self._streaming_snapshot: str = ""
|
||||
self._waiting_for_input: bool = False
|
||||
self._input_node_id: str | None = None
|
||||
self._pending_ask_question: str = ""
|
||||
self._resume_session = resume_session
|
||||
self._resume_checkpoint = resume_checkpoint
|
||||
self._session_index: list[str] = [] # IDs from last listing
|
||||
|
||||
# Dedicated event loop for agent execution.
|
||||
# Keeps blocking runtime code (LLM calls, MCP tools) off
|
||||
@@ -121,10 +131,613 @@ class ChatRepl(Vertical):
|
||||
if was_at_bottom:
|
||||
history.scroll_end(animate=False)
|
||||
|
||||
async def _handle_command(self, command: str) -> None:
|
||||
"""Handle slash commands for session and checkpoint operations."""
|
||||
parts = command.split(maxsplit=2)
|
||||
cmd = parts[0].lower()
|
||||
|
||||
if cmd == "/help":
|
||||
self._write_history("""[bold cyan]Available Commands:[/bold cyan]
|
||||
[bold]/sessions[/bold] - List all sessions for this agent
|
||||
[bold]/sessions[/bold] <session_id> - Show session details and checkpoints
|
||||
[bold]/resume[/bold] - List sessions and pick one to resume
|
||||
[bold]/resume[/bold] <number> - Resume session by list number
|
||||
[bold]/resume[/bold] <session_id> - Resume session by ID
|
||||
[bold]/recover[/bold] <session_id> <cp_id> - Recover from specific checkpoint
|
||||
[bold]/pause[/bold] - Pause current execution (Ctrl+Z)
|
||||
[bold]/help[/bold] - Show this help message
|
||||
|
||||
[dim]Examples:[/dim]
|
||||
/sessions [dim]# List all sessions[/dim]
|
||||
/sessions session_20260208_143022 [dim]# Show session details[/dim]
|
||||
/resume [dim]# Show numbered session list[/dim]
|
||||
/resume 1 [dim]# Resume first listed session[/dim]
|
||||
/resume session_20260208_143022 [dim]# Resume by full session ID[/dim]
|
||||
/recover session_20260208_143022 cp_xxx [dim]# Recover from specific checkpoint[/dim]
|
||||
/pause [dim]# Pause (or Ctrl+Z)[/dim]
|
||||
""")
|
||||
elif cmd == "/sessions":
|
||||
session_id = parts[1].strip() if len(parts) > 1 else None
|
||||
await self._cmd_sessions(session_id)
|
||||
elif cmd == "/resume":
|
||||
if len(parts) < 2:
|
||||
# No arg → show session list so user can pick one
|
||||
await self._cmd_sessions(None)
|
||||
return
|
||||
|
||||
arg = parts[1].strip()
|
||||
|
||||
# Numeric index → resolve from last listing
|
||||
if arg.isdigit():
|
||||
idx = int(arg) - 1 # 1-based to 0-based
|
||||
if 0 <= idx < len(self._session_index):
|
||||
session_id = self._session_index[idx]
|
||||
else:
|
||||
self._write_history(f"[bold red]Error:[/bold red] No session at index {arg}")
|
||||
self._write_history(" Use [bold]/resume[/bold] to see available sessions")
|
||||
return
|
||||
else:
|
||||
session_id = arg
|
||||
|
||||
await self._cmd_resume(session_id)
|
||||
elif cmd == "/recover":
|
||||
# Recover from specific checkpoint
|
||||
if len(parts) < 3:
|
||||
self._write_history(
|
||||
"[bold red]Error:[/bold red] /recover requires session_id and checkpoint_id"
|
||||
)
|
||||
self._write_history(" Usage: [bold]/recover <session_id> <checkpoint_id>[/bold]")
|
||||
self._write_history(
|
||||
" Tip: Use [bold]/sessions <session_id>[/bold] to see checkpoints"
|
||||
)
|
||||
return
|
||||
session_id = parts[1].strip()
|
||||
checkpoint_id = parts[2].strip()
|
||||
await self._cmd_recover(session_id, checkpoint_id)
|
||||
elif cmd == "/pause":
|
||||
await self._cmd_pause()
|
||||
else:
|
||||
self._write_history(
|
||||
f"[bold red]Unknown command:[/bold red] {cmd}\n"
|
||||
"Type [bold]/help[/bold] for available commands"
|
||||
)
|
||||
|
||||
async def _cmd_sessions(self, session_id: str | None) -> None:
|
||||
"""List sessions or show details of a specific session."""
|
||||
try:
|
||||
# Get storage path from runtime
|
||||
storage_path = self.runtime._storage.base_path
|
||||
|
||||
if session_id:
|
||||
# Show details of specific session including checkpoints
|
||||
await self._show_session_details(storage_path, session_id)
|
||||
else:
|
||||
# List all sessions
|
||||
await self._list_sessions(storage_path)
|
||||
except Exception as e:
|
||||
self._write_history(f"[bold red]Error:[/bold red] {e}")
|
||||
self._write_history(" Could not access session data")
|
||||
|
||||
async def _find_latest_resumable_session(self) -> str | None:
|
||||
"""Find the most recent paused or failed session."""
|
||||
try:
|
||||
storage_path = self.runtime._storage.base_path
|
||||
sessions_dir = storage_path / "sessions"
|
||||
|
||||
if not sessions_dir.exists():
|
||||
return None
|
||||
|
||||
# Get all sessions, most recent first
|
||||
session_dirs = sorted(
|
||||
[d for d in sessions_dir.iterdir() if d.is_dir()],
|
||||
key=lambda d: d.name,
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
# Find first paused, failed, or cancelled session
|
||||
import json
|
||||
|
||||
for session_dir in session_dirs:
|
||||
state_file = session_dir / "state.json"
|
||||
if not state_file.exists():
|
||||
continue
|
||||
|
||||
with open(state_file) as f:
|
||||
state = json.load(f)
|
||||
|
||||
status = state.get("status", "").lower()
|
||||
|
||||
# Check if resumable (any non-completed status)
|
||||
if status in ["paused", "failed", "cancelled", "active"]:
|
||||
return session_dir.name
|
||||
|
||||
return None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _get_session_label(self, state: dict) -> str:
|
||||
"""Extract the first user message from input_data as a human-readable label."""
|
||||
input_data = state.get("input_data", {})
|
||||
for value in input_data.values():
|
||||
if isinstance(value, str) and value.strip():
|
||||
label = value.strip()
|
||||
return label[:60] + "..." if len(label) > 60 else label
|
||||
return "(no input)"
|
||||
|
||||
async def _list_sessions(self, storage_path: Path) -> None:
|
||||
"""List all sessions for the agent."""
|
||||
self._write_history("[bold cyan]Available Sessions:[/bold cyan]")
|
||||
|
||||
# Find all session directories
|
||||
sessions_dir = storage_path / "sessions"
|
||||
if not sessions_dir.exists():
|
||||
self._write_history("[dim]No sessions found.[/dim]")
|
||||
self._write_history(" Sessions will appear here after running the agent")
|
||||
return
|
||||
|
||||
session_dirs = sorted(
|
||||
[d for d in sessions_dir.iterdir() if d.is_dir()],
|
||||
key=lambda d: d.name,
|
||||
reverse=True, # Most recent first
|
||||
)
|
||||
|
||||
if not session_dirs:
|
||||
self._write_history("[dim]No sessions found.[/dim]")
|
||||
return
|
||||
|
||||
self._write_history(f"[dim]Found {len(session_dirs)} session(s)[/dim]\n")
|
||||
|
||||
# Reset the session index for numeric lookups
|
||||
self._session_index = []
|
||||
|
||||
import json
|
||||
|
||||
for session_dir in session_dirs[:10]: # Show last 10 sessions
|
||||
session_id = session_dir.name
|
||||
state_file = session_dir / "state.json"
|
||||
|
||||
if not state_file.exists():
|
||||
continue
|
||||
|
||||
# Read session state
|
||||
try:
|
||||
with open(state_file) as f:
|
||||
state = json.load(f)
|
||||
|
||||
# Track this session for /resume <number> lookup
|
||||
self._session_index.append(session_id)
|
||||
index = len(self._session_index)
|
||||
|
||||
status = state.get("status", "unknown").upper()
|
||||
label = self._get_session_label(state)
|
||||
|
||||
# Status with color
|
||||
if status == "COMPLETED":
|
||||
status_colored = f"[green]{status}[/green]"
|
||||
elif status == "FAILED":
|
||||
status_colored = f"[red]{status}[/red]"
|
||||
elif status == "PAUSED":
|
||||
status_colored = f"[yellow]{status}[/yellow]"
|
||||
elif status == "CANCELLED":
|
||||
status_colored = f"[dim yellow]{status}[/dim yellow]"
|
||||
else:
|
||||
status_colored = f"[dim]{status}[/dim]"
|
||||
|
||||
# Session line with index and label
|
||||
self._write_history(f" [bold]{index}.[/bold] {label} {status_colored}")
|
||||
self._write_history(f" [dim]{session_id}[/dim]")
|
||||
self._write_history("") # Blank line
|
||||
|
||||
except Exception as e:
|
||||
self._write_history(f" [dim red]Error reading: {e}[/dim red]")
|
||||
|
||||
if self._session_index:
|
||||
self._write_history("[dim]Use [bold]/resume <number>[/bold] to resume a session[/dim]")
|
||||
|
||||
async def _show_session_details(self, storage_path: Path, session_id: str) -> None:
|
||||
"""Show detailed information about a specific session."""
|
||||
self._write_history(f"[bold cyan]Session Details:[/bold cyan] {session_id}\n")
|
||||
|
||||
session_dir = storage_path / "sessions" / session_id
|
||||
if not session_dir.exists():
|
||||
self._write_history("[bold red]Error:[/bold red] Session not found")
|
||||
self._write_history(f" Path: {session_dir}")
|
||||
self._write_history(" Tip: Use [bold]/sessions[/bold] to see available sessions")
|
||||
return
|
||||
|
||||
state_file = session_dir / "state.json"
|
||||
if not state_file.exists():
|
||||
self._write_history("[bold red]Error:[/bold red] Session state not found")
|
||||
return
|
||||
|
||||
try:
|
||||
import json
|
||||
|
||||
with open(state_file) as f:
|
||||
state = json.load(f)
|
||||
|
||||
# Basic info
|
||||
status = state.get("status", "unknown").upper()
|
||||
if status == "COMPLETED":
|
||||
status_colored = f"[green]{status}[/green]"
|
||||
elif status == "FAILED":
|
||||
status_colored = f"[red]{status}[/red]"
|
||||
elif status == "PAUSED":
|
||||
status_colored = f"[yellow]{status}[/yellow]"
|
||||
elif status == "CANCELLED":
|
||||
status_colored = f"[dim yellow]{status}[/dim yellow]"
|
||||
else:
|
||||
status_colored = status
|
||||
|
||||
self._write_history(f"Status: {status_colored}")
|
||||
|
||||
if "started_at" in state:
|
||||
self._write_history(f"Started: {state['started_at']}")
|
||||
if "completed_at" in state:
|
||||
self._write_history(f"Completed: {state['completed_at']}")
|
||||
|
||||
# Execution path
|
||||
if "execution_path" in state and state["execution_path"]:
|
||||
self._write_history("\n[bold]Execution Path:[/bold]")
|
||||
for node_id in state["execution_path"]:
|
||||
self._write_history(f" ✓ {node_id}")
|
||||
|
||||
# Checkpoints
|
||||
checkpoint_dir = session_dir / "checkpoints"
|
||||
if checkpoint_dir.exists():
|
||||
checkpoint_files = sorted(checkpoint_dir.glob("cp_*.json"))
|
||||
if checkpoint_files:
|
||||
self._write_history(
|
||||
f"\n[bold]Available Checkpoints:[/bold] ({len(checkpoint_files)})"
|
||||
)
|
||||
|
||||
# Load and show checkpoints
|
||||
for i, cp_file in enumerate(checkpoint_files[-5:], 1): # Last 5
|
||||
try:
|
||||
with open(cp_file) as f:
|
||||
cp_data = json.load(f)
|
||||
|
||||
cp_id = cp_data.get("checkpoint_id", cp_file.stem)
|
||||
cp_type = cp_data.get("checkpoint_type", "unknown")
|
||||
current_node = cp_data.get("current_node", "unknown")
|
||||
is_clean = cp_data.get("is_clean", False)
|
||||
|
||||
clean_marker = "✓" if is_clean else "⚠"
|
||||
self._write_history(f" {i}. {clean_marker} [cyan]{cp_id}[/cyan]")
|
||||
self._write_history(f" Type: {cp_type}, Node: {current_node}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Quick actions
|
||||
if checkpoint_dir.exists() and list(checkpoint_dir.glob("cp_*.json")):
|
||||
self._write_history("\n[bold]Quick Actions:[/bold]")
|
||||
self._write_history(
|
||||
f" [dim]/resume {session_id}[/dim] - Resume from latest checkpoint"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self._write_history(f"[bold red]Error:[/bold red] {e}")
|
||||
import traceback
|
||||
|
||||
self._write_history(f"[dim]{traceback.format_exc()}[/dim]")
|
||||
|
||||
async def _cmd_resume(self, session_id: str) -> None:
|
||||
"""Resume a session from its last state (session state, not checkpoint)."""
|
||||
try:
|
||||
storage_path = self.runtime._storage.base_path
|
||||
session_dir = storage_path / "sessions" / session_id
|
||||
|
||||
# Verify session exists
|
||||
if not session_dir.exists():
|
||||
self._write_history(f"[bold red]Error:[/bold red] Session not found: {session_id}")
|
||||
self._write_history(" Use [bold]/sessions[/bold] to see available sessions")
|
||||
return
|
||||
|
||||
# Load session state
|
||||
state_file = session_dir / "state.json"
|
||||
if not state_file.exists():
|
||||
self._write_history("[bold red]Error:[/bold red] Session state not found")
|
||||
return
|
||||
|
||||
import json
|
||||
|
||||
with open(state_file) as f:
|
||||
state = json.load(f)
|
||||
|
||||
# Resume from session state (not checkpoint)
|
||||
progress = state.get("progress", {})
|
||||
paused_at = progress.get("paused_at") or progress.get("resume_from")
|
||||
|
||||
if paused_at:
|
||||
# Has paused_at - resume from there
|
||||
resume_session_state = {
|
||||
"paused_at": paused_at,
|
||||
"memory": state.get("memory", {}),
|
||||
"execution_path": progress.get("path", []),
|
||||
"node_visit_counts": progress.get("node_visit_counts", {}),
|
||||
}
|
||||
resume_info = f"From node: [cyan]{paused_at}[/cyan]"
|
||||
else:
|
||||
# No paused_at - just retry with same input
|
||||
resume_session_state = {}
|
||||
resume_info = "Retrying with same input"
|
||||
|
||||
# Display resume info
|
||||
self._write_history(f"[bold cyan]🔄 Resuming session[/bold cyan] {session_id}")
|
||||
self._write_history(f" {resume_info}")
|
||||
if paused_at:
|
||||
self._write_history(" [dim](Using session state, not checkpoint)[/dim]")
|
||||
|
||||
# Check if already executing
|
||||
if self._current_exec_id is not None:
|
||||
self._write_history(
|
||||
"[bold yellow]Warning:[/bold yellow] An execution is already running"
|
||||
)
|
||||
self._write_history(" Wait for it to complete or use /pause first")
|
||||
return
|
||||
|
||||
# Get original input data from session state
|
||||
input_data = state.get("input_data", {})
|
||||
|
||||
# Show indicator
|
||||
indicator = self.query_one("#processing-indicator", Label)
|
||||
indicator.update("Resuming from session state...")
|
||||
indicator.display = True
|
||||
|
||||
# Update placeholder
|
||||
chat_input = self.query_one("#chat-input", Input)
|
||||
chat_input.placeholder = "Commands: /pause, /sessions (agent resuming...)"
|
||||
|
||||
# Trigger execution with resume state
|
||||
try:
|
||||
entry_points = self.runtime.get_entry_points()
|
||||
if not entry_points:
|
||||
self._write_history("[bold red]Error:[/bold red] No entry points available")
|
||||
return
|
||||
|
||||
# Submit execution with resume state and original input data
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self.runtime.trigger(
|
||||
entry_points[0].id,
|
||||
input_data=input_data,
|
||||
session_state=resume_session_state,
|
||||
),
|
||||
self._agent_loop,
|
||||
)
|
||||
exec_id = await asyncio.wrap_future(future)
|
||||
self._current_exec_id = exec_id
|
||||
|
||||
self._write_history(
|
||||
f"[green]✓[/green] Resume started (execution: {exec_id[:12]}...)"
|
||||
)
|
||||
self._write_history(" Agent is continuing from where it stopped...")
|
||||
|
||||
except Exception as e:
|
||||
self._write_history(f"[bold red]Error starting resume:[/bold red] {e}")
|
||||
indicator.display = False
|
||||
chat_input.placeholder = "Enter input for agent..."
|
||||
|
||||
except Exception as e:
|
||||
self._write_history(f"[bold red]Error:[/bold red] {e}")
|
||||
import traceback
|
||||
|
||||
self._write_history(f"[dim]{traceback.format_exc()}[/dim]")
|
||||
|
||||
async def _cmd_recover(self, session_id: str, checkpoint_id: str) -> None:
|
||||
"""Recover a session from a specific checkpoint (time-travel debugging)."""
|
||||
try:
|
||||
storage_path = self.runtime._storage.base_path
|
||||
session_dir = storage_path / "sessions" / session_id
|
||||
|
||||
# Verify session exists
|
||||
if not session_dir.exists():
|
||||
self._write_history(f"[bold red]Error:[/bold red] Session not found: {session_id}")
|
||||
self._write_history(" Use [bold]/sessions[/bold] to see available sessions")
|
||||
return
|
||||
|
||||
# Verify checkpoint exists
|
||||
checkpoint_file = session_dir / "checkpoints" / f"{checkpoint_id}.json"
|
||||
if not checkpoint_file.exists():
|
||||
self._write_history(
|
||||
f"[bold red]Error:[/bold red] Checkpoint not found: {checkpoint_id}"
|
||||
)
|
||||
self._write_history(
|
||||
f" Use [bold]/sessions {session_id}[/bold] to see available checkpoints"
|
||||
)
|
||||
return
|
||||
|
||||
# Display recover info
|
||||
self._write_history(f"[bold cyan]⏪ Recovering session[/bold cyan] {session_id}")
|
||||
self._write_history(f" From checkpoint: [cyan]{checkpoint_id}[/cyan]")
|
||||
self._write_history(
|
||||
" [dim](Checkpoint-based recovery for time-travel debugging)[/dim]"
|
||||
)
|
||||
|
||||
# Check if already executing
|
||||
if self._current_exec_id is not None:
|
||||
self._write_history(
|
||||
"[bold yellow]Warning:[/bold yellow] An execution is already running"
|
||||
)
|
||||
self._write_history(" Wait for it to complete or use /pause first")
|
||||
return
|
||||
|
||||
# Create session_state for checkpoint recovery
|
||||
recover_session_state = {
|
||||
"resume_from_checkpoint": checkpoint_id,
|
||||
}
|
||||
|
||||
# Show indicator
|
||||
indicator = self.query_one("#processing-indicator", Label)
|
||||
indicator.update("Recovering from checkpoint...")
|
||||
indicator.display = True
|
||||
|
||||
# Update placeholder
|
||||
chat_input = self.query_one("#chat-input", Input)
|
||||
chat_input.placeholder = "Commands: /pause, /sessions (agent recovering...)"
|
||||
|
||||
# Trigger execution with checkpoint recovery
|
||||
try:
|
||||
entry_points = self.runtime.get_entry_points()
|
||||
if not entry_points:
|
||||
self._write_history("[bold red]Error:[/bold red] No entry points available")
|
||||
return
|
||||
|
||||
# Submit execution with checkpoint recovery state
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self.runtime.trigger(
|
||||
entry_points[0].id,
|
||||
input_data={},
|
||||
session_state=recover_session_state,
|
||||
),
|
||||
self._agent_loop,
|
||||
)
|
||||
exec_id = await asyncio.wrap_future(future)
|
||||
self._current_exec_id = exec_id
|
||||
|
||||
self._write_history(
|
||||
f"[green]✓[/green] Recovery started (execution: {exec_id[:12]}...)"
|
||||
)
|
||||
self._write_history(" Agent is continuing from checkpoint...")
|
||||
|
||||
except Exception as e:
|
||||
self._write_history(f"[bold red]Error starting recovery:[/bold red] {e}")
|
||||
indicator.display = False
|
||||
chat_input.placeholder = "Enter input for agent..."
|
||||
|
||||
except Exception as e:
|
||||
self._write_history(f"[bold red]Error:[/bold red] {e}")
|
||||
import traceback
|
||||
|
||||
self._write_history(f"[dim]{traceback.format_exc()}[/dim]")
|
||||
|
||||
async def _cmd_pause(self) -> None:
|
||||
"""Immediately pause execution by cancelling task (same as Ctrl+Z)."""
|
||||
# Check if there's a current execution
|
||||
if not self._current_exec_id:
|
||||
self._write_history("[bold yellow]No active execution to pause[/bold yellow]")
|
||||
self._write_history(" Start an execution first, then use /pause during execution")
|
||||
return
|
||||
|
||||
# Find and cancel the execution task - executor will catch and save state
|
||||
task_cancelled = False
|
||||
for stream in self.runtime._streams.values():
|
||||
exec_id = self._current_exec_id
|
||||
task = stream._execution_tasks.get(exec_id)
|
||||
if task and not task.done():
|
||||
task.cancel()
|
||||
task_cancelled = True
|
||||
self._write_history("[bold green]⏸ Execution paused - state saved[/bold green]")
|
||||
self._write_history(" Resume later with: [bold]/resume[/bold]")
|
||||
break
|
||||
|
||||
if not task_cancelled:
|
||||
self._write_history("[bold yellow]Execution already completed[/bold yellow]")
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Add welcome message when widget mounts."""
|
||||
"""Add welcome message and check for resumable sessions."""
|
||||
history = self.query_one("#chat-history", RichLog)
|
||||
history.write("[bold cyan]Chat REPL Ready[/bold cyan] — Type your input below\n")
|
||||
history.write(
|
||||
"[bold cyan]Chat REPL Ready[/bold cyan] — "
|
||||
"Type your input or use [bold]/help[/bold] for commands\n"
|
||||
)
|
||||
|
||||
# Auto-trigger resume/recover if CLI args provided
|
||||
if self._resume_session:
|
||||
if self._resume_checkpoint:
|
||||
# Use /recover for checkpoint-based recovery
|
||||
history.write(
|
||||
"\n[bold cyan]🔄 Auto-recovering from checkpoint "
|
||||
"(--resume-session + --checkpoint)[/bold cyan]"
|
||||
)
|
||||
self.call_later(self._cmd_recover, self._resume_session, self._resume_checkpoint)
|
||||
else:
|
||||
# Use /resume for session state resume
|
||||
history.write(
|
||||
"\n[bold cyan]🔄 Auto-resuming session (--resume-session)[/bold cyan]"
|
||||
)
|
||||
self.call_later(self._cmd_resume, self._resume_session)
|
||||
return # Skip normal startup messages
|
||||
|
||||
# Check for resumable sessions
|
||||
self._check_and_show_resumable_sessions()
|
||||
|
||||
# Show agent intro message if available
|
||||
if self.runtime.intro_message:
|
||||
history.write(f"[bold blue]Agent:[/bold blue] {self.runtime.intro_message}\n")
|
||||
else:
|
||||
history.write(
|
||||
"[dim]Quick start: /sessions to see previous sessions, "
|
||||
"/pause to pause execution[/dim]\n"
|
||||
)
|
||||
|
||||
def _check_and_show_resumable_sessions(self) -> None:
|
||||
"""Check for non-terminated sessions and prompt user."""
|
||||
try:
|
||||
storage_path = self.runtime._storage.base_path
|
||||
sessions_dir = storage_path / "sessions"
|
||||
|
||||
if not sessions_dir.exists():
|
||||
return
|
||||
|
||||
# Find non-terminated sessions (paused, failed, cancelled, active)
|
||||
resumable = []
|
||||
session_dirs = sorted(
|
||||
[d for d in sessions_dir.iterdir() if d.is_dir()],
|
||||
key=lambda d: d.name,
|
||||
reverse=True, # Most recent first
|
||||
)
|
||||
|
||||
import json
|
||||
|
||||
for session_dir in session_dirs[:5]: # Check last 5 sessions
|
||||
state_file = session_dir / "state.json"
|
||||
if not state_file.exists():
|
||||
continue
|
||||
|
||||
try:
|
||||
with open(state_file) as f:
|
||||
state = json.load(f)
|
||||
|
||||
status = state.get("status", "").lower()
|
||||
# Non-terminated statuses
|
||||
if status in ["paused", "failed", "cancelled", "active"]:
|
||||
resumable.append(
|
||||
{
|
||||
"session_id": session_dir.name,
|
||||
"status": status.upper(),
|
||||
"label": self._get_session_label(state),
|
||||
}
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if resumable:
|
||||
# Populate session index so /resume <number> works immediately
|
||||
self._session_index = [s["session_id"] for s in resumable[:3]]
|
||||
|
||||
self._write_history("\n[bold yellow]Non-terminated sessions found:[/bold yellow]")
|
||||
for i, session in enumerate(resumable[:3], 1): # Show top 3
|
||||
status = session["status"]
|
||||
label = session["label"]
|
||||
|
||||
# Color code status
|
||||
if status == "PAUSED":
|
||||
status_colored = f"[yellow]{status}[/yellow]"
|
||||
elif status == "FAILED":
|
||||
status_colored = f"[red]{status}[/red]"
|
||||
elif status == "CANCELLED":
|
||||
status_colored = f"[dim yellow]{status}[/dim yellow]"
|
||||
else:
|
||||
status_colored = f"[dim]{status}[/dim]"
|
||||
|
||||
self._write_history(f" [bold]{i}.[/bold] {label} {status_colored}")
|
||||
|
||||
self._write_history("\n Type [bold]/resume <number>[/bold] to continue a session")
|
||||
self._write_history(" Or just type your input to start a new session\n")
|
||||
|
||||
except Exception:
|
||||
# Silently fail - don't block TUI startup
|
||||
pass
|
||||
|
||||
async def on_input_submitted(self, message: Input.Submitted) -> None:
|
||||
"""Handle input submission — either start new execution or inject input."""
|
||||
@@ -132,15 +745,21 @@ class ChatRepl(Vertical):
|
||||
if not user_input:
|
||||
return
|
||||
|
||||
# Handle commands (starting with /) - ALWAYS process commands first
|
||||
# Commands work during execution, during client-facing input, anytime
|
||||
if user_input.startswith("/"):
|
||||
await self._handle_command(user_input)
|
||||
message.input.value = ""
|
||||
return
|
||||
|
||||
# Client-facing input: route to the waiting node
|
||||
if self._waiting_for_input and self._input_node_id:
|
||||
self._write_history(f"[bold green]You:[/bold green] {user_input}")
|
||||
message.input.value = ""
|
||||
|
||||
# Disable input while agent processes the response
|
||||
# Keep input enabled for commands (but change placeholder)
|
||||
chat_input = self.query_one("#chat-input", Input)
|
||||
chat_input.disabled = True
|
||||
chat_input.placeholder = "Enter input for agent..."
|
||||
chat_input.placeholder = "Commands: /pause, /sessions (agent processing...)"
|
||||
self._waiting_for_input = False
|
||||
|
||||
indicator = self.query_one("#processing-indicator", Label)
|
||||
@@ -193,9 +812,9 @@ class ChatRepl(Vertical):
|
||||
indicator.update("Thinking...")
|
||||
indicator.display = True
|
||||
|
||||
# Disable input while the agent is working
|
||||
# Keep input enabled for commands during execution
|
||||
chat_input = self.query_one("#chat-input", Input)
|
||||
chat_input.disabled = True
|
||||
chat_input.placeholder = "Commands available: /pause, /sessions, /help"
|
||||
|
||||
# Submit execution to the dedicated agent loop so blocking
|
||||
# runtime code (LLM, MCP tools) never touches Textual's loop.
|
||||
@@ -236,8 +855,16 @@ class ChatRepl(Vertical):
|
||||
|
||||
def handle_tool_started(self, tool_name: str, tool_input: dict[str, Any]) -> None:
|
||||
"""Handle a tool call starting."""
|
||||
# Update indicator to show tool activity
|
||||
indicator = self.query_one("#processing-indicator", Label)
|
||||
|
||||
if tool_name == "ask_user":
|
||||
# Stash the question for handle_input_requested() to display.
|
||||
# Suppress the generic "Tool: ask_user" line.
|
||||
self._pending_ask_question = tool_input.get("question", "")
|
||||
indicator.update("Preparing question...")
|
||||
return
|
||||
|
||||
# Update indicator to show tool activity
|
||||
indicator.update(f"Using tool: {tool_name}...")
|
||||
|
||||
# Write a discrete status line to history
|
||||
@@ -245,6 +872,11 @@ class ChatRepl(Vertical):
|
||||
|
||||
def handle_tool_completed(self, tool_name: str, result: str, is_error: bool) -> None:
|
||||
"""Handle a tool call completing."""
|
||||
if tool_name == "ask_user":
|
||||
# Suppress the synthetic "Waiting for user input..." result.
|
||||
# The actual question is displayed by handle_input_requested().
|
||||
return
|
||||
|
||||
result_str = str(result)
|
||||
preview = result_str[:200] + "..." if len(result_str) > 200 else result_str
|
||||
preview = preview.replace("\n", " ")
|
||||
@@ -275,6 +907,7 @@ class ChatRepl(Vertical):
|
||||
self._streaming_snapshot = ""
|
||||
self._waiting_for_input = False
|
||||
self._input_node_id = None
|
||||
self._pending_ask_question = ""
|
||||
|
||||
# Re-enable input
|
||||
chat_input = self.query_one("#chat-input", Input)
|
||||
@@ -293,6 +926,7 @@ class ChatRepl(Vertical):
|
||||
self._current_exec_id = None
|
||||
self._streaming_snapshot = ""
|
||||
self._waiting_for_input = False
|
||||
self._pending_ask_question = ""
|
||||
self._input_node_id = None
|
||||
|
||||
# Re-enable input
|
||||
@@ -309,10 +943,18 @@ class ChatRepl(Vertical):
|
||||
and sets a flag so the next submission routes to inject_input().
|
||||
"""
|
||||
# Flush accumulated streaming text as agent output
|
||||
if self._streaming_snapshot:
|
||||
self._write_history(f"[bold blue]Agent:[/bold blue] {self._streaming_snapshot}")
|
||||
flushed_snapshot = self._streaming_snapshot
|
||||
if flushed_snapshot:
|
||||
self._write_history(f"[bold blue]Agent:[/bold blue] {flushed_snapshot}")
|
||||
self._streaming_snapshot = ""
|
||||
|
||||
# Display the ask_user question if stashed and not already
|
||||
# present in the streaming snapshot (avoids double-display).
|
||||
question = self._pending_ask_question
|
||||
self._pending_ask_question = ""
|
||||
if question and question not in flushed_snapshot:
|
||||
self._write_history(f"[bold blue]Agent:[/bold blue] {question}")
|
||||
|
||||
self._waiting_for_input = True
|
||||
self._input_node_id = node_id or None
|
||||
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "framework"
|
||||
version = "0.1.0"
|
||||
version = "0.4.2"
|
||||
description = "Goal-driven agent runtime with Builder-friendly observability"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
|
||||
@@ -1,342 +0,0 @@
|
||||
"""Tests for the BuilderQuery interface - how Builder analyzes agent runs.
|
||||
|
||||
DEPRECATED: These tests rely on the deprecated FileStorage backend.
|
||||
BuilderQuery and Runtime both use FileStorage which is deprecated.
|
||||
New code should use unified session storage instead.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from framework import BuilderQuery, Runtime
|
||||
from framework.schemas.run import RunStatus
|
||||
|
||||
# Mark all tests in this module as skipped - they rely on deprecated FileStorage
|
||||
pytestmark = pytest.mark.skip(reason="Tests rely on deprecated FileStorage backend")
|
||||
|
||||
|
||||
def create_successful_run(runtime: Runtime, goal_id: str = "test_goal") -> str:
|
||||
"""Helper to create a successful run with decisions."""
|
||||
run_id = runtime.start_run(goal_id, f"Test goal: {goal_id}")
|
||||
|
||||
runtime.set_node("search-node")
|
||||
d1 = runtime.decide(
|
||||
intent="Search for data",
|
||||
options=[
|
||||
{"id": "web", "description": "Web search", "pros": ["Fresh data"]},
|
||||
{"id": "cache", "description": "Use cache", "pros": ["Fast"]},
|
||||
],
|
||||
chosen="web",
|
||||
reasoning="Need fresh data",
|
||||
)
|
||||
runtime.record_outcome(d1, success=True, result={"items": 3}, tokens_used=50)
|
||||
|
||||
runtime.set_node("process-node")
|
||||
d2 = runtime.decide(
|
||||
intent="Process results",
|
||||
options=[{"id": "filter", "description": "Filter and transform"}],
|
||||
chosen="filter",
|
||||
reasoning="Standard processing",
|
||||
)
|
||||
runtime.record_outcome(d2, success=True, result={"processed": 3}, tokens_used=30)
|
||||
|
||||
runtime.end_run(success=True, narrative="Successfully processed data")
|
||||
return run_id
|
||||
|
||||
|
||||
def create_failed_run(runtime: Runtime, goal_id: str = "test_goal") -> str:
|
||||
"""Helper to create a failed run."""
|
||||
run_id = runtime.start_run(goal_id, f"Test goal: {goal_id}")
|
||||
|
||||
runtime.set_node("search-node")
|
||||
d1 = runtime.decide(
|
||||
intent="Search for data",
|
||||
options=[{"id": "web", "description": "Web search"}],
|
||||
chosen="web",
|
||||
reasoning="Need data",
|
||||
)
|
||||
runtime.record_outcome(d1, success=True, result={"items": 0})
|
||||
|
||||
runtime.set_node("process-node")
|
||||
d2 = runtime.decide(
|
||||
intent="Process results",
|
||||
options=[{"id": "process", "description": "Process data"}],
|
||||
chosen="process",
|
||||
reasoning="Continue pipeline",
|
||||
)
|
||||
runtime.record_outcome(d2, success=False, error="No data to process")
|
||||
|
||||
runtime.report_problem(
|
||||
severity="critical",
|
||||
description="Processing failed due to empty input",
|
||||
decision_id=d2,
|
||||
suggested_fix="Add empty input handling",
|
||||
)
|
||||
|
||||
runtime.end_run(success=False, narrative="Failed to process - no data")
|
||||
return run_id
|
||||
|
||||
|
||||
class TestBuilderQueryBasics:
|
||||
"""Test basic query operations."""
|
||||
|
||||
def test_get_run_summary(self, tmp_path: Path):
|
||||
"""Test getting a run summary."""
|
||||
runtime = Runtime(tmp_path)
|
||||
run_id = create_successful_run(runtime)
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
summary = query.get_run_summary(run_id)
|
||||
|
||||
assert summary is not None
|
||||
assert summary.run_id == run_id
|
||||
assert summary.status == RunStatus.COMPLETED
|
||||
assert summary.decision_count == 2
|
||||
assert summary.success_rate == 1.0
|
||||
|
||||
def test_get_full_run(self, tmp_path: Path):
|
||||
"""Test getting the full run details."""
|
||||
runtime = Runtime(tmp_path)
|
||||
run_id = create_successful_run(runtime)
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
run = query.get_full_run(run_id)
|
||||
|
||||
assert run is not None
|
||||
assert run.id == run_id
|
||||
assert len(run.decisions) == 2
|
||||
assert run.decisions[0].node_id == "search-node"
|
||||
assert run.decisions[1].node_id == "process-node"
|
||||
|
||||
def test_list_runs_for_goal(self, tmp_path: Path):
|
||||
"""Test listing all runs for a goal."""
|
||||
runtime = Runtime(tmp_path)
|
||||
create_successful_run(runtime, "goal_a")
|
||||
create_successful_run(runtime, "goal_a")
|
||||
create_successful_run(runtime, "goal_b")
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
summaries = query.list_runs_for_goal("goal_a")
|
||||
|
||||
assert len(summaries) == 2
|
||||
for s in summaries:
|
||||
assert s.goal_id == "goal_a"
|
||||
|
||||
def test_get_recent_failures(self, tmp_path: Path):
|
||||
"""Test getting recent failed runs."""
|
||||
runtime = Runtime(tmp_path)
|
||||
create_successful_run(runtime)
|
||||
create_failed_run(runtime)
|
||||
create_failed_run(runtime)
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
failures = query.get_recent_failures()
|
||||
|
||||
assert len(failures) == 2
|
||||
for f in failures:
|
||||
assert f.status == RunStatus.FAILED
|
||||
|
||||
|
||||
class TestFailureAnalysis:
|
||||
"""Test failure analysis capabilities."""
|
||||
|
||||
def test_analyze_failure(self, tmp_path: Path):
|
||||
"""Test analyzing why a run failed."""
|
||||
runtime = Runtime(tmp_path)
|
||||
run_id = create_failed_run(runtime)
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
analysis = query.analyze_failure(run_id)
|
||||
|
||||
assert analysis is not None
|
||||
assert analysis.run_id == run_id
|
||||
assert "No data to process" in analysis.root_cause
|
||||
assert len(analysis.decision_chain) >= 2
|
||||
assert len(analysis.problems) == 1
|
||||
assert "critical" in analysis.problems[0].lower()
|
||||
|
||||
def test_analyze_failure_returns_none_for_success(self, tmp_path: Path):
|
||||
"""analyze_failure returns None for successful runs."""
|
||||
runtime = Runtime(tmp_path)
|
||||
run_id = create_successful_run(runtime)
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
analysis = query.analyze_failure(run_id)
|
||||
|
||||
assert analysis is None
|
||||
|
||||
def test_failure_analysis_has_suggestions(self, tmp_path: Path):
|
||||
"""Failure analysis should include suggestions."""
|
||||
runtime = Runtime(tmp_path)
|
||||
run_id = create_failed_run(runtime)
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
analysis = query.analyze_failure(run_id)
|
||||
|
||||
assert len(analysis.suggestions) > 0
|
||||
# Should include the suggested fix from the problem
|
||||
assert any("empty input" in s.lower() for s in analysis.suggestions)
|
||||
|
||||
def test_get_decision_trace(self, tmp_path: Path):
|
||||
"""Test getting a readable decision trace."""
|
||||
runtime = Runtime(tmp_path)
|
||||
run_id = create_successful_run(runtime)
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
trace = query.get_decision_trace(run_id)
|
||||
|
||||
assert len(trace) == 2
|
||||
assert "search-node" in trace[0]
|
||||
assert "process-node" in trace[1]
|
||||
|
||||
|
||||
class TestPatternAnalysis:
|
||||
"""Test pattern detection across runs."""
|
||||
|
||||
def test_find_patterns_basic(self, tmp_path: Path):
|
||||
"""Test basic pattern finding."""
|
||||
runtime = Runtime(tmp_path)
|
||||
create_successful_run(runtime, "goal_x")
|
||||
create_successful_run(runtime, "goal_x")
|
||||
create_failed_run(runtime, "goal_x")
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
patterns = query.find_patterns("goal_x")
|
||||
|
||||
assert patterns is not None
|
||||
assert patterns.goal_id == "goal_x"
|
||||
assert patterns.run_count == 3
|
||||
assert 0 < patterns.success_rate < 1 # 2/3 success
|
||||
|
||||
def test_find_patterns_common_failures(self, tmp_path: Path):
|
||||
"""Test finding common failures."""
|
||||
runtime = Runtime(tmp_path)
|
||||
# Create multiple runs with the same failure
|
||||
for _ in range(3):
|
||||
create_failed_run(runtime, "failing_goal")
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
patterns = query.find_patterns("failing_goal")
|
||||
|
||||
assert len(patterns.common_failures) > 0
|
||||
# "No data to process" should be a common failure
|
||||
failure_messages = [f[0] for f in patterns.common_failures]
|
||||
assert any("No data to process" in msg for msg in failure_messages)
|
||||
|
||||
def test_find_patterns_problematic_nodes(self, tmp_path: Path):
|
||||
"""Test finding problematic nodes."""
|
||||
runtime = Runtime(tmp_path)
|
||||
# Create runs where process-node always fails
|
||||
for _ in range(3):
|
||||
create_failed_run(runtime, "node_test")
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
patterns = query.find_patterns("node_test")
|
||||
|
||||
# process-node should be flagged as problematic
|
||||
problematic_node_ids = [n[0] for n in patterns.problematic_nodes]
|
||||
assert "process-node" in problematic_node_ids
|
||||
|
||||
def test_compare_runs(self, tmp_path: Path):
|
||||
"""Test comparing two runs."""
|
||||
runtime = Runtime(tmp_path)
|
||||
run1 = create_successful_run(runtime)
|
||||
run2 = create_failed_run(runtime)
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
comparison = query.compare_runs(run1, run2)
|
||||
|
||||
assert comparison["run_1"]["status"] == "completed"
|
||||
assert comparison["run_2"]["status"] == "failed"
|
||||
assert len(comparison["differences"]) > 0
|
||||
|
||||
|
||||
class TestImprovementSuggestions:
|
||||
"""Test improvement suggestion generation."""
|
||||
|
||||
def test_suggest_improvements(self, tmp_path: Path):
|
||||
"""Test generating improvement suggestions."""
|
||||
runtime = Runtime(tmp_path)
|
||||
# Create runs with failures to trigger suggestions
|
||||
for _ in range(3):
|
||||
create_failed_run(runtime, "improve_goal")
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
suggestions = query.suggest_improvements("improve_goal")
|
||||
|
||||
assert len(suggestions) > 0
|
||||
# Should suggest improving the problematic node
|
||||
node_suggestions = [s for s in suggestions if s["type"] == "node_improvement"]
|
||||
assert len(node_suggestions) > 0
|
||||
|
||||
def test_suggest_improvements_for_low_success_rate(self, tmp_path: Path):
|
||||
"""Should suggest architecture review for low success rate."""
|
||||
runtime = Runtime(tmp_path)
|
||||
# 4 failures, 1 success = 20% success rate
|
||||
for _ in range(4):
|
||||
create_failed_run(runtime, "low_success")
|
||||
create_successful_run(runtime, "low_success")
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
suggestions = query.suggest_improvements("low_success")
|
||||
|
||||
arch_suggestions = [s for s in suggestions if s["type"] == "architecture"]
|
||||
assert len(arch_suggestions) > 0
|
||||
assert arch_suggestions[0]["priority"] == "high"
|
||||
|
||||
def test_get_node_performance(self, tmp_path: Path):
|
||||
"""Test getting performance metrics for a node."""
|
||||
runtime = Runtime(tmp_path)
|
||||
create_successful_run(runtime)
|
||||
create_successful_run(runtime)
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
perf = query.get_node_performance("search-node")
|
||||
|
||||
assert perf["node_id"] == "search-node"
|
||||
assert perf["total_decisions"] == 2
|
||||
assert perf["success_rate"] == 1.0
|
||||
assert perf["total_tokens"] == 100 # 50 tokens per run
|
||||
|
||||
|
||||
class TestBuilderWorkflow:
|
||||
"""Test complete Builder workflows."""
|
||||
|
||||
def test_builder_investigation_workflow(self, tmp_path: Path):
|
||||
"""Test a complete investigation workflow as Builder would use it."""
|
||||
runtime = Runtime(tmp_path)
|
||||
|
||||
# Set up scenario: some successes, some failures
|
||||
for _ in range(2):
|
||||
create_successful_run(runtime, "customer_goal")
|
||||
for _ in range(2):
|
||||
create_failed_run(runtime, "customer_goal")
|
||||
|
||||
query = BuilderQuery(tmp_path)
|
||||
|
||||
# Step 1: Get overview of the goal
|
||||
summaries = query.list_runs_for_goal("customer_goal")
|
||||
assert len(summaries) == 4
|
||||
|
||||
# Step 2: Find patterns
|
||||
patterns = query.find_patterns("customer_goal")
|
||||
assert patterns.success_rate == 0.5 # 2/4
|
||||
|
||||
# Step 3: Get recent failures
|
||||
failures = query.get_recent_failures()
|
||||
assert len(failures) == 2
|
||||
|
||||
# Step 4: Analyze a specific failure
|
||||
failure_id = failures[0].run_id
|
||||
analysis = query.analyze_failure(failure_id)
|
||||
assert analysis is not None
|
||||
assert len(analysis.suggestions) > 0
|
||||
|
||||
# Step 5: Generate improvement suggestions
|
||||
suggestions = query.suggest_improvements("customer_goal")
|
||||
assert len(suggestions) > 0
|
||||
|
||||
# Step 6: Check node performance
|
||||
perf = query.get_node_performance("process-node")
|
||||
assert perf["success_rate"] < 1.0 # process-node fails in failed runs
|
||||
@@ -1,185 +0,0 @@
|
||||
"""Tests for ConcurrentStorage race condition and cache invalidation fixes."""
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.schemas.run import Run, RunMetrics, RunStatus
|
||||
from framework.storage.concurrent import ConcurrentStorage
|
||||
|
||||
|
||||
def create_test_run(
|
||||
run_id: str, goal_id: str = "test-goal", status: RunStatus = RunStatus.RUNNING
|
||||
) -> Run:
|
||||
"""Create a minimal test Run object."""
|
||||
return Run(
|
||||
id=run_id,
|
||||
goal_id=goal_id,
|
||||
status=status,
|
||||
narrative="Test run",
|
||||
metrics=RunMetrics(
|
||||
nodes_executed=[],
|
||||
),
|
||||
decisions=[],
|
||||
problems=[],
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="FileStorage.save_run() is deprecated and now a no-op. "
|
||||
"ConcurrentStorage wraps FileStorage, so these tests no longer work. "
|
||||
"New sessions use unified storage at sessions/{session_id}/state.json"
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_cache_invalidation_on_save(tmp_path: Path):
|
||||
"""Test that summary cache is invalidated when a run is saved.
|
||||
|
||||
This tests the fix for the cache invalidation bug where load_summary()
|
||||
would return stale data after a run was updated.
|
||||
"""
|
||||
storage = ConcurrentStorage(tmp_path)
|
||||
await storage.start()
|
||||
|
||||
try:
|
||||
run_id = "test-run-1"
|
||||
|
||||
# Create and save initial run
|
||||
run = create_test_run(run_id, status=RunStatus.RUNNING)
|
||||
await storage.save_run(run, immediate=True)
|
||||
|
||||
# Load summary to populate the cache
|
||||
summary = await storage.load_summary(run_id)
|
||||
assert summary is not None
|
||||
assert summary.status == RunStatus.RUNNING
|
||||
|
||||
# Update run with new status
|
||||
run.status = RunStatus.COMPLETED
|
||||
await storage.save_run(run, immediate=True)
|
||||
|
||||
# Load summary again - should get fresh data, not cached stale data
|
||||
summary = await storage.load_summary(run_id)
|
||||
assert summary is not None
|
||||
assert summary.status == RunStatus.COMPLETED, (
|
||||
"Summary cache should be invalidated on save - got stale data"
|
||||
)
|
||||
finally:
|
||||
await storage.stop()
|
||||
|
||||
|
||||
@pytest.mark.skip(
    reason="FileStorage.save_run() is deprecated and now a no-op. "
    "ConcurrentStorage wraps FileStorage, so these tests no longer work. "
    "New sessions use unified storage at sessions/{session_id}/state.json"
)
@pytest.mark.asyncio
async def test_batched_write_cache_consistency(tmp_path: Path):
    """The cache may only be populated after a batched write has landed.

    Regression test for the race where the cache was filled before the
    batched write completed.
    """
    storage = ConcurrentStorage(tmp_path, batch_interval=0.05)
    await storage.start()
    try:
        run_id = "test-run-2"
        key = f"run:{run_id}"

        # Queue a batched (non-immediate) save.
        queued = create_test_run(run_id, status=RunStatus.RUNNING)
        await storage.save_run(queued, immediate=False)

        # Until the batch flushes, nothing should be cached.
        # (This is the fix - previously the cache was updated immediately.)
        assert key not in storage._cache, (
            "Cache should not be updated before batch is flushed"
        )

        # Poll for the flush rather than sleeping a fixed amount (CI-friendly).
        attempts = 0
        while attempts < 500 and key not in storage._cache:  # 500 * 0.01s = 5s max
            await asyncio.sleep(0.01)
            attempts += 1

        assert key in storage._cache, "Cache should be updated after batch flush"

        # The on-disk copy must agree with what was queued.
        from_disk = await storage.load_run(run_id, use_cache=False)
        assert from_disk is not None
        assert from_disk.id == run_id
        assert from_disk.status == RunStatus.RUNNING
    finally:
        await storage.stop()
|
||||
|
||||
|
||||
@pytest.mark.skip(
    reason="FileStorage.save_run() is deprecated and now a no-op. "
    "ConcurrentStorage wraps FileStorage, so these tests no longer work. "
    "New sessions use unified storage at sessions/{session_id}/state.json"
)
@pytest.mark.asyncio
async def test_immediate_write_updates_cache(tmp_path: Path):
    """Immediate (non-batched) writes must populate the cache right away."""
    storage = ConcurrentStorage(tmp_path)
    await storage.start()
    try:
        run_id = "test-run-3"
        key = f"run:{run_id}"

        # immediate=True bypasses batching, so the cache updates synchronously.
        completed = create_test_run(run_id, status=RunStatus.COMPLETED)
        await storage.save_run(completed, immediate=True)

        assert key in storage._cache, "Cache should be updated after immediate write"

        # The cached entry must reflect exactly what was saved.
        cached = storage._cache[key].value
        assert cached.id == run_id
        assert cached.status == RunStatus.COMPLETED
    finally:
        await storage.stop()
|
||||
|
||||
|
||||
@pytest.mark.skip(
    reason="FileStorage.save_run() is deprecated and now a no-op. "
    "ConcurrentStorage wraps FileStorage, so these tests no longer work. "
    "New sessions use unified storage at sessions/{session_id}/state.json"
)
@pytest.mark.asyncio
async def test_summary_cache_invalidated_on_multiple_saves(tmp_path: Path):
    """Test that summary cache is invalidated on each save, not just the first."""
    storage = ConcurrentStorage(tmp_path)
    await storage.start()

    try:
        run_id = "test-run-4"

        # First save: establish the run and warm the summary cache below.
        run = create_test_run(run_id, status=RunStatus.RUNNING)
        await storage.save_run(run, immediate=True)

        # Load summary to cache it
        summary1 = await storage.load_summary(run_id)
        assert summary1.status == RunStatus.RUNNING

        # Second save — NOTE(review): the status is unchanged here (still
        # RUNNING), so this save carries identical data and does not actually
        # exercise "a new status" as previously claimed; confirm intent.
        run.status = RunStatus.RUNNING
        await storage.save_run(run, immediate=True)

        # Load summary - should be fresh (not served from a stale cache entry)
        summary2 = await storage.load_summary(run_id)
        assert summary2.status == RunStatus.RUNNING

        # Third save with final status - this one genuinely changes the data.
        run.status = RunStatus.COMPLETED
        await storage.save_run(run, immediate=True)

        # Load summary - should be fresh again and reflect COMPLETED.
        summary3 = await storage.load_summary(run_id)
        assert summary3.status == RunStatus.COMPLETED
    finally:
        await storage.stop()
|
||||
@@ -0,0 +1,344 @@
|
||||
"""
|
||||
Regression tests for conditional edge direct key access (Issue #3599).
|
||||
|
||||
Verifies that node outputs are written to memory before edge evaluation,
|
||||
enabling direct key access in conditional expressions (e.g., 'score > 80')
|
||||
instead of requiring output['score'] > 80 syntax.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
|
||||
from framework.graph.executor import GraphExecutor
|
||||
from framework.graph.goal import Goal
|
||||
from framework.graph.node import NodeContext, NodeProtocol, NodeResult, NodeSpec
|
||||
from framework.runtime.core import Runtime
|
||||
|
||||
|
||||
class SimpleRuntime(Runtime):
    """Stub Runtime: every hook is a no-op or returns a fixed value."""

    def start_run(self, **kwargs):
        # Always report the same run identifier.
        return "test-run"

    def end_run(self, **kwargs):
        # No teardown required for tests.
        pass

    def report_problem(self, **kwargs):
        # Problems are ignored in tests.
        pass

    def decide(self, **kwargs):
        # Canned decision keeps test runs deterministic.
        return "test-decision"

    def record_outcome(self, **kwargs):
        # Outcomes are discarded in tests.
        pass

    def set_node(self, **kwargs):
        # Node tracking is not needed in tests.
        pass
|
||||
|
||||
|
||||
class ScoreNode(NodeProtocol):
    """Test node emitting a fixed score of 85."""

    async def execute(self, ctx: NodeContext) -> NodeResult:
        payload = {"score": 85}
        return NodeResult(success=True, output=payload)
|
||||
|
||||
|
||||
class HighScoreNode(NodeProtocol):
    """Target node taken when the high-score condition passes."""

    async def execute(self, ctx: NodeContext) -> NodeResult:
        payload = {"result": "high_score_path"}
        return NodeResult(success=True, output=payload)
|
||||
|
||||
|
||||
class MultiKeyNode(NodeProtocol):
    """Test node emitting two keys (x=100, y=50) in one output."""

    async def execute(self, ctx: NodeContext) -> NodeResult:
        payload = {"x": 100, "y": 50}
        return NodeResult(success=True, output=payload)
|
||||
|
||||
|
||||
class ConsumerNode(NodeProtocol):
    """Generic downstream node that marks its input as processed."""

    async def execute(self, ctx: NodeContext) -> NodeResult:
        payload = {"processed": True}
        return NodeResult(success=True, output=payload)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_direct_key_access_in_conditional_edge():
    """
    Core regression test for issue #3599: a conditional edge may reference
    a node's output key directly (e.g. 'score > 80').

    Before the fix, outputs were only written to memory during input
    mapping (after edge evaluation), so direct key access raised NameError.
    """
    goal = Goal(
        id="test-direct-key",
        name="Test Direct Key Access",
        description="Test that direct key access works in conditional edges",
    )

    producer = NodeSpec(
        id="score_node",
        name="ScoreNode",
        description="Outputs a score",
        node_type="function",
        output_keys=["score"],
    )
    high_handler = NodeSpec(
        id="high_score_node",
        name="HighScoreNode",
        description="Handles high scores",
        node_type="function",
        input_keys=["score"],
        output_keys=["result"],
    )

    # The expression names the key directly, not via output["score"].
    direct_edge = EdgeSpec(
        id="score_to_high",
        source="score_node",
        target="high_score_node",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="score > 80",
    )

    graph = GraphSpec(
        id="test-graph",
        goal_id="test-direct-key",
        entry_node="score_node",
        nodes=[producer, high_handler],
        edges=[direct_edge],
        terminal_nodes=["high_score_node"],
    )

    executor = GraphExecutor(runtime=SimpleRuntime(storage_path="/tmp/test"))
    executor.register_node("score_node", ScoreNode())
    executor.register_node("high_score_node", HighScoreNode())

    result = await executor.execute(graph, goal, {})

    # The edge must have fired: high_score_node appears in the path.
    assert result.success, "Execution should succeed"
    assert "high_score_node" in result.path, (
        f"Expected high_score_node in path. "
        f"Condition 'score > 80' should evaluate to True (score=85). "
        f"Path: {result.path}"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_backward_compatibility_output_syntax():
    """
    Ensure the explicit output['key'] syntax in conditional expressions
    keeps working after the direct-key-access fix (issue #3599).
    """
    goal = Goal(
        id="test-backward-compat",
        name="Test Backward Compatibility",
        description="Test that output['key'] syntax still works",
    )

    producer = NodeSpec(
        id="score_node",
        name="ScoreNode",
        description="Outputs a score",
        node_type="function",
        output_keys=["score"],
    )
    consumer = NodeSpec(
        id="consumer_node",
        name="ConsumerNode",
        description="Consumer",
        node_type="function",
        input_keys=["score"],
        output_keys=["processed"],
    )

    # Legacy explicit-dictionary syntax must still evaluate correctly.
    legacy_edge = EdgeSpec(
        id="score_to_consumer",
        source="score_node",
        target="consumer_node",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="output['score'] > 80",
    )

    graph = GraphSpec(
        id="test-graph-compat",
        goal_id="test-backward-compat",
        entry_node="score_node",
        nodes=[producer, consumer],
        edges=[legacy_edge],
        terminal_nodes=["consumer_node"],
    )

    executor = GraphExecutor(runtime=SimpleRuntime(storage_path="/tmp/test"))
    executor.register_node("score_node", ScoreNode())
    executor.register_node("consumer_node", ConsumerNode())

    result = await executor.execute(graph, goal, {})

    # The legacy expression must still route execution to the consumer.
    assert result.success, "Execution should succeed"
    assert "consumer_node" in result.path, (
        f"Expected consumer_node in path. "
        f"Old syntax output['score'] > 80 should still work. "
        f"Path: {result.path}"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multiple_keys_in_expression():
    """
    Complex expressions referencing several direct keys (e.g.
    'x > y and y < 100') must work: both keys have to be in memory
    before the edge is evaluated.
    """
    goal = Goal(
        id="test-multi-key",
        name="Test Multiple Keys",
        description="Test multiple keys in conditional expression",
    )

    producer = NodeSpec(
        id="multi_key_node",
        name="MultiKeyNode",
        description="Outputs multiple keys",
        node_type="function",
        output_keys=["x", "y"],
    )
    consumer = NodeSpec(
        id="consumer_node",
        name="ConsumerNode",
        description="Consumer",
        node_type="function",
        input_keys=["x", "y"],
        output_keys=["processed"],
    )

    # Both x and y are referenced directly in one boolean expression.
    combined_edge = EdgeSpec(
        id="multi_to_consumer",
        source="multi_key_node",
        target="consumer_node",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="x > y and y < 100",
    )

    graph = GraphSpec(
        id="test-graph-multi",
        goal_id="test-multi-key",
        entry_node="multi_key_node",
        nodes=[producer, consumer],
        edges=[combined_edge],
        terminal_nodes=["consumer_node"],
    )

    executor = GraphExecutor(runtime=SimpleRuntime(storage_path="/tmp/test"))
    executor.register_node("multi_key_node", MultiKeyNode())
    executor.register_node("consumer_node", ConsumerNode())

    result = await executor.execute(graph, goal, {})

    # With x=100 and y=50 the condition holds, so the consumer must run.
    assert result.success, "Execution should succeed"
    assert "consumer_node" in result.path, (
        f"Expected consumer_node in path. "
        f"Condition 'x > y and y < 100' should be True (x=100, y=50). "
        f"Path: {result.path}"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_negative_case_condition_false():
    """
    A condition that evaluates to False must prevent the edge from being
    followed: the target node never executes.
    """
    goal = Goal(
        id="test-negative",
        name="Test Negative Case",
        description="Test condition evaluates to False correctly",
    )

    class LowScoreNode(NodeProtocol):
        """Node that outputs a LOW score."""

        async def execute(self, ctx: NodeContext) -> NodeResult:
            return NodeResult(success=True, output={"score": 30})

    producer = NodeSpec(
        id="low_score_node",
        name="LowScoreNode",
        description="Outputs low score",
        node_type="function",
        output_keys=["score"],
    )
    handler = NodeSpec(
        id="high_score_handler",
        name="HighScoreHandler",
        description="Should NOT execute",
        node_type="function",
        input_keys=["score"],
        output_keys=["result"],
    )

    # 30 > 80 is False, so this edge must not fire.
    gated_edge = EdgeSpec(
        id="low_to_high",
        source="low_score_node",
        target="high_score_handler",
        condition=EdgeCondition.CONDITIONAL,
        condition_expr="score > 80",
    )

    graph = GraphSpec(
        id="test-graph-negative",
        goal_id="test-negative",
        entry_node="low_score_node",
        nodes=[producer, handler],
        edges=[gated_edge],
        terminal_nodes=["high_score_handler"],
    )

    executor = GraphExecutor(runtime=SimpleRuntime(storage_path="/tmp/test"))
    executor.register_node("low_score_node", LowScoreNode())
    # HighScoreNode is registered as a stand-in handler; it must never run.
    executor.register_node("high_score_handler", HighScoreNode())

    result = await executor.execute(graph, goal, {})

    # The handler must be absent from the execution path.
    assert result.success, "Execution should succeed"
    assert "high_score_handler" not in result.path, (
        f"high_score_handler should NOT be in path. "
        f"Condition 'score > 80' should be False (score=30). "
        f"Path: {result.path}"
    )
|
||||
@@ -951,7 +951,7 @@ async def test_client_facing_node_streams_output():
|
||||
config=LoopConfig(max_iterations=5),
|
||||
)
|
||||
|
||||
    # Text-only on client_facing does not block (no ask_user called),
    # so the node completes without needing a shutdown workaround.
|
||||
result = await node.execute(ctx)
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.observability import clear_trace_context, set_trace_context
|
||||
from framework.runtime.runtime_log_schemas import (
|
||||
NodeDetail,
|
||||
NodeStepLog,
|
||||
@@ -464,6 +465,114 @@ class TestRuntimeLogger:
|
||||
assert tool_logs.steps[0].verdict == "RETRY"
|
||||
assert tool_logs.steps[2].verdict == "ACCEPT"
|
||||
|
||||
    @pytest.mark.asyncio
    async def test_trace_context_populated_in_l1_l2_l3(self, tmp_path: Path):
        """With trace context set, L3/L2/L1 entries include trace_id, span_id, execution_id."""
        # Install a known trace context; it is cleared in the finally block
        # so later tests are not polluted by this global state.
        set_trace_context(
            trace_id="a1b2c3d4e5f6789012345678abcdef01",
            execution_id="b2c3d4e5f6789012345678abcdef0123",
        )
        try:
            store = RuntimeLogStore(tmp_path / "logs")
            rl = RuntimeLogger(store=store, agent_id="test-agent")
            run_id = rl.start_run("goal-1")

            # Minimal run lifecycle: one step, one node completion, run end —
            # enough to produce entries at all three log levels.
            rl.log_step(
                node_id="node-1",
                node_type="event_loop",
                step_index=0,
                llm_text="Step.",
                input_tokens=10,
                output_tokens=5,
            )
            rl.log_node_complete(
                node_id="node-1",
                node_name="Search",
                node_type="event_loop",
                success=True,
                exit_status="success",
            )
            await rl.end_run(
                status="success",
                duration_ms=100,
                node_path=["node-1"],
                execution_quality="clean",
            )

            # L3: tool_logs — the step entry must carry the context ids.
            tool_logs = await store.load_tool_logs(run_id)
            assert tool_logs is not None
            assert len(tool_logs.steps) == 1
            step = tool_logs.steps[0]
            assert step.trace_id == "a1b2c3d4e5f6789012345678abcdef01"
            assert step.execution_id == "b2c3d4e5f6789012345678abcdef0123"
            # span_id is only checked for format here: 16 lowercase hex chars
            # (presumably generated per entry — not supplied by the context).
            assert len(step.span_id) == 16
            assert all(c in "0123456789abcdef" for c in step.span_id)

            # L2: details — per-node record also carries trace_id + span_id.
            details = await store.load_details(run_id)
            assert details is not None
            assert len(details.nodes) == 1
            nd = details.nodes[0]
            assert nd.trace_id == "a1b2c3d4e5f6789012345678abcdef01"
            assert len(nd.span_id) == 16

            # L1: summary — run-level record carries trace_id + execution_id.
            summary = await store.load_summary(run_id)
            assert summary is not None
            assert summary.trace_id == "a1b2c3d4e5f6789012345678abcdef01"
            assert summary.execution_id == "b2c3d4e5f6789012345678abcdef0123"
        finally:
            clear_trace_context()
|
||||
|
||||
    @pytest.mark.asyncio
    async def test_trace_context_empty_when_not_set(self, tmp_path: Path):
        """Without trace context, L3/L2/L1 trace_id and execution_id are empty."""
        # Defensively clear any context left over from a previous test.
        clear_trace_context()
        store = RuntimeLogStore(tmp_path / "logs")
        rl = RuntimeLogger(store=store, agent_id="test-agent")
        run_id = rl.start_run("goal-1")

        # Same minimal lifecycle as the positive test: step, node
        # completion, run end — but with no trace context installed.
        rl.log_step(
            node_id="node-1",
            node_type="event_loop",
            step_index=0,
            llm_text="Step.",
            input_tokens=10,
            output_tokens=5,
        )
        rl.log_node_complete(
            node_id="node-1",
            node_name="Search",
            node_type="event_loop",
            success=True,
            exit_status="success",
        )
        await rl.end_run(
            status="success",
            duration_ms=100,
            node_path=["node-1"],
            execution_quality="clean",
        )

        # L3: trace_id and execution_id from context should be empty
        tool_logs = await store.load_tool_logs(run_id)
        assert tool_logs is not None
        assert len(tool_logs.steps) == 1
        assert tool_logs.steps[0].trace_id == ""
        assert tool_logs.steps[0].execution_id == ""

        # L2: per-node detail record is likewise empty.
        details = await store.load_details(run_id)
        assert details is not None
        assert details.nodes[0].trace_id == ""

        # L1: run summary is likewise empty.
        summary = await store.load_summary(run_id)
        assert summary is not None
        assert summary.trace_id == ""
        assert summary.execution_id == ""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_multi_node_lifecycle(self, tmp_path: Path):
|
||||
"""Test logging across multiple nodes in a graph run."""
|
||||
|
||||
@@ -0,0 +1,172 @@
|
||||
# Agent Runtime
|
||||
|
||||
Unified execution system for all Hive agents. Every agent — single-entry or multi-entry, headless or TUI — runs through the same runtime stack.
|
||||
|
||||
## Topology
|
||||
|
||||
```
|
||||
AgentRunner.load(agent_path)
|
||||
|
|
||||
AgentRunner
|
||||
(factory + public API)
|
||||
|
|
||||
_setup_agent_runtime()
|
||||
|
|
||||
AgentRuntime
|
||||
(lifecycle + orchestration)
|
||||
/ | \\
|
||||
Stream A Stream B Stream C ← one per entry point
|
||||
| | |
|
||||
GraphExecutor GraphExecutor GraphExecutor
|
||||
| | |
|
||||
Node → Node → Node (graph traversal)
|
||||
```
|
||||
|
||||
Single-entry agents get a `"default"` entry point automatically. There is no separate code path.
|
||||
|
||||
## Components
|
||||
|
||||
| Component | File | Role |
|
||||
| --- | --- | --- |
|
||||
| `AgentRunner` | `runner/runner.py` | Load agents, configure tools/LLM, expose high-level API |
|
||||
| `AgentRuntime` | `runtime/agent_runtime.py` | Lifecycle management, entry point routing, event bus |
|
||||
| `ExecutionStream` | `runtime/execution_stream.py` | Per-entry-point execution queue, session persistence |
|
||||
| `GraphExecutor` | `graph/executor.py` | Node traversal, tool dispatch, checkpointing |
|
||||
| `EventBus` | `runtime/event_bus.py` | Pub/sub for execution events (streaming, I/O) |
|
||||
| `SharedStateManager` | `runtime/shared_state.py` | Cross-stream state with isolation levels |
|
||||
| `OutcomeAggregator` | `runtime/outcome_aggregator.py` | Goal progress tracking across streams |
|
||||
| `SessionStore` | `storage/session_store.py` | Session state persistence (`sessions/{id}/state.json`) |
|
||||
|
||||
## Programming Interface
|
||||
|
||||
### AgentRunner (high-level)
|
||||
|
||||
```python
|
||||
from framework.runner import AgentRunner
|
||||
|
||||
# Load and run
|
||||
runner = AgentRunner.load("exports/my_agent", model="anthropic/claude-sonnet-4-20250514")
|
||||
result = await runner.run({"query": "hello"})
|
||||
|
||||
# Resume from paused session
|
||||
result = await runner.run({"query": "continue"}, session_state=saved_state)
|
||||
|
||||
# Lifecycle
|
||||
await runner.start() # Start the runtime
|
||||
await runner.stop() # Stop the runtime
|
||||
exec_id = await runner.trigger("default", {}) # Non-blocking trigger
|
||||
progress = await runner.get_goal_progress() # Goal evaluation
|
||||
entry_points = runner.get_entry_points() # List entry points
|
||||
|
||||
# Context manager
|
||||
async with AgentRunner.load("exports/my_agent") as runner:
|
||||
result = await runner.run({"query": "hello"})
|
||||
|
||||
# Cleanup
|
||||
runner.cleanup() # Synchronous
|
||||
await runner.cleanup_async() # Asynchronous
|
||||
```
|
||||
|
||||
### AgentRuntime (lower-level)
|
||||
|
||||
```python
|
||||
from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
|
||||
from framework.runtime.execution_stream import EntryPointSpec
|
||||
|
||||
# Create runtime with entry points
|
||||
runtime = create_agent_runtime(
|
||||
graph=graph,
|
||||
goal=goal,
|
||||
storage_path=Path("~/.hive/agents/my_agent"),
|
||||
entry_points=[
|
||||
EntryPointSpec(id="default", name="Default", entry_node="start", trigger_type="manual"),
|
||||
],
|
||||
llm=llm,
|
||||
tools=tools,
|
||||
tool_executor=tool_executor,
|
||||
checkpoint_config=checkpoint_config,
|
||||
)
|
||||
|
||||
# Lifecycle
|
||||
await runtime.start()
|
||||
await runtime.stop()
|
||||
|
||||
# Execution
|
||||
exec_id = await runtime.trigger("default", {"query": "hello"}) # Non-blocking
|
||||
result = await runtime.trigger_and_wait("default", {"query": "hello"}) # Blocking
|
||||
result = await runtime.trigger_and_wait("default", {}, session_state=state) # Resume
|
||||
|
||||
# Client-facing node I/O
|
||||
await runtime.inject_input(node_id="chat", content="user response")
|
||||
|
||||
# Events
|
||||
sub_id = runtime.subscribe_to_events(
|
||||
event_types=[EventType.CLIENT_OUTPUT_DELTA],
|
||||
handler=my_handler,
|
||||
)
|
||||
runtime.unsubscribe_from_events(sub_id)
|
||||
|
||||
# Inspection
|
||||
runtime.is_running # bool
|
||||
runtime.event_bus # EventBus
|
||||
runtime.state_manager # SharedStateManager
|
||||
runtime.get_stats() # Runtime statistics
|
||||
```
|
||||
|
||||
## Execution Flow
|
||||
|
||||
1. `AgentRunner.run()` calls `AgentRuntime.trigger_and_wait()`
|
||||
2. `AgentRuntime` routes to the `ExecutionStream` for the entry point
|
||||
3. `ExecutionStream` creates a `GraphExecutor` and calls `execute()`
|
||||
4. `GraphExecutor` traverses nodes, dispatches tools, manages checkpoints
|
||||
5. `ExecutionResult` flows back up through the stack
|
||||
6. `ExecutionStream` writes session state to disk
|
||||
|
||||
## Session Resume
|
||||
|
||||
All execution paths support session resume:
|
||||
|
||||
```python
|
||||
# First run (agent pauses at a client-facing node)
|
||||
result = await runner.run({"query": "start task"})
|
||||
# result.paused_at = "review-node"
|
||||
# result.session_state = {"memory": {...}, "paused_at": "review-node", ...}
|
||||
|
||||
# Resume
|
||||
result = await runner.run({"input": "approved"}, session_state=result.session_state)
|
||||
```
|
||||
|
||||
Session state flows: `AgentRunner.run()` → `AgentRuntime.trigger_and_wait()` → `ExecutionStream.execute()` → `GraphExecutor.execute()`.
|
||||
|
||||
Checkpoints are saved at node boundaries (`sessions/{id}/checkpoints/`) for crash recovery.
|
||||
|
||||
## Event Bus
|
||||
|
||||
The `EventBus` provides real-time execution visibility:
|
||||
|
||||
| Event | When |
|
||||
| --- | --- |
|
||||
| `NODE_STARTED` | Node begins execution |
|
||||
| `NODE_COMPLETED` | Node finishes |
|
||||
| `TOOL_CALL_STARTED` | Tool invocation begins |
|
||||
| `TOOL_CALL_COMPLETED` | Tool invocation finishes |
|
||||
| `CLIENT_OUTPUT_DELTA` | Agent streams text to user |
|
||||
| `CLIENT_INPUT_REQUESTED` | Agent needs user input |
|
||||
| `EXECUTION_COMPLETED` | Full execution finishes |
|
||||
|
||||
In headless mode, `AgentRunner` subscribes to `CLIENT_OUTPUT_DELTA` and `CLIENT_INPUT_REQUESTED` to print output and read stdin. In TUI mode, `AdenTUI` subscribes to route events to UI widgets.
|
||||
|
||||
## Storage Layout
|
||||
|
||||
```
|
||||
~/.hive/agents/{agent_name}/
|
||||
sessions/
|
||||
session_YYYYMMDD_HHMMSS_{uuid}/
|
||||
state.json # Session state (status, memory, progress)
|
||||
checkpoints/ # Node-boundary snapshots
|
||||
logs/
|
||||
summary.json # Execution summary
|
||||
details.jsonl # Detailed event log
|
||||
tool_logs.jsonl # Tool call log
|
||||
runtime_logs/ # Cross-session runtime logs
|
||||
```
|
||||
+27
-6
@@ -5,12 +5,31 @@ Aden Hive is a Python-based agent framework. Configuration is handled through en
|
||||
## Configuration Overview
|
||||
|
||||
```
|
||||
Environment variables            (API keys, runtime flags)
Agent config.py                  (per-agent settings: model, tools, storage)
pyproject.toml                   (package metadata and dependencies)
.mcp.json                        (MCP server connections)
~/.hive/configuration.json       (global defaults: provider, model, max_tokens)
|
||||
```
|
||||
|
||||
## Global Configuration (~/.hive/configuration.json)
|
||||
|
||||
The `quickstart.sh` script creates this file during setup. It stores the default LLM provider, model, and max_tokens used by all agents unless overridden in an agent's own `config.py`.
|
||||
|
||||
```json
|
||||
{
|
||||
"llm": {
|
||||
"provider": "anthropic",
|
||||
"model": "claude-sonnet-4-5-20250929",
|
||||
"max_tokens": 8192,
|
||||
"api_key_env_var": "ANTHROPIC_API_KEY"
|
||||
},
|
||||
"created_at": "2026-01-15T12:00:00+00:00"
|
||||
}
|
||||
```
|
||||
|
||||
The default `max_tokens` value (8192) is defined as `DEFAULT_MAX_TOKENS` in `framework.graph.edge` and re-exported from `framework.graph`. Each agent's `RuntimeConfig` reads from this file at startup. To change defaults, either re-run `quickstart.sh` or edit the file directly.
|
||||
|
||||
## Environment Variables
|
||||
|
||||
### LLM Providers (at least one required for real execution)
|
||||
@@ -61,14 +80,16 @@ Each agent package in `exports/` contains its own `config.py`:
|
||||
```python
|
||||
# exports/my_agent/config.py
|
||||
CONFIG = {
|
||||
    "model": "anthropic/claude-sonnet-4-5-20250929",  # Default LLM model
    "max_tokens": 8192,  # default: DEFAULT_MAX_TOKENS from framework.graph
|
||||
"temperature": 0.7,
|
||||
"tools": ["web_search", "pdf_read"], # MCP tools to enable
|
||||
"storage_path": "/tmp/my_agent", # Runtime data location
|
||||
}
|
||||
```
|
||||
|
||||
If `model` or `max_tokens` are omitted, the agent loads defaults from `~/.hive/configuration.json`.
|
||||
|
||||
### Agent Graph Specification
|
||||
|
||||
Agent behavior is defined in `agent.json` (or constructed in `agent.py`):
|
||||
|
||||
@@ -116,6 +116,16 @@ Skills are also available in Cursor. To enable:
|
||||
3. Restart Cursor to load the MCP servers from `.cursor/mcp.json`
|
||||
4. Type `/` in Agent chat and search for skills (e.g., `/hive-create`)
|
||||
|
||||
|
||||
### Opencode Support
|
||||
To enable Opencode integration:
|
||||
|
||||
1. Create/Ensure `.opencode/` directory exists
|
||||
2. Configure MCP servers in `.opencode/mcp.json`
|
||||
3. Restart Opencode to load the MCP servers
|
||||
4. Switch to the Hive agent
|
||||
* **Tools:** Accesses `agent-builder` and standard `tools` via standard MCP protocols over stdio.
|
||||
|
||||
### Verify Setup
|
||||
|
||||
```bash
|
||||
@@ -163,6 +173,7 @@ hive/ # Repository root
|
||||
│ │ ├── llm/ # LLM provider integrations (Anthropic, OpenAI, etc.)
|
||||
│ │ ├── mcp/ # MCP server integration
|
||||
│ │ ├── runner/ # AgentRunner - loads and runs agents
|
||||
| | ├── observability/ # Structured logging - human-readable and machine-parseable tracing
|
||||
│ │ ├── runtime/ # Runtime environment
|
||||
│ │ ├── schemas/ # Data schemas
|
||||
│ │ ├── storage/ # File-based persistence
|
||||
|
||||
+40
-32
@@ -65,28 +65,26 @@ source .venv/bin/activate
|
||||
|
||||
If you prefer to set up manually or the script fails:
|
||||
|
||||
### 1. Sync Workspace Dependencies

```bash
# From repository root - this creates a single .venv at the root
uv sync
```
|
||||
|
||||
> **Note:** The `uv sync` command uses the workspace configuration in `pyproject.toml` to install both `core` (framework) and `tools` (aden_tools) packages together. This is the recommended approach over individual `pip install -e` commands which may fail due to circular dependencies.

### 2. Activate the Virtual Environment

```bash
|
||||
# Linux/macOS
|
||||
source .venv/bin/activate
|
||||
|
||||
# Windows (PowerShell)
|
||||
.venv\Scripts\Activate.ps1
|
||||
```
|
||||
|
||||
### 3. Upgrade OpenAI Package
|
||||
|
||||
```bash
|
||||
# litellm requires openai >= 1.0.0
|
||||
uv pip install --upgrade "openai>=1.0.0"
|
||||
```
|
||||
|
||||
### 4. Verify Installation
|
||||
|
||||
```bash
|
||||
uv run python -c "import framework; print('✓ framework OK')"
|
||||
@@ -281,18 +279,20 @@ Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
|
||||
|
||||
### "ModuleNotFoundError: No module named 'framework'"
|
||||
|
||||
**Solution:** Sync the workspace dependencies:

```bash
# From repository root
uv sync
|
||||
```
|
||||
|
||||
### "ModuleNotFoundError: No module named 'aden_tools'"
|
||||
|
||||
**Solution:** Sync the workspace dependencies:

```bash
# From repository root
uv sync
|
||||
```
|
||||
|
||||
Or run the setup script:
|
||||
@@ -350,15 +350,14 @@ The Hive framework consists of three Python packages:
|
||||
|
||||
```
|
||||
hive/
|
||||
├── .venv/ # Single workspace venv (created by uv sync)
|
||||
├── core/ # Core framework (runtime, graph executor, LLM providers)
|
||||
│ ├── framework/
|
||||
│ ├── .venv/ # Created by quickstart.sh
|
||||
│ └── pyproject.toml
|
||||
│
|
||||
├── tools/ # Tools and MCP servers
|
||||
│ ├── src/
|
||||
│ │ └── aden_tools/ # Actual package location
|
||||
│ ├── .venv/ # Created by quickstart.sh
|
||||
│ └── pyproject.toml
|
||||
│
|
||||
├── exports/ # Agent packages (user-created, gitignored)
|
||||
@@ -368,28 +367,29 @@ hive/
|
||||
└── templates/ # Pre-built template agents
|
||||
```
|
||||
|
||||
## Separate Virtual Environments
|
||||
## Virtual Environment Setup
|
||||
|
||||
Hive primarily uses **uv** to create and manage separate virtual environments for `core` and `tools`.
|
||||
Hive uses **uv workspaces** to manage dependencies. When you run `uv sync` from the repository root, a **single `.venv`** is created at the root containing both packages.
|
||||
|
||||
The project uses separate virtual environments to:
|
||||
### Benefits of Workspace Mode
|
||||
|
||||
- Isolate dependencies and avoid conflicts
|
||||
- Allow independent development and testing of each package
|
||||
- Enable MCP servers to run with their specific dependencies
|
||||
- **Single environment** - No need to switch between multiple venvs
|
||||
- **Unified dependencies** - Consistent package versions across core and tools
|
||||
- **Simpler development** - One activation, access to everything
|
||||
|
||||
### How It Works
|
||||
|
||||
When you run `./quickstart.sh`, `uv` sets up:
|
||||
When you run `./quickstart.sh` or `uv sync`:
|
||||
|
||||
1. **core/.venv/** - Contains the `framework` package and its dependencies (anthropic, litellm, mcp, etc.)
|
||||
2. **tools/.venv/** - Contains the `aden_tools` package and its dependencies (beautifulsoup4, pandas, etc.)
|
||||
1. **/.venv/** - Single root virtual environment is created
|
||||
2. Both `framework` (from core/) and `aden_tools` (from tools/) are installed
|
||||
3. All dependencies (anthropic, litellm, beautifulsoup4, pandas, etc.) are resolved together
|
||||
|
||||
If you need to refresh environments manually, use `uv`:
|
||||
If you need to refresh the environment:
|
||||
|
||||
```bash
|
||||
cd core && uv sync
|
||||
cd ../tools && uv sync
|
||||
# From repository root
|
||||
uv sync
|
||||
```
|
||||
|
||||
### Cross-Package Imports
|
||||
@@ -521,7 +521,15 @@ export ADEN_CREDENTIALS_PATH="/custom/path"
|
||||
# Agent storage location (default: /tmp)
|
||||
export AGENT_STORAGE_PATH="/custom/storage"
|
||||
```
|
||||
## Opencode Setup
|
||||
|
||||
[Opencode](https://github.com/opencode-ai/opencode) is fully supported as a coding agent.
|
||||
|
||||
### Automatic Setup
|
||||
Run the quickstart script in the root directory:
|
||||
```bash
|
||||
./quickstart.sh
|
||||
```
|
||||
## Additional Resources
|
||||
|
||||
- **Framework Documentation:** [core/README.md](../core/README.md)
|
||||
|
||||
@@ -93,12 +93,12 @@ hive/
|
||||
│ └── pyproject.toml # Package metadata
|
||||
│
|
||||
├── tools/ # MCP Tools Package
|
||||
│ ├── mcp_server.py # MCP server entry point
|
||||
│ └── src/aden_tools/ # Tools for agent capabilities
|
||||
│ ├── tools/ # Individual tool implementations
|
||||
│ │ ├── web_search_tool/
|
||||
│ │ ├── web_scrape_tool/
|
||||
│ │ └── file_system_toolkits/
|
||||
│ └── mcp_server.py # HTTP MCP server
|
||||
│ └── tools/ # Individual tool implementations
|
||||
│ ├── web_search_tool/
|
||||
│ ├── web_scrape_tool/
|
||||
│ └── file_system_toolkits/
|
||||
│
|
||||
├── exports/ # Agent Packages (user-generated, not in repo)
|
||||
│ └── your_agent/ # Your agents created via /hive
|
||||
|
||||
@@ -32,26 +32,26 @@ Each goal has weighted success criteria that define what "done" looks like. Thes
|
||||
|
||||
```python
|
||||
Goal(
|
||||
id="twitter-outreach",
|
||||
name="Personalized Twitter Outreach",
|
||||
id="deep-research",
|
||||
name="Deep Research Report",
|
||||
success_criteria=[
|
||||
SuccessCriterion(
|
||||
id="personalized",
|
||||
description="Messages reference specific details from the prospect's profile",
|
||||
id="comprehensive",
|
||||
description="Report covers all major aspects of the research topic",
|
||||
metric="llm_judge",
|
||||
weight=0.4
|
||||
),
|
||||
SuccessCriterion(
|
||||
id="compliant",
|
||||
description="Messages follow brand voice guidelines",
|
||||
id="cited",
|
||||
description="All claims are backed by cited sources",
|
||||
metric="llm_judge",
|
||||
weight=0.3
|
||||
),
|
||||
SuccessCriterion(
|
||||
id="actionable",
|
||||
description="Each message includes a clear call to action",
|
||||
id="structured",
|
||||
description="Report has clear sections with headings and a summary",
|
||||
metric="output_contains",
|
||||
target="CTA",
|
||||
target="## Summary",
|
||||
weight=0.3
|
||||
),
|
||||
],
|
||||
|
||||
@@ -40,7 +40,7 @@ Welcome to the Aden Engineering Challenges! These quizzes are designed for stude
|
||||
After completing challenges, submit your work by:
|
||||
|
||||
1. Creating a GitHub Gist with your answers
|
||||
2. Emailing the link to `careers@adenhq.com` with subject: `[Engineering Challenge] Your Name - Track Name`
|
||||
2. Emailing the link to `contact@adenhq.com` with subject: `[Engineering Challenge] Your Name - Track Name`
|
||||
3. Include your GitHub username in the email
|
||||
|
||||
## Getting Help
|
||||
|
||||
@@ -0,0 +1,261 @@
|
||||
# Developer success
|
||||
Our value and principle is developer success. We truly care about helping developers achieve their goals — not just shipping features, but ensuring every developer who uses Hive can build, debug, deploy, and iterate on agents that work in production. Developer success means our developers succeed in their own work: automating real business processes, shipping products, and growing their capabilities. If our developers aren't winning, we aren't winning.
|
||||
|
||||
## Developer profiles
|
||||
From what we currently see, these are the developers who will achieve success with our framework the earliest:
|
||||
- IT Specialists and Consultants
|
||||
- Individual developers who want to build a product
|
||||
- Developers who want to get a job done (they have a real-world business process)
|
||||
- Developers who want to learn and become a business process owner
|
||||
- One-man CEOs
|
||||
|
||||
## How They Find Us & Why They Use Us
|
||||
|
||||
**IT Specialists and Consultants:**
|
||||
Always trying to learn and find the state-of-the-art tools on the market, as it defines their career. They tried Claude but found it hard to apply to their customers' needs. They received Vincent's email and wanted to give it a try. They see the opportunity to resell this product and become active users of ours.
|
||||
|
||||
**Developers Who Want to Get a Job Done:**
|
||||
They find us through our marketing efforts selling the sample agents and our SEO pages for business processes, while they're researching solutions to the problems they're trying to solve.
|
||||
|
||||
**Developers Who Want to Learn and Become a Business Process Owner:**
|
||||
They find us through the rage-bait post "If you're a developer that doesn't own a business process, you'll lose your job" and the seminars we host. They believe they need to upgrade themselves from just a coder to somebody who can own a process. They check the GitHub and find the templates interesting. Then they join our Discord to discover more agent ideas developed by the community.
|
||||
|
||||
**One-Man CEO:**
|
||||
Has a business idea and might have some traction, but is overwhelmed by too much work. They saw news saying AI agents can handle all their repetitive tasks. During research, they found us and our tutorials. After seeing a wall of sample agents and playing with them, they couldn't refuse the value and joined our Discord. [See roadmap — Hosted sample agent playgrounds]
|
||||
|
||||
**Individual Product Developer:**
|
||||
Has a product idea and is trying to find the best framework. They encounter a post from Patrick: "I built an AI agent that does market research for me every day using this new framework." They go to our GitHub, find the idea aligned with their vision, and join our Discord.
|
||||
|
||||
> **Note:** Individual product developers want to do one thing well and resell it. One-man CEOs have many things to do and need multiple agents.
|
||||
|
||||
> **Note:** Ordered by importance. Here is the rationale: Among all developers, IT people are going to be the first group to truly deploy their work in production and achieve real developer success. They are also likely to contribute to the framework. Developers who want to learn are the group who won't get things deployed anytime soon but can be good community members. The product developer is the more long-term play. As a dev tool, it would be a huge developer success if we have them building a product with it. It is the hardest challenge for our framework and also requires good product developers to spend time figuring things out. This is not going to happen in two months.
|
||||
|
||||
## What Is Their Success
|
||||
|
||||
**IT Specialists and Consultants:**
|
||||
Success means they're able to resell our framework to their customers and deliver use cases in a production environment. It will be critical for us to have a few "less serious" use cases so people know where to start.
|
||||
|
||||
**Developers Who Want to Get a Job Done:**
|
||||
The framework is adjustable enough for developers to either start from scratch or build from templates to get the job done.
|
||||
|
||||
Job done is considered as:
|
||||
1. The developer deploys it to production and gets users to use it
|
||||
2. The developer starts to own the business process and knows how to maintain it
|
||||
3. The developer can add more features and integrations to expand the agent's capability as the business process updates
|
||||
4. The developer is alerted when any failure/escalation happens and is able to debug the agent when sessions go wrong
|
||||
|
||||
**Developers Who Want to Learn and Become a Business Process Owner:**
|
||||
1. The developer learns from sample agents how business processes are done
|
||||
2. The developer can deploy a sample agent for their team to automate some processes
|
||||
3. The developer starts to own the business process and knows how to maintain it
|
||||
4. The developer can add more features and integrations to expand the agent's capability as the business process updates
|
||||
5. The developer is able to debug the agent when sessions go wrong
|
||||
|
||||
**One-Man CEO:**
|
||||
1. The developer can deploy multiple agents from sample agents
|
||||
2. The developer can tweak the agent according to their needs
|
||||
3. The developer can easily program a human-in-the-loop fallback so when the agent can't handle a problem, they receive a notification and fix the issue themselves
|
||||
4. The developer can generate ad-hoc agents that solve new issues for their business
|
||||
5. The developer can turn an ad-hoc agent into an agent that runs repeatedly
|
||||
6. The developer can turn a repeatedly-running agent into one that runs autonomously
|
||||
7. When the agent fails, the developer receives an alert
|
||||
|
||||
**Individual Product Developer:**
|
||||
1. The developer can develop an MVP with our generation framework
|
||||
2. The developer can easily add more capabilities
|
||||
3. The developer can trust the framework is future-proof for them
|
||||
4. The developer can have a deployment strategy where they wrap the agent as part of their product
|
||||
5. The developer can monitor the logs and costs for their users
|
||||
6. The product achieves success (like Unity), long term
|
||||
|
||||
```
|
||||
**Summary:**
|
||||
The common denominator:
|
||||
1. Can create an agent
|
||||
2. Can debug the agent
|
||||
3. Can maintain the agent
|
||||
4. Can deploy the agent
|
||||
5. Can iterate on the agent
|
||||
```
|
||||
|
||||
## Basic use cases (we shall have a template for each one of these)
|
||||
|
||||
- Github issue triaging agent
|
||||
- Tech & AI news digest agent
|
||||
- Research report agent
|
||||
- Teams daily digest and to-dos
|
||||
- Discord autoreply bot
|
||||
- Finance stock digest
|
||||
- WhatsApp auto response agent
|
||||
- Email followup agent
|
||||
- Meeting time coordination agent
|
||||
|
||||
## Intermediate use cases
|
||||
|
||||
### 1. Sales & Marketing
|
||||
Marketing is often the most time-consuming "distraction" for a CEO. You provide the vision; they provide the volume.
|
||||
|
||||
- [Social Media Management](../examples/recipes/social_media_management/): Scheduling posts, replying to comments, and monitoring trends.
|
||||
- [News Jacking](../examples/recipes/news_jacking/): Personalized outreach triggered by real-time company news (funding, hires, press mentions).
|
||||
- [Newsletter Production](../examples/recipes/newsletter_production/): Taking your raw ideas or voice memos and turning them into a polished weekly email.
|
||||
- [CRM Update Agent](../examples/recipes/crm_hygiene/): Ensuring every lead has a follow-up date and a status update.
|
||||
|
||||
### 2. Customer Success
|
||||
You shouldn't be the one answering "How do I reset my password?" but you should be the one closing $10k deals.
|
||||
|
||||
- [Inquiry Triaging](../examples/recipes/inquiry_triaging/): Sorting the "tire kickers" from the "hot leads."
|
||||
- [Onboarding Assistance](../examples/recipes/onboarding_assistance/): Helping new clients set up their accounts or sending out "Welcome" kits.
|
||||
- [Customer support & Troubleshooting](../examples/recipes/support_troubleshooting/): Handling "Level 1" tech support for your platform or website.
|
||||
|
||||
### 3. Operations Automation
|
||||
This is your right hand. They keep the gears greased so you don't get stuck in the "admin trap."
|
||||
|
||||
- [Email Inbox Management](../examples/recipes/inbox_management/): Clearing out the spam and highlighting the three emails that actually need your brain.
|
||||
- [Invoicing & Collections](../examples/recipes/invoicing_collections/): Sending out bills and—more importantly—politely chasing down the people who haven't paid them.
|
||||
- [Data Keeper](../examples/recipes/data_keeper/): Pull data and reports from multiple data sources, and union them in one place.
|
||||
- [Travel & Calendar Coordination](../examples/recipes/calendar_coordination/): Protecting your "Deep Work" time from getting fragmented by random 15-minute meetings.
|
||||
|
||||
### 4. The Technical & Product Maintenance
|
||||
Unless you are a developer, tech debt will kill your productivity. A part-timer can keep the lights on.
|
||||
|
||||
- [Quality Assurance](../examples/recipes/quality_assurance/): Testing new features or links before they go live to ensure nothing is broken.
|
||||
- [Documentation](../examples/recipes/documentation/): Turning your messy processes into clean Standard Operating Procedures (SOPs).
|
||||
- [Issue Triaging](../examples/recipes/issue_triaging/): Categorizing and routing incoming bug reports by severity.
|
||||
|
||||
## Installation
|
||||
|
||||
Install the prerequisites like Python, then install the quickstart package.
|
||||
|
||||
## Use Existing Agent
|
||||
|
||||
To run an existing agent:
|
||||
|
||||
1. Run `hive run <agent_name>` or `hive tui <agent_name>`
|
||||
2. Hive automatically validates that your agent has all required prerequisites
|
||||
3. Type something in the TUI or trigger an event source (like receiving an email)
|
||||
4. Your agent runs, and the outcome is recorded
|
||||
5. If something fails, you'll see where the logs are saved
|
||||
|
||||
## Agent Generation (Alternative to Using Existing Agent)
|
||||
|
||||
If you want to build something custom, you can generate your own agent from scratch. See [Agent Generation](#agent-generation).
|
||||
|
||||
If you prefer to start with a working example first, try running an existing agent to see how it works. See [Use Existing Agent](#use-existing-agent).
|
||||
|
||||
If you find something you can't accomplish with the framework, you can contribute by opening an issue or sharing your feedback in our Discord channel.
|
||||
|
||||
## Agent Testing
|
||||
|
||||
**Interactive testing:** Run `hive tui` to test your agent in a terminal UI.
|
||||
|
||||
**Autonomous testing:** Run `hive run <agent_name> --debug` and trigger the event source. Testing scheduled events can be tricky—Hive provides developer tools to help you simulate them.
|
||||
|
||||
**Try before you install:** You can test sample agents hosted in the cloud without any local installation.
|
||||
|
||||
## Integration
|
||||
|
||||
You need to set up integrations correctly before testing can succeed.
|
||||
|
||||
**Happy path:** Your agent accomplishes the goal exactly as specified.
|
||||
|
||||
**Mid path:** After negotiation, your agent explicitly tells you what it can and cannot do.
|
||||
|
||||
**Sad path:** After negotiation, you may need to build a one-off integration for certain tools.
|
||||
|
||||
## Agent Debugging
|
||||
|
||||
When errors or unexpected behavior happen during testing, you need to be able to debug your agent effectively.
|
||||
|
||||
## Logging
|
||||
|
||||
Hive gives you an AI-assisted experience for checking logs and getting high signal-to-noise insights.
|
||||
|
||||
Hive uses **three-level observability** for tracking agent execution:
|
||||
|
||||
| Level | What it captures | File |
|
||||
|-------|------------------|------|
|
||||
| **L1 (Summary)** | Run outcomes — success/failure, execution quality, attention flags | `summary.json` |
|
||||
| **L2 (Details)** | Per-node results — retries, verdicts, latency, attention reasons | `details.jsonl` |
|
||||
| **L3 (Tool Logs)** | Step-by-step execution — tool calls, LLM responses, judge feedback | `tool_logs.jsonl` |
|
||||
|
||||
## (Optional) How Graph Works
|
||||
|
||||
To fix and improve your agent, you need to understand how node memory works and how tools are called. See `docs/key_concepts` for details.
|
||||
|
||||
## **First Success**
|
||||
|
||||
By this point, you should have run your first agent and understand how the framework works. You're ready to use it for real use cases, which often means updating and customizing your agent.
|
||||
|
||||
Everything before your first success should run as smoothly as possible—this is non-negotiable.
|
||||
|
||||
## Contribution
|
||||
|
||||
If you encounter issues creating your desired agent, or find that the integrations aren't sufficient for your use case, open an issue or let us know in our Discord channel.
|
||||
|
||||
## Iteration (Building) - More Like Debugging
|
||||
|
||||
After your MVP agent or sample agent runs, you'll want to iterate by expanding the use cases.
|
||||
|
||||
## Iteration (Production) - Evolution and Inventiveness
|
||||
|
||||
After your MVP is deployed, your taste and judgment still drive the direction—AI is a significant force multiplier for rapidly iterating and solving problems.
|
||||
|
||||
With Aden Cloud Hive, production evolution is fully automatic. The Aden Queen Bee runs natural selection by deploying, evaluating, and improving your agents.
|
||||
|
||||
## Version Control
|
||||
|
||||
Iteration doesn't always improve everything. Version control helps you get back to a previous version, like how git works. Run `hive git restore` to revert changes.
|
||||
|
||||
## Agent Personality
|
||||
|
||||
You can put your own soul into your agent. What remains constant across evolution matters. Success isn't about having your agent constantly changing—it's about knowing that your goal and personality stay fixed while your agent adapts to solve problems.
|
||||
|
||||
## Memory Management
|
||||
|
||||
Hive nodes have a built-in mechanism for handling node memory and passing memory between nodes. To implement cross-session memory or custom memory logic, use the memory tools.
|
||||
|
||||
# Deployment
|
||||
|
||||
## (Optional) How Agent Runtime Works
|
||||
|
||||
To fix and improve your agent, you need to understand how data transfers during runtime, how memory works, and how tools work. See `./agent_runtime.md` for details.
|
||||
|
||||
## Local Deployment
|
||||
|
||||
By default, Hive supports deployment through Docker.
|
||||
|
||||
1. Pre-flight Validation (Critical)
|
||||
2. One-Command Deployment (`hive deploy local my_agent`)
|
||||
3. Credential Handling in Containers (local credentials + Aden Cloud Credentials for OAuth)
|
||||
4. Persistence & State
|
||||
5. Debugging/Logging/Memory Access (start with CLI commands)
|
||||
6. Expose Hooks and APIs as SDK
|
||||
7. Documentation Deliverables
|
||||
|
||||
## Cloud Deployment
|
||||
|
||||
If you want zero-ops deployment, easier integration and credential management, and built-in logging, Aden Cloud is ideal. You get secure defaults, scaling, and observability out of the box—at the cost of less low-level control and some vendor lock-in.
|
||||
|
||||
## Deployment Strategy
|
||||
|
||||
Autonomous and interactive modes look different, but the core remains the same, and your deployment strategy should be consistent across both.
|
||||
|
||||
## Performance
|
||||
|
||||
Not a focus at the moment. Speed of execution, process pools, and hallucination handling are future considerations.
|
||||
|
||||
## How We Collect Data
|
||||
|
||||
Self-reported issues and cloud observability products.
|
||||
|
||||
## Runtime Guardrails
|
||||
|
||||
Hive provides built-in safety mechanisms to keep your agents within bounds.
|
||||
|
||||
## How We Ensure Reliability
|
||||
|
||||
Breakages still happen, even in the best business processes. Being reliable means being adaptive and fixing problems when they arise.
|
||||
|
||||
## Developer Trust
|
||||
|
||||
To deploy your agent for production use, Hive provides transparency in runtime, sufficient control, and guardrails to avoid catastrophic results.
|
||||
@@ -0,0 +1,42 @@
|
||||
# Why Conditional Edges Need Priority (Function Nodes)
|
||||
|
||||
## The problem
|
||||
|
||||
Function nodes return everything they computed. They don't pick one output key — they return all of them.
|
||||
|
||||
```python
|
||||
def score_lead(inputs):
|
||||
score = compute_score(inputs["profile"])
|
||||
return {
|
||||
"score": score,
|
||||
"is_high_value": score > 80,
|
||||
"needs_enrichment": score > 50 and not inputs["profile"].get("company"),
|
||||
}
|
||||
```
|
||||
|
||||
Lead comes in: score 92, no company on file. Output: `{"score": 92, "is_high_value": True, "needs_enrichment": True}`.
|
||||
|
||||
Two conditional edges leaving this node:
|
||||
|
||||
```
|
||||
Edge A: needs_enrichment == True → enrichment node
|
||||
Edge B: is_high_value == True → outreach node
|
||||
```
|
||||
|
||||
Both are true. Without priority, the graph either fans out to both (wrong — you'd email someone while still enriching their data) or picks one randomly (wrong — non-deterministic).
|
||||
|
||||
## Priority fixes it
|
||||
|
||||
```
|
||||
Edge A: needs_enrichment == True priority=2 (higher = checked first)
|
||||
Edge B: is_high_value == True priority=1
|
||||
Edge C: is_high_value == False priority=0
|
||||
```
|
||||
|
||||
Executor keeps only the highest-priority matching group. A wins. Lead gets enriched first, loops back, gets re-scored — now `needs_enrichment` is false, B wins, outreach happens.
|
||||
|
||||
## Why event loop nodes don't need this
|
||||
|
||||
The LLM understands "if/else." You tell it in the prompt: "if needs enrichment, set `needs_enrichment`. Otherwise if high value, set `approved`." It picks one. Only one conditional edge matches.
|
||||
|
||||
A function just returns a dict. It doesn't do "otherwise." Priority is the "otherwise" for function nodes.
|
||||
@@ -22,6 +22,32 @@ Each recipe is a markdown file (or folder with a markdown file) containing:
|
||||
|
||||
## Available recipes
|
||||
|
||||
### Sales & Marketing
|
||||
| Recipe | Description |
|
||||
|--------|-------------|
|
||||
| [marketing_agent](marketing_agent/) | Multi-channel marketing content generator with audience analysis and A/B copy variants |
|
||||
| [social_media_management](social_media_management/) | Schedule posts, reply to comments, monitor trends |
|
||||
| [newsletter_production](newsletter_production/) | Transform voice memos and ideas into polished emails |
|
||||
| [news_jacking](news_jacking/) | Personalized outreach triggered by real-time company news |
|
||||
| [crm_hygiene](crm_hygiene/) | Ensure every lead has follow-up dates and status |
|
||||
|
||||
### Customer Success
|
||||
| Recipe | Description |
|
||||
|--------|-------------|
|
||||
| [inquiry_triaging](inquiry_triaging/) | Sort tire kickers from hot leads |
|
||||
| [onboarding_assistance](onboarding_assistance/) | Guide new clients through setup and welcome kits |
|
||||
|
||||
### Operations Automation
|
||||
| Recipe | Description |
|
||||
|--------|-------------|
|
||||
| [inbox_management](inbox_management/) | Clear spam and surface emails that need your brain |
|
||||
| [invoicing_collections](invoicing_collections/) | Send invoices and chase overdue payments |
|
||||
| [data_keeper](data_keeper/) | Pull data from multiple sources into unified reports |
|
||||
| [calendar_coordination](calendar_coordination/) | Protect Deep Work time and book travel |
|
||||
|
||||
### Technical & Product Maintenance
|
||||
| Recipe | Description |
|
||||
|--------|-------------|
|
||||
| [quality_assurance](quality_assurance/) | Test features and links before they go live |
|
||||
| [documentation](documentation/) | Turn messy processes into clean SOPs |
|
||||
| [basic_troubleshooting](basic_troubleshooting/) | Handle Level 1 tech support |
|
||||
| [issue_triaging](issue_triaging/) | Categorize and route bug reports by severity |
|
||||
@@ -0,0 +1,36 @@
|
||||
# Recipe: Ad Campaign Monitoring
|
||||
|
||||
Checking daily spends on Meta/Google ads and flagging if the Cost Per Acquisition (CPA) spikes.
|
||||
|
||||
## Why
|
||||
|
||||
Ad platforms are designed to spend your money. Without daily oversight, a $50/day campaign can quietly become a $500 disaster. This agent watches your campaigns like a hawk, catching anomalies before they drain your budget and surfacing optimization opportunities you'd otherwise miss.
|
||||
|
||||
## What
|
||||
|
||||
- Monitor daily spend across all active campaigns
|
||||
- Track CPA, ROAS, CTR, and conversion metrics
|
||||
- Compare performance against historical benchmarks
|
||||
- Identify underperforming ads and audiences
|
||||
- Generate daily/weekly performance summaries
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| Meta Ads API | Facebook/Instagram campaign data |
|
||||
| Google Ads API | Search/Display/YouTube campaign data |
|
||||
| Google Analytics 4 | Conversion tracking and attribution |
|
||||
| Google Sheets | Performance dashboards and reporting |
|
||||
| Slack | Alerts and daily summaries |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| CPA spikes >30% above target | Alert with breakdown by ad set and pause recommendation |
|
||||
| Daily budget exhausted before noon | Immediate alert — possible click fraud or viral ad |
|
||||
| ROAS drops below profitability threshold | Pause campaign and notify with optimization suggestions |
|
||||
| Ad rejected by platform | Alert with rejection reason and suggested fix |
|
||||
| Competitor running aggressive campaign | Flag if detected through auction insights |
|
||||
| Budget pacing off by >20% | Alert with projected monthly spend |
|
||||
@@ -0,0 +1,37 @@
|
||||
# Recipe: Travel & Calendar Coordination
|
||||
|
||||
Protecting your "Deep Work" time from getting fragmented by random 15-minute meetings.
|
||||
|
||||
## Why
|
||||
|
||||
Your calendar is a battlefield. Everyone wants a slice of your time, and without protection, your days become a patchwork of 30-minute meetings with no room for actual work. This agent defends your schedule — booking travel, consolidating meetings, and protecting the focus time you need to think.
|
||||
|
||||
## What
|
||||
|
||||
- Block and protect "Deep Work" time slots
|
||||
- Batch similar meetings together to reduce context switching
|
||||
- Book travel (flights, hotels, ground transport)
|
||||
- Handle meeting requests and rescheduling
|
||||
- Prep briefing docs before important meetings
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| Google Calendar / Outlook | Calendar management |
|
||||
| Calendly / Cal.com | External scheduling |
|
||||
| TripIt / Google Flights / Kayak | Travel booking |
|
||||
| Expensify / Ramp | Travel expense tracking |
|
||||
| Notion / Google Docs | Meeting prep documents |
|
||||
| Slack | Schedule alerts and confirmations |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Someone tries to book over Deep Work time | Decline and offer alternatives, alert you if they push back |
|
||||
| VIP requests meeting during protected time | Flag for your decision — worth the exception? |
|
||||
| Flight cancelled or significantly delayed | Immediate alert with rebooking options |
|
||||
| Double-booking conflict | Alert with suggested resolution |
|
||||
| Meeting with no agenda 24h before | Prompt organizer for agenda, flag if none provided |
|
||||
| Travel cost exceeds budget threshold | Queue for approval before booking |
|
||||
@@ -0,0 +1,35 @@
|
||||
# Recipe: CRM Update
|
||||
|
||||
Ensuring every lead has a follow-up date and a status update.
|
||||
|
||||
## Why
|
||||
|
||||
A messy CRM is a leaky pipeline. Leads without follow-up dates get forgotten. Deals without status updates go stale. This agent keeps your CRM clean and actionable — so when you open it, you see exactly what needs your attention today.
|
||||
|
||||
## What
|
||||
|
||||
- Audit leads missing follow-up dates or status updates
|
||||
- Flag stale deals that haven't been touched in X days
|
||||
- Merge duplicate contacts and companies
|
||||
- Enrich records with missing data (email, phone, company info)
|
||||
- Generate daily "pipeline hygiene" reports
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| HubSpot / Salesforce / Pipedrive | CRM management |
|
||||
| Clearbit / Apollo / ZoomInfo | Data enrichment |
|
||||
| Google Sheets | Hygiene reports and audits |
|
||||
| Slack | Daily pipeline summary and action items |
|
||||
| Zapier / Make | Cross-platform data sync |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| High-value deal stale >14 days | Alert with deal history and suggested re-engagement |
|
||||
| Duplicate detected for active deal | Flag before merging — might be intentional |
|
||||
| Lead data conflicts with enrichment | Queue for human verification |
|
||||
| Pipeline value drops significantly week-over-week | Alert with analysis of what changed |
|
||||
| Follow-up overdue for >5 leads | Daily digest with prioritized action list |
|
||||
@@ -0,0 +1,38 @@
|
||||
# Recipe: Data Keeper
|
||||
|
||||
Pull data and reports from multiple data sources.
|
||||
|
||||
## Why
|
||||
|
||||
You can't steer the ship if you're the one manually copying and pasting numbers from Google Analytics into an Excel sheet. Every hour spent wrangling data is an hour not spent making decisions based on that data. This agent becomes your "Data DJ" — mixing sources, syncing sheets, and serving up the numbers you need when you need them.
|
||||
|
||||
## What
|
||||
|
||||
- Pull metrics from analytics, ads, CRM, and other platforms
|
||||
- Consolidate data into unified dashboards and spreadsheets
|
||||
- Generate daily/weekly/monthly reports automatically
|
||||
- Track KPIs and flag anomalies or trends
|
||||
- Keep data sources in sync (no more stale spreadsheets)
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| Google Analytics 4 | Website traffic and conversion data |
|
||||
| Google Sheets / Excel | Report destination and dashboards |
|
||||
| Meta Ads / Google Ads | Ad performance metrics |
|
||||
| Stripe / QuickBooks | Revenue and financial data |
|
||||
| HubSpot / Salesforce | Sales pipeline and CRM metrics |
|
||||
| Slack | Report delivery and anomaly alerts |
|
||||
| BigQuery / Snowflake | Data warehouse queries (if applicable) |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Data source API fails or returns errors | Alert with error details and last successful sync time |
|
||||
| KPI drops >20% week-over-week | Immediate alert with breakdown by segment |
|
||||
| Data discrepancy between sources | Flag for investigation — which source is correct? |
|
||||
| Report generation fails | Notify with error and offer manual trigger |
|
||||
| Unusual spike in any metric | Alert with context — is this real or a tracking bug? |
|
||||
| New data source requested | Queue for setup — may need credentials or API access |
|
||||
@@ -0,0 +1,37 @@
|
||||
# Recipe: Documentation
|
||||
|
||||
Turning your messy processes into clean Standard Operating Procedures (SOPs).
|
||||
|
||||
## Why
|
||||
|
||||
Knowledge trapped in your head is a liability. When you're the only one who knows how things work, you become the bottleneck for everything. This agent captures your processes, cleans them up, and turns them into documentation anyone can follow — including your future self.
|
||||
|
||||
## What
|
||||
|
||||
- Watch you perform processes and document the steps
|
||||
- Convert rough notes and recordings into structured SOPs
|
||||
- Maintain and update existing documentation
|
||||
- Identify undocumented processes that need capture
|
||||
- Create quick-reference guides and checklists
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| Notion / Confluence / GitBook | Documentation hosting |
|
||||
| Loom / Screen recording | Process capture |
|
||||
| Otter.ai / Whisper | Meeting and explanation transcription |
|
||||
| Slack | Documentation requests and updates |
|
||||
| GitHub | Technical documentation and READMEs |
|
||||
| Google Docs | Collaborative editing |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Process has conflicting documentation | Flag discrepancy for clarification |
|
||||
| SOP referenced but outdated >6 months | Queue for your review and update |
|
||||
| Someone asks question not covered by docs | Note the gap, draft new section for approval |
|
||||
| Critical process has no documentation | Alert as priority documentation needed |
|
||||
| Documentation contradicts current practice | Flag for reconciliation — update docs or process? |
|
||||
| External compliance requirement needs docs | Escalate with deadline and requirements |
|
||||
@@ -0,0 +1,35 @@
|
||||
# Recipe: Inbox Management
|
||||
|
||||
Clearing out the spam and highlighting the three emails that actually need your brain.
|
||||
|
||||
## Why
|
||||
|
||||
Email is where productivity goes to die. The average CEO gets 120+ emails per day, but only a handful actually matter. This agent acts as your email bouncer — filtering the noise so you can focus on the messages that move the needle.
|
||||
|
||||
## What
|
||||
|
||||
- Filter and archive spam, newsletters, and low-priority messages
|
||||
- Categorize emails by urgency and type (action needed, FYI, waiting on)
|
||||
- Summarize long email threads into key points
|
||||
- Draft responses for routine inquiries
|
||||
- Surface the 3-5 emails that truly need your attention
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| Gmail API / Microsoft Graph | Email access and management |
|
||||
| Google Calendar | Context for scheduling-related emails |
|
||||
| Slack | Daily inbox briefing and urgent alerts |
|
||||
| Notion | Email summary archive for reference |
|
||||
| Your CRM | Cross-reference with known contacts and deals |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Email from VIP contact (investor, key client, partner) | Surface immediately, never auto-respond |
|
||||
| Legal or compliance language detected | Flag for your review — do not respond |
|
||||
| Angry or escalation tone detected | Alert with suggested de-escalation response |
|
||||
| Email requires decision with financial impact | Queue for your review with context |
|
||||
| Unrecognized sender with urgent request | Flag as potential phishing or verify before acting |
|
||||
@@ -0,0 +1,35 @@
|
||||
# Recipe: Inquiry Triaging
|
||||
|
||||
Sorting the "tire kickers" from the "hot leads."
|
||||
|
||||
## Why
|
||||
|
||||
Not all leads are created equal. For every serious buyer, there are ten people who'll never purchase. Your time should go to the prospects most likely to close — this agent scores and routes inquiries so you only see the ones worth your attention.
|
||||
|
||||
## What
|
||||
|
||||
- Analyze incoming inquiries for buying signals
|
||||
- Score leads based on company size, budget mentions, urgency, and fit
|
||||
- Route hot leads to your calendar immediately
|
||||
- Nurture warm leads with automated sequences
|
||||
- Politely deflect poor-fit inquiries
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| HubSpot / Salesforce / Pipedrive | CRM and lead management |
|
||||
| Intercom / Drift / Crisp | Live chat and inquiry capture |
|
||||
| Calendly / Cal.com | Meeting scheduling for qualified leads |
|
||||
| Clearbit / Apollo | Company enrichment and firmographics |
|
||||
| Slack / Email | Hot lead alerts |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Enterprise lead detected (>500 employees) | Immediate alert with company brief and suggested approach |
|
||||
| Lead mentions competitor by name | Flag for competitive positioning response |
|
||||
| Urgent language detected ("need this week", "ASAP") | Fast-track to your calendar |
|
||||
| Lead asks question outside playbook | Queue for your personal response |
|
||||
| High-value lead goes cold (no response in 48h) | Alert with re-engagement suggestions |
|
||||
@@ -0,0 +1,36 @@
|
||||
# Recipe: Invoicing & Collections
|
||||
|
||||
Sending out bills and—more importantly—politely chasing down the people who haven't paid them.
|
||||
|
||||
## Why
|
||||
|
||||
Cash flow is oxygen. But chasing invoices is awkward and time-consuming. This agent handles the uncomfortable job of asking for money — sending invoices on time, following up persistently but politely, and only escalating when the situation requires your personal touch.
|
||||
|
||||
## What
|
||||
|
||||
- Generate and send invoices on schedule
|
||||
- Track payment status across all outstanding invoices
|
||||
- Send automated payment reminders (friendly → firm → final)
|
||||
- Reconcile payments with bank transactions
|
||||
- Report on AR aging and cash flow projections
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| QuickBooks / Xero / FreshBooks | Invoicing and accounting |
|
||||
| Stripe / PayPal | Payment processing and status |
|
||||
| Plaid / Mercury | Bank transaction reconciliation |
|
||||
| Slack / Email | Collection alerts and summaries |
|
||||
| Google Sheets | AR aging reports and forecasts |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Invoice overdue >30 days | Escalate with payment history and suggested next steps |
|
||||
| Large invoice (>$5k) goes overdue | Alert immediately with client context |
|
||||
| Client disputes invoice | Flag for your review with dispute details |
|
||||
| Payment bounces or fails | Alert with retry options |
|
||||
| Client requests payment plan | Queue for your approval with suggested terms |
|
||||
| Collections threshold reached (>60 days) | Recommend formal collection action |
|
||||
@@ -0,0 +1,38 @@
|
||||
# Recipe: Issue Triaging
|
||||
|
||||
Categorizing and routing incoming bug reports by severity and type.
|
||||
|
||||
## Why
|
||||
|
||||
Not all bugs are equal. A typo in the footer can wait; a checkout failure cannot. This agent sorts the incoming chaos — categorizing issues by severity, gathering reproduction steps, and routing them to the right person — so critical bugs get fixed fast and minor ones don't clog the queue.
|
||||
|
||||
## What
|
||||
|
||||
- Categorize incoming issues by type (bug, feature request, question)
|
||||
- Assess severity based on impact and frequency
|
||||
- Gather reproduction steps and environment details
|
||||
- Route to appropriate team member or queue
|
||||
- Track issue lifecycle from report to resolution
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| GitHub Issues / Linear / Jira | Issue tracking |
|
||||
| Sentry / LogRocket / Datadog | Error context and logs |
|
||||
| Slack | Triage notifications and discussion |
|
||||
| Intercom / Zendesk | Customer-reported issue intake |
|
||||
| Notion | Issue categorization rules and playbooks |
|
||||
| PagerDuty | Critical issue escalation |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Security vulnerability reported | Immediate escalation, mark as confidential |
|
||||
| Data loss or corruption issue | P0 alert with all available context |
|
||||
| Issue affecting >10% of users | Escalate as incident with scope estimate |
|
||||
| Issue unsolvable within 30 minutes | Escalate with what was tried and ruled out |
|
||||
| Customer-reported issue from enterprise account | Priority flag regardless of severity assessment |
|
||||
| Same issue reported 5+ times in 24h | Alert as emerging pattern, consider incident |
|
||||
| Issue requires architecture decision | Queue for tech lead review |
|
||||
@@ -1,156 +0,0 @@
|
||||
# Recipe: Marketing Content Agent
|
||||
|
||||
A multi-channel marketing content generator. Given a product description and target audience, this agent analyzes the audience, generates tailored copy for multiple channels, and produces A/B variants.
|
||||
|
||||
## Goal
|
||||
|
||||
```
|
||||
Name: Marketing Content Generator
|
||||
Description: Generate targeted marketing content across multiple channels
|
||||
for a given product and audience.
|
||||
|
||||
Success criteria:
|
||||
- Audience analysis is produced with demographics and pain points
|
||||
- At least 2 channel-specific content pieces are generated
|
||||
- A/B variants are provided for each piece
|
||||
- All content aligns with the specified brand voice
|
||||
|
||||
Constraints:
|
||||
- (hard) No competitor brand names in generated content
|
||||
- (soft) Content should be under 280 characters for social media channels
|
||||
```
|
||||
|
||||
## Input / Output
|
||||
|
||||
**Input:**
|
||||
- `product_description` (str) — What the product is and does
|
||||
- `target_audience` (str) — Who the content is for
|
||||
- `brand_voice` (str) — Tone and style guidelines (e.g., "professional but approachable")
|
||||
- `channels` (list[str]) — Target channels, e.g. `["email", "twitter", "linkedin"]`
|
||||
|
||||
**Output:**
|
||||
- `audience_analysis` (dict) — Demographics, pain points, motivations
|
||||
- `content` (list[dict]) — Per-channel content with A/B variants
|
||||
|
||||
## Workflow
|
||||
|
||||
```
|
||||
[analyze_audience] → [generate_content] → [review_and_refine]
|
||||
|
|
||||
(conditional)
|
||||
|
|
||||
needs_revision == True → [generate_content]
|
||||
needs_revision == False → (done)
|
||||
```
|
||||
|
||||
## Nodes
|
||||
|
||||
### 1. analyze_audience
|
||||
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| Type | `llm_generate` |
|
||||
| Input keys | `product_description`, `target_audience` |
|
||||
| Output keys | `audience_analysis` |
|
||||
| Tools | None |
|
||||
|
||||
**System prompt:**
|
||||
```
|
||||
You are a marketing strategist. Analyze the target audience for a product.
|
||||
|
||||
Product: {product_description}
|
||||
Target audience: {target_audience}
|
||||
|
||||
Produce a structured analysis in JSON:
|
||||
{{
|
||||
"audience_analysis": {{
|
||||
"demographics": "...",
|
||||
"pain_points": ["..."],
|
||||
"motivations": ["..."],
|
||||
"preferred_channels": ["..."],
|
||||
"messaging_angle": "..."
|
||||
}}
|
||||
}}
|
||||
```
|
||||
|
||||
### 2. generate_content
|
||||
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| Type | `llm_generate` |
|
||||
| Input keys | `product_description`, `audience_analysis`, `brand_voice`, `channels` |
|
||||
| Output keys | `content` |
|
||||
| Tools | None |
|
||||
|
||||
**System prompt:**
|
||||
```
|
||||
You are a marketing copywriter. Generate content for each channel.
|
||||
|
||||
Product: {product_description}
|
||||
Audience analysis: {audience_analysis}
|
||||
Brand voice: {brand_voice}
|
||||
Channels: {channels}
|
||||
|
||||
For each channel, produce two variants (A and B).
|
||||
|
||||
Output as JSON:
|
||||
{{
|
||||
"content": [
|
||||
{{
|
||||
"channel": "twitter",
|
||||
"variant_a": "...",
|
||||
"variant_b": "..."
|
||||
}}
|
||||
]
|
||||
}}
|
||||
```
|
||||
|
||||
### 3. review_and_refine
|
||||
|
||||
| Field | Value |
|
||||
|-------|-------|
|
||||
| Type | `llm_generate` |
|
||||
| Input keys | `content`, `brand_voice` |
|
||||
| Output keys | `content`, `needs_revision` |
|
||||
| Tools | None |
|
||||
|
||||
**System prompt:**
|
||||
```
|
||||
You are a senior marketing editor. Review the following content for brand
|
||||
voice alignment, clarity, and channel appropriateness.
|
||||
|
||||
Content: {content}
|
||||
Brand voice: {brand_voice}
|
||||
|
||||
If any piece needs revision, fix it and set needs_revision to true.
|
||||
If everything looks good, return the content unchanged with needs_revision false.
|
||||
|
||||
Output as JSON:
|
||||
{{
|
||||
"content": [...],
|
||||
"needs_revision": false
|
||||
}}
|
||||
```
|
||||
|
||||
## Edges
|
||||
|
||||
| Source | Target | Condition | Priority |
|
||||
|--------|--------|-----------|----------|
|
||||
| analyze_audience | generate_content | `on_success` | 0 |
|
||||
| generate_content | review_and_refine | `on_success` | 0 |
|
||||
| review_and_refine | generate_content | `conditional: needs_revision == True` | 10 |
|
||||
|
||||
The `review_and_refine → generate_content` loop has higher priority so it's checked first. If `needs_revision` is false, execution ends at `review_and_refine` (terminal node).
|
||||
|
||||
## Tools
|
||||
|
||||
This recipe uses no external tools — all nodes are `llm_generate`. To extend it, consider adding:
|
||||
- A web search tool for competitive analysis in the `analyze_audience` node
|
||||
- A URL shortener tool for social media content
|
||||
- An image generation tool for visual content variants
|
||||
|
||||
## Variations
|
||||
|
||||
- **Single-channel mode**: Remove the `channels` input and hardcode to one channel for simpler output
|
||||
- **With approval gate**: Add a `human_input` node between `review_and_refine` and the terminal to require human sign-off
|
||||
- **With analytics**: Add a `function` node that logs generated content to a tracking system
|
||||
@@ -0,0 +1,61 @@
|
||||
# Recipe: News Jacking
|
||||
|
||||
Automated personalized outreach triggered by real-time company news.
|
||||
|
||||
## Why
|
||||
|
||||
Cold outreach gets ignored. But when you reference something that *just* happened to someone — a funding round, a podcast appearance, a new hire announcement — suddenly you're not a stranger, you're someone who pays attention. The problem is that manually monitoring hundreds of leads for these moments is impossible. This agent does the watching so you can do the reaching.
|
||||
|
||||
## What
|
||||
|
||||
- Monitor news sources for lead companies (LinkedIn, Google News, TechCrunch, press releases)
|
||||
- Detect trigger events: funding announcements, executive hires, podcast appearances, product launches, awards
|
||||
- Draft hyper-personalized outreach referencing the specific event
|
||||
- Queue emails for human review or auto-send based on confidence score
|
||||
- Track response rates by trigger type to optimize over time
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| Google News API / NewsAPI | Monitor company mentions |
|
||||
| LinkedIn Sales Navigator | Track company updates and job changes |
|
||||
| Apollo / Clearbit | Enrich lead data and find contact info |
|
||||
| Gmail / Outlook | Send personalized outreach |
|
||||
| CRM (HubSpot, Salesforce) | Log outreach and track responses |
|
||||
| Slack | Notify when high-value triggers detected |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| High-value lead (enterprise, known target account) | Queue for human review before sending |
|
||||
| Confidence score < 80% on event details | Flag for verification — do NOT auto-send |
|
||||
| Unable to verify news source | Skip outreach, log for manual review |
|
||||
| Lead responds | Alert immediately, pause automation for this lead |
|
||||
| Bounce or unsubscribe | Remove from automation, update CRM |
|
||||
| Same lead triggered multiple times in 30 days | Consolidate into single touchpoint |
|
||||
|
||||
## Guardrails
|
||||
|
||||
This agent has high "spam potential" if not configured carefully:
|
||||
|
||||
| Risk | Mitigation |
|
||||
|------|------------|
|
||||
| Hallucinated event details | Always include source URL, verify against multiple sources |
|
||||
| Tone-deaf timing (layoffs, bad news) | Filter out negative events, require human review for ambiguous |
|
||||
| Over-automation feels robotic | Randomize send times, vary templates, cap frequency per lead |
|
||||
| Referencing wrong person/company | Double-check entity resolution before drafting |
|
||||
|
||||
## Example Flow
|
||||
|
||||
```
|
||||
1. Agent detects: "[Lead's Company] raises $5M Series A" on TechCrunch
|
||||
2. Enriches: Finds CEO email via Apollo, confirms company match
|
||||
3. Drafts: "Hey [Name], congrats on the Series A! Saw the TechCrunch piece
|
||||
this morning. Scaling the team post-raise is always a ride — we help
|
||||
[Company Type] with [Value Prop]..."
|
||||
4. Scores: 92% confidence (verified source, exact name match)
|
||||
5. Routes: Auto-queue for send at 9:15 AM recipient's timezone
|
||||
6. Logs: Records in CRM with trigger type "funding_announcement"
|
||||
```
|
||||
@@ -0,0 +1,35 @@
|
||||
# Recipe: Newsletter Production
|
||||
|
||||
Taking your raw ideas or voice memos and turning them into a polished weekly email.
|
||||
|
||||
## Why
|
||||
|
||||
Your audience wants to hear from you, not your ghostwriter. But you don't have 4 hours to craft the perfect newsletter. This agent captures your voice from quick inputs — voice memos, bullet points, Slack messages — and transforms them into publish-ready emails that sound like you.
|
||||
|
||||
## What
|
||||
|
||||
- Ingest raw content (voice memos, notes, bullet points)
|
||||
- Draft newsletter in your voice and style
|
||||
- Format with headers, links, and CTAs
|
||||
- Schedule for optimal send time
|
||||
- Track open rates and click-through for future optimization
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| Otter.ai / Whisper | Voice memo transcription |
|
||||
| Notion / Google Docs | Draft storage and editing |
|
||||
| Mailchimp / ConvertKit / Beehiiv | Newsletter distribution |
|
||||
| Slack | Content intake and approvals |
|
||||
| Google Analytics / UTM tracking | Performance measurement |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Draft ready for review | Send preview link and summary for your approval |
|
||||
| Unusually low open rate on last send | Alert with analysis and A/B test suggestions |
|
||||
| Subscriber replies with question | Forward replies that need your expertise |
|
||||
| Unsubscribe spike after send | Flag with content analysis — what went wrong? |
|
||||
| Sponsor or partnership mention required | Queue for your review before sending |
|
||||
@@ -0,0 +1,36 @@
|
||||
# Recipe: Onboarding Assistance
|
||||
|
||||
Helping new clients set up their accounts or sending out "Welcome" kits.
|
||||
|
||||
## Why
|
||||
|
||||
First impressions stick. A smooth onboarding experience sets the tone for the entire customer relationship — but walking each new client through the same steps is a time sink. This agent delivers a white-glove experience at scale, making every customer feel personally welcomed.
|
||||
|
||||
## What
|
||||
|
||||
- Send personalized welcome emails and kits
|
||||
- Guide clients through account setup step-by-step
|
||||
- Answer common "getting started" questions
|
||||
- Track onboarding completion and milestone progress
|
||||
- Follow up on incomplete setups
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| Intercom / Customer.io | Onboarding email sequences |
|
||||
| Notion / Loom | Tutorial content and documentation |
|
||||
| Calendly | Onboarding call scheduling |
|
||||
| Slack / Email | Progress updates and escalations |
|
||||
| Your product's API | Track setup completion status |
|
||||
| Typeform / Tally | Onboarding surveys and data collection |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Client stuck on setup >48 hours | Alert with where they're stuck and offer to schedule call |
|
||||
| Technical blocker during setup | Route to support with context already gathered |
|
||||
| High-value client starts onboarding | Notify so you can send personal welcome |
|
||||
| Client expresses frustration | Immediate flag for human intervention |
|
||||
| Onboarding incomplete after 7 days | Escalate with churn risk assessment |
|
||||
@@ -0,0 +1,37 @@
|
||||
# Recipe: Quality Assurance (QA)
|
||||
|
||||
Testing new features or links before they go live to ensure nothing is broken.
|
||||
|
||||
## Why
|
||||
|
||||
Broken features kill trust. One bad deploy can undo months of goodwill with your users. This agent runs systematic checks before anything goes live — catching the broken links, form errors, and edge cases that would otherwise reach your customers first.
|
||||
|
||||
## What
|
||||
|
||||
- Run automated test suites before deploys
|
||||
- Manually verify critical user flows (signup, checkout, core features)
|
||||
- Check all links for 404s and broken redirects
|
||||
- Test across browsers and device sizes
|
||||
- Verify integrations are responding correctly
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| GitHub Actions / CircleCI | CI/CD pipeline integration |
|
||||
| Playwright / Cypress / Selenium | Automated browser testing |
|
||||
| BrowserStack / LambdaTest | Cross-browser testing |
|
||||
| Checkly / Uptrends | Synthetic monitoring |
|
||||
| Slack / PagerDuty | Test failure alerts |
|
||||
| Linear / Jira | Bug ticket creation |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Critical test fails (auth, checkout, data) | Block deploy, alert immediately with failure details |
|
||||
| Flaky test (passes sometimes, fails others) | Flag for investigation but don't block |
|
||||
| New feature breaks existing functionality | Alert with regression details and affected areas |
|
||||
| Performance degradation detected | Flag with before/after metrics |
|
||||
| Security scan finds vulnerability | Immediate escalation with severity and remediation |
|
||||
| All tests pass but something "feels off" | Document observation and flag for human review |
|
||||
@@ -0,0 +1,34 @@
|
||||
# Recipe: Social Media Management
|
||||
|
||||
Scheduling posts, replying to comments, and monitoring trends.
|
||||
|
||||
## Why
|
||||
|
||||
Consistency kills on social media — but it also kills your time. One "quick post" turns into an hour of tweaking copy, finding hashtags, and responding to comments. This agent maintains your social presence so you stay visible without staying glued to your phone.
|
||||
|
||||
## What
|
||||
|
||||
- Schedule posts across platforms (Twitter/X, LinkedIn, Instagram, Facebook)
|
||||
- Reply to comments and DMs with on-brand responses
|
||||
- Monitor trending topics and hashtags in your niche
|
||||
- Track engagement metrics and surface what's working
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| Buffer / Hootsuite / Later | Post scheduling and publishing |
|
||||
| Twitter/X API | Direct posting and engagement |
|
||||
| LinkedIn API | Professional network management |
|
||||
| Meta Graph API | Facebook/Instagram management |
|
||||
| Slack | Notifications and escalations |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Post goes viral (>10x normal engagement) | Alert with engagement stats and suggested follow-up content |
|
||||
| Negative viral moment | Immediate alert — do NOT auto-respond, queue for human review |
|
||||
| Influencer or press mentions you | Flag for personal response opportunity |
|
||||
| Controversial topic trending in your space | Alert before posting scheduled content that might be tone-deaf |
|
||||
| DM from verified account or known lead | Route directly to you |
|
||||
@@ -0,0 +1,37 @@
|
||||
# Recipe: Support Troubleshooting
|
||||
|
||||
Handling "Level 1" tech support for your platform or website.
|
||||
|
||||
## Why
|
||||
|
||||
Most support tickets are the same 20 questions over and over: password resets, access issues, "how do I..." questions. You don't need to answer these — but someone does. This agent handles the repetitive tier-1 support so your users get fast answers and you get your time back.
|
||||
|
||||
## What
|
||||
|
||||
- Handle password resets and account access issues
|
||||
- Answer common "how do I" questions from the knowledge base
|
||||
- Walk users through basic setup and configuration
|
||||
- Collect diagnostic information for complex issues
|
||||
- Log all support interactions for pattern analysis
|
||||
|
||||
## Integrations
|
||||
|
||||
| Platform | Purpose |
|
||||
|----------|---------|
|
||||
| Intercom / Zendesk / Freshdesk | Support ticket management |
|
||||
| Notion / Confluence | Knowledge base for answers |
|
||||
| Slack | Internal escalation channel |
|
||||
| Your product's API | Account status, password reset triggers |
|
||||
| LogRocket / FullStory | Session replay for debugging |
|
||||
| PagerDuty | Urgent escalation routing |
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Trigger | Action |
|
||||
|---------|--------|
|
||||
| Issue not resolved within 30 minutes | Escalate with full context gathered |
|
||||
| User expresses frustration or anger | Immediate handoff to human with de-escalation note |
|
||||
| Security-related issue (account compromise, data concern) | Escalate immediately, do not attempt to resolve |
|
||||
| Bug discovered during troubleshooting | Create ticket and escalate to engineering |
|
||||
| VIP or enterprise customer | Flag for priority handling regardless of issue |
|
||||
| Same issue reported by 3+ users | Alert as potential systemic problem |
|
||||
@@ -11,6 +11,7 @@ template_name/
|
||||
├── __init__.py # Package exports
|
||||
├── __main__.py # CLI entry point
|
||||
├── agent.py # Goal, edges, graph spec, agent class
|
||||
├── agent.json # Agent definition (used by build-from-template)
|
||||
├── config.py # Runtime configuration
|
||||
├── nodes/
|
||||
│ └── __init__.py # Node definitions (NodeSpec instances)
|
||||
@@ -19,20 +20,27 @@ template_name/
|
||||
|
||||
## How to use a template
|
||||
|
||||
### Option 1: Build from template (recommended)
|
||||
|
||||
Use the `/hive-create` skill and select "From a template" to interactively pick a template, customize the goal/nodes/graph, and export a new agent.
|
||||
|
||||
### Option 2: Manual copy
|
||||
|
||||
```bash
|
||||
# 1. Copy to your exports directory
|
||||
cp -r examples/templates/marketing_agent exports/my_marketing_agent
|
||||
cp -r examples/templates/deep_research_agent exports/my_research_agent
|
||||
|
||||
# 2. Update the module references in __main__.py and __init__.py
|
||||
|
||||
# 3. Customize goal, nodes, edges, and prompts
|
||||
|
||||
# 4. Run it
|
||||
uv run python -m exports.my_marketing_agent --input '{"product_description": "..."}'
|
||||
uv run python -m exports.my_research_agent --input '{"topic": "..."}'
|
||||
```
|
||||
|
||||
## Available templates
|
||||
|
||||
| Template | Description |
|
||||
|----------|-------------|
|
||||
| [marketing_agent](marketing_agent/) | Multi-channel marketing content generator with audience analysis, content generation, and editorial review nodes |
|
||||
| [deep_research_agent](deep_research_agent/) | Interactive research agent that searches diverse sources, evaluates findings with user checkpoints, and produces a cited HTML report |
|
||||
| [tech_news_reporter](tech_news_reporter/) | Researches the latest technology and AI news from the web and produces a well-organized report |
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
# Deep Research Agent
|
||||
|
||||
A template agent designed to perform comprehensive research on a specific topic and generate a structured report.
|
||||
|
||||
## Usage
|
||||
|
||||
Run the agent using the following command:
|
||||
|
||||
### Linux / Mac
|
||||
```bash
|
||||
PYTHONPATH=core:examples/templates python -m deep_research_agent run --mock --topic "Artificial Intelligence"
```
|
||||
|
||||
### Windows
|
||||
```powershell
|
||||
$env:PYTHONPATH="core;examples\templates"
|
||||
python -m deep_research_agent run --mock --topic "Artificial Intelligence"
```
|
||||
|
||||
## Options
|
||||
|
||||
- `-t, --topic`: The research topic (required).
|
||||
- `--mock`: Run without calling real LLM APIs (simulated execution).
|
||||
- `--help`: Show all available options.
|
||||
@@ -207,17 +207,8 @@ async def _interactive_shell(verbose=False):
|
||||
|
||||
if result.success:
|
||||
output = result.output
|
||||
if "report_content" in output:
|
||||
click.echo("\n--- Report ---\n")
|
||||
click.echo(output["report_content"])
|
||||
click.echo("\n")
|
||||
if "references" in output:
|
||||
click.echo("--- References ---\n")
|
||||
for ref in output.get("references", []):
|
||||
click.echo(
|
||||
f" [{ref.get('number', '?')}] {ref.get('title', '')} - {ref.get('url', '')}"
|
||||
)
|
||||
click.echo("\n")
|
||||
status = output.get("delivery_status", "unknown")
|
||||
click.echo(f"\nResearch complete (status: {status})\n")
|
||||
else:
|
||||
click.echo(f"\nResearch failed: {result.error}\n")
|
||||
|
||||
|
||||
@@ -0,0 +1,276 @@
|
||||
{
|
||||
"agent": {
|
||||
"id": "deep_research_agent",
|
||||
"name": "Deep Research Agent",
|
||||
"version": "1.0.0",
|
||||
"description": "Interactive research agent that rigorously investigates topics through multi-source search, quality evaluation, and synthesis - with TUI conversation at key checkpoints for user guidance and feedback."
|
||||
},
|
||||
"graph": {
|
||||
"id": "deep-research-agent-graph",
|
||||
"goal_id": "rigorous-interactive-research",
|
||||
"version": "1.0.0",
|
||||
"entry_node": "intake",
|
||||
"entry_points": {
|
||||
"start": "intake"
|
||||
},
|
||||
"pause_nodes": [],
|
||||
"terminal_nodes": [
|
||||
"report"
|
||||
],
|
||||
"nodes": [
|
||||
{
|
||||
"id": "intake",
|
||||
"name": "Research Intake",
|
||||
"description": "Discuss the research topic with the user, clarify scope, and confirm direction",
|
||||
"node_type": "event_loop",
|
||||
"input_keys": [
|
||||
"topic"
|
||||
],
|
||||
"output_keys": [
|
||||
"research_brief"
|
||||
],
|
||||
"nullable_output_keys": [],
|
||||
"input_schema": {},
|
||||
"output_schema": {},
|
||||
"system_prompt": "You are a research intake specialist. The user wants to research a topic.\nHave a brief conversation to clarify what they need.\n\n**STEP 1 \u2014 Read and respond (text only, NO tool calls):**\n1. Read the topic provided\n2. If it's vague, ask 1-2 clarifying questions (scope, angle, depth)\n3. If it's already clear, confirm your understanding and ask the user to confirm\n\nKeep it short. Don't over-ask.\n\nAfter your message, call ask_user() to wait for the user's response.\n\n**STEP 2 \u2014 After the user confirms, call set_output:**\n- set_output(\"research_brief\", \"A clear paragraph describing exactly what to research, what questions to answer, what scope to cover, and how deep to go.\")",
|
||||
"tools": [],
|
||||
"model": null,
|
||||
"function": null,
|
||||
"routes": {},
|
||||
"max_retries": 3,
|
||||
"retry_on": [],
|
||||
"max_node_visits": 1,
|
||||
"output_model": null,
|
||||
"max_validation_retries": 2,
|
||||
"client_facing": true
|
||||
},
|
||||
{
|
||||
"id": "research",
|
||||
"name": "Research",
|
||||
"description": "Search the web, fetch source content, and compile findings",
|
||||
"node_type": "event_loop",
|
||||
"input_keys": [
|
||||
"research_brief",
|
||||
"feedback"
|
||||
],
|
||||
"output_keys": [
|
||||
"findings",
|
||||
"sources",
|
||||
"gaps"
|
||||
],
|
||||
"nullable_output_keys": [
|
||||
"feedback"
|
||||
],
|
||||
"input_schema": {},
|
||||
"output_schema": {},
|
||||
"system_prompt": "You are a research agent. Given a research brief, find and analyze sources.\n\nIf feedback is provided, this is a follow-up round \u2014 focus on the gaps identified.\n\nWork in phases:\n1. **Search**: Use web_search with 3-5 diverse queries covering different angles.\n Prioritize authoritative sources (.edu, .gov, established publications).\n2. **Fetch**: Use web_scrape on the most promising URLs (aim for 5-8 sources).\n Skip URLs that fail. Extract the substantive content.\n3. **Analyze**: Review what you've collected. Identify key findings, themes,\n and any contradictions between sources.\n\nImportant:\n- Work in batches of 3-4 tool calls at a time to manage context\n- After each batch, assess whether you have enough material\n- Prefer quality over quantity \u2014 5 good sources beat 15 thin ones\n- Track which URL each finding comes from (you'll need citations later)\n\nWhen done, use set_output:\n- set_output(\"findings\", \"Structured summary: key findings with source URLs for each claim. Include themes, contradictions, and confidence levels.\")\n- set_output(\"sources\", [{\"url\": \"...\", \"title\": \"...\", \"summary\": \"...\"}])\n- set_output(\"gaps\", \"What aspects of the research brief are NOT well-covered yet, if any.\")",
|
||||
"tools": [
|
||||
"web_search",
|
||||
"web_scrape",
|
||||
"load_data",
|
||||
"save_data",
|
||||
"list_data_files"
|
||||
],
|
||||
"model": null,
|
||||
"function": null,
|
||||
"routes": {},
|
||||
"max_retries": 3,
|
||||
"retry_on": [],
|
||||
"max_node_visits": 3,
|
||||
"output_model": null,
|
||||
"max_validation_retries": 2,
|
||||
"client_facing": false
|
||||
},
|
||||
{
|
||||
"id": "review",
|
||||
"name": "Review Findings",
|
||||
"description": "Present findings to user and decide whether to research more or write the report",
|
||||
"node_type": "event_loop",
|
||||
"input_keys": [
|
||||
"findings",
|
||||
"sources",
|
||||
"gaps",
|
||||
"research_brief"
|
||||
],
|
||||
"output_keys": [
|
||||
"needs_more_research",
|
||||
"feedback"
|
||||
],
|
||||
"nullable_output_keys": [],
|
||||
"input_schema": {},
|
||||
"output_schema": {},
|
||||
"system_prompt": "Present the research findings to the user clearly and concisely.\n\n**STEP 1 \u2014 Present (your first message, text only, NO tool calls):**\n1. **Summary** (2-3 sentences of what was found)\n2. **Key Findings** (bulleted, with confidence levels)\n3. **Sources Used** (count and quality assessment)\n4. **Gaps** (what's still unclear or under-covered)\n\nEnd by asking: Are they satisfied, or do they want deeper research? Should we proceed to writing the final report?\n\nAfter your presentation, call ask_user() to wait for the user's response.\n\n**STEP 2 \u2014 After the user responds, call set_output:**\n- set_output(\"needs_more_research\", \"true\") \u2014 if they want more\n- set_output(\"needs_more_research\", \"false\") \u2014 if they're satisfied\n- set_output(\"feedback\", \"What the user wants explored further, or empty string\")",
|
||||
"tools": [],
|
||||
"model": null,
|
||||
"function": null,
|
||||
"routes": {},
|
||||
"max_retries": 3,
|
||||
"retry_on": [],
|
||||
"max_node_visits": 3,
|
||||
"output_model": null,
|
||||
"max_validation_retries": 2,
|
||||
"client_facing": true
|
||||
},
|
||||
{
|
||||
"id": "report",
|
||||
"name": "Write & Deliver Report",
|
||||
"description": "Write a cited HTML report from the findings and present it to the user",
|
||||
"node_type": "event_loop",
|
||||
"input_keys": [
|
||||
"findings",
|
||||
"sources",
|
||||
"research_brief"
|
||||
],
|
||||
"output_keys": [
|
||||
"delivery_status"
|
||||
],
|
||||
"nullable_output_keys": [],
|
||||
"input_schema": {},
|
||||
"output_schema": {},
|
||||
"system_prompt": "Write a comprehensive research report as an HTML file and present it to the user.\n\n**STEP 1 \u2014 Write the HTML report (tool calls, NO text to user yet):**\n\n1. Compose a complete, self-contained HTML document with embedded CSS styling.\n Use a clean, readable design: max-width container, pleasant typography,\n numbered citation links, a table of contents, and a references section.\n\n Report structure inside the HTML:\n - Title & date\n - Executive Summary (2-3 paragraphs)\n - Table of Contents\n - Findings (organized by theme, with [n] citation links)\n - Analysis (synthesis, implications, areas of debate)\n - Conclusion (key takeaways, confidence assessment)\n - References (numbered list with clickable URLs)\n\n Requirements:\n - Every factual claim must cite its source with [n] notation\n - Be objective \u2014 present multiple viewpoints where sources disagree\n - Distinguish well-supported conclusions from speculation\n - Answer the original research questions from the brief\n\n2. Save the HTML file:\n save_data(filename=\"report.html\", data=<your_html>)\n\n3. Get the clickable link:\n serve_file_to_user(filename=\"report.html\", label=\"Research Report\")\n\n**STEP 2 \u2014 Present the link to the user (text only, NO tool calls):**\n\nTell the user the report is ready and include the file:// URI from\nserve_file_to_user so they can click it to open. Give a brief summary\nof what the report covers. Ask if they have questions.\n\nAfter presenting the link, call ask_user() to wait for the user's response.\n\n**STEP 3 \u2014 After the user responds:**\n- Answer follow-up questions from the research material\n- Call ask_user() again if they might have more questions\n- When the user is satisfied: set_output(\"delivery_status\", \"completed\")",
|
||||
"tools": [
|
||||
"save_data",
|
||||
"serve_file_to_user",
|
||||
"load_data",
|
||||
"list_data_files"
|
||||
],
|
||||
"model": null,
|
||||
"function": null,
|
||||
"routes": {},
|
||||
"max_retries": 3,
|
||||
"retry_on": [],
|
||||
"max_node_visits": 1,
|
||||
"output_model": null,
|
||||
"max_validation_retries": 2,
|
||||
"client_facing": true
|
||||
}
|
||||
],
|
||||
"edges": [
|
||||
{
|
||||
"id": "intake-to-research",
|
||||
"source": "intake",
|
||||
"target": "research",
|
||||
"condition": "on_success",
|
||||
"condition_expr": null,
|
||||
"priority": 1,
|
||||
"input_mapping": {}
|
||||
},
|
||||
{
|
||||
"id": "research-to-review",
|
||||
"source": "research",
|
||||
"target": "review",
|
||||
"condition": "on_success",
|
||||
"condition_expr": null,
|
||||
"priority": 1,
|
||||
"input_mapping": {}
|
||||
},
|
||||
{
|
||||
"id": "review-to-research-feedback",
|
||||
"source": "review",
|
||||
"target": "research",
|
||||
"condition": "conditional",
|
||||
"condition_expr": "str(needs_more_research).lower() == 'true'",
|
||||
"priority": 2,
|
||||
"input_mapping": {}
|
||||
},
|
||||
{
|
||||
"id": "review-to-report",
|
||||
"source": "review",
|
||||
"target": "report",
|
||||
"condition": "conditional",
|
||||
"condition_expr": "str(needs_more_research).lower() != 'true'",
|
||||
"priority": 1,
|
||||
"input_mapping": {}
|
||||
}
|
||||
],
|
||||
"max_steps": 100,
|
||||
"max_retries_per_node": 3,
|
||||
"description": "Interactive research agent that rigorously investigates topics through multi-source search, quality evaluation, and synthesis - with TUI conversation at key checkpoints for user guidance and feedback.",
|
||||
"created_at": "2026-02-06T00:00:00.000000"
|
||||
},
|
||||
"goal": {
|
||||
"id": "rigorous-interactive-research",
|
||||
"name": "Rigorous Interactive Research",
|
||||
"description": "Research any topic by searching diverse sources, analyzing findings, and producing a cited report \u2014 with user checkpoints to guide direction.",
|
||||
"status": "draft",
|
||||
"success_criteria": [
|
||||
{
|
||||
"id": "source-diversity",
|
||||
"description": "Use multiple diverse, authoritative sources",
|
||||
"metric": "source_count",
|
||||
"target": ">=5",
|
||||
"weight": 0.25,
|
||||
"met": false
|
||||
},
|
||||
{
|
||||
"id": "citation-coverage",
|
||||
"description": "Every factual claim in the report cites its source",
|
||||
"metric": "citation_coverage",
|
||||
"target": "100%",
|
||||
"weight": 0.25,
|
||||
"met": false
|
||||
},
|
||||
{
|
||||
"id": "user-satisfaction",
|
||||
"description": "User reviews findings before report generation",
|
||||
"metric": "user_approval",
|
||||
"target": "true",
|
||||
"weight": 0.25,
|
||||
"met": false
|
||||
},
|
||||
{
|
||||
"id": "report-completeness",
|
||||
"description": "Final report answers the original research questions",
|
||||
"metric": "question_coverage",
|
||||
"target": "90%",
|
||||
"weight": 0.25,
|
||||
"met": false
|
||||
}
|
||||
],
|
||||
"constraints": [
|
||||
{
|
||||
"id": "no-hallucination",
|
||||
"description": "Only include information found in fetched sources",
|
||||
"constraint_type": "quality",
|
||||
"category": "accuracy",
|
||||
"check": ""
|
||||
},
|
||||
{
|
||||
"id": "source-attribution",
|
||||
"description": "Every claim must cite its source with a numbered reference",
|
||||
"constraint_type": "quality",
|
||||
"category": "accuracy",
|
||||
"check": ""
|
||||
},
|
||||
{
|
||||
"id": "user-checkpoint",
|
||||
"description": "Present findings to the user before writing the final report",
|
||||
"constraint_type": "functional",
|
||||
"category": "interaction",
|
||||
"check": ""
|
||||
}
|
||||
],
|
||||
"context": {},
|
||||
"required_capabilities": [],
|
||||
"input_schema": {},
|
||||
"output_schema": {},
|
||||
"version": "1.0.0",
|
||||
"parent_version": null,
|
||||
"evolution_reason": null,
|
||||
"created_at": "2026-02-06 00:00:00.000000",
|
||||
"updated_at": "2026-02-06 00:00:00.000000"
|
||||
},
|
||||
"required_tools": [
|
||||
"list_data_files",
|
||||
"load_data",
|
||||
"save_data",
|
||||
"serve_file_to_user",
|
||||
"web_scrape",
|
||||
"web_search"
|
||||
],
|
||||
"metadata": {
|
||||
"created_at": "2026-02-06T00:00:00.000000",
|
||||
"node_count": 4,
|
||||
"edge_count": 4
|
||||
}
|
||||
}
|
||||
@@ -102,23 +102,23 @@ edges = [
|
||||
condition=EdgeCondition.ON_SUCCESS,
|
||||
priority=1,
|
||||
),
|
||||
# review -> research (feedback loop)
|
||||
# review -> research (feedback loop, checked first)
|
||||
EdgeSpec(
|
||||
id="review-to-research-feedback",
|
||||
source="review",
|
||||
target="research",
|
||||
condition=EdgeCondition.CONDITIONAL,
|
||||
condition_expr="needs_more_research == True",
|
||||
priority=1,
|
||||
condition_expr="str(needs_more_research).lower() == 'true'",
|
||||
priority=2,
|
||||
),
|
||||
# review -> report (user satisfied)
|
||||
# review -> report (complementary condition — proceed to report when no more research needed)
|
||||
EdgeSpec(
|
||||
id="review-to-report",
|
||||
source="review",
|
||||
target="report",
|
||||
condition=EdgeCondition.CONDITIONAL,
|
||||
condition_expr="needs_more_research == False",
|
||||
priority=2,
|
||||
condition_expr="str(needs_more_research).lower() != 'true'",
|
||||
priority=1,
|
||||
),
|
||||
]
|
||||
|
||||
@@ -241,9 +241,7 @@ class DeepResearchAgent:
|
||||
session_state=session_state,
|
||||
)
|
||||
|
||||
async def run(
|
||||
self, context: dict, session_state=None
|
||||
) -> ExecutionResult:
|
||||
async def run(self, context: dict, session_state=None) -> ExecutionResult:
|
||||
"""Run the agent (convenience method for single execution)."""
|
||||
await self.start()
|
||||
try:
|
||||
|
||||
@@ -1,33 +1,8 @@
|
||||
"""Runtime configuration."""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _load_preferred_model() -> str:
|
||||
"""Load preferred model from ~/.hive/configuration.json."""
|
||||
config_path = Path.home() / ".hive" / "configuration.json"
|
||||
if config_path.exists():
|
||||
try:
|
||||
with open(config_path) as f:
|
||||
config = json.load(f)
|
||||
llm = config.get("llm", {})
|
||||
if llm.get("provider") and llm.get("model"):
|
||||
return f"{llm['provider']}/{llm['model']}"
|
||||
except Exception:
|
||||
pass
|
||||
return "anthropic/claude-sonnet-4-20250514"
|
||||
|
||||
|
||||
@dataclass
|
||||
class RuntimeConfig:
|
||||
model: str = field(default_factory=_load_preferred_model)
|
||||
temperature: float = 0.7
|
||||
max_tokens: int = 40000
|
||||
api_key: str | None = None
|
||||
api_base: str | None = None
|
||||
from dataclasses import dataclass
|
||||
|
||||
from framework.config import RuntimeConfig
|
||||
|
||||
default_config = RuntimeConfig()
|
||||
|
||||
@@ -41,6 +16,11 @@ class AgentMetadata:
|
||||
"multi-source search, quality evaluation, and synthesis - with TUI conversation "
|
||||
"at key checkpoints for user guidance and feedback."
|
||||
)
|
||||
intro_message: str = (
|
||||
"Hi! I'm your deep research assistant. Tell me a topic and I'll investigate it "
|
||||
"thoroughly — searching multiple sources, evaluating quality, and synthesizing "
|
||||
"a comprehensive report. What would you like me to research?"
|
||||
)
|
||||
|
||||
|
||||
metadata = AgentMetadata()
|
||||
|
||||
@@ -23,6 +23,8 @@ Have a brief conversation to clarify what they need.
|
||||
|
||||
Keep it short. Don't over-ask.
|
||||
|
||||
After your message, call ask_user() to wait for the user's response.
|
||||
|
||||
**STEP 2 — After the user confirms, call set_output:**
|
||||
- set_output("research_brief", "A clear paragraph describing exactly what to research, \
|
||||
what questions to answer, what scope to cover, and how deep to go.")
|
||||
@@ -93,6 +95,8 @@ Present the research findings to the user clearly and concisely.
|
||||
End by asking: Are they satisfied, or do they want deeper research? \
|
||||
Should we proceed to writing the final report?
|
||||
|
||||
After your presentation, call ask_user() to wait for the user's response.
|
||||
|
||||
**STEP 2 — After the user responds, call set_output:**
|
||||
- set_output("needs_more_research", "true") — if they want more
|
||||
- set_output("needs_more_research", "false") — if they're satisfied
|
||||
@@ -147,8 +151,11 @@ Tell the user the report is ready and include the file:// URI from
|
||||
serve_file_to_user so they can click it to open. Give a brief summary
|
||||
of what the report covers. Ask if they have questions.
|
||||
|
||||
After presenting the link, call ask_user() to wait for the user's response.
|
||||
|
||||
**STEP 3 — After the user responds:**
|
||||
- Answer follow-up questions from the research material
|
||||
- Call ask_user() again if they might have more questions
|
||||
- When the user is satisfied: set_output("delivery_status", "completed")
|
||||
""",
|
||||
tools=["save_data", "serve_file_to_user", "load_data", "list_data_files"],
|
||||
|
||||
@@ -225,9 +225,7 @@ class TechNewsReporterAgent:
|
||||
session_state=session_state,
|
||||
)
|
||||
|
||||
async def run(
|
||||
self, context: dict, session_state=None
|
||||
) -> ExecutionResult:
|
||||
async def run(self, context: dict, session_state=None) -> ExecutionResult:
|
||||
"""Run the agent (convenience method for single execution)."""
|
||||
await self.start()
|
||||
try:
|
||||
|
||||
@@ -1,33 +1,8 @@
|
||||
"""Runtime configuration."""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _load_preferred_model() -> str:
|
||||
"""Load preferred model from ~/.hive/configuration.json."""
|
||||
config_path = Path.home() / ".hive" / "configuration.json"
|
||||
if config_path.exists():
|
||||
try:
|
||||
with open(config_path) as f:
|
||||
config = json.load(f)
|
||||
llm = config.get("llm", {})
|
||||
if llm.get("provider") and llm.get("model"):
|
||||
return f"{llm['provider']}/{llm['model']}"
|
||||
except Exception:
|
||||
pass
|
||||
return "anthropic/claude-sonnet-4-20250514"
|
||||
|
||||
|
||||
@dataclass
|
||||
class RuntimeConfig:
|
||||
model: str = field(default_factory=_load_preferred_model)
|
||||
temperature: float = 0.7
|
||||
max_tokens: int = 40000
|
||||
api_key: str | None = None
|
||||
api_base: str | None = None
|
||||
from dataclasses import dataclass
|
||||
|
||||
from framework.config import RuntimeConfig
|
||||
|
||||
default_config = RuntimeConfig()
|
||||
|
||||
@@ -41,6 +16,11 @@ class AgentMetadata:
|
||||
"summarize key stories, and produce a well-organized report "
|
||||
"for the user to read."
|
||||
)
|
||||
intro_message: str = (
|
||||
"Hi! I'm your tech news reporter. I'll search the web for the latest technology "
|
||||
"and AI news, then put together a clear summary for you. What topic or area "
|
||||
"should I cover?"
|
||||
)
|
||||
|
||||
|
||||
metadata = AgentMetadata()
|
||||
|
||||
@@ -69,10 +69,22 @@ You do NOT have web search — instead, scrape news directly from known sites.
|
||||
- Recency (past week)
|
||||
- Significance and diversity of topics
|
||||
|
||||
CRITICAL: Copy URLs EXACTLY as they appear in the "href" field of the scraped
|
||||
links. Do NOT reconstruct, guess, or modify URLs from memory. Use the verbatim
|
||||
href value from the web_scrape result.
|
||||
|
||||
3. For each selected article, use web_scrape with max_length=3000 on the
|
||||
individual article URL to get the content. Extract: title, source name,
|
||||
URL, publication date, a 2-3 sentence summary, and the main topic category.
|
||||
|
||||
4. **VERIFY LINKS** — Before producing your final output, verify each article URL
|
||||
by checking the web_scrape result you got in step 3:
|
||||
- If the scrape returned content successfully, the URL is verified — use it as-is.
|
||||
- If the scrape returned an error or the page was not found (404, timeout, etc.),
|
||||
go back to the front page links from step 1 and pick a different article URL
|
||||
to replace it. Scrape the replacement to confirm it works.
|
||||
- Only include articles whose URLs returned successful scrape results.
|
||||
|
||||
**Output format:**
|
||||
Use set_output("articles_data", <JSON string>) with this structure:
|
||||
```json
|
||||
@@ -94,12 +106,13 @@ Use set_output("articles_data", <JSON string>) with this structure:
|
||||
|
||||
**Rules:**
|
||||
- Only include REAL articles with REAL URLs you scraped. Never fabricate.
|
||||
- The "url" field MUST be a URL you successfully scraped. Never invent URLs.
|
||||
- Focus on news from the past week.
|
||||
- Aim for at least 3 distinct topic categories.
|
||||
- Keep summaries factual and concise.
|
||||
- If a site fails to load, skip it and move on to the next.
|
||||
- Always use max_length to limit scraped content (5000 for front pages, 3000 for articles).
|
||||
- Work in batches: scrape front pages first, then articles. Don't scrape everything at once.
|
||||
- Work in batches: scrape front pages first, then articles, then verify. Don't scrape everything at once.
|
||||
""",
|
||||
tools=["web_scrape"],
|
||||
)
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
# Twitter Outreach Agent
|
||||
|
||||
Personalized email outreach powered by Twitter/X research.
|
||||
|
||||
## What it does
|
||||
|
||||
1. **Intake** — Collects the target's Twitter handle, outreach purpose, and recipient email
|
||||
2. **Research** — Searches and scrapes the target's Twitter/X profile for bio, tweets, interests
|
||||
3. **Draft & Review** — Crafts a personalized email and presents it for your approval (with iteration)
|
||||
4. **Send** — Sends the approved email
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Validate the agent structure
|
||||
PYTHONPATH=core:exports uv run python -m twitter_outreach validate
|
||||
|
||||
# Show agent info
|
||||
PYTHONPATH=core:exports uv run python -m twitter_outreach info
|
||||
|
||||
# Run the workflow
|
||||
PYTHONPATH=core:exports uv run python -m twitter_outreach run
|
||||
|
||||
# Launch the TUI
|
||||
PYTHONPATH=core:exports uv run python -m twitter_outreach tui
|
||||
|
||||
# Interactive shell
|
||||
PYTHONPATH=core:exports uv run python -m twitter_outreach shell
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
intake → research → draft-review → send
|
||||
```
|
||||
|
||||
## Tools Used
|
||||
|
||||
- `web_search` — Search for Twitter profiles and public info
|
||||
- `web_scrape` — Read Twitter/X profile pages
|
||||
- `send_email` — Send the approved outreach email
|
||||
|
||||
## Nodes
|
||||
|
||||
| Node | Type | Client-Facing | Description |
|
||||
|------|------|:---:|-------------|
|
||||
| `intake` | event_loop | Yes | Collect target info from user |
|
||||
| `research` | event_loop | No | Research Twitter/X profile |
|
||||
| `draft-review` | event_loop | Yes | Draft email, iterate with user |
|
||||
| `send` | event_loop | No | Send approved email |
|
||||
|
||||
## Constraints
|
||||
|
||||
- **No Spam** — No spammy language, clickbait, or aggressive sales tactics
|
||||
- **Approval Required** — Never sends without explicit user approval
|
||||
- **Tone** — Professional, authentic, conversational
|
||||
- **Privacy** — Only uses publicly available information
|
||||
@@ -1,23 +0,0 @@
|
||||
"""
|
||||
Twitter Outreach Agent - Personalized email outreach powered by Twitter/X research.
|
||||
|
||||
Reads a target's Twitter/X profile, crafts a personalized outreach email
|
||||
referencing their specific activity, and sends it after user approval.
|
||||
"""
|
||||
|
||||
from .agent import TwitterOutreachAgent, default_agent, goal, nodes, edges
|
||||
from .config import RuntimeConfig, AgentMetadata, default_config, metadata
|
||||
|
||||
__version__ = "1.0.0"
|
||||
|
||||
__all__ = [
|
||||
"TwitterOutreachAgent",
|
||||
"default_agent",
|
||||
"goal",
|
||||
"nodes",
|
||||
"edges",
|
||||
"RuntimeConfig",
|
||||
"AgentMetadata",
|
||||
"default_config",
|
||||
"metadata",
|
||||
]
|
||||
@@ -1,206 +0,0 @@
|
||||
"""
|
||||
CLI entry point for Twitter Outreach Agent.
|
||||
|
||||
Uses AgentRuntime for TUI support with client-facing interaction.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import click
|
||||
|
||||
from .agent import default_agent, TwitterOutreachAgent
|
||||
|
||||
|
||||
def setup_logging(verbose=False, debug=False):
|
||||
"""Configure logging for execution visibility."""
|
||||
if debug:
|
||||
level, fmt = logging.DEBUG, "%(asctime)s %(name)s: %(message)s"
|
||||
elif verbose:
|
||||
level, fmt = logging.INFO, "%(message)s"
|
||||
else:
|
||||
level, fmt = logging.WARNING, "%(levelname)s: %(message)s"
|
||||
logging.basicConfig(level=level, format=fmt, stream=sys.stderr)
|
||||
logging.getLogger("framework").setLevel(level)
|
||||
|
||||
|
||||
@click.group()
|
||||
@click.version_option(version="1.0.0")
|
||||
def cli():
|
||||
"""Twitter Outreach Agent - Personalized email outreach powered by Twitter/X research."""
|
||||
pass
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
|
||||
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
|
||||
@click.option("--debug", is_flag=True, help="Show debug logging")
|
||||
def run(quiet, verbose, debug):
|
||||
"""Execute the outreach workflow."""
|
||||
if not quiet:
|
||||
setup_logging(verbose=verbose, debug=debug)
|
||||
|
||||
result = asyncio.run(default_agent.run({}))
|
||||
|
||||
output_data = {
|
||||
"success": result.success,
|
||||
"steps_executed": result.steps_executed,
|
||||
"output": result.output,
|
||||
}
|
||||
if result.error:
|
||||
output_data["error"] = result.error
|
||||
|
||||
click.echo(json.dumps(output_data, indent=2, default=str))
|
||||
sys.exit(0 if result.success else 1)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
|
||||
@click.option("--debug", is_flag=True, help="Show debug logging")
|
||||
def tui(verbose, debug):
|
||||
"""Launch the TUI dashboard for interactive outreach."""
|
||||
setup_logging(verbose=verbose, debug=debug)
|
||||
|
||||
try:
|
||||
from framework.tui.app import AdenTUI
|
||||
except ImportError:
|
||||
click.echo(
|
||||
"TUI requires the 'textual' package. Install with: pip install textual"
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from framework.llm import LiteLLMProvider
|
||||
from framework.runner.tool_registry import ToolRegistry
|
||||
from framework.runtime.agent_runtime import create_agent_runtime
|
||||
from framework.runtime.event_bus import EventBus
|
||||
from framework.runtime.execution_stream import EntryPointSpec
|
||||
|
||||
async def run_with_tui():
|
||||
agent = TwitterOutreachAgent()
|
||||
|
||||
agent._event_bus = EventBus()
|
||||
agent._tool_registry = ToolRegistry()
|
||||
|
||||
storage_path = Path.home() / ".hive" / "twitter_outreach"
|
||||
storage_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
mcp_config_path = Path(__file__).parent / "mcp_servers.json"
|
||||
if mcp_config_path.exists():
|
||||
agent._tool_registry.load_mcp_config(mcp_config_path)
|
||||
|
||||
llm = LiteLLMProvider(
|
||||
model=agent.config.model,
|
||||
api_key=agent.config.api_key,
|
||||
api_base=agent.config.api_base,
|
||||
)
|
||||
|
||||
tools = list(agent._tool_registry.get_tools().values())
|
||||
tool_executor = agent._tool_registry.get_executor()
|
||||
graph = agent._build_graph()
|
||||
|
||||
runtime = create_agent_runtime(
|
||||
graph=graph,
|
||||
goal=agent.goal,
|
||||
storage_path=storage_path,
|
||||
entry_points=[
|
||||
EntryPointSpec(
|
||||
id="start",
|
||||
name="Start Outreach",
|
||||
entry_node="intake",
|
||||
trigger_type="manual",
|
||||
isolation_level="isolated",
|
||||
),
|
||||
],
|
||||
llm=llm,
|
||||
tools=tools,
|
||||
tool_executor=tool_executor,
|
||||
)
|
||||
|
||||
await runtime.start()
|
||||
|
||||
try:
|
||||
app = AdenTUI(runtime)
|
||||
await app.run_async()
|
||||
finally:
|
||||
await runtime.stop()
|
||||
|
||||
asyncio.run(run_with_tui())
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option("--json", "output_json", is_flag=True)
|
||||
def info(output_json):
|
||||
"""Show agent information."""
|
||||
info_data = default_agent.info()
|
||||
if output_json:
|
||||
click.echo(json.dumps(info_data, indent=2))
|
||||
else:
|
||||
click.echo(f"Agent: {info_data['name']}")
|
||||
click.echo(f"Version: {info_data['version']}")
|
||||
click.echo(f"Description: {info_data['description']}")
|
||||
click.echo(f"\nNodes: {', '.join(info_data['nodes'])}")
|
||||
click.echo(f"Client-facing: {', '.join(info_data['client_facing_nodes'])}")
|
||||
click.echo(f"Entry: {info_data['entry_node']}")
|
||||
click.echo(f"Terminal: {', '.join(info_data['terminal_nodes'])}")
|
||||
|
||||
|
||||
@cli.command()
|
||||
def validate():
|
||||
"""Validate agent structure."""
|
||||
validation = default_agent.validate()
|
||||
if validation["valid"]:
|
||||
click.echo("Agent is valid")
|
||||
if validation["warnings"]:
|
||||
for warning in validation["warnings"]:
|
||||
click.echo(f" WARNING: {warning}")
|
||||
else:
|
||||
click.echo("Agent has errors:")
|
||||
for error in validation["errors"]:
|
||||
click.echo(f" ERROR: {error}")
|
||||
sys.exit(0 if validation["valid"] else 1)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option("--verbose", "-v", is_flag=True)
|
||||
def shell(verbose):
|
||||
"""Interactive outreach session (CLI, no TUI)."""
|
||||
asyncio.run(_interactive_shell(verbose))
|
||||
|
||||
|
||||
async def _interactive_shell(verbose=False):
|
||||
"""Async interactive shell."""
|
||||
setup_logging(verbose=verbose)
|
||||
|
||||
click.echo("=== Twitter Outreach Agent ===")
|
||||
click.echo("Starting outreach workflow...\n")
|
||||
|
||||
agent = TwitterOutreachAgent()
|
||||
await agent.start()
|
||||
|
||||
try:
|
||||
result = await agent.trigger_and_wait("start", {})
|
||||
|
||||
if result is None:
|
||||
click.echo("\n[Execution timed out]\n")
|
||||
elif result.success:
|
||||
output = result.output
|
||||
status = output.get("delivery_status", "unknown")
|
||||
click.echo(f"\nOutreach complete! Delivery status: {status}")
|
||||
else:
|
||||
click.echo(f"\nOutreach failed: {result.error}")
|
||||
except KeyboardInterrupt:
|
||||
click.echo("\nGoodbye!")
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
finally:
|
||||
await agent.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
@@ -1,265 +0,0 @@
|
||||
{
|
||||
"agent": {
|
||||
"id": "twitter_outreach",
|
||||
"name": "Personalized Twitter Outreach",
|
||||
"version": "1.0.0",
|
||||
"description": "Given a Twitter/X handle and outreach context, research the target's profile (bio, tweets, interests), craft a personalized outreach email referencing their specific activity, and send it after user approval."
|
||||
},
|
||||
"graph": {
|
||||
"id": "twitter_outreach-graph",
|
||||
"goal_id": "twitter-outreach",
|
||||
"version": "1.0.0",
|
||||
"entry_node": "intake",
|
||||
"entry_points": {
|
||||
"start": "intake"
|
||||
},
|
||||
"pause_nodes": [],
|
||||
"terminal_nodes": [
|
||||
"send"
|
||||
],
|
||||
"nodes": [
|
||||
{
|
||||
"id": "intake",
|
||||
"name": "Intake",
|
||||
"description": "Collect the target Twitter handle, outreach purpose, and recipient email from the user",
|
||||
"node_type": "event_loop",
|
||||
"input_keys": [],
|
||||
"output_keys": [
|
||||
"twitter_handle",
|
||||
"outreach_context",
|
||||
"recipient_email"
|
||||
],
|
||||
"nullable_output_keys": [],
|
||||
"input_schema": {},
|
||||
"output_schema": {},
|
||||
"system_prompt": "You are the intake assistant for a personalized Twitter outreach agent.\n\n**STEP 1 \u2014 Respond to the user (text only, NO tool calls):**\nGreet the user and ask them to provide:\n1. The Twitter/X handle of the person they want to reach out to\n2. The purpose/context of the outreach (e.g., partnership opportunity, hiring, collaboration, introduction)\n3. The recipient's email address\n\nBe friendly and concise. If the user provides partial info, ask for what's missing.\n\n**STEP 2 \u2014 After the user provides ALL three pieces of information, call set_output:**\n- set_output(\"twitter_handle\", \"<the Twitter handle, including @>\")\n- set_output(\"outreach_context\", \"<the outreach purpose/context>\")\n- set_output(\"recipient_email\", \"<the email address>\")",
|
||||
"tools": [],
|
||||
"model": null,
|
||||
"function": null,
|
||||
"routes": {},
|
||||
"max_retries": 3,
|
||||
"retry_on": [],
|
||||
"max_node_visits": 1,
|
||||
"output_model": null,
|
||||
"max_validation_retries": 2,
|
||||
"client_facing": true
|
||||
},
|
||||
{
|
||||
"id": "research",
|
||||
"name": "Research",
|
||||
"description": "Research the target's Twitter/X profile \u2014 bio, recent tweets, interests, and topics they engage with",
|
||||
"node_type": "event_loop",
|
||||
"input_keys": [
|
||||
"twitter_handle"
|
||||
],
|
||||
"output_keys": [
|
||||
"profile_summary"
|
||||
],
|
||||
"nullable_output_keys": [],
|
||||
"input_schema": {},
|
||||
"output_schema": {},
|
||||
"system_prompt": "You are a Twitter/X profile researcher. Your job is to thoroughly research a person's public Twitter/X presence.\n\nGiven the Twitter handle provided in your inputs, do the following:\n\n1. Use web_search to find their Twitter/X profile and any relevant public information about them.\n2. Use web_scrape to read their Twitter/X profile page (try https://x.com/{handle} or https://twitter.com/{handle}).\n3. Extract and analyze:\n - Their bio and self-description\n - Recent tweets and topics they post about\n - Professional interests, projects, or accomplishments\n - Any recurring themes or passions\n - Specific tweets worth referencing in outreach\n4. Look for additional context (personal website, blog, other social profiles mentioned in bio).\n\nCompile a comprehensive profile summary that would help someone write a highly personalized outreach email.\n\nUse set_output(\"profile_summary\", <your detailed summary as a string>) to store your findings.\n\nDo NOT return raw JSON. Use the set_output tool to produce outputs.",
|
||||
"tools": [
|
||||
"web_search",
|
||||
"web_scrape"
|
||||
],
|
||||
"model": null,
|
||||
"function": null,
|
||||
"routes": {},
|
||||
"max_retries": 3,
|
||||
"retry_on": [],
|
||||
"max_node_visits": 1,
|
||||
"output_model": null,
|
||||
"max_validation_retries": 2,
|
||||
"client_facing": false
|
||||
},
|
||||
{
|
||||
"id": "draft-review",
|
||||
"name": "Draft & Review",
|
||||
"description": "Draft a personalized outreach email using profile research, present to user for review, and iterate until approved",
|
||||
"node_type": "event_loop",
|
||||
"input_keys": [
|
||||
"outreach_context",
|
||||
"recipient_email",
|
||||
"profile_summary"
|
||||
],
|
||||
"output_keys": [
|
||||
"approved_email"
|
||||
],
|
||||
"nullable_output_keys": [],
|
||||
"input_schema": {},
|
||||
"output_schema": {},
|
||||
"system_prompt": "You are an expert email copywriter specializing in personalized outreach.\n\nYou have been given:\n- A profile summary of the target person (from their Twitter/X)\n- The outreach context/purpose\n- The recipient's email address\n\n**STEP 1 \u2014 Draft and present the email (text only, NO tool calls):**\n\nUsing the profile research, draft a personalized outreach email that:\n- References at least 2 specific details from their Twitter profile (tweets, interests, projects)\n- Clearly connects to the outreach purpose\n- Includes a specific, relevant call to action\n- Is professional but conversational and authentic \u2014 NOT spammy, robotic, or overly formal\n- Is concise (under 300 words)\n\nPresent the complete email draft to the user, formatted clearly with Subject line and Body.\nThen ask: \"Would you like any changes, or shall I send this?\"\n\nIf the user requests changes, revise the email and present the updated version. Keep iterating until the user is satisfied.\n\n**STEP 2 \u2014 After the user explicitly approves the email, call set_output:**\n- set_output(\"approved_email\", \"<the final approved email text including subject line>\")",
|
||||
"tools": [],
|
||||
"model": null,
|
||||
"function": null,
|
||||
"routes": {},
|
||||
"max_retries": 3,
|
||||
"retry_on": [],
|
||||
"max_node_visits": 1,
|
||||
"output_model": null,
|
||||
"max_validation_retries": 2,
|
||||
"client_facing": true
|
||||
},
|
||||
{
|
||||
"id": "send",
|
||||
"name": "Send",
|
||||
"description": "Send the approved outreach email to the recipient",
|
||||
"node_type": "event_loop",
|
||||
"input_keys": [
|
||||
"approved_email",
|
||||
"recipient_email"
|
||||
],
|
||||
"output_keys": [
|
||||
"delivery_status"
|
||||
],
|
||||
"nullable_output_keys": [],
|
||||
"input_schema": {},
|
||||
"output_schema": {},
|
||||
"system_prompt": "You are responsible for sending the approved outreach email.\n\nYou have the approved email text and the recipient's email address in your inputs.\n\nParse the subject line and body from the approved_email, then use the send_email tool to send it to the recipient_email address.\n\nAfter sending successfully, call:\n- set_output(\"delivery_status\", \"sent\")\n\nIf there is an error sending, call:\n- set_output(\"delivery_status\", \"failed: <error details>\")\n\nDo NOT return raw JSON. Use the set_output tool to produce outputs.",
|
||||
"tools": [
|
||||
"send_email"
|
||||
],
|
||||
"model": null,
|
||||
"function": null,
|
||||
"routes": {},
|
||||
"max_retries": 3,
|
||||
"retry_on": [],
|
||||
"max_node_visits": 1,
|
||||
"output_model": null,
|
||||
"max_validation_retries": 2,
|
||||
"client_facing": false
|
||||
}
|
||||
],
|
||||
"edges": [
|
||||
{
|
||||
"id": "intake-to-research",
|
||||
"source": "intake",
|
||||
"target": "research",
|
||||
"condition": "on_success",
|
||||
"condition_expr": null,
|
||||
"priority": 1,
|
||||
"input_mapping": {}
|
||||
},
|
||||
{
|
||||
"id": "research-to-draft-review",
|
||||
"source": "research",
|
||||
"target": "draft-review",
|
||||
"condition": "on_success",
|
||||
"condition_expr": null,
|
||||
"priority": 1,
|
||||
"input_mapping": {}
|
||||
},
|
||||
{
|
||||
"id": "draft-review-to-send",
|
||||
"source": "draft-review",
|
||||
"target": "send",
|
||||
"condition": "on_success",
|
||||
"condition_expr": null,
|
||||
"priority": 1,
|
||||
"input_mapping": {}
|
||||
}
|
||||
],
|
||||
"max_steps": 100,
|
||||
"max_retries_per_node": 3,
|
||||
"description": "Given a Twitter/X handle and outreach context, research the target's profile (bio, tweets, interests), craft a personalized outreach email referencing their specific activity, and send it after user approval.",
|
||||
"created_at": "2026-02-05T13:32:44.573661"
|
||||
},
|
||||
"goal": {
|
||||
"id": "twitter-outreach",
|
||||
"name": "Personalized Twitter Outreach",
|
||||
"description": "Given a Twitter/X handle and outreach context, research the target's profile (bio, tweets, interests), craft a personalized outreach email referencing their specific activity, and send it after user approval.",
|
||||
"status": "draft",
|
||||
"success_criteria": [
|
||||
{
|
||||
"id": "profile-research",
|
||||
"description": "Agent extracts meaningful information from target's Twitter profile including bio, recent tweets, interests, and topics they engage with",
|
||||
"metric": "research_quality",
|
||||
"target": "Identifies at least 3 distinct profile details",
|
||||
"weight": 0.25,
|
||||
"met": false
|
||||
},
|
||||
{
|
||||
"id": "email-personalization",
|
||||
"description": "Drafted email references at least 2 specific details from the target's Twitter profile",
|
||||
"metric": "personalization_score",
|
||||
"target": "At least 2 specific references to profile content",
|
||||
"weight": 0.25,
|
||||
"met": false
|
||||
},
|
||||
{
|
||||
"id": "clear-cta",
|
||||
"description": "Email includes a specific relevant call to action",
|
||||
"metric": "cta_present",
|
||||
"target": "Email contains clear call to action",
|
||||
"weight": 0.15,
|
||||
"met": false
|
||||
},
|
||||
{
|
||||
"id": "user-approval-gate",
|
||||
"description": "Email is presented to user for review and only sent after explicit approval with opportunity to request edits",
|
||||
"metric": "approval_obtained",
|
||||
"target": "User explicitly approves before send",
|
||||
"weight": 0.2,
|
||||
"met": false
|
||||
},
|
||||
{
|
||||
"id": "successful-delivery",
|
||||
"description": "Email is sent successfully via the send_email tool",
|
||||
"metric": "delivery_status",
|
||||
"target": "Email sent without errors",
|
||||
"weight": 0.15,
|
||||
"met": false
|
||||
}
|
||||
],
|
||||
"constraints": [
|
||||
{
|
||||
"id": "no-spam",
|
||||
"description": "Email must not use spammy language, clickbait, or aggressive sales tactics",
|
||||
"constraint_type": "quality",
|
||||
"category": "content",
|
||||
"check": ""
|
||||
},
|
||||
{
|
||||
"id": "approval-required",
|
||||
"description": "Must never send an email without explicit user approval",
|
||||
"constraint_type": "safety",
|
||||
"category": "process",
|
||||
"check": ""
|
||||
},
|
||||
{
|
||||
"id": "tone-appropriate",
|
||||
"description": "Email tone must be professional, authentic, and conversational \u2014 not robotic or overly formal",
|
||||
"constraint_type": "quality",
|
||||
"category": "content",
|
||||
"check": ""
|
||||
},
|
||||
{
|
||||
"id": "privacy-respect",
|
||||
"description": "Only use publicly available information from the target's Twitter profile",
|
||||
"constraint_type": "safety",
|
||||
"category": "ethics",
|
||||
"check": ""
|
||||
}
|
||||
],
|
||||
"context": {},
|
||||
"required_capabilities": [],
|
||||
"input_schema": {},
|
||||
"output_schema": {},
|
||||
"version": "1.0.0",
|
||||
"parent_version": null,
|
||||
"evolution_reason": null,
|
||||
"created_at": "2026-02-05 13:30:59.934460",
|
||||
"updated_at": "2026-02-05 13:30:59.934462"
|
||||
},
|
||||
"required_tools": [
|
||||
"web_scrape",
|
||||
"send_email",
|
||||
"web_search"
|
||||
],
|
||||
"metadata": {
|
||||
"created_at": "2026-02-05T13:32:44.573712",
|
||||
"node_count": 4,
|
||||
"edge_count": 3
|
||||
}
|
||||
}
|
||||
@@ -1,308 +0,0 @@
|
||||
"""Agent graph construction for Twitter Outreach Agent."""
|
||||
|
||||
import asyncio

from framework.graph import EdgeSpec, EdgeCondition, Goal, SuccessCriterion, Constraint
from framework.graph.edge import GraphSpec
from framework.graph.executor import ExecutionResult, GraphExecutor
from framework.llm import LiteLLMProvider
from framework.runner.tool_registry import ToolRegistry
from framework.runtime.core import Runtime
from framework.runtime.event_bus import EventBus

from .config import default_config, metadata
from .nodes import (
    intake_node,
    research_node,
    draft_review_node,
    send_node,
)
|
||||
|
||||
# Goal definition: declarative success criteria and hard constraints for the
# outreach workflow. Criterion weights sum to 1.0
# (0.25 + 0.25 + 0.15 + 0.2 + 0.15).
goal = Goal(
    id="twitter-outreach",
    name="Personalized Twitter Outreach",
    description=(
        "Given a Twitter/X handle and outreach context, research the target's profile "
        "(bio, tweets, interests), craft a personalized outreach email referencing their "
        "specific activity, and send it after user approval."
    ),
    success_criteria=[
        # Weighted, measurable outcomes used to judge a run of the agent.
        SuccessCriterion(
            id="profile-research",
            description="Agent extracts meaningful information from target's Twitter profile including bio, recent tweets, interests, and topics they engage with",
            metric="research_quality",
            target="Identifies at least 3 distinct profile details",
            weight=0.25,
        ),
        SuccessCriterion(
            id="email-personalization",
            description="Drafted email references at least 2 specific details from the target's Twitter profile",
            metric="personalization_score",
            target="At least 2 specific references to profile content",
            weight=0.25,
        ),
        SuccessCriterion(
            id="clear-cta",
            description="Email includes a specific relevant call to action",
            metric="cta_present",
            target="Email contains clear call to action",
            weight=0.15,
        ),
        SuccessCriterion(
            id="user-approval-gate",
            description="Email is presented to user for review and only sent after explicit approval with opportunity to request edits",
            metric="approval_obtained",
            target="User explicitly approves before send",
            weight=0.2,
        ),
        SuccessCriterion(
            id="successful-delivery",
            description="Email is sent successfully via the send_email tool",
            metric="delivery_status",
            target="Email sent without errors",
            weight=0.15,
        ),
    ],
    constraints=[
        # Quality/safety rules the agent must respect during execution.
        Constraint(
            id="no-spam",
            description="Email must not use spammy language, clickbait, or aggressive sales tactics",
            constraint_type="quality",
            category="content",
        ),
        Constraint(
            id="approval-required",
            description="Must never send an email without explicit user approval",
            constraint_type="safety",
            category="process",
        ),
        Constraint(
            id="tone-appropriate",
            description="Email tone must be professional, authentic, and conversational — not robotic or overly formal",
            constraint_type="quality",
            category="content",
        ),
        Constraint(
            id="privacy-respect",
            description="Only use publicly available information from the target's Twitter profile",
            constraint_type="safety",
            category="ethics",
        ),
    ],
)
|
||||
|
||||
# Node list — pipeline order: intake -> research -> draft-review -> send.
nodes = [intake_node, research_node, draft_review_node, send_node]
|
||||
|
||||
# Edge definitions — a linear on-success chain through the pipeline.
# Each edge id follows the "<source>-to-<target>" convention.
edges = [
    EdgeSpec(
        id=f"{src}-to-{dst}",
        source=src,
        target=dst,
        condition=EdgeCondition.ON_SUCCESS,
        priority=1,
    )
    for src, dst in (
        ("intake", "research"),
        ("research", "draft-review"),
        ("draft-review", "send"),
    )
]
|
||||
|
||||
# Graph configuration
entry_node = "intake"  # first node executed when the graph starts
entry_points = {"start": "intake"}  # named trigger -> starting node
pause_nodes = []  # no pause checkpoints configured at the graph level
terminal_nodes = ["send"]  # execution ends after the send node
|
||||
|
||||
|
||||
class TwitterOutreachAgent:
    """
    Twitter Outreach Agent — 4-node pipeline with user approval checkpoint.

    Flow: intake -> research -> draft-review -> send

    Lifecycle: start() lazily builds the executor; trigger_and_wait() runs
    the graph; stop() releases everything so a later start() rebuilds from
    scratch.
    """

    def __init__(self, config=None):
        """Initialize the agent.

        Args:
            config: Optional configuration object; falls back to the
                module-level default_config when omitted.
        """
        self.config = config or default_config
        self.goal = goal
        self.nodes = nodes
        self.edges = edges
        self.entry_node = entry_node
        self.entry_points = entry_points
        self.pause_nodes = pause_nodes
        self.terminal_nodes = terminal_nodes
        # Lazily created by _setup(); all None until start() is called.
        self._executor: GraphExecutor | None = None
        self._graph: GraphSpec | None = None
        self._event_bus: EventBus | None = None
        self._tool_registry: ToolRegistry | None = None

    def _build_graph(self) -> GraphSpec:
        """Build the GraphSpec from the module-level node/edge definitions."""
        return GraphSpec(
            id="twitter-outreach-graph",
            goal_id=self.goal.id,
            version="1.0.0",
            entry_node=self.entry_node,
            entry_points=self.entry_points,
            terminal_nodes=self.terminal_nodes,
            pause_nodes=self.pause_nodes,
            nodes=self.nodes,
            edges=self.edges,
            default_model=self.config.model,
            max_tokens=self.config.max_tokens,
            loop_config={
                # Per-node event-loop budgets.
                "max_iterations": 50,
                "max_tool_calls_per_turn": 10,
                "max_history_tokens": 32000,
            },
        )

    def _setup(self) -> GraphExecutor:
        """Set up the executor with all components (storage, tools, LLM)."""
        from pathlib import Path

        # Persistent state lives under the user's home directory.
        storage_path = Path.home() / ".hive" / "twitter_outreach"
        storage_path.mkdir(parents=True, exist_ok=True)

        self._event_bus = EventBus()
        self._tool_registry = ToolRegistry()

        # Optional MCP tool servers, configured next to this module.
        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
        if mcp_config_path.exists():
            self._tool_registry.load_mcp_config(mcp_config_path)

        llm = LiteLLMProvider(
            model=self.config.model,
            api_key=self.config.api_key,
            api_base=self.config.api_base,
        )

        tool_executor = self._tool_registry.get_executor()
        tools = list(self._tool_registry.get_tools().values())

        self._graph = self._build_graph()
        runtime = Runtime(storage_path)

        self._executor = GraphExecutor(
            runtime=runtime,
            llm=llm,
            tools=tools,
            tool_executor=tool_executor,
            event_bus=self._event_bus,
            storage_path=storage_path,
            loop_config=self._graph.loop_config,
        )

        return self._executor

    async def start(self) -> None:
        """Set up the agent (initialize executor and tools). Idempotent."""
        if self._executor is None:
            self._setup()

    async def stop(self) -> None:
        """Clean up resources so a later start() rebuilds from scratch."""
        self._executor = None
        self._event_bus = None
        # FIX: also drop the graph and tool registry created by _setup() so
        # no stale state leaks across stop()/start() cycles.
        self._graph = None
        self._tool_registry = None

    async def trigger_and_wait(
        self,
        entry_point: str,
        input_data: dict,
        timeout: float | None = None,
        session_state: dict | None = None,
    ) -> ExecutionResult | None:
        """Execute the graph and wait for completion.

        Args:
            entry_point: Entry-point name (kept for interface compatibility;
                execution starts at the graph's configured entry node).
            input_data: Initial input passed to the graph execution.
            timeout: Optional seconds to wait before giving up.
            session_state: Optional session state to resume with.

        Returns:
            The ExecutionResult, or None if `timeout` elapsed first.

        Raises:
            RuntimeError: If start() was not called beforehand.
        """
        if self._executor is None:
            raise RuntimeError("Agent not started. Call start() first.")
        if self._graph is None:
            raise RuntimeError("Graph not built. Call start() first.")

        execution = self._executor.execute(
            graph=self._graph,
            goal=self.goal,
            input_data=input_data,
            session_state=session_state,
        )
        if timeout is None:
            return await execution

        # BUG FIX: `timeout` was previously accepted but silently ignored,
        # so callers could wait forever. Enforce it here; callers (run(),
        # the CLI shell) already treat a None result as a timeout.
        try:
            return await asyncio.wait_for(execution, timeout)
        except asyncio.TimeoutError:
            return None

    async def run(
        self, context: dict, session_state=None
    ) -> ExecutionResult:
        """Run the agent (convenience method for single execution)."""
        await self.start()
        try:
            result = await self.trigger_and_wait(
                "start", context, session_state=session_state
            )
            # FIX: identity check instead of truthiness — only None means
            # "timed out"; a falsy-but-real ExecutionResult is returned as-is.
            if result is None:
                return ExecutionResult(success=False, error="Execution timeout")
            return result
        finally:
            await self.stop()

    def info(self):
        """Get agent information as a plain dict (consumed by the CLI)."""
        return {
            "name": metadata.name,
            "version": metadata.version,
            "description": metadata.description,
            "goal": {
                "name": self.goal.name,
                "description": self.goal.description,
            },
            "nodes": [n.id for n in self.nodes],
            "edges": [e.id for e in self.edges],
            "entry_node": self.entry_node,
            "entry_points": self.entry_points,
            "pause_nodes": self.pause_nodes,
            "terminal_nodes": self.terminal_nodes,
            "client_facing_nodes": [n.id for n in self.nodes if n.client_facing],
        }

    def validate(self):
        """Validate agent structure.

        Returns:
            dict with keys "valid" (bool), "errors" (list[str]) and
            "warnings" (list[str]). The structure is valid when every edge
            endpoint, the entry node, all terminal nodes, and every entry
            point reference an existing node id.
        """
        errors = []
        warnings = []

        node_ids = {node.id for node in self.nodes}
        for edge in self.edges:
            if edge.source not in node_ids:
                errors.append(f"Edge {edge.id}: source '{edge.source}' not found")
            if edge.target not in node_ids:
                errors.append(f"Edge {edge.id}: target '{edge.target}' not found")

        if self.entry_node not in node_ids:
            errors.append(f"Entry node '{self.entry_node}' not found")

        for terminal in self.terminal_nodes:
            if terminal not in node_ids:
                errors.append(f"Terminal node '{terminal}' not found")

        for ep_id, node_id in self.entry_points.items():
            if node_id not in node_ids:
                errors.append(
                    f"Entry point '{ep_id}' references unknown node '{node_id}'"
                )

        return {
            "valid": len(errors) == 0,
            "errors": errors,
            "warnings": warnings,
        }
|
||||
|
||||
|
||||
# Module-level singleton; the CLI commands (e.g. validate) act on this instance.
default_agent = TwitterOutreachAgent()
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user