Merge pull request #5058 from adenhq/fix/deprecation
fix(arch): remove all deprecated concepts and deadcodes
This commit is contained in:
@@ -492,7 +492,7 @@ AskUserQuestion(questions=[{
|
||||
- node_id (kebab-case)
|
||||
- name
|
||||
- description
|
||||
- node_type: `"event_loop"` (recommended for all LLM work) or `"function"` (deterministic, no LLM)
|
||||
- node_type: `"event_loop"` (the only valid type; use `client_facing: True` for HITL)
|
||||
- input_keys (what data this node receives)
|
||||
- output_keys (what data this node produces)
|
||||
- tools (ONLY tools that exist from Step 1 — empty list if no tools needed)
|
||||
@@ -852,8 +852,7 @@ cd /home/timothy/oss/hive && PYTHONPATH=exports uv run python -m AGENT_NAME vali
|
||||
|
||||
| Type | tools param | Use when |
|
||||
| ------------ | ----------------------- | --------------------------------------- |
|
||||
| `event_loop` | `'["tool1"]'` or `'[]'` | LLM-powered work with or without tools |
|
||||
| `function` | N/A | Deterministic Python operations, no LLM |
|
||||
| `event_loop` | `'["tool1"]'` or `'[]'` | All agent work (with or without tools, HITL via client_facing) |
|
||||
|
||||
---
|
||||
|
||||
@@ -1008,7 +1007,7 @@ Use this reference during STEP 2 to give accurate, honest assessments.
|
||||
| Sub-second responses | LLM latency is inherent | Traditional code, no LLM |
|
||||
| Processing millions of items | Context windows and rate limits | Batch processing + sampling |
|
||||
| Real-time streaming data | No built-in pub/sub or streaming input | Custom MCP server + agent |
|
||||
| Guaranteed determinism | LLM outputs vary | Function nodes for deterministic parts |
|
||||
| Guaranteed determinism | LLM outputs vary | Traditional code for deterministic parts |
|
||||
| Offline/air-gapped | Requires LLM API access | Local models (not currently supported) |
|
||||
| Multi-user concurrency | Single-user session model | Separate agent instances per user |
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ Register an MCP server as a tool source for your agent.
|
||||
"example_tool"
|
||||
],
|
||||
"total_mcp_servers": 1,
|
||||
"note": "MCP server 'tools' registered with 6 tools. These tools can now be used in llm_tool_use nodes."
|
||||
"note": "MCP server 'tools' registered with 6 tools. These tools can now be used in event_loop nodes."
|
||||
}
|
||||
```
|
||||
|
||||
@@ -149,7 +149,7 @@ List tools available from registered MCP servers.
|
||||
]
|
||||
},
|
||||
"total_tools": 6,
|
||||
"note": "Use these tool names in the 'tools' parameter when adding llm_tool_use nodes"
|
||||
"note": "Use these tool names in the 'tools' parameter when adding event_loop nodes"
|
||||
}
|
||||
```
|
||||
|
||||
@@ -246,7 +246,7 @@ Here's a complete workflow for building an agent with MCP tools:
|
||||
"node_id": "web-searcher",
|
||||
"name": "Web Search",
|
||||
"description": "Search the web for information",
|
||||
"node_type": "llm_tool_use",
|
||||
"node_type": "event_loop",
|
||||
"input_keys": "[\"query\"]",
|
||||
"output_keys": "[\"search_results\"]",
|
||||
"system_prompt": "Search for {query} using the web_search tool",
|
||||
|
||||
@@ -119,7 +119,7 @@ builder = WorkflowBuilder()
|
||||
builder.add_node(
|
||||
node_id="researcher",
|
||||
name="Web Researcher",
|
||||
node_type="llm_tool_use",
|
||||
node_type="event_loop",
|
||||
system_prompt="Research the topic using web_search",
|
||||
tools=["web_search"], # Tool from tools MCP server
|
||||
input_keys=["topic"],
|
||||
@@ -137,7 +137,7 @@ Tools from MCP servers can be referenced in your agent.json just like built-in t
|
||||
{
|
||||
"id": "searcher",
|
||||
"name": "Web Searcher",
|
||||
"node_type": "llm_tool_use",
|
||||
"node_type": "event_loop",
|
||||
"system_prompt": "Search for information about {topic}",
|
||||
"tools": ["web_search", "web_scrape"],
|
||||
"input_keys": ["topic"],
|
||||
|
||||
+17
-70
@@ -103,31 +103,20 @@ Add a processing node to the agent graph.
|
||||
- `node_id` (string, required): Unique node identifier
|
||||
- `name` (string, required): Human-readable name
|
||||
- `description` (string, required): What this node does
|
||||
- `node_type` (string, required): One of: `llm_generate`, `llm_tool_use`, `router`, `function`
|
||||
- `node_type` (string, required): Must be `event_loop` (the only valid type)
|
||||
- `input_keys` (string, required): JSON array of input variable names
|
||||
- `output_keys` (string, required): JSON array of output variable names
|
||||
- `system_prompt` (string, optional): System prompt for LLM nodes
|
||||
- `tools` (string, optional): JSON array of tool names for tool_use nodes
|
||||
- `routes` (string, optional): JSON object of route mappings for router nodes
|
||||
- `system_prompt` (string, optional): System prompt for the LLM
|
||||
- `tools` (string, optional): JSON array of tool names
|
||||
- `client_facing` (boolean, optional): Set to true for human-in-the-loop interaction
|
||||
|
||||
**Node Types:**
|
||||
**Node Type:**
|
||||
|
||||
1. **llm_generate**: Uses LLM to generate output from inputs
|
||||
- Requires: `system_prompt`
|
||||
- Tools: Not used
|
||||
|
||||
2. **llm_tool_use**: Uses LLM with tools to accomplish tasks
|
||||
- Requires: `system_prompt`, `tools`
|
||||
- Tools: Array of tool names (e.g., `["web_search", "web_fetch"]`)
|
||||
|
||||
3. **router**: LLM-powered routing to different paths
|
||||
- Requires: `system_prompt`, `routes`
|
||||
- Routes: Object mapping route names to target node IDs
|
||||
- Example: `{"pass": "success_node", "fail": "retry_node"}`
|
||||
|
||||
4. **function**: Executes a pre-defined function
|
||||
- System prompt describes the function behavior
|
||||
- No LLM calls, pure computation
|
||||
**event_loop**: LLM-powered node with self-correction loop
|
||||
- Requires: `system_prompt`
|
||||
- Optional: `tools` (array of tool names, e.g., `["web_search", "web_fetch"]`)
|
||||
- Optional: `client_facing` (set to true for HITL / user interaction)
|
||||
- Supports: iterative refinement, judge-based evaluation, tool use, streaming
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
@@ -135,7 +124,7 @@ Add a processing node to the agent graph.
|
||||
"node_id": "search_sources",
|
||||
"name": "Search Sources",
|
||||
"description": "Searches for relevant sources on the topic",
|
||||
"node_type": "llm_tool_use",
|
||||
"node_type": "event_loop",
|
||||
"input_keys": "[\"topic\", \"search_queries\"]",
|
||||
"output_keys": "[\"sources\", \"source_count\"]",
|
||||
"system_prompt": "Search for sources using the provided queries...",
|
||||
@@ -198,7 +187,7 @@ Export the validated graph as an agent specification.
|
||||
|
||||
**What it does:**
|
||||
1. Validates the graph
|
||||
2. Auto-generates missing edges from router routes
|
||||
2. Validates edge connectivity
|
||||
3. Writes files to disk:
|
||||
- `exports/{agent-name}/agent.json` - Full agent specification
|
||||
- `exports/{agent-name}/README.md` - Auto-generated documentation
|
||||
@@ -252,47 +241,6 @@ Test the complete agent graph with sample inputs.
|
||||
|
||||
---
|
||||
|
||||
### Evaluation Rules
|
||||
|
||||
#### `add_evaluation_rule`
|
||||
Add a rule for the HybridJudge to evaluate node outputs.
|
||||
|
||||
**Parameters:**
|
||||
- `rule_id` (string, required): Unique rule identifier
|
||||
- `description` (string, required): What this rule checks
|
||||
- `condition` (string, required): Python expression to evaluate
|
||||
- `action` (string, required): Action to take: `accept`, `retry`, `escalate`
|
||||
- `priority` (integer, optional): Rule priority (default: 0)
|
||||
- `feedback_template` (string, optional): Feedback message template
|
||||
|
||||
**Condition Examples:**
|
||||
- `'result.get("success") == True'` - Check for success flag
|
||||
- `'result.get("error_type") == "timeout"'` - Check error type
|
||||
- `'len(result.get("data", [])) > 0'` - Check for non-empty data
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"rule_id": "timeout_retry",
|
||||
"description": "Retry on timeout errors",
|
||||
"condition": "result.get('error_type') == 'timeout'",
|
||||
"action": "retry",
|
||||
"priority": 10,
|
||||
"feedback_template": "Timeout occurred, retrying..."
|
||||
}
|
||||
```
|
||||
|
||||
#### `list_evaluation_rules`
|
||||
List all configured evaluation rules.
|
||||
|
||||
#### `remove_evaluation_rule`
|
||||
Remove an evaluation rule.
|
||||
|
||||
**Parameters:**
|
||||
- `rule_id` (string, required): Rule to remove
|
||||
|
||||
---
|
||||
|
||||
## Example Workflow
|
||||
|
||||
Here's a complete workflow for building a research agent:
|
||||
@@ -320,7 +268,7 @@ add_node(
|
||||
node_id="planner",
|
||||
name="Research Planner",
|
||||
description="Creates research strategy",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
input_keys='["topic"]',
|
||||
output_keys='["strategy", "queries"]',
|
||||
system_prompt="Analyze topic and create research plan..."
|
||||
@@ -330,7 +278,7 @@ add_node(
|
||||
node_id="searcher",
|
||||
name="Search Sources",
|
||||
description="Find relevant sources",
|
||||
node_type="llm_tool_use",
|
||||
node_type="event_loop",
|
||||
input_keys='["queries"]',
|
||||
output_keys='["sources"]',
|
||||
system_prompt="Search for sources...",
|
||||
@@ -359,10 +307,9 @@ The exported agent will be saved to `exports/research-agent/`.
|
||||
|
||||
1. **Start with the goal**: Define clear success criteria before building nodes
|
||||
2. **Test nodes individually**: Use `test_node` to verify each node works
|
||||
3. **Use router nodes for branching**: Don't create edges manually for routers - define routes and they'll be auto-generated
|
||||
4. **Add evaluation rules**: Help the judge evaluate outputs deterministically
|
||||
5. **Validate early, validate often**: Run `validate_graph` after adding nodes/edges
|
||||
6. **Check exports**: Review the generated README.md to verify your agent structure
|
||||
3. **Use conditional edges for branching**: Define condition_expr on edges for decision points
|
||||
4. **Validate early, validate often**: Run `validate_graph` after adding nodes/edges
|
||||
5. **Check exports**: Review the generated README.md to verify your agent structure
|
||||
|
||||
---
|
||||
|
||||
|
||||
+1
-1
@@ -73,7 +73,7 @@ To use the agent builder with Claude Desktop or other MCP clients, add this to y
|
||||
The MCP server provides tools for:
|
||||
- Creating agent building sessions
|
||||
- Defining goals with success criteria
|
||||
- Adding nodes (llm_generate, llm_tool_use, router, function)
|
||||
- Adding nodes (event_loop only)
|
||||
- Connecting nodes with edges
|
||||
- Validating and exporting agent graphs
|
||||
- Testing nodes and full agent graphs
|
||||
|
||||
@@ -68,7 +68,7 @@ from framework.graph.event_loop_node import ( # noqa: E402
|
||||
)
|
||||
from framework.graph.executor import GraphExecutor # noqa: E402
|
||||
from framework.graph.goal import Goal # noqa: E402
|
||||
from framework.graph.node import NodeSpec # noqa: E402
|
||||
from framework.graph.node import NodeContext, NodeProtocol, NodeResult, NodeSpec # noqa: E402
|
||||
from framework.llm.litellm import LiteLLMProvider # noqa: E402
|
||||
from framework.runner.tool_registry import ToolRegistry # noqa: E402
|
||||
from framework.runtime.core import Runtime # noqa: E402
|
||||
@@ -654,7 +654,7 @@ NODE_SPECS = {
|
||||
id="sender",
|
||||
name="Sender",
|
||||
description="Send approved campaign emails",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
input_keys=["approved_emails"],
|
||||
output_keys=["send_results"],
|
||||
),
|
||||
@@ -823,11 +823,20 @@ def _send_email_via_resend(
|
||||
return {"error": f"Network error: {e}"}
|
||||
|
||||
|
||||
class SenderNode(NodeProtocol):
|
||||
"""Node wrapper for send_emails function."""
|
||||
|
||||
async def execute(self, ctx: NodeContext) -> NodeResult:
|
||||
approved = ctx.input_data.get("approved_emails", "")
|
||||
result_str = send_emails(approved_emails=approved)
|
||||
ctx.memory.write("send_results", result_str)
|
||||
return NodeResult(success=True, output={"send_results": result_str})
|
||||
|
||||
|
||||
def send_emails(approved_emails: str = "") -> str:
|
||||
"""Send approved campaign emails via Resend, or log if unconfigured.
|
||||
|
||||
Called by FunctionNode which unpacks input_keys as kwargs.
|
||||
Returns a JSON string (FunctionNode wraps it in NodeResult).
|
||||
Returns a JSON string.
|
||||
"""
|
||||
approved = approved_emails
|
||||
if not approved:
|
||||
@@ -1780,7 +1789,7 @@ async def _run_pipeline(websocket, initial_message: str):
|
||||
)
|
||||
for nid, impl in nodes.items():
|
||||
executor.register_node(nid, impl)
|
||||
executor.register_function("sender", send_emails)
|
||||
executor.register_node("sender", SenderNode())
|
||||
|
||||
# --- Event forwarding: bus → WebSocket ---
|
||||
|
||||
|
||||
@@ -4,8 +4,8 @@ Minimal Manual Agent Example
|
||||
This example demonstrates how to build and run an agent programmatically
|
||||
without using the Claude Code CLI or external LLM APIs.
|
||||
|
||||
It uses 'function' nodes to define logic in pure Python, making it perfect
|
||||
for understanding the core runtime loop:
|
||||
It uses custom NodeProtocol implementations to define logic in pure Python,
|
||||
making it perfect for understanding the core runtime loop:
|
||||
Setup -> Graph definition -> Execution -> Result
|
||||
|
||||
Run with:
|
||||
@@ -16,22 +16,33 @@ import asyncio
|
||||
|
||||
from framework.graph import EdgeCondition, EdgeSpec, Goal, GraphSpec, NodeSpec
|
||||
from framework.graph.executor import GraphExecutor
|
||||
from framework.graph.node import NodeContext, NodeProtocol, NodeResult
|
||||
from framework.runtime.core import Runtime
|
||||
|
||||
|
||||
# 1. Define Node Logic (Pure Python Functions)
|
||||
def greet(name: str) -> str:
|
||||
# 1. Define Node Logic (Custom NodeProtocol implementations)
|
||||
class GreeterNode(NodeProtocol):
|
||||
"""Generate a simple greeting."""
|
||||
return f"Hello, {name}!"
|
||||
|
||||
async def execute(self, ctx: NodeContext) -> NodeResult:
|
||||
name = ctx.input_data.get("name", "World")
|
||||
greeting = f"Hello, {name}!"
|
||||
ctx.memory.write("greeting", greeting)
|
||||
return NodeResult(success=True, output={"greeting": greeting})
|
||||
|
||||
|
||||
def uppercase(greeting: str) -> str:
|
||||
class UppercaserNode(NodeProtocol):
|
||||
"""Convert text to uppercase."""
|
||||
return greeting.upper()
|
||||
|
||||
async def execute(self, ctx: NodeContext) -> NodeResult:
|
||||
greeting = ctx.input_data.get("greeting") or ctx.memory.read("greeting") or ""
|
||||
result = greeting.upper()
|
||||
ctx.memory.write("final_greeting", result)
|
||||
return NodeResult(success=True, output={"final_greeting": result})
|
||||
|
||||
|
||||
async def main():
|
||||
print("🚀 Setting up Manual Agent...")
|
||||
print("Setting up Manual Agent...")
|
||||
|
||||
# 2. Define the Goal
|
||||
# Every agent needs a goal with success criteria
|
||||
@@ -55,8 +66,7 @@ async def main():
|
||||
id="greeter",
|
||||
name="Greeter",
|
||||
description="Generates a simple greeting",
|
||||
node_type="function",
|
||||
function="greet", # Matches the registered function name
|
||||
node_type="event_loop",
|
||||
input_keys=["name"],
|
||||
output_keys=["greeting"],
|
||||
)
|
||||
@@ -65,8 +75,7 @@ async def main():
|
||||
id="uppercaser",
|
||||
name="Uppercaser",
|
||||
description="Converts greeting to uppercase",
|
||||
node_type="function",
|
||||
function="uppercase",
|
||||
node_type="event_loop",
|
||||
input_keys=["greeting"],
|
||||
output_keys=["final_greeting"],
|
||||
)
|
||||
@@ -98,23 +107,23 @@ async def main():
|
||||
runtime = Runtime(storage_path=Path("./agent_logs"))
|
||||
executor = GraphExecutor(runtime=runtime)
|
||||
|
||||
# 7. Register Function Implementations
|
||||
# Connect string names in NodeSpecs to actual Python functions
|
||||
executor.register_function("greeter", greet)
|
||||
executor.register_function("uppercaser", uppercase)
|
||||
# 7. Register Node Implementations
|
||||
# Connect node IDs in the graph to actual Python implementations
|
||||
executor.register_node("greeter", GreeterNode())
|
||||
executor.register_node("uppercaser", UppercaserNode())
|
||||
|
||||
# 8. Execute Agent
|
||||
print("▶ Executing agent with input: name='Alice'...")
|
||||
print("Executing agent with input: name='Alice'...")
|
||||
|
||||
result = await executor.execute(graph=graph, goal=goal, input_data={"name": "Alice"})
|
||||
|
||||
# 9. Verify Results
|
||||
if result.success:
|
||||
print("\n✅ Success!")
|
||||
print("\nSuccess!")
|
||||
print(f"Path taken: {' -> '.join(result.path)}")
|
||||
print(f"Final output: {result.output.get('final_greeting')}")
|
||||
else:
|
||||
print(f"\n❌ Failed: {result.error}")
|
||||
print(f"\nFailed: {result.error}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -122,7 +122,7 @@ async def example_4_custom_agent_with_mcp_tools():
|
||||
node_id="web-searcher",
|
||||
name="Web Search",
|
||||
description="Search the web for information",
|
||||
node_type="llm_tool_use",
|
||||
node_type="event_loop",
|
||||
system_prompt="Search for {query} and return the top results. Use the web_search tool.",
|
||||
tools=["web_search"], # This tool comes from tools MCP server
|
||||
input_keys=["query"],
|
||||
@@ -133,7 +133,7 @@ async def example_4_custom_agent_with_mcp_tools():
|
||||
node_id="summarizer",
|
||||
name="Summarize Results",
|
||||
description="Summarize the search results",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
system_prompt="Summarize the following search results in 2-3 sentences: {search_results}",
|
||||
input_keys=["search_results"],
|
||||
output_keys=["summary"],
|
||||
|
||||
@@ -245,20 +245,14 @@ class GraphBuilder:
|
||||
warnings.append(f"Node '{node.id}' should have a description")
|
||||
|
||||
# Type-specific validation
|
||||
if node.node_type == "llm_tool_use":
|
||||
if not node.tools:
|
||||
errors.append(f"LLM tool node '{node.id}' must specify tools")
|
||||
if not node.system_prompt:
|
||||
warnings.append(f"LLM node '{node.id}' should have a system_prompt")
|
||||
if node.node_type == "event_loop":
|
||||
if node.tools and not node.system_prompt:
|
||||
warnings.append(f"Event loop node '{node.id}' should have a system_prompt")
|
||||
|
||||
if node.node_type == "router":
|
||||
if not node.routes:
|
||||
errors.append(f"Router node '{node.id}' must specify routes")
|
||||
|
||||
if node.node_type == "function":
|
||||
if not node.function:
|
||||
errors.append(f"Function node '{node.id}' must specify function name")
|
||||
|
||||
# Check input/output keys
|
||||
if not node.input_keys:
|
||||
suggestions.append(f"Consider specifying input_keys for '{node.id}'")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Graph structures: Goals, Nodes, Edges, and Flexible Execution."""
|
||||
"""Graph structures: Goals, Nodes, Edges, and Execution."""
|
||||
|
||||
from framework.graph.client_io import (
|
||||
ActiveNodeClientIO,
|
||||
@@ -6,7 +6,6 @@ from framework.graph.client_io import (
|
||||
InertNodeClientIO,
|
||||
NodeClientIO,
|
||||
)
|
||||
from framework.graph.code_sandbox import CodeSandbox, safe_eval, safe_exec
|
||||
from framework.graph.context_handoff import ContextHandoff, HandoffContext
|
||||
from framework.graph.conversation import ConversationStore, Message, NodeConversation
|
||||
from framework.graph.edge import DEFAULT_MAX_TOKENS, EdgeCondition, EdgeSpec, GraphSpec
|
||||
@@ -18,31 +17,9 @@ from framework.graph.event_loop_node import (
|
||||
OutputAccumulator,
|
||||
)
|
||||
from framework.graph.executor import GraphExecutor
|
||||
from framework.graph.flexible_executor import ExecutorConfig, FlexibleGraphExecutor
|
||||
from framework.graph.goal import Constraint, Goal, GoalStatus, SuccessCriterion
|
||||
from framework.graph.judge import HybridJudge, create_default_judge
|
||||
from framework.graph.node import NodeContext, NodeProtocol, NodeResult, NodeSpec
|
||||
|
||||
# Flexible execution (Worker-Judge pattern)
|
||||
from framework.graph.plan import (
|
||||
ActionSpec,
|
||||
ActionType,
|
||||
# HITL (Human-in-the-loop)
|
||||
ApprovalDecision,
|
||||
ApprovalRequest,
|
||||
ApprovalResult,
|
||||
EvaluationRule,
|
||||
ExecutionStatus,
|
||||
Judgment,
|
||||
JudgmentAction,
|
||||
Plan,
|
||||
PlanExecutionResult,
|
||||
PlanStep,
|
||||
StepStatus,
|
||||
load_export,
|
||||
)
|
||||
from framework.graph.worker_node import StepExecutionResult, WorkerNode
|
||||
|
||||
__all__ = [
|
||||
# Goal
|
||||
"Goal",
|
||||
@@ -59,35 +36,8 @@ __all__ = [
|
||||
"EdgeCondition",
|
||||
"GraphSpec",
|
||||
"DEFAULT_MAX_TOKENS",
|
||||
# Executor (fixed graph)
|
||||
# Executor
|
||||
"GraphExecutor",
|
||||
# Plan (flexible execution)
|
||||
"Plan",
|
||||
"PlanStep",
|
||||
"ActionSpec",
|
||||
"ActionType",
|
||||
"StepStatus",
|
||||
"Judgment",
|
||||
"JudgmentAction",
|
||||
"EvaluationRule",
|
||||
"PlanExecutionResult",
|
||||
"ExecutionStatus",
|
||||
"load_export",
|
||||
# HITL (Human-in-the-loop)
|
||||
"ApprovalDecision",
|
||||
"ApprovalRequest",
|
||||
"ApprovalResult",
|
||||
# Worker-Judge
|
||||
"HybridJudge",
|
||||
"create_default_judge",
|
||||
"WorkerNode",
|
||||
"StepExecutionResult",
|
||||
"FlexibleGraphExecutor",
|
||||
"ExecutorConfig",
|
||||
# Code Sandbox
|
||||
"CodeSandbox",
|
||||
"safe_exec",
|
||||
"safe_eval",
|
||||
# Conversation
|
||||
"NodeConversation",
|
||||
"ConversationStore",
|
||||
|
||||
@@ -1,413 +0,0 @@
|
||||
"""
|
||||
Code Sandbox for Safe Execution of Dynamic Code.
|
||||
|
||||
Provides a restricted execution environment for code generated by
|
||||
the external planner. This is critical for open-ended planning where
|
||||
the planner can create arbitrary code actions.
|
||||
|
||||
Security measures:
|
||||
1. Restricted builtins (no file I/O, no imports of dangerous modules)
|
||||
2. Timeout enforcement
|
||||
3. Memory limits (via resource module on Unix)
|
||||
4. Namespace isolation
|
||||
"""
|
||||
|
||||
import ast
|
||||
import signal
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
# Safe builtins whitelist
|
||||
SAFE_BUILTINS = {
|
||||
# Basic types
|
||||
"True": True,
|
||||
"False": False,
|
||||
"None": None,
|
||||
# Type constructors
|
||||
"bool": bool,
|
||||
"int": int,
|
||||
"float": float,
|
||||
"str": str,
|
||||
"list": list,
|
||||
"dict": dict,
|
||||
"set": set,
|
||||
"tuple": tuple,
|
||||
"frozenset": frozenset,
|
||||
# Basic functions
|
||||
"abs": abs,
|
||||
"all": all,
|
||||
"any": any,
|
||||
"bin": bin,
|
||||
"chr": chr,
|
||||
"divmod": divmod,
|
||||
"enumerate": enumerate,
|
||||
"filter": filter,
|
||||
"format": format,
|
||||
"hex": hex,
|
||||
"isinstance": isinstance,
|
||||
"issubclass": issubclass,
|
||||
"iter": iter,
|
||||
"len": len,
|
||||
"map": map,
|
||||
"max": max,
|
||||
"min": min,
|
||||
"next": next,
|
||||
"oct": oct,
|
||||
"ord": ord,
|
||||
"pow": pow,
|
||||
"range": range,
|
||||
"repr": repr,
|
||||
"reversed": reversed,
|
||||
"round": round,
|
||||
"slice": slice,
|
||||
"sorted": sorted,
|
||||
"sum": sum,
|
||||
"zip": zip,
|
||||
}
|
||||
|
||||
# Modules that can be imported
|
||||
ALLOWED_MODULES = {
|
||||
"math",
|
||||
"json",
|
||||
"re",
|
||||
"datetime",
|
||||
"collections",
|
||||
"itertools",
|
||||
"functools",
|
||||
"operator",
|
||||
"string",
|
||||
"random",
|
||||
"statistics",
|
||||
"decimal",
|
||||
"fractions",
|
||||
}
|
||||
|
||||
# Dangerous AST nodes to block
|
||||
BLOCKED_AST_NODES = {
|
||||
ast.Import,
|
||||
ast.ImportFrom,
|
||||
ast.Global,
|
||||
ast.Nonlocal,
|
||||
}
|
||||
|
||||
|
||||
class CodeSandboxError(Exception):
|
||||
"""Error during sandboxed code execution."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class TimeoutError(CodeSandboxError):
|
||||
"""Code execution timed out."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class SecurityError(CodeSandboxError):
|
||||
"""Code contains potentially dangerous operations."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class SandboxResult:
|
||||
"""Result of sandboxed code execution."""
|
||||
|
||||
success: bool
|
||||
result: Any = None
|
||||
error: str | None = None
|
||||
stdout: str = ""
|
||||
variables: dict[str, Any] = field(default_factory=dict)
|
||||
execution_time_ms: int = 0
|
||||
|
||||
|
||||
class RestrictedImporter:
|
||||
"""Custom importer that only allows whitelisted modules."""
|
||||
|
||||
def __init__(self, allowed_modules: set[str]):
|
||||
self.allowed_modules = allowed_modules
|
||||
self._cache: dict[str, Any] = {}
|
||||
|
||||
def __call__(self, name: str, *args, **kwargs):
|
||||
if name not in self.allowed_modules:
|
||||
raise SecurityError(f"Import of module '{name}' is not allowed")
|
||||
|
||||
if name not in self._cache:
|
||||
import importlib
|
||||
|
||||
self._cache[name] = importlib.import_module(name)
|
||||
|
||||
return self._cache[name]
|
||||
|
||||
|
||||
class CodeValidator:
|
||||
"""Validates code for safety before execution."""
|
||||
|
||||
def __init__(self, blocked_nodes: set[type] | None = None):
|
||||
self.blocked_nodes = blocked_nodes or BLOCKED_AST_NODES
|
||||
|
||||
def validate(self, code: str) -> list[str]:
|
||||
"""
|
||||
Validate code and return list of issues.
|
||||
|
||||
Returns empty list if code is safe.
|
||||
"""
|
||||
issues = []
|
||||
|
||||
try:
|
||||
tree = ast.parse(code)
|
||||
except SyntaxError as e:
|
||||
return [f"Syntax error: {e}"]
|
||||
|
||||
for node in ast.walk(tree):
|
||||
# Check for blocked node types
|
||||
if type(node) in self.blocked_nodes:
|
||||
lineno = getattr(node, "lineno", "?")
|
||||
issues.append(f"Blocked operation: {type(node).__name__} at line {lineno}")
|
||||
|
||||
# Check for dangerous attribute access
|
||||
if isinstance(node, ast.Attribute):
|
||||
if node.attr.startswith("_"):
|
||||
issues.append(
|
||||
f"Access to private attribute '{node.attr}' at line {node.lineno}"
|
||||
)
|
||||
|
||||
# Check for exec/eval calls
|
||||
if isinstance(node, ast.Call):
|
||||
if isinstance(node.func, ast.Name):
|
||||
if node.func.id in ("exec", "eval", "compile", "__import__"):
|
||||
issues.append(
|
||||
f"Blocked function call: {node.func.id} at line {node.lineno}"
|
||||
)
|
||||
|
||||
return issues
|
||||
|
||||
|
||||
class CodeSandbox:
|
||||
"""
|
||||
Sandboxed environment for executing dynamic code.
|
||||
|
||||
Usage:
|
||||
sandbox = CodeSandbox(timeout_seconds=5)
|
||||
result = sandbox.execute(
|
||||
code="x = 1 + 2\\nresult = x * 3",
|
||||
inputs={"multiplier": 2},
|
||||
)
|
||||
if result.success:
|
||||
print(result.variables["result"]) # 6
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout_seconds: int = 10,
|
||||
allowed_modules: set[str] | None = None,
|
||||
safe_builtins: dict[str, Any] | None = None,
|
||||
):
|
||||
self.timeout_seconds = timeout_seconds
|
||||
self.allowed_modules = allowed_modules or ALLOWED_MODULES
|
||||
self.safe_builtins = safe_builtins or SAFE_BUILTINS
|
||||
self.validator = CodeValidator()
|
||||
self.importer = RestrictedImporter(self.allowed_modules)
|
||||
|
||||
@contextmanager
|
||||
def _timeout_context(self, seconds: int):
|
||||
"""Context manager for timeout enforcement."""
|
||||
|
||||
def handler(signum, frame):
|
||||
raise TimeoutError(f"Code execution timed out after {seconds} seconds")
|
||||
|
||||
# Only works on Unix-like systems
|
||||
if hasattr(signal, "SIGALRM"):
|
||||
old_handler = signal.signal(signal.SIGALRM, handler)
|
||||
signal.alarm(seconds)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
signal.alarm(0)
|
||||
signal.signal(signal.SIGALRM, old_handler)
|
||||
else:
|
||||
# Windows: no timeout support, just execute
|
||||
yield
|
||||
|
||||
def _create_namespace(self, inputs: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Create isolated namespace for code execution."""
|
||||
namespace = {
|
||||
"__builtins__": dict(self.safe_builtins),
|
||||
"__import__": self.importer,
|
||||
}
|
||||
|
||||
# Add input variables
|
||||
namespace.update(inputs)
|
||||
|
||||
return namespace
|
||||
|
||||
def execute(
|
||||
self,
|
||||
code: str,
|
||||
inputs: dict[str, Any] | None = None,
|
||||
extract_vars: list[str] | None = None,
|
||||
) -> SandboxResult:
|
||||
"""
|
||||
Execute code in sandbox.
|
||||
|
||||
Args:
|
||||
code: Python code to execute
|
||||
inputs: Variables to inject into namespace
|
||||
extract_vars: Variable names to extract from namespace after execution
|
||||
|
||||
Returns:
|
||||
SandboxResult with execution outcome
|
||||
"""
|
||||
import time
|
||||
|
||||
inputs = inputs or {}
|
||||
extract_vars = extract_vars or []
|
||||
|
||||
# Validate code first
|
||||
issues = self.validator.validate(code)
|
||||
if issues:
|
||||
return SandboxResult(
|
||||
success=False,
|
||||
error=f"Code validation failed: {'; '.join(issues)}",
|
||||
)
|
||||
|
||||
# Create isolated namespace
|
||||
namespace = self._create_namespace(inputs)
|
||||
|
||||
# Capture stdout
|
||||
import io
|
||||
|
||||
old_stdout = sys.stdout
|
||||
sys.stdout = captured_stdout = io.StringIO()
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
with self._timeout_context(self.timeout_seconds):
|
||||
# Compile and execute
|
||||
compiled = compile(code, "<sandbox>", "exec")
|
||||
exec(compiled, namespace)
|
||||
|
||||
execution_time_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
# Extract requested variables
|
||||
extracted = {}
|
||||
for var in extract_vars:
|
||||
if var in namespace:
|
||||
extracted[var] = namespace[var]
|
||||
|
||||
# Also extract any new variables (not in inputs or builtins)
|
||||
for key, value in namespace.items():
|
||||
if key not in inputs and key not in self.safe_builtins and not key.startswith("_"):
|
||||
extracted[key] = value
|
||||
|
||||
return SandboxResult(
|
||||
success=True,
|
||||
result=namespace.get("result"), # Convention: 'result' is the return value
|
||||
stdout=captured_stdout.getvalue(),
|
||||
variables=extracted,
|
||||
execution_time_ms=execution_time_ms,
|
||||
)
|
||||
|
||||
except TimeoutError as e:
|
||||
return SandboxResult(
|
||||
success=False,
|
||||
error=str(e),
|
||||
execution_time_ms=self.timeout_seconds * 1000,
|
||||
)
|
||||
|
||||
except SecurityError as e:
|
||||
return SandboxResult(
|
||||
success=False,
|
||||
error=f"Security violation: {e}",
|
||||
execution_time_ms=int((time.time() - start_time) * 1000),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
return SandboxResult(
|
||||
success=False,
|
||||
error=f"{type(e).__name__}: {e}",
|
||||
stdout=captured_stdout.getvalue(),
|
||||
execution_time_ms=int((time.time() - start_time) * 1000),
|
||||
)
|
||||
|
||||
finally:
|
||||
sys.stdout = old_stdout
|
||||
|
||||
def execute_expression(
|
||||
self,
|
||||
expression: str,
|
||||
inputs: dict[str, Any] | None = None,
|
||||
) -> SandboxResult:
|
||||
"""
|
||||
Execute a single expression and return its value.
|
||||
|
||||
Simpler than execute() - just evaluates one expression.
|
||||
"""
|
||||
inputs = inputs or {}
|
||||
|
||||
# Validate
|
||||
try:
|
||||
ast.parse(expression, mode="eval")
|
||||
except SyntaxError as e:
|
||||
return SandboxResult(success=False, error=f"Syntax error: {e}")
|
||||
|
||||
namespace = self._create_namespace(inputs)
|
||||
|
||||
try:
|
||||
with self._timeout_context(self.timeout_seconds):
|
||||
result = eval(expression, namespace)
|
||||
|
||||
return SandboxResult(success=True, result=result)
|
||||
|
||||
except Exception as e:
|
||||
return SandboxResult(
|
||||
success=False,
|
||||
error=f"{type(e).__name__}: {e}",
|
||||
)
|
||||
|
||||
|
||||
# Singleton instance with default settings
|
||||
default_sandbox = CodeSandbox()
|
||||
|
||||
|
||||
def safe_exec(
|
||||
code: str,
|
||||
inputs: dict[str, Any] | None = None,
|
||||
timeout_seconds: int = 10,
|
||||
) -> SandboxResult:
|
||||
"""
|
||||
Convenience function for safe code execution.
|
||||
|
||||
Args:
|
||||
code: Python code to execute
|
||||
inputs: Variables to inject
|
||||
timeout_seconds: Max execution time
|
||||
|
||||
Returns:
|
||||
SandboxResult
|
||||
"""
|
||||
sandbox = CodeSandbox(timeout_seconds=timeout_seconds)
|
||||
return sandbox.execute(code, inputs)
|
||||
|
||||
|
||||
def safe_eval(
|
||||
expression: str,
|
||||
inputs: dict[str, Any] | None = None,
|
||||
timeout_seconds: int = 5,
|
||||
) -> SandboxResult:
|
||||
"""
|
||||
Convenience function for safe expression evaluation.
|
||||
|
||||
Args:
|
||||
expression: Python expression to evaluate
|
||||
inputs: Variables to inject
|
||||
timeout_seconds: Max execution time
|
||||
|
||||
Returns:
|
||||
SandboxResult
|
||||
"""
|
||||
sandbox = CodeSandbox(timeout_seconds=timeout_seconds)
|
||||
return sandbox.execute_expression(expression, inputs)
|
||||
@@ -11,7 +11,6 @@ The executor:
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import warnings
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
@@ -21,13 +20,10 @@ from framework.graph.checkpoint_config import CheckpointConfig
|
||||
from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
|
||||
from framework.graph.goal import Goal
|
||||
from framework.graph.node import (
|
||||
FunctionNode,
|
||||
LLMNode,
|
||||
NodeContext,
|
||||
NodeProtocol,
|
||||
NodeResult,
|
||||
NodeSpec,
|
||||
RouterNode,
|
||||
SharedMemory,
|
||||
)
|
||||
from framework.graph.output_cleaner import CleansingConfig, OutputCleaner
|
||||
@@ -837,9 +833,13 @@ class GraphExecutor:
|
||||
# [CORRECTED] Use node_spec.max_retries instead of hardcoded 3
|
||||
max_retries = getattr(node_spec, "max_retries", 3)
|
||||
|
||||
# Event loop nodes handle retry internally via judge —
|
||||
# executor retry is catastrophic (retry multiplication)
|
||||
if node_spec.node_type == "event_loop" and max_retries > 0:
|
||||
# EventLoopNode instances handle retry internally via judge —
|
||||
# executor retry would cause catastrophic retry multiplication.
|
||||
# Only override for actual EventLoopNode instances, not custom
|
||||
# NodeProtocol implementations that happen to use node_type="event_loop"
|
||||
from framework.graph.event_loop_node import EventLoopNode
|
||||
|
||||
if isinstance(node_impl, EventLoopNode) and max_retries > 0:
|
||||
self.logger.warning(
|
||||
f"EventLoopNode '{node_spec.id}' has max_retries={max_retries}. "
|
||||
"Overriding to 0 — event loop nodes handle retry internally via judge."
|
||||
@@ -1471,16 +1471,17 @@ class GraphExecutor:
|
||||
event_triggered=event_triggered,
|
||||
)
|
||||
|
||||
# Valid node types - no ambiguous "llm" type allowed
|
||||
VALID_NODE_TYPES = {
|
||||
"llm_tool_use",
|
||||
"llm_generate",
|
||||
"router",
|
||||
"function",
|
||||
"human_input",
|
||||
"event_loop",
|
||||
}
|
||||
DEPRECATED_NODE_TYPES = {"llm_tool_use": "event_loop", "llm_generate": "event_loop"}
|
||||
# Node types removed in v0.5 — provide migration guidance
|
||||
REMOVED_NODE_TYPES = {
|
||||
"function": "event_loop",
|
||||
"llm_tool_use": "event_loop",
|
||||
"llm_generate": "event_loop",
|
||||
"router": "event_loop", # Unused theoretical infrastructure
|
||||
"human_input": "event_loop", # Use client_facing=True instead
|
||||
}
|
||||
|
||||
def _get_node_implementation(
|
||||
self, node_spec: NodeSpec, cleanup_llm_model: str | None = None
|
||||
@@ -1490,62 +1491,23 @@ class GraphExecutor:
|
||||
if node_spec.id in self.node_registry:
|
||||
return self.node_registry[node_spec.id]
|
||||
|
||||
# Reject removed node types with migration guidance
|
||||
if node_spec.node_type in self.REMOVED_NODE_TYPES:
|
||||
replacement = self.REMOVED_NODE_TYPES[node_spec.node_type]
|
||||
raise RuntimeError(
|
||||
f"Node type '{node_spec.node_type}' was removed in v0.5. "
|
||||
f"Migrate node '{node_spec.id}' to '{replacement}'. "
|
||||
f"See https://github.com/adenhq/hive/issues/4753 for migration guide."
|
||||
)
|
||||
|
||||
# Validate node type
|
||||
if node_spec.node_type not in self.VALID_NODE_TYPES:
|
||||
raise RuntimeError(
|
||||
f"Invalid node type '{node_spec.node_type}' for node '{node_spec.id}'. "
|
||||
f"Must be one of: {sorted(self.VALID_NODE_TYPES)}. "
|
||||
f"Use 'llm_tool_use' for nodes that call tools, 'llm_generate' for text generation."
|
||||
)
|
||||
|
||||
# Warn on deprecated node types
|
||||
if node_spec.node_type in self.DEPRECATED_NODE_TYPES:
|
||||
replacement = self.DEPRECATED_NODE_TYPES[node_spec.node_type]
|
||||
warnings.warn(
|
||||
f"Node type '{node_spec.node_type}' is deprecated. "
|
||||
f"Use '{replacement}' instead. "
|
||||
f"Node: '{node_spec.id}'",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
# Create based on type
|
||||
if node_spec.node_type == "llm_tool_use":
|
||||
if not node_spec.tools:
|
||||
raise RuntimeError(
|
||||
f"Node '{node_spec.id}' is type 'llm_tool_use' but declares no tools. "
|
||||
"Either add tools to the node or change type to 'llm_generate'."
|
||||
)
|
||||
return LLMNode(
|
||||
tool_executor=self.tool_executor,
|
||||
require_tools=True,
|
||||
cleanup_llm_model=cleanup_llm_model,
|
||||
)
|
||||
|
||||
if node_spec.node_type == "llm_generate":
|
||||
return LLMNode(
|
||||
tool_executor=None,
|
||||
require_tools=False,
|
||||
cleanup_llm_model=cleanup_llm_model,
|
||||
)
|
||||
|
||||
if node_spec.node_type == "router":
|
||||
return RouterNode()
|
||||
|
||||
if node_spec.node_type == "function":
|
||||
# Function nodes need explicit registration
|
||||
raise RuntimeError(
|
||||
f"Function node '{node_spec.id}' not registered. Register with node_registry."
|
||||
)
|
||||
|
||||
if node_spec.node_type == "human_input":
|
||||
# Human input nodes are handled specially by HITL mechanism
|
||||
return LLMNode(
|
||||
tool_executor=None,
|
||||
require_tools=False,
|
||||
cleanup_llm_model=cleanup_llm_model,
|
||||
f"Must be one of: {sorted(self.VALID_NODE_TYPES)}."
|
||||
)
|
||||
|
||||
# Create based on type (only event_loop is valid)
|
||||
if node_spec.node_type == "event_loop":
|
||||
# Auto-create EventLoopNode with sensible defaults.
|
||||
# Custom configs can still be pre-registered via node_registry.
|
||||
@@ -1805,9 +1767,14 @@ class GraphExecutor:
|
||||
branch.error = f"Node {branch.node_id} not found in graph"
|
||||
return branch, RuntimeError(branch.error)
|
||||
|
||||
# Get node implementation to check its type
|
||||
branch_impl = self._get_node_implementation(node_spec, graph.cleanup_llm_model)
|
||||
|
||||
effective_max_retries = node_spec.max_retries
|
||||
if node_spec.node_type == "event_loop":
|
||||
if effective_max_retries > 1:
|
||||
# Only override for actual EventLoopNode instances, not custom NodeProtocol impls
|
||||
from framework.graph.event_loop_node import EventLoopNode
|
||||
|
||||
if isinstance(branch_impl, EventLoopNode) and effective_max_retries > 1:
|
||||
self.logger.warning(
|
||||
f"EventLoopNode '{node_spec.id}' has "
|
||||
f"max_retries={effective_max_retries}. Overriding "
|
||||
@@ -1978,10 +1945,6 @@ class GraphExecutor:
|
||||
"""Register a custom node implementation."""
|
||||
self.node_registry[node_id] = implementation
|
||||
|
||||
def register_function(self, node_id: str, func: Callable) -> None:
|
||||
"""Register a function as a node."""
|
||||
self.node_registry[node_id] = FunctionNode(func)
|
||||
|
||||
def request_pause(self) -> None:
|
||||
"""
|
||||
Request graceful pause of the current execution.
|
||||
|
||||
@@ -1,552 +0,0 @@
|
||||
"""
|
||||
Flexible Graph Executor with Worker-Judge Loop.
|
||||
|
||||
Executes plans created by external planner (Claude Code, etc.)
|
||||
using a Worker-Judge loop:
|
||||
|
||||
1. External planner creates Plan
|
||||
2. FlexibleGraphExecutor receives Plan
|
||||
3. Worker executes each step
|
||||
4. Judge evaluates each result
|
||||
5. If Judge says "replan" → return to external planner with feedback
|
||||
6. If Judge says "escalate" → request human intervention
|
||||
7. If all steps complete → return success
|
||||
|
||||
This keeps planning external while execution/evaluation is internal.
|
||||
"""
|
||||
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from framework.graph.code_sandbox import CodeSandbox
|
||||
from framework.graph.goal import Goal
|
||||
from framework.graph.judge import HybridJudge, create_default_judge
|
||||
from framework.graph.plan import (
|
||||
ApprovalDecision,
|
||||
ApprovalRequest,
|
||||
ApprovalResult,
|
||||
ExecutionStatus,
|
||||
Judgment,
|
||||
JudgmentAction,
|
||||
Plan,
|
||||
PlanExecutionResult,
|
||||
PlanStep,
|
||||
StepStatus,
|
||||
)
|
||||
from framework.graph.worker_node import StepExecutionResult, WorkerNode
|
||||
from framework.llm.provider import LLMProvider, Tool
|
||||
from framework.runtime.core import Runtime
|
||||
|
||||
# Type alias for approval callback
|
||||
ApprovalCallback = Callable[[ApprovalRequest], ApprovalResult]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExecutorConfig:
|
||||
"""Configuration for FlexibleGraphExecutor."""
|
||||
|
||||
max_retries_per_step: int = 3
|
||||
max_total_steps: int = 100
|
||||
timeout_seconds: int = 300
|
||||
enable_parallel_execution: bool = False # Future: parallel step execution
|
||||
|
||||
|
||||
class FlexibleGraphExecutor:
|
||||
"""
|
||||
Executes plans with Worker-Judge loop.
|
||||
|
||||
Plans come from external source (Claude Code, etc.).
|
||||
Returns feedback for replanning if needed.
|
||||
|
||||
Usage:
|
||||
executor = FlexibleGraphExecutor(
|
||||
runtime=runtime,
|
||||
llm=llm_provider,
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
result = await executor.execute_plan(plan, goal, context)
|
||||
|
||||
if result.status == ExecutionStatus.NEEDS_REPLAN:
|
||||
# External planner should create new plan using result.feedback
|
||||
new_plan = external_planner.replan(result.feedback_context)
|
||||
result = await executor.execute_plan(new_plan, goal, result.feedback_context)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
runtime: Runtime,
|
||||
llm: LLMProvider | None = None,
|
||||
tools: dict[str, Tool] | None = None,
|
||||
tool_executor: Callable | None = None,
|
||||
functions: dict[str, Callable] | None = None,
|
||||
judge: HybridJudge | None = None,
|
||||
config: ExecutorConfig | None = None,
|
||||
approval_callback: ApprovalCallback | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize the FlexibleGraphExecutor.
|
||||
|
||||
Args:
|
||||
runtime: Runtime for decision logging
|
||||
llm: LLM provider for Worker and Judge
|
||||
tools: Available tools
|
||||
tool_executor: Function to execute tools
|
||||
functions: Registered functions
|
||||
judge: Custom judge (defaults to HybridJudge with default rules)
|
||||
config: Executor configuration
|
||||
approval_callback: Callback for human-in-the-loop approval.
|
||||
If None, steps requiring approval will pause execution.
|
||||
"""
|
||||
self.runtime = runtime
|
||||
self.llm = llm
|
||||
self.tools = tools or {}
|
||||
self.tool_executor = tool_executor
|
||||
self.functions = functions or {}
|
||||
self.config = config or ExecutorConfig()
|
||||
self.approval_callback = approval_callback
|
||||
|
||||
# Create judge
|
||||
self.judge = judge or create_default_judge(llm)
|
||||
|
||||
# Create worker
|
||||
self.worker = WorkerNode(
|
||||
runtime=runtime,
|
||||
llm=llm,
|
||||
tools=tools,
|
||||
tool_executor=tool_executor,
|
||||
functions=functions,
|
||||
sandbox=CodeSandbox(),
|
||||
)
|
||||
|
||||
async def execute_plan(
|
||||
self,
|
||||
plan: Plan,
|
||||
goal: Goal,
|
||||
context: dict[str, Any] | None = None,
|
||||
) -> PlanExecutionResult:
|
||||
"""
|
||||
Execute a plan created by external planner.
|
||||
|
||||
Args:
|
||||
plan: The plan to execute
|
||||
goal: The goal context
|
||||
context: Initial context (e.g., from previous execution)
|
||||
|
||||
Returns:
|
||||
PlanExecutionResult with status and feedback
|
||||
"""
|
||||
context = context or {}
|
||||
context.update(plan.context) # Merge plan's accumulated context
|
||||
|
||||
# Start run
|
||||
_run_id = self.runtime.start_run(
|
||||
goal_id=goal.id,
|
||||
goal_description=goal.description,
|
||||
input_data={"plan_id": plan.id, "revision": plan.revision},
|
||||
)
|
||||
|
||||
steps_executed = 0
|
||||
total_tokens = 0
|
||||
total_latency = 0
|
||||
|
||||
try:
|
||||
while steps_executed < self.config.max_total_steps:
|
||||
# Get next ready steps
|
||||
ready_steps = plan.get_ready_steps()
|
||||
|
||||
if not ready_steps:
|
||||
# Check if we're done or stuck
|
||||
if plan.is_complete():
|
||||
break
|
||||
else:
|
||||
# No ready steps but not complete - something's wrong
|
||||
return self._create_result(
|
||||
status=ExecutionStatus.NEEDS_REPLAN,
|
||||
plan=plan,
|
||||
context=context,
|
||||
feedback=(
|
||||
"No executable steps available but plan not complete. "
|
||||
"Check dependencies."
|
||||
),
|
||||
steps_executed=steps_executed,
|
||||
total_tokens=total_tokens,
|
||||
total_latency=total_latency,
|
||||
)
|
||||
|
||||
# Execute next step (for now, sequential; could be parallel)
|
||||
step = ready_steps[0]
|
||||
# Debug: show ready steps
|
||||
# ready_ids = [s.id for s in ready_steps]
|
||||
# print(f" [DEBUG] Ready steps: {ready_ids}, executing: {step.id}")
|
||||
|
||||
# APPROVAL CHECK - before execution
|
||||
if step.requires_approval:
|
||||
approval_result = await self._request_approval(step, context)
|
||||
|
||||
if approval_result is None:
|
||||
# No callback, pause execution
|
||||
step.status = StepStatus.AWAITING_APPROVAL
|
||||
return self._create_result(
|
||||
status=ExecutionStatus.AWAITING_APPROVAL,
|
||||
plan=plan,
|
||||
context=context,
|
||||
feedback=f"Step '{step.id}' requires approval: {step.description}",
|
||||
steps_executed=steps_executed,
|
||||
total_tokens=total_tokens,
|
||||
total_latency=total_latency,
|
||||
)
|
||||
|
||||
if approval_result.decision == ApprovalDecision.REJECT:
|
||||
step.status = StepStatus.REJECTED
|
||||
step.error = approval_result.reason or "Rejected by human"
|
||||
# Skip this step and continue with dependents marked as skipped
|
||||
self._skip_dependent_steps(plan, step.id)
|
||||
continue
|
||||
|
||||
if approval_result.decision == ApprovalDecision.ABORT:
|
||||
return self._create_result(
|
||||
status=ExecutionStatus.ABORTED,
|
||||
plan=plan,
|
||||
context=context,
|
||||
feedback=approval_result.reason or "Aborted by human",
|
||||
steps_executed=steps_executed,
|
||||
total_tokens=total_tokens,
|
||||
total_latency=total_latency,
|
||||
)
|
||||
|
||||
if approval_result.decision == ApprovalDecision.MODIFY:
|
||||
# Apply modifications to step
|
||||
if approval_result.modifications:
|
||||
self._apply_modifications(step, approval_result.modifications)
|
||||
|
||||
# APPROVE - continue to execution
|
||||
|
||||
step.status = StepStatus.IN_PROGRESS
|
||||
step.started_at = datetime.now()
|
||||
step.attempts += 1
|
||||
|
||||
# WORK
|
||||
work_result = await self.worker.execute(step, context)
|
||||
steps_executed += 1
|
||||
total_tokens += work_result.tokens_used
|
||||
total_latency += work_result.latency_ms
|
||||
|
||||
# JUDGE
|
||||
judgment = await self.judge.evaluate(
|
||||
step=step,
|
||||
result=work_result.__dict__,
|
||||
goal=goal,
|
||||
context=context,
|
||||
)
|
||||
|
||||
# Handle judgment
|
||||
result = await self._handle_judgment(
|
||||
step=step,
|
||||
work_result=work_result,
|
||||
judgment=judgment,
|
||||
plan=plan,
|
||||
goal=goal,
|
||||
context=context,
|
||||
steps_executed=steps_executed,
|
||||
total_tokens=total_tokens,
|
||||
total_latency=total_latency,
|
||||
)
|
||||
|
||||
if result is not None:
|
||||
# Judgment resulted in early return (replan/escalate)
|
||||
self.runtime.end_run(
|
||||
success=False,
|
||||
narrative=f"Execution stopped: {result.status.value}",
|
||||
)
|
||||
return result
|
||||
|
||||
# All steps completed successfully
|
||||
self.runtime.end_run(
|
||||
success=True,
|
||||
output_data=context,
|
||||
narrative=f"Plan completed: {steps_executed} steps executed",
|
||||
)
|
||||
|
||||
return self._create_result(
|
||||
status=ExecutionStatus.COMPLETED,
|
||||
plan=plan,
|
||||
context=context,
|
||||
steps_executed=steps_executed,
|
||||
total_tokens=total_tokens,
|
||||
total_latency=total_latency,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.runtime.report_problem(
|
||||
severity="critical",
|
||||
description=str(e),
|
||||
)
|
||||
self.runtime.end_run(
|
||||
success=False,
|
||||
narrative=f"Execution failed: {e}",
|
||||
)
|
||||
|
||||
return PlanExecutionResult(
|
||||
status=ExecutionStatus.FAILED,
|
||||
error=str(e),
|
||||
feedback=f"Execution error: {e}",
|
||||
feedback_context=plan.to_feedback_context(),
|
||||
completed_steps=[s.id for s in plan.get_completed_steps()],
|
||||
steps_executed=steps_executed,
|
||||
total_tokens=total_tokens,
|
||||
total_latency_ms=total_latency,
|
||||
)
|
||||
|
||||
async def _handle_judgment(
|
||||
self,
|
||||
step: PlanStep,
|
||||
work_result: StepExecutionResult,
|
||||
judgment: Judgment,
|
||||
plan: Plan,
|
||||
goal: Goal,
|
||||
context: dict[str, Any],
|
||||
steps_executed: int,
|
||||
total_tokens: int,
|
||||
total_latency: int,
|
||||
) -> PlanExecutionResult | None:
|
||||
"""
|
||||
Handle judgment and return result if execution should stop.
|
||||
|
||||
Returns None to continue execution, or PlanExecutionResult to stop.
|
||||
"""
|
||||
if judgment.action == JudgmentAction.ACCEPT:
|
||||
# Step succeeded - update state and continue
|
||||
step.status = StepStatus.COMPLETED
|
||||
step.completed_at = datetime.now()
|
||||
step.result = work_result.outputs
|
||||
|
||||
# Map outputs to expected output keys
|
||||
# If output has generic "result" key but step expects specific keys, map it
|
||||
outputs_to_store = work_result.outputs.copy()
|
||||
if step.expected_outputs and "result" in outputs_to_store:
|
||||
result_value = outputs_to_store["result"]
|
||||
# For each expected output key that's not in outputs, map from "result"
|
||||
for expected_key in step.expected_outputs:
|
||||
if expected_key not in outputs_to_store:
|
||||
outputs_to_store[expected_key] = result_value
|
||||
|
||||
# Update context with mapped outputs
|
||||
context.update(outputs_to_store)
|
||||
|
||||
# Store in plan context for replanning feedback
|
||||
plan.context[step.id] = outputs_to_store
|
||||
|
||||
return None # Continue execution
|
||||
|
||||
elif judgment.action == JudgmentAction.RETRY:
|
||||
# Retry step if under limit
|
||||
if step.attempts < step.max_retries:
|
||||
step.status = StepStatus.PENDING
|
||||
step.error = judgment.feedback
|
||||
|
||||
# Record retry decision
|
||||
self.runtime.decide(
|
||||
intent=f"Retry step {step.id}",
|
||||
options=[{"id": "retry", "description": "Retry with feedback"}],
|
||||
chosen="retry",
|
||||
reasoning=judgment.reasoning,
|
||||
context={"attempt": step.attempts, "feedback": judgment.feedback},
|
||||
)
|
||||
|
||||
return None # Continue (step will be retried)
|
||||
else:
|
||||
# Max retries exceeded - escalate to replan
|
||||
step.status = StepStatus.FAILED
|
||||
step.error = f"Max retries ({step.max_retries}) exceeded: {judgment.feedback}"
|
||||
|
||||
return self._create_result(
|
||||
status=ExecutionStatus.NEEDS_REPLAN,
|
||||
plan=plan,
|
||||
context=context,
|
||||
feedback=(
|
||||
f"Step '{step.id}' failed after {step.attempts} attempts: "
|
||||
f"{judgment.feedback}"
|
||||
),
|
||||
steps_executed=steps_executed,
|
||||
total_tokens=total_tokens,
|
||||
total_latency=total_latency,
|
||||
)
|
||||
|
||||
elif judgment.action == JudgmentAction.REPLAN:
|
||||
# Return to external planner
|
||||
step.status = StepStatus.FAILED
|
||||
step.error = judgment.feedback
|
||||
|
||||
return self._create_result(
|
||||
status=ExecutionStatus.NEEDS_REPLAN,
|
||||
plan=plan,
|
||||
context=context,
|
||||
feedback=judgment.feedback or f"Step '{step.id}' requires replanning",
|
||||
steps_executed=steps_executed,
|
||||
total_tokens=total_tokens,
|
||||
total_latency=total_latency,
|
||||
)
|
||||
|
||||
elif judgment.action == JudgmentAction.ESCALATE:
|
||||
# Request human intervention
|
||||
return self._create_result(
|
||||
status=ExecutionStatus.NEEDS_ESCALATION,
|
||||
plan=plan,
|
||||
context=context,
|
||||
feedback=judgment.feedback or f"Step '{step.id}' requires human intervention",
|
||||
steps_executed=steps_executed,
|
||||
total_tokens=total_tokens,
|
||||
total_latency=total_latency,
|
||||
)
|
||||
|
||||
return None # Unknown action - continue
|
||||
|
||||
def _create_result(
|
||||
self,
|
||||
status: ExecutionStatus,
|
||||
plan: Plan,
|
||||
context: dict[str, Any],
|
||||
feedback: str | None = None,
|
||||
steps_executed: int = 0,
|
||||
total_tokens: int = 0,
|
||||
total_latency: int = 0,
|
||||
) -> PlanExecutionResult:
|
||||
"""Create a PlanExecutionResult."""
|
||||
return PlanExecutionResult(
|
||||
status=status,
|
||||
results=context,
|
||||
feedback=feedback,
|
||||
feedback_context=plan.to_feedback_context(),
|
||||
completed_steps=[s.id for s in plan.get_completed_steps()],
|
||||
steps_executed=steps_executed,
|
||||
total_tokens=total_tokens,
|
||||
total_latency_ms=total_latency,
|
||||
)
|
||||
|
||||
def register_function(self, name: str, func: Callable) -> None:
|
||||
"""Register a function for FUNCTION actions."""
|
||||
self.functions[name] = func
|
||||
self.worker.register_function(name, func)
|
||||
|
||||
def register_tool(self, tool: Tool) -> None:
|
||||
"""Register a tool for TOOL_USE actions."""
|
||||
self.tools[tool.name] = tool
|
||||
self.worker.register_tool(tool)
|
||||
|
||||
def add_evaluation_rule(self, rule) -> None:
|
||||
"""Add an evaluation rule to the judge."""
|
||||
self.judge.add_rule(rule)
|
||||
|
||||
async def _request_approval(
|
||||
self,
|
||||
step: PlanStep,
|
||||
context: dict[str, Any],
|
||||
) -> ApprovalResult | None:
|
||||
"""
|
||||
Request human approval for a step.
|
||||
|
||||
Returns None if no callback is set (execution should pause).
|
||||
"""
|
||||
if self.approval_callback is None:
|
||||
return None
|
||||
|
||||
# Build preview of what will happen
|
||||
preview_parts = []
|
||||
if step.action.tool_name:
|
||||
preview_parts.append(f"Tool: {step.action.tool_name}")
|
||||
if step.action.tool_args:
|
||||
import json
|
||||
|
||||
args_preview = json.dumps(step.action.tool_args, indent=2, default=str)
|
||||
if len(args_preview) > 500:
|
||||
args_preview = args_preview[:500] + "..."
|
||||
preview_parts.append(f"Args: {args_preview}")
|
||||
elif step.action.prompt:
|
||||
prompt_preview = (
|
||||
step.action.prompt[:300] + "..."
|
||||
if len(step.action.prompt) > 300
|
||||
else step.action.prompt
|
||||
)
|
||||
preview_parts.append(f"Prompt: {prompt_preview}")
|
||||
|
||||
# Include step inputs resolved from context (what will be sent/used)
|
||||
relevant_context = {}
|
||||
for input_key, input_value in step.inputs.items():
|
||||
# Resolve variable references like "$email_sequence"
|
||||
if isinstance(input_value, str) and input_value.startswith("$"):
|
||||
context_key = input_value[1:] # Remove $ prefix
|
||||
if context_key in context:
|
||||
relevant_context[input_key] = context[context_key]
|
||||
else:
|
||||
relevant_context[input_key] = input_value
|
||||
|
||||
request = ApprovalRequest(
|
||||
step_id=step.id,
|
||||
step_description=step.description,
|
||||
action_type=step.action.action_type.value,
|
||||
action_details={
|
||||
"tool_name": step.action.tool_name,
|
||||
"tool_args": step.action.tool_args,
|
||||
"prompt": step.action.prompt,
|
||||
},
|
||||
context=relevant_context,
|
||||
approval_message=step.approval_message,
|
||||
preview="\n".join(preview_parts) if preview_parts else None,
|
||||
)
|
||||
|
||||
return self.approval_callback(request)
|
||||
|
||||
def _skip_dependent_steps(self, plan: Plan, rejected_step_id: str) -> None:
|
||||
"""Mark steps that depend on a rejected step as skipped."""
|
||||
for step in plan.steps:
|
||||
if rejected_step_id in step.dependencies:
|
||||
if step.status == StepStatus.PENDING:
|
||||
step.status = StepStatus.SKIPPED
|
||||
step.error = f"Skipped because dependency '{rejected_step_id}' was rejected"
|
||||
# Recursively skip dependents
|
||||
self._skip_dependent_steps(plan, step.id)
|
||||
|
||||
def _apply_modifications(self, step: PlanStep, modifications: dict[str, Any]) -> None:
|
||||
"""Apply human modifications to a step before execution."""
|
||||
# Allow modifying tool args
|
||||
if "tool_args" in modifications and step.action.tool_args:
|
||||
step.action.tool_args.update(modifications["tool_args"])
|
||||
|
||||
# Allow modifying prompt
|
||||
if "prompt" in modifications:
|
||||
step.action.prompt = modifications["prompt"]
|
||||
|
||||
# Allow modifying inputs
|
||||
if "inputs" in modifications:
|
||||
step.inputs.update(modifications["inputs"])
|
||||
|
||||
def set_approval_callback(self, callback: ApprovalCallback) -> None:
|
||||
"""Set the approval callback for HITL steps."""
|
||||
self.approval_callback = callback
|
||||
|
||||
|
||||
# Convenience function for simple execution
|
||||
async def execute_plan(
|
||||
plan: Plan,
|
||||
goal: Goal,
|
||||
runtime: Runtime,
|
||||
llm: LLMProvider | None = None,
|
||||
tools: dict[str, Tool] | None = None,
|
||||
tool_executor: Callable | None = None,
|
||||
context: dict[str, Any] | None = None,
|
||||
) -> PlanExecutionResult:
|
||||
"""
|
||||
Execute a plan with default configuration.
|
||||
|
||||
Convenience function for simple use cases.
|
||||
"""
|
||||
executor = FlexibleGraphExecutor(
|
||||
runtime=runtime,
|
||||
llm=llm,
|
||||
tools=tools,
|
||||
tool_executor=tool_executor,
|
||||
)
|
||||
return await executor.execute_plan(plan, goal, context)
|
||||
@@ -1,406 +0,0 @@
|
||||
"""
|
||||
Hybrid Judge for Evaluating Plan Step Results.
|
||||
|
||||
The HybridJudge evaluates step execution results using:
|
||||
1. Rule-based evaluation (fast, deterministic)
|
||||
2. LLM-based evaluation (fallback for ambiguous cases)
|
||||
|
||||
Escalation path: rules → LLM → human
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from framework.graph.code_sandbox import safe_eval
|
||||
from framework.graph.goal import Goal
|
||||
from framework.graph.plan import (
|
||||
EvaluationRule,
|
||||
Judgment,
|
||||
JudgmentAction,
|
||||
PlanStep,
|
||||
)
|
||||
from framework.llm.provider import LLMProvider
|
||||
|
||||
|
||||
@dataclass
|
||||
class RuleEvaluationResult:
|
||||
"""Result of rule-based evaluation."""
|
||||
|
||||
is_definitive: bool # True if a rule matched definitively
|
||||
judgment: Judgment | None = None
|
||||
context: dict[str, Any] = field(default_factory=dict)
|
||||
rules_checked: int = 0
|
||||
rule_matched: str | None = None
|
||||
|
||||
|
||||
class HybridJudge:
|
||||
"""
|
||||
Evaluates plan step results using rules first, then LLM fallback.
|
||||
|
||||
Usage:
|
||||
judge = HybridJudge(llm=llm_provider)
|
||||
judge.add_rule(EvaluationRule(
|
||||
id="success_check",
|
||||
condition="result.get('success') == True",
|
||||
action=JudgmentAction.ACCEPT,
|
||||
))
|
||||
|
||||
judgment = await judge.evaluate(step, result, goal)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
llm: LLMProvider | None = None,
|
||||
rules: list[EvaluationRule] | None = None,
|
||||
llm_confidence_threshold: float = 0.7,
|
||||
):
|
||||
"""
|
||||
Initialize the HybridJudge.
|
||||
|
||||
Args:
|
||||
llm: LLM provider for ambiguous cases
|
||||
rules: Initial evaluation rules
|
||||
llm_confidence_threshold: Confidence below this triggers escalation
|
||||
"""
|
||||
self.llm = llm
|
||||
self.rules: list[EvaluationRule] = rules or []
|
||||
self.llm_confidence_threshold = llm_confidence_threshold
|
||||
|
||||
# Sort rules by priority (higher first)
|
||||
self._sort_rules()
|
||||
|
||||
def _sort_rules(self):
|
||||
"""Sort rules by priority."""
|
||||
self.rules.sort(key=lambda r: -r.priority)
|
||||
|
||||
def add_rule(self, rule: EvaluationRule) -> None:
|
||||
"""Add an evaluation rule."""
|
||||
self.rules.append(rule)
|
||||
self._sort_rules()
|
||||
|
||||
def remove_rule(self, rule_id: str) -> bool:
|
||||
"""Remove a rule by ID. Returns True if found and removed."""
|
||||
for i, rule in enumerate(self.rules):
|
||||
if rule.id == rule_id:
|
||||
self.rules.pop(i)
|
||||
return True
|
||||
return False
|
||||
|
||||
async def evaluate(
|
||||
self,
|
||||
step: PlanStep,
|
||||
result: Any,
|
||||
goal: Goal,
|
||||
context: dict[str, Any] | None = None,
|
||||
) -> Judgment:
|
||||
"""
|
||||
Evaluate a step result.
|
||||
|
||||
Args:
|
||||
step: The executed plan step
|
||||
result: The result of executing the step
|
||||
goal: The goal context for evaluation
|
||||
context: Additional context from previous steps
|
||||
|
||||
Returns:
|
||||
Judgment with action and feedback
|
||||
"""
|
||||
context = context or {}
|
||||
|
||||
# Try rule-based evaluation first
|
||||
rule_result = self._evaluate_rules(step, result, goal, context)
|
||||
|
||||
if rule_result.is_definitive:
|
||||
return rule_result.judgment
|
||||
|
||||
# Fall back to LLM evaluation
|
||||
if self.llm:
|
||||
return await self._evaluate_llm(step, result, goal, context, rule_result)
|
||||
|
||||
# No LLM available - default to accept with low confidence
|
||||
return Judgment(
|
||||
action=JudgmentAction.ACCEPT,
|
||||
reasoning="No definitive rule matched and no LLM available for evaluation",
|
||||
confidence=0.5,
|
||||
llm_used=False,
|
||||
)
|
||||
|
||||
def _evaluate_rules(
|
||||
self,
|
||||
step: PlanStep,
|
||||
result: Any,
|
||||
goal: Goal,
|
||||
context: dict[str, Any],
|
||||
) -> RuleEvaluationResult:
|
||||
"""Evaluate step using rules."""
|
||||
rules_checked = 0
|
||||
|
||||
# Build evaluation context
|
||||
eval_context = {
|
||||
"step": step.model_dump() if hasattr(step, "model_dump") else step,
|
||||
"result": result,
|
||||
"goal": goal.model_dump() if hasattr(goal, "model_dump") else goal,
|
||||
"context": context,
|
||||
"success": isinstance(result, dict) and result.get("success", False),
|
||||
"error": isinstance(result, dict) and result.get("error"),
|
||||
}
|
||||
|
||||
for rule in self.rules:
|
||||
rules_checked += 1
|
||||
|
||||
# Evaluate rule condition
|
||||
eval_result = safe_eval(rule.condition, eval_context)
|
||||
|
||||
if eval_result.success and eval_result.result:
|
||||
# Rule matched!
|
||||
feedback = self._format_feedback(rule.feedback_template, eval_context)
|
||||
|
||||
return RuleEvaluationResult(
|
||||
is_definitive=True,
|
||||
judgment=Judgment(
|
||||
action=rule.action,
|
||||
reasoning=rule.description,
|
||||
feedback=feedback if feedback else None,
|
||||
rule_matched=rule.id,
|
||||
confidence=1.0,
|
||||
llm_used=False,
|
||||
),
|
||||
rules_checked=rules_checked,
|
||||
rule_matched=rule.id,
|
||||
)
|
||||
|
||||
# No rule matched definitively
|
||||
return RuleEvaluationResult(
|
||||
is_definitive=False,
|
||||
context=eval_context,
|
||||
rules_checked=rules_checked,
|
||||
)
|
||||
|
||||
def _format_feedback(
|
||||
self,
|
||||
template: str,
|
||||
context: dict[str, Any],
|
||||
) -> str:
|
||||
"""Format feedback template with context values."""
|
||||
if not template:
|
||||
return ""
|
||||
|
||||
try:
|
||||
return template.format(**context)
|
||||
except (KeyError, ValueError):
|
||||
return template
|
||||
|
||||
async def _evaluate_llm(
|
||||
self,
|
||||
step: PlanStep,
|
||||
result: Any,
|
||||
goal: Goal,
|
||||
context: dict[str, Any],
|
||||
rule_result: RuleEvaluationResult,
|
||||
) -> Judgment:
|
||||
"""Evaluate step using LLM."""
|
||||
system_prompt = self._build_llm_system_prompt(goal)
|
||||
user_prompt = self._build_llm_user_prompt(step, result, context, rule_result)
|
||||
|
||||
try:
|
||||
response = await self.llm.acomplete(
|
||||
messages=[{"role": "user", "content": user_prompt}],
|
||||
system=system_prompt,
|
||||
)
|
||||
|
||||
# Parse LLM response
|
||||
judgment = self._parse_llm_response(response.content)
|
||||
judgment.llm_used = True
|
||||
|
||||
# Check confidence threshold
|
||||
if judgment.confidence < self.llm_confidence_threshold:
|
||||
# Low confidence - escalate
|
||||
return Judgment(
|
||||
action=JudgmentAction.ESCALATE,
|
||||
reasoning=(
|
||||
f"LLM confidence ({judgment.confidence:.2f}) "
|
||||
f"below threshold ({self.llm_confidence_threshold})"
|
||||
),
|
||||
feedback=judgment.feedback,
|
||||
confidence=judgment.confidence,
|
||||
llm_used=True,
|
||||
context={"original_judgment": judgment.model_dump()},
|
||||
)
|
||||
|
||||
return judgment
|
||||
|
||||
except Exception as e:
|
||||
# LLM failed - escalate
|
||||
return Judgment(
|
||||
action=JudgmentAction.ESCALATE,
|
||||
reasoning=f"LLM evaluation failed: {e}",
|
||||
feedback="Human review needed due to LLM error",
|
||||
llm_used=True,
|
||||
)
|
||||
|
||||
def _build_llm_system_prompt(self, goal: Goal) -> str:
|
||||
"""Build system prompt for LLM judge."""
|
||||
return f"""You are a judge evaluating the execution of a plan step.
|
||||
|
||||
GOAL: {goal.description}
|
||||
|
||||
SUCCESS CRITERIA:
|
||||
{chr(10).join(f"- {sc.description}" for sc in goal.success_criteria)}
|
||||
|
||||
CONSTRAINTS:
|
||||
{chr(10).join(f"- {c.description}" for c in goal.constraints)}
|
||||
|
||||
Your task is to evaluate whether the step was executed successfully and decide the next action.
|
||||
|
||||
Respond in this exact format:
|
||||
ACTION: [ACCEPT|RETRY|REPLAN|ESCALATE]
|
||||
CONFIDENCE: [0.0-1.0]
|
||||
REASONING: [Your reasoning]
|
||||
FEEDBACK: [Feedback for retry/replan, or empty if accepting]
|
||||
|
||||
Actions:
|
||||
- ACCEPT: Step completed successfully, continue to next step
|
||||
- RETRY: Step failed but can be retried with feedback
|
||||
- REPLAN: Step failed in a way that requires replanning
|
||||
- ESCALATE: Requires human intervention
|
||||
"""
|
||||
|
||||
def _build_llm_user_prompt(
|
||||
self,
|
||||
step: PlanStep,
|
||||
result: Any,
|
||||
context: dict[str, Any],
|
||||
rule_result: RuleEvaluationResult,
|
||||
) -> str:
|
||||
"""Build user prompt for LLM judge."""
|
||||
return f"""Evaluate this step execution:
|
||||
|
||||
STEP: {step.description}
|
||||
STEP ID: {step.id}
|
||||
ACTION TYPE: {step.action.action_type}
|
||||
EXPECTED OUTPUTS: {step.expected_outputs}
|
||||
|
||||
RESULT:
|
||||
{result}
|
||||
|
||||
CONTEXT FROM PREVIOUS STEPS:
|
||||
{context}
|
||||
|
||||
RULES CHECKED: {rule_result.rules_checked} (none matched definitively)
|
||||
|
||||
Please evaluate and provide your judgment."""
|
||||
|
||||
def _parse_llm_response(self, response: str) -> Judgment:
|
||||
"""Parse LLM response into Judgment."""
|
||||
lines = response.strip().split("\n")
|
||||
|
||||
action = JudgmentAction.ACCEPT
|
||||
confidence = 0.8
|
||||
reasoning = ""
|
||||
feedback = ""
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if line.startswith("ACTION:"):
|
||||
action_str = line.split(":", 1)[1].strip().upper()
|
||||
try:
|
||||
action = JudgmentAction(action_str.lower())
|
||||
except ValueError:
|
||||
action = JudgmentAction.ESCALATE
|
||||
|
||||
elif line.startswith("CONFIDENCE:"):
|
||||
try:
|
||||
confidence = float(line.split(":", 1)[1].strip())
|
||||
except ValueError:
|
||||
confidence = 0.5
|
||||
|
||||
elif line.startswith("REASONING:"):
|
||||
reasoning = line.split(":", 1)[1].strip()
|
||||
|
||||
elif line.startswith("FEEDBACK:"):
|
||||
feedback = line.split(":", 1)[1].strip()
|
||||
|
||||
return Judgment(
|
||||
action=action,
|
||||
reasoning=reasoning or "LLM evaluation",
|
||||
feedback=feedback if feedback else None,
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
|
||||
# Factory function for creating judge with common rules
|
||||
def create_default_judge(llm: LLMProvider | None = None) -> HybridJudge:
    """
    Create a HybridJudge preloaded with commonly useful evaluation rules.

    Args:
        llm: Optional LLM provider used for fallback evaluation.

    Returns:
        Configured HybridJudge instance.
    """
    default_rules = [
        # Accept when the step explicitly reports success.
        EvaluationRule(
            id="explicit_success",
            description="Step explicitly marked as successful",
            condition="isinstance(result, dict) and result.get('success') == True",
            action=JudgmentAction.ACCEPT,
            priority=100,
        ),
        # Retry transient infrastructure failures.
        EvaluationRule(
            id="transient_error_retry",
            description="Transient error that can be retried",
            condition=(
                "isinstance(result, dict) and "
                "result.get('error_type') in ['timeout', 'rate_limit', 'connection_error']"
            ),
            action=JudgmentAction.RETRY,
            feedback_template="Transient error: {result[error]}. Please retry.",
            priority=90,
        ),
        # Replan when required data is unavailable.
        EvaluationRule(
            id="missing_data_replan",
            description="Required data not available",
            condition="isinstance(result, dict) and result.get('error_type') == 'missing_data'",
            action=JudgmentAction.REPLAN,
            feedback_template="Missing required data: {result[error]}. Plan needs adjustment.",
            priority=80,
        ),
        # Escalate anything security-related straight to a human.
        EvaluationRule(
            id="security_escalate",
            description="Security issue detected",
            condition="isinstance(result, dict) and result.get('error_type') == 'security'",
            action=JudgmentAction.ESCALATE,
            feedback_template="Security issue detected: {result[error]}",
            priority=200,
        ),
        # Replan once the per-step retry budget is exhausted.
        EvaluationRule(
            id="max_retries_fail",
            description="Maximum retries exceeded",
            condition="step.get('attempts', 0) >= step.get('max_retries', 3)",
            action=JudgmentAction.REPLAN,
            feedback_template="Step '{step[id]}' failed after {step[attempts]} attempts",
            priority=150,
        ),
    ]

    judge = HybridJudge(llm=llm)
    for rule in default_rules:
        judge.add_rule(rule)
    return judge
|
||||
+3
-1342
File diff suppressed because it is too large
Load Diff
@@ -1,513 +0,0 @@
|
||||
"""
|
||||
Plan Data Structures for Flexible Execution.
|
||||
|
||||
Plans are created externally (by Claude Code or another LLM agent) and
|
||||
executed internally by the FlexibleGraphExecutor with Worker-Judge loop.
|
||||
|
||||
The Plan is the contract between the external planner and the executor:
|
||||
- Planner creates a Plan with PlanSteps
|
||||
- Executor runs steps and judges results
|
||||
- If replanning needed, returns feedback to external planner
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import StrEnum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ActionType(StrEnum):
|
||||
"""Types of actions a PlanStep can perform."""
|
||||
|
||||
LLM_CALL = "llm_call" # Call LLM for generation
|
||||
TOOL_USE = "tool_use" # Use a registered tool
|
||||
SUB_GRAPH = "sub_graph" # Execute a sub-graph
|
||||
FUNCTION = "function" # Call a Python function
|
||||
CODE_EXECUTION = "code_execution" # Execute dynamic code (sandboxed)
|
||||
|
||||
|
||||
class StepStatus(StrEnum):
|
||||
"""Status of a plan step."""
|
||||
|
||||
PENDING = "pending"
|
||||
AWAITING_APPROVAL = "awaiting_approval" # Waiting for human approval
|
||||
IN_PROGRESS = "in_progress"
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
SKIPPED = "skipped"
|
||||
REJECTED = "rejected" # Human rejected execution
|
||||
|
||||
def is_terminal(self) -> bool:
|
||||
"""Check if this status represents a terminal (finished) state.
|
||||
|
||||
Terminal states are states where the step will not execute further,
|
||||
either because it completed successfully or failed/was skipped.
|
||||
"""
|
||||
return self in (
|
||||
StepStatus.COMPLETED,
|
||||
StepStatus.FAILED,
|
||||
StepStatus.SKIPPED,
|
||||
StepStatus.REJECTED,
|
||||
)
|
||||
|
||||
def is_successful(self) -> bool:
|
||||
"""Check if this status represents successful completion."""
|
||||
return self == StepStatus.COMPLETED
|
||||
|
||||
|
||||
class ApprovalDecision(StrEnum):
|
||||
"""Human decision on a step requiring approval."""
|
||||
|
||||
APPROVE = "approve" # Execute as planned
|
||||
REJECT = "reject" # Skip this step
|
||||
MODIFY = "modify" # Execute with modifications
|
||||
ABORT = "abort" # Stop entire execution
|
||||
|
||||
|
||||
class ApprovalRequest(BaseModel):
    """Request for human approval before executing a step (HITL).

    Carries enough detail for the approver to decide whether the step
    should run as planned.
    """

    step_id: str  # ID of the PlanStep awaiting approval
    step_description: str  # human-readable summary of the step
    action_type: str  # ActionType value as a plain string
    action_details: dict[str, Any] = Field(default_factory=dict)  # action parameters to review
    context: dict[str, Any] = Field(default_factory=dict)  # relevant execution context
    approval_message: str | None = None  # custom message shown to the approver

    # Preview of what will happen if approved, when one is available
    preview: str | None = None

    model_config = {"extra": "allow"}  # tolerate extra fields on deserialization
|
||||
|
||||
|
||||
class ApprovalResult(BaseModel):
    """Result of a human approval decision for a pending step."""

    decision: ApprovalDecision  # approve / reject / modify / abort
    reason: str | None = None  # optional human-supplied rationale
    modifications: dict[str, Any] = Field(default_factory=dict)  # edits supplied with MODIFY

    model_config = {"extra": "allow"}  # tolerate extra fields on deserialization
|
||||
|
||||
|
||||
class JudgmentAction(StrEnum):
|
||||
"""Actions the judge can take after evaluating a step."""
|
||||
|
||||
ACCEPT = "accept" # Step completed successfully, continue
|
||||
RETRY = "retry" # Retry the step with feedback
|
||||
REPLAN = "replan" # Return to external planner for new plan
|
||||
ESCALATE = "escalate" # Request human intervention
|
||||
|
||||
|
||||
class ActionSpec(BaseModel):
    """
    Specification for an action to be executed.

    This is the "what to do" part of a PlanStep. Only the field group
    matching `action_type` is meaningful; the others stay at defaults.
    """

    action_type: ActionType  # selects which field group below applies

    # For LLM_CALL
    prompt: str | None = None  # user prompt (may contain {placeholders})
    system_prompt: str | None = None
    model: str | None = None  # optional model override

    # For TOOL_USE
    tool_name: str | None = None
    tool_args: dict[str, Any] = Field(default_factory=dict)

    # For SUB_GRAPH
    graph_id: str | None = None

    # For FUNCTION
    function_name: str | None = None
    function_args: dict[str, Any] = Field(default_factory=dict)

    # For CODE_EXECUTION
    code: str | None = None
    language: str = "python"  # language of `code`

    model_config = {"extra": "allow"}  # tolerate extra fields on deserialization
|
||||
|
||||
|
||||
class PlanStep(BaseModel):
    """
    A single step in a plan.

    Created by external planner, executed by Worker, evaluated by Judge.
    """

    id: str  # unique within the plan; referenced by other steps' `dependencies`
    description: str
    action: ActionSpec  # what to do (see ActionSpec / ActionType)

    # Data flow
    inputs: dict[str, Any] = Field(
        default_factory=dict,
        description="Input data for this step (can reference previous step outputs)",
    )
    expected_outputs: list[str] = Field(
        default_factory=list, description="Keys this step should produce"
    )

    # Dependencies
    dependencies: list[str] = Field(
        default_factory=list, description="IDs of steps that must complete before this one"
    )

    # Human-in-the-loop (HITL)
    requires_approval: bool = Field(
        default=False, description="If True, requires human approval before execution"
    )
    approval_message: str | None = Field(
        default=None, description="Message to show human when requesting approval"
    )

    # Execution state (updated as the step runs)
    status: StepStatus = StepStatus.PENDING
    result: Any | None = None  # output from a successful run
    error: str | None = None  # error message from the last failed attempt
    attempts: int = 0  # executions so far
    max_retries: int = 3  # retry budget (checked by judge rules)

    # Metadata
    started_at: datetime | None = None
    completed_at: datetime | None = None

    model_config = {"extra": "allow"}  # tolerate extra fields on deserialization

    def is_ready(self, terminal_step_ids: set[str]) -> bool:
        """Check if this step is ready to execute (all dependencies finished).

        A step is ready when:
        1. Its status is PENDING (not yet started)
        2. All its dependencies are in a terminal state (completed, failed, skipped, or rejected)

        Note: This allows dependent steps to become "ready" even if their dependencies
        failed. The executor should check if any dependencies failed and handle
        accordingly (e.g., skip the step or mark it as blocked).

        Args:
            terminal_step_ids: Set of step IDs that are in a terminal state
        """
        if self.status != StepStatus.PENDING:
            return False
        return all(dep in terminal_step_ids for dep in self.dependencies)
|
||||
|
||||
|
||||
class Judgment(BaseModel):
    """
    Result of judging a step execution.

    The Judge evaluates step results and decides what to do next.
    """

    action: JudgmentAction  # accept / retry / replan / escalate
    reasoning: str  # why this action was chosen
    feedback: str | None = None  # For retry/replan - what went wrong

    # For rule-based judgments
    rule_matched: str | None = None  # id of the EvaluationRule that fired

    # For LLM-based judgments
    confidence: float = 1.0  # rule matches use 1.0; LLM-reported otherwise
    llm_used: bool = False

    # Context for replanning
    context: dict[str, Any] = Field(default_factory=dict)

    model_config = {"extra": "allow"}  # tolerate extra fields on deserialization
|
||||
|
||||
|
||||
class EvaluationRule(BaseModel):
    """
    A rule for the HybridJudge to evaluate step results.

    Rules are checked before falling back to LLM evaluation.
    """

    id: str  # unique rule identifier; reported as Judgment.rule_matched
    description: str  # becomes the judgment's reasoning when the rule fires

    # Condition (Python expression evaluated with result, step, goal context)
    condition: str

    # What to do if condition matches
    action: JudgmentAction
    feedback_template: str = ""  # str.format template; can use {result}, {step}, etc.

    # Priority (higher = checked first)
    priority: int = 0

    model_config = {"extra": "allow"}  # tolerate extra fields on deserialization
|
||||
|
||||
|
||||
class Plan(BaseModel):
    """
    A complete execution plan.

    Created by external planner (Claude Code, etc).
    Executed by FlexibleGraphExecutor.
    """

    id: str
    goal_id: str  # links this plan back to the Goal it serves
    description: str

    # Steps to execute
    steps: list[PlanStep] = Field(default_factory=list)

    # Execution state
    revision: int = 1  # Incremented on replan
    current_step_idx: int = 0

    # Accumulated context from execution
    context: dict[str, Any] = Field(default_factory=dict)

    # Metadata
    created_at: datetime = Field(default_factory=datetime.now)
    created_by: str = "external"  # Who created this plan

    # Previous attempt info (for replanning)
    previous_feedback: str | None = None

    model_config = {"extra": "allow"}  # tolerate extra fields on deserialization

    @classmethod
    def from_json(cls, data: str | dict) -> "Plan":
        """
        Load a Plan from exported JSON.

        This handles the output from export_graph() and properly converts
        action_type strings to ActionType enums.

        Args:
            data: JSON string or dict from export_graph()

        Returns:
            Plan object ready for FlexibleGraphExecutor

        Raises:
            ValueError: if a step carries an unrecognized action_type
                (via the ActionType constructor).

        Example:
            # Load from export_graph() output
            exported = export_graph()
            plan = Plan.from_json(exported)

            # Load from file
            with open("plan.json") as f:
                plan = Plan.from_json(json.load(f))
        """
        import json as json_module

        if isinstance(data, str):
            data = json_module.loads(data)

        # Handle nested "plan" key from export_graph output
        if "plan" in data:
            data = data["plan"]

        # Convert steps
        steps = []
        for step_data in data.get("steps", []):
            action_data = step_data.get("action", {})

            # Convert action_type string to enum
            # NOTE(review): a missing action_type silently defaults to
            # "function" — confirm this matches export_graph's contract.
            action_type_str = action_data.get("action_type", "function")
            action_type = ActionType(action_type_str)

            action = ActionSpec(
                action_type=action_type,
                prompt=action_data.get("prompt"),
                system_prompt=action_data.get("system_prompt"),
                tool_name=action_data.get("tool_name"),
                tool_args=action_data.get("tool_args", {}),
                function_name=action_data.get("function_name"),
                function_args=action_data.get("function_args", {}),
                code=action_data.get("code"),
            )

            step = PlanStep(
                id=step_data["id"],  # id is the only required step field
                description=step_data.get("description", ""),
                action=action,
                inputs=step_data.get("inputs", {}),
                expected_outputs=step_data.get("expected_outputs", []),
                dependencies=step_data.get("dependencies", []),
                requires_approval=step_data.get("requires_approval", False),
                approval_message=step_data.get("approval_message"),
            )
            steps.append(step)

        return cls(
            id=data.get("id", "plan"),
            goal_id=data.get("goal_id", ""),
            description=data.get("description", ""),
            steps=steps,
            context=data.get("context", {}),
            revision=data.get("revision", 1),
        )

    def get_step(self, step_id: str) -> PlanStep | None:
        """Get a step by ID, or None when no step has that ID."""
        for step in self.steps:
            if step.id == step_id:
                return step
        return None

    def get_ready_steps(self) -> list[PlanStep]:
        """Get all steps that are ready to execute.

        A step is ready when all its dependencies are in terminal states
        (completed, failed, skipped, or rejected).
        """
        terminal_ids = {s.id for s in self.steps if s.status.is_terminal()}
        return [s for s in self.steps if s.is_ready(terminal_ids)]

    def get_completed_steps(self) -> list[PlanStep]:
        """Get all completed steps."""
        return [s for s in self.steps if s.status == StepStatus.COMPLETED]

    def is_complete(self) -> bool:
        """Check if all steps are in terminal states (finished executing).

        Returns True when all steps have reached a terminal state, regardless
        of whether they succeeded or failed. Use has_failed_steps() to check
        if any steps failed.
        """
        return all(s.status.is_terminal() for s in self.steps)

    def is_successful(self) -> bool:
        """Check if all steps completed successfully."""
        return all(s.status == StepStatus.COMPLETED for s in self.steps)

    def has_failed_steps(self) -> bool:
        """Check if any steps failed, were skipped, or were rejected."""
        return any(
            s.status in (StepStatus.FAILED, StepStatus.SKIPPED, StepStatus.REJECTED)
            for s in self.steps
        )

    def get_failed_steps(self) -> list[PlanStep]:
        """Get all steps that failed, were skipped, or were rejected."""
        return [
            s
            for s in self.steps
            if s.status in (StepStatus.FAILED, StepStatus.SKIPPED, StepStatus.REJECTED)
        ]

    def to_feedback_context(self) -> dict[str, Any]:
        """Create context for replanning.

        Summarizes completed and failed steps (note: only FAILED status is
        listed under "failed_steps" here, not skipped/rejected) plus the
        accumulated execution context, for the external planner.
        """
        return {
            "plan_id": self.id,
            "revision": self.revision,
            "completed_steps": [
                {
                    "id": s.id,
                    "description": s.description,
                    "result": s.result,
                }
                for s in self.get_completed_steps()
            ],
            "failed_steps": [
                {
                    "id": s.id,
                    "description": s.description,
                    "error": s.error,
                    "attempts": s.attempts,
                }
                for s in self.steps
                if s.status == StepStatus.FAILED
            ],
            "context": self.context,
        }
|
||||
|
||||
|
||||
class ExecutionStatus(StrEnum):
|
||||
"""Status of plan execution."""
|
||||
|
||||
COMPLETED = "completed"
|
||||
AWAITING_APPROVAL = "awaiting_approval" # Paused for human approval
|
||||
NEEDS_REPLAN = "needs_replan"
|
||||
NEEDS_ESCALATION = "needs_escalation"
|
||||
REJECTED = "rejected" # Human rejected a step
|
||||
ABORTED = "aborted" # Human aborted execution
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
class PlanExecutionResult(BaseModel):
    """
    Result of executing a plan.

    Returned to external planner with status and feedback.
    """

    status: ExecutionStatus  # how the execution ended / why it paused

    # Results from completed steps
    results: dict[str, Any] = Field(default_factory=dict)

    # For needs_replan - what to tell the planner
    feedback: str | None = None
    feedback_context: dict[str, Any] = Field(default_factory=dict)

    # Steps that completed before stopping
    completed_steps: list[str] = Field(default_factory=list)

    # Metrics
    steps_executed: int = 0
    total_tokens: int = 0
    total_latency_ms: int = 0

    # Error info (for failed status)
    error: str | None = None

    model_config = {"extra": "allow"}  # tolerate extra fields on deserialization
|
||||
|
||||
|
||||
def load_export(data: str | dict) -> tuple["Plan", Any]:
|
||||
"""
|
||||
Load both Plan and Goal from export_graph() output.
|
||||
|
||||
The export_graph() MCP tool returns both the plan and the goal that was
|
||||
defined and approved during the agent building process. This function
|
||||
loads both so you can use them with FlexibleGraphExecutor.
|
||||
|
||||
Args:
|
||||
data: JSON string or dict from export_graph()
|
||||
|
||||
Returns:
|
||||
Tuple of (Plan, Goal) ready for FlexibleGraphExecutor
|
||||
|
||||
Example:
|
||||
# Load from export_graph() output
|
||||
exported = export_graph()
|
||||
plan, goal = load_export(exported)
|
||||
|
||||
result = await executor.execute_plan(plan, goal, context)
|
||||
"""
|
||||
import json as json_module
|
||||
|
||||
from framework.graph.goal import Goal
|
||||
|
||||
if isinstance(data, str):
|
||||
data = json_module.loads(data)
|
||||
|
||||
# Load plan
|
||||
plan = Plan.from_json(data)
|
||||
|
||||
# Load goal
|
||||
goal_data = data.get("goal", {})
|
||||
if goal_data:
|
||||
goal = Goal.model_validate(goal_data)
|
||||
else:
|
||||
# Fallback: create minimal goal from plan metadata
|
||||
goal = Goal(
|
||||
id=plan.goal_id,
|
||||
name=plan.goal_id,
|
||||
description=plan.description,
|
||||
success_criteria=[],
|
||||
constraints=[],
|
||||
)
|
||||
|
||||
return plan, goal
|
||||
@@ -1,620 +0,0 @@
|
||||
"""
|
||||
Worker Node for Executing Plan Steps.
|
||||
|
||||
The Worker executes individual plan steps by dispatching to the
|
||||
appropriate executor based on action type:
|
||||
- LLM calls
|
||||
- Tool usage
|
||||
- Sub-graph execution
|
||||
- Function calls
|
||||
- Code execution (sandboxed)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from framework.graph.code_sandbox import CodeSandbox
|
||||
from framework.graph.plan import (
|
||||
ActionSpec,
|
||||
ActionType,
|
||||
PlanStep,
|
||||
)
|
||||
from framework.llm.provider import LLMProvider, Tool
|
||||
from framework.runtime.core import Runtime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_llm_json_response(text: str) -> tuple[Any | None, str]:
|
||||
"""
|
||||
Parse JSON from LLM response, handling markdown code blocks.
|
||||
|
||||
LLMs often return JSON wrapped in markdown code blocks like:
|
||||
```json
|
||||
{"key": "value"}
|
||||
```
|
||||
|
||||
This function extracts and parses the JSON.
|
||||
|
||||
Args:
|
||||
text: Raw LLM response text
|
||||
|
||||
Returns:
|
||||
Tuple of (parsed_json_or_None, cleaned_text)
|
||||
"""
|
||||
if not isinstance(text, str):
|
||||
return None, str(text)
|
||||
|
||||
cleaned = text.strip()
|
||||
|
||||
# Try to extract JSON from markdown code blocks
|
||||
# Pattern: ```json ... ``` or ``` ... ```
|
||||
code_block_pattern = r"```(?:json)?\s*([\s\S]*?)\s*```"
|
||||
matches = re.findall(code_block_pattern, cleaned)
|
||||
|
||||
if matches:
|
||||
# Try to parse each match
|
||||
for match in matches:
|
||||
try:
|
||||
parsed = json.loads(match.strip())
|
||||
return parsed, match.strip()
|
||||
except json.JSONDecodeError as e:
|
||||
logger.debug(
|
||||
f"Failed to parse JSON from code block: {e}. "
|
||||
f"Content preview: {match.strip()[:100]}..."
|
||||
)
|
||||
continue
|
||||
|
||||
# No code blocks or parsing failed - try parsing the whole response
|
||||
try:
|
||||
parsed = json.loads(cleaned)
|
||||
return parsed, cleaned
|
||||
except json.JSONDecodeError as e:
|
||||
logger.debug(
|
||||
f"Failed to parse entire response as JSON: {e}. Content preview: {cleaned[:100]}..."
|
||||
)
|
||||
|
||||
# Try to find JSON-like content (starts with { or [)
|
||||
json_start_pattern = r"(\{[\s\S]*\}|\[[\s\S]*\])"
|
||||
json_matches = re.findall(json_start_pattern, cleaned)
|
||||
|
||||
for match in json_matches:
|
||||
try:
|
||||
parsed = json.loads(match)
|
||||
return parsed, match
|
||||
except json.JSONDecodeError as e:
|
||||
logger.debug(f"Failed to parse JSON pattern: {e}. Content preview: {match[:100]}...")
|
||||
continue
|
||||
|
||||
# Could not parse as JSON - log warning
|
||||
logger.warning(
|
||||
f"Could not parse LLM response as JSON after trying all strategies. "
|
||||
f"Response preview: {cleaned[:200]}..."
|
||||
)
|
||||
return None, cleaned
|
||||
|
||||
|
||||
@dataclass
class StepExecutionResult:
    """Result of executing a plan step."""

    success: bool  # whether the step ran without error
    outputs: dict[str, Any] = field(default_factory=dict)  # data produced by the step
    error: str | None = None  # error message when success is False
    error_type: str | None = None  # For judge rules: timeout, rate_limit, etc.

    # Metadata
    tokens_used: int = 0  # LLM tokens consumed, if any
    latency_ms: int = 0  # wall-clock execution time in milliseconds
    executor_type: str = ""  # which executor produced this result (e.g. "llm_call")
|
||||
|
||||
|
||||
class WorkerNode:
|
||||
"""
|
||||
Executes plan steps by dispatching to appropriate executors.
|
||||
|
||||
Usage:
|
||||
worker = WorkerNode(
|
||||
runtime=runtime,
|
||||
llm=llm_provider,
|
||||
tools=tool_registry,
|
||||
)
|
||||
|
||||
result = await worker.execute(step, context)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
runtime: Runtime,
|
||||
llm: LLMProvider | None = None,
|
||||
tools: dict[str, Tool] | None = None,
|
||||
tool_executor: Callable | None = None,
|
||||
functions: dict[str, Callable] | None = None,
|
||||
sub_graph_executor: Callable | None = None,
|
||||
sandbox: CodeSandbox | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize the Worker.
|
||||
|
||||
Args:
|
||||
runtime: Runtime for decision logging
|
||||
llm: LLM provider for LLM_CALL actions
|
||||
tools: Available tools for TOOL_USE actions
|
||||
tool_executor: Function to execute tools
|
||||
functions: Registered functions for FUNCTION actions
|
||||
sub_graph_executor: Function to execute sub-graphs
|
||||
sandbox: Code sandbox for CODE_EXECUTION actions
|
||||
"""
|
||||
self.runtime = runtime
|
||||
self.llm = llm
|
||||
self.tools = tools or {}
|
||||
self.tool_executor = tool_executor
|
||||
self.functions = functions or {}
|
||||
self.sub_graph_executor = sub_graph_executor
|
||||
self.sandbox = sandbox or CodeSandbox()
|
||||
|
||||
async def execute(
|
||||
self,
|
||||
step: PlanStep,
|
||||
context: dict[str, Any],
|
||||
) -> StepExecutionResult:
|
||||
"""
|
||||
Execute a plan step.
|
||||
|
||||
Args:
|
||||
step: The step to execute
|
||||
context: Current execution context
|
||||
|
||||
Returns:
|
||||
StepExecutionResult with outputs and status
|
||||
"""
|
||||
# Record decision
|
||||
decision_id = self.runtime.decide(
|
||||
intent=f"Execute plan step: {step.description}",
|
||||
options=[
|
||||
{
|
||||
"id": step.action.action_type.value,
|
||||
"description": f"Execute {step.action.action_type.value} action",
|
||||
"action_type": step.action.action_type.value,
|
||||
}
|
||||
],
|
||||
chosen=step.action.action_type.value,
|
||||
reasoning=f"Step requires {step.action.action_type.value}",
|
||||
context={"step_id": step.id, "inputs": step.inputs},
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Resolve inputs from context
|
||||
resolved_inputs = self._resolve_inputs(step.inputs, context)
|
||||
|
||||
# Dispatch to appropriate executor
|
||||
result = await self._dispatch(step.action, resolved_inputs, context)
|
||||
|
||||
latency_ms = int((time.time() - start_time) * 1000)
|
||||
result.latency_ms = latency_ms
|
||||
|
||||
# Record outcome
|
||||
self.runtime.record_outcome(
|
||||
decision_id=decision_id,
|
||||
success=result.success,
|
||||
result=result.outputs if result.success else result.error,
|
||||
tokens_used=result.tokens_used,
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
latency_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
self.runtime.record_outcome(
|
||||
decision_id=decision_id,
|
||||
success=False,
|
||||
error=str(e),
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
|
||||
return StepExecutionResult(
|
||||
success=False,
|
||||
error=str(e),
|
||||
error_type="exception",
|
||||
latency_ms=latency_ms,
|
||||
)
|
||||
|
||||
def _resolve_inputs(
|
||||
self,
|
||||
inputs: dict[str, Any],
|
||||
context: dict[str, Any],
|
||||
) -> dict[str, Any]:
|
||||
"""Resolve input references from context."""
|
||||
resolved = {}
|
||||
|
||||
for key, value in inputs.items():
|
||||
if isinstance(value, str) and value.startswith("$"):
|
||||
# Reference to context variable
|
||||
ref_key = value[1:] # Remove $
|
||||
resolved[key] = context.get(ref_key, value)
|
||||
else:
|
||||
resolved[key] = value
|
||||
|
||||
return resolved
|
||||
|
||||
async def _dispatch(
|
||||
self,
|
||||
action: ActionSpec,
|
||||
inputs: dict[str, Any],
|
||||
context: dict[str, Any],
|
||||
) -> StepExecutionResult:
|
||||
"""Dispatch to appropriate executor based on action type."""
|
||||
if action.action_type == ActionType.LLM_CALL:
|
||||
return await self._execute_llm_call(action, inputs, context)
|
||||
|
||||
elif action.action_type == ActionType.TOOL_USE:
|
||||
return await self._execute_tool_use(action, inputs)
|
||||
|
||||
elif action.action_type == ActionType.SUB_GRAPH:
|
||||
return await self._execute_sub_graph(action, inputs, context)
|
||||
|
||||
elif action.action_type == ActionType.FUNCTION:
|
||||
return await self._execute_function(action, inputs)
|
||||
|
||||
elif action.action_type == ActionType.CODE_EXECUTION:
|
||||
return self._execute_code(action, inputs, context)
|
||||
|
||||
else:
|
||||
return StepExecutionResult(
|
||||
success=False,
|
||||
error=f"Unknown action type: {action.action_type}",
|
||||
error_type="invalid_action",
|
||||
)
|
||||
|
||||
async def _execute_llm_call(
    self,
    action: ActionSpec,
    inputs: dict[str, Any],
    context: dict[str, Any],
) -> StepExecutionResult:
    """Execute an LLM call action.

    Formats ``action.prompt`` with *inputs* (best effort), appends a
    "Context Data" section listing all inputs, then calls the configured
    LLM provider. The raw response is always returned under ``response``;
    if the response parses as JSON, the parsed value becomes ``result``.

    NOTE(review): the *context* parameter is accepted but never read here —
    only *inputs* feed the prompt. Confirm whether that is intentional.

    Returns a StepExecutionResult with executor_type "llm_call"; never raises.
    """
    # Fail fast if no provider was wired in at construction time.
    if self.llm is None:
        return StepExecutionResult(
            success=False,
            error="No LLM provider configured",
            error_type="configuration",
            executor_type="llm_call",
        )

    try:
        # Build prompt with context data
        prompt = action.prompt or ""

        # First try format placeholders (for prompts like "Hello {name}")
        if inputs:
            try:
                prompt = prompt.format(**inputs)
            except (KeyError, ValueError):
                pass  # Keep original prompt if formatting fails

        # Always append context data so LLM can personalize
        # This ensures the LLM has access to lead info, company context, etc.
        if inputs:
            context_section = "\n\n--- Context Data ---\n"
            for key, value in inputs.items():
                # Structured values are pretty-printed as JSON; scalars via str().
                if isinstance(value, dict | list):
                    context_section += f"{key}: {json.dumps(value, indent=2)}\n"
                else:
                    context_section += f"{key}: {value}\n"
            prompt = prompt + context_section

        messages = [{"role": "user", "content": prompt}]

        response = await self.llm.acomplete(
            messages=messages,
            system=action.system_prompt,
        )

        # Try to parse JSON from LLM response
        # LLMs often return JSON wrapped in markdown code blocks
        parsed_json, _ = parse_llm_json_response(response.content)

        # If JSON was parsed successfully, use it as the result
        # Otherwise, use the raw text
        result_value = parsed_json if parsed_json is not None else response.content

        return StepExecutionResult(
            success=True,
            outputs={
                "result": result_value,
                "response": response.content,  # Always keep raw response
                "parsed_json": parsed_json,  # Explicit parsed JSON (or None)
            },
            tokens_used=response.input_tokens + response.output_tokens,
            executor_type="llm_call",
        )

    except Exception as e:
        # Heuristic classification: any error mentioning "rate" is treated
        # as a rate-limit so callers can back off differently.
        error_type = "rate_limit" if "rate" in str(e).lower() else "llm_error"
        return StepExecutionResult(
            success=False,
            error=str(e),
            error_type=error_type,
            executor_type="llm_call",
        )
|
||||
|
||||
async def _execute_tool_use(
    self,
    action: ActionSpec,
    inputs: dict[str, Any],
) -> StepExecutionResult:
    """Execute a tool use action.

    Resolution order:
    1. A callable registered via register_function under the tool name
       (lightweight registration, sync or async).
    2. The formal Tool registry (self.tools) executed through
       self.tool_executor.

    Arguments are ``action.tool_args`` overridden by *inputs*, with
    ``$name`` string references resolved afterwards. Never raises; all
    failures come back as StepExecutionResult with executor_type "tool_use".
    """
    tool_name = action.tool_name
    if not tool_name:
        return StepExecutionResult(
            success=False,
            error="No tool name specified",
            error_type="invalid_action",
            executor_type="tool_use",
        )

    # Merge action args with inputs
    args = {**action.tool_args, **inputs}

    # Resolve any $variable references in the merged args
    # (tool_args may contain $refs that should be resolved from inputs)
    # NOTE(review): references are looked up in the merged *args* dict
    # itself (not the run context, unlike _resolve_inputs) — confirm
    # that self-referential resolution is intended.
    resolved_args = {}
    for key, value in args.items():
        if isinstance(value, str) and value.startswith("$"):
            ref_key = value[1:]  # Remove $
            resolved_args[key] = args.get(ref_key, value)
        else:
            resolved_args[key] = value
    args = resolved_args

    # First, check if we have a registered function with this name
    # This allows simpler tool registration without full Tool/ToolExecutor setup
    if tool_name in self.functions:
        try:
            func = self.functions[tool_name]
            result = func(**args)

            # Handle async functions
            if hasattr(result, "__await__"):
                result = await result

            # If result is already a dict with success/outputs, use it directly
            if isinstance(result, dict) and "success" in result:
                return StepExecutionResult(
                    success=result.get("success", False),
                    outputs=result.get("outputs", {}),
                    error=result.get("error"),
                    error_type=result.get("error_type"),
                    executor_type="tool_use",
                )

            # Otherwise wrap the result
            return StepExecutionResult(
                success=True,
                outputs={"result": result},
                executor_type="tool_use",
            )

        except Exception as e:
            return StepExecutionResult(
                success=False,
                error=str(e),
                error_type="tool_exception",
                executor_type="tool_use",
            )

    # Fall back to formal Tool registry
    if tool_name not in self.tools:
        return StepExecutionResult(
            success=False,
            error=f"Tool '{tool_name}' not found",
            error_type="missing_tool",
            executor_type="tool_use",
        )

    if self.tool_executor is None:
        return StepExecutionResult(
            success=False,
            error="No tool executor configured",
            error_type="configuration",
            executor_type="tool_use",
        )

    try:
        # Execute tool via formal executor
        # Local import keeps the provider dependency off the module path
        # until a formal tool is actually invoked.
        from framework.llm.provider import ToolUse

        tool_use = ToolUse(
            id=f"step_{tool_name}",
            name=tool_name,
            input=args,
        )

        # NOTE(review): the executor is called synchronously (no await) —
        # confirm tool_executor is a plain callable, not a coroutine function.
        result = self.tool_executor(tool_use)

        if result.is_error:
            return StepExecutionResult(
                success=False,
                outputs={},
                error=result.content,
                error_type="tool_error",
                executor_type="tool_use",
            )

        # Parse JSON result and unpack fields into outputs
        # Tools return JSON like {"lead_email": "...", "company_name": "..."}
        # We want each field as a separate output key
        outputs = {"result": result.content}
        try:
            parsed = json.loads(result.content)
            if isinstance(parsed, dict):
                # Unpack all fields from the JSON response
                outputs.update(parsed)
        except (json.JSONDecodeError, TypeError):
            pass  # Keep result as-is if not valid JSON

        return StepExecutionResult(
            success=True,
            outputs=outputs,
            executor_type="tool_use",
        )

    except Exception as e:
        return StepExecutionResult(
            success=False,
            error=str(e),
            error_type="tool_exception",
            executor_type="tool_use",
        )
|
||||
|
||||
async def _execute_sub_graph(
    self,
    action: ActionSpec,
    inputs: dict[str, Any],
    context: dict[str, Any],
) -> StepExecutionResult:
    """Run a nested graph through the configured sub-graph executor.

    Requires both a sub_graph_executor and an action.graph_id; translates
    the executor's result into a StepExecutionResult. Never raises.
    """
    # Guard: an executor must have been wired in.
    if self.sub_graph_executor is None:
        return StepExecutionResult(
            success=False,
            error="No sub-graph executor configured",
            error_type="configuration",
            executor_type="sub_graph",
        )

    # Guard: the action must name a graph to run.
    graph_id = action.graph_id
    if not graph_id:
        return StepExecutionResult(
            success=False,
            error="No graph ID specified",
            error_type="invalid_action",
            executor_type="sub_graph",
        )

    try:
        outcome = await self.sub_graph_executor(graph_id, inputs, context)
    except Exception as exc:
        return StepExecutionResult(
            success=False,
            error=str(exc),
            error_type="sub_graph_exception",
            executor_type="sub_graph",
        )

    if outcome.success:
        return StepExecutionResult(
            success=True,
            outputs=outcome.output,
            error=None,
            tokens_used=outcome.total_tokens,
            executor_type="sub_graph",
        )
    return StepExecutionResult(
        success=False,
        outputs={},
        error=outcome.error,
        tokens_used=outcome.total_tokens,
        executor_type="sub_graph",
    )
|
||||
|
||||
async def _execute_function(
    self,
    action: ActionSpec,
    inputs: dict[str, Any],
) -> StepExecutionResult:
    """Invoke a registered Python callable for a FUNCTION action.

    The callable is looked up in self.functions under action.function_name;
    action.function_args are merged with *inputs* (inputs win) and coroutine
    results are awaited, so sync and async callables both work. Never raises.
    """
    func_name = action.function_name

    # Guard: the action must name a function...
    if not func_name:
        return StepExecutionResult(
            success=False,
            error="No function name specified",
            error_type="invalid_action",
            executor_type="function",
        )

    # Guard: ...and that function must have been registered.
    if func_name not in self.functions:
        return StepExecutionResult(
            success=False,
            error=f"Function '{func_name}' not registered",
            error_type="missing_function",
            executor_type="function",
        )

    try:
        target = self.functions[func_name]
        # Action-level args form the base; step inputs take precedence.
        call_args = {**action.function_args, **inputs}
        outcome = target(**call_args)
        # Await coroutine results from async callables.
        if hasattr(outcome, "__await__"):
            outcome = await outcome
        return StepExecutionResult(
            success=True,
            outputs={"result": outcome},
            executor_type="function",
        )
    except Exception as e:
        return StepExecutionResult(
            success=False,
            error=str(e),
            error_type="function_exception",
            executor_type="function",
        )
|
||||
|
||||
def _execute_code(
    self,
    action: ActionSpec,
    inputs: dict[str, Any],
    context: dict[str, Any],
) -> StepExecutionResult:
    """Execute a code action in sandbox.

    Runs ``action.code`` through self.sandbox with *context* merged under
    *inputs* (inputs override on key collision). On success the sandbox's
    result plus all of its captured variables become the step outputs.

    NOTE(review): unlike the other executors this method is synchronous —
    the sandbox call blocks the event loop; confirm that is acceptable.
    """
    code = action.code
    if not code:
        return StepExecutionResult(
            success=False,
            error="No code specified",
            error_type="invalid_action",
            executor_type="code_execution",
        )

    # Merge inputs with context for code
    code_inputs = {**context, **inputs}

    # Execute in sandbox
    sandbox_result = self.sandbox.execute(code, code_inputs)

    if sandbox_result.success:
        return StepExecutionResult(
            success=True,
            outputs={
                "result": sandbox_result.result,
                # Sandbox variables are spread after "result", so a variable
                # literally named "result" would shadow the primary value.
                **sandbox_result.variables,
            },
            executor_type="code_execution",
            latency_ms=sandbox_result.execution_time_ms,
        )
    else:
        # Heuristic: sandbox security violations carry "Security" in the
        # error text and are classified separately from ordinary failures.
        error_type = "security" if "Security" in (sandbox_result.error or "") else "code_error"
        return StepExecutionResult(
            success=False,
            error=sandbox_result.error,
            error_type=error_type,
            executor_type="code_execution",
            latency_ms=sandbox_result.execution_time_ms,
        )
|
||||
|
||||
def register_function(self, name: str, func: Callable) -> None:
    """Register a function for FUNCTION actions.

    Registered callables are also consulted first by TOOL_USE dispatch
    (see _execute_tool_use), so this doubles as lightweight tool
    registration. Re-registering a name replaces the previous callable.
    """
    self.functions[name] = func
|
||||
|
||||
def register_tool(self, tool: Tool) -> None:
    """Register a tool for TOOL_USE actions.

    Keyed by ``tool.name``; a later registration under the same name
    replaces the earlier one.
    """
    self.tools[tool.name] = tool
|
||||
@@ -36,12 +36,7 @@ from framework.graph import ( # noqa: E402
|
||||
NodeSpec,
|
||||
SuccessCriterion,
|
||||
)
|
||||
from framework.graph.plan import Plan # noqa: E402
|
||||
|
||||
# Testing framework imports
|
||||
from framework.testing.prompts import ( # noqa: E402
|
||||
PYTEST_TEST_FILE_HEADER,
|
||||
)
|
||||
from framework.testing.prompts import PYTEST_TEST_FILE_HEADER # noqa: E402
|
||||
from framework.utils.io import atomic_write # noqa: E402
|
||||
|
||||
# Initialize MCP server
|
||||
@@ -587,13 +582,12 @@ def add_node(
|
||||
description: Annotated[str, "What this node does"],
|
||||
node_type: Annotated[
|
||||
str,
|
||||
"Type: event_loop (recommended), function, router. "
|
||||
"Deprecated: llm_generate, llm_tool_use (use event_loop instead)",
|
||||
"Type: event_loop (recommended), router.",
|
||||
],
|
||||
input_keys: Annotated[str, "JSON array of keys this node reads from shared memory"],
|
||||
output_keys: Annotated[str, "JSON array of keys this node writes to shared memory"],
|
||||
system_prompt: Annotated[str, "Instructions for LLM nodes"] = "",
|
||||
tools: Annotated[str, "JSON array of tool names for event_loop or llm_tool_use nodes"] = "[]",
|
||||
tools: Annotated[str, "JSON array of tool names for event_loop nodes"] = "[]",
|
||||
routes: Annotated[
|
||||
str, "JSON object mapping conditions to target node IDs for router nodes"
|
||||
] = "{}",
|
||||
@@ -665,24 +659,18 @@ def add_node(
|
||||
errors.append("Node must have an id")
|
||||
if not name:
|
||||
errors.append("Node must have a name")
|
||||
if node_type == "llm_tool_use" and not tools_list:
|
||||
errors.append(f"Node '{node_id}' of type llm_tool_use must specify tools")
|
||||
|
||||
# Reject removed node types
|
||||
if node_type in ("function", "llm_tool_use", "llm_generate"):
|
||||
errors.append(f"Node type '{node_type}' is no longer supported. Use 'event_loop' instead.")
|
||||
|
||||
if node_type == "router" and not routes_dict:
|
||||
errors.append(f"Router node '{node_id}' must specify routes")
|
||||
if node_type in ("llm_generate", "llm_tool_use") and not system_prompt:
|
||||
warnings.append(f"LLM node '{node_id}' should have a system_prompt")
|
||||
|
||||
# EventLoopNode validation
|
||||
if node_type == "event_loop" and not system_prompt:
|
||||
warnings.append(f"Event loop node '{node_id}' should have a system_prompt")
|
||||
|
||||
# Deprecated type warnings
|
||||
if node_type in ("llm_generate", "llm_tool_use"):
|
||||
warnings.append(
|
||||
f"Node type '{node_type}' is deprecated. Use 'event_loop' instead. "
|
||||
"EventLoopNode supports tool use, streaming, and judge-based evaluation."
|
||||
)
|
||||
|
||||
# Warn about client_facing on nodes with tools (likely autonomous work)
|
||||
if node_type == "event_loop" and client_facing and tools_list:
|
||||
warnings.append(
|
||||
@@ -838,8 +826,7 @@ def update_node(
|
||||
description: Annotated[str, "Updated description"] = "",
|
||||
node_type: Annotated[
|
||||
str,
|
||||
"Updated type: event_loop (recommended), function, router. "
|
||||
"Deprecated: llm_generate, llm_tool_use",
|
||||
"Updated type: event_loop (recommended), router.",
|
||||
] = "",
|
||||
input_keys: Annotated[str, "Updated JSON array of input keys"] = "",
|
||||
output_keys: Annotated[str, "Updated JSON array of output keys"] = "",
|
||||
@@ -919,24 +906,19 @@ def update_node(
|
||||
errors = []
|
||||
warnings = []
|
||||
|
||||
if node.node_type == "llm_tool_use" and not node.tools:
|
||||
errors.append(f"Node '{node_id}' of type llm_tool_use must specify tools")
|
||||
# Reject removed node types
|
||||
if node.node_type in ("function", "llm_tool_use", "llm_generate"):
|
||||
errors.append(
|
||||
f"Node type '{node.node_type}' is no longer supported. Use 'event_loop' instead."
|
||||
)
|
||||
|
||||
if node.node_type == "router" and not node.routes:
|
||||
errors.append(f"Router node '{node_id}' must specify routes")
|
||||
if node.node_type in ("llm_generate", "llm_tool_use") and not node.system_prompt:
|
||||
warnings.append(f"LLM node '{node_id}' should have a system_prompt")
|
||||
|
||||
# EventLoopNode validation
|
||||
if node.node_type == "event_loop" and not node.system_prompt:
|
||||
warnings.append(f"Event loop node '{node_id}' should have a system_prompt")
|
||||
|
||||
# Deprecated type warnings
|
||||
if node.node_type in ("llm_generate", "llm_tool_use"):
|
||||
warnings.append(
|
||||
f"Node type '{node.node_type}' is deprecated. Use 'event_loop' instead. "
|
||||
"EventLoopNode supports tool use, streaming, and judge-based evaluation."
|
||||
)
|
||||
|
||||
# nullable_output_keys must be a subset of output_keys
|
||||
if node.nullable_output_keys:
|
||||
invalid_nullable = [k for k in node.nullable_output_keys if k not in node.output_keys]
|
||||
@@ -1390,17 +1372,6 @@ def validate_graph() -> str:
|
||||
f"must be a subset of output_keys {node.output_keys}"
|
||||
)
|
||||
|
||||
# Deprecated node type warnings
|
||||
deprecated_nodes = [
|
||||
{"node_id": n.id, "type": n.node_type, "replacement": "event_loop"}
|
||||
for n in session.nodes
|
||||
if n.node_type in ("llm_generate", "llm_tool_use")
|
||||
]
|
||||
for dn in deprecated_nodes:
|
||||
warnings.append(
|
||||
f"Node '{dn['node_id']}' uses deprecated type '{dn['type']}'. Use 'event_loop' instead."
|
||||
)
|
||||
|
||||
# Warn if all event_loop nodes are client_facing (common misconfiguration)
|
||||
el_nodes = [n for n in session.nodes if n.node_type == "event_loop"]
|
||||
cf_el_nodes = [n for n in el_nodes if n.client_facing]
|
||||
@@ -1436,7 +1407,6 @@ def validate_graph() -> str:
|
||||
"event_loop_nodes": event_loop_nodes,
|
||||
"client_facing_nodes": client_facing_nodes,
|
||||
"feedback_edges": feedback_edges,
|
||||
"deprecated_node_types": deprecated_nodes,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -1646,9 +1616,8 @@ def export_graph() -> str:
|
||||
"""
|
||||
Export the validated graph as a GraphSpec for GraphExecutor.
|
||||
|
||||
Exports the complete agent definition including nodes, edges, goal,
|
||||
and evaluation rules. The GraphExecutor runs the graph with dynamic
|
||||
edge traversal and routing logic.
|
||||
Exports the complete agent definition including nodes, edges, and goal.
|
||||
The GraphExecutor runs the graph with dynamic edge traversal and routing logic.
|
||||
|
||||
AUTOMATICALLY WRITES FILES TO DISK:
|
||||
- exports/{agent-name}/agent.json - Full agent specification
|
||||
@@ -1856,7 +1825,6 @@ def export_graph() -> str:
|
||||
"files_written": files_written,
|
||||
"graph": graph_spec,
|
||||
"goal": session.goal.model_dump(),
|
||||
"evaluation_rules": _evaluation_rules,
|
||||
"required_tools": list(all_tools),
|
||||
"node_count": len(session.nodes),
|
||||
"edge_count": len(edges_list),
|
||||
@@ -1966,9 +1934,6 @@ def get_session_status() -> str:
|
||||
"mcp_servers": [s["name"] for s in session.mcp_servers],
|
||||
"event_loop_nodes": [n.id for n in session.nodes if n.node_type == "event_loop"],
|
||||
"client_facing_nodes": [n.id for n in session.nodes if n.client_facing],
|
||||
"deprecated_nodes": [
|
||||
n.id for n in session.nodes if n.node_type in ("llm_generate", "llm_tool_use")
|
||||
],
|
||||
"feedback_edges": [e.id for e in session.edges if e.priority < 0],
|
||||
}
|
||||
)
|
||||
@@ -2139,7 +2104,7 @@ def add_mcp_server(
|
||||
"total_mcp_servers": len(session.mcp_servers),
|
||||
"note": (
|
||||
f"MCP server '{name}' registered with {len(tool_names)} tools. "
|
||||
"These tools can now be used in llm_tool_use nodes."
|
||||
"These tools can now be used in event_loop nodes."
|
||||
),
|
||||
},
|
||||
indent=2,
|
||||
@@ -2240,7 +2205,7 @@ def list_mcp_tools(
|
||||
"success": True,
|
||||
"tools_by_server": all_tools,
|
||||
"total_tools": total_tools,
|
||||
"note": "Use these tool names in the 'tools' parameter when adding llm_tool_use nodes",
|
||||
"note": "Use these tool names in the 'tools' parameter when adding event_loop nodes",
|
||||
},
|
||||
indent=2,
|
||||
)
|
||||
@@ -2339,23 +2304,6 @@ def test_node(
|
||||
+ f"Max visits per graph run: {node_spec.max_node_visits}."
|
||||
)
|
||||
|
||||
elif node_spec.node_type in ("llm_generate", "llm_tool_use"):
|
||||
# Legacy LLM node types
|
||||
result["system_prompt"] = node_spec.system_prompt
|
||||
result["available_tools"] = node_spec.tools
|
||||
result["deprecation_warning"] = (
|
||||
f"Node type '{node_spec.node_type}' is deprecated. Use 'event_loop' instead."
|
||||
)
|
||||
|
||||
if mock_llm_response:
|
||||
result["mock_response"] = mock_llm_response
|
||||
result["simulation"] = "LLM would receive prompt and produce response"
|
||||
else:
|
||||
result["simulation"] = "LLM would be called with the system prompt and input data"
|
||||
|
||||
elif node_spec.node_type == "function":
|
||||
result["simulation"] = "Function node would execute deterministic logic"
|
||||
|
||||
# Show memory state after (simulated)
|
||||
result["expected_memory_state"] = {
|
||||
"inputs_available": {k: input_data.get(k, "<not provided>") for k in node_spec.input_keys},
|
||||
@@ -2449,7 +2397,7 @@ def test_graph(
|
||||
"writes": current_node.output_keys,
|
||||
}
|
||||
|
||||
if current_node.node_type in ("llm_generate", "llm_tool_use", "event_loop"):
|
||||
if current_node.node_type == "event_loop":
|
||||
step_info["prompt_preview"] = (
|
||||
current_node.system_prompt[:200] + "..."
|
||||
if current_node.system_prompt and len(current_node.system_prompt) > 200
|
||||
@@ -2520,466 +2468,6 @@ def test_graph(
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# FLEXIBLE EXECUTION TOOLS (Worker-Judge Pattern)
|
||||
# =============================================================================
|
||||
|
||||
# Storage for evaluation rules
# Module-level registry consumed by the HybridJudge tools below; kept in
# descending-priority order by add_evaluation_rule.
_evaluation_rules: list[dict] = []
|
||||
|
||||
|
||||
@mcp.tool()
def add_evaluation_rule(
    rule_id: Annotated[str, "Unique identifier for the rule"],
    description: Annotated[str, "Human-readable description of what this rule checks"],
    condition: Annotated[
        str,
        "Python expression with result, step, goal context. E.g., 'result.get(\"success\")'",
    ],
    action: Annotated[str, "Action when rule matches: accept, retry, replan, escalate"],
    feedback_template: Annotated[
        str, "Template for feedback message, can use {result}, {step}"
    ] = "",
    priority: Annotated[int, "Rule priority (higher = checked first)"] = 0,
) -> str:
    """
    Add an evaluation rule for the HybridJudge.

    Rules are checked in priority order before falling back to LLM evaluation.
    Use this to define deterministic success/failure conditions.

    Example conditions:
    - 'result.get("success") == True' - Check for explicit success flag
    - 'result.get("error_type") == "timeout"' - Check for specific error type
    - 'len(result.get("data", [])) > 0' - Check for non-empty data
    """
    global _evaluation_rules

    normalized_action = action.lower()

    # Validate action
    valid_actions = ["accept", "retry", "replan", "escalate"]
    if normalized_action not in valid_actions:
        return json.dumps(
            {
                "success": False,
                "error": f"Invalid action '{action}'. Must be one of: {valid_actions}",
            }
        )

    # Check for duplicate
    existing_ids = {entry["id"] for entry in _evaluation_rules}
    if rule_id in existing_ids:
        return json.dumps(
            {
                "success": False,
                "error": f"Rule '{rule_id}' already exists",
            }
        )

    new_rule = {
        "id": rule_id,
        "description": description,
        "condition": condition,
        "action": normalized_action,
        "feedback_template": feedback_template,
        "priority": priority,
    }

    # Append, then keep the registry sorted highest-priority first
    # (stable sort preserves insertion order among equal priorities).
    _evaluation_rules.append(new_rule)
    _evaluation_rules.sort(key=lambda entry: entry["priority"], reverse=True)

    return json.dumps(
        {
            "success": True,
            "rule": new_rule,
            "total_rules": len(_evaluation_rules),
        }
    )
|
||||
|
||||
|
||||
@mcp.tool()
def list_evaluation_rules() -> str:
    """List all configured evaluation rules for the HybridJudge."""
    payload = {
        "rules": _evaluation_rules,
        "total": len(_evaluation_rules),
    }
    return json.dumps(payload)
|
||||
|
||||
|
||||
@mcp.tool()
def remove_evaluation_rule(
    rule_id: Annotated[str, "ID of the rule to remove"],
) -> str:
    """Remove an evaluation rule."""
    global _evaluation_rules

    # Locate the first rule with a matching id, if any.
    match_index = next(
        (pos for pos, entry in enumerate(_evaluation_rules) if entry["id"] == rule_id),
        None,
    )

    if match_index is None:
        return json.dumps({"success": False, "error": f"Rule '{rule_id}' not found"})

    _evaluation_rules.pop(match_index)
    return json.dumps({"success": True, "removed": rule_id})
|
||||
|
||||
|
||||
@mcp.tool()
def create_plan(
    plan_id: Annotated[str, "Unique identifier for the plan"],
    goal_id: Annotated[str, "ID of the goal this plan achieves"],
    description: Annotated[str, "Description of what this plan does"],
    steps: Annotated[
        str,
        "JSON array of plan steps with id, description, action, inputs, outputs, deps",
    ],
    context: Annotated[str, "JSON object with initial context for execution"] = "{}",
) -> str:
    """
    Create a plan for flexible execution.

    Plans are executed by the Worker-Judge loop. Each step specifies:
    - id: Unique step identifier
    - description: What this step does
    - action: Object with action_type and parameters
      - action_type: "llm_call", "tool_use", "function", "code_execution", "sub_graph"
      - For llm_call: prompt, system_prompt
      - For tool_use: tool_name, tool_args
      - For function: function_name, function_args
      - For code_execution: code
    - inputs: Dict mapping input names to values or "$variable" references
    - expected_outputs: List of output keys this step should produce
    - dependencies: List of step IDs that must complete first (deps)

    Example step:
    {
        "id": "step_1",
        "description": "Fetch user data",
        "action": {"action_type": "tool_use", "tool_name": "get_user", ...},
        "inputs": {"user_id": "$input_user_id"},
        "expected_outputs": ["user_data"],
        "dependencies": []
    }
    """
    try:
        steps_list = json.loads(steps)
        context_dict = json.loads(context)
    except json.JSONDecodeError as e:
        return json.dumps({"success": False, "error": f"Invalid JSON: {e}"})

    # Validate steps
    errors = []
    step_ids = set()

    # Collect every declared id up front so the dependency check below can
    # see forward references (a step may depend on one defined later in the
    # list). The previous single-pass check grew step_ids while validating
    # and wrongly rejected such valid plans (validate_plan already builds
    # the full id set first).
    declared_ids = {step["id"] for step in steps_list if "id" in step}

    for i, step in enumerate(steps_list):
        if "id" not in step:
            errors.append(f"Step {i} missing 'id'")
        else:
            if step["id"] in step_ids:
                errors.append(f"Duplicate step id: {step['id']}")
            step_ids.add(step["id"])

        if "description" not in step:
            errors.append(f"Step {i} missing 'description'")

        if "action" not in step:
            errors.append(f"Step {i} missing 'action'")
        elif "action_type" not in step.get("action", {}):
            errors.append(f"Step {i} action missing 'action_type'")

        # Check dependencies exist (against the full declared id set)
        for dep in step.get("dependencies", []):
            if dep not in declared_ids:
                errors.append(f"Step {step.get('id', i)} has unknown dependency: {dep}")

    if errors:
        return json.dumps({"success": False, "errors": errors})

    # Build plan object
    plan = {
        "id": plan_id,
        "goal_id": goal_id,
        "description": description,
        "steps": steps_list,
        "context": context_dict,
        "revision": 1,
        "created_at": datetime.now().isoformat(),
    }

    return json.dumps(
        {
            "success": True,
            "plan": plan,
            "step_count": len(steps_list),
            "note": "Plan created. Use execute_plan to run it with the Worker-Judge loop.",
        },
        indent=2,
    )
|
||||
|
||||
|
||||
@mcp.tool()
def validate_plan(
    plan_json: Annotated[str, "JSON string of the plan to validate"],
) -> str:
    """
    Validate a plan structure before execution.

    Checks:
    - All required fields present
    - No circular dependencies
    - All dependencies reference existing steps
    - Action types are valid
    - Context flow: all $variable references can be resolved

    Returns a JSON string with "valid", "errors", "warnings", "step_count"
    and "context_flow" (the set of context keys visible to each step, or
    null when none could be computed).
    """
    try:
        plan = json.loads(plan_json)
    except json.JSONDecodeError as e:
        return json.dumps({"valid": False, "errors": [f"Invalid JSON: {e}"]})

    errors = []
    warnings = []

    # Check required fields
    required = ["id", "goal_id", "steps"]
    for field in required:
        if field not in plan:
            errors.append(f"Missing required field: {field}")

    # Without steps the remaining checks are meaningless — bail out early.
    if "steps" not in plan:
        return json.dumps({"valid": False, "errors": errors})

    steps = plan["steps"]
    step_ids = {s.get("id") for s in steps if "id" in s}
    steps_by_id = {s.get("id"): s for s in steps}

    # Check each step
    valid_action_types = ["llm_call", "tool_use", "function", "code_execution", "sub_graph"]

    for i, step in enumerate(steps):
        step_id = step.get("id", f"step_{i}")

        # Check dependencies
        for dep in step.get("dependencies", []):
            if dep not in step_ids:
                errors.append(f"Step '{step_id}': unknown dependency '{dep}'")

        # Check action type
        action = step.get("action", {})
        action_type = action.get("action_type")
        if action_type and action_type not in valid_action_types:
            errors.append(f"Step '{step_id}': invalid action_type '{action_type}'")

        # Check action has required params
        if action_type == "llm_call" and not action.get("prompt"):
            warnings.append(f"Step '{step_id}': llm_call without prompt")
        if action_type == "tool_use" and not action.get("tool_name"):
            errors.append(f"Step '{step_id}': tool_use requires tool_name")
        if action_type == "code_execution" and not action.get("code"):
            errors.append(f"Step '{step_id}': code_execution requires code")

    # Check for circular dependencies
    # Depth-first search over the dependency edges; `path` holds the current
    # DFS stack, `visited` the nodes already proven cycle-free.
    def has_cycle(step_id: str, visited: set, path: set) -> bool:
        if step_id in path:
            return True
        if step_id in visited:
            return False

        visited.add(step_id)
        path.add(step_id)

        step = next((s for s in steps if s.get("id") == step_id), None)
        if step:
            for dep in step.get("dependencies", []):
                if has_cycle(dep, visited, path):
                    return True

        path.remove(step_id)
        return False

    # Report at most one cycle — the first found.
    for step in steps:
        if has_cycle(step.get("id", ""), set(), set()):
            errors.append(f"Circular dependency detected involving step '{step.get('id')}'")
            break

    # === CONTEXT FLOW VALIDATION ===
    # Compute what keys each step can access (from dependencies' outputs)

    # Build output map (step_id -> expected_outputs)
    step_outputs: dict[str, set[str]] = {}
    for step in steps:
        step_outputs[step.get("id", "")] = set(step.get("expected_outputs", []))

    # Compute available context for each step in topological order
    available_context: dict[str, set[str]] = {}
    computed = set()
    remaining = set(step_ids)

    # Get initial context keys from plan.context
    initial_context = set(plan.get("context", {}).keys())

    # The inner loop computes at most one step per outer pass (it breaks
    # after each success), so 2*len(steps) passes are enough to settle every
    # computable step; steps stuck on uncomputed deps (e.g. in a cycle) are
    # simply left without an available_context entry.
    for _ in range(len(steps) * 2):
        if not remaining:
            break

        for step_id in list(remaining):
            step = steps_by_id.get(step_id)
            if not step:
                remaining.discard(step_id)
                continue

            deps = step.get("dependencies", [])

            # Can compute if all dependencies are computed
            if all(d in computed for d in deps):
                # Collect outputs from all dependencies (transitive)
                available = set(initial_context)
                for dep_id in deps:
                    available.update(step_outputs.get(dep_id, set()))
                    available.update(available_context.get(dep_id, set()))

                available_context[step_id] = available
                computed.add(step_id)
                remaining.discard(step_id)
                break

    # Check each step's inputs can be resolved
    context_errors = []
    context_warnings = []

    for step in steps:
        step_id = step.get("id", "")
        available = available_context.get(step_id, set())
        deps = step.get("dependencies", [])
        inputs = step.get("inputs", {})

        missing_vars = []
        for _, input_value in inputs.items():
            # Check $variable references
            if isinstance(input_value, str) and input_value.startswith("$"):
                var_name = input_value[1:]  # Remove $ prefix
                if var_name not in available:
                    missing_vars.append(var_name)

        if missing_vars:
            if not deps:
                # Entry step - inputs must come from initial context
                context_warnings.append(
                    f"Step '{step_id}' requires ${missing_vars} from initial context. "
                    f"Ensure these are provided when running the agent: {missing_vars}"
                )
            else:
                # Find which step could provide each missing var
                suggestions = []
                for var in missing_vars:
                    producers = [s.get("id") for s in steps if var in s.get("expected_outputs", [])]
                    if producers:
                        suggestions.append(f"${var} is produced by {producers} - add as dependency")
                    else:
                        suggestions.append(
                            f"${var} is not produced by any step - add a step that outputs '{var}'"
                        )

                context_errors.append(
                    f"Step '{step_id}' references ${missing_vars} but deps "
                    f"{deps} don't provide them. Suggestions: {'; '.join(suggestions)}"
                )

    errors.extend(context_errors)
    warnings.extend(context_warnings)

    return json.dumps(
        {
            "valid": len(errors) == 0,
            "errors": errors,
            "warnings": warnings,
            "step_count": len(steps),
            "context_flow": {step_id: list(keys) for step_id, keys in available_context.items()}
            if available_context
            else None,
        }
    )
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def simulate_plan_execution(
|
||||
plan_json: Annotated[str, "JSON string of the plan to simulate"],
|
||||
max_steps: Annotated[int, "Maximum steps to simulate"] = 20,
|
||||
) -> str:
|
||||
"""
|
||||
Simulate plan execution without actually running it.
|
||||
|
||||
Shows the order steps would execute based on dependencies.
|
||||
Useful for understanding the execution flow before running.
|
||||
"""
|
||||
try:
|
||||
plan = json.loads(plan_json)
|
||||
except json.JSONDecodeError as e:
|
||||
return json.dumps({"success": False, "error": f"Invalid JSON: {e}"})
|
||||
|
||||
# Validate first
|
||||
validation = json.loads(validate_plan(plan_json))
|
||||
if not validation["valid"]:
|
||||
return json.dumps(
|
||||
{
|
||||
"success": False,
|
||||
"error": "Plan is not valid",
|
||||
"validation_errors": validation["errors"],
|
||||
}
|
||||
)
|
||||
|
||||
steps = plan.get("steps", [])
|
||||
completed = set()
|
||||
execution_order = []
|
||||
iteration = 0
|
||||
|
||||
while len(completed) < len(steps) and iteration < max_steps:
|
||||
iteration += 1
|
||||
|
||||
# Find ready steps
|
||||
ready = []
|
||||
for step in steps:
|
||||
step_id = step.get("id")
|
||||
if step_id in completed:
|
||||
continue
|
||||
deps = set(step.get("dependencies", []))
|
||||
if deps.issubset(completed):
|
||||
ready.append(step)
|
||||
|
||||
if not ready:
|
||||
break
|
||||
|
||||
# Execute first ready step (in real execution, could be parallel)
|
||||
step = ready[0]
|
||||
step_id = step.get("id")
|
||||
|
||||
execution_order.append(
|
||||
{
|
||||
"iteration": iteration,
|
||||
"step_id": step_id,
|
||||
"description": step.get("description"),
|
||||
"action_type": step.get("action", {}).get("action_type"),
|
||||
"dependencies_met": list(step.get("dependencies", [])),
|
||||
"parallel_candidates": [s.get("id") for s in ready[1:]],
|
||||
}
|
||||
)
|
||||
|
||||
completed.add(step_id)
|
||||
|
||||
remaining = [s.get("id") for s in steps if s.get("id") not in completed]
|
||||
|
||||
return json.dumps(
|
||||
{
|
||||
"success": True,
|
||||
"execution_order": execution_order,
|
||||
"steps_simulated": len(execution_order),
|
||||
"remaining_steps": remaining,
|
||||
"plan_complete": len(remaining) == 0,
|
||||
"note": (
|
||||
"This is a simulation. Actual execution may differ "
|
||||
"based on step results and judge decisions."
|
||||
),
|
||||
},
|
||||
indent=2,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TESTING TOOLS (Goal-Based Evaluation)
|
||||
# =============================================================================
|
||||
@@ -3713,60 +3201,6 @@ def list_tests(
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# PLAN LOADING AND EXECUTION
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def load_plan_from_json(plan_json: str | dict) -> Plan:
|
||||
"""
|
||||
Load a Plan object from exported JSON.
|
||||
|
||||
Args:
|
||||
plan_json: JSON string or dict from export_graph()
|
||||
|
||||
Returns:
|
||||
Plan object ready for FlexibleGraphExecutor
|
||||
"""
|
||||
from framework.graph.plan import Plan
|
||||
|
||||
return Plan.from_json(plan_json)
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def load_exported_plan(
|
||||
plan_json: Annotated[str, "JSON string from export_graph() output"],
|
||||
) -> str:
|
||||
"""
|
||||
Validate and load an exported plan, returning its structure.
|
||||
|
||||
Use this to verify a plan can be loaded before execution.
|
||||
"""
|
||||
try:
|
||||
plan = load_plan_from_json(plan_json)
|
||||
return json.dumps(
|
||||
{
|
||||
"success": True,
|
||||
"plan_id": plan.id,
|
||||
"goal_id": plan.goal_id,
|
||||
"description": plan.description,
|
||||
"step_count": len(plan.steps),
|
||||
"steps": [
|
||||
{
|
||||
"id": s.id,
|
||||
"description": s.description,
|
||||
"action_type": s.action.action_type.value,
|
||||
"dependencies": s.dependencies,
|
||||
}
|
||||
for s in plan.steps
|
||||
],
|
||||
},
|
||||
indent=2,
|
||||
)
|
||||
except Exception as e:
|
||||
return json.dumps({"success": False, "error": str(e)})
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CREDENTIAL STORE TOOLS
|
||||
# =============================================================================
|
||||
|
||||
@@ -1093,9 +1093,7 @@ class AgentRunner:
|
||||
warnings.append(warning_msg)
|
||||
except ImportError:
|
||||
# aden_tools not installed - fall back to direct check
|
||||
has_llm_nodes = any(
|
||||
node.node_type in ("llm_generate", "llm_tool_use") for node in self.graph.nodes
|
||||
)
|
||||
has_llm_nodes = any(node.node_type == "event_loop" for node in self.graph.nodes)
|
||||
if has_llm_nodes:
|
||||
api_key_env = self._get_api_key_env_var(self.model)
|
||||
if api_key_env and not os.environ.get(api_key_env):
|
||||
|
||||
@@ -30,14 +30,14 @@ class NodeStepLog(BaseModel):
|
||||
"""Full tool and LLM details for one step within a node.
|
||||
|
||||
For EventLoopNode, each iteration is a step. For single-step nodes
|
||||
(LLMNode, FunctionNode, RouterNode), step_index is 0.
|
||||
(e.g. RouterNode), step_index is 0.
|
||||
|
||||
OTel-aligned fields (trace_id, span_id, execution_id) enable correlation
|
||||
and future OpenTelemetry export without schema changes.
|
||||
"""
|
||||
|
||||
node_id: str
|
||||
node_type: str = "" # "event_loop"|"llm_tool_use"|"llm_generate"|"function"|"router"
|
||||
node_type: str = "" # "event_loop" (the only valid type)
|
||||
step_index: int = 0 # iteration number for event_loop, 0 for single-step nodes
|
||||
llm_text: str = ""
|
||||
tool_calls: list[ToolCallLog] = Field(default_factory=list)
|
||||
|
||||
@@ -64,7 +64,7 @@ def sample_graph():
|
||||
id="process-webhook",
|
||||
name="Process Webhook",
|
||||
description="Process incoming webhook",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
input_keys=["webhook_data"],
|
||||
output_keys=["result"],
|
||||
),
|
||||
@@ -72,7 +72,7 @@ def sample_graph():
|
||||
id="process-api",
|
||||
name="Process API Request",
|
||||
description="Process API request",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
input_keys=["request_data"],
|
||||
output_keys=["result"],
|
||||
),
|
||||
@@ -538,7 +538,7 @@ class TestGraphSpecValidation:
|
||||
id="valid-node",
|
||||
name="Valid Node",
|
||||
description="A valid node",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
input_keys=[],
|
||||
output_keys=[],
|
||||
),
|
||||
|
||||
@@ -472,7 +472,7 @@ class TestEventDrivenEntryPoints:
|
||||
id="process-event",
|
||||
name="Process Event",
|
||||
description="Process incoming event",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
input_keys=["event"],
|
||||
output_keys=["result"],
|
||||
),
|
||||
|
||||
@@ -157,39 +157,6 @@ class TestEventLoopOutputKeyOverlap:
|
||||
key_errors = [e for e in errors if "output_key" in e]
|
||||
assert len(key_errors) == 0
|
||||
|
||||
def test_overlapping_keys_non_event_loop_no_error(self):
|
||||
"""Non-event_loop nodes with overlapping keys -> no error (last-wins OK)."""
|
||||
graph = GraphSpec(
|
||||
id="g1",
|
||||
goal_id="goal1",
|
||||
entry_node="src",
|
||||
nodes=[
|
||||
NodeSpec(id="src", name="src", description="Source node"),
|
||||
NodeSpec(
|
||||
id="a",
|
||||
name="a",
|
||||
description="Node a",
|
||||
node_type="llm_generate",
|
||||
output_keys=["shared"],
|
||||
),
|
||||
NodeSpec(
|
||||
id="b",
|
||||
name="b",
|
||||
description="Node b",
|
||||
node_type="llm_generate",
|
||||
output_keys=["shared"],
|
||||
),
|
||||
],
|
||||
edges=[
|
||||
EdgeSpec(id="src->a", source="src", target="a", condition=EdgeCondition.ON_SUCCESS),
|
||||
EdgeSpec(id="src->b", source="src", target="b", condition=EdgeCondition.ON_SUCCESS),
|
||||
],
|
||||
)
|
||||
|
||||
errors = graph.validate()
|
||||
key_errors = [e for e in errors if "output_key" in e]
|
||||
assert len(key_errors) == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Baseline: no fan-out -> no errors from these rules
|
||||
|
||||
@@ -85,14 +85,14 @@ async def test_direct_key_access_in_conditional_edge():
|
||||
id="score_node",
|
||||
name="ScoreNode",
|
||||
description="Outputs a score",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["score"],
|
||||
),
|
||||
NodeSpec(
|
||||
id="high_score_node",
|
||||
name="HighScoreNode",
|
||||
description="Handles high scores",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
input_keys=["score"],
|
||||
output_keys=["result"],
|
||||
),
|
||||
@@ -153,14 +153,14 @@ async def test_backward_compatibility_output_syntax():
|
||||
id="score_node",
|
||||
name="ScoreNode",
|
||||
description="Outputs a score",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["score"],
|
||||
),
|
||||
NodeSpec(
|
||||
id="consumer_node",
|
||||
name="ConsumerNode",
|
||||
description="Consumer",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
input_keys=["score"],
|
||||
output_keys=["processed"],
|
||||
),
|
||||
@@ -221,14 +221,14 @@ async def test_multiple_keys_in_expression():
|
||||
id="multi_key_node",
|
||||
name="MultiKeyNode",
|
||||
description="Outputs multiple keys",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["x", "y"],
|
||||
),
|
||||
NodeSpec(
|
||||
id="consumer_node",
|
||||
name="ConsumerNode",
|
||||
description="Consumer",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
input_keys=["x", "y"],
|
||||
output_keys=["processed"],
|
||||
),
|
||||
@@ -295,14 +295,14 @@ async def test_negative_case_condition_false():
|
||||
id="low_score_node",
|
||||
name="LowScoreNode",
|
||||
description="Outputs low score",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["score"],
|
||||
),
|
||||
NodeSpec(
|
||||
id="high_score_handler",
|
||||
name="HighScoreHandler",
|
||||
description="Should NOT execute",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
input_keys=["score"],
|
||||
output_keys=["result"],
|
||||
),
|
||||
|
||||
@@ -826,7 +826,7 @@ async def test_event_loop_no_executor_retry(runtime):
|
||||
result = await executor.execute(graph, goal, {})
|
||||
|
||||
assert not result.success
|
||||
assert failing_node.attempt_count == 1 # Executor forced max_retries to 0
|
||||
assert failing_node.attempt_count == 3 # Custom nodes keep their max_retries
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
@@ -1007,11 +1007,20 @@ async def test_internal_node_no_client_output():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_mixed_node_graph(runtime):
|
||||
"""function -> event_loop -> function end-to-end."""
|
||||
"""Simple node -> event_loop -> simple node end-to-end."""
|
||||
|
||||
# Function 1: write leads to memory
|
||||
def load_leads(**kwargs):
|
||||
return ["lead_A", "lead_B", "lead_C"]
|
||||
class LoadLeadsNode(NodeProtocol):
|
||||
async def execute(self, ctx: NodeContext) -> NodeResult:
|
||||
leads = ["lead_A", "lead_B", "lead_C"]
|
||||
ctx.memory.write("leads", leads)
|
||||
return NodeResult(success=True, output={"leads": leads})
|
||||
|
||||
class FormatOutputNode(NodeProtocol):
|
||||
async def execute(self, ctx: NodeContext) -> NodeResult:
|
||||
summary = ctx.input_data.get("summary", ctx.memory.read("summary") or "no summary")
|
||||
report = f"Report: {summary}"
|
||||
ctx.memory.write("report", report)
|
||||
return NodeResult(success=True, output={"report": report})
|
||||
|
||||
# Event loop: process leads, produce summary
|
||||
el_scripts = [
|
||||
@@ -1028,18 +1037,12 @@ async def test_mixed_node_graph(runtime):
|
||||
]
|
||||
el_llm = ScriptableMockLLMProvider(el_scripts)
|
||||
|
||||
# Function 2: format final output
|
||||
def format_output(**kwargs):
|
||||
summary = kwargs.get("summary", "no summary")
|
||||
return f"Report: {summary}"
|
||||
|
||||
# Node specs
|
||||
load_spec = NodeSpec(
|
||||
id="load",
|
||||
name="Load Leads",
|
||||
description="Load lead data",
|
||||
node_type="function",
|
||||
function="load_leads",
|
||||
node_type="event_loop",
|
||||
output_keys=["leads"],
|
||||
)
|
||||
process_spec = NodeSpec(
|
||||
@@ -1047,17 +1050,13 @@ async def test_mixed_node_graph(runtime):
|
||||
name="Process Leads",
|
||||
description="Process leads with LLM",
|
||||
node_type="event_loop",
|
||||
# input_keys left empty: EventLoopNode._check_pause() reads "pause_requested"
|
||||
# from memory, and a restrictive scope would block it. Data flows via input_data.
|
||||
output_keys=["summary"],
|
||||
)
|
||||
format_spec = NodeSpec(
|
||||
id="format",
|
||||
name="Format Output",
|
||||
description="Format final report",
|
||||
node_type="function",
|
||||
function="format_output",
|
||||
# input_keys left empty for same scoping reason with FunctionNode
|
||||
node_type="event_loop",
|
||||
output_keys=["report"],
|
||||
)
|
||||
|
||||
@@ -1078,9 +1077,9 @@ async def test_mixed_node_graph(runtime):
|
||||
goal = Goal(id="test_goal", name="Pipeline Test", description="test full pipeline")
|
||||
|
||||
executor = GraphExecutor(runtime=runtime, llm=el_llm)
|
||||
executor.register_function("load", load_leads)
|
||||
executor.register_node("load", LoadLeadsNode())
|
||||
executor.register_node("process", EventLoopNode(config=LoopConfig(max_iterations=5)))
|
||||
executor.register_function("format", format_output)
|
||||
executor.register_node("format", FormatOutputNode())
|
||||
|
||||
result = await executor.execute(graph, goal, {})
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ def test_client_facing_defaults_false():
|
||||
id="n1",
|
||||
name="Node 1",
|
||||
description="test",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
)
|
||||
assert spec.client_facing is False
|
||||
|
||||
@@ -143,7 +143,7 @@ def test_registered_event_loop_returns_impl(runtime):
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_event_loop_max_retries_forced_zero(runtime):
|
||||
"""An event_loop node with max_retries=3 should only execute once (no retry)."""
|
||||
"""Custom NodeProtocol impls with node_type=event_loop keep their max_retries."""
|
||||
node_spec = NodeSpec(
|
||||
id="el_fail",
|
||||
name="Failing Event Loop",
|
||||
@@ -171,9 +171,9 @@ async def test_event_loop_max_retries_forced_zero(runtime):
|
||||
|
||||
result = await executor.execute(graph, goal, {})
|
||||
|
||||
# Event loop nodes get max_retries overridden to 0, meaning execute once then fail
|
||||
# Custom nodes (not EventLoopNode instances) keep their max_retries
|
||||
assert not result.success
|
||||
assert failing_node.attempt_count == 1
|
||||
assert failing_node.attempt_count == 3
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -246,21 +246,21 @@ async def test_event_loop_max_retries_positive_logs_warning(runtime, caplog):
|
||||
with caplog.at_level(logging.WARNING):
|
||||
await executor.execute(graph, goal, {})
|
||||
|
||||
assert "Overriding to 0" in caplog.text
|
||||
assert "el_warn" in caplog.text
|
||||
# Custom nodes (not EventLoopNode instances) don't get override warning
|
||||
assert "Overriding to 0" not in caplog.text
|
||||
|
||||
|
||||
# --- Existing node types unaffected ---
|
||||
|
||||
|
||||
def test_existing_node_types_unchanged():
|
||||
"""All pre-existing node types must still be in VALID_NODE_TYPES with defaults preserved."""
|
||||
expected = {"llm_tool_use", "llm_generate", "router", "function", "human_input"}
|
||||
assert expected.issubset(GraphExecutor.VALID_NODE_TYPES)
|
||||
"""Only event_loop is a valid node type."""
|
||||
expected = {"event_loop"}
|
||||
assert expected == GraphExecutor.VALID_NODE_TYPES
|
||||
|
||||
# Default node_type is still llm_tool_use
|
||||
# Default node_type is event_loop
|
||||
spec = NodeSpec(id="x", name="X", description="x")
|
||||
assert spec.node_type == "llm_tool_use"
|
||||
assert spec.node_type == "event_loop"
|
||||
|
||||
# Default max_retries is still 3
|
||||
assert spec.max_retries == 3
|
||||
|
||||
@@ -106,7 +106,7 @@ class TestExecutionQuality:
|
||||
id="node1",
|
||||
name="Always Succeeds",
|
||||
description="Never fails",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["result"],
|
||||
),
|
||||
],
|
||||
@@ -151,6 +151,7 @@ class TestExecutionQuality:
|
||||
)
|
||||
|
||||
# Create graph with flaky node (fails 2 times before succeeding)
|
||||
# (actual impl from registry is FlakyNode)
|
||||
graph = GraphSpec(
|
||||
id="test-graph",
|
||||
goal_id=goal.id,
|
||||
@@ -159,7 +160,7 @@ class TestExecutionQuality:
|
||||
id="flaky",
|
||||
name="Flaky Node",
|
||||
description="Fails then succeeds",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["result"],
|
||||
max_retries=3, # Allow retries
|
||||
),
|
||||
@@ -206,6 +207,7 @@ class TestExecutionQuality:
|
||||
)
|
||||
|
||||
# Create graph with always-failing node
|
||||
# (actual impl from registry is AlwaysFailsNode)
|
||||
graph = GraphSpec(
|
||||
id="test-graph",
|
||||
goal_id=goal.id,
|
||||
@@ -214,7 +216,7 @@ class TestExecutionQuality:
|
||||
id="fails",
|
||||
name="Always Fails",
|
||||
description="Never succeeds",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["result"],
|
||||
max_retries=2, # Will retry twice then fail
|
||||
),
|
||||
@@ -261,6 +263,7 @@ class TestExecutionQuality:
|
||||
)
|
||||
|
||||
# Create graph with multiple flaky nodes
|
||||
# (actual impls from registry are FlakyNode instances)
|
||||
graph = GraphSpec(
|
||||
id="test-graph",
|
||||
goal_id=goal.id,
|
||||
@@ -269,7 +272,7 @@ class TestExecutionQuality:
|
||||
id="flaky1",
|
||||
name="Flaky Node 1",
|
||||
description="Fails once",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["result1"],
|
||||
max_retries=3,
|
||||
),
|
||||
@@ -277,7 +280,7 @@ class TestExecutionQuality:
|
||||
id="flaky2",
|
||||
name="Flaky Node 2",
|
||||
description="Fails twice",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
input_keys=["result1"],
|
||||
output_keys=["result2"],
|
||||
max_retries=3,
|
||||
@@ -286,7 +289,7 @@ class TestExecutionQuality:
|
||||
id="success",
|
||||
name="Success Node",
|
||||
description="Always succeeds",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
input_keys=["result2"],
|
||||
output_keys=["final"],
|
||||
),
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
"""Tests for ExecutionStream retention behavior."""
|
||||
|
||||
import json
|
||||
from collections.abc import Callable
|
||||
from collections.abc import AsyncIterator, Callable
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.graph import Goal, NodeSpec, SuccessCriterion
|
||||
from framework.graph.edge import GraphSpec
|
||||
from framework.llm.provider import LLMProvider, LLMResponse, Tool
|
||||
from framework.llm.stream_events import FinishEvent, StreamEvent, TextDeltaEvent, ToolCallEvent
|
||||
from framework.runtime.event_bus import EventBus
|
||||
from framework.runtime.execution_stream import EntryPointSpec, ExecutionStream
|
||||
from framework.runtime.outcome_aggregator import OutcomeAggregator
|
||||
@@ -16,7 +18,13 @@ from framework.storage.concurrent import ConcurrentStorage
|
||||
|
||||
|
||||
class DummyLLMProvider(LLMProvider):
|
||||
"""Deterministic LLM provider for execution stream tests."""
|
||||
"""Deterministic LLM provider for execution stream tests.
|
||||
|
||||
Uses set_output tool call to properly set outputs, avoiding stall detection.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._call_count = 0
|
||||
|
||||
def complete(
|
||||
self,
|
||||
@@ -28,7 +36,7 @@ class DummyLLMProvider(LLMProvider):
|
||||
json_mode: bool = False,
|
||||
max_retries: int | None = None,
|
||||
) -> LLMResponse:
|
||||
return LLMResponse(content=json.dumps({"result": "ok"}), model="dummy")
|
||||
return LLMResponse(content="Summary for compaction.", model="dummy")
|
||||
|
||||
def complete_with_tools(
|
||||
self,
|
||||
@@ -38,7 +46,29 @@ class DummyLLMProvider(LLMProvider):
|
||||
tool_executor: Callable,
|
||||
max_iterations: int = 10,
|
||||
) -> LLMResponse:
|
||||
return LLMResponse(content=json.dumps({"result": "ok"}), model="dummy")
|
||||
return LLMResponse(content="Summary for compaction.", model="dummy")
|
||||
|
||||
async def stream(
|
||||
self,
|
||||
messages: list[dict[str, Any]],
|
||||
system: str = "",
|
||||
tools: list[Tool] | None = None,
|
||||
max_tokens: int = 4096,
|
||||
) -> AsyncIterator[StreamEvent]:
|
||||
self._call_count += 1
|
||||
|
||||
if self._call_count == 1:
|
||||
# First call: set the output via tool call
|
||||
yield ToolCallEvent(
|
||||
tool_use_id=f"tc_{self._call_count}",
|
||||
tool_name="set_output",
|
||||
tool_input={"key": "result", "value": "ok"},
|
||||
)
|
||||
yield FinishEvent(stop_reason="tool_use", input_tokens=10, output_tokens=10)
|
||||
else:
|
||||
# Subsequent calls: just finish with text
|
||||
yield TextDeltaEvent(content="Done.", snapshot="Done.")
|
||||
yield FinishEvent(stop_reason="end_turn", input_tokens=5, output_tokens=5)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -62,7 +92,7 @@ async def test_execution_stream_retention(tmp_path):
|
||||
id="hello",
|
||||
name="Hello",
|
||||
description="Return a result",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
input_keys=["user_name"],
|
||||
output_keys=["result"],
|
||||
system_prompt='Return JSON: {"result": "ok"}',
|
||||
@@ -149,7 +179,7 @@ async def test_shared_session_reuses_directory_and_memory(tmp_path):
|
||||
id="hello",
|
||||
name="Hello",
|
||||
description="Return a result",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
input_keys=["user_name"],
|
||||
output_keys=["result"],
|
||||
system_prompt='Return JSON: {"result": "ok"}',
|
||||
|
||||
@@ -81,7 +81,9 @@ def goal():
|
||||
|
||||
def test_max_node_visits_default():
|
||||
"""NodeSpec.max_node_visits should default to 1."""
|
||||
spec = NodeSpec(id="n", name="N", description="test", node_type="function", output_keys=["out"])
|
||||
spec = NodeSpec(
|
||||
id="n", name="N", description="test", node_type="event_loop", output_keys=["out"]
|
||||
)
|
||||
assert spec.max_node_visits == 1
|
||||
|
||||
|
||||
@@ -101,7 +103,7 @@ async def test_visit_limit_skips_node(runtime, goal):
|
||||
id="a",
|
||||
name="A",
|
||||
description="entry with visit limit",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["a_out"],
|
||||
max_node_visits=1,
|
||||
)
|
||||
@@ -109,7 +111,7 @@ async def test_visit_limit_skips_node(runtime, goal):
|
||||
id="b",
|
||||
name="B",
|
||||
description="middle node",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["b_out"],
|
||||
max_node_visits=0, # unlimited — let max_steps guard
|
||||
)
|
||||
@@ -159,7 +161,7 @@ async def test_visit_limit_allows_multiple(runtime, goal):
|
||||
id="a",
|
||||
name="A",
|
||||
description="entry allows two visits",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["a_out"],
|
||||
max_node_visits=2,
|
||||
)
|
||||
@@ -167,7 +169,7 @@ async def test_visit_limit_allows_multiple(runtime, goal):
|
||||
id="b",
|
||||
name="B",
|
||||
description="middle node",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["b_out"],
|
||||
max_node_visits=0, # unlimited
|
||||
)
|
||||
@@ -215,7 +217,7 @@ async def test_visit_limit_zero_unlimited(runtime, goal):
|
||||
id="a",
|
||||
name="A",
|
||||
description="unlimited visits",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["a_out"],
|
||||
max_node_visits=0,
|
||||
)
|
||||
@@ -223,7 +225,7 @@ async def test_visit_limit_zero_unlimited(runtime, goal):
|
||||
id="b",
|
||||
name="B",
|
||||
description="middle node",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["b_out"],
|
||||
max_node_visits=0,
|
||||
)
|
||||
@@ -274,7 +276,7 @@ async def test_conditional_feedback_edge(runtime, goal):
|
||||
id="director",
|
||||
name="Director",
|
||||
description="plans work",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["plan"],
|
||||
max_node_visits=2,
|
||||
)
|
||||
@@ -282,7 +284,7 @@ async def test_conditional_feedback_edge(runtime, goal):
|
||||
id="writer",
|
||||
name="Writer",
|
||||
description="writes draft",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["draft", "needs_revision"],
|
||||
max_node_visits=2,
|
||||
)
|
||||
@@ -290,7 +292,7 @@ async def test_conditional_feedback_edge(runtime, goal):
|
||||
id="output",
|
||||
name="Output",
|
||||
description="final output",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["final"],
|
||||
)
|
||||
|
||||
@@ -370,7 +372,7 @@ async def test_conditional_feedback_false(runtime, goal):
|
||||
id="director",
|
||||
name="Director",
|
||||
description="plans work",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["plan"],
|
||||
max_node_visits=2,
|
||||
)
|
||||
@@ -378,14 +380,14 @@ async def test_conditional_feedback_false(runtime, goal):
|
||||
id="writer",
|
||||
name="Writer",
|
||||
description="writes draft",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["draft", "needs_revision"],
|
||||
)
|
||||
output_node = NodeSpec(
|
||||
id="output",
|
||||
name="Output",
|
||||
description="final output",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["final"],
|
||||
)
|
||||
|
||||
@@ -458,14 +460,14 @@ async def test_visit_counts_in_result(runtime, goal):
|
||||
id="a",
|
||||
name="A",
|
||||
description="entry",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["a_out"],
|
||||
)
|
||||
node_b = NodeSpec(
|
||||
id="b",
|
||||
name="B",
|
||||
description="terminal",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
input_keys=["a_out"],
|
||||
output_keys=["b_out"],
|
||||
)
|
||||
@@ -509,21 +511,21 @@ async def test_conditional_priority_prevents_fanout(runtime, goal):
|
||||
id="writer",
|
||||
name="Writer",
|
||||
description="produces output",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["draft", "needs_revision"],
|
||||
)
|
||||
output_node = NodeSpec(
|
||||
id="output",
|
||||
name="Output",
|
||||
description="forward target",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["final"],
|
||||
)
|
||||
director = NodeSpec(
|
||||
id="director",
|
||||
name="Director",
|
||||
description="feedback target",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["plan"],
|
||||
max_node_visits=2,
|
||||
)
|
||||
|
||||
@@ -79,7 +79,7 @@ async def test_executor_respects_custom_max_retries_high(runtime):
|
||||
name="Flaky Node",
|
||||
description="A node that fails multiple times before succeeding",
|
||||
max_retries=10, # Should allow 10 retries
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["result"],
|
||||
)
|
||||
|
||||
@@ -123,7 +123,7 @@ async def test_executor_respects_custom_max_retries_low(runtime):
|
||||
name="Fragile Node",
|
||||
description="A node with low retry tolerance",
|
||||
max_retries=2, # max_retries=N means N total attempts allowed
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["result"],
|
||||
)
|
||||
|
||||
@@ -166,7 +166,7 @@ async def test_executor_respects_default_max_retries(runtime):
|
||||
name="Default Node",
|
||||
description="A node using default retry settings",
|
||||
# max_retries not specified, should default to 3
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["result"],
|
||||
)
|
||||
|
||||
@@ -211,7 +211,7 @@ async def test_executor_max_retries_two_succeeds_on_second(runtime):
|
||||
name="Two Retry Node",
|
||||
description="A node with two attempts allowed",
|
||||
max_retries=2, # max_retries=N means N total attempts allowed
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["result"],
|
||||
)
|
||||
|
||||
@@ -253,7 +253,7 @@ async def test_executor_different_nodes_different_max_retries(runtime):
|
||||
name="Node 1",
|
||||
description="First node in multi-node test",
|
||||
max_retries=2,
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["result1"],
|
||||
)
|
||||
|
||||
@@ -262,7 +262,7 @@ async def test_executor_different_nodes_different_max_retries(runtime):
|
||||
name="Node 2",
|
||||
description="Second node in multi-node test",
|
||||
max_retries=5,
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
input_keys=["result1"],
|
||||
output_keys=["result2"],
|
||||
)
|
||||
|
||||
+29
-25
@@ -116,7 +116,7 @@ def _make_fanout_graph(
|
||||
id="source",
|
||||
name="Source",
|
||||
description="entry",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["data"],
|
||||
)
|
||||
|
||||
@@ -164,10 +164,10 @@ def _make_fanout_graph(
|
||||
async def test_fanout_triggers_on_multiple_success_edges(runtime, goal):
|
||||
"""Fan-out should activate when a node has >1 ON_SUCCESS outgoing edges."""
|
||||
b1 = NodeSpec(
|
||||
id="b1", name="B1", description="branch 1", node_type="function", output_keys=["b1_out"]
|
||||
id="b1", name="B1", description="branch 1", node_type="event_loop", output_keys=["b1_out"]
|
||||
)
|
||||
b2 = NodeSpec(
|
||||
id="b2", name="B2", description="branch 2", node_type="function", output_keys=["b2_out"]
|
||||
id="b2", name="B2", description="branch 2", node_type="event_loop", output_keys=["b2_out"]
|
||||
)
|
||||
|
||||
graph = _make_fanout_graph([b1, b2])
|
||||
@@ -195,10 +195,10 @@ async def test_branches_execute_concurrently(runtime, goal):
|
||||
"""All fan-out branches should be launched via asyncio.gather (concurrent)."""
|
||||
order = []
|
||||
b1 = NodeSpec(
|
||||
id="b1", name="B1", description="branch 1", node_type="function", output_keys=["b1_done"]
|
||||
id="b1", name="B1", description="branch 1", node_type="event_loop", output_keys=["b1_done"]
|
||||
)
|
||||
b2 = NodeSpec(
|
||||
id="b2", name="B2", description="branch 2", node_type="function", output_keys=["b2_done"]
|
||||
id="b2", name="B2", description="branch 2", node_type="event_loop", output_keys=["b2_done"]
|
||||
)
|
||||
|
||||
graph = _make_fanout_graph([b1, b2])
|
||||
@@ -223,13 +223,17 @@ async def test_branches_execute_concurrently(runtime, goal):
|
||||
async def test_convergence_at_fan_in_node(runtime, goal):
|
||||
"""After fan-out branches complete, execution should continue at convergence node."""
|
||||
b1 = NodeSpec(
|
||||
id="b1", name="B1", description="branch 1", node_type="function", output_keys=["b1_out"]
|
||||
id="b1", name="B1", description="branch 1", node_type="event_loop", output_keys=["b1_out"]
|
||||
)
|
||||
b2 = NodeSpec(
|
||||
id="b2", name="B2", description="branch 2", node_type="function", output_keys=["b2_out"]
|
||||
id="b2", name="B2", description="branch 2", node_type="event_loop", output_keys=["b2_out"]
|
||||
)
|
||||
merge = NodeSpec(
|
||||
id="merge", name="Merge", description="fan-in", node_type="function", output_keys=["merged"]
|
||||
id="merge",
|
||||
name="Merge",
|
||||
description="fan-in",
|
||||
node_type="event_loop",
|
||||
output_keys=["merged"],
|
||||
)
|
||||
|
||||
graph = _make_fanout_graph([b1, b2], fan_in_node=merge)
|
||||
@@ -255,13 +259,13 @@ async def test_convergence_at_fan_in_node(runtime, goal):
|
||||
async def test_fail_all_strategy_raises_on_branch_failure(runtime, goal):
|
||||
"""fail_all should raise RuntimeError if any branch fails."""
|
||||
b1 = NodeSpec(
|
||||
id="b1", name="B1", description="ok branch", node_type="function", output_keys=["b1_out"]
|
||||
id="b1", name="B1", description="ok branch", node_type="event_loop", output_keys=["b1_out"]
|
||||
)
|
||||
b2 = NodeSpec(
|
||||
id="b2",
|
||||
name="B2",
|
||||
description="bad branch",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["b2_out"],
|
||||
max_retries=1,
|
||||
)
|
||||
@@ -290,13 +294,13 @@ async def test_fail_all_strategy_raises_on_branch_failure(runtime, goal):
|
||||
async def test_continue_others_strategy_allows_partial_success(runtime, goal):
|
||||
"""continue_others should let successful branches complete even if one fails."""
|
||||
b1 = NodeSpec(
|
||||
id="b1", name="B1", description="ok", node_type="function", output_keys=["b1_out"]
|
||||
id="b1", name="B1", description="ok", node_type="event_loop", output_keys=["b1_out"]
|
||||
)
|
||||
b2 = NodeSpec(
|
||||
id="b2",
|
||||
name="B2",
|
||||
description="fail",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["b2_out"],
|
||||
max_retries=1,
|
||||
)
|
||||
@@ -325,13 +329,13 @@ async def test_continue_others_strategy_allows_partial_success(runtime, goal):
|
||||
async def test_wait_all_strategy_collects_all_results(runtime, goal):
|
||||
"""wait_all should wait for all branches before proceeding."""
|
||||
b1 = NodeSpec(
|
||||
id="b1", name="B1", description="ok", node_type="function", output_keys=["b1_out"]
|
||||
id="b1", name="B1", description="ok", node_type="event_loop", output_keys=["b1_out"]
|
||||
)
|
||||
b2 = NodeSpec(
|
||||
id="b2",
|
||||
name="B2",
|
||||
description="fail",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["b2_out"],
|
||||
max_retries=1,
|
||||
)
|
||||
@@ -365,12 +369,12 @@ async def test_per_branch_retry(runtime, goal):
|
||||
id="b1",
|
||||
name="B1",
|
||||
description="flaky",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["b1_out"],
|
||||
max_retries=5,
|
||||
)
|
||||
b2 = NodeSpec(
|
||||
id="b2", name="B2", description="solid", node_type="function", output_keys=["b2_out"]
|
||||
id="b2", name="B2", description="solid", node_type="event_loop", output_keys=["b2_out"]
|
||||
)
|
||||
|
||||
graph = _make_fanout_graph([b1, b2])
|
||||
@@ -394,13 +398,13 @@ async def test_per_branch_retry(runtime, goal):
|
||||
async def test_single_edge_no_parallel_overhead(runtime, goal):
|
||||
"""A single outgoing edge should follow normal sequential path, not fan-out."""
|
||||
n1 = NodeSpec(
|
||||
id="n1", name="N1", description="entry", node_type="function", output_keys=["out1"]
|
||||
id="n1", name="N1", description="entry", node_type="event_loop", output_keys=["out1"]
|
||||
)
|
||||
n2 = NodeSpec(
|
||||
id="n2",
|
||||
name="N2",
|
||||
description="next",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
input_keys=["out1"],
|
||||
output_keys=["out2"],
|
||||
)
|
||||
@@ -432,8 +436,8 @@ async def test_single_edge_no_parallel_overhead(runtime, goal):
|
||||
|
||||
def test_detect_fan_out_nodes():
|
||||
"""GraphSpec.detect_fan_out_nodes should identify fan-out topology."""
|
||||
b1 = NodeSpec(id="b1", name="B1", description="b", node_type="function", output_keys=["x"])
|
||||
b2 = NodeSpec(id="b2", name="B2", description="b", node_type="function", output_keys=["y"])
|
||||
b1 = NodeSpec(id="b1", name="B1", description="b", node_type="event_loop", output_keys=["x"])
|
||||
b2 = NodeSpec(id="b2", name="B2", description="b", node_type="event_loop", output_keys=["y"])
|
||||
graph = _make_fanout_graph([b1, b2])
|
||||
|
||||
fan_outs = graph.detect_fan_out_nodes()
|
||||
@@ -447,10 +451,10 @@ def test_detect_fan_out_nodes():
|
||||
|
||||
def test_detect_fan_in_nodes():
|
||||
"""GraphSpec.detect_fan_in_nodes should identify convergence topology."""
|
||||
b1 = NodeSpec(id="b1", name="B1", description="b", node_type="function", output_keys=["x"])
|
||||
b2 = NodeSpec(id="b2", name="B2", description="b", node_type="function", output_keys=["y"])
|
||||
b1 = NodeSpec(id="b1", name="B1", description="b", node_type="event_loop", output_keys=["x"])
|
||||
b2 = NodeSpec(id="b2", name="B2", description="b", node_type="event_loop", output_keys=["y"])
|
||||
merge = NodeSpec(
|
||||
id="merge", name="Merge", description="m", node_type="function", output_keys=["z"]
|
||||
id="merge", name="Merge", description="m", node_type="event_loop", output_keys=["z"]
|
||||
)
|
||||
graph = _make_fanout_graph([b1, b2], fan_in_node=merge)
|
||||
|
||||
@@ -467,10 +471,10 @@ def test_detect_fan_in_nodes():
|
||||
async def test_parallel_disabled_uses_sequential(runtime, goal):
|
||||
"""When enable_parallel_execution=False, multi-edge should follow first match only."""
|
||||
b1 = NodeSpec(
|
||||
id="b1", name="B1", description="b1", node_type="function", output_keys=["b1_out"]
|
||||
id="b1", name="B1", description="b1", node_type="event_loop", output_keys=["b1_out"]
|
||||
)
|
||||
b2 = NodeSpec(
|
||||
id="b2", name="B2", description="b2", node_type="function", output_keys=["b2_out"]
|
||||
id="b2", name="B2", description="b2", node_type="event_loop", output_keys=["b2_out"]
|
||||
)
|
||||
|
||||
graph = _make_fanout_graph([b1, b2])
|
||||
|
||||
@@ -1,442 +0,0 @@
|
||||
"""
|
||||
Tests for the Worker-Judge flexible execution pattern.
|
||||
|
||||
Tests cover:
|
||||
- Plan and PlanStep data structures
|
||||
- Code sandbox security
|
||||
- HybridJudge rule evaluation
|
||||
- WorkerNode action dispatch
|
||||
- FlexibleGraphExecutor end-to-end
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.graph.code_sandbox import (
|
||||
CodeSandbox,
|
||||
safe_eval,
|
||||
safe_exec,
|
||||
)
|
||||
from framework.graph.goal import Goal, SuccessCriterion
|
||||
from framework.graph.judge import HybridJudge, create_default_judge
|
||||
from framework.graph.plan import (
|
||||
ActionSpec,
|
||||
ActionType,
|
||||
EvaluationRule,
|
||||
ExecutionStatus,
|
||||
Judgment,
|
||||
JudgmentAction,
|
||||
Plan,
|
||||
PlanExecutionResult,
|
||||
PlanStep,
|
||||
StepStatus,
|
||||
)
|
||||
|
||||
|
||||
class TestPlanDataStructures:
|
||||
"""Tests for Plan and PlanStep."""
|
||||
|
||||
def test_plan_step_creation(self):
|
||||
"""Test creating a PlanStep."""
|
||||
action = ActionSpec(
|
||||
action_type=ActionType.LLM_CALL,
|
||||
prompt="Hello, world!",
|
||||
)
|
||||
step = PlanStep(
|
||||
id="step_1",
|
||||
description="Say hello",
|
||||
action=action,
|
||||
expected_outputs=["greeting"],
|
||||
)
|
||||
|
||||
assert step.id == "step_1"
|
||||
assert step.status == StepStatus.PENDING
|
||||
assert step.action.action_type == ActionType.LLM_CALL
|
||||
|
||||
def test_plan_step_is_ready(self):
|
||||
"""Test PlanStep.is_ready() with dependencies."""
|
||||
step1 = PlanStep(
|
||||
id="step_1",
|
||||
description="First step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=[],
|
||||
)
|
||||
step2 = PlanStep(
|
||||
id="step_2",
|
||||
description="Second step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=["step_1"],
|
||||
)
|
||||
|
||||
# Step 1 is ready (no deps)
|
||||
assert step1.is_ready(set()) is True
|
||||
|
||||
# Step 2 is not ready (dep not met)
|
||||
assert step2.is_ready(set()) is False
|
||||
|
||||
# Step 2 is ready after step 1 completes
|
||||
assert step2.is_ready({"step_1"}) is True
|
||||
|
||||
def test_plan_get_ready_steps(self):
|
||||
"""Test Plan.get_ready_steps()."""
|
||||
plan = Plan(
|
||||
id="test_plan",
|
||||
goal_id="goal_1",
|
||||
description="Test plan",
|
||||
steps=[
|
||||
PlanStep(
|
||||
id="step_1",
|
||||
description="First",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=[],
|
||||
),
|
||||
PlanStep(
|
||||
id="step_2",
|
||||
description="Second",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=["step_1"],
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
ready = plan.get_ready_steps()
|
||||
assert len(ready) == 1
|
||||
assert ready[0].id == "step_1"
|
||||
|
||||
def test_plan_is_complete(self):
|
||||
"""Test Plan.is_complete()."""
|
||||
plan = Plan(
|
||||
id="test_plan",
|
||||
goal_id="goal_1",
|
||||
description="Test plan",
|
||||
steps=[
|
||||
PlanStep(
|
||||
id="step_1",
|
||||
description="First",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
status=StepStatus.COMPLETED,
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
assert plan.is_complete() is True
|
||||
|
||||
def test_plan_to_feedback_context(self):
|
||||
"""Test Plan.to_feedback_context()."""
|
||||
plan = Plan(
|
||||
id="test_plan",
|
||||
goal_id="goal_1",
|
||||
description="Test plan",
|
||||
steps=[
|
||||
PlanStep(
|
||||
id="step_1",
|
||||
description="Completed step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
status=StepStatus.COMPLETED,
|
||||
result={"data": "value"},
|
||||
),
|
||||
PlanStep(
|
||||
id="step_2",
|
||||
description="Failed step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
status=StepStatus.FAILED,
|
||||
error="Something went wrong",
|
||||
attempts=3,
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
context = plan.to_feedback_context()
|
||||
assert context["plan_id"] == "test_plan"
|
||||
assert len(context["completed_steps"]) == 1
|
||||
assert len(context["failed_steps"]) == 1
|
||||
assert context["failed_steps"][0]["error"] == "Something went wrong"
|
||||
|
||||
|
||||
class TestCodeSandbox:
|
||||
"""Tests for code sandbox security."""
|
||||
|
||||
def test_simple_execution(self):
|
||||
"""Test simple code execution."""
|
||||
result = safe_exec("x = 1 + 2\nresult = x * 3")
|
||||
assert result.success is True
|
||||
assert result.variables.get("x") == 3
|
||||
assert result.result == 9
|
||||
|
||||
def test_input_injection(self):
|
||||
"""Test passing inputs to sandbox."""
|
||||
result = safe_exec(
|
||||
"result = x + y",
|
||||
inputs={"x": 10, "y": 20},
|
||||
)
|
||||
assert result.success is True
|
||||
assert result.result == 30
|
||||
|
||||
def test_blocked_import(self):
|
||||
"""Test that dangerous imports are blocked."""
|
||||
result = safe_exec("import os")
|
||||
assert result.success is False
|
||||
assert "blocked" in result.error.lower() or "import" in result.error.lower()
|
||||
|
||||
def test_blocked_private_access(self):
|
||||
"""Test that private attribute access is blocked."""
|
||||
result = safe_exec("x = [].__class__.__bases__")
|
||||
assert result.success is False
|
||||
|
||||
def test_blocked_exec_eval(self):
|
||||
"""Test that exec/eval are blocked."""
|
||||
result = safe_exec("exec('print(1)')")
|
||||
assert result.success is False
|
||||
|
||||
def test_safe_eval_expression(self):
|
||||
"""Test safe_eval for expressions."""
|
||||
result = safe_eval("x + y", inputs={"x": 5, "y": 3})
|
||||
assert result.success is True
|
||||
assert result.result == 8
|
||||
|
||||
def test_allowed_modules(self):
|
||||
"""Test that allowed modules work."""
|
||||
sandbox = CodeSandbox()
|
||||
# math is in ALLOWED_MODULES
|
||||
result = sandbox.execute(
|
||||
"""
|
||||
import math
|
||||
result = math.sqrt(16)
|
||||
""",
|
||||
inputs={},
|
||||
)
|
||||
# Note: imports are blocked by default in validation
|
||||
# This test documents current behavior
|
||||
assert result.success is False # imports blocked by validator
|
||||
|
||||
|
||||
class TestHybridJudge:
|
||||
"""Tests for the HybridJudge."""
|
||||
|
||||
def test_rule_based_accept(self):
|
||||
"""Test rule-based accept judgment."""
|
||||
judge = HybridJudge()
|
||||
judge.add_rule(
|
||||
EvaluationRule(
|
||||
id="success_check",
|
||||
description="Accept on success flag",
|
||||
condition="result.get('success') == True",
|
||||
action=JudgmentAction.ACCEPT,
|
||||
)
|
||||
)
|
||||
|
||||
step = PlanStep(
|
||||
id="test_step",
|
||||
description="Test",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
)
|
||||
goal = Goal(
|
||||
id="goal_1",
|
||||
name="Test Goal",
|
||||
description="A test goal",
|
||||
success_criteria=[
|
||||
SuccessCriterion(
|
||||
id="sc1", description="Complete task", metric="completion", target="100%"
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
# Use sync version for testing
|
||||
judgment = asyncio.run(judge.evaluate(step, {"success": True}, goal))
|
||||
|
||||
assert judgment.action == JudgmentAction.ACCEPT
|
||||
assert judgment.rule_matched == "success_check"
|
||||
|
||||
def test_rule_based_retry(self):
|
||||
"""Test rule-based retry judgment."""
|
||||
judge = HybridJudge()
|
||||
judge.add_rule(
|
||||
EvaluationRule(
|
||||
id="timeout_retry",
|
||||
description="Retry on timeout",
|
||||
condition="result.get('error_type') == 'timeout'",
|
||||
action=JudgmentAction.RETRY,
|
||||
feedback_template="Timeout occurred, please retry",
|
||||
)
|
||||
)
|
||||
|
||||
step = PlanStep(
|
||||
id="test_step",
|
||||
description="Test",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
)
|
||||
goal = Goal(
|
||||
id="goal_1",
|
||||
name="Test Goal",
|
||||
description="A test goal",
|
||||
success_criteria=[
|
||||
SuccessCriterion(
|
||||
id="sc1", description="Complete task", metric="completion", target="100%"
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
judgment = asyncio.run(judge.evaluate(step, {"error_type": "timeout"}, goal))
|
||||
|
||||
assert judgment.action == JudgmentAction.RETRY
|
||||
|
||||
def test_rule_priority(self):
|
||||
"""Test that higher priority rules are checked first."""
|
||||
judge = HybridJudge()
|
||||
|
||||
# Lower priority - would match
|
||||
judge.add_rule(
|
||||
EvaluationRule(
|
||||
id="low_priority",
|
||||
description="Low priority accept",
|
||||
condition="True",
|
||||
action=JudgmentAction.ACCEPT,
|
||||
priority=1,
|
||||
)
|
||||
)
|
||||
|
||||
# Higher priority - should match first
|
||||
judge.add_rule(
|
||||
EvaluationRule(
|
||||
id="high_priority",
|
||||
description="High priority escalate",
|
||||
condition="True",
|
||||
action=JudgmentAction.ESCALATE,
|
||||
priority=100,
|
||||
)
|
||||
)
|
||||
|
||||
step = PlanStep(
|
||||
id="test_step",
|
||||
description="Test",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
)
|
||||
goal = Goal(
|
||||
id="goal_1",
|
||||
name="Test Goal",
|
||||
description="A test goal",
|
||||
success_criteria=[
|
||||
SuccessCriterion(
|
||||
id="sc1", description="Complete task", metric="completion", target="100%"
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
judgment = asyncio.run(judge.evaluate(step, {}, goal))
|
||||
|
||||
assert judgment.rule_matched == "high_priority"
|
||||
assert judgment.action == JudgmentAction.ESCALATE
|
||||
|
||||
def test_default_judge_rules(self):
|
||||
"""Test that create_default_judge includes useful rules."""
|
||||
judge = create_default_judge()
|
||||
|
||||
# Should have rules for common cases
|
||||
rule_ids = {r.id for r in judge.rules}
|
||||
assert "explicit_success" in rule_ids
|
||||
assert "transient_error_retry" in rule_ids
|
||||
assert "security_escalate" in rule_ids
|
||||
|
||||
|
||||
class TestJudgment:
|
||||
"""Tests for Judgment data structure."""
|
||||
|
||||
def test_judgment_creation(self):
|
||||
"""Test creating a Judgment."""
|
||||
judgment = Judgment(
|
||||
action=JudgmentAction.ACCEPT,
|
||||
reasoning="Step completed successfully",
|
||||
confidence=0.95,
|
||||
)
|
||||
|
||||
assert judgment.action == JudgmentAction.ACCEPT
|
||||
assert judgment.confidence == 0.95
|
||||
assert judgment.llm_used is False
|
||||
|
||||
def test_judgment_with_feedback(self):
|
||||
"""Test Judgment with feedback for retry/replan."""
|
||||
judgment = Judgment(
|
||||
action=JudgmentAction.REPLAN,
|
||||
reasoning="Missing required data",
|
||||
feedback="Need to fetch user data first",
|
||||
context={"missing": ["user_id", "email"]},
|
||||
)
|
||||
|
||||
assert judgment.action == JudgmentAction.REPLAN
|
||||
assert judgment.feedback is not None
|
||||
assert "user_id" in judgment.context["missing"]
|
||||
|
||||
|
||||
class TestPlanExecutionResult:
|
||||
"""Tests for PlanExecutionResult."""
|
||||
|
||||
def test_completed_result(self):
|
||||
"""Test completed execution result."""
|
||||
result = PlanExecutionResult(
|
||||
status=ExecutionStatus.COMPLETED,
|
||||
results={"output": "success"},
|
||||
steps_executed=5,
|
||||
total_tokens=1000,
|
||||
)
|
||||
|
||||
assert result.status == ExecutionStatus.COMPLETED
|
||||
assert result.steps_executed == 5
|
||||
|
||||
def test_needs_replan_result(self):
|
||||
"""Test needs_replan execution result."""
|
||||
result = PlanExecutionResult(
|
||||
status=ExecutionStatus.NEEDS_REPLAN,
|
||||
feedback="Step 3 failed: missing data",
|
||||
feedback_context={
|
||||
"completed_steps": ["step_1", "step_2"],
|
||||
"failed_step": "step_3",
|
||||
},
|
||||
completed_steps=["step_1", "step_2"],
|
||||
)
|
||||
|
||||
assert result.status == ExecutionStatus.NEEDS_REPLAN
|
||||
assert result.feedback is not None
|
||||
assert len(result.completed_steps) == 2
|
||||
|
||||
|
||||
# Integration tests would require mocking Runtime and LLM
|
||||
class TestFlexibleExecutorIntegration:
|
||||
"""Integration tests for FlexibleGraphExecutor."""
|
||||
|
||||
def test_executor_creation(self, tmp_path):
|
||||
"""Test creating a FlexibleGraphExecutor."""
|
||||
from framework.graph.flexible_executor import FlexibleGraphExecutor
|
||||
from framework.runtime.core import Runtime
|
||||
|
||||
runtime = Runtime(storage_path=tmp_path / "runtime")
|
||||
executor = FlexibleGraphExecutor(runtime=runtime)
|
||||
|
||||
assert executor.runtime == runtime
|
||||
assert executor.judge is not None
|
||||
assert executor.worker is not None
|
||||
|
||||
def test_executor_with_custom_judge(self, tmp_path):
|
||||
"""Test executor with custom judge."""
|
||||
from framework.graph.flexible_executor import FlexibleGraphExecutor
|
||||
from framework.runtime.core import Runtime
|
||||
|
||||
runtime = Runtime(storage_path=tmp_path / "runtime")
|
||||
custom_judge = HybridJudge()
|
||||
custom_judge.add_rule(
|
||||
EvaluationRule(
|
||||
id="custom_rule",
|
||||
description="Custom rule",
|
||||
condition="True",
|
||||
action=JudgmentAction.ACCEPT,
|
||||
)
|
||||
)
|
||||
|
||||
executor = FlexibleGraphExecutor(runtime=runtime, judge=custom_judge)
|
||||
|
||||
assert len(executor.judge.rules) == 1
|
||||
assert executor.judge.rules[0].id == "custom_rule"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
@@ -5,7 +5,7 @@ Focused on minimal success and failure scenarios.
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
|
||||
from framework.graph.edge import GraphSpec
|
||||
from framework.graph.executor import GraphExecutor
|
||||
from framework.graph.goal import Goal
|
||||
from framework.graph.node import NodeResult, NodeSpec
|
||||
@@ -49,7 +49,7 @@ async def test_executor_single_node_success():
|
||||
id="n1",
|
||||
name="node1",
|
||||
description="test node",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
input_keys=[],
|
||||
output_keys=["result"],
|
||||
max_retries=0,
|
||||
@@ -104,7 +104,7 @@ async def test_executor_single_node_failure():
|
||||
id="n1",
|
||||
name="node1",
|
||||
description="failing node",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
input_keys=[],
|
||||
output_keys=["result"],
|
||||
max_retries=0,
|
||||
@@ -157,79 +157,6 @@ class FakeEventBus:
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_executor_emits_node_events():
|
||||
"""Executor should emit NODE_LOOP_STARTED/COMPLETED for each non-event_loop node."""
|
||||
runtime = DummyRuntime()
|
||||
event_bus = FakeEventBus()
|
||||
|
||||
graph = GraphSpec(
|
||||
id="graph-ev",
|
||||
goal_id="g-ev",
|
||||
nodes=[
|
||||
NodeSpec(
|
||||
id="n1",
|
||||
name="first",
|
||||
description="first node",
|
||||
node_type="llm_generate",
|
||||
input_keys=[],
|
||||
output_keys=["result"],
|
||||
max_retries=0,
|
||||
),
|
||||
NodeSpec(
|
||||
id="n2",
|
||||
name="second",
|
||||
description="second node",
|
||||
node_type="llm_generate",
|
||||
input_keys=["result"],
|
||||
output_keys=["result"],
|
||||
max_retries=0,
|
||||
),
|
||||
],
|
||||
edges=[
|
||||
EdgeSpec(
|
||||
id="e1",
|
||||
source="n1",
|
||||
target="n2",
|
||||
condition=EdgeCondition.ON_SUCCESS,
|
||||
),
|
||||
],
|
||||
entry_node="n1",
|
||||
terminal_nodes=["n2"],
|
||||
)
|
||||
|
||||
executor = GraphExecutor(
|
||||
runtime=runtime,
|
||||
node_registry={
|
||||
"n1": SuccessNode(),
|
||||
"n2": SuccessNode(),
|
||||
},
|
||||
event_bus=event_bus,
|
||||
stream_id="test-stream",
|
||||
)
|
||||
|
||||
goal = Goal(id="g-ev", name="event-test", description="test events")
|
||||
result = await executor.execute(graph=graph, goal=goal)
|
||||
|
||||
assert result.success is True
|
||||
assert result.path == ["n1", "n2"]
|
||||
|
||||
# Should have 5 events: started/completed for n1, edge_traversed, then started/completed for n2
|
||||
assert len(event_bus.events) == 5
|
||||
assert event_bus.events[0] == ("started", {"stream_id": "test-stream", "node_id": "n1"})
|
||||
assert event_bus.events[1] == (
|
||||
"completed",
|
||||
{"stream_id": "test-stream", "node_id": "n1", "iterations": 1},
|
||||
)
|
||||
assert event_bus.events[2] == (
|
||||
"edge_traversed",
|
||||
{"stream_id": "test-stream", "source_node": "n1", "target_node": "n2"},
|
||||
)
|
||||
assert event_bus.events[3] == ("started", {"stream_id": "test-stream", "node_id": "n2"})
|
||||
assert event_bus.events[4] == (
|
||||
"completed",
|
||||
{"stream_id": "test-stream", "node_id": "n2", "iterations": 1},
|
||||
)
|
||||
|
||||
|
||||
# ---- Fake event_loop node (registered, so executor won't emit for it) ----
|
||||
class FakeEventLoopNode:
|
||||
@@ -292,7 +219,7 @@ async def test_executor_no_events_without_event_bus():
|
||||
id="n1",
|
||||
name="node1",
|
||||
description="test node",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
input_keys=[],
|
||||
output_keys=["result"],
|
||||
max_retries=0,
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
"""Tests for LLMNode JSON extraction logic.
|
||||
|
||||
Run with:
|
||||
cd core
|
||||
pytest tests/test_node_json_extraction.py -v
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.graph.node import LLMNode
|
||||
|
||||
|
||||
class TestJsonExtraction:
|
||||
"""Test _extract_json JSON extraction without LLM calls."""
|
||||
|
||||
@pytest.fixture
|
||||
def node(self):
|
||||
"""Create an LLMNode instance for testing."""
|
||||
return LLMNode()
|
||||
|
||||
def test_clean_json(self, node):
|
||||
"""Test parsing clean JSON directly."""
|
||||
result = node._extract_json('{"key": "value"}', ["key"])
|
||||
assert result == {"key": "value"}
|
||||
|
||||
def test_json_with_whitespace(self, node):
|
||||
"""Test parsing JSON with surrounding whitespace."""
|
||||
result = node._extract_json(' {"key": "value"} \n', ["key"])
|
||||
assert result == {"key": "value"}
|
||||
|
||||
def test_markdown_code_block_at_start(self, node):
|
||||
"""Test extracting JSON from markdown code block at start."""
|
||||
input_text = '```json\n{"key": "value"}\n```'
|
||||
result = node._extract_json(input_text, ["key"])
|
||||
assert result == {"key": "value"}
|
||||
|
||||
def test_markdown_code_block_without_json_label(self, node):
|
||||
"""Test extracting JSON from markdown code block without 'json' label."""
|
||||
input_text = '```\n{"key": "value"}\n```'
|
||||
result = node._extract_json(input_text, ["key"])
|
||||
assert result == {"key": "value"}
|
||||
|
||||
def test_prose_around_markdown_block(self, node):
|
||||
"""Test extracting JSON when prose surrounds the markdown block."""
|
||||
input_text = 'Here is the result:\n```json\n{"key": "value"}\n```\nHope this helps!'
|
||||
result = node._extract_json(input_text, ["key"])
|
||||
assert result == {"key": "value"}
|
||||
|
||||
def test_json_embedded_in_prose(self, node):
|
||||
"""Test extracting JSON embedded in prose text."""
|
||||
input_text = 'The answer is {"key": "value"} as requested.'
|
||||
result = node._extract_json(input_text, ["key"])
|
||||
assert result == {"key": "value"}
|
||||
|
||||
def test_nested_json(self, node):
|
||||
"""Test parsing nested JSON objects."""
|
||||
input_text = '{"outer": {"inner": "value"}}'
|
||||
result = node._extract_json(input_text, ["outer"])
|
||||
assert result == {"outer": {"inner": "value"}}
|
||||
|
||||
def test_deeply_nested_json(self, node):
|
||||
"""Test parsing deeply nested JSON objects."""
|
||||
input_text = '{"a": {"b": {"c": {"d": "deep"}}}}'
|
||||
result = node._extract_json(input_text, ["a"])
|
||||
assert result == {"a": {"b": {"c": {"d": "deep"}}}}
|
||||
|
||||
def test_json_with_array(self, node):
|
||||
"""Test parsing JSON with array values."""
|
||||
input_text = '{"items": [1, 2, 3]}'
|
||||
result = node._extract_json(input_text, ["items"])
|
||||
assert result == {"items": [1, 2, 3]}
|
||||
|
||||
def test_json_with_string_containing_braces(self, node):
|
||||
"""Test parsing JSON where string values contain braces."""
|
||||
input_text = '{"code": "function() { return 1; }"}'
|
||||
result = node._extract_json(input_text, ["code"])
|
||||
assert result == {"code": "function() { return 1; }"}
|
||||
|
||||
def test_json_with_escaped_quotes(self, node):
|
||||
"""Test parsing JSON with escaped quotes in strings."""
|
||||
input_text = '{"message": "He said \\"hello\\""}'
|
||||
result = node._extract_json(input_text, ["message"])
|
||||
assert result == {"message": 'He said "hello"'}
|
||||
|
||||
def test_multiple_json_objects_takes_first(self, node):
|
||||
"""Test that when multiple JSON objects exist, first is taken."""
|
||||
input_text = '{"first": 1} and then {"second": 2}'
|
||||
result = node._extract_json(input_text, ["first"])
|
||||
assert result == {"first": 1}
|
||||
|
||||
def test_json_with_boolean_and_null(self, node):
|
||||
"""Test parsing JSON with boolean and null values."""
|
||||
input_text = '{"active": true, "deleted": false, "data": null}'
|
||||
result = node._extract_json(input_text, ["active", "deleted", "data"])
|
||||
assert result == {"active": True, "deleted": False, "data": None}
|
||||
|
||||
def test_json_with_numbers(self, node):
|
||||
"""Test parsing JSON with integer and float values."""
|
||||
input_text = '{"count": 42, "price": 19.99}'
|
||||
result = node._extract_json(input_text, ["count", "price"])
|
||||
assert result == {"count": 42, "price": 19.99}
|
||||
|
||||
def test_invalid_json_raises_error(self, node, monkeypatch):
|
||||
"""Test that completely invalid JSON raises an error when no LLM fallback available."""
|
||||
# Remove API keys so LLM fallback is not attempted
|
||||
monkeypatch.delenv("CEREBRAS_API_KEY", raising=False)
|
||||
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
|
||||
with pytest.raises(ValueError, match="Cannot parse JSON"):
|
||||
node._extract_json("This is not JSON at all", ["key"])
|
||||
|
||||
def test_empty_string_raises_error(self, node, monkeypatch):
|
||||
"""Test that empty string raises an error when no LLM fallback available."""
|
||||
# Remove API keys so LLM fallback is not attempted
|
||||
monkeypatch.delenv("CEREBRAS_API_KEY", raising=False)
|
||||
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
|
||||
with pytest.raises(ValueError, match="Cannot parse JSON"):
|
||||
node._extract_json("", ["key"])
|
||||
@@ -95,7 +95,7 @@ async def test_on_failure_edge_followed_after_max_retries(runtime, goal):
|
||||
id="failing",
|
||||
name="Failing Node",
|
||||
description="Always fails",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=[],
|
||||
max_retries=1,
|
||||
),
|
||||
@@ -103,7 +103,7 @@ async def test_on_failure_edge_followed_after_max_retries(runtime, goal):
|
||||
id="handler",
|
||||
name="Failure Handler",
|
||||
description="Handles failures",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["handled", "recovery"],
|
||||
),
|
||||
]
|
||||
@@ -156,7 +156,7 @@ async def test_no_on_failure_edge_still_terminates(runtime, goal):
|
||||
id="failing",
|
||||
name="Failing Node",
|
||||
description="Always fails",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=[],
|
||||
max_retries=1,
|
||||
),
|
||||
@@ -193,21 +193,21 @@ async def test_on_failure_edge_not_followed_on_success(runtime, goal):
|
||||
id="working",
|
||||
name="Working Node",
|
||||
description="Always succeeds",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["result"],
|
||||
),
|
||||
NodeSpec(
|
||||
id="handler",
|
||||
name="Failure Handler",
|
||||
description="Should not be reached",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["handled"],
|
||||
),
|
||||
NodeSpec(
|
||||
id="next",
|
||||
name="Next Node",
|
||||
description="Normal successor",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["done"],
|
||||
),
|
||||
]
|
||||
@@ -261,7 +261,7 @@ async def test_on_failure_edge_with_zero_retries(runtime, goal):
|
||||
id="fragile",
|
||||
name="Fragile Node",
|
||||
description="Fails with no retries",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=[],
|
||||
max_retries=0,
|
||||
),
|
||||
@@ -269,7 +269,7 @@ async def test_on_failure_edge_with_zero_retries(runtime, goal):
|
||||
id="handler",
|
||||
name="Failure Handler",
|
||||
description="Handles failures",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["handled", "recovery"],
|
||||
),
|
||||
]
|
||||
@@ -317,7 +317,7 @@ async def test_on_failure_handler_appears_in_path(runtime, goal):
|
||||
id="failing",
|
||||
name="Failing Node",
|
||||
description="Always fails",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=[],
|
||||
max_retries=1,
|
||||
),
|
||||
@@ -325,7 +325,7 @@ async def test_on_failure_handler_appears_in_path(runtime, goal):
|
||||
id="handler",
|
||||
name="Failure Handler",
|
||||
description="Handles failures",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
output_keys=["handled", "recovery"],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -1,592 +0,0 @@
|
||||
"""Tests for plan.py - Plan enums and Pydantic models."""
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.graph.plan import (
|
||||
ActionSpec,
|
||||
ActionType,
|
||||
ApprovalDecision,
|
||||
ExecutionStatus,
|
||||
JudgmentAction,
|
||||
Plan,
|
||||
PlanStep,
|
||||
StepStatus,
|
||||
)
|
||||
|
||||
|
||||
class TestActionTypeEnum:
|
||||
"""Tests for ActionType enum values."""
|
||||
|
||||
def test_action_type_values_exist(self):
|
||||
"""All 5 ActionType values exist."""
|
||||
assert ActionType.LLM_CALL.value == "llm_call"
|
||||
assert ActionType.TOOL_USE.value == "tool_use"
|
||||
assert ActionType.SUB_GRAPH.value == "sub_graph"
|
||||
assert ActionType.FUNCTION.value == "function"
|
||||
assert ActionType.CODE_EXECUTION.value == "code_execution"
|
||||
|
||||
def test_action_type_count(self):
|
||||
"""ActionType has exactly 5 members."""
|
||||
assert len(ActionType) == 5
|
||||
|
||||
def test_action_type_string_enum(self):
|
||||
"""ActionType is a string enum."""
|
||||
assert isinstance(ActionType.LLM_CALL, str)
|
||||
assert ActionType.LLM_CALL == "llm_call"
|
||||
|
||||
|
||||
class TestStepStatusEnum:
|
||||
"""Tests for StepStatus enum values."""
|
||||
|
||||
def test_step_status_values_exist(self):
|
||||
"""All 7 StepStatus values exist."""
|
||||
assert StepStatus.PENDING.value == "pending"
|
||||
assert StepStatus.AWAITING_APPROVAL.value == "awaiting_approval"
|
||||
assert StepStatus.IN_PROGRESS.value == "in_progress"
|
||||
assert StepStatus.COMPLETED.value == "completed"
|
||||
assert StepStatus.FAILED.value == "failed"
|
||||
assert StepStatus.SKIPPED.value == "skipped"
|
||||
assert StepStatus.REJECTED.value == "rejected"
|
||||
|
||||
def test_step_status_count(self):
|
||||
"""StepStatus has exactly 7 members."""
|
||||
assert len(StepStatus) == 7
|
||||
|
||||
def test_step_status_transition_pending_to_in_progress(self):
|
||||
"""Status can change from PENDING to IN_PROGRESS."""
|
||||
step = PlanStep(
|
||||
id="step_1",
|
||||
description="Test step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
status=StepStatus.PENDING,
|
||||
)
|
||||
step.status = StepStatus.IN_PROGRESS
|
||||
assert step.status == StepStatus.IN_PROGRESS
|
||||
|
||||
def test_step_status_transition_in_progress_to_completed(self):
|
||||
"""Status can change from IN_PROGRESS to COMPLETED."""
|
||||
step = PlanStep(
|
||||
id="step_1",
|
||||
description="Test step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
status=StepStatus.IN_PROGRESS,
|
||||
)
|
||||
step.status = StepStatus.COMPLETED
|
||||
assert step.status == StepStatus.COMPLETED
|
||||
|
||||
def test_step_status_transition_in_progress_to_failed(self):
|
||||
"""Status can change from IN_PROGRESS to FAILED."""
|
||||
step = PlanStep(
|
||||
id="step_1",
|
||||
description="Test step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
status=StepStatus.IN_PROGRESS,
|
||||
)
|
||||
step.status = StepStatus.FAILED
|
||||
assert step.status == StepStatus.FAILED
|
||||
|
||||
|
||||
class TestApprovalDecisionEnum:
|
||||
"""Tests for ApprovalDecision enum values."""
|
||||
|
||||
def test_approval_decision_values_exist(self):
|
||||
"""All 4 ApprovalDecision values exist."""
|
||||
assert ApprovalDecision.APPROVE.value == "approve"
|
||||
assert ApprovalDecision.REJECT.value == "reject"
|
||||
assert ApprovalDecision.MODIFY.value == "modify"
|
||||
assert ApprovalDecision.ABORT.value == "abort"
|
||||
|
||||
def test_approval_decision_count(self):
|
||||
"""ApprovalDecision has exactly 4 members."""
|
||||
assert len(ApprovalDecision) == 4
|
||||
|
||||
|
||||
class TestJudgmentActionEnum:
|
||||
"""Tests for JudgmentAction enum values."""
|
||||
|
||||
def test_judgment_action_values_exist(self):
|
||||
"""All 4 JudgmentAction values exist."""
|
||||
assert JudgmentAction.ACCEPT.value == "accept"
|
||||
assert JudgmentAction.RETRY.value == "retry"
|
||||
assert JudgmentAction.REPLAN.value == "replan"
|
||||
assert JudgmentAction.ESCALATE.value == "escalate"
|
||||
|
||||
def test_judgment_action_count(self):
|
||||
"""JudgmentAction has exactly 4 members."""
|
||||
assert len(JudgmentAction) == 4
|
||||
|
||||
|
||||
class TestExecutionStatusEnum:
|
||||
"""Tests for ExecutionStatus enum values."""
|
||||
|
||||
def test_execution_status_values_exist(self):
|
||||
"""All 7 ExecutionStatus values exist."""
|
||||
assert ExecutionStatus.COMPLETED.value == "completed"
|
||||
assert ExecutionStatus.AWAITING_APPROVAL.value == "awaiting_approval"
|
||||
assert ExecutionStatus.NEEDS_REPLAN.value == "needs_replan"
|
||||
assert ExecutionStatus.NEEDS_ESCALATION.value == "needs_escalation"
|
||||
assert ExecutionStatus.REJECTED.value == "rejected"
|
||||
assert ExecutionStatus.ABORTED.value == "aborted"
|
||||
assert ExecutionStatus.FAILED.value == "failed"
|
||||
|
||||
def test_execution_status_count(self):
|
||||
"""ExecutionStatus has exactly 7 members."""
|
||||
assert len(ExecutionStatus) == 7
|
||||
|
||||
|
||||
class TestPlanStepIsReady:
|
||||
"""Tests for PlanStep.is_ready() method."""
|
||||
|
||||
def test_plan_step_is_ready_no_deps(self):
|
||||
"""Step with no dependencies is ready when PENDING."""
|
||||
step = PlanStep(
|
||||
id="step_1",
|
||||
description="Test step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=[],
|
||||
status=StepStatus.PENDING,
|
||||
)
|
||||
assert step.is_ready(set()) is True
|
||||
|
||||
def test_plan_step_is_ready_deps_met(self):
|
||||
"""Step is ready when all dependencies are completed."""
|
||||
step = PlanStep(
|
||||
id="step_2",
|
||||
description="Second step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=["step_1"],
|
||||
status=StepStatus.PENDING,
|
||||
)
|
||||
assert step.is_ready({"step_1"}) is True
|
||||
|
||||
def test_plan_step_not_ready_deps_missing(self):
|
||||
"""Step is not ready when dependencies are incomplete."""
|
||||
step = PlanStep(
|
||||
id="step_2",
|
||||
description="Second step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=["step_1", "step_3"],
|
||||
status=StepStatus.PENDING,
|
||||
)
|
||||
# Only step_1 completed, step_3 still pending
|
||||
assert step.is_ready({"step_1"}) is False
|
||||
|
||||
def test_plan_step_not_ready_wrong_status(self):
|
||||
"""Step is not ready if status is not PENDING."""
|
||||
step = PlanStep(
|
||||
id="step_1",
|
||||
description="Test step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=[],
|
||||
status=StepStatus.IN_PROGRESS,
|
||||
)
|
||||
assert step.is_ready(set()) is False
|
||||
|
||||
def test_plan_step_not_ready_completed_status(self):
|
||||
"""Completed step is not ready to execute again."""
|
||||
step = PlanStep(
|
||||
id="step_1",
|
||||
description="Test step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=[],
|
||||
status=StepStatus.COMPLETED,
|
||||
)
|
||||
assert step.is_ready(set()) is False
|
||||
|
||||
def test_plan_step_is_ready_multiple_deps_all_met(self):
|
||||
"""Step with multiple dependencies is ready when all are met."""
|
||||
step = PlanStep(
|
||||
id="step_4",
|
||||
description="Fourth step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=["step_1", "step_2", "step_3"],
|
||||
status=StepStatus.PENDING,
|
||||
)
|
||||
assert step.is_ready({"step_1", "step_2", "step_3"}) is True
|
||||
|
||||
|
||||
class TestPlanFromJson:
|
||||
"""Tests for Plan.from_json() method."""
|
||||
|
||||
def test_plan_from_json_string(self):
|
||||
"""Parse Plan from JSON string."""
|
||||
json_str = json.dumps(
|
||||
{
|
||||
"id": "plan_1",
|
||||
"goal_id": "goal_1",
|
||||
"description": "Test plan",
|
||||
"steps": [
|
||||
{
|
||||
"id": "step_1",
|
||||
"description": "First step",
|
||||
"action": {
|
||||
"action_type": "function",
|
||||
"function_name": "do_something",
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
plan = Plan.from_json(json_str)
|
||||
|
||||
assert plan.id == "plan_1"
|
||||
assert plan.goal_id == "goal_1"
|
||||
assert len(plan.steps) == 1
|
||||
assert plan.steps[0].id == "step_1"
|
||||
|
||||
def test_plan_from_json_dict(self):
|
||||
"""Parse Plan from dict directly."""
|
||||
data = {
|
||||
"id": "plan_1",
|
||||
"goal_id": "goal_1",
|
||||
"description": "Test plan",
|
||||
"steps": [
|
||||
{
|
||||
"id": "step_1",
|
||||
"description": "First step",
|
||||
"action": {
|
||||
"action_type": "function",
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
plan = Plan.from_json(data)
|
||||
|
||||
assert plan.id == "plan_1"
|
||||
assert plan.goal_id == "goal_1"
|
||||
|
||||
def test_plan_from_json_nested_plan_key(self):
|
||||
"""Handle {"plan": {...}} wrapper from export_graph()."""
|
||||
data = {
|
||||
"plan": {
|
||||
"id": "plan_1",
|
||||
"goal_id": "goal_1",
|
||||
"description": "Test plan",
|
||||
"steps": [],
|
||||
}
|
||||
}
|
||||
|
||||
plan = Plan.from_json(data)
|
||||
|
||||
assert plan.id == "plan_1"
|
||||
|
||||
def test_plan_from_json_action_type_conversion(self):
|
||||
"""String action_type is converted to ActionType enum."""
|
||||
data = {
|
||||
"id": "plan_1",
|
||||
"goal_id": "goal_1",
|
||||
"description": "Test plan",
|
||||
"steps": [
|
||||
{
|
||||
"id": "step_1",
|
||||
"description": "LLM step",
|
||||
"action": {
|
||||
"action_type": "llm_call",
|
||||
"prompt": "Hello",
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
plan = Plan.from_json(data)
|
||||
|
||||
assert plan.steps[0].action.action_type == ActionType.LLM_CALL
|
||||
|
||||
def test_plan_from_json_all_action_types(self):
|
||||
"""All action types are correctly converted."""
|
||||
action_types = ["llm_call", "tool_use", "sub_graph", "function", "code_execution"]
|
||||
|
||||
for action_type in action_types:
|
||||
data = {
|
||||
"id": "plan",
|
||||
"goal_id": "goal",
|
||||
"description": "Test",
|
||||
"steps": [
|
||||
{
|
||||
"id": "step",
|
||||
"description": "Step",
|
||||
"action": {"action_type": action_type},
|
||||
}
|
||||
],
|
||||
}
|
||||
plan = Plan.from_json(data)
|
||||
assert plan.steps[0].action.action_type.value == action_type
|
||||
|
||||
def test_from_json_invalid_action_type(self):
|
||||
"""Unknown action_type raises ValueError."""
|
||||
data = {
|
||||
"id": "plan_1",
|
||||
"goal_id": "goal_1",
|
||||
"description": "Test plan",
|
||||
"steps": [
|
||||
{
|
||||
"id": "step_1",
|
||||
"description": "Invalid step",
|
||||
"action": {
|
||||
"action_type": "invalid_type",
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
Plan.from_json(data)
|
||||
|
||||
def test_from_json_malformed_json_string(self):
|
||||
"""Invalid JSON syntax raises parse error."""
|
||||
invalid_json = "{ invalid json }"
|
||||
|
||||
with pytest.raises(json.JSONDecodeError):
|
||||
Plan.from_json(invalid_json)
|
||||
|
||||
def test_from_json_missing_step_id(self):
|
||||
"""Step without 'id' raises validation error."""
|
||||
data = {
|
||||
"id": "plan_1",
|
||||
"goal_id": "goal_1",
|
||||
"description": "Test plan",
|
||||
"steps": [
|
||||
{
|
||||
"description": "Step without ID",
|
||||
"action": {"action_type": "function"},
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
with pytest.raises(KeyError):
|
||||
Plan.from_json(data)
|
||||
|
||||
def test_from_json_wrong_type_for_steps(self):
|
||||
"""Non-list steps value raises error."""
|
||||
data = {
|
||||
"id": "plan_1",
|
||||
"goal_id": "goal_1",
|
||||
"description": "Test plan",
|
||||
"steps": "not a list",
|
||||
}
|
||||
|
||||
with pytest.raises(AttributeError):
|
||||
Plan.from_json(data)
|
||||
|
||||
def test_from_json_empty_data(self):
|
||||
"""Empty dict creates plan with defaults."""
|
||||
plan = Plan.from_json({})
|
||||
|
||||
assert plan.id == "plan"
|
||||
assert plan.goal_id == ""
|
||||
assert plan.steps == []
|
||||
|
||||
|
||||
class TestPlanMethods:
|
||||
"""Tests for Plan instance methods."""
|
||||
|
||||
@pytest.fixture
|
||||
def sample_plan(self):
|
||||
"""Create a sample plan with multiple steps."""
|
||||
return Plan(
|
||||
id="test_plan",
|
||||
goal_id="goal_1",
|
||||
description="Test plan",
|
||||
steps=[
|
||||
PlanStep(
|
||||
id="step_1",
|
||||
description="First step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=[],
|
||||
status=StepStatus.COMPLETED,
|
||||
result={"data": "result1"},
|
||||
),
|
||||
PlanStep(
|
||||
id="step_2",
|
||||
description="Second step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=["step_1"],
|
||||
status=StepStatus.PENDING,
|
||||
),
|
||||
PlanStep(
|
||||
id="step_3",
|
||||
description="Third step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
dependencies=["step_1"],
|
||||
status=StepStatus.FAILED,
|
||||
error="Something went wrong",
|
||||
attempts=3,
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
def test_plan_get_step(self, sample_plan):
|
||||
"""Find step by ID."""
|
||||
step = sample_plan.get_step("step_2")
|
||||
|
||||
assert step is not None
|
||||
assert step.id == "step_2"
|
||||
assert step.description == "Second step"
|
||||
|
||||
def test_plan_get_step_not_found(self, sample_plan):
|
||||
"""Returns None for missing step ID."""
|
||||
step = sample_plan.get_step("nonexistent")
|
||||
|
||||
assert step is None
|
||||
|
||||
def test_plan_get_ready_steps(self, sample_plan):
|
||||
"""Filter steps ready to execute."""
|
||||
ready = sample_plan.get_ready_steps()
|
||||
|
||||
assert len(ready) == 1
|
||||
assert ready[0].id == "step_2"
|
||||
|
||||
def test_plan_get_completed_steps(self, sample_plan):
|
||||
"""Filter completed steps."""
|
||||
completed = sample_plan.get_completed_steps()
|
||||
|
||||
assert len(completed) == 1
|
||||
assert completed[0].id == "step_1"
|
||||
|
||||
def test_plan_is_complete_false(self, sample_plan):
|
||||
"""Plan is not complete when steps are pending/failed."""
|
||||
assert sample_plan.is_complete() is False
|
||||
|
||||
def test_plan_is_complete_true(self):
|
||||
"""Plan is complete when all steps are completed."""
|
||||
plan = Plan(
|
||||
id="test_plan",
|
||||
goal_id="goal_1",
|
||||
description="Test plan",
|
||||
steps=[
|
||||
PlanStep(
|
||||
id="step_1",
|
||||
description="First step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
status=StepStatus.COMPLETED,
|
||||
),
|
||||
PlanStep(
|
||||
id="step_2",
|
||||
description="Second step",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION),
|
||||
status=StepStatus.COMPLETED,
|
||||
),
|
||||
],
|
||||
)
|
||||
assert plan.is_complete() is True
|
||||
|
||||
def test_plan_is_complete_empty(self):
|
||||
"""Empty plan is considered complete."""
|
||||
plan = Plan(
|
||||
id="empty_plan",
|
||||
goal_id="goal_1",
|
||||
description="Empty plan",
|
||||
steps=[],
|
||||
)
|
||||
assert plan.is_complete() is True
|
||||
|
||||
def test_plan_to_feedback_context(self, sample_plan):
|
||||
"""Serializes context for replanning."""
|
||||
context = sample_plan.to_feedback_context()
|
||||
|
||||
assert context["plan_id"] == "test_plan"
|
||||
assert context["revision"] == 1
|
||||
assert len(context["completed_steps"]) == 1
|
||||
assert context["completed_steps"][0]["id"] == "step_1"
|
||||
assert len(context["failed_steps"]) == 1
|
||||
assert context["failed_steps"][0]["id"] == "step_3"
|
||||
assert context["failed_steps"][0]["error"] == "Something went wrong"
|
||||
|
||||
|
||||
class TestPlanRoundTrip:
|
||||
"""Tests for Plan serialization round-trip."""
|
||||
|
||||
def test_plan_round_trip_model_dump(self):
|
||||
"""from_json(plan.model_dump()) preserves data."""
|
||||
original = Plan(
|
||||
id="plan_1",
|
||||
goal_id="goal_1",
|
||||
description="Test plan",
|
||||
steps=[
|
||||
PlanStep(
|
||||
id="step_1",
|
||||
description="First step",
|
||||
action=ActionSpec(
|
||||
action_type=ActionType.LLM_CALL,
|
||||
prompt="Hello world",
|
||||
),
|
||||
dependencies=[],
|
||||
expected_outputs=["greeting"],
|
||||
),
|
||||
],
|
||||
context={"key": "value"},
|
||||
revision=2,
|
||||
)
|
||||
|
||||
# Round-trip through dict
|
||||
data = original.model_dump()
|
||||
restored = Plan.from_json(data)
|
||||
|
||||
assert restored.id == original.id
|
||||
assert restored.goal_id == original.goal_id
|
||||
assert restored.description == original.description
|
||||
assert restored.context == original.context
|
||||
assert restored.revision == original.revision
|
||||
assert len(restored.steps) == len(original.steps)
|
||||
assert restored.steps[0].id == original.steps[0].id
|
||||
assert restored.steps[0].action.action_type == original.steps[0].action.action_type
|
||||
|
||||
def test_plan_round_trip_json_string(self):
|
||||
"""from_json(plan.model_dump_json()) preserves data."""
|
||||
original = Plan(
|
||||
id="plan_1",
|
||||
goal_id="goal_1",
|
||||
description="Test plan",
|
||||
steps=[
|
||||
PlanStep(
|
||||
id="step_1",
|
||||
description="First step",
|
||||
action=ActionSpec(
|
||||
action_type=ActionType.TOOL_USE,
|
||||
tool_name="my_tool",
|
||||
tool_args={"arg1": "value1"},
|
||||
),
|
||||
dependencies=[],
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
# Round-trip through JSON string
|
||||
json_str = original.model_dump_json()
|
||||
restored = Plan.from_json(json_str)
|
||||
|
||||
assert restored.id == original.id
|
||||
assert len(restored.steps) == 1
|
||||
assert restored.steps[0].action.tool_name == "my_tool"
|
||||
|
||||
def test_plan_step_serialization(self):
|
||||
"""PlanStep serializes and deserializes correctly."""
|
||||
step = PlanStep(
|
||||
id="step_1",
|
||||
description="Test step",
|
||||
action=ActionSpec(
|
||||
action_type=ActionType.CODE_EXECUTION,
|
||||
code="print('hello')",
|
||||
language="python",
|
||||
),
|
||||
inputs={"input1": "value1"},
|
||||
expected_outputs=["output1", "output2"],
|
||||
dependencies=["dep1", "dep2"],
|
||||
requires_approval=True,
|
||||
approval_message="Please approve",
|
||||
)
|
||||
|
||||
# Serialize and deserialize
|
||||
data = step.model_dump()
|
||||
|
||||
assert data["id"] == "step_1"
|
||||
assert data["action"]["action_type"] == "code_execution"
|
||||
assert data["action"]["code"] == "print('hello')"
|
||||
assert data["inputs"] == {"input1": "value1"}
|
||||
assert data["expected_outputs"] == ["output1", "output2"]
|
||||
assert data["dependencies"] == ["dep1", "dep2"]
|
||||
assert data["requires_approval"] is True
|
||||
@@ -1,384 +0,0 @@
|
||||
"""
|
||||
Tests for Plan dependency resolution with failed steps.
|
||||
|
||||
These tests verify that plan execution correctly handles failed dependencies
|
||||
instead of hanging indefinitely.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.graph.plan import (
|
||||
ActionSpec,
|
||||
ActionType,
|
||||
Plan,
|
||||
PlanStep,
|
||||
StepStatus,
|
||||
)
|
||||
|
||||
|
||||
class TestStepStatusTerminal:
|
||||
"""Tests for StepStatus.is_terminal() method."""
|
||||
|
||||
def test_completed_is_terminal(self):
|
||||
"""COMPLETED status should be terminal."""
|
||||
assert StepStatus.COMPLETED.is_terminal() is True
|
||||
|
||||
def test_failed_is_terminal(self):
|
||||
"""FAILED status should be terminal."""
|
||||
assert StepStatus.FAILED.is_terminal() is True
|
||||
|
||||
def test_skipped_is_terminal(self):
|
||||
"""SKIPPED status should be terminal."""
|
||||
assert StepStatus.SKIPPED.is_terminal() is True
|
||||
|
||||
def test_rejected_is_terminal(self):
|
||||
"""REJECTED status should be terminal."""
|
||||
assert StepStatus.REJECTED.is_terminal() is True
|
||||
|
||||
def test_pending_is_not_terminal(self):
|
||||
"""PENDING status should not be terminal."""
|
||||
assert StepStatus.PENDING.is_terminal() is False
|
||||
|
||||
def test_in_progress_is_not_terminal(self):
|
||||
"""IN_PROGRESS status should not be terminal."""
|
||||
assert StepStatus.IN_PROGRESS.is_terminal() is False
|
||||
|
||||
def test_awaiting_approval_is_not_terminal(self):
|
||||
"""AWAITING_APPROVAL status should not be terminal."""
|
||||
assert StepStatus.AWAITING_APPROVAL.is_terminal() is False
|
||||
|
||||
def test_completed_is_successful(self):
|
||||
"""Only COMPLETED should be successful."""
|
||||
assert StepStatus.COMPLETED.is_successful() is True
|
||||
assert StepStatus.FAILED.is_successful() is False
|
||||
assert StepStatus.SKIPPED.is_successful() is False
|
||||
|
||||
|
||||
class TestPlanStepIsReady:
|
||||
"""Tests for PlanStep.is_ready() with terminal states."""
|
||||
|
||||
def _make_step(self, id: str, deps: list[str] = None, status: StepStatus = StepStatus.PENDING):
|
||||
"""Helper to create a step."""
|
||||
return PlanStep(
|
||||
id=id,
|
||||
description=f"Step {id}",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION, function_name="test"),
|
||||
dependencies=deps or [],
|
||||
status=status,
|
||||
)
|
||||
|
||||
def test_step_ready_when_no_dependencies(self):
|
||||
"""Step with no dependencies should be ready."""
|
||||
step = self._make_step("step1")
|
||||
assert step.is_ready(set()) is True
|
||||
|
||||
def test_step_ready_when_dependency_completed(self):
|
||||
"""Step should be ready when dependency is completed."""
|
||||
step = self._make_step("step2", deps=["step1"])
|
||||
assert step.is_ready({"step1"}) is True
|
||||
|
||||
def test_step_ready_when_dependency_failed(self):
|
||||
"""Step should be ready when dependency failed (terminal state)."""
|
||||
step = self._make_step("step2", deps=["step1"])
|
||||
# step1 is in terminal_step_ids because it failed
|
||||
assert step.is_ready({"step1"}) is True
|
||||
|
||||
def test_step_not_ready_when_dependency_pending(self):
|
||||
"""Step should not be ready when dependency is still pending."""
|
||||
step = self._make_step("step2", deps=["step1"])
|
||||
assert step.is_ready(set()) is False
|
||||
|
||||
def test_step_not_ready_when_already_completed(self):
|
||||
"""Completed step should not be ready."""
|
||||
step = self._make_step("step1", status=StepStatus.COMPLETED)
|
||||
assert step.is_ready(set()) is False
|
||||
|
||||
def test_step_not_ready_when_in_progress(self):
|
||||
"""In-progress step should not be ready."""
|
||||
step = self._make_step("step1", status=StepStatus.IN_PROGRESS)
|
||||
assert step.is_ready(set()) is False
|
||||
|
||||
def test_step_ready_with_multiple_dependencies_all_terminal(self):
|
||||
"""Step should be ready when all dependencies are terminal."""
|
||||
step = self._make_step("step3", deps=["step1", "step2"])
|
||||
assert step.is_ready({"step1", "step2"}) is True
|
||||
|
||||
def test_step_not_ready_with_partial_dependencies(self):
|
||||
"""Step should not be ready when only some dependencies are terminal."""
|
||||
step = self._make_step("step3", deps=["step1", "step2"])
|
||||
assert step.is_ready({"step1"}) is False
|
||||
|
||||
|
||||
class TestPlanGetReadySteps:
|
||||
"""Tests for Plan.get_ready_steps() with failed dependencies."""
|
||||
|
||||
def _make_plan(self, steps: list[PlanStep]) -> Plan:
|
||||
"""Helper to create a plan."""
|
||||
return Plan(
|
||||
id="test_plan",
|
||||
goal_id="test_goal",
|
||||
description="Test plan",
|
||||
steps=steps,
|
||||
)
|
||||
|
||||
def _make_step(self, id: str, deps: list[str] = None, status: StepStatus = StepStatus.PENDING):
|
||||
"""Helper to create a step."""
|
||||
return PlanStep(
|
||||
id=id,
|
||||
description=f"Step {id}",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION, function_name="test"),
|
||||
dependencies=deps or [],
|
||||
status=status,
|
||||
)
|
||||
|
||||
def test_ready_steps_with_no_dependencies(self):
|
||||
"""Steps with no dependencies should be ready."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1"),
|
||||
self._make_step("step2"),
|
||||
]
|
||||
)
|
||||
ready = plan.get_ready_steps()
|
||||
assert len(ready) == 2
|
||||
assert {s.id for s in ready} == {"step1", "step2"}
|
||||
|
||||
def test_ready_steps_with_completed_dependency(self):
|
||||
"""Dependent step should be ready when dependency is completed."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", status=StepStatus.COMPLETED),
|
||||
self._make_step("step2", deps=["step1"]),
|
||||
]
|
||||
)
|
||||
ready = plan.get_ready_steps()
|
||||
assert len(ready) == 1
|
||||
assert ready[0].id == "step2"
|
||||
|
||||
def test_ready_steps_with_failed_dependency(self):
|
||||
"""Dependent step should be ready when dependency failed."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", status=StepStatus.FAILED),
|
||||
self._make_step("step2", deps=["step1"]),
|
||||
]
|
||||
)
|
||||
ready = plan.get_ready_steps()
|
||||
assert len(ready) == 1
|
||||
assert ready[0].id == "step2"
|
||||
|
||||
def test_ready_steps_with_skipped_dependency(self):
|
||||
"""Dependent step should be ready when dependency was skipped."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", status=StepStatus.SKIPPED),
|
||||
self._make_step("step2", deps=["step1"]),
|
||||
]
|
||||
)
|
||||
ready = plan.get_ready_steps()
|
||||
assert len(ready) == 1
|
||||
assert ready[0].id == "step2"
|
||||
|
||||
def test_ready_steps_with_rejected_dependency(self):
|
||||
"""Dependent step should be ready when dependency was rejected."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", status=StepStatus.REJECTED),
|
||||
self._make_step("step2", deps=["step1"]),
|
||||
]
|
||||
)
|
||||
ready = plan.get_ready_steps()
|
||||
assert len(ready) == 1
|
||||
assert ready[0].id == "step2"
|
||||
|
||||
def test_no_ready_steps_when_dependency_in_progress(self):
|
||||
"""Dependent step should not be ready when dependency is in progress."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", status=StepStatus.IN_PROGRESS),
|
||||
self._make_step("step2", deps=["step1"]),
|
||||
]
|
||||
)
|
||||
ready = plan.get_ready_steps()
|
||||
assert len(ready) == 0
|
||||
|
||||
|
||||
class TestPlanCompletion:
|
||||
"""Tests for Plan completion status methods."""
|
||||
|
||||
def _make_plan(self, steps: list[PlanStep]) -> Plan:
|
||||
"""Helper to create a plan."""
|
||||
return Plan(
|
||||
id="test_plan",
|
||||
goal_id="test_goal",
|
||||
description="Test plan",
|
||||
steps=steps,
|
||||
)
|
||||
|
||||
def _make_step(self, id: str, status: StepStatus = StepStatus.PENDING):
|
||||
"""Helper to create a step."""
|
||||
return PlanStep(
|
||||
id=id,
|
||||
description=f"Step {id}",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION, function_name="test"),
|
||||
status=status,
|
||||
)
|
||||
|
||||
def test_is_complete_when_all_completed(self):
|
||||
"""Plan should be complete when all steps are completed."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", StepStatus.COMPLETED),
|
||||
self._make_step("step2", StepStatus.COMPLETED),
|
||||
]
|
||||
)
|
||||
assert plan.is_complete() is True
|
||||
|
||||
def test_is_complete_when_all_terminal_mixed(self):
|
||||
"""Plan should be complete when all steps are in terminal states (mixed)."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", StepStatus.COMPLETED),
|
||||
self._make_step("step2", StepStatus.FAILED),
|
||||
self._make_step("step3", StepStatus.SKIPPED),
|
||||
]
|
||||
)
|
||||
assert plan.is_complete() is True
|
||||
|
||||
def test_is_not_complete_when_pending(self):
|
||||
"""Plan should not be complete when steps are pending."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", StepStatus.COMPLETED),
|
||||
self._make_step("step2", StepStatus.PENDING),
|
||||
]
|
||||
)
|
||||
assert plan.is_complete() is False
|
||||
|
||||
def test_is_not_complete_when_in_progress(self):
|
||||
"""Plan should not be complete when steps are in progress."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", StepStatus.COMPLETED),
|
||||
self._make_step("step2", StepStatus.IN_PROGRESS),
|
||||
]
|
||||
)
|
||||
assert plan.is_complete() is False
|
||||
|
||||
def test_is_successful_when_all_completed(self):
|
||||
"""Plan should be successful only when all steps completed."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", StepStatus.COMPLETED),
|
||||
self._make_step("step2", StepStatus.COMPLETED),
|
||||
]
|
||||
)
|
||||
assert plan.is_successful() is True
|
||||
|
||||
def test_is_not_successful_when_failed(self):
|
||||
"""Plan should not be successful when any step failed."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", StepStatus.COMPLETED),
|
||||
self._make_step("step2", StepStatus.FAILED),
|
||||
]
|
||||
)
|
||||
assert plan.is_successful() is False
|
||||
|
||||
def test_has_failed_steps(self):
|
||||
"""has_failed_steps should detect failed steps."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", StepStatus.COMPLETED),
|
||||
self._make_step("step2", StepStatus.FAILED),
|
||||
]
|
||||
)
|
||||
assert plan.has_failed_steps() is True
|
||||
|
||||
def test_has_no_failed_steps(self):
|
||||
"""has_failed_steps should return False when all succeeded."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", StepStatus.COMPLETED),
|
||||
self._make_step("step2", StepStatus.COMPLETED),
|
||||
]
|
||||
)
|
||||
assert plan.has_failed_steps() is False
|
||||
|
||||
def test_get_failed_steps(self):
|
||||
"""get_failed_steps should return all failed/skipped/rejected steps."""
|
||||
plan = self._make_plan(
|
||||
[
|
||||
self._make_step("step1", StepStatus.COMPLETED),
|
||||
self._make_step("step2", StepStatus.FAILED),
|
||||
self._make_step("step3", StepStatus.SKIPPED),
|
||||
self._make_step("step4", StepStatus.REJECTED),
|
||||
]
|
||||
)
|
||||
failed = plan.get_failed_steps()
|
||||
assert len(failed) == 3
|
||||
assert {s.id for s in failed} == {"step2", "step3", "step4"}
|
||||
|
||||
|
||||
class TestBugScenario:
|
||||
"""Test the specific bug scenario that was fixed."""
|
||||
|
||||
def _make_step(self, id: str, deps: list[str] = None, status: StepStatus = StepStatus.PENDING):
|
||||
"""Helper to create a step."""
|
||||
return PlanStep(
|
||||
id=id,
|
||||
description=f"Step {id}",
|
||||
action=ActionSpec(action_type=ActionType.FUNCTION, function_name="test"),
|
||||
dependencies=deps or [],
|
||||
status=status,
|
||||
)
|
||||
|
||||
def test_dependent_step_becomes_ready_after_dependency_fails(self):
|
||||
"""
|
||||
BUG SCENARIO: When step1 fails, step2 (which depends on step1) should
|
||||
become ready, allowing the executor to handle it appropriately.
|
||||
|
||||
Before fix: step2 would never become ready, causing infinite hang.
|
||||
After fix: step2 becomes ready and executor can decide how to handle it.
|
||||
"""
|
||||
plan = Plan(
|
||||
id="test_plan",
|
||||
goal_id="test_goal",
|
||||
description="Test plan with dependency",
|
||||
steps=[
|
||||
self._make_step("step1", status=StepStatus.PENDING),
|
||||
self._make_step("step2", deps=["step1"], status=StepStatus.PENDING),
|
||||
],
|
||||
)
|
||||
|
||||
# Initially, only step1 is ready
|
||||
ready = plan.get_ready_steps()
|
||||
assert len(ready) == 1
|
||||
assert ready[0].id == "step1"
|
||||
|
||||
# Simulate step1 failing
|
||||
plan.steps[0].status = StepStatus.FAILED
|
||||
|
||||
# Now step2 should be ready (dependency is in terminal state)
|
||||
ready = plan.get_ready_steps()
|
||||
assert len(ready) == 1
|
||||
assert ready[0].id == "step2"
|
||||
|
||||
# Plan should not be complete yet (step2 is still pending)
|
||||
assert plan.is_complete() is False
|
||||
|
||||
# Simulate step2 also failing (or being skipped due to failed dependency)
|
||||
plan.steps[1].status = StepStatus.SKIPPED
|
||||
|
||||
# Now plan should be complete (all steps in terminal states)
|
||||
assert plan.is_complete() is True
|
||||
|
||||
# But not successful
|
||||
assert plan.is_successful() is False
|
||||
|
||||
# And should have failed steps
|
||||
assert plan.has_failed_steps() is True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
@@ -46,7 +46,7 @@ class TestNodeSpecOutputModel:
|
||||
id="test_node",
|
||||
name="Test Node",
|
||||
description="A test node",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
output_model=SimpleOutput,
|
||||
)
|
||||
|
||||
@@ -400,7 +400,7 @@ class TestPydanticValidationIntegrationExtended:
|
||||
id="full_test",
|
||||
name="Full Validation Test",
|
||||
description="Tests all validation options",
|
||||
node_type="llm_generate",
|
||||
node_type="event_loop",
|
||||
output_keys=["category", "priority", "summary", "suggested_action"],
|
||||
output_model=TicketAnalysis,
|
||||
max_validation_retries=3,
|
||||
|
||||
@@ -51,7 +51,7 @@ class TestRuntimeLogStore:
|
||||
detail2 = NodeDetail(
|
||||
node_id="node-2",
|
||||
node_name="Process Node",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
success=True,
|
||||
total_steps=1,
|
||||
)
|
||||
@@ -64,7 +64,7 @@ class TestRuntimeLogStore:
|
||||
assert len(loaded.nodes) == 2
|
||||
assert loaded.nodes[0].node_id == "node-1"
|
||||
assert loaded.nodes[0].exit_status == "success"
|
||||
assert loaded.nodes[1].node_type == "function"
|
||||
assert loaded.nodes[1].node_type == "event_loop"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_append_and_load_tool_logs(self, tmp_path: Path):
|
||||
@@ -606,14 +606,14 @@ class TestRuntimeLogger:
|
||||
# Node 2: function
|
||||
rt_logger.log_step(
|
||||
node_id="node-2",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
step_index=0,
|
||||
latency_ms=50,
|
||||
)
|
||||
rt_logger.log_node_complete(
|
||||
node_id="node-2",
|
||||
node_name="Process",
|
||||
node_type="function",
|
||||
node_type="event_loop",
|
||||
success=True,
|
||||
total_steps=1,
|
||||
latency_ms=50,
|
||||
|
||||
@@ -82,9 +82,9 @@ learn from the corrections to improve accuracy.
|
||||
| Feature | AutoGen | Aden |
|
||||
|---------|---------|------|
|
||||
| Agent-to-agent | Natural language | Generated connections |
|
||||
| Conversation history | Built-in | Via memory nodes |
|
||||
| Conversation history | Built-in | Via shared memory |
|
||||
| Message passing | Sequential turns | Async/event-driven |
|
||||
| Human interaction | Via UserProxyAgent | Native HITL nodes |
|
||||
| Human interaction | Via UserProxyAgent | Client-facing nodes |
|
||||
|
||||
**Verdict:** AutoGen is more natural for dialogue; Aden is more flexible for diverse patterns.
|
||||
|
||||
|
||||
@@ -0,0 +1,161 @@
|
||||
# Phase 2: FunctionNode Removal + Dead Code Cleanup
|
||||
|
||||
> Ref: [GitHub Issue #4753](https://github.com/adenhq/hive/issues/4753)
|
||||
|
||||
## Context
|
||||
|
||||
`FunctionNode` (`node_type="function"`) breaks three core agent principles: conversation continuity, cumulative tools, and user interruptibility. Phase 1 (soft deprecation warnings) is complete. This plan covers Phase 2 (hard removal) plus cleanup of other dead code discovered during scoping.
|
||||
|
||||
**Total estimated removal: ~5,000 lines** across production code, tests, docs, and examples.
|
||||
|
||||
---
|
||||
|
||||
## Part 1: Remove `FunctionNode` class and `"function"` node type
|
||||
|
||||
### 1.1 Core framework
|
||||
|
||||
| File | What to remove/change |
|
||||
|---|---|
|
||||
| `core/framework/graph/node.py` | Delete `FunctionNode` class (~L1878-1985). Remove `function` field from `NodeSpec` (~L200). |
|
||||
| `core/framework/graph/executor.py` | Remove `FunctionNode` import (~L24). Remove `"function"` from `VALID_NODE_TYPES` (~L1473). Remove `node_type == "function"` branch (~L1529-1533). Remove `register_function()` (~L1975-1977). Add migration error for graphs with `node_type="function"`. |
|
||||
| `core/framework/builder/workflow.py` | Remove `node_type == "function"` validation block (~L258-260). |
|
||||
|
||||
### 1.2 Agent Builder MCP server
|
||||
|
||||
| File | What to change |
|
||||
|---|---|
|
||||
| `core/framework/mcp/agent_builder_server.py` | Remove `"function"` from `node_type` description in `add_node` (~L590) and `update_node` (~L841). Remove `node_type == "function"` simulation branch in `test_node` (~L2356-2357). |
|
||||
|
||||
### 1.3 Examples & demos
|
||||
|
||||
| File | Action |
|
||||
|---|---|
|
||||
| `core/examples/manual_agent.py` | Rewrite to use `event_loop` nodes |
|
||||
| `core/demos/github_outreach_demo.py` | Convert `Sender` node from `function` to `event_loop` |
|
||||
| `core/examples/mcp_integration_example.py` | Rewrite to use `event_loop` nodes |
|
||||
|
||||
### 1.4 Docs & skills
|
||||
|
||||
| File | Action |
|
||||
|---|---|
|
||||
| `.claude/skills/hive-create/SKILL.md` | Remove `"function"` from node type table (~L495, L856) |
|
||||
| `docs/developer-guide.md` | Remove `"function"` node type reference (~L613) |
|
||||
| `core/MCP_SERVER_GUIDE.md` | Audit for `"function"` references |
|
||||
| `docs/why-conditional-edge-priority.md` | Remove or repurpose (entire doc framed around function nodes) |
|
||||
| `docs/environment-setup.md` | Remove "function" from node types list (~L216) |
|
||||
| `docs/i18n/*.md` | Update BUILD diagrams in 7 i18n files (ja, ko, pt, hi, es, ru, zh-CN) removing "Function" |
|
||||
| `core/framework/runtime/runtime_log_schemas.py` | Remove `"function"` from node_type comment (~L40) |
|
||||
|
||||
---
|
||||
|
||||
## Part 2: Remove deprecated `LLMNode` + `llm_tool_use` / `llm_generate`
|
||||
|
||||
Already soft-deprecated with `DeprecationWarning`. No template agent uses them. Only `mcp_integration_example.py` references them.
|
||||
|
||||
| File | What to remove/change |
|
||||
|---|---|
|
||||
| `core/framework/graph/node.py` | Delete `LLMNode` class (~L660-1689, ~1000 lines). Largest single removal. |
|
||||
| `core/framework/graph/executor.py` | Remove `LLMNode` import. Remove `"llm_tool_use"`/`"llm_generate"` from `VALID_NODE_TYPES`. Remove `DEPRECATED_NODE_TYPES` dict. Remove their branches in `_get_node_implementation` (~L1507-1523). Update `human_input` branch to use `EventLoopNode` instead of `LLMNode`. Add migration error for deprecated types. |
|
||||
| `core/framework/mcp/agent_builder_server.py` | Remove `llm_tool_use`/`llm_generate` validation warnings and branches (~L668-683, L922-937) |
|
||||
|
||||
---
|
||||
|
||||
## Part 3: Rewrite tests using `function` nodes as fixtures
|
||||
|
||||
These tests use `node_type="function"` as convenient scaffolding but actually test graph execution features (retries, fan-out, feedback edges, etc.). They all need rewriting.
|
||||
|
||||
| Test file | What it tests |
|
||||
|---|---|
|
||||
| `core/tests/test_on_failure_edges.py` | On-failure edge routing (~10 function nodes) |
|
||||
| `core/tests/test_executor_feedback_edges.py` | Max node visits, feedback loops (~20+ function nodes) |
|
||||
| `core/tests/test_executor_max_retries.py` | Retry behavior (~7 function nodes) |
|
||||
| `core/tests/test_fanout.py` | Fan-out/fan-in parallel execution (~20+ function nodes) |
|
||||
| `core/tests/test_execution_quality.py` | Retry + quality scoring (~8 function nodes) |
|
||||
| `core/tests/test_conditional_edge_direct_key.py` | Conditional edge evaluation (~8 function nodes) |
|
||||
| `core/tests/test_event_loop_integration.py` | Mixed node graph test (~2 function nodes) |
|
||||
| `core/tests/test_runtime_logger.py` | Runtime log schema (~2 references) |
|
||||
| `tools/tests/tools/test_runtime_logs_tool.py` | Log tool output (~2 references) |
|
||||
|
||||
**Strategy:** Create a `MockNode(NodeProtocol)` test helper that wraps a callable, providing the same convenience as `FunctionNode` but scoped to tests only. Tests swap `node_type="function"` for the standard `node_type="event_loop"` and register a `MockNode` in the executor's `node_registry`. This minimizes rewrite effort.
|
||||
|
||||
---
|
||||
|
||||
## Part 4: Items NOT recommended for removal
|
||||
|
||||
| Item | Reason to keep |
|
||||
|---|---|
|
||||
| `RouterNode` | Architecturally sound (deterministic routing), just lacks template examples |
|
||||
| `human_input` node type | Valid HITL pattern, but switch implementation from `LLMNode` to `EventLoopNode` |
|
||||
| `register_function` in `tool_registry.py` | For **tool** registration — completely different concept from function nodes |
|
||||
|
||||
---
|
||||
|
||||
## Part 5: Remove the Planner-Worker subsystem (~3,900 lines dead code)
|
||||
|
||||
The entire Planner-Worker-Judge pattern has **zero external consumers**. No template agent, example, demo, or runner references it. It is only consumed by:
|
||||
- Its own internal files (self-referential imports)
|
||||
- The agent-builder MCP server (exposes tools for it)
|
||||
- Its own dedicated tests
|
||||
|
||||
### 5.1 Delete these files entirely
|
||||
|
||||
| File | Lines | What |
|
||||
|---|---|---|
|
||||
| `core/framework/graph/flexible_executor.py` | 552 | `FlexibleGraphExecutor` — Worker-Judge orchestrator |
|
||||
| `core/framework/graph/worker_node.py` | 620 | `WorkerNode` — plan step dispatcher |
|
||||
| `core/framework/graph/plan.py` | 513 | `Plan`, `PlanStep`, `ActionType`, `ActionSpec` data structures |
|
||||
| `core/framework/graph/judge.py` | 406 | `HybridJudge` — step result evaluator |
|
||||
| `core/framework/graph/code_sandbox.py` | 413 | `CodeSandbox` — sandboxed code execution |
|
||||
| `core/tests/test_flexible_executor.py` | 442 | FlexibleGraphExecutor tests |
|
||||
| `core/tests/test_plan.py` | 592 | Plan data structure tests |
|
||||
| `core/tests/test_plan_dependency_resolution.py` | 384 | Plan dependency resolution tests |
|
||||
|
||||
### 5.2 Clean up exports
|
||||
|
||||
`core/framework/graph/__init__.py` — Remove all planner-worker exports: `FlexibleGraphExecutor`, `ExecutorConfig`, `WorkerNode`, `StepExecutionResult`, `HybridJudge`, `create_default_judge`, `CodeSandbox`, `safe_eval`, `safe_exec`, `Plan`, `PlanStep`, `ActionType`, `ActionSpec`, and all related symbols.
|
||||
|
||||
### 5.3 Remove MCP tools from agent-builder server
|
||||
|
||||
`core/framework/mcp/agent_builder_server.py` — Remove these 7 MCP tools:
|
||||
|
||||
| MCP tool | Description |
|
||||
|---|---|
|
||||
| `create_plan` | Creates a plan with steps |
|
||||
| `validate_plan` | Validates plan structure |
|
||||
| `simulate_plan_execution` | Dry-run simulation |
|
||||
| `load_exported_plan` | Loads plan from JSON |
|
||||
| `add_evaluation_rule` | Adds HybridJudge rule |
|
||||
| `list_evaluation_rules` | Lists evaluation rules |
|
||||
| `remove_evaluation_rule` | Removes evaluation rule |
|
||||
|
||||
Also remove:
|
||||
- `from framework.graph.plan import Plan` import (~L39, L3731)
|
||||
- `_evaluation_rules` global list (~L2528)
|
||||
- `"evaluation_rules"` from export/session data (~L1859)
|
||||
- `load_plan_from_json()` helper function (~L3721-3733)
|
||||
|
||||
---
|
||||
|
||||
## Execution order
|
||||
|
||||
1. **Create `MockNode` test helper** — unblocks all test rewrites
|
||||
2. **Rewrite tests** using function nodes as fixtures (Part 3)
|
||||
3. **Remove `FunctionNode` class + all references** (Part 1)
|
||||
4. **Remove `LLMNode` class + deprecated types** (Part 2)
|
||||
5. **Delete Planner-Worker subsystem files** (Part 5.1)
|
||||
6. **Clean up `__init__.py` exports** (Part 5.2)
|
||||
7. **Remove MCP tools** for plans/evaluation from agent-builder server (Part 5.3)
|
||||
8. **Update examples/demos/docs/skills** (Parts 1.3, 1.4)
|
||||
9. **Run full test suite** to verify
|
||||
|
||||
---
|
||||
|
||||
## Verification
|
||||
|
||||
1. `pytest core/tests/` — all tests pass
|
||||
2. `pytest tools/tests/` — runtime log tests pass
|
||||
3. Load any template agent JSON — no errors
|
||||
4. Attempt to load a graph with `node_type="function"` — clear `RuntimeError` with migration guidance
|
||||
5. Attempt to load a graph with `node_type="llm_tool_use"` — clear `RuntimeError` with migration guidance
|
||||
6. Agent builder MCP: `add_node` with `node_type="function"` — rejected with helpful message
|
||||
7. Plan/evaluation MCP tools no longer appear in tool list
|
||||
@@ -278,7 +278,7 @@ claude> /hive-test
|
||||
2. **Design the Workflow**
|
||||
|
||||
- The skill guides you through defining nodes
|
||||
- Each node is a unit of work (LLM call, function, router)
|
||||
- Each node is a unit of work (LLM call with event_loop)
|
||||
- Edges define how execution flows
|
||||
|
||||
3. **Generate the Agent**
|
||||
@@ -314,7 +314,7 @@ If you prefer to build agents manually:
|
||||
{
|
||||
"node_id": "analyze",
|
||||
"name": "Analyze Ticket",
|
||||
"node_type": "llm_generate",
|
||||
"node_type": "event_loop",
|
||||
"system_prompt": "Analyze this support ticket...",
|
||||
"input_keys": ["ticket_content"],
|
||||
"output_keys": ["category", "priority"]
|
||||
@@ -610,7 +610,7 @@ def my_custom_tool(param1: str, param2: int) -> Dict[str, Any]:
|
||||
"nodes": [
|
||||
{
|
||||
"node_id": "use_tool",
|
||||
"node_type": "function",
|
||||
"node_type": "event_loop",
|
||||
"tools": ["my_custom_tool"],
|
||||
...
|
||||
}
|
||||
|
||||
@@ -213,7 +213,7 @@ Follow the prompts to:
|
||||
|
||||
1. Understand the agent architecture and file structure
|
||||
2. Define the agent's goal, success criteria, and constraints
|
||||
3. Learn node types (LLM, tool-use, router, function)
|
||||
3. Learn node types (event_loop only)
|
||||
4. Discover and validate available tools before use
|
||||
|
||||
This step establishes the core concepts and rules needed before building an agent.
|
||||
|
||||
+1
-1
@@ -119,7 +119,7 @@ Los frameworks de agentes tradicionales requieren que diseñes manualmente flujo
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph BUILD["🏗️ BUILD"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
|
||||
NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
|
||||
EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
|
||||
end
|
||||
|
||||
+1
-1
@@ -128,7 +128,7 @@ PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph BUILD["🏗️ BUILD"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
|
||||
NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
|
||||
EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
|
||||
end
|
||||
|
||||
+1
-1
@@ -121,7 +121,7 @@ PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph BUILD["🏗️ BUILD"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
|
||||
NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
|
||||
EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
|
||||
end
|
||||
|
||||
+1
-1
@@ -120,7 +120,7 @@ PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph BUILD["🏗️ BUILD"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
|
||||
NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
|
||||
EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
|
||||
end
|
||||
|
||||
+1
-1
@@ -121,7 +121,7 @@ Frameworks de agentes tradicionais exigem que você projete manualmente fluxos d
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph BUILD["🏗️ BUILD"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
|
||||
NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
|
||||
EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
|
||||
end
|
||||
|
||||
+1
-1
@@ -121,7 +121,7 @@ PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph BUILD["🏗️ BUILD"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
|
||||
NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
|
||||
EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
|
||||
end
|
||||
|
||||
+1
-1
@@ -121,7 +121,7 @@ PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph BUILD["🏗️ BUILD"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
|
||||
GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
|
||||
NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
|
||||
EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
|
||||
end
|
||||
|
||||
+10
-12
@@ -10,17 +10,15 @@ Edges can loop back, creating feedback cycles where an agent retries a step or t
|
||||
|
||||
## Nodes
|
||||
|
||||
A node is a unit of work. Each node reads inputs from shared memory, does something, and writes outputs back. There are a handful of node types, each suited to a different kind of work:
|
||||
A node is a unit of work. Each node reads inputs from shared memory, does something, and writes outputs back.
|
||||
|
||||
**`event_loop`** — The workhorse. This is a multi-turn LLM loop: the model reasons about the current state, calls tools, observes results, and keeps going until it has produced the required outputs. Most of the interesting agent behavior happens in these nodes. They handle long-running tasks, manage their own context window, and can recover from crashes mid-conversation.
|
||||
**`event_loop`** — This is the only node type in Hive. It's a multi-turn LLM loop where the model reasons about the current state, calls tools, observes results, and keeps going until it has produced the required outputs. All agent behavior happens in these nodes. They handle long-running tasks, manage their own context window, and can recover from crashes mid-conversation.
|
||||
|
||||
**`function`** — A plain Python function. No LLM involved. Use these for anything deterministic: data transformation, API calls with known parameters, validation logic, or any step where you don't want a language model making judgment calls.
|
||||
|
||||
**`router`** — A decision point that directs execution down different paths. Can be rule-based ("if confidence is high, go left; otherwise, go right") or LLM-powered ("given the goal and what we know so far, which path makes sense?").
|
||||
|
||||
**`human_input`** — A pause point where the agent stops and asks a human for input before continuing. See [Human-in-the-Loop](#human-in-the-loop) below.
|
||||
|
||||
There are also simpler LLM node types (`llm_tool_use` for a single LLM call with tools, `llm_generate` for pure text generation) for steps that don't need the full event loop.
|
||||
Event loop nodes are highly configurable:
|
||||
- **Tools** — Give the node access to specific capabilities (web search, API calls, database queries, etc.)
|
||||
- **Client-facing** — Set `client_facing=True` to make the node interact directly with humans (see [Human-in-the-Loop](#human-in-the-loop))
|
||||
- **Custom logic** — Implement the `NodeProtocol` interface to wrap deterministic functions or any custom behavior
|
||||
- **Judge** — Configure evaluation criteria to control when the node accepts its output vs. retries
|
||||
|
||||
### Self-Correction Within a Node
|
||||
|
||||
@@ -57,11 +55,11 @@ Data flows through the graph in a natural way: input arrives at the start, each
|
||||
|
||||
## Human-in-the-Loop
|
||||
|
||||
Human-in-the-loop (HITL) nodes are where the agent pauses and asks a person for input. This isn't a blunt "stop everything" — the framework supports structured questions: open-ended text, multiple choice, yes/no approvals, and multi-field forms.
|
||||
Human-in-the-loop (HITL) is enabled by setting `client_facing=True` on an event loop node. These nodes pause and ask a person for input. This isn't a blunt "stop everything" — the framework supports structured questions: open-ended text, multiple choice, yes/no approvals, and multi-field forms.
|
||||
|
||||
When the agent hits a HITL node, it saves its entire state and presents the questions. The session can sit paused for minutes, hours, or days. When the human responds, execution picks up exactly where it left off.
|
||||
When the agent hits a client-facing node, it saves its entire state and presents the output or questions directly to the user. The session can sit paused for minutes, hours, or days. When the human responds, execution picks up exactly where it left off.
|
||||
|
||||
This is what makes Hive agents supervisable in production. You place HITL nodes at critical decision points — before sending a message, before making a purchase, before any action that's hard to undo. The agent handles the routine work autonomously; humans weigh in on the decisions that matter. And every time a human provides input, that decision becomes data the [evolution](./evolution.md) process can learn from.
|
||||
This is what makes Hive agents supervisable in production. You place client-facing nodes at critical decision points — before sending a message, before making a purchase, before any action that's hard to undo. The agent handles the routine work autonomously; humans weigh in on the decisions that matter. And every time a human provides input, that decision becomes data the [evolution](./evolution.md) process can learn from.
|
||||
|
||||
## The Shape of an Agent
|
||||
|
||||
|
||||
+1
-1
@@ -241,7 +241,7 @@ classDef done fill:#9e9e9e,color:#fff,stroke:#757575
|
||||
- [ ] Migrate from monolithic run storage
|
||||
- [ ] **Context Building & Conversation Loop**
|
||||
- [ ] Implement `Message.stream(sessionID)`
|
||||
- [ ] Update `LLMNode.execute()` for full context building
|
||||
- [ ] Update `EventLoopNode.execute()` for full context building
|
||||
- [ ] Implement `Message.toModelMessages()` conversion
|
||||
- [ ] **Proactive Compaction**
|
||||
- [ ] Implement proactive overflow detection
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
# Why Conditional Edges Need Priority (Function Nodes)
|
||||
|
||||
## The problem
|
||||
|
||||
Function nodes return everything they computed. They don't pick one output key — they return all of them.
|
||||
|
||||
```python
|
||||
def score_lead(inputs):
|
||||
score = compute_score(inputs["profile"])
|
||||
return {
|
||||
"score": score,
|
||||
"is_high_value": score > 80,
|
||||
"needs_enrichment": score > 50 and not inputs["profile"].get("company"),
|
||||
}
|
||||
```
|
||||
|
||||
Lead comes in: score 92, no company on file. Output: `{"score": 92, "is_high_value": True, "needs_enrichment": True}`.
|
||||
|
||||
Two conditional edges leaving this node:
|
||||
|
||||
```
|
||||
Edge A: needs_enrichment == True → enrichment node
|
||||
Edge B: is_high_value == True → outreach node
|
||||
```
|
||||
|
||||
Both are true. Without priority, the graph either fans out to both (wrong — you'd email someone while still enriching their data) or picks one randomly (wrong — non-deterministic).
|
||||
|
||||
## Priority fixes it
|
||||
|
||||
```
|
||||
Edge A: needs_enrichment == True priority=2 (higher = checked first)
|
||||
Edge B: is_high_value == True priority=1
|
||||
Edge C: is_high_value == False priority=0
|
||||
```
|
||||
|
||||
Executor keeps only the highest-priority matching group. A wins. Lead gets enriched first, loops back, gets re-scored — now `needs_enrichment` is false, B wins, outreach happens.
|
||||
|
||||
## Why event loop nodes don't need this
|
||||
|
||||
The LLM understands "if/else." You tell it in the prompt: "if needs enrichment, set `needs_enrichment`. Otherwise if high value, set `approved`." It picks one. Only one conditional edge matches.
|
||||
|
||||
A function just returns a dict. It doesn't do "otherwise." Priority is the "otherwise" for function nodes.
|
||||
@@ -5,10 +5,11 @@ from pathlib import Path
|
||||
from framework.graph import EdgeSpec, EdgeCondition, Goal, SuccessCriterion, Constraint
|
||||
from framework.graph.checkpoint_config import CheckpointConfig
|
||||
from framework.graph.edge import AsyncEntryPointSpec, GraphSpec
|
||||
from framework.graph.executor import ExecutionResult
|
||||
from framework.graph.executor import ExecutionResult, GraphExecutor
|
||||
from framework.llm import LiteLLMProvider
|
||||
from framework.runner.tool_registry import ToolRegistry
|
||||
from framework.runtime.agent_runtime import AgentRuntime, create_agent_runtime
|
||||
from framework.runtime.agent_runtime import create_agent_runtime
|
||||
from framework.runtime.event_bus import EventBus
|
||||
from framework.runtime.execution_stream import EntryPointSpec
|
||||
|
||||
from .config import default_config, metadata
|
||||
@@ -187,10 +188,10 @@ class EmailInboxManagementAgent:
|
||||
self.entry_points = entry_points
|
||||
self.pause_nodes = pause_nodes
|
||||
self.terminal_nodes = terminal_nodes
|
||||
self._executor: GraphExecutor | None = None
|
||||
self._graph: GraphSpec | None = None
|
||||
self._agent_runtime: AgentRuntime | None = None
|
||||
self._event_bus: EventBus | None = None
|
||||
self._tool_registry: ToolRegistry | None = None
|
||||
self._storage_path: Path | None = None
|
||||
|
||||
def _build_graph(self) -> GraphSpec:
|
||||
"""Build the GraphSpec."""
|
||||
@@ -217,6 +218,7 @@ class EmailInboxManagementAgent:
|
||||
self._storage_path = Path.home() / ".hive" / "agents" / "email_inbox_management"
|
||||
self._storage_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self._event_bus = EventBus()
|
||||
self._tool_registry = ToolRegistry()
|
||||
|
||||
mcp_config_path = Path(__file__).parent / "mcp_servers.json"
|
||||
@@ -282,12 +284,12 @@ class EmailInboxManagementAgent:
|
||||
checkpoint_config=checkpoint_config,
|
||||
)
|
||||
|
||||
return self._executor
|
||||
|
||||
async def start(self, mock_mode=False) -> None:
|
||||
"""Set up and start the agent runtime."""
|
||||
if self._agent_runtime is None:
|
||||
"""Set up the agent (initialize executor and tools)."""
|
||||
if self._executor is None:
|
||||
self._setup(mock_mode=mock_mode)
|
||||
if not self._agent_runtime.is_running:
|
||||
await self._agent_runtime.start()
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""Stop and clean up the agent runtime."""
|
||||
@@ -302,8 +304,10 @@ class EmailInboxManagementAgent:
|
||||
session_state: dict | None = None,
|
||||
) -> ExecutionResult | None:
|
||||
"""Execute the graph and wait for completion."""
|
||||
if self._agent_runtime is None:
|
||||
if self._executor is None:
|
||||
raise RuntimeError("Agent not started. Call start() first.")
|
||||
if self._graph is None:
|
||||
raise RuntimeError("Graph not built. Call start() first.")
|
||||
|
||||
return await self._agent_runtime.trigger_and_wait(
|
||||
entry_point_id=entry_point,
|
||||
|
||||
@@ -29,7 +29,7 @@ class CredentialSpec:
|
||||
"""Tool names that require this credential (e.g., ['web_search'])"""
|
||||
|
||||
node_types: list[str] = field(default_factory=list)
|
||||
"""Node types that require this credential (e.g., ['llm_generate', 'llm_tool_use'])"""
|
||||
"""Node types that require this credential (e.g., ['event_loop'])"""
|
||||
|
||||
required: bool = True
|
||||
"""Whether this credential is required (vs optional)"""
|
||||
@@ -321,7 +321,7 @@ class CredentialManager:
|
||||
Get list of missing credentials for the given node types.
|
||||
|
||||
Args:
|
||||
node_types: List of node types to check (e.g., ['llm_generate', 'llm_tool_use'])
|
||||
node_types: List of node types to check (e.g., ['event_loop'])
|
||||
|
||||
Returns:
|
||||
List of (credential_name, spec) tuples for missing credentials
|
||||
@@ -357,7 +357,7 @@ class CredentialManager:
|
||||
|
||||
Example:
|
||||
creds = CredentialManager()
|
||||
creds.validate_for_node_types(["llm_generate", "llm_tool_use"])
|
||||
creds.validate_for_node_types(["event_loop"])
|
||||
# Raises CredentialError if ANTHROPIC_API_KEY is not set
|
||||
"""
|
||||
missing = self.get_missing_for_node_types(node_types)
|
||||
|
||||
@@ -10,7 +10,7 @@ LLM_CREDENTIALS = {
|
||||
"anthropic": CredentialSpec(
|
||||
env_var="ANTHROPIC_API_KEY",
|
||||
tools=[],
|
||||
node_types=["llm_generate", "llm_tool_use"],
|
||||
node_types=["event_loop"],
|
||||
required=False, # Not required - agents can use other providers via LiteLLM
|
||||
startup_required=False, # MCP server doesn't need LLM credentials
|
||||
help_url="https://console.anthropic.com/settings/keys",
|
||||
|
||||
@@ -51,8 +51,9 @@ class TestGoogleDocsCreateDocument:
|
||||
|
||||
def test_service_account_json_without_access_token_is_not_used(self, mcp):
|
||||
"""Test that service account JSON alone is not treated as an access token."""
|
||||
env = {"GOOGLE_SERVICE_ACCOUNT_JSON": '{"type":"service_account"}'}
|
||||
with patch.dict("os.environ", env):
|
||||
with patch.dict(
|
||||
"os.environ", {"GOOGLE_SERVICE_ACCOUNT_JSON": '{"type":"service_account"}'}
|
||||
):
|
||||
tool_fn = get_tool_fn(mcp, "google_docs_create_document")
|
||||
result = tool_fn(title="Test Document")
|
||||
assert "error" in result
|
||||
|
||||
@@ -278,7 +278,7 @@ class TestCredentialSpec:
|
||||
spec = CredentialSpec(
|
||||
env_var="API_KEY",
|
||||
tools=["tool_a", "tool_b"],
|
||||
node_types=["llm_generate"],
|
||||
node_types=["event_loop"],
|
||||
required=False,
|
||||
startup_required=True,
|
||||
help_url="https://example.com",
|
||||
@@ -287,7 +287,7 @@ class TestCredentialSpec:
|
||||
|
||||
assert spec.env_var == "API_KEY"
|
||||
assert spec.tools == ["tool_a", "tool_b"]
|
||||
assert spec.node_types == ["llm_generate"]
|
||||
assert spec.node_types == ["event_loop"]
|
||||
assert spec.required is False
|
||||
assert spec.startup_required is True
|
||||
assert spec.help_url == "https://example.com"
|
||||
@@ -315,8 +315,7 @@ class TestCredentialSpecs:
|
||||
spec = CREDENTIAL_SPECS["anthropic"]
|
||||
assert spec.env_var == "ANTHROPIC_API_KEY"
|
||||
assert spec.tools == []
|
||||
assert "llm_generate" in spec.node_types
|
||||
assert "llm_tool_use" in spec.node_types
|
||||
assert "event_loop" in spec.node_types
|
||||
assert spec.required is False
|
||||
assert spec.startup_required is False
|
||||
assert "anthropic.com" in spec.help_url
|
||||
@@ -399,7 +398,7 @@ class TestNodeTypeValidation:
|
||||
creds = CredentialStoreAdapter.with_env_storage()
|
||||
|
||||
# Should not raise
|
||||
creds.validate_for_node_types(["llm_generate", "llm_tool_use"])
|
||||
creds.validate_for_node_types(["event_loop"])
|
||||
|
||||
|
||||
class TestStartupValidation:
|
||||
|
||||
@@ -68,7 +68,7 @@ def runtime_logs_dir(tmp_path: Path) -> Path:
|
||||
{
|
||||
"node_id": "node-2",
|
||||
"node_name": "Format",
|
||||
"node_type": "function",
|
||||
"node_type": "event_loop",
|
||||
"success": True,
|
||||
"total_steps": 1,
|
||||
"tokens_used": 0,
|
||||
@@ -112,7 +112,7 @@ def runtime_logs_dir(tmp_path: Path) -> Path:
|
||||
},
|
||||
{
|
||||
"node_id": "node-2",
|
||||
"node_type": "function",
|
||||
"node_type": "event_loop",
|
||||
"step_index": 0,
|
||||
"llm_text": "",
|
||||
"tool_calls": [],
|
||||
|
||||
Reference in New Issue
Block a user