diff --git a/.claude/skills/hive-create/SKILL.md b/.claude/skills/hive-create/SKILL.md index e611a2df..b3d3e5ef 100644 --- a/.claude/skills/hive-create/SKILL.md +++ b/.claude/skills/hive-create/SKILL.md @@ -492,7 +492,7 @@ AskUserQuestion(questions=[{ - node_id (kebab-case) - name - description -- node_type: `"event_loop"` (recommended for all LLM work) or `"function"` (deterministic, no LLM) +- node_type: `"event_loop"` (the only valid type; use `client_facing: True` for HITL) - input_keys (what data this node receives) - output_keys (what data this node produces) - tools (ONLY tools that exist from Step 1 — empty list if no tools needed) @@ -852,8 +852,7 @@ cd /home/timothy/oss/hive && PYTHONPATH=exports uv run python -m AGENT_NAME vali | Type | tools param | Use when | | ------------ | ----------------------- | --------------------------------------- | -| `event_loop` | `'["tool1"]'` or `'[]'` | LLM-powered work with or without tools | -| `function` | N/A | Deterministic Python operations, no LLM | +| `event_loop` | `'["tool1"]'` or `'[]'` | All agent work (with or without tools, HITL via client_facing) | --- @@ -1008,7 +1007,7 @@ Use this reference during STEP 2 to give accurate, honest assessments. | Sub-second responses | LLM latency is inherent | Traditional code, no LLM | | Processing millions of items | Context windows and rate limits | Batch processing + sampling | | Real-time streaming data | No built-in pub/sub or streaming input | Custom MCP server + agent | -| Guaranteed determinism | LLM outputs vary | Function nodes for deterministic parts | +| Guaranteed determinism | LLM outputs vary | Traditional code for deterministic parts | | Offline/air-gapped | Requires LLM API access | Local models (not currently supported) | | Multi-user concurrency | Single-user session model | Separate agent instances per user | diff --git a/core/MCP_BUILDER_TOOLS_GUIDE.md b/core/MCP_BUILDER_TOOLS_GUIDE.md index b445b13e..b32b5126 100644 --- a/core/MCP_BUILDER_TOOLS_GUIDE.md +++ b/core/MCP_BUILDER_TOOLS_GUIDE.md @@ -82,7 +82,7 @@ Register an MCP server as a tool source for your agent. "example_tool" ], "total_mcp_servers": 1, - "note": "MCP server 'tools' registered with 6 tools. These tools can now be used in llm_tool_use nodes." + "note": "MCP server 'tools' registered with 6 tools. These tools can now be used in event_loop nodes." } ``` @@ -149,7 +149,7 @@ List tools available from registered MCP servers. 
] }, "total_tools": 6, - "note": "Use these tool names in the 'tools' parameter when adding llm_tool_use nodes" + "note": "Use these tool names in the 'tools' parameter when adding event_loop nodes" } ``` @@ -246,7 +246,7 @@ Here's a complete workflow for building an agent with MCP tools: "node_id": "web-searcher", "name": "Web Search", "description": "Search the web for information", - "node_type": "llm_tool_use", + "node_type": "event_loop", "input_keys": "[\"query\"]", "output_keys": "[\"search_results\"]", "system_prompt": "Search for {query} using the web_search tool", diff --git a/core/MCP_INTEGRATION_GUIDE.md b/core/MCP_INTEGRATION_GUIDE.md index 159ca3b6..11f29b52 100644 --- a/core/MCP_INTEGRATION_GUIDE.md +++ b/core/MCP_INTEGRATION_GUIDE.md @@ -119,7 +119,7 @@ builder = WorkflowBuilder() builder.add_node( node_id="researcher", name="Web Researcher", - node_type="llm_tool_use", + node_type="event_loop", system_prompt="Research the topic using web_search", tools=["web_search"], # Tool from tools MCP server input_keys=["topic"], @@ -137,7 +137,7 @@ Tools from MCP servers can be referenced in your agent.json just like built-in t { "id": "searcher", "name": "Web Searcher", - "node_type": "llm_tool_use", + "node_type": "event_loop", "system_prompt": "Search for information about {topic}", "tools": ["web_search", "web_scrape"], "input_keys": ["topic"], diff --git a/core/MCP_SERVER_GUIDE.md b/core/MCP_SERVER_GUIDE.md index fd520d5d..60b7cce2 100644 --- a/core/MCP_SERVER_GUIDE.md +++ b/core/MCP_SERVER_GUIDE.md @@ -103,31 +103,20 @@ Add a processing node to the agent graph. - `node_id` (string, required): Unique node identifier - `name` (string, required): Human-readable name - `description` (string, required): What this node does -- `node_type` (string, required): One of: `llm_generate`, `llm_tool_use`, `router`, `function` +- `node_type` (string, required): Must be `event_loop` (the only valid type) - `input_keys` (string, required): JSON array of input variable names - `output_keys` (string, required): JSON array of output variable names -- `system_prompt` (string, optional): System prompt for LLM nodes -- `tools` (string, optional): JSON array of tool names for tool_use nodes -- `routes` (string, optional): JSON object of route mappings for router nodes +- `system_prompt` (string, optional): System prompt for the LLM +- `tools` (string, optional): JSON array of tool names +- `client_facing` (boolean, optional): Set to true for human-in-the-loop interaction -**Node Types:** +**Node Type:** -1. **llm_generate**: Uses LLM to generate output from inputs - - Requires: `system_prompt` - - Tools: Not used - -2. **llm_tool_use**: Uses LLM with tools to accomplish tasks - - Requires: `system_prompt`, `tools` - - Tools: Array of tool names (e.g., `["web_search", "web_fetch"]`) - -3. **router**: LLM-powered routing to different paths - - Requires: `system_prompt`, `routes` - - Routes: Object mapping route names to target node IDs - - Example: `{"pass": "success_node", "fail": "retry_node"}` - -4. 
**function**: Executes a pre-defined function - - System prompt describes the function behavior - - No LLM calls, pure computation +**event_loop**: LLM-powered node with self-correction loop +- Requires: `system_prompt` +- Optional: `tools` (array of tool names, e.g., `["web_search", "web_fetch"]`) +- Optional: `client_facing` (set to true for HITL / user interaction) +- Supports: iterative refinement, judge-based evaluation, tool use, streaming **Example:** ```json @@ -135,7 +124,7 @@ Add a processing node to the agent graph. "node_id": "search_sources", "name": "Search Sources", "description": "Searches for relevant sources on the topic", - "node_type": "llm_tool_use", + "node_type": "event_loop", "input_keys": "[\"topic\", \"search_queries\"]", "output_keys": "[\"sources\", \"source_count\"]", "system_prompt": "Search for sources using the provided queries...", @@ -198,7 +187,7 @@ Export the validated graph as an agent specification. **What it does:** 1. Validates the graph -2. Auto-generates missing edges from router routes +2. Validates edge connectivity 3. Writes files to disk: - `exports/{agent-name}/agent.json` - Full agent specification - `exports/{agent-name}/README.md` - Auto-generated documentation @@ -252,47 +241,6 @@ Test the complete agent graph with sample inputs. --- -### Evaluation Rules - -#### `add_evaluation_rule` -Add a rule for the HybridJudge to evaluate node outputs. - -**Parameters:** -- `rule_id` (string, required): Unique rule identifier -- `description` (string, required): What this rule checks -- `condition` (string, required): Python expression to evaluate -- `action` (string, required): Action to take: `accept`, `retry`, `escalate` -- `priority` (integer, optional): Rule priority (default: 0) -- `feedback_template` (string, optional): Feedback message template - -**Condition Examples:** -- `'result.get("success") == True'` - Check for success flag -- `'result.get("error_type") == "timeout"'` - Check error type -- `'len(result.get("data", [])) > 0'` - Check for non-empty data - -**Example:** -```json -{ - "rule_id": "timeout_retry", - "description": "Retry on timeout errors", - "condition": "result.get('error_type') == 'timeout'", - "action": "retry", - "priority": 10, - "feedback_template": "Timeout occurred, retrying..." -} -``` - -#### `list_evaluation_rules` -List all configured evaluation rules. - -#### `remove_evaluation_rule` -Remove an evaluation rule. - -**Parameters:** -- `rule_id` (string, required): Rule to remove - ---- - ## Example Workflow Here's a complete workflow for building a research agent: @@ -320,7 +268,7 @@ add_node( node_id="planner", name="Research Planner", description="Creates research strategy", - node_type="llm_generate", + node_type="event_loop", input_keys='["topic"]', output_keys='["strategy", "queries"]', system_prompt="Analyze topic and create research plan..." @@ -330,7 +278,7 @@ add_node( node_id="searcher", name="Search Sources", description="Find relevant sources", - node_type="llm_tool_use", + node_type="event_loop", input_keys='["queries"]', output_keys='["sources"]', system_prompt="Search for sources...", @@ -359,10 +307,9 @@ The exported agent will be saved to `exports/research-agent/`. 1. **Start with the goal**: Define clear success criteria before building nodes 2. **Test nodes individually**: Use `test_node` to verify each node works -3. **Use router nodes for branching**: Don't create edges manually for routers - define routes and they'll be auto-generated -4. 
**Add evaluation rules**: Help the judge evaluate outputs deterministically -5. **Validate early, validate often**: Run `validate_graph` after adding nodes/edges -6. **Check exports**: Review the generated README.md to verify your agent structure +3. **Use conditional edges for branching**: Define condition_expr on edges for decision points +4. **Validate early, validate often**: Run `validate_graph` after adding nodes/edges +5. **Check exports**: Review the generated README.md to verify your agent structure --- diff --git a/core/README.md b/core/README.md index abd1f973..4b690e31 100644 --- a/core/README.md +++ b/core/README.md @@ -73,7 +73,7 @@ To use the agent builder with Claude Desktop or other MCP clients, add this to y The MCP server provides tools for: - Creating agent building sessions - Defining goals with success criteria -- Adding nodes (llm_generate, llm_tool_use, router, function) +- Adding nodes (event_loop only) - Connecting nodes with edges - Validating and exporting agent graphs - Testing nodes and full agent graphs diff --git a/core/demos/github_outreach_demo.py b/core/demos/github_outreach_demo.py index b7e33471..eb0ba3cf 100644 --- a/core/demos/github_outreach_demo.py +++ b/core/demos/github_outreach_demo.py @@ -68,7 +68,7 @@ from framework.graph.event_loop_node import ( # noqa: E402 ) from framework.graph.executor import GraphExecutor # noqa: E402 from framework.graph.goal import Goal # noqa: E402 -from framework.graph.node import NodeSpec # noqa: E402 +from framework.graph.node import NodeContext, NodeProtocol, NodeResult, NodeSpec # noqa: E402 from framework.llm.litellm import LiteLLMProvider # noqa: E402 from framework.runner.tool_registry import ToolRegistry # noqa: E402 from framework.runtime.core import Runtime # noqa: E402 @@ -654,7 +654,7 @@ NODE_SPECS = { id="sender", name="Sender", description="Send approved campaign emails", - node_type="function", + node_type="event_loop", input_keys=["approved_emails"], output_keys=["send_results"], ), @@ -823,11 +823,20 @@ def _send_email_via_resend( return {"error": f"Network error: {e}"} +class SenderNode(NodeProtocol): + """Node wrapper for send_emails function.""" + + async def execute(self, ctx: NodeContext) -> NodeResult: + approved = ctx.input_data.get("approved_emails", "") + result_str = send_emails(approved_emails=approved) + ctx.memory.write("send_results", result_str) + return NodeResult(success=True, output={"send_results": result_str}) + + def send_emails(approved_emails: str = "") -> str: """Send approved campaign emails via Resend, or log if unconfigured. - Called by FunctionNode which unpacks input_keys as kwargs. - Returns a JSON string (FunctionNode wraps it in NodeResult). + Returns a JSON string. """ approved = approved_emails if not approved: @@ -1780,7 +1789,7 @@ async def _run_pipeline(websocket, initial_message: str): ) for nid, impl in nodes.items(): executor.register_node(nid, impl) - executor.register_function("sender", send_emails) + executor.register_node("sender", SenderNode()) # --- Event forwarding: bus β†’ WebSocket --- diff --git a/core/examples/manual_agent.py b/core/examples/manual_agent.py index f1128581..09f3dc01 100644 --- a/core/examples/manual_agent.py +++ b/core/examples/manual_agent.py @@ -4,8 +4,8 @@ Minimal Manual Agent Example This example demonstrates how to build and run an agent programmatically without using the Claude Code CLI or external LLM APIs. 
-It uses 'function' nodes to define logic in pure Python, making it perfect -for understanding the core runtime loop: +It uses custom NodeProtocol implementations to define logic in pure Python, +making it perfect for understanding the core runtime loop: Setup -> Graph definition -> Execution -> Result Run with: @@ -16,22 +16,33 @@ import asyncio from framework.graph import EdgeCondition, EdgeSpec, Goal, GraphSpec, NodeSpec from framework.graph.executor import GraphExecutor +from framework.graph.node import NodeContext, NodeProtocol, NodeResult from framework.runtime.core import Runtime -# 1. Define Node Logic (Pure Python Functions) -def greet(name: str) -> str: +# 1. Define Node Logic (Custom NodeProtocol implementations) +class GreeterNode(NodeProtocol): """Generate a simple greeting.""" - return f"Hello, {name}!" + + async def execute(self, ctx: NodeContext) -> NodeResult: + name = ctx.input_data.get("name", "World") + greeting = f"Hello, {name}!" + ctx.memory.write("greeting", greeting) + return NodeResult(success=True, output={"greeting": greeting}) -def uppercase(greeting: str) -> str: +class UppercaserNode(NodeProtocol): """Convert text to uppercase.""" - return greeting.upper() + + async def execute(self, ctx: NodeContext) -> NodeResult: + greeting = ctx.input_data.get("greeting") or ctx.memory.read("greeting") or "" + result = greeting.upper() + ctx.memory.write("final_greeting", result) + return NodeResult(success=True, output={"final_greeting": result}) async def main(): - print("🚀 Setting up Manual Agent...") + print("Setting up Manual Agent...") # 2. Define the Goal # Every agent needs a goal with success criteria @@ -55,8 +66,7 @@ async def main(): id="greeter", name="Greeter", description="Generates a simple greeting", - node_type="function", - function="greet", # Matches the registered function name + node_type="event_loop", input_keys=["name"], output_keys=["greeting"], ) @@ -65,8 +75,7 @@ async def main(): id="uppercaser", name="Uppercaser", description="Converts greeting to uppercase", - node_type="function", - function="uppercase", + node_type="event_loop", input_keys=["greeting"], output_keys=["final_greeting"], ) @@ -98,23 +107,23 @@ async def main(): runtime = Runtime(storage_path=Path("./agent_logs")) executor = GraphExecutor(runtime=runtime) - # 7. Register Function Implementations - # Connect string names in NodeSpecs to actual Python functions - executor.register_function("greeter", greet) - executor.register_function("uppercaser", uppercase) + # 7. Register Node Implementations + # Connect node IDs in the graph to actual Python implementations + executor.register_node("greeter", GreeterNode()) + executor.register_node("uppercaser", UppercaserNode()) # 8. Execute Agent - print("▶ Executing agent with input: name='Alice'...") + print("Executing agent with input: name='Alice'...") result = await executor.execute(graph=graph, goal=goal, input_data={"name": "Alice"}) # 9. 
Verify Results if result.success: - print("\n✅ Success!") + print("\nSuccess!") print(f"Path taken: {' -> '.join(result.path)}") print(f"Final output: {result.output.get('final_greeting')}") else: - print(f"\n❌ Failed: {result.error}") + print(f"\nFailed: {result.error}") if __name__ == "__main__": diff --git a/core/examples/mcp_integration_example.py b/core/examples/mcp_integration_example.py index 01a11aba..d1246443 100644 --- a/core/examples/mcp_integration_example.py +++ b/core/examples/mcp_integration_example.py @@ -122,7 +122,7 @@ async def example_4_custom_agent_with_mcp_tools(): node_id="web-searcher", name="Web Search", description="Search the web for information", - node_type="llm_tool_use", + node_type="event_loop", system_prompt="Search for {query} and return the top results. Use the web_search tool.", tools=["web_search"], # This tool comes from tools MCP server input_keys=["query"], @@ -133,7 +133,7 @@ async def example_4_custom_agent_with_mcp_tools(): node_id="summarizer", name="Summarize Results", description="Summarize the search results", - node_type="llm_generate", + node_type="event_loop", system_prompt="Summarize the following search results in 2-3 sentences: {search_results}", input_keys=["search_results"], output_keys=["summary"], diff --git a/core/framework/builder/workflow.py b/core/framework/builder/workflow.py index 5a2fe405..5d5a0e89 100644 --- a/core/framework/builder/workflow.py +++ b/core/framework/builder/workflow.py @@ -245,20 +245,14 @@ class GraphBuilder: warnings.append(f"Node '{node.id}' should have a description") # Type-specific validation - if node.node_type == "llm_tool_use": - if not node.tools: - errors.append(f"LLM tool node '{node.id}' must specify tools") - if not node.system_prompt: - warnings.append(f"LLM node '{node.id}' should have a system_prompt") + if node.node_type == "event_loop": + if node.tools and not node.system_prompt: + warnings.append(f"Event loop node '{node.id}' should have a system_prompt") if node.node_type == "router": if not node.routes: errors.append(f"Router node '{node.id}' must specify routes") - if node.node_type == "function": - if not node.function: - errors.append(f"Function node '{node.id}' must specify function name") - # Check input/output keys if not node.input_keys: suggestions.append(f"Consider specifying input_keys for '{node.id}'") diff --git a/core/framework/graph/__init__.py b/core/framework/graph/__init__.py index 2fbf447e..6fc35c0a 100644 --- a/core/framework/graph/__init__.py +++ b/core/framework/graph/__init__.py @@ -1,4 +1,4 @@ -"""Graph structures: Goals, Nodes, Edges, and Flexible Execution.""" +"""Graph structures: Goals, Nodes, Edges, and Execution.""" from framework.graph.client_io import ( ActiveNodeClientIO, @@ -6,7 +6,6 @@ from framework.graph.client_io import ( InertNodeClientIO, NodeClientIO, ) -from framework.graph.code_sandbox import CodeSandbox, safe_eval, safe_exec from framework.graph.context_handoff import ContextHandoff, HandoffContext from framework.graph.conversation import ConversationStore, Message, NodeConversation from framework.graph.edge import DEFAULT_MAX_TOKENS, EdgeCondition, EdgeSpec, GraphSpec @@ -18,31 +17,9 @@ from framework.graph.event_loop_node import ( OutputAccumulator, ) from framework.graph.executor import GraphExecutor -from framework.graph.flexible_executor import ExecutorConfig, FlexibleGraphExecutor from framework.graph.goal import Constraint, Goal, GoalStatus, SuccessCriterion -from framework.graph.judge import HybridJudge, create_default_judge from 
framework.graph.node import NodeContext, NodeProtocol, NodeResult, NodeSpec -# Flexible execution (Worker-Judge pattern) -from framework.graph.plan import ( - ActionSpec, - ActionType, - # HITL (Human-in-the-loop) - ApprovalDecision, - ApprovalRequest, - ApprovalResult, - EvaluationRule, - ExecutionStatus, - Judgment, - JudgmentAction, - Plan, - PlanExecutionResult, - PlanStep, - StepStatus, - load_export, -) -from framework.graph.worker_node import StepExecutionResult, WorkerNode - __all__ = [ # Goal "Goal", @@ -59,35 +36,8 @@ __all__ = [ "EdgeCondition", "GraphSpec", "DEFAULT_MAX_TOKENS", - # Executor (fixed graph) + # Executor "GraphExecutor", - # Plan (flexible execution) - "Plan", - "PlanStep", - "ActionSpec", - "ActionType", - "StepStatus", - "Judgment", - "JudgmentAction", - "EvaluationRule", - "PlanExecutionResult", - "ExecutionStatus", - "load_export", - # HITL (Human-in-the-loop) - "ApprovalDecision", - "ApprovalRequest", - "ApprovalResult", - # Worker-Judge - "HybridJudge", - "create_default_judge", - "WorkerNode", - "StepExecutionResult", - "FlexibleGraphExecutor", - "ExecutorConfig", - # Code Sandbox - "CodeSandbox", - "safe_exec", - "safe_eval", # Conversation "NodeConversation", "ConversationStore", diff --git a/core/framework/graph/code_sandbox.py b/core/framework/graph/code_sandbox.py deleted file mode 100644 index ee399586..00000000 --- a/core/framework/graph/code_sandbox.py +++ /dev/null @@ -1,413 +0,0 @@ -""" -Code Sandbox for Safe Execution of Dynamic Code. - -Provides a restricted execution environment for code generated by -the external planner. This is critical for open-ended planning where -the planner can create arbitrary code actions. - -Security measures: -1. Restricted builtins (no file I/O, no imports of dangerous modules) -2. Timeout enforcement -3. Memory limits (via resource module on Unix) -4. 
Namespace isolation -""" - -import ast -import signal -import sys -from contextlib import contextmanager -from dataclasses import dataclass, field -from typing import Any - -# Safe builtins whitelist -SAFE_BUILTINS = { - # Basic types - "True": True, - "False": False, - "None": None, - # Type constructors - "bool": bool, - "int": int, - "float": float, - "str": str, - "list": list, - "dict": dict, - "set": set, - "tuple": tuple, - "frozenset": frozenset, - # Basic functions - "abs": abs, - "all": all, - "any": any, - "bin": bin, - "chr": chr, - "divmod": divmod, - "enumerate": enumerate, - "filter": filter, - "format": format, - "hex": hex, - "isinstance": isinstance, - "issubclass": issubclass, - "iter": iter, - "len": len, - "map": map, - "max": max, - "min": min, - "next": next, - "oct": oct, - "ord": ord, - "pow": pow, - "range": range, - "repr": repr, - "reversed": reversed, - "round": round, - "slice": slice, - "sorted": sorted, - "sum": sum, - "zip": zip, -} - -# Modules that can be imported -ALLOWED_MODULES = { - "math", - "json", - "re", - "datetime", - "collections", - "itertools", - "functools", - "operator", - "string", - "random", - "statistics", - "decimal", - "fractions", -} - -# Dangerous AST nodes to block -BLOCKED_AST_NODES = { - ast.Import, - ast.ImportFrom, - ast.Global, - ast.Nonlocal, -} - - -class CodeSandboxError(Exception): - """Error during sandboxed code execution.""" - - pass - - -class TimeoutError(CodeSandboxError): - """Code execution timed out.""" - - pass - - -class SecurityError(CodeSandboxError): - """Code contains potentially dangerous operations.""" - - pass - - -@dataclass -class SandboxResult: - """Result of sandboxed code execution.""" - - success: bool - result: Any = None - error: str | None = None - stdout: str = "" - variables: dict[str, Any] = field(default_factory=dict) - execution_time_ms: int = 0 - - -class RestrictedImporter: - """Custom importer that only allows whitelisted modules.""" - - def __init__(self, allowed_modules: set[str]): - self.allowed_modules = allowed_modules - self._cache: dict[str, Any] = {} - - def __call__(self, name: str, *args, **kwargs): - if name not in self.allowed_modules: - raise SecurityError(f"Import of module '{name}' is not allowed") - - if name not in self._cache: - import importlib - - self._cache[name] = importlib.import_module(name) - - return self._cache[name] - - -class CodeValidator: - """Validates code for safety before execution.""" - - def __init__(self, blocked_nodes: set[type] | None = None): - self.blocked_nodes = blocked_nodes or BLOCKED_AST_NODES - - def validate(self, code: str) -> list[str]: - """ - Validate code and return list of issues. - - Returns empty list if code is safe. 
- """ - issues = [] - - try: - tree = ast.parse(code) - except SyntaxError as e: - return [f"Syntax error: {e}"] - - for node in ast.walk(tree): - # Check for blocked node types - if type(node) in self.blocked_nodes: - lineno = getattr(node, "lineno", "?") - issues.append(f"Blocked operation: {type(node).__name__} at line {lineno}") - - # Check for dangerous attribute access - if isinstance(node, ast.Attribute): - if node.attr.startswith("_"): - issues.append( - f"Access to private attribute '{node.attr}' at line {node.lineno}" - ) - - # Check for exec/eval calls - if isinstance(node, ast.Call): - if isinstance(node.func, ast.Name): - if node.func.id in ("exec", "eval", "compile", "__import__"): - issues.append( - f"Blocked function call: {node.func.id} at line {node.lineno}" - ) - - return issues - - -class CodeSandbox: - """ - Sandboxed environment for executing dynamic code. - - Usage: - sandbox = CodeSandbox(timeout_seconds=5) - result = sandbox.execute( - code="x = 1 + 2\\nresult = x * 3", - inputs={"multiplier": 2}, - ) - if result.success: - print(result.variables["result"]) # 6 - """ - - def __init__( - self, - timeout_seconds: int = 10, - allowed_modules: set[str] | None = None, - safe_builtins: dict[str, Any] | None = None, - ): - self.timeout_seconds = timeout_seconds - self.allowed_modules = allowed_modules or ALLOWED_MODULES - self.safe_builtins = safe_builtins or SAFE_BUILTINS - self.validator = CodeValidator() - self.importer = RestrictedImporter(self.allowed_modules) - - @contextmanager - def _timeout_context(self, seconds: int): - """Context manager for timeout enforcement.""" - - def handler(signum, frame): - raise TimeoutError(f"Code execution timed out after {seconds} seconds") - - # Only works on Unix-like systems - if hasattr(signal, "SIGALRM"): - old_handler = signal.signal(signal.SIGALRM, handler) - signal.alarm(seconds) - try: - yield - finally: - signal.alarm(0) - signal.signal(signal.SIGALRM, old_handler) - else: - # Windows: no timeout support, just execute - yield - - def _create_namespace(self, inputs: dict[str, Any]) -> dict[str, Any]: - """Create isolated namespace for code execution.""" - namespace = { - "__builtins__": dict(self.safe_builtins), - "__import__": self.importer, - } - - # Add input variables - namespace.update(inputs) - - return namespace - - def execute( - self, - code: str, - inputs: dict[str, Any] | None = None, - extract_vars: list[str] | None = None, - ) -> SandboxResult: - """ - Execute code in sandbox. 
- - Args: - code: Python code to execute - inputs: Variables to inject into namespace - extract_vars: Variable names to extract from namespace after execution - - Returns: - SandboxResult with execution outcome - """ - import time - - inputs = inputs or {} - extract_vars = extract_vars or [] - - # Validate code first - issues = self.validator.validate(code) - if issues: - return SandboxResult( - success=False, - error=f"Code validation failed: {'; '.join(issues)}", - ) - - # Create isolated namespace - namespace = self._create_namespace(inputs) - - # Capture stdout - import io - - old_stdout = sys.stdout - sys.stdout = captured_stdout = io.StringIO() - - start_time = time.time() - - try: - with self._timeout_context(self.timeout_seconds): - # Compile and execute - compiled = compile(code, "", "exec") - exec(compiled, namespace) - - execution_time_ms = int((time.time() - start_time) * 1000) - - # Extract requested variables - extracted = {} - for var in extract_vars: - if var in namespace: - extracted[var] = namespace[var] - - # Also extract any new variables (not in inputs or builtins) - for key, value in namespace.items(): - if key not in inputs and key not in self.safe_builtins and not key.startswith("_"): - extracted[key] = value - - return SandboxResult( - success=True, - result=namespace.get("result"), # Convention: 'result' is the return value - stdout=captured_stdout.getvalue(), - variables=extracted, - execution_time_ms=execution_time_ms, - ) - - except TimeoutError as e: - return SandboxResult( - success=False, - error=str(e), - execution_time_ms=self.timeout_seconds * 1000, - ) - - except SecurityError as e: - return SandboxResult( - success=False, - error=f"Security violation: {e}", - execution_time_ms=int((time.time() - start_time) * 1000), - ) - - except Exception as e: - return SandboxResult( - success=False, - error=f"{type(e).__name__}: {e}", - stdout=captured_stdout.getvalue(), - execution_time_ms=int((time.time() - start_time) * 1000), - ) - - finally: - sys.stdout = old_stdout - - def execute_expression( - self, - expression: str, - inputs: dict[str, Any] | None = None, - ) -> SandboxResult: - """ - Execute a single expression and return its value. - - Simpler than execute() - just evaluates one expression. - """ - inputs = inputs or {} - - # Validate - try: - ast.parse(expression, mode="eval") - except SyntaxError as e: - return SandboxResult(success=False, error=f"Syntax error: {e}") - - namespace = self._create_namespace(inputs) - - try: - with self._timeout_context(self.timeout_seconds): - result = eval(expression, namespace) - - return SandboxResult(success=True, result=result) - - except Exception as e: - return SandboxResult( - success=False, - error=f"{type(e).__name__}: {e}", - ) - - -# Singleton instance with default settings -default_sandbox = CodeSandbox() - - -def safe_exec( - code: str, - inputs: dict[str, Any] | None = None, - timeout_seconds: int = 10, -) -> SandboxResult: - """ - Convenience function for safe code execution. - - Args: - code: Python code to execute - inputs: Variables to inject - timeout_seconds: Max execution time - - Returns: - SandboxResult - """ - sandbox = CodeSandbox(timeout_seconds=timeout_seconds) - return sandbox.execute(code, inputs) - - -def safe_eval( - expression: str, - inputs: dict[str, Any] | None = None, - timeout_seconds: int = 5, -) -> SandboxResult: - """ - Convenience function for safe expression evaluation. 
- - Args: - expression: Python expression to evaluate - inputs: Variables to inject - timeout_seconds: Max execution time - - Returns: - SandboxResult - """ - sandbox = CodeSandbox(timeout_seconds=timeout_seconds) - return sandbox.execute_expression(expression, inputs) diff --git a/core/framework/graph/executor.py b/core/framework/graph/executor.py index 1eeda134..d91a6e64 100644 --- a/core/framework/graph/executor.py +++ b/core/framework/graph/executor.py @@ -11,7 +11,6 @@ The executor: import asyncio import logging -import warnings from collections.abc import Callable from dataclasses import dataclass, field from pathlib import Path @@ -21,13 +20,10 @@ from framework.graph.checkpoint_config import CheckpointConfig from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec from framework.graph.goal import Goal from framework.graph.node import ( - FunctionNode, - LLMNode, NodeContext, NodeProtocol, NodeResult, NodeSpec, - RouterNode, SharedMemory, ) from framework.graph.output_cleaner import CleansingConfig, OutputCleaner @@ -833,9 +829,13 @@ class GraphExecutor: # [CORRECTED] Use node_spec.max_retries instead of hardcoded 3 max_retries = getattr(node_spec, "max_retries", 3) - # Event loop nodes handle retry internally via judge — - # executor retry is catastrophic (retry multiplication) - if node_spec.node_type == "event_loop" and max_retries > 0: + # EventLoopNode instances handle retry internally via judge — + # executor retry would cause catastrophic retry multiplication. + # Only override for actual EventLoopNode instances, not custom + # NodeProtocol implementations that happen to use node_type="event_loop" + from framework.graph.event_loop_node import EventLoopNode + + if isinstance(node_impl, EventLoopNode) and max_retries > 0: self.logger.warning( f"EventLoopNode '{node_spec.id}' has max_retries={max_retries}. " "Overriding to 0 — event loop nodes handle retry internally via judge." ) @@ -1465,16 +1465,17 @@ class GraphExecutor: cumulative_output_keys=cumulative_output_keys or [], ) - # Valid node types - no ambiguous "llm" type allowed VALID_NODE_TYPES = { - "llm_tool_use", - "llm_generate", - "router", - "function", - "human_input", "event_loop", } - DEPRECATED_NODE_TYPES = {"llm_tool_use": "event_loop", "llm_generate": "event_loop"} + # Node types removed in v0.5 — provide migration guidance + REMOVED_NODE_TYPES = { + "function": "event_loop", + "llm_tool_use": "event_loop", + "llm_generate": "event_loop", + "router": "event_loop", # Unused theoretical infrastructure + "human_input": "event_loop", # Use client_facing=True instead + } def _get_node_implementation( self, node_spec: NodeSpec, cleanup_llm_model: str | None = None @@ -1484,62 +1485,23 @@ class GraphExecutor: if node_spec.id in self.node_registry: return self.node_registry[node_spec.id] + # Reject removed node types with migration guidance + if node_spec.node_type in self.REMOVED_NODE_TYPES: + replacement = self.REMOVED_NODE_TYPES[node_spec.node_type] + raise RuntimeError( + f"Node type '{node_spec.node_type}' was removed in v0.5. " + f"Migrate node '{node_spec.id}' to '{replacement}'. " + f"See https://github.com/adenhq/hive/issues/4753 for migration guide." + ) + # Validate node type if node_spec.node_type not in self.VALID_NODE_TYPES: raise RuntimeError( f"Invalid node type '{node_spec.node_type}' for node '{node_spec.id}'. " - f"Must be one of: {sorted(self.VALID_NODE_TYPES)}. " - f"Use 'llm_tool_use' for nodes that call tools, 'llm_generate' for text generation." 
- ) - - # Warn on deprecated node types - if node_spec.node_type in self.DEPRECATED_NODE_TYPES: - replacement = self.DEPRECATED_NODE_TYPES[node_spec.node_type] - warnings.warn( - f"Node type '{node_spec.node_type}' is deprecated. " - f"Use '{replacement}' instead. " - f"Node: '{node_spec.id}'", - DeprecationWarning, - stacklevel=2, - ) - - # Create based on type - if node_spec.node_type == "llm_tool_use": - if not node_spec.tools: - raise RuntimeError( - f"Node '{node_spec.id}' is type 'llm_tool_use' but declares no tools. " - "Either add tools to the node or change type to 'llm_generate'." - ) - return LLMNode( - tool_executor=self.tool_executor, - require_tools=True, - cleanup_llm_model=cleanup_llm_model, - ) - - if node_spec.node_type == "llm_generate": - return LLMNode( - tool_executor=None, - require_tools=False, - cleanup_llm_model=cleanup_llm_model, - ) - - if node_spec.node_type == "router": - return RouterNode() - - if node_spec.node_type == "function": - # Function nodes need explicit registration - raise RuntimeError( - f"Function node '{node_spec.id}' not registered. Register with node_registry." - ) - - if node_spec.node_type == "human_input": - # Human input nodes are handled specially by HITL mechanism - return LLMNode( - tool_executor=None, - require_tools=False, - cleanup_llm_model=cleanup_llm_model, + f"Must be one of: {sorted(self.VALID_NODE_TYPES)}." ) + # Create based on type (only event_loop is valid) if node_spec.node_type == "event_loop": # Auto-create EventLoopNode with sensible defaults. # Custom configs can still be pre-registered via node_registry. @@ -1799,14 +1761,19 @@ class GraphExecutor: branch.error = f"Node {branch.node_id} not found in graph" return branch, RuntimeError(branch.error) + # Get node implementation to check its type + branch_impl = self._get_node_implementation(node_spec, graph.cleanup_llm_model) + effective_max_retries = node_spec.max_retries - if node_spec.node_type == "event_loop": - if effective_max_retries > 1: - self.logger.warning( - f"EventLoopNode '{node_spec.id}' has " - f"max_retries={effective_max_retries}. Overriding " - "to 1 — event loop nodes handle retry internally." - ) + # Only override for actual EventLoopNode instances, not custom NodeProtocol impls + from framework.graph.event_loop_node import EventLoopNode + + if isinstance(branch_impl, EventLoopNode) and effective_max_retries > 1: + self.logger.warning( + f"EventLoopNode '{node_spec.id}' has " + f"max_retries={effective_max_retries}. Overriding " + "to 1 — event loop nodes handle retry internally." + ) effective_max_retries = 1 branch.status = "running" @@ -1972,10 +1939,6 @@ class GraphExecutor: """Register a custom node implementation.""" self.node_registry[node_id] = implementation - def register_function(self, node_id: str, func: Callable) -> None: - """Register a function as a node.""" - self.node_registry[node_id] = FunctionNode(func) - def request_pause(self) -> None: """ Request graceful pause of the current execution. diff --git a/core/framework/graph/flexible_executor.py b/core/framework/graph/flexible_executor.py deleted file mode 100644 index c3a56591..00000000 --- a/core/framework/graph/flexible_executor.py +++ /dev/null @@ -1,552 +0,0 @@ -""" -Flexible Graph Executor with Worker-Judge Loop. - -Executes plans created by external planner (Claude Code, etc.) -using a Worker-Judge loop: - -1. External planner creates Plan -2. FlexibleGraphExecutor receives Plan -3. Worker executes each step -4. Judge evaluates each result -5. 
If Judge says "replan" → return to external planner with feedback -6. If Judge says "escalate" → request human intervention -7. If all steps complete → return success - -This keeps planning external while execution/evaluation is internal. -""" - -from collections.abc import Callable -from dataclasses import dataclass -from datetime import datetime -from typing import Any - -from framework.graph.code_sandbox import CodeSandbox -from framework.graph.goal import Goal -from framework.graph.judge import HybridJudge, create_default_judge -from framework.graph.plan import ( - ApprovalDecision, - ApprovalRequest, - ApprovalResult, - ExecutionStatus, - Judgment, - JudgmentAction, - Plan, - PlanExecutionResult, - PlanStep, - StepStatus, -) -from framework.graph.worker_node import StepExecutionResult, WorkerNode -from framework.llm.provider import LLMProvider, Tool -from framework.runtime.core import Runtime - -# Type alias for approval callback -ApprovalCallback = Callable[[ApprovalRequest], ApprovalResult] - - -@dataclass -class ExecutorConfig: - """Configuration for FlexibleGraphExecutor.""" - - max_retries_per_step: int = 3 - max_total_steps: int = 100 - timeout_seconds: int = 300 - enable_parallel_execution: bool = False # Future: parallel step execution - - -class FlexibleGraphExecutor: - """ - Executes plans with Worker-Judge loop. - - Plans come from external source (Claude Code, etc.). - Returns feedback for replanning if needed. - - Usage: - executor = FlexibleGraphExecutor( - runtime=runtime, - llm=llm_provider, - tools=tools, - ) - - result = await executor.execute_plan(plan, goal, context) - - if result.status == ExecutionStatus.NEEDS_REPLAN: - # External planner should create new plan using result.feedback - new_plan = external_planner.replan(result.feedback_context) - result = await executor.execute_plan(new_plan, goal, result.feedback_context) - """ - - def __init__( - self, - runtime: Runtime, - llm: LLMProvider | None = None, - tools: dict[str, Tool] | None = None, - tool_executor: Callable | None = None, - functions: dict[str, Callable] | None = None, - judge: HybridJudge | None = None, - config: ExecutorConfig | None = None, - approval_callback: ApprovalCallback | None = None, - ): - """ - Initialize the FlexibleGraphExecutor. - - Args: - runtime: Runtime for decision logging - llm: LLM provider for Worker and Judge - tools: Available tools - tool_executor: Function to execute tools - functions: Registered functions - judge: Custom judge (defaults to HybridJudge with default rules) - config: Executor configuration - approval_callback: Callback for human-in-the-loop approval. - If None, steps requiring approval will pause execution. - """ - self.runtime = runtime - self.llm = llm - self.tools = tools or {} - self.tool_executor = tool_executor - self.functions = functions or {} - self.config = config or ExecutorConfig() - self.approval_callback = approval_callback - - # Create judge - self.judge = judge or create_default_judge(llm) - - # Create worker - self.worker = WorkerNode( - runtime=runtime, - llm=llm, - tools=tools, - tool_executor=tool_executor, - functions=functions, - sandbox=CodeSandbox(), - ) - - async def execute_plan( - self, - plan: Plan, - goal: Goal, - context: dict[str, Any] | None = None, - ) -> PlanExecutionResult: - """ - Execute a plan created by external planner. 
- - Args: - plan: The plan to execute - goal: The goal context - context: Initial context (e.g., from previous execution) - - Returns: - PlanExecutionResult with status and feedback - """ - context = context or {} - context.update(plan.context) # Merge plan's accumulated context - - # Start run - _run_id = self.runtime.start_run( - goal_id=goal.id, - goal_description=goal.description, - input_data={"plan_id": plan.id, "revision": plan.revision}, - ) - - steps_executed = 0 - total_tokens = 0 - total_latency = 0 - - try: - while steps_executed < self.config.max_total_steps: - # Get next ready steps - ready_steps = plan.get_ready_steps() - - if not ready_steps: - # Check if we're done or stuck - if plan.is_complete(): - break - else: - # No ready steps but not complete - something's wrong - return self._create_result( - status=ExecutionStatus.NEEDS_REPLAN, - plan=plan, - context=context, - feedback=( - "No executable steps available but plan not complete. " - "Check dependencies." - ), - steps_executed=steps_executed, - total_tokens=total_tokens, - total_latency=total_latency, - ) - - # Execute next step (for now, sequential; could be parallel) - step = ready_steps[0] - # Debug: show ready steps - # ready_ids = [s.id for s in ready_steps] - # print(f" [DEBUG] Ready steps: {ready_ids}, executing: {step.id}") - - # APPROVAL CHECK - before execution - if step.requires_approval: - approval_result = await self._request_approval(step, context) - - if approval_result is None: - # No callback, pause execution - step.status = StepStatus.AWAITING_APPROVAL - return self._create_result( - status=ExecutionStatus.AWAITING_APPROVAL, - plan=plan, - context=context, - feedback=f"Step '{step.id}' requires approval: {step.description}", - steps_executed=steps_executed, - total_tokens=total_tokens, - total_latency=total_latency, - ) - - if approval_result.decision == ApprovalDecision.REJECT: - step.status = StepStatus.REJECTED - step.error = approval_result.reason or "Rejected by human" - # Skip this step and continue with dependents marked as skipped - self._skip_dependent_steps(plan, step.id) - continue - - if approval_result.decision == ApprovalDecision.ABORT: - return self._create_result( - status=ExecutionStatus.ABORTED, - plan=plan, - context=context, - feedback=approval_result.reason or "Aborted by human", - steps_executed=steps_executed, - total_tokens=total_tokens, - total_latency=total_latency, - ) - - if approval_result.decision == ApprovalDecision.MODIFY: - # Apply modifications to step - if approval_result.modifications: - self._apply_modifications(step, approval_result.modifications) - - # APPROVE - continue to execution - - step.status = StepStatus.IN_PROGRESS - step.started_at = datetime.now() - step.attempts += 1 - - # WORK - work_result = await self.worker.execute(step, context) - steps_executed += 1 - total_tokens += work_result.tokens_used - total_latency += work_result.latency_ms - - # JUDGE - judgment = await self.judge.evaluate( - step=step, - result=work_result.__dict__, - goal=goal, - context=context, - ) - - # Handle judgment - result = await self._handle_judgment( - step=step, - work_result=work_result, - judgment=judgment, - plan=plan, - goal=goal, - context=context, - steps_executed=steps_executed, - total_tokens=total_tokens, - total_latency=total_latency, - ) - - if result is not None: - # Judgment resulted in early return (replan/escalate) - self.runtime.end_run( - success=False, - narrative=f"Execution stopped: {result.status.value}", - ) - return result - - # All steps completed 
successfully - self.runtime.end_run( - success=True, - output_data=context, - narrative=f"Plan completed: {steps_executed} steps executed", - ) - - return self._create_result( - status=ExecutionStatus.COMPLETED, - plan=plan, - context=context, - steps_executed=steps_executed, - total_tokens=total_tokens, - total_latency=total_latency, - ) - - except Exception as e: - self.runtime.report_problem( - severity="critical", - description=str(e), - ) - self.runtime.end_run( - success=False, - narrative=f"Execution failed: {e}", - ) - - return PlanExecutionResult( - status=ExecutionStatus.FAILED, - error=str(e), - feedback=f"Execution error: {e}", - feedback_context=plan.to_feedback_context(), - completed_steps=[s.id for s in plan.get_completed_steps()], - steps_executed=steps_executed, - total_tokens=total_tokens, - total_latency_ms=total_latency, - ) - - async def _handle_judgment( - self, - step: PlanStep, - work_result: StepExecutionResult, - judgment: Judgment, - plan: Plan, - goal: Goal, - context: dict[str, Any], - steps_executed: int, - total_tokens: int, - total_latency: int, - ) -> PlanExecutionResult | None: - """ - Handle judgment and return result if execution should stop. - - Returns None to continue execution, or PlanExecutionResult to stop. - """ - if judgment.action == JudgmentAction.ACCEPT: - # Step succeeded - update state and continue - step.status = StepStatus.COMPLETED - step.completed_at = datetime.now() - step.result = work_result.outputs - - # Map outputs to expected output keys - # If output has generic "result" key but step expects specific keys, map it - outputs_to_store = work_result.outputs.copy() - if step.expected_outputs and "result" in outputs_to_store: - result_value = outputs_to_store["result"] - # For each expected output key that's not in outputs, map from "result" - for expected_key in step.expected_outputs: - if expected_key not in outputs_to_store: - outputs_to_store[expected_key] = result_value - - # Update context with mapped outputs - context.update(outputs_to_store) - - # Store in plan context for replanning feedback - plan.context[step.id] = outputs_to_store - - return None # Continue execution - - elif judgment.action == JudgmentAction.RETRY: - # Retry step if under limit - if step.attempts < step.max_retries: - step.status = StepStatus.PENDING - step.error = judgment.feedback - - # Record retry decision - self.runtime.decide( - intent=f"Retry step {step.id}", - options=[{"id": "retry", "description": "Retry with feedback"}], - chosen="retry", - reasoning=judgment.reasoning, - context={"attempt": step.attempts, "feedback": judgment.feedback}, - ) - - return None # Continue (step will be retried) - else: - # Max retries exceeded - escalate to replan - step.status = StepStatus.FAILED - step.error = f"Max retries ({step.max_retries}) exceeded: {judgment.feedback}" - - return self._create_result( - status=ExecutionStatus.NEEDS_REPLAN, - plan=plan, - context=context, - feedback=( - f"Step '{step.id}' failed after {step.attempts} attempts: " - f"{judgment.feedback}" - ), - steps_executed=steps_executed, - total_tokens=total_tokens, - total_latency=total_latency, - ) - - elif judgment.action == JudgmentAction.REPLAN: - # Return to external planner - step.status = StepStatus.FAILED - step.error = judgment.feedback - - return self._create_result( - status=ExecutionStatus.NEEDS_REPLAN, - plan=plan, - context=context, - feedback=judgment.feedback or f"Step '{step.id}' requires replanning", - steps_executed=steps_executed, - total_tokens=total_tokens, - 
total_latency=total_latency, - ) - - elif judgment.action == JudgmentAction.ESCALATE: - # Request human intervention - return self._create_result( - status=ExecutionStatus.NEEDS_ESCALATION, - plan=plan, - context=context, - feedback=judgment.feedback or f"Step '{step.id}' requires human intervention", - steps_executed=steps_executed, - total_tokens=total_tokens, - total_latency=total_latency, - ) - - return None # Unknown action - continue - - def _create_result( - self, - status: ExecutionStatus, - plan: Plan, - context: dict[str, Any], - feedback: str | None = None, - steps_executed: int = 0, - total_tokens: int = 0, - total_latency: int = 0, - ) -> PlanExecutionResult: - """Create a PlanExecutionResult.""" - return PlanExecutionResult( - status=status, - results=context, - feedback=feedback, - feedback_context=plan.to_feedback_context(), - completed_steps=[s.id for s in plan.get_completed_steps()], - steps_executed=steps_executed, - total_tokens=total_tokens, - total_latency_ms=total_latency, - ) - - def register_function(self, name: str, func: Callable) -> None: - """Register a function for FUNCTION actions.""" - self.functions[name] = func - self.worker.register_function(name, func) - - def register_tool(self, tool: Tool) -> None: - """Register a tool for TOOL_USE actions.""" - self.tools[tool.name] = tool - self.worker.register_tool(tool) - - def add_evaluation_rule(self, rule) -> None: - """Add an evaluation rule to the judge.""" - self.judge.add_rule(rule) - - async def _request_approval( - self, - step: PlanStep, - context: dict[str, Any], - ) -> ApprovalResult | None: - """ - Request human approval for a step. - - Returns None if no callback is set (execution should pause). - """ - if self.approval_callback is None: - return None - - # Build preview of what will happen - preview_parts = [] - if step.action.tool_name: - preview_parts.append(f"Tool: {step.action.tool_name}") - if step.action.tool_args: - import json - - args_preview = json.dumps(step.action.tool_args, indent=2, default=str) - if len(args_preview) > 500: - args_preview = args_preview[:500] + "..." - preview_parts.append(f"Args: {args_preview}") - elif step.action.prompt: - prompt_preview = ( - step.action.prompt[:300] + "..." 
- if len(step.action.prompt) > 300 - else step.action.prompt - ) - preview_parts.append(f"Prompt: {prompt_preview}") - - # Include step inputs resolved from context (what will be sent/used) - relevant_context = {} - for input_key, input_value in step.inputs.items(): - # Resolve variable references like "$email_sequence" - if isinstance(input_value, str) and input_value.startswith("$"): - context_key = input_value[1:] # Remove $ prefix - if context_key in context: - relevant_context[input_key] = context[context_key] - else: - relevant_context[input_key] = input_value - - request = ApprovalRequest( - step_id=step.id, - step_description=step.description, - action_type=step.action.action_type.value, - action_details={ - "tool_name": step.action.tool_name, - "tool_args": step.action.tool_args, - "prompt": step.action.prompt, - }, - context=relevant_context, - approval_message=step.approval_message, - preview="\n".join(preview_parts) if preview_parts else None, - ) - - return self.approval_callback(request) - - def _skip_dependent_steps(self, plan: Plan, rejected_step_id: str) -> None: - """Mark steps that depend on a rejected step as skipped.""" - for step in plan.steps: - if rejected_step_id in step.dependencies: - if step.status == StepStatus.PENDING: - step.status = StepStatus.SKIPPED - step.error = f"Skipped because dependency '{rejected_step_id}' was rejected" - # Recursively skip dependents - self._skip_dependent_steps(plan, step.id) - - def _apply_modifications(self, step: PlanStep, modifications: dict[str, Any]) -> None: - """Apply human modifications to a step before execution.""" - # Allow modifying tool args - if "tool_args" in modifications and step.action.tool_args: - step.action.tool_args.update(modifications["tool_args"]) - - # Allow modifying prompt - if "prompt" in modifications: - step.action.prompt = modifications["prompt"] - - # Allow modifying inputs - if "inputs" in modifications: - step.inputs.update(modifications["inputs"]) - - def set_approval_callback(self, callback: ApprovalCallback) -> None: - """Set the approval callback for HITL steps.""" - self.approval_callback = callback - - -# Convenience function for simple execution -async def execute_plan( - plan: Plan, - goal: Goal, - runtime: Runtime, - llm: LLMProvider | None = None, - tools: dict[str, Tool] | None = None, - tool_executor: Callable | None = None, - context: dict[str, Any] | None = None, -) -> PlanExecutionResult: - """ - Execute a plan with default configuration. - - Convenience function for simple use cases. - """ - executor = FlexibleGraphExecutor( - runtime=runtime, - llm=llm, - tools=tools, - tool_executor=tool_executor, - ) - return await executor.execute_plan(plan, goal, context) diff --git a/core/framework/graph/judge.py b/core/framework/graph/judge.py deleted file mode 100644 index 4bc8092c..00000000 --- a/core/framework/graph/judge.py +++ /dev/null @@ -1,406 +0,0 @@ -""" -Hybrid Judge for Evaluating Plan Step Results. - -The HybridJudge evaluates step execution results using: -1. Rule-based evaluation (fast, deterministic) -2. 
LLM-based evaluation (fallback for ambiguous cases) - -Escalation path: rules → LLM → human -""" - -from dataclasses import dataclass, field -from typing import Any - -from framework.graph.code_sandbox import safe_eval -from framework.graph.goal import Goal -from framework.graph.plan import ( - EvaluationRule, - Judgment, - JudgmentAction, - PlanStep, -) -from framework.llm.provider import LLMProvider - - -@dataclass -class RuleEvaluationResult: - """Result of rule-based evaluation.""" - - is_definitive: bool # True if a rule matched definitively - judgment: Judgment | None = None - context: dict[str, Any] = field(default_factory=dict) - rules_checked: int = 0 - rule_matched: str | None = None - - -class HybridJudge: - """ - Evaluates plan step results using rules first, then LLM fallback. - - Usage: - judge = HybridJudge(llm=llm_provider) - judge.add_rule(EvaluationRule( - id="success_check", - condition="result.get('success') == True", - action=JudgmentAction.ACCEPT, - )) - - judgment = await judge.evaluate(step, result, goal) - """ - - def __init__( - self, - llm: LLMProvider | None = None, - rules: list[EvaluationRule] | None = None, - llm_confidence_threshold: float = 0.7, - ): - """ - Initialize the HybridJudge. - - Args: - llm: LLM provider for ambiguous cases - rules: Initial evaluation rules - llm_confidence_threshold: Confidence below this triggers escalation - """ - self.llm = llm - self.rules: list[EvaluationRule] = rules or [] - self.llm_confidence_threshold = llm_confidence_threshold - - # Sort rules by priority (higher first) - self._sort_rules() - - def _sort_rules(self): - """Sort rules by priority.""" - self.rules.sort(key=lambda r: -r.priority) - - def add_rule(self, rule: EvaluationRule) -> None: - """Add an evaluation rule.""" - self.rules.append(rule) - self._sort_rules() - - def remove_rule(self, rule_id: str) -> bool: - """Remove a rule by ID. Returns True if found and removed.""" - for i, rule in enumerate(self.rules): - if rule.id == rule_id: - self.rules.pop(i) - return True - return False - - async def evaluate( - self, - step: PlanStep, - result: Any, - goal: Goal, - context: dict[str, Any] | None = None, - ) -> Judgment: - """ - Evaluate a step result. 
- - Args: - step: The executed plan step - result: The result of executing the step - goal: The goal context for evaluation - context: Additional context from previous steps - - Returns: - Judgment with action and feedback - """ - context = context or {} - - # Try rule-based evaluation first - rule_result = self._evaluate_rules(step, result, goal, context) - - if rule_result.is_definitive: - return rule_result.judgment - - # Fall back to LLM evaluation - if self.llm: - return await self._evaluate_llm(step, result, goal, context, rule_result) - - # No LLM available - default to accept with low confidence - return Judgment( - action=JudgmentAction.ACCEPT, - reasoning="No definitive rule matched and no LLM available for evaluation", - confidence=0.5, - llm_used=False, - ) - - def _evaluate_rules( - self, - step: PlanStep, - result: Any, - goal: Goal, - context: dict[str, Any], - ) -> RuleEvaluationResult: - """Evaluate step using rules.""" - rules_checked = 0 - - # Build evaluation context - eval_context = { - "step": step.model_dump() if hasattr(step, "model_dump") else step, - "result": result, - "goal": goal.model_dump() if hasattr(goal, "model_dump") else goal, - "context": context, - "success": isinstance(result, dict) and result.get("success", False), - "error": isinstance(result, dict) and result.get("error"), - } - - for rule in self.rules: - rules_checked += 1 - - # Evaluate rule condition - eval_result = safe_eval(rule.condition, eval_context) - - if eval_result.success and eval_result.result: - # Rule matched! - feedback = self._format_feedback(rule.feedback_template, eval_context) - - return RuleEvaluationResult( - is_definitive=True, - judgment=Judgment( - action=rule.action, - reasoning=rule.description, - feedback=feedback if feedback else None, - rule_matched=rule.id, - confidence=1.0, - llm_used=False, - ), - rules_checked=rules_checked, - rule_matched=rule.id, - ) - - # No rule matched definitively - return RuleEvaluationResult( - is_definitive=False, - context=eval_context, - rules_checked=rules_checked, - ) - - def _format_feedback( - self, - template: str, - context: dict[str, Any], - ) -> str: - """Format feedback template with context values.""" - if not template: - return "" - - try: - return template.format(**context) - except (KeyError, ValueError): - return template - - async def _evaluate_llm( - self, - step: PlanStep, - result: Any, - goal: Goal, - context: dict[str, Any], - rule_result: RuleEvaluationResult, - ) -> Judgment: - """Evaluate step using LLM.""" - system_prompt = self._build_llm_system_prompt(goal) - user_prompt = self._build_llm_user_prompt(step, result, context, rule_result) - - try: - response = await self.llm.acomplete( - messages=[{"role": "user", "content": user_prompt}], - system=system_prompt, - ) - - # Parse LLM response - judgment = self._parse_llm_response(response.content) - judgment.llm_used = True - - # Check confidence threshold - if judgment.confidence < self.llm_confidence_threshold: - # Low confidence - escalate - return Judgment( - action=JudgmentAction.ESCALATE, - reasoning=( - f"LLM confidence ({judgment.confidence:.2f}) " - f"below threshold ({self.llm_confidence_threshold})" - ), - feedback=judgment.feedback, - confidence=judgment.confidence, - llm_used=True, - context={"original_judgment": judgment.model_dump()}, - ) - - return judgment - - except Exception as e: - # LLM failed - escalate - return Judgment( - action=JudgmentAction.ESCALATE, - reasoning=f"LLM evaluation failed: {e}", - feedback="Human review needed due to LLM 
error", - llm_used=True, - ) - - def _build_llm_system_prompt(self, goal: Goal) -> str: - """Build system prompt for LLM judge.""" - return f"""You are a judge evaluating the execution of a plan step. - -GOAL: {goal.description} - -SUCCESS CRITERIA: -{chr(10).join(f"- {sc.description}" for sc in goal.success_criteria)} - -CONSTRAINTS: -{chr(10).join(f"- {c.description}" for c in goal.constraints)} - -Your task is to evaluate whether the step was executed successfully and decide the next action. - -Respond in this exact format: -ACTION: [ACCEPT|RETRY|REPLAN|ESCALATE] -CONFIDENCE: [0.0-1.0] -REASONING: [Your reasoning] -FEEDBACK: [Feedback for retry/replan, or empty if accepting] - -Actions: -- ACCEPT: Step completed successfully, continue to next step -- RETRY: Step failed but can be retried with feedback -- REPLAN: Step failed in a way that requires replanning -- ESCALATE: Requires human intervention -""" - - def _build_llm_user_prompt( - self, - step: PlanStep, - result: Any, - context: dict[str, Any], - rule_result: RuleEvaluationResult, - ) -> str: - """Build user prompt for LLM judge.""" - return f"""Evaluate this step execution: - -STEP: {step.description} -STEP ID: {step.id} -ACTION TYPE: {step.action.action_type} -EXPECTED OUTPUTS: {step.expected_outputs} - -RESULT: -{result} - -CONTEXT FROM PREVIOUS STEPS: -{context} - -RULES CHECKED: {rule_result.rules_checked} (none matched definitively) - -Please evaluate and provide your judgment.""" - - def _parse_llm_response(self, response: str) -> Judgment: - """Parse LLM response into Judgment.""" - lines = response.strip().split("\n") - - action = JudgmentAction.ACCEPT - confidence = 0.8 - reasoning = "" - feedback = "" - - for line in lines: - line = line.strip() - if line.startswith("ACTION:"): - action_str = line.split(":", 1)[1].strip().upper() - try: - action = JudgmentAction(action_str.lower()) - except ValueError: - action = JudgmentAction.ESCALATE - - elif line.startswith("CONFIDENCE:"): - try: - confidence = float(line.split(":", 1)[1].strip()) - except ValueError: - confidence = 0.5 - - elif line.startswith("REASONING:"): - reasoning = line.split(":", 1)[1].strip() - - elif line.startswith("FEEDBACK:"): - feedback = line.split(":", 1)[1].strip() - - return Judgment( - action=action, - reasoning=reasoning or "LLM evaluation", - feedback=feedback if feedback else None, - confidence=confidence, - ) - - -# Factory function for creating judge with common rules -def create_default_judge(llm: LLMProvider | None = None) -> HybridJudge: - """ - Create a HybridJudge with commonly useful default rules. - - Args: - llm: LLM provider for fallback evaluation - - Returns: - Configured HybridJudge instance - """ - judge = HybridJudge(llm=llm) - - # Rule: Accept on explicit success flag - judge.add_rule( - EvaluationRule( - id="explicit_success", - description="Step explicitly marked as successful", - condition="isinstance(result, dict) and result.get('success') == True", - action=JudgmentAction.ACCEPT, - priority=100, - ) - ) - - # Rule: Retry on transient errors - judge.add_rule( - EvaluationRule( - id="transient_error_retry", - description="Transient error that can be retried", - condition=( - "isinstance(result, dict) and " - "result.get('error_type') in ['timeout', 'rate_limit', 'connection_error']" - ), - action=JudgmentAction.RETRY, - feedback_template="Transient error: {result[error]}. 
Please retry.", - priority=90, - ) - ) - - # Rule: Replan on missing data - judge.add_rule( - EvaluationRule( - id="missing_data_replan", - description="Required data not available", - condition="isinstance(result, dict) and result.get('error_type') == 'missing_data'", - action=JudgmentAction.REPLAN, - feedback_template="Missing required data: {result[error]}. Plan needs adjustment.", - priority=80, - ) - ) - - # Rule: Escalate on security issues - judge.add_rule( - EvaluationRule( - id="security_escalate", - description="Security issue detected", - condition="isinstance(result, dict) and result.get('error_type') == 'security'", - action=JudgmentAction.ESCALATE, - feedback_template="Security issue detected: {result[error]}", - priority=200, - ) - ) - - # Rule: Fail on max retries exceeded - judge.add_rule( - EvaluationRule( - id="max_retries_fail", - description="Maximum retries exceeded", - condition="step.get('attempts', 0) >= step.get('max_retries', 3)", - action=JudgmentAction.REPLAN, - feedback_template="Step '{step[id]}' failed after {step[attempts]} attempts", - priority=150, - ) - ) - - return judge diff --git a/core/framework/graph/node.py b/core/framework/graph/node.py index 2e989c47..45d4b4cb 100644 --- a/core/framework/graph/node.py +++ b/core/framework/graph/node.py @@ -16,10 +16,8 @@ Protocol: """ import asyncio -import inspect import logging from abc import ABC, abstractmethod -from collections.abc import Callable from dataclasses import dataclass, field from datetime import UTC from typing import Any @@ -140,7 +138,7 @@ class NodeSpec(BaseModel): id="calculator", name="Calculator Node", description="Performs mathematical calculations", - node_type="llm_tool_use", + node_type="event_loop", input_keys=["expression"], output_keys=["result"], tools=["calculate", "math_function"], @@ -154,11 +152,8 @@ class NodeSpec(BaseModel): # Node behavior type node_type: str = Field( - default="llm_tool_use", - description=( - "Type: 'event_loop', 'function', 'router', 'human_input'. " - "Deprecated: 'llm_tool_use', 'llm_generate' (use 'event_loop' instead)." - ), + default="event_loop", + description="Type: 'event_loop' (recommended), 'router', 'human_input'.", ) # Data flow @@ -196,11 +191,6 @@ class NodeSpec(BaseModel): default=None, description="Specific model to use (defaults to graph default)" ) - # For function nodes - function: str | None = Field( - default=None, description="Function name or path for function nodes" - ) - # For router nodes routes: dict[str, str] = Field( default_factory=dict, description="Condition -> target_node_id mapping for routers" @@ -657,1329 +647,3 @@ class NodeProtocol(ABC): return errors -class LLMNode(NodeProtocol): - """ - A node that uses an LLM with tools. - - This is the most common node type. It: - 1. Builds a prompt from context - 2. Calls the LLM with available tools - 3. Executes tool calls - 4. Returns the final result - - The LLM decides how to achieve the goal within constraints. - """ - - # Stop reasons indicating truncation (varies by provider) - TRUNCATION_STOP_REASONS = {"length", "max_tokens", "token_limit"} - - # Compaction instruction added when response is truncated - COMPACTION_INSTRUCTION = """ -IMPORTANT: Your previous response was truncated because it exceeded the token limit. -Please provide a MORE CONCISE response that fits within the limit. -Focus on the essential information and omit verbose details. -Keep the same JSON structure but with shorter content values. 
-""" - - def __init__( - self, - tool_executor: Callable | None = None, - require_tools: bool = False, - cleanup_llm_model: str | None = None, - max_compaction_retries: int = 2, - ): - self.tool_executor = tool_executor - self.require_tools = require_tools - self.cleanup_llm_model = cleanup_llm_model - self.max_compaction_retries = max_compaction_retries - - def _is_truncated(self, response) -> bool: - """Check if LLM response was truncated due to token limit.""" - stop_reason = getattr(response, "stop_reason", "").lower() - return stop_reason in self.TRUNCATION_STOP_REASONS - - def _strip_code_blocks(self, content: str) -> str: - """Strip markdown code block wrappers from content. - - LLMs often wrap JSON output in ```json...``` blocks. - This method removes those wrappers to get clean content. - """ - import re - - content = content.strip() - # Match ```json or ``` at start and ``` at end (greedy to handle nested) - match = re.match(r"^```(?:json|JSON)?\s*\n?(.*)\n?```\s*$", content, re.DOTALL) - if match: - return match.group(1).strip() - return content - - def _estimate_tokens( - self, model: str, system: str, messages: list[dict], tools: list | None - ) -> int: - """Estimate total input tokens for an LLM call.""" - import json - - try: - import litellm as _litellm - except ImportError: - # Rough estimate: 1 token β‰ˆ 4 chars - total_chars = len(system) - for m in messages: - total_chars += len(str(m.get("content", ""))) - if tools: - total_chars += len( - json.dumps( - [ - { - "name": t.name, - "description": t.description, - "parameters": t.parameters, - } - for t in tools - ], - default=str, - ) - ) - return total_chars // 4 - - total = 0 - if system: - total += _litellm.token_counter(model=model, text=system) - for m in messages: - content = str(m.get("content", "")) - if content: - total += _litellm.token_counter(model=model, text=content) - if tools: - tools_text = json.dumps( - [ - {"name": t.name, "description": t.description, "parameters": t.parameters} - for t in tools - ], - default=str, - ) - total += _litellm.token_counter(model=model, text=tools_text) - return total - - def _get_context_limit(self, model: str) -> int: - """Get usable input token budget (80% of model's max_input_tokens).""" - try: - import litellm as _litellm - - info = _litellm.get_model_info(model) - max_input = info.get("max_input_tokens") or info.get("max_tokens") or 8192 - return int(max_input * 0.8) - except Exception: - return 8192 - - def _compact_inputs( - self, ctx: NodeContext, system: str, messages: list[dict], tools: list | None - ) -> list[dict]: - """Compact message inputs if they exceed the model's context window. - - Uses a sliding window strategy: iteratively halves the longest input - value until the total token count fits within the budget. - """ - model = ctx.llm.model if hasattr(ctx.llm, "model") else "gpt-3.5-turbo" - budget = self._get_context_limit(model) - estimated = self._estimate_tokens(model, system, messages, tools) - - if estimated <= budget: - return messages - - logger.warning( - f"[compaction] Input tokens (~{estimated}) exceed budget ({budget}) " - f"for model {model}. Compacting inputs..." 
- ) - - # Parse user message into key:value pairs for selective truncation - if not messages or not messages[0].get("content"): - return messages - - content = messages[0]["content"] - lines = content.split("\n") - pairs: list[tuple[str, str]] = [] - for line in lines: - if ": " in line: - key, _, value = line.partition(": ") - pairs.append((key, value)) - else: - pairs.append(("", line)) - - # Iteratively halve the longest value until we fit - max_iterations = 20 - for i in range(max_iterations): - # Find longest value - longest_idx = -1 - longest_len = 0 - for idx, (key, value) in enumerate(pairs): - if key and len(value) > longest_len: - longest_len = len(value) - longest_idx = idx - - if longest_idx == -1 or longest_len <= 100: - break - - key, value = pairs[longest_idx] - new_len = max(longest_len // 2, 100) - pairs[longest_idx] = (key, value[:new_len] + "...") - logger.warning(f"[compaction] Truncated '{key}' from {longest_len} to {new_len} chars") - - # Re-estimate - new_content = "\n".join(f"{k}: {v}" if k else v for k, v in pairs) - test_messages = [{"role": "user", "content": new_content}] - estimated = self._estimate_tokens(model, system, test_messages, tools) - if estimated <= budget: - logger.warning( - f"[compaction] Fits within budget after {i + 1} rounds (~{estimated} tokens)" - ) - return test_messages - - # Final reassembly even if still over budget - final_content = "\n".join(f"{k}: {v}" if k else v for k, v in pairs) - final_messages = [{"role": "user", "content": final_content}] - final_est = self._estimate_tokens(model, system, final_messages, tools) - logger.warning( - f"[compaction] Still ~{final_est} tokens after max compaction " - f"(budget={budget}). Proceeding anyway." - ) - return final_messages - - async def execute(self, ctx: NodeContext) -> NodeResult: - """Execute the LLM node.""" - import time - - if ctx.llm is None: - return NodeResult(success=False, error="LLM not available") - - # Fail fast if tools are required but not available - if self.require_tools and not ctx.available_tools: - return NodeResult( - success=False, - error=f"Node '{ctx.node_spec.name}' requires tools but none are available. " - f"Declared tools: {ctx.node_spec.tools}. " - "Register tools via ToolRegistry before running the agent.", - ) - - ctx.runtime.set_node(ctx.node_id) - - # Record the decision to use LLM - decision_id = ctx.runtime.decide( - intent=f"Execute {ctx.node_spec.name}", - options=[ - { - "id": "llm_execute", - "description": f"Use LLM to {ctx.node_spec.description}", - "action_type": "llm_call", - } - ], - chosen="llm_execute", - reasoning=f"Node type is {ctx.node_spec.node_type}", - context={"input": ctx.input_data}, - ) - - start = time.time() - _step_index = 0 - _captured_tool_calls: list[dict] = [] - - try: - # Build messages - messages = self._build_messages(ctx) - - # Build system prompt - system = self._build_system_prompt(ctx) - - # Compact inputs if they exceed the model's context window - messages = self._compact_inputs(ctx, system, messages, ctx.available_tools) - - # Log the LLM call details - logger.info(" 🤖 LLM Call:") - logger.info( - f" System: {system[:150]}..." - if len(system) > 150 - else f" System: {system}" - ) - logger.info( - f" User message: {messages[-1]['content'][:150]}..."
- if len(messages[-1]["content"]) > 150 - else f" User message: {messages[-1]['content']}" - ) - if ctx.available_tools: - logger.info(f" Tools available: {[t.name for t in ctx.available_tools]}") - - # Call LLM - if ctx.available_tools and self.tool_executor: - from framework.llm.provider import ToolResult, ToolUse - - def executor(tool_use: ToolUse) -> ToolResult: - args = ", ".join(f"{k}={v}" for k, v in tool_use.input.items()) - logger.info(f" 🔧 Tool call: {tool_use.name}({args})") - result = self.tool_executor(tool_use) - # Truncate long results - result_str = str(result.content)[:150] - if len(str(result.content)) > 150: - result_str += "..." - logger.info(f" ✓ Tool result: {result_str}") - # Capture for runtime logging - _captured_tool_calls.append( - { - "tool_use_id": tool_use.id, - "tool_name": tool_use.name, - "tool_input": tool_use.input, - "content": result.content, - "is_error": result.is_error, - } - ) - return result - - response = await ctx.llm.acomplete_with_tools( - messages=messages, - system=system, - tools=ctx.available_tools, - tool_executor=executor, - max_tokens=ctx.max_tokens, - ) - else: - # Use JSON mode for llm_generate nodes with output_keys - # Skip strict schema validation - just validate keys after parsing - use_json_mode = ( - ctx.node_spec.node_type == "llm_generate" - and ctx.node_spec.output_keys - and len(ctx.node_spec.output_keys) >= 1 - ) - if use_json_mode: - logger.info( - f" 📋 Expecting JSON output with keys: {ctx.node_spec.output_keys}" - ) - - response = await ctx.llm.acomplete( - messages=messages, - system=system, - json_mode=use_json_mode, - max_tokens=ctx.max_tokens, - ) - - # Check for truncation and retry with compaction if needed - expects_json = ( - ctx.node_spec.node_type in ("llm_generate", "llm_tool_use") - and ctx.node_spec.output_keys - and len(ctx.node_spec.output_keys) >= 1 - ) - - compaction_attempt = 0 - while ( - self._is_truncated(response) - and expects_json - and compaction_attempt < self.max_compaction_retries - ): - compaction_attempt += 1 - logger.warning( - f" ⚠ Response truncated (stop_reason: {response.stop_reason}), " - f"retrying with compaction ({compaction_attempt}/{self.max_compaction_retries})" - ) - - # Add compaction instruction to messages - compaction_messages = messages + [ - {"role": "assistant", "content": response.content}, - {"role": "user", "content": self.COMPACTION_INSTRUCTION}, - ] - - # Retry the call with compaction instruction - if ctx.available_tools and self.tool_executor: - response = await ctx.llm.acomplete_with_tools( - messages=compaction_messages, - system=system, - tools=ctx.available_tools, - tool_executor=executor, - max_tokens=ctx.max_tokens, - ) - else: - response = await ctx.llm.acomplete( - messages=compaction_messages, - system=system, - json_mode=use_json_mode, - max_tokens=ctx.max_tokens, - ) - - if self._is_truncated(response) and expects_json: - logger.warning( - f" ⚠ Response still truncated after " - f"{compaction_attempt} compaction attempts" - ) - - # Phase 2: Validation retry loop for Pydantic models - max_validation_retries = ( - ctx.node_spec.max_validation_retries if ctx.node_spec.output_model else 0 - ) - validation_attempt = 0 - total_input_tokens = 0 - total_output_tokens = 0 - current_messages = messages.copy() - - while True: - total_input_tokens += response.input_tokens - total_output_tokens += response.output_tokens - - # Log the response - response_preview = ( - response.content[:200] if len(response.content) > 200 else response.content - ) - if
len(response.content) > 200: - response_preview += "..." - logger.info(f" ← Response: {response_preview}") - - # If no output_model, break immediately (no validation needed) - if ctx.node_spec.output_model is None: - break - - # Try to parse and validate the response - try: - import json - - parsed = self._extract_json(response.content, ctx.node_spec.output_keys) - - if isinstance(parsed, dict): - from framework.graph.validator import OutputValidator - - validator = OutputValidator() - validation_result, validated_model = validator.validate_with_pydantic( - parsed, ctx.node_spec.output_model - ) - - if validation_result.success: - # Validation passed, break out of retry loop - model_name = ctx.node_spec.output_model.__name__ - logger.info(f" ✓ Pydantic validation passed for {model_name}") - break - else: - # Validation failed - validation_attempt += 1 - - if validation_attempt <= max_validation_retries: - # Add validation feedback to messages and retry - feedback = validator.format_validation_feedback( - validation_result, ctx.node_spec.output_model - ) - logger.warning( - f" ⚠ Pydantic validation failed " - f"(attempt {validation_attempt}/{max_validation_retries}): " - f"{validation_result.error}" - ) - logger.info(" 🔄 Retrying with validation feedback...") - - # Add the assistant's failed response and feedback - current_messages.append( - {"role": "assistant", "content": response.content} - ) - current_messages.append({"role": "user", "content": feedback}) - - # Re-call LLM with feedback - if ctx.available_tools and self.tool_executor: - response = await ctx.llm.acomplete_with_tools( - messages=current_messages, - system=system, - tools=ctx.available_tools, - tool_executor=executor, - max_tokens=ctx.max_tokens, - ) - else: - response = await ctx.llm.acomplete( - messages=current_messages, - system=system, - json_mode=use_json_mode, - max_tokens=ctx.max_tokens, - ) - continue # Retry validation - else: - # Max retries exceeded - latency_ms = int((time.time() - start) * 1000) - err = validation_result.error - logger.error( - f" ✗ Pydantic validation failed after " - f"{max_validation_retries} retries: {err}" - ) - ctx.runtime.record_outcome( - decision_id=decision_id, - success=False, - error=f"Validation failed: {validation_result.error}", - tokens_used=total_input_tokens + total_output_tokens, - latency_ms=latency_ms, - ) - error_msg = ( - f"Pydantic validation failed after " - f"{max_validation_retries} retries: {err}" - ) - if ctx.runtime_logger: - ctx.runtime_logger.log_step( - node_id=ctx.node_id, - node_type=ctx.node_spec.node_type, - step_index=_step_index, - llm_text=response.content, - tool_calls=_captured_tool_calls, - input_tokens=total_input_tokens, - output_tokens=total_output_tokens, - latency_ms=latency_ms, - ) - ctx.runtime_logger.log_node_complete( - node_id=ctx.node_id, - node_name=ctx.node_spec.name, - node_type=ctx.node_spec.node_type, - success=False, - error=error_msg, - total_steps=_step_index + 1, - tokens_used=total_input_tokens + total_output_tokens, - input_tokens=total_input_tokens, - output_tokens=total_output_tokens, - latency_ms=latency_ms, - ) - return NodeResult( - success=False, - error=error_msg, - output=parsed, - tokens_used=total_input_tokens + total_output_tokens, - latency_ms=latency_ms, - validation_errors=validation_result.errors, - ) - else: - # Not a dict, can't validate - break and let downstream handle - break - except Exception: - # JSON extraction failed - break and let downstream handle - break - - latency_ms = int((time.time() -
start) * 1000) - - ctx.runtime.record_outcome( - decision_id=decision_id, - success=True, - result=response.content, - tokens_used=total_input_tokens + total_output_tokens, - latency_ms=latency_ms, - ) - - # Write to output keys - output = self._parse_output(response.content, ctx.node_spec) - - # For llm_generate and llm_tool_use nodes, try to parse JSON and extract fields - if ( - ctx.node_spec.node_type in ("llm_generate", "llm_tool_use") - and len(ctx.node_spec.output_keys) >= 1 - ): - try: - import json - - # Try to extract JSON from response - parsed = self._extract_json( - response.content, ctx.node_spec.output_keys, self.cleanup_llm_model - ) - - # If parsed successfully, write each field to its corresponding output key - # Use validate=False since LLM output legitimately contains text that - # may trigger false positives (e.g., "from OpenAI" matches "from ") - if isinstance(parsed, dict): - # If we have output_model, the validation already happened in the retry loop - if ctx.node_spec.output_model is not None: - from framework.graph.validator import OutputValidator - - validator = OutputValidator() - validation_result, validated_model = validator.validate_with_pydantic( - parsed, ctx.node_spec.output_model - ) - # Use validated model's dict representation - if validated_model: - parsed = validated_model.model_dump() - - for key in ctx.node_spec.output_keys: - if key in parsed: - value = parsed[key] - # Strip code block wrappers from string values - if isinstance(value, str): - value = self._strip_code_blocks(value) - ctx.memory.write(key, value, validate=False) - output[key] = value - elif key in ctx.input_data: - # Key not in JSON but exists in input - pass through - ctx.memory.write(key, ctx.input_data[key], validate=False) - output[key] = ctx.input_data[key] - else: - # Key not in JSON or input, write whole response (stripped) - stripped_content = self._strip_code_blocks(response.content) - ctx.memory.write(key, stripped_content, validate=False) - output[key] = stripped_content - else: - # Not a dict, fall back to writing entire response to all keys (stripped) - stripped_content = self._strip_code_blocks(response.content) - for key in ctx.node_spec.output_keys: - ctx.memory.write(key, stripped_content, validate=False) - output[key] = stripped_content - - except Exception as e: - # JSON extraction failed - fail explicitly instead of polluting memory - logger.error(f" ✗ Failed to extract structured output: {e}") - logger.error( - f" Raw response (first 500 chars): {response.content[:500]}..." - ) - - # Return failure instead of writing garbage to all keys - _extraction_error = ( - f"Output extraction failed: {e}. LLM returned non-JSON response. 
" - f"Expected keys: {ctx.node_spec.output_keys}" - ) - if ctx.runtime_logger: - ctx.runtime_logger.log_step( - node_id=ctx.node_id, - node_type=ctx.node_spec.node_type, - step_index=_step_index, - llm_text=response.content, - tool_calls=_captured_tool_calls, - input_tokens=response.input_tokens, - output_tokens=response.output_tokens, - latency_ms=latency_ms, - ) - ctx.runtime_logger.log_node_complete( - node_id=ctx.node_id, - node_name=ctx.node_spec.name, - node_type=ctx.node_spec.node_type, - success=False, - error=_extraction_error, - total_steps=_step_index + 1, - tokens_used=response.input_tokens + response.output_tokens, - input_tokens=response.input_tokens, - output_tokens=response.output_tokens, - latency_ms=latency_ms, - ) - return NodeResult( - success=False, - error=_extraction_error, - output={}, - tokens_used=total_input_tokens + total_output_tokens, - latency_ms=latency_ms, - ) - # JSON extraction failed completely - still strip code blocks - # logger.warning(f" ⚠ Failed to extract JSON output: {e}") - # stripped_content = self._strip_code_blocks(response.content) - # for key in ctx.node_spec.output_keys: - # ctx.memory.write(key, stripped_content) - # output[key] = stripped_content - else: - # For non-llm_generate or single output nodes, write entire response (stripped) - stripped_content = self._strip_code_blocks(response.content) - for key in ctx.node_spec.output_keys: - ctx.memory.write(key, stripped_content, validate=False) - output[key] = stripped_content - - if ctx.runtime_logger: - ctx.runtime_logger.log_step( - node_id=ctx.node_id, - node_type=ctx.node_spec.node_type, - step_index=_step_index, - llm_text=response.content, - tool_calls=_captured_tool_calls, - input_tokens=response.input_tokens, - output_tokens=response.output_tokens, - latency_ms=latency_ms, - ) - ctx.runtime_logger.log_node_complete( - node_id=ctx.node_id, - node_name=ctx.node_spec.name, - node_type=ctx.node_spec.node_type, - success=True, - total_steps=_step_index + 1, - tokens_used=response.input_tokens + response.output_tokens, - input_tokens=response.input_tokens, - output_tokens=response.output_tokens, - latency_ms=latency_ms, - ) - - return NodeResult( - success=True, - output=output, - tokens_used=total_input_tokens + total_output_tokens, - latency_ms=latency_ms, - ) - - except Exception as e: - latency_ms = int((time.time() - start) * 1000) - ctx.runtime.record_outcome( - decision_id=decision_id, - success=False, - error=str(e), - latency_ms=latency_ms, - ) - if ctx.runtime_logger: - ctx.runtime_logger.log_node_complete( - node_id=ctx.node_id, - node_name=ctx.node_spec.name, - node_type=ctx.node_spec.node_type, - success=False, - error=str(e), - latency_ms=latency_ms, - ) - return NodeResult(success=False, error=str(e), latency_ms=latency_ms) - - def _parse_output(self, content: str, node_spec: NodeSpec) -> dict[str, Any]: - """ - Parse LLM output based on node type. - - For llm_generate nodes with multiple output keys, attempts to parse JSON. - Otherwise returns raw content. - """ - # Default output - return {"result": content} - - def _extract_json( - self, raw_response: str, output_keys: list[str], cleanup_llm_model: str | None = None - ) -> dict[str, Any]: - """Extract clean JSON from potentially verbose LLM response. - - Tries multiple extraction strategies in order: - 1. Direct JSON parse - 2. Markdown code block extraction - 3. Balanced brace matching - 4. 
Configured LLM fallback (last resort) - - Args: - raw_response: The raw LLM response text - output_keys: Expected output keys for the JSON - cleanup_llm_model: Optional model to use for LLM cleanup fallback - """ - import json - import re - - content = raw_response.strip() - - # Try direct JSON parse first (fast path) - try: - content = raw_response.strip() - - # Remove markdown code blocks if present - more robust extraction - if content.startswith("```"): - # Try multiple patterns for markdown code blocks - # Pattern 1: ```json\n...\n``` or ```\n...\n``` - match = re.search(r"^```(?:json)?\s*\n([\s\S]*?)\n```\s*$", content) - if match: - content = match.group(1).strip() - else: - # Pattern 2: Just strip the first and last lines if they're ``` - lines = content.split("\n") - if lines[0].startswith("```") and lines[-1].strip() == "```": - content = "\n".join(lines[1:-1]).strip() - - parsed = json.loads(content) - if isinstance(parsed, dict): - return parsed - except json.JSONDecodeError as e: - logger.info(f" Direct JSON parse failed: {e}") - logger.info(f" Content first 200 chars repr: {repr(content[:200])}") - # Try fixing unescaped newlines in string values - try: - fixed = _fix_unescaped_newlines_in_json(content) - logger.info(f" Fixed content first 200 chars repr: {repr(fixed[:200])}") - parsed = json.loads(fixed) - if isinstance(parsed, dict): - logger.info(" ✓ Parsed JSON after fixing unescaped newlines") - return parsed - except json.JSONDecodeError as e2: - logger.info(f" Newline fix also failed: {e2}") - - # Try to extract JSON from markdown code blocks (greedy match to handle nested blocks) - # Multiple patterns to handle different LLM formatting styles - code_block_patterns = [ - # Anchored match from first ``` to last ``` - r"^```(?:json|JSON)?\s*\n?(.*)\n?```\s*$", - # Non-anchored: find ```json anywhere and extract to closing ``` - r"```(?:json|JSON)?\s*\n([\s\S]*?)\n```", - ] - for pattern in code_block_patterns: - code_block_match = re.search(pattern, content, re.DOTALL) - if code_block_match: - try: - extracted = code_block_match.group(1).strip() - if extracted: # Skip empty matches - # Try direct parse first, then with newline fix - try: - parsed = json.loads(extracted) - except json.JSONDecodeError: - parsed = json.loads(_fix_unescaped_newlines_in_json(extracted)) - if isinstance(parsed, dict): - return parsed - except json.JSONDecodeError: - pass - - # Try to find JSON object by matching balanced braces (use module-level helper) - json_str = find_json_object(content) - if json_str: - try: - # Try direct parse first, then with newline fix - try: - parsed = json.loads(json_str) - except json.JSONDecodeError: - parsed = json.loads(_fix_unescaped_newlines_in_json(json_str)) - if isinstance(parsed, dict): - return parsed - except json.JSONDecodeError: - pass - - # Try stripping markdown prefix and finding JSON from there - # This handles cases like "```json\n{...}" where regex might fail - if "```" in content: - # Find position after ```json or ``` marker - json_start = content.find("{") - if json_start > 0: - # Extract from first { to end, then find balanced JSON - json_str = find_json_object(content[json_start:]) - if json_str: - try: - # Try direct parse first, then with newline fix - try: - parsed = json.loads(json_str) - except json.JSONDecodeError: - parsed = json.loads(_fix_unescaped_newlines_in_json(json_str)) - if isinstance(parsed, dict): - logger.info( - "
✓ Extracted JSON via brace matching after markdown strip" - ) - return parsed - except json.JSONDecodeError: - pass - - # All local extraction failed - use LLM as last resort - import os - - from framework.llm.litellm import LiteLLMProvider - - logger.info(f" cleanup_llm_model param: {cleanup_llm_model}") - - # Use configured cleanup model, or fall back to defaults - if cleanup_llm_model: - # Use the configured cleanup model (LiteLLM handles API keys via env vars) - cleaner_llm = LiteLLMProvider(model=cleanup_llm_model) - logger.info(f" Using configured cleanup LLM: {cleanup_llm_model}") - else: - # Fall back to default logic: Cerebras preferred, then Haiku - api_key = os.environ.get("CEREBRAS_API_KEY") or os.environ.get("ANTHROPIC_API_KEY") - if not api_key: - raise ValueError( - "Cannot parse JSON and no API key for LLM cleanup " - "(set CEREBRAS_API_KEY or ANTHROPIC_API_KEY, or configure cleanup_llm_model)" - ) - - if os.environ.get("CEREBRAS_API_KEY"): - cleaner_llm = LiteLLMProvider( - api_key=os.environ.get("CEREBRAS_API_KEY"), - model="cerebras/llama-3.3-70b", - ) - else: - cleaner_llm = LiteLLMProvider( - api_key=api_key, - model="claude-3-5-haiku-20241022", - ) - - prompt = f"""Extract the JSON object from this LLM response. - -Expected output keys: {output_keys} - -LLM Response: -{raw_response} - -Output ONLY the JSON object, nothing else. -If no valid JSON object exists in the response, output exactly: {{"error": "NO_JSON_FOUND"}} -Do NOT fabricate data or return empty objects.""" - - try: - result = cleaner_llm.complete( - messages=[{"role": "user", "content": prompt}], - system="Extract JSON from text. Output only valid JSON.", - json_mode=True, - ) - - cleaned = result.content.strip() if result.content else "" - - # Check for empty response - if not cleaned: - logger.warning(" ⚠ LLM cleanup returned empty response") - raise ValueError( - f"LLM cleanup returned empty response. " - f"Raw response starts with: {raw_response[:200]}..." - ) - - # Remove markdown if LLM added it - if cleaned.startswith("```"): - match = re.search(r"^```(?:json)?\s*\n([\s\S]*?)\n```\s*$", cleaned) - if match: - cleaned = match.group(1).strip() - else: - # Fallback: strip first/last lines - lines = cleaned.split("\n") - if lines[0].startswith("```") and lines[-1].strip() == "```": - cleaned = "\n".join(lines[1:-1]).strip() - - # Try balanced brace extraction if still not valid JSON - if not cleaned.startswith("{"): - json_str = find_json_object(cleaned) - if json_str: - cleaned = json_str - - if not cleaned: - raise ValueError( - f"Could not extract JSON from LLM cleanup response. " - f"Raw response starts with: {raw_response[:200]}..." - ) - - # Try direct parse first, then with newline fix - try: - parsed = json.loads(cleaned) - except json.JSONDecodeError: - parsed = json.loads(_fix_unescaped_newlines_in_json(cleaned)) - - # Validate LLM didn't return empty or fabricated data - if parsed.get("error") == "NO_JSON_FOUND": - raise ValueError("Cannot parse JSON from response") - if not parsed or parsed == {}: - raise ValueError("Cannot parse JSON from response") - if all(v is None for v in parsed.values()): - raise ValueError("Cannot parse JSON from response") - logger.info(" ✓ LLM cleaned JSON output") - return parsed - - except json.JSONDecodeError as e: - logger.warning(f" ⚠ LLM cleanup response not valid JSON: {e}") - raise ValueError( - f"LLM cleanup response not valid JSON: {e}. 
Expected keys: {output_keys}" - ) from e - except ValueError: - raise # Re-raise our descriptive error - except Exception as e: - logger.warning(f" ⚠ LLM JSON extraction failed: {e}") - raise - - def _build_messages(self, ctx: NodeContext) -> list[dict]: - """Build the message list for the LLM.""" - # Use Haiku to intelligently format inputs from memory - user_content = self._format_inputs_with_haiku(ctx) - return [{"role": "user", "content": user_content}] - - def _format_inputs_with_haiku(self, ctx: NodeContext) -> str: - """Use Haiku to intelligently extract and format inputs from memory.""" - if not ctx.node_spec.input_keys: - return str(ctx.input_data) - - # Read all memory for context - memory_data = ctx.memory.read_all() - - # If memory is empty or very simple, just use raw data - if not memory_data or len(memory_data) <= 2: - # Simple case - just format the input keys directly - parts = [] - for key in ctx.node_spec.input_keys: - value = ctx.memory.read(key) - if value is not None: - parts.append(f"{key}: {value}") - return "\n".join(parts) if parts else str(ctx.input_data) - - # Use Haiku to intelligently extract relevant data - import os - - api_key = os.environ.get("ANTHROPIC_API_KEY") - if not api_key: - # Fallback to simple formatting if no API key - parts = [] - for key in ctx.node_spec.input_keys: - value = ctx.memory.read(key) - if value is not None: - parts.append(f"{key}: {value}") - return "\n".join(parts) - - # Build prompt for Haiku to extract clean values - import json - - # Smart truncation: truncate values rather than corrupting JSON - def truncate_value(v, max_len=500): - s = str(v) - return s[:max_len] + "..." if len(s) > max_len else v - - truncated_data = {k: truncate_value(v) for k, v in memory_data.items()} - memory_json = json.dumps(truncated_data, indent=2, default=str) - - required_fields = ", ".join(ctx.node_spec.input_keys) - prompt = ( - f"Extract the following information from the memory context:\n\n" - f"Required fields: {required_fields}\n\n" - f"Memory context (may contain nested data, JSON strings, " - f"or extra information):\n{memory_json}\n\n" - "Extract ONLY the clean values for the required fields. " - "Ignore nested structures, JSON wrappers, and irrelevant data.\n\n" - "Output as JSON with the exact field names requested." 
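A small sketch of the smart-truncation idea used when serializing memory for the formatting prompt above: shorten long values before JSON-encoding, rather than slicing the encoded JSON (which would corrupt it). Values and lengths here are illustrative:

```python
import json

def truncate_value(v, max_len=500):
    # Mirror of the helper above: truncate per value, keep the JSON valid.
    s = str(v)
    return s[:max_len] + "..." if len(s) > max_len else v

memory = {"topic": "agents", "notes": "n" * 2000}
print(json.dumps({k: truncate_value(v) for k, v in memory.items()}, default=str))
```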
- ) - - try: - import anthropic - - client = anthropic.Anthropic(api_key=api_key) - message = client.messages.create( - model="claude-3-5-haiku-20241022", - max_tokens=1000, - messages=[{"role": "user", "content": prompt}], - ) - - # Parse Haiku's response - response_text = message.content[0].text.strip() - - # Try to extract JSON using balanced brace matching - json_str = find_json_object(response_text) - if json_str: - extracted = json.loads(json_str) - # Format as key: value pairs - parts = [f"{k}: {v}" for k, v in extracted.items() if k in ctx.node_spec.input_keys] - if parts: - return "\n".join(parts) - - except Exception as e: - # Fallback to simple formatting on error - logger.warning(f"Haiku formatting failed: {e}, falling back to simple format") - - # Fallback: simple key-value formatting - parts = [] - for key in ctx.node_spec.input_keys: - value = ctx.memory.read(key) - if value is not None: - parts.append(f"{key}: {value}") - return "\n".join(parts) if parts else str(ctx.input_data) - - def _build_system_prompt(self, ctx: NodeContext) -> str: - """Build the system prompt.""" - from datetime import datetime - - parts = [] - - if ctx.node_spec.system_prompt: - # Format system prompt with values from memory (for input_keys placeholders) - prompt = ctx.node_spec.system_prompt - if ctx.node_spec.input_keys: - # Build formatting context from memory - format_context = {} - for key in ctx.node_spec.input_keys: - value = ctx.memory.read(key) - if value is not None: - format_context[key] = value - - # Try to format, but fallback to raw prompt if formatting fails - try: - prompt = prompt.format(**format_context) - except (KeyError, ValueError): - # Placeholders don't match or formatting error - use raw prompt - pass - - parts.append(prompt) - - # Inject current datetime so LLM knows "now" - utc_dt = datetime.now(UTC) - local_dt = datetime.now().astimezone() - local_tz_name = local_dt.tzname() or "Unknown" - parts.append("\n## Runtime Context") - parts.append(f"- Current Date/Time (UTC): {utc_dt.isoformat()}") - parts.append(f"- Local Timezone: {local_tz_name}") - parts.append(f"- Current Date/Time (Local): {local_dt.isoformat()}") - - if ctx.goal_context: - parts.append("\n# Goal Context") - parts.append(ctx.goal_context) - - return "\n".join(parts) - - -class RouterNode(NodeProtocol): - """ - A node that routes to different next nodes based on conditions. - - The router examines the current state and decides which - node should execute next. - - Can use either: - 1. Simple condition matching (deterministic) - 2. LLM-based routing (goal-aware, adaptive) - - Set node_spec.routes to a dict of conditions -> target nodes. - If node_spec.system_prompt is provided, LLM will choose the route. 
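An illustrative routes mapping for a RouterNode as just described (node ids are hypothetical); "default" is the fallback when no condition matches:

```python
# Hypothetical condition -> target_node_id mapping for NodeSpec.routes.
routes = {
    "success": "report-writer",
    "failure": "retry-search",
    "default": "end",
}
```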
- """ - - async def execute(self, ctx: NodeContext) -> NodeResult: - """Execute routing logic.""" - import time as _time - - start = _time.time() - ctx.runtime.set_node(ctx.node_id) - - # Build options from routes - options = [] - for condition, target in ctx.node_spec.routes.items(): - options.append( - { - "id": condition, - "description": f"Route to {target} when condition '{condition}' is met", - "target": target, - } - ) - - # Check if we should use LLM-based routing - if ctx.node_spec.system_prompt and ctx.llm: - # LLM-based routing (goal-aware) - chosen_route = await self._llm_route(ctx, options) - else: - # Simple condition-based routing (deterministic) - route_value = ctx.input_data.get("route_on") or ctx.memory.read("route_on") - chosen_route = None - for condition, target in ctx.node_spec.routes.items(): - if self._check_condition(condition, route_value, ctx): - chosen_route = (condition, target) - break - - if chosen_route is None: - # Default route - chosen_route = ("default", ctx.node_spec.routes.get("default", "end")) - - decision_id = ctx.runtime.decide( - intent="Determine next node in graph", - options=options, - chosen=chosen_route[0], - reasoning=f"Routing decision: {chosen_route[0]}", - ) - - ctx.runtime.record_outcome( - decision_id=decision_id, - success=True, - result=chosen_route[1], - summary=f"Routing to {chosen_route[1]}", - ) - - latency_ms = int((_time.time() - start) * 1000) - - if ctx.runtime_logger: - ctx.runtime_logger.log_step( - node_id=ctx.node_id, - node_type="router", - step_index=0, - llm_text=f"Route: {chosen_route[0]} -> {chosen_route[1]}", - latency_ms=latency_ms, - ) - ctx.runtime_logger.log_node_complete( - node_id=ctx.node_id, - node_name=ctx.node_spec.name, - node_type="router", - success=True, - total_steps=1, - latency_ms=latency_ms, - ) - - return NodeResult( - success=True, - next_node=chosen_route[1], - route_reason=f"Chose route: {chosen_route[0]}", - latency_ms=latency_ms, - ) - - async def _llm_route( - self, - ctx: NodeContext, - options: list[dict[str, Any]], - ) -> tuple[str, str]: - """ - Use LLM to choose the best route based on goal and context. - - Returns: - Tuple of (chosen_condition, target_node) - """ - import json - - # Build routing options description - options_desc = "\n".join( - [f"- {opt['id']}: {opt['description']} β†’ goes to '{opt['target']}'" for opt in options] - ) - - # Build context - context_data = { - "input": ctx.input_data, - "memory_keys": list(ctx.memory.read_all().keys())[:10], - } - - prompt = f"""You are a routing agent deciding which path to take in a workflow. - -**Goal**: {ctx.goal.name} -{ctx.goal.description} - -**Current Context**: -{json.dumps(context_data, indent=2, default=str)} - -**Available Routes**: -{options_desc} - -Based on the goal and current context, which route should we take? - -Respond with ONLY a JSON object: -{{"chosen": "route_id", "reasoning": "brief explanation"}}""" - - logger.info(" πŸ€” Router using LLM to choose path...") - - try: - response = await ctx.llm.acomplete( - messages=[{"role": "user", "content": prompt}], - system=ctx.node_spec.system_prompt - or "You are a routing agent. 
Respond with JSON only.", - max_tokens=150, - ) - - # Parse response using balanced brace matching - json_str = find_json_object(response.content) - if json_str: - data = json.loads(json_str) - chosen = data.get("chosen", "default") - reasoning = data.get("reasoning", "") - - logger.info(f" → Chose: {chosen}") - logger.info(f" Reason: {reasoning}") - - # Find the target for this choice - target = ctx.node_spec.routes.get( - chosen, ctx.node_spec.routes.get("default", "end") - ) - return (chosen, target) - - except Exception as e: - logger.warning(f" ⚠ LLM routing failed, using default: {e}") - - # Fallback to default - default_target = ctx.node_spec.routes.get("default", "end") - return ("default", default_target) - - def _check_condition( - self, - condition: str, - value: Any, - ctx: NodeContext, - ) -> bool: - """Check if a routing condition is met.""" - if condition == "default": - return True - if condition == "success" and value is True: - return True - if condition == "failure" and value is False: - return True - if condition == "error" and isinstance(value, Exception): - return True - - # String matching - if isinstance(value, str) and condition in value: - return True - - return False - - -class FunctionNode(NodeProtocol): - """ - A node that executes a Python function. - - For deterministic operations that don't need LLM reasoning. - """ - - def __init__(self, func: Callable): - self.func = func - - async def execute(self, ctx: NodeContext) -> NodeResult: - """Execute the function.""" - import time - - ctx.runtime.set_node(ctx.node_id) - - decision_id = ctx.runtime.decide( - intent=f"Execute function {ctx.node_spec.function or 'unknown'}", - options=[ - { - "id": "execute", - "description": f"Run function with inputs: {list(ctx.input_data.keys())}", - } - ], - chosen="execute", - reasoning="Deterministic function execution", - ) - - start = time.time() - - try: - # Filter input_data to only declared input_keys to prevent - # leaking extra memory keys from upstream nodes. 
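A tiny sketch of this declared-key filtering (illustrative values):

```python
input_data = {"query": "llms", "scratch": "upstream noise"}
input_keys = ["query"]
# Only declared keys reach the function; extra memory keys are dropped.
filtered = {k: v for k, v in input_data.items() if k in input_keys}
assert filtered == {"query": "llms"}
```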
- if ctx.node_spec.input_keys: - filtered = { - k: v for k, v in ctx.input_data.items() if k in ctx.node_spec.input_keys - } - else: - filtered = ctx.input_data - - # Call the function (supports both sync and async) - result = self.func(**filtered) - if inspect.isawaitable(result): - result = await result - - latency_ms = int((time.time() - start) * 1000) - - ctx.runtime.record_outcome( - decision_id=decision_id, - success=True, - result=result, - latency_ms=latency_ms, - ) - - # Write to output keys - output = {} - if ctx.node_spec.output_keys: - key = ctx.node_spec.output_keys[0] - output[key] = result - ctx.memory.write(key, result) - else: - output = {"result": result} - - if ctx.runtime_logger: - ctx.runtime_logger.log_step( - node_id=ctx.node_id, - node_type="function", - step_index=0, - latency_ms=latency_ms, - ) - ctx.runtime_logger.log_node_complete( - node_id=ctx.node_id, - node_name=ctx.node_spec.name, - node_type="function", - success=True, - total_steps=1, - latency_ms=latency_ms, - ) - - return NodeResult(success=True, output=output, latency_ms=latency_ms) - - except Exception as e: - latency_ms = int((time.time() - start) * 1000) - ctx.runtime.record_outcome( - decision_id=decision_id, - success=False, - error=str(e), - latency_ms=latency_ms, - ) - - if ctx.runtime_logger: - ctx.runtime_logger.log_step( - node_id=ctx.node_id, - node_type="function", - step_index=0, - latency_ms=latency_ms, - ) - ctx.runtime_logger.log_node_complete( - node_id=ctx.node_id, - node_name=ctx.node_spec.name, - node_type="function", - success=False, - error=str(e), - total_steps=1, - latency_ms=latency_ms, - ) - - return NodeResult(success=False, error=str(e), latency_ms=latency_ms) diff --git a/core/framework/graph/plan.py b/core/framework/graph/plan.py deleted file mode 100644 index 8de839fe..00000000 --- a/core/framework/graph/plan.py +++ /dev/null @@ -1,513 +0,0 @@ -""" -Plan Data Structures for Flexible Execution. - -Plans are created externally (by Claude Code or another LLM agent) and -executed internally by the FlexibleGraphExecutor with Worker-Judge loop. - -The Plan is the contract between the external planner and the executor: -- Planner creates a Plan with PlanSteps -- Executor runs steps and judges results -- If replanning needed, returns feedback to external planner -""" - -from datetime import datetime -from enum import StrEnum -from typing import Any - -from pydantic import BaseModel, Field - - -class ActionType(StrEnum): - """Types of actions a PlanStep can perform.""" - - LLM_CALL = "llm_call" # Call LLM for generation - TOOL_USE = "tool_use" # Use a registered tool - SUB_GRAPH = "sub_graph" # Execute a sub-graph - FUNCTION = "function" # Call a Python function - CODE_EXECUTION = "code_execution" # Execute dynamic code (sandboxed) - - -class StepStatus(StrEnum): - """Status of a plan step.""" - - PENDING = "pending" - AWAITING_APPROVAL = "awaiting_approval" # Waiting for human approval - IN_PROGRESS = "in_progress" - COMPLETED = "completed" - FAILED = "failed" - SKIPPED = "skipped" - REJECTED = "rejected" # Human rejected execution - - def is_terminal(self) -> bool: - """Check if this status represents a terminal (finished) state. - - Terminal states are states where the step will not execute further, - either because it completed successfully or failed/was skipped. 
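A self-contained sketch of the terminal-state check (a trimmed copy of the enum for illustration; the real class also has SKIPPED, REJECTED, and the in-progress states):

```python
from enum import StrEnum

class StepStatusDemo(StrEnum):
    PENDING = "pending"
    COMPLETED = "completed"
    FAILED = "failed"

    def is_terminal(self) -> bool:
        # Finished states only; pending/in-progress states are not terminal.
        return self in (StepStatusDemo.COMPLETED, StepStatusDemo.FAILED)

assert StepStatusDemo.COMPLETED.is_terminal()
assert not StepStatusDemo.PENDING.is_terminal()
```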
- """ - return self in ( - StepStatus.COMPLETED, - StepStatus.FAILED, - StepStatus.SKIPPED, - StepStatus.REJECTED, - ) - - def is_successful(self) -> bool: - """Check if this status represents successful completion.""" - return self == StepStatus.COMPLETED - - -class ApprovalDecision(StrEnum): - """Human decision on a step requiring approval.""" - - APPROVE = "approve" # Execute as planned - REJECT = "reject" # Skip this step - MODIFY = "modify" # Execute with modifications - ABORT = "abort" # Stop entire execution - - -class ApprovalRequest(BaseModel): - """Request for human approval before executing a step.""" - - step_id: str - step_description: str - action_type: str - action_details: dict[str, Any] = Field(default_factory=dict) - context: dict[str, Any] = Field(default_factory=dict) - approval_message: str | None = None - - # Preview of what will happen - preview: str | None = None - - model_config = {"extra": "allow"} - - -class ApprovalResult(BaseModel): - """Result of human approval decision.""" - - decision: ApprovalDecision - reason: str | None = None - modifications: dict[str, Any] = Field(default_factory=dict) - - model_config = {"extra": "allow"} - - -class JudgmentAction(StrEnum): - """Actions the judge can take after evaluating a step.""" - - ACCEPT = "accept" # Step completed successfully, continue - RETRY = "retry" # Retry the step with feedback - REPLAN = "replan" # Return to external planner for new plan - ESCALATE = "escalate" # Request human intervention - - -class ActionSpec(BaseModel): - """ - Specification for an action to be executed. - - This is the "what to do" part of a PlanStep. - """ - - action_type: ActionType - - # For LLM_CALL - prompt: str | None = None - system_prompt: str | None = None - model: str | None = None - - # For TOOL_USE - tool_name: str | None = None - tool_args: dict[str, Any] = Field(default_factory=dict) - - # For SUB_GRAPH - graph_id: str | None = None - - # For FUNCTION - function_name: str | None = None - function_args: dict[str, Any] = Field(default_factory=dict) - - # For CODE_EXECUTION - code: str | None = None - language: str = "python" - - model_config = {"extra": "allow"} - - -class PlanStep(BaseModel): - """ - A single step in a plan. - - Created by external planner, executed by Worker, evaluated by Judge. - """ - - id: str - description: str - action: ActionSpec - - # Data flow - inputs: dict[str, Any] = Field( - default_factory=dict, - description="Input data for this step (can reference previous step outputs)", - ) - expected_outputs: list[str] = Field( - default_factory=list, description="Keys this step should produce" - ) - - # Dependencies - dependencies: list[str] = Field( - default_factory=list, description="IDs of steps that must complete before this one" - ) - - # Human-in-the-loop (HITL) - requires_approval: bool = Field( - default=False, description="If True, requires human approval before execution" - ) - approval_message: str | None = Field( - default=None, description="Message to show human when requesting approval" - ) - - # Execution state - status: StepStatus = StepStatus.PENDING - result: Any | None = None - error: str | None = None - attempts: int = 0 - max_retries: int = 3 - - # Metadata - started_at: datetime | None = None - completed_at: datetime | None = None - - model_config = {"extra": "allow"} - - def is_ready(self, terminal_step_ids: set[str]) -> bool: - """Check if this step is ready to execute (all dependencies finished). - - A step is ready when: - 1. Its status is PENDING (not yet started) - 2. 
All its dependencies are in a terminal state (completed, failed, skipped, or rejected) - - Note: This allows dependent steps to become "ready" even if their dependencies - failed. The executor should check if any dependencies failed and handle - accordingly (e.g., skip the step or mark it as blocked). - - Args: - terminal_step_ids: Set of step IDs that are in a terminal state - """ - if self.status != StepStatus.PENDING: - return False - return all(dep in terminal_step_ids for dep in self.dependencies) - - -class Judgment(BaseModel): - """ - Result of judging a step execution. - - The Judge evaluates step results and decides what to do next. - """ - - action: JudgmentAction - reasoning: str - feedback: str | None = None # For retry/replan - what went wrong - - # For rule-based judgments - rule_matched: str | None = None - - # For LLM-based judgments - confidence: float = 1.0 - llm_used: bool = False - - # Context for replanning - context: dict[str, Any] = Field(default_factory=dict) - - model_config = {"extra": "allow"} - - -class EvaluationRule(BaseModel): - """ - A rule for the HybridJudge to evaluate step results. - - Rules are checked before falling back to LLM evaluation. - """ - - id: str - description: str - - # Condition (Python expression evaluated with result, step, goal context) - condition: str - - # What to do if condition matches - action: JudgmentAction - feedback_template: str = "" # Can use {result}, {step}, etc. - - # Priority (higher = checked first) - priority: int = 0 - - model_config = {"extra": "allow"} - - -class Plan(BaseModel): - """ - A complete execution plan. - - Created by external planner (Claude Code, etc). - Executed by FlexibleGraphExecutor. - """ - - id: str - goal_id: str - description: str - - # Steps to execute - steps: list[PlanStep] = Field(default_factory=list) - - # Execution state - revision: int = 1 # Incremented on replan - current_step_idx: int = 0 - - # Accumulated context from execution - context: dict[str, Any] = Field(default_factory=dict) - - # Metadata - created_at: datetime = Field(default_factory=datetime.now) - created_by: str = "external" # Who created this plan - - # Previous attempt info (for replanning) - previous_feedback: str | None = None - - model_config = {"extra": "allow"} - - @classmethod - def from_json(cls, data: str | dict) -> "Plan": - """ - Load a Plan from exported JSON. - - This handles the output from export_graph() and properly converts - action_type strings to ActionType enums. 
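The readiness rule above in miniature: a PENDING step becomes ready once every dependency id is in the terminal set, even if a dependency failed (the executor then decides whether to skip or block it):

```python
terminal_step_ids = {"fetch", "parse"}  # ids of finished steps, success or not
dependencies = ["fetch", "parse"]
is_pending = True
ready = is_pending and all(dep in terminal_step_ids for dep in dependencies)
assert ready
```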
- - Args: - data: JSON string or dict from export_graph() - - Returns: - Plan object ready for FlexibleGraphExecutor - - Example: - # Load from export_graph() output - exported = export_graph() - plan = Plan.from_json(exported) - - # Load from file - with open("plan.json") as f: - plan = Plan.from_json(json.load(f)) - """ - import json as json_module - - if isinstance(data, str): - data = json_module.loads(data) - - # Handle nested "plan" key from export_graph output - if "plan" in data: - data = data["plan"] - - # Convert steps - steps = [] - for step_data in data.get("steps", []): - action_data = step_data.get("action", {}) - - # Convert action_type string to enum - action_type_str = action_data.get("action_type", "function") - action_type = ActionType(action_type_str) - - action = ActionSpec( - action_type=action_type, - prompt=action_data.get("prompt"), - system_prompt=action_data.get("system_prompt"), - tool_name=action_data.get("tool_name"), - tool_args=action_data.get("tool_args", {}), - function_name=action_data.get("function_name"), - function_args=action_data.get("function_args", {}), - code=action_data.get("code"), - ) - - step = PlanStep( - id=step_data["id"], - description=step_data.get("description", ""), - action=action, - inputs=step_data.get("inputs", {}), - expected_outputs=step_data.get("expected_outputs", []), - dependencies=step_data.get("dependencies", []), - requires_approval=step_data.get("requires_approval", False), - approval_message=step_data.get("approval_message"), - ) - steps.append(step) - - return cls( - id=data.get("id", "plan"), - goal_id=data.get("goal_id", ""), - description=data.get("description", ""), - steps=steps, - context=data.get("context", {}), - revision=data.get("revision", 1), - ) - - def get_step(self, step_id: str) -> PlanStep | None: - """Get a step by ID.""" - for step in self.steps: - if step.id == step_id: - return step - return None - - def get_ready_steps(self) -> list[PlanStep]: - """Get all steps that are ready to execute. - - A step is ready when all its dependencies are in terminal states - (completed, failed, skipped, or rejected). - """ - terminal_ids = {s.id for s in self.steps if s.status.is_terminal()} - return [s for s in self.steps if s.is_ready(terminal_ids)] - - def get_completed_steps(self) -> list[PlanStep]: - """Get all completed steps.""" - return [s for s in self.steps if s.status == StepStatus.COMPLETED] - - def is_complete(self) -> bool: - """Check if all steps are in terminal states (finished executing). - - Returns True when all steps have reached a terminal state, regardless - of whether they succeeded or failed. Use has_failed_steps() to check - if any steps failed. 
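A hedged sketch of loading an exported plan; the payload shape here is inferred from the parsing logic above (nested "plan" key, steps with action dicts), not copied from a real export:

```python
exported = {
    "plan": {
        "id": "demo-plan",
        "goal_id": "demo-goal",
        "description": "Two-step demo",
        "steps": [
            {"id": "s1", "description": "search",
             "action": {"action_type": "tool_use", "tool_name": "web_search"}},
            {"id": "s2", "description": "summarize",
             "action": {"action_type": "llm_call", "prompt": "Summarize the results"},
             "dependencies": ["s1"]},
        ],
    }
}
plan = Plan.from_json(exported)
assert plan.get_step("s2").dependencies == ["s1"]
```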
- """ - return all(s.status.is_terminal() for s in self.steps) - - def is_successful(self) -> bool: - """Check if all steps completed successfully.""" - return all(s.status == StepStatus.COMPLETED for s in self.steps) - - def has_failed_steps(self) -> bool: - """Check if any steps failed, were skipped, or were rejected.""" - return any( - s.status in (StepStatus.FAILED, StepStatus.SKIPPED, StepStatus.REJECTED) - for s in self.steps - ) - - def get_failed_steps(self) -> list[PlanStep]: - """Get all steps that failed, were skipped, or were rejected.""" - return [ - s - for s in self.steps - if s.status in (StepStatus.FAILED, StepStatus.SKIPPED, StepStatus.REJECTED) - ] - - def to_feedback_context(self) -> dict[str, Any]: - """Create context for replanning.""" - return { - "plan_id": self.id, - "revision": self.revision, - "completed_steps": [ - { - "id": s.id, - "description": s.description, - "result": s.result, - } - for s in self.get_completed_steps() - ], - "failed_steps": [ - { - "id": s.id, - "description": s.description, - "error": s.error, - "attempts": s.attempts, - } - for s in self.steps - if s.status == StepStatus.FAILED - ], - "context": self.context, - } - - -class ExecutionStatus(StrEnum): - """Status of plan execution.""" - - COMPLETED = "completed" - AWAITING_APPROVAL = "awaiting_approval" # Paused for human approval - NEEDS_REPLAN = "needs_replan" - NEEDS_ESCALATION = "needs_escalation" - REJECTED = "rejected" # Human rejected a step - ABORTED = "aborted" # Human aborted execution - FAILED = "failed" - - -class PlanExecutionResult(BaseModel): - """ - Result of executing a plan. - - Returned to external planner with status and feedback. - """ - - status: ExecutionStatus - - # Results from completed steps - results: dict[str, Any] = Field(default_factory=dict) - - # For needs_replan - what to tell the planner - feedback: str | None = None - feedback_context: dict[str, Any] = Field(default_factory=dict) - - # Steps that completed before stopping - completed_steps: list[str] = Field(default_factory=list) - - # Metrics - steps_executed: int = 0 - total_tokens: int = 0 - total_latency_ms: int = 0 - - # Error info (for failed status) - error: str | None = None - - model_config = {"extra": "allow"} - - -def load_export(data: str | dict) -> tuple["Plan", Any]: - """ - Load both Plan and Goal from export_graph() output. - - The export_graph() MCP tool returns both the plan and the goal that was - defined and approved during the agent building process. This function - loads both so you can use them with FlexibleGraphExecutor. 
-
-    Args:
-        data: JSON string or dict from export_graph()
-
-    Returns:
-        Tuple of (Plan, Goal) ready for FlexibleGraphExecutor
-
-    Example:
-        # Load from export_graph() output
-        exported = export_graph()
-        plan, goal = load_export(exported)
-
-        result = await executor.execute_plan(plan, goal, context)
-    """
-    import json as json_module
-
-    from framework.graph.goal import Goal
-
-    if isinstance(data, str):
-        data = json_module.loads(data)
-
-    # Load plan
-    plan = Plan.from_json(data)
-
-    # Load goal
-    goal_data = data.get("goal", {})
-    if goal_data:
-        goal = Goal.model_validate(goal_data)
-    else:
-        # Fallback: create minimal goal from plan metadata
-        goal = Goal(
-            id=plan.goal_id,
-            name=plan.goal_id,
-            description=plan.description,
-            success_criteria=[],
-            constraints=[],
-        )
-
-    return plan, goal
diff --git a/core/framework/graph/worker_node.py b/core/framework/graph/worker_node.py
deleted file mode 100644
index fd9f742c..00000000
--- a/core/framework/graph/worker_node.py
+++ /dev/null
@@ -1,620 +0,0 @@
-"""
-Worker Node for Executing Plan Steps.
-
-The Worker executes individual plan steps by dispatching to the
-appropriate executor based on action type:
-- LLM calls
-- Tool usage
-- Sub-graph execution
-- Function calls
-- Code execution (sandboxed)
-"""
-
-import json
-import logging
-import re
-import time
-from collections.abc import Callable
-from dataclasses import dataclass, field
-from typing import Any
-
-from framework.graph.code_sandbox import CodeSandbox
-from framework.graph.plan import (
-    ActionSpec,
-    ActionType,
-    PlanStep,
-)
-from framework.llm.provider import LLMProvider, Tool
-from framework.runtime.core import Runtime
-
-logger = logging.getLogger(__name__)
-
-
-def parse_llm_json_response(text: str) -> tuple[Any | None, str]:
-    """
-    Parse JSON from LLM response, handling markdown code blocks.
-
-    LLMs often return JSON wrapped in markdown code blocks like:
-    ```json
-    {"key": "value"}
-    ```
-
-    This function extracts and parses the JSON.
-
-    Args:
-        text: Raw LLM response text
-
-    Returns:
-        Tuple of (parsed_json_or_None, cleaned_text)
-    """
-    if not isinstance(text, str):
-        return None, str(text)
-
-    cleaned = text.strip()
-
-    # Try to extract JSON from markdown code blocks
-    # Pattern: ```json ... ``` or ``` ... ```
-    code_block_pattern = r"```(?:json)?\s*([\s\S]*?)\s*```"
-    matches = re.findall(code_block_pattern, cleaned)
-
-    if matches:
-        # Try to parse each match
-        for match in matches:
-            try:
-                parsed = json.loads(match.strip())
-                return parsed, match.strip()
-            except json.JSONDecodeError as e:
-                logger.debug(
-                    f"Failed to parse JSON from code block: {e}. "
-                    f"Content preview: {match.strip()[:100]}..."
-                )
-                continue
-
-    # No code blocks or parsing failed - try parsing the whole response
-    try:
-        parsed = json.loads(cleaned)
-        return parsed, cleaned
-    except json.JSONDecodeError as e:
-        logger.debug(
-            f"Failed to parse entire response as JSON: {e}. Content preview: {cleaned[:100]}..."
-        )
-
-    # Try to find JSON-like content (starts with { or [)
-    json_start_pattern = r"(\{[\s\S]*\}|\[[\s\S]*\])"
-    json_matches = re.findall(json_start_pattern, cleaned)
-
-    for match in json_matches:
-        try:
-            parsed = json.loads(match)
-            return parsed, match
-        except json.JSONDecodeError as e:
-            logger.debug(f"Failed to parse JSON pattern: {e}. Content preview: {match[:100]}...")
-            continue
-
-    # Could not parse as JSON - log warning
-    logger.warning(
-        f"Could not parse LLM response as JSON after trying all strategies. "
-        f"Response preview: {cleaned[:200]}..."
-    )
-    return None, cleaned
-
-
-@dataclass
-class StepExecutionResult:
-    """Result of executing a plan step."""
-
-    success: bool
-    outputs: dict[str, Any] = field(default_factory=dict)
-    error: str | None = None
-    error_type: str | None = None  # For judge rules: timeout, rate_limit, etc.
-
-    # Metadata
-    tokens_used: int = 0
-    latency_ms: int = 0
-    executor_type: str = ""
-
-
-class WorkerNode:
-    """
-    Executes plan steps by dispatching to appropriate executors.
-
-    Usage:
-        worker = WorkerNode(
-            runtime=runtime,
-            llm=llm_provider,
-            tools=tool_registry,
-        )
-
-        result = await worker.execute(step, context)
-    """
-
-    def __init__(
-        self,
-        runtime: Runtime,
-        llm: LLMProvider | None = None,
-        tools: dict[str, Tool] | None = None,
-        tool_executor: Callable | None = None,
-        functions: dict[str, Callable] | None = None,
-        sub_graph_executor: Callable | None = None,
-        sandbox: CodeSandbox | None = None,
-    ):
-        """
-        Initialize the Worker.
-
-        Args:
-            runtime: Runtime for decision logging
-            llm: LLM provider for LLM_CALL actions
-            tools: Available tools for TOOL_USE actions
-            tool_executor: Function to execute tools
-            functions: Registered functions for FUNCTION actions
-            sub_graph_executor: Function to execute sub-graphs
-            sandbox: Code sandbox for CODE_EXECUTION actions
-        """
-        self.runtime = runtime
-        self.llm = llm
-        self.tools = tools or {}
-        self.tool_executor = tool_executor
-        self.functions = functions or {}
-        self.sub_graph_executor = sub_graph_executor
-        self.sandbox = sandbox or CodeSandbox()
-
-    async def execute(
-        self,
-        step: PlanStep,
-        context: dict[str, Any],
-    ) -> StepExecutionResult:
-        """
-        Execute a plan step.
-
-        Args:
-            step: The step to execute
-            context: Current execution context
-
-        Returns:
-            StepExecutionResult with outputs and status
-        """
-        # Record decision
-        decision_id = self.runtime.decide(
-            intent=f"Execute plan step: {step.description}",
-            options=[
-                {
-                    "id": step.action.action_type.value,
-                    "description": f"Execute {step.action.action_type.value} action",
-                    "action_type": step.action.action_type.value,
-                }
-            ],
-            chosen=step.action.action_type.value,
-            reasoning=f"Step requires {step.action.action_type.value}",
-            context={"step_id": step.id, "inputs": step.inputs},
-        )
-
-        start_time = time.time()
-
-        try:
-            # Resolve inputs from context
-            resolved_inputs = self._resolve_inputs(step.inputs, context)
-
-            # Dispatch to appropriate executor
-            result = await self._dispatch(step.action, resolved_inputs, context)
-
-            latency_ms = int((time.time() - start_time) * 1000)
-            result.latency_ms = latency_ms
-
-            # Record outcome
-            self.runtime.record_outcome(
-                decision_id=decision_id,
-                success=result.success,
-                result=result.outputs if result.success else result.error,
-                tokens_used=result.tokens_used,
-                latency_ms=latency_ms,
-            )
-
-            return result
-
-        except Exception as e:
-            latency_ms = int((time.time() - start_time) * 1000)
-
-            self.runtime.record_outcome(
-                decision_id=decision_id,
-                success=False,
-                error=str(e),
-                latency_ms=latency_ms,
-            )
-
-            return StepExecutionResult(
-                success=False,
-                error=str(e),
-                error_type="exception",
-                latency_ms=latency_ms,
-            )
-
-    def _resolve_inputs(
-        self,
-        inputs: dict[str, Any],
-        context: dict[str, Any],
-    ) -> dict[str, Any]:
-        """Resolve input references from context."""
-        resolved = {}
-
-        for key, value in inputs.items():
-            if isinstance(value, str) and value.startswith("$"):
-                # Reference to context variable
-                ref_key = value[1:]  # Remove $
-                resolved[key] = context.get(ref_key, value)
-            else:
-                resolved[key] = value
-
-        return resolved
-
-    async def _dispatch(
-        self,
-        action: ActionSpec,
-        inputs: dict[str, Any],
-        context: dict[str, Any],
-    ) -> StepExecutionResult:
-        """Dispatch to appropriate executor based on action type."""
-        if action.action_type == ActionType.LLM_CALL:
-            return await self._execute_llm_call(action, inputs, context)
-
-        elif action.action_type == ActionType.TOOL_USE:
-            return await self._execute_tool_use(action, inputs)
-
-        elif action.action_type == ActionType.SUB_GRAPH:
-            return await self._execute_sub_graph(action, inputs, context)
-
-        elif action.action_type == ActionType.FUNCTION:
-            return await self._execute_function(action, inputs)
-
-        elif action.action_type == ActionType.CODE_EXECUTION:
-            return self._execute_code(action, inputs, context)
-
-        else:
-            return StepExecutionResult(
-                success=False,
-                error=f"Unknown action type: {action.action_type}",
-                error_type="invalid_action",
-            )
-
-    async def _execute_llm_call(
-        self,
-        action: ActionSpec,
-        inputs: dict[str, Any],
-        context: dict[str, Any],
-    ) -> StepExecutionResult:
-        """Execute an LLM call action."""
-        if self.llm is None:
-            return StepExecutionResult(
-                success=False,
-                error="No LLM provider configured",
-                error_type="configuration",
-                executor_type="llm_call",
-            )
-
-        try:
-            # Build prompt with context data
-            prompt = action.prompt or ""
-
-            # First try format placeholders (for prompts like "Hello {name}")
-            if inputs:
-                try:
-                    prompt = prompt.format(**inputs)
-                except (KeyError, ValueError):
-                    pass  # Keep original prompt if formatting fails
-
-            # Always append context data so LLM can personalize
-            # This ensures the LLM has access to lead info, company context, etc.
-            if inputs:
-                context_section = "\n\n--- Context Data ---\n"
-                for key, value in inputs.items():
-                    if isinstance(value, dict | list):
-                        context_section += f"{key}: {json.dumps(value, indent=2)}\n"
-                    else:
-                        context_section += f"{key}: {value}\n"
-                prompt = prompt + context_section
-
-            messages = [{"role": "user", "content": prompt}]
-
-            response = await self.llm.acomplete(
-                messages=messages,
-                system=action.system_prompt,
-            )
-
-            # Try to parse JSON from LLM response
-            # LLMs often return JSON wrapped in markdown code blocks
-            parsed_json, _ = parse_llm_json_response(response.content)
-
-            # If JSON was parsed successfully, use it as the result
-            # Otherwise, use the raw text
-            result_value = parsed_json if parsed_json is not None else response.content
-
-            return StepExecutionResult(
-                success=True,
-                outputs={
-                    "result": result_value,
-                    "response": response.content,  # Always keep raw response
-                    "parsed_json": parsed_json,  # Explicit parsed JSON (or None)
-                },
-                tokens_used=response.input_tokens + response.output_tokens,
-                executor_type="llm_call",
-            )
-
-        except Exception as e:
-            error_type = "rate_limit" if "rate" in str(e).lower() else "llm_error"
-            return StepExecutionResult(
-                success=False,
-                error=str(e),
-                error_type=error_type,
-                executor_type="llm_call",
-            )
-
-    async def _execute_tool_use(
-        self,
-        action: ActionSpec,
-        inputs: dict[str, Any],
-    ) -> StepExecutionResult:
-        """Execute a tool use action."""
-        tool_name = action.tool_name
-        if not tool_name:
-            return StepExecutionResult(
-                success=False,
-                error="No tool name specified",
-                error_type="invalid_action",
-                executor_type="tool_use",
-            )
-
-        # Merge action args with inputs
-        args = {**action.tool_args, **inputs}
-
-        # Resolve any $variable references in the merged args
-        # (tool_args may contain $refs that should be resolved from inputs)
-        resolved_args = {}
-        for key, value in args.items():
-            if isinstance(value, str) and value.startswith("$"):
-                ref_key = value[1:]  # Remove $
-                resolved_args[key] = args.get(ref_key, value)
-            else:
-                resolved_args[key] = value
-        args = resolved_args
-
-        # First, check if we have a registered function with this name
-        # This allows simpler tool registration without full Tool/ToolExecutor setup
-        if tool_name in self.functions:
-            try:
-                func = self.functions[tool_name]
-                result = func(**args)
-
-                # Handle async functions
-                if hasattr(result, "__await__"):
-                    result = await result
-
-                # If result is already a dict with success/outputs, use it directly
-                if isinstance(result, dict) and "success" in result:
-                    return StepExecutionResult(
-                        success=result.get("success", False),
-                        outputs=result.get("outputs", {}),
-                        error=result.get("error"),
-                        error_type=result.get("error_type"),
-                        executor_type="tool_use",
-                    )
-
-                # Otherwise wrap the result
-                return StepExecutionResult(
-                    success=True,
-                    outputs={"result": result},
-                    executor_type="tool_use",
-                )
-
-            except Exception as e:
-                return StepExecutionResult(
-                    success=False,
-                    error=str(e),
-                    error_type="tool_exception",
-                    executor_type="tool_use",
-                )
-
-        # Fall back to formal Tool registry
-        if tool_name not in self.tools:
-            return StepExecutionResult(
-                success=False,
-                error=f"Tool '{tool_name}' not found",
-                error_type="missing_tool",
-                executor_type="tool_use",
-            )
-
-        if self.tool_executor is None:
-            return StepExecutionResult(
-                success=False,
-                error="No tool executor configured",
-                error_type="configuration",
-                executor_type="tool_use",
-            )
-
-        try:
-            # Execute tool via formal executor
-            from framework.llm.provider import ToolUse
-
-            tool_use = ToolUse(
-                id=f"step_{tool_name}",
-                name=tool_name,
-                input=args,
-            )
-
-            result = self.tool_executor(tool_use)
-
-            if result.is_error:
-                return StepExecutionResult(
-                    success=False,
-                    outputs={},
-                    error=result.content,
-                    error_type="tool_error",
-                    executor_type="tool_use",
-                )
-
-            # Parse JSON result and unpack fields into outputs
-            # Tools return JSON like {"lead_email": "...", "company_name": "..."}
-            # We want each field as a separate output key
-            outputs = {"result": result.content}
-            try:
-                parsed = json.loads(result.content)
-                if isinstance(parsed, dict):
-                    # Unpack all fields from the JSON response
-                    outputs.update(parsed)
-            except (json.JSONDecodeError, TypeError):
-                pass  # Keep result as-is if not valid JSON
-
-            return StepExecutionResult(
-                success=True,
-                outputs=outputs,
-                executor_type="tool_use",
-            )
-
-        except Exception as e:
-            return StepExecutionResult(
-                success=False,
-                error=str(e),
-                error_type="tool_exception",
-                executor_type="tool_use",
-            )
-
-    async def _execute_sub_graph(
-        self,
-        action: ActionSpec,
-        inputs: dict[str, Any],
-        context: dict[str, Any],
-    ) -> StepExecutionResult:
-        """Execute a sub-graph action."""
-        if self.sub_graph_executor is None:
-            return StepExecutionResult(
-                success=False,
-                error="No sub-graph executor configured",
-                error_type="configuration",
-                executor_type="sub_graph",
-            )
-
-        graph_id = action.graph_id
-        if not graph_id:
-            return StepExecutionResult(
-                success=False,
-                error="No graph ID specified",
-                error_type="invalid_action",
-                executor_type="sub_graph",
-            )
-
-        try:
-            result = await self.sub_graph_executor(graph_id, inputs, context)
-
-            return StepExecutionResult(
-                success=result.success,
-                outputs=result.output if result.success else {},
-                error=result.error if not result.success else None,
-                tokens_used=result.total_tokens,
-                executor_type="sub_graph",
-            )
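-            # Assumed contract (sketch, inferred from the call above):
-            # sub_graph_executor is an async callable taking
-            # (graph_id, inputs, context) and returning an object exposing
-            # .success, .output, .error and .total_tokens.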
-
-        except Exception as e:
-            return StepExecutionResult(
-                success=False,
-                error=str(e),
-                error_type="sub_graph_exception",
-                executor_type="sub_graph",
-            )
-
-    async def _execute_function(
-        self,
-        action: ActionSpec,
-        inputs: dict[str, Any],
-    ) -> StepExecutionResult:
-        """Execute a function action."""
-        func_name = action.function_name
-        if not func_name:
-            return StepExecutionResult(
-                success=False,
-                error="No function name specified",
-                error_type="invalid_action",
-                executor_type="function",
-            )
-
-        if func_name not in self.functions:
-            return StepExecutionResult(
-                success=False,
-                error=f"Function '{func_name}' not registered",
-                error_type="missing_function",
-                executor_type="function",
-            )
-
-        try:
-            func = self.functions[func_name]
-
-            # Merge action args with inputs
-            args = {**action.function_args, **inputs}
-
-            # Execute function
-            result = func(**args)
-
-            # Handle async functions
-            if hasattr(result, "__await__"):
-                result = await result
-
-            return StepExecutionResult(
-                success=True,
-                outputs={"result": result},
-                executor_type="function",
-            )
-
-        except Exception as e:
-            return StepExecutionResult(
-                success=False,
-                error=str(e),
-                error_type="function_exception",
-                executor_type="function",
-            )
-
-    def _execute_code(
-        self,
-        action: ActionSpec,
-        inputs: dict[str, Any],
-        context: dict[str, Any],
-    ) -> StepExecutionResult:
-        """Execute a code action in sandbox."""
-        code = action.code
-        if not code:
-            return StepExecutionResult(
-                success=False,
-                error="No code specified",
-                error_type="invalid_action",
-                executor_type="code_execution",
-            )
-
-        # Merge inputs with context for code
-        code_inputs = {**context, **inputs}
-
-        # Execute in sandbox
-        sandbox_result = self.sandbox.execute(code, code_inputs)
-
-        if sandbox_result.success:
-            return StepExecutionResult(
-                success=True,
-                outputs={
-                    "result": sandbox_result.result,
-                    **sandbox_result.variables,
-                },
-                executor_type="code_execution",
-                latency_ms=sandbox_result.execution_time_ms,
-            )
-        else:
-            error_type = "security" if "Security" in (sandbox_result.error or "") else "code_error"
-            return StepExecutionResult(
-                success=False,
-                error=sandbox_result.error,
-                error_type=error_type,
-                executor_type="code_execution",
-                latency_ms=sandbox_result.execution_time_ms,
-            )
-
-    def register_function(self, name: str, func: Callable) -> None:
-        """Register a function for FUNCTION actions."""
-        self.functions[name] = func
-
-    def register_tool(self, tool: Tool) -> None:
-        """Register a tool for TOOL_USE actions."""
-        self.tools[tool.name] = tool
diff --git a/core/framework/mcp/agent_builder_server.py b/core/framework/mcp/agent_builder_server.py
index fdd393b0..b512efd2 100644
--- a/core/framework/mcp/agent_builder_server.py
+++ b/core/framework/mcp/agent_builder_server.py
@@ -36,8 +36,6 @@ from framework.graph import (  # noqa: E402
     NodeSpec,
     SuccessCriterion,
 )
-from framework.graph.plan import Plan  # noqa: E402
-
 # Testing framework imports
 from framework.testing.prompts import (  # noqa: E402
     PYTEST_TEST_FILE_HEADER,
@@ -587,13 +585,12 @@ def add_node(
     description: Annotated[str, "What this node does"],
     node_type: Annotated[
         str,
-        "Type: event_loop (recommended), function, router. "
-        "Deprecated: llm_generate, llm_tool_use (use event_loop instead)",
+        "Type: event_loop (recommended), router.",
     ],
     input_keys: Annotated[str, "JSON array of keys this node reads from shared memory"],
     output_keys: Annotated[str, "JSON array of keys this node writes to shared memory"],
     system_prompt: Annotated[str, "Instructions for LLM nodes"] = "",
-    tools: Annotated[str, "JSON array of tool names for event_loop or llm_tool_use nodes"] = "[]",
+    tools: Annotated[str, "JSON array of tool names for event_loop nodes"] = "[]",
     routes: Annotated[
         str, "JSON object mapping conditions to target node IDs for router nodes"
     ] = "{}",
@@ -665,24 +662,20 @@ def add_node(
         errors.append("Node must have an id")
     if not name:
         errors.append("Node must have a name")
-    if node_type == "llm_tool_use" and not tools_list:
-        errors.append(f"Node '{node_id}' of type llm_tool_use must specify tools")
+
+    # Reject removed node types
+    if node_type in ("function", "llm_tool_use", "llm_generate"):
+        errors.append(
+            f"Node type '{node_type}' is no longer supported. Use 'event_loop' instead."
+        )
+
     if node_type == "router" and not routes_dict:
         errors.append(f"Router node '{node_id}' must specify routes")
-    if node_type in ("llm_generate", "llm_tool_use") and not system_prompt:
-        warnings.append(f"LLM node '{node_id}' should have a system_prompt")

     # EventLoopNode validation
     if node_type == "event_loop" and not system_prompt:
         warnings.append(f"Event loop node '{node_id}' should have a system_prompt")

-    # Deprecated type warnings
-    if node_type in ("llm_generate", "llm_tool_use"):
-        warnings.append(
-            f"Node type '{node_type}' is deprecated. Use 'event_loop' instead. "
-            "EventLoopNode supports tool use, streaming, and judge-based evaluation."
-        )
-
     # Warn about client_facing on nodes with tools (likely autonomous work)
     if node_type == "event_loop" and client_facing and tools_list:
         warnings.append(
@@ -838,8 +831,7 @@ def update_node(
     description: Annotated[str, "Updated description"] = "",
     node_type: Annotated[
         str,
-        "Updated type: event_loop (recommended), function, router. "
-        "Deprecated: llm_generate, llm_tool_use",
+        "Updated type: event_loop (recommended), router.",
     ] = "",
     input_keys: Annotated[str, "Updated JSON array of input keys"] = "",
     output_keys: Annotated[str, "Updated JSON array of output keys"] = "",
@@ -919,24 +911,19 @@ def update_node(
     errors = []
     warnings = []

-    if node.node_type == "llm_tool_use" and not node.tools:
-        errors.append(f"Node '{node_id}' of type llm_tool_use must specify tools")
+    # Reject removed node types
+    if node.node_type in ("function", "llm_tool_use", "llm_generate"):
+        errors.append(
+            f"Node type '{node.node_type}' is no longer supported. Use 'event_loop' instead."
+        )
+
     if node.node_type == "router" and not node.routes:
         errors.append(f"Router node '{node_id}' must specify routes")
-    if node.node_type in ("llm_generate", "llm_tool_use") and not node.system_prompt:
-        warnings.append(f"LLM node '{node_id}' should have a system_prompt")

     # EventLoopNode validation
     if node.node_type == "event_loop" and not node.system_prompt:
         warnings.append(f"Event loop node '{node_id}' should have a system_prompt")

-    # Deprecated type warnings
-    if node.node_type in ("llm_generate", "llm_tool_use"):
-        warnings.append(
-            f"Node type '{node.node_type}' is deprecated. Use 'event_loop' instead. "
-            "EventLoopNode supports tool use, streaming, and judge-based evaluation."
-        )
-
     # nullable_output_keys must be a subset of output_keys
     if node.nullable_output_keys:
         invalid_nullable = [k for k in node.nullable_output_keys if k not in node.output_keys]
@@ -1390,16 +1377,6 @@ def validate_graph() -> str:
                     f"must be a subset of output_keys {node.output_keys}"
                 )

-    # Deprecated node type warnings
-    deprecated_nodes = [
-        {"node_id": n.id, "type": n.node_type, "replacement": "event_loop"}
-        for n in session.nodes
-        if n.node_type in ("llm_generate", "llm_tool_use")
-    ]
-    for dn in deprecated_nodes:
-        warnings.append(
-            f"Node '{dn['node_id']}' uses deprecated type '{dn['type']}'. Use 'event_loop' instead."
-        )

     # Warn if all event_loop nodes are client_facing (common misconfiguration)
     el_nodes = [n for n in session.nodes if n.node_type == "event_loop"]
@@ -1646,9 +1623,8 @@ def export_graph() -> str:
     """
     Export the validated graph as a GraphSpec for GraphExecutor.

-    Exports the complete agent definition including nodes, edges, goal,
-    and evaluation rules. The GraphExecutor runs the graph with dynamic
-    edge traversal and routing logic.
+    Exports the complete agent definition including nodes, edges, and goal.
+    The GraphExecutor runs the graph with dynamic edge traversal and routing logic.

     AUTOMATICALLY WRITES FILES TO DISK:
     - exports/{agent-name}/agent.json - Full agent specification
@@ -1856,7 +1832,6 @@ def export_graph() -> str:
             "files_written": files_written,
             "graph": graph_spec,
             "goal": session.goal.model_dump(),
-            "evaluation_rules": _evaluation_rules,
             "required_tools": list(all_tools),
             "node_count": len(session.nodes),
             "edge_count": len(edges_list),
@@ -1966,9 +1941,6 @@ def get_session_status() -> str:
             "mcp_servers": [s["name"] for s in session.mcp_servers],
             "event_loop_nodes": [n.id for n in session.nodes if n.node_type == "event_loop"],
             "client_facing_nodes": [n.id for n in session.nodes if n.client_facing],
-            "deprecated_nodes": [
-                n.id for n in session.nodes if n.node_type in ("llm_generate", "llm_tool_use")
-            ],
             "feedback_edges": [e.id for e in session.edges if e.priority < 0],
         }
     )
@@ -2139,7 +2111,7 @@ def add_mcp_server(
             "total_mcp_servers": len(session.mcp_servers),
             "note": (
                 f"MCP server '{name}' registered with {len(tool_names)} tools. "
-                "These tools can now be used in llm_tool_use nodes."
+                "These tools can now be used in event_loop nodes."
             ),
         },
         indent=2,
@@ -2240,7 +2212,7 @@ def list_mcp_tools(
             "success": True,
             "tools_by_server": all_tools,
             "total_tools": total_tools,
-            "note": "Use these tool names in the 'tools' parameter when adding llm_tool_use nodes",
+            "note": "Use these tool names in the 'tools' parameter when adding event_loop nodes",
         },
         indent=2,
    )
@@ -2339,23 +2311,6 @@ def test_node(
             + f"Max visits per graph run: {node_spec.max_node_visits}."
         )

-    elif node_spec.node_type in ("llm_generate", "llm_tool_use"):
-        # Legacy LLM node types
-        result["system_prompt"] = node_spec.system_prompt
-        result["available_tools"] = node_spec.tools
-        result["deprecation_warning"] = (
-            f"Node type '{node_spec.node_type}' is deprecated. Use 'event_loop' instead."
-        )
-
-        if mock_llm_response:
-            result["mock_response"] = mock_llm_response
-            result["simulation"] = "LLM would receive prompt and produce response"
-        else:
-            result["simulation"] = "LLM would be called with the system prompt and input data"
-
-    elif node_spec.node_type == "function":
-        result["simulation"] = "Function node would execute deterministic logic"
-
     # Show memory state after (simulated)
     result["expected_memory_state"] = {
         "inputs_available": {k: input_data.get(k, "") for k in node_spec.input_keys},
@@ -2449,7 +2404,7 @@ def test_graph(
                 "writes": current_node.output_keys,
             }

-            if current_node.node_type in ("llm_generate", "llm_tool_use", "event_loop"):
+            if current_node.node_type == "event_loop":
                 step_info["prompt_preview"] = (
                     current_node.system_prompt[:200] + "..."
                     if current_node.system_prompt and len(current_node.system_prompt) > 200
@@ -2520,466 +2475,6 @@ def test_graph(
     )


-# =============================================================================
-# FLEXIBLE EXECUTION TOOLS (Worker-Judge Pattern)
-# =============================================================================
-
-# Storage for evaluation rules
-_evaluation_rules: list[dict] = []
-
-
-@mcp.tool()
-def add_evaluation_rule(
-    rule_id: Annotated[str, "Unique identifier for the rule"],
-    description: Annotated[str, "Human-readable description of what this rule checks"],
-    condition: Annotated[
-        str,
-        "Python expression with result, step, goal context. E.g., 'result.get(\"success\")'",
-    ],
-    action: Annotated[str, "Action when rule matches: accept, retry, replan, escalate"],
-    feedback_template: Annotated[
-        str, "Template for feedback message, can use {result}, {step}"
-    ] = "",
-    priority: Annotated[int, "Rule priority (higher = checked first)"] = 0,
-) -> str:
-    """
-    Add an evaluation rule for the HybridJudge.
-
-    Rules are checked in priority order before falling back to LLM evaluation.
-    Use this to define deterministic success/failure conditions.
-
-    Example conditions:
-    - 'result.get("success") == True' - Check for explicit success flag
-    - 'result.get("error_type") == "timeout"' - Check for specific error type
-    - 'len(result.get("data", [])) > 0' - Check for non-empty data
-    """
-    global _evaluation_rules
-
-    # Validate action
-    valid_actions = ["accept", "retry", "replan", "escalate"]
-    if action.lower() not in valid_actions:
-        return json.dumps(
-            {
-                "success": False,
-                "error": f"Invalid action '{action}'. Must be one of: {valid_actions}",
-            }
-        )
-
-    # Check for duplicate
-    if any(r["id"] == rule_id for r in _evaluation_rules):
-        return json.dumps(
-            {
-                "success": False,
-                "error": f"Rule '{rule_id}' already exists",
-            }
-        )
-
-    rule = {
-        "id": rule_id,
-        "description": description,
-        "condition": condition,
-        "action": action.lower(),
-        "feedback_template": feedback_template,
-        "priority": priority,
-    }
-
-    _evaluation_rules.append(rule)
-    _evaluation_rules.sort(key=lambda r: -r["priority"])
-
-    return json.dumps(
-        {
-            "success": True,
-            "rule": rule,
-            "total_rules": len(_evaluation_rules),
-        }
-    )
-
-
-@mcp.tool()
-def list_evaluation_rules() -> str:
-    """List all configured evaluation rules for the HybridJudge."""
-    return json.dumps(
-        {
-            "rules": _evaluation_rules,
-            "total": len(_evaluation_rules),
-        }
-    )
-
-
-@mcp.tool()
-def remove_evaluation_rule(
-    rule_id: Annotated[str, "ID of the rule to remove"],
-) -> str:
-    """Remove an evaluation rule."""
-    global _evaluation_rules
-
-    for i, rule in enumerate(_evaluation_rules):
-        if rule["id"] == rule_id:
-            _evaluation_rules.pop(i)
-            return json.dumps({"success": True, "removed": rule_id})
-
-    return json.dumps({"success": False, "error": f"Rule '{rule_id}' not found"})
-
-
-@mcp.tool()
-def create_plan(
-    plan_id: Annotated[str, "Unique identifier for the plan"],
-    goal_id: Annotated[str, "ID of the goal this plan achieves"],
-    description: Annotated[str, "Description of what this plan does"],
-    steps: Annotated[
-        str,
-        "JSON array of plan steps with id, description, action, inputs, outputs, deps",
-    ],
-    context: Annotated[str, "JSON object with initial context for execution"] = "{}",
-) -> str:
-    """
-    Create a plan for flexible execution.
-
-    Plans are executed by the Worker-Judge loop. Each step specifies:
-    - id: Unique step identifier
-    - description: What this step does
-    - action: Object with action_type and parameters
-      - action_type: "llm_call", "tool_use", "function", "code_execution", "sub_graph"
-      - For llm_call: prompt, system_prompt
-      - For tool_use: tool_name, tool_args
-      - For function: function_name, function_args
-      - For code_execution: code
-    - inputs: Dict mapping input names to values or "$variable" references
-    - expected_outputs: List of output keys this step should produce
-    - dependencies: List of step IDs that must complete first (deps)
-
-    Example step:
-        {
-            "id": "step_1",
-            "description": "Fetch user data",
-            "action": {"action_type": "tool_use", "tool_name": "get_user", ...},
-            "inputs": {"user_id": "$input_user_id"},
-            "expected_outputs": ["user_data"],
-            "dependencies": []
-        }
-    """
-    try:
-        steps_list = json.loads(steps)
-        context_dict = json.loads(context)
-    except json.JSONDecodeError as e:
-        return json.dumps({"success": False, "error": f"Invalid JSON: {e}"})
-
-    # Validate steps
-    errors = []
-    step_ids = set()
-
-    for i, step in enumerate(steps_list):
-        if "id" not in step:
-            errors.append(f"Step {i} missing 'id'")
-        else:
-            if step["id"] in step_ids:
-                errors.append(f"Duplicate step id: {step['id']}")
-            step_ids.add(step["id"])
-
-        if "description" not in step:
-            errors.append(f"Step {i} missing 'description'")
-
-        if "action" not in step:
-            errors.append(f"Step {i} missing 'action'")
-        elif "action_type" not in step.get("action", {}):
-            errors.append(f"Step {i} action missing 'action_type'")
-
-        # Check dependencies exist
-        for dep in step.get("dependencies", []):
-            if dep not in step_ids:
-                errors.append(f"Step {step.get('id', i)} has unknown dependency: {dep}")
-
-    if errors:
-        return json.dumps({"success": False, "errors": errors})
-
-    # Build plan object
-    plan = {
-        "id": plan_id,
-        "goal_id": goal_id,
-        "description": description,
-        "steps": steps_list,
-        "context": context_dict,
-        "revision": 1,
-        "created_at": datetime.now().isoformat(),
-    }
-
-    return json.dumps(
-        {
-            "success": True,
-            "plan": plan,
-            "step_count": len(steps_list),
-            "note": "Plan created. Use execute_plan to run it with the Worker-Judge loop.",
-        },
-        indent=2,
-    )
-
-
-@mcp.tool()
-def validate_plan(
-    plan_json: Annotated[str, "JSON string of the plan to validate"],
-) -> str:
-    """
-    Validate a plan structure before execution.
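-
-    Example (sketch): a plan where step_a lists step_b as a dependency and
-    step_b lists step_a is rejected with a circular-dependency error; an
-    input value "$user_data" resolves only if "user_data" is in the initial
-    context or in some dependency's expected_outputs.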
-
-    Checks:
-    - All required fields present
-    - No circular dependencies
-    - All dependencies reference existing steps
-    - Action types are valid
-    - Context flow: all $variable references can be resolved
-    """
-    try:
-        plan = json.loads(plan_json)
-    except json.JSONDecodeError as e:
-        return json.dumps({"valid": False, "errors": [f"Invalid JSON: {e}"]})
-
-    errors = []
-    warnings = []
-
-    # Check required fields
-    required = ["id", "goal_id", "steps"]
-    for field in required:
-        if field not in plan:
-            errors.append(f"Missing required field: {field}")
-
-    if "steps" not in plan:
-        return json.dumps({"valid": False, "errors": errors})
-
-    steps = plan["steps"]
-    step_ids = {s.get("id") for s in steps if "id" in s}
-    steps_by_id = {s.get("id"): s for s in steps}
-
-    # Check each step
-    valid_action_types = ["llm_call", "tool_use", "function", "code_execution", "sub_graph"]
-
-    for i, step in enumerate(steps):
-        step_id = step.get("id", f"step_{i}")
-
-        # Check dependencies
-        for dep in step.get("dependencies", []):
-            if dep not in step_ids:
-                errors.append(f"Step '{step_id}': unknown dependency '{dep}'")
-
-        # Check action type
-        action = step.get("action", {})
-        action_type = action.get("action_type")
-        if action_type and action_type not in valid_action_types:
-            errors.append(f"Step '{step_id}': invalid action_type '{action_type}'")
-
-        # Check action has required params
-        if action_type == "llm_call" and not action.get("prompt"):
-            warnings.append(f"Step '{step_id}': llm_call without prompt")
-        if action_type == "tool_use" and not action.get("tool_name"):
-            errors.append(f"Step '{step_id}': tool_use requires tool_name")
-        if action_type == "code_execution" and not action.get("code"):
-            errors.append(f"Step '{step_id}': code_execution requires code")
-
-    # Check for circular dependencies
-    def has_cycle(step_id: str, visited: set, path: set) -> bool:
-        if step_id in path:
-            return True
-        if step_id in visited:
-            return False
-
-        visited.add(step_id)
-        path.add(step_id)
-
-        step = next((s for s in steps if s.get("id") == step_id), None)
-        if step:
-            for dep in step.get("dependencies", []):
-                if has_cycle(dep, visited, path):
-                    return True
-
-        path.remove(step_id)
-        return False
-
-    for step in steps:
-        if has_cycle(step.get("id", ""), set(), set()):
-            errors.append(f"Circular dependency detected involving step '{step.get('id')}'")
-            break
-
-    # === CONTEXT FLOW VALIDATION ===
-    # Compute what keys each step can access (from dependencies' outputs)
-
-    # Build output map (step_id -> expected_outputs)
-    step_outputs: dict[str, set[str]] = {}
-    for step in steps:
-        step_outputs[step.get("id", "")] = set(step.get("expected_outputs", []))
-
-    # Compute available context for each step in topological order
-    available_context: dict[str, set[str]] = {}
-    computed = set()
-    remaining = set(step_ids)
-
-    # Get initial context keys from plan.context
-    initial_context = set(plan.get("context", {}).keys())
-
-    for _ in range(len(steps) * 2):
-        if not remaining:
-            break
-
-        for step_id in list(remaining):
-            step = steps_by_id.get(step_id)
-            if not step:
-                remaining.discard(step_id)
-                continue
-
-            deps = step.get("dependencies", [])
-
-            # Can compute if all dependencies are computed
-            if all(d in computed for d in deps):
-                # Collect outputs from all dependencies (transitive)
-                available = set(initial_context)
-                for dep_id in deps:
-                    available.update(step_outputs.get(dep_id, set()))
-                    available.update(available_context.get(dep_id, set()))

-                available_context[step_id] = available
-                computed.add(step_id)
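-                # Note: the break below restarts the scan after each newly
-                # computed step so later steps see freshly available context;
-                # the len(steps) * 2 bound above caps the number of passes.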
-                remaining.discard(step_id)
-                break
-
-    # Check each step's inputs can be resolved
-    context_errors = []
-    context_warnings = []
-
-    for step in steps:
-        step_id = step.get("id", "")
-        available = available_context.get(step_id, set())
-        deps = step.get("dependencies", [])
-        inputs = step.get("inputs", {})
-
-        missing_vars = []
-        for _, input_value in inputs.items():
-            # Check $variable references
-            if isinstance(input_value, str) and input_value.startswith("$"):
-                var_name = input_value[1:]  # Remove $ prefix
-                if var_name not in available:
-                    missing_vars.append(var_name)
-
-        if missing_vars:
-            if not deps:
-                # Entry step - inputs must come from initial context
-                context_warnings.append(
-                    f"Step '{step_id}' requires ${missing_vars} from initial context. "
-                    f"Ensure these are provided when running the agent: {missing_vars}"
-                )
-            else:
-                # Find which step could provide each missing var
-                suggestions = []
-                for var in missing_vars:
-                    producers = [s.get("id") for s in steps if var in s.get("expected_outputs", [])]
-                    if producers:
-                        suggestions.append(f"${var} is produced by {producers} - add as dependency")
-                    else:
-                        suggestions.append(
-                            f"${var} is not produced by any step - add a step that outputs '{var}'"
-                        )
-
-                context_errors.append(
-                    f"Step '{step_id}' references ${missing_vars} but deps "
-                    f"{deps} don't provide them. Suggestions: {'; '.join(suggestions)}"
-                )
-
-    errors.extend(context_errors)
-    warnings.extend(context_warnings)
-
-    return json.dumps(
-        {
-            "valid": len(errors) == 0,
-            "errors": errors,
-            "warnings": warnings,
-            "step_count": len(steps),
-            "context_flow": {step_id: list(keys) for step_id, keys in available_context.items()}
-            if available_context
-            else None,
-        }
-    )
-
-
-@mcp.tool()
-def simulate_plan_execution(
-    plan_json: Annotated[str, "JSON string of the plan to simulate"],
-    max_steps: Annotated[int, "Maximum steps to simulate"] = 20,
-) -> str:
-    """
-    Simulate plan execution without actually running it.
-
-    Shows the order steps would execute based on dependencies.
-    Useful for understanding the execution flow before running.
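-
-    Example (sketch): given steps fetch -> analyze -> report plus an
-    independent step audit, listed in that order, the simulated order is
-    fetch, analyze, report, then audit, with audit reported under
-    parallel_candidates while other steps are ready alongside it.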
- """ - try: - plan = json.loads(plan_json) - except json.JSONDecodeError as e: - return json.dumps({"success": False, "error": f"Invalid JSON: {e}"}) - - # Validate first - validation = json.loads(validate_plan(plan_json)) - if not validation["valid"]: - return json.dumps( - { - "success": False, - "error": "Plan is not valid", - "validation_errors": validation["errors"], - } - ) - - steps = plan.get("steps", []) - completed = set() - execution_order = [] - iteration = 0 - - while len(completed) < len(steps) and iteration < max_steps: - iteration += 1 - - # Find ready steps - ready = [] - for step in steps: - step_id = step.get("id") - if step_id in completed: - continue - deps = set(step.get("dependencies", [])) - if deps.issubset(completed): - ready.append(step) - - if not ready: - break - - # Execute first ready step (in real execution, could be parallel) - step = ready[0] - step_id = step.get("id") - - execution_order.append( - { - "iteration": iteration, - "step_id": step_id, - "description": step.get("description"), - "action_type": step.get("action", {}).get("action_type"), - "dependencies_met": list(step.get("dependencies", [])), - "parallel_candidates": [s.get("id") for s in ready[1:]], - } - ) - - completed.add(step_id) - - remaining = [s.get("id") for s in steps if s.get("id") not in completed] - - return json.dumps( - { - "success": True, - "execution_order": execution_order, - "steps_simulated": len(execution_order), - "remaining_steps": remaining, - "plan_complete": len(remaining) == 0, - "note": ( - "This is a simulation. Actual execution may differ " - "based on step results and judge decisions." - ), - }, - indent=2, - ) - - # ============================================================================= # TESTING TOOLS (Goal-Based Evaluation) # ============================================================================= @@ -3713,60 +3208,6 @@ def list_tests( ) -# ============================================================================= -# PLAN LOADING AND EXECUTION -# ============================================================================= - - -def load_plan_from_json(plan_json: str | dict) -> Plan: - """ - Load a Plan object from exported JSON. - - Args: - plan_json: JSON string or dict from export_graph() - - Returns: - Plan object ready for FlexibleGraphExecutor - """ - from framework.graph.plan import Plan - - return Plan.from_json(plan_json) - - -@mcp.tool() -def load_exported_plan( - plan_json: Annotated[str, "JSON string from export_graph() output"], -) -> str: - """ - Validate and load an exported plan, returning its structure. - - Use this to verify a plan can be loaded before execution. 
-    """
-    try:
-        plan = load_plan_from_json(plan_json)
-        return json.dumps(
-            {
-                "success": True,
-                "plan_id": plan.id,
-                "goal_id": plan.goal_id,
-                "description": plan.description,
-                "step_count": len(plan.steps),
-                "steps": [
-                    {
-                        "id": s.id,
-                        "description": s.description,
-                        "action_type": s.action.action_type.value,
-                        "dependencies": s.dependencies,
-                    }
-                    for s in plan.steps
-                ],
-            },
-            indent=2,
-        )
-    except Exception as e:
-        return json.dumps({"success": False, "error": str(e)})
-
-
 # =============================================================================
 # CREDENTIAL STORE TOOLS
 # =============================================================================
diff --git a/core/framework/runner/runner.py b/core/framework/runner/runner.py
index 7d3cf7f9..d1fe423f 100644
--- a/core/framework/runner/runner.py
+++ b/core/framework/runner/runner.py
@@ -1144,7 +1144,7 @@ class AgentRunner:
         except ImportError:
             # aden_tools not installed - fall back to direct check
             has_llm_nodes = any(
-                node.node_type in ("llm_generate", "llm_tool_use") for node in self.graph.nodes
+                node.node_type == "event_loop" for node in self.graph.nodes
             )
             if has_llm_nodes:
                 api_key_env = self._get_api_key_env_var(self.model)
diff --git a/core/framework/runtime/runtime_log_schemas.py b/core/framework/runtime/runtime_log_schemas.py
index 430079bf..b33ea6a6 100644
--- a/core/framework/runtime/runtime_log_schemas.py
+++ b/core/framework/runtime/runtime_log_schemas.py
@@ -30,14 +30,14 @@ class NodeStepLog(BaseModel):
     """Full tool and LLM details for one step within a node.

     For EventLoopNode, each iteration is a step. For single-step nodes
-    (LLMNode, FunctionNode, RouterNode), step_index is 0.
+    (e.g. RouterNode), step_index is 0.

     OTel-aligned fields (trace_id, span_id, execution_id) enable correlation
     and future OpenTelemetry export without schema changes.
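
     Example (sketch): the third iteration of an event_loop node named
     "researcher" would be logged as step_index=2 with node_type="event_loop".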
""" node_id: str - node_type: str = "" # "event_loop"|"llm_tool_use"|"llm_generate"|"function"|"router" + node_type: str = "" # "event_loop" (the only valid type) step_index: int = 0 # iteration number for event_loop, 0 for single-step nodes llm_text: str = "" tool_calls: list[ToolCallLog] = Field(default_factory=list) diff --git a/core/framework/runtime/tests/test_agent_runtime.py b/core/framework/runtime/tests/test_agent_runtime.py index 0a5ce9fc..bc167a9a 100644 --- a/core/framework/runtime/tests/test_agent_runtime.py +++ b/core/framework/runtime/tests/test_agent_runtime.py @@ -64,7 +64,7 @@ def sample_graph(): id="process-webhook", name="Process Webhook", description="Process incoming webhook", - node_type="llm_generate", + node_type="event_loop", input_keys=["webhook_data"], output_keys=["result"], ), @@ -72,7 +72,7 @@ def sample_graph(): id="process-api", name="Process API Request", description="Process API request", - node_type="llm_generate", + node_type="event_loop", input_keys=["request_data"], output_keys=["result"], ), @@ -538,7 +538,7 @@ class TestGraphSpecValidation: id="valid-node", name="Valid Node", description="A valid node", - node_type="llm_generate", + node_type="event_loop", input_keys=[], output_keys=[], ), diff --git a/core/framework/runtime/tests/test_webhook_server.py b/core/framework/runtime/tests/test_webhook_server.py index 9f09e3be..b08be7ef 100644 --- a/core/framework/runtime/tests/test_webhook_server.py +++ b/core/framework/runtime/tests/test_webhook_server.py @@ -472,7 +472,7 @@ class TestEventDrivenEntryPoints: id="process-event", name="Process Event", description="Process incoming event", - node_type="llm_generate", + node_type="event_loop", input_keys=["event"], output_keys=["result"], ), diff --git a/core/tests/test_client_facing_validation.py b/core/tests/test_client_facing_validation.py index bf78ef42..9e599fef 100644 --- a/core/tests/test_client_facing_validation.py +++ b/core/tests/test_client_facing_validation.py @@ -157,40 +157,6 @@ class TestEventLoopOutputKeyOverlap: key_errors = [e for e in errors if "output_key" in e] assert len(key_errors) == 0 - def test_overlapping_keys_non_event_loop_no_error(self): - """Non-event_loop nodes with overlapping keys -> no error (last-wins OK).""" - graph = GraphSpec( - id="g1", - goal_id="goal1", - entry_node="src", - nodes=[ - NodeSpec(id="src", name="src", description="Source node"), - NodeSpec( - id="a", - name="a", - description="Node a", - node_type="llm_generate", - output_keys=["shared"], - ), - NodeSpec( - id="b", - name="b", - description="Node b", - node_type="llm_generate", - output_keys=["shared"], - ), - ], - edges=[ - EdgeSpec(id="src->a", source="src", target="a", condition=EdgeCondition.ON_SUCCESS), - EdgeSpec(id="src->b", source="src", target="b", condition=EdgeCondition.ON_SUCCESS), - ], - ) - - errors = graph.validate() - key_errors = [e for e in errors if "output_key" in e] - assert len(key_errors) == 0 - - # --------------------------------------------------------------------------- # Baseline: no fan-out -> no errors from these rules # --------------------------------------------------------------------------- diff --git a/core/tests/test_conditional_edge_direct_key.py b/core/tests/test_conditional_edge_direct_key.py index a8d15c3d..56d0a18f 100644 --- a/core/tests/test_conditional_edge_direct_key.py +++ b/core/tests/test_conditional_edge_direct_key.py @@ -85,14 +85,14 @@ async def test_direct_key_access_in_conditional_edge(): id="score_node", name="ScoreNode", description="Outputs a 
score", - node_type="function", + node_type="event_loop", output_keys=["score"], ), NodeSpec( id="high_score_node", name="HighScoreNode", description="Handles high scores", - node_type="function", + node_type="event_loop", input_keys=["score"], output_keys=["result"], ), @@ -153,14 +153,14 @@ async def test_backward_compatibility_output_syntax(): id="score_node", name="ScoreNode", description="Outputs a score", - node_type="function", + node_type="event_loop", output_keys=["score"], ), NodeSpec( id="consumer_node", name="ConsumerNode", description="Consumer", - node_type="function", + node_type="event_loop", input_keys=["score"], output_keys=["processed"], ), @@ -221,14 +221,14 @@ async def test_multiple_keys_in_expression(): id="multi_key_node", name="MultiKeyNode", description="Outputs multiple keys", - node_type="function", + node_type="event_loop", output_keys=["x", "y"], ), NodeSpec( id="consumer_node", name="ConsumerNode", description="Consumer", - node_type="function", + node_type="event_loop", input_keys=["x", "y"], output_keys=["processed"], ), @@ -295,14 +295,14 @@ async def test_negative_case_condition_false(): id="low_score_node", name="LowScoreNode", description="Outputs low score", - node_type="function", + node_type="event_loop", output_keys=["score"], ), NodeSpec( id="high_score_handler", name="HighScoreHandler", description="Should NOT execute", - node_type="function", + node_type="event_loop", input_keys=["score"], output_keys=["result"], ), diff --git a/core/tests/test_event_loop_integration.py b/core/tests/test_event_loop_integration.py index 77aee972..a0c02ef8 100644 --- a/core/tests/test_event_loop_integration.py +++ b/core/tests/test_event_loop_integration.py @@ -826,7 +826,7 @@ async def test_event_loop_no_executor_retry(runtime): result = await executor.execute(graph, goal, {}) assert not result.success - assert failing_node.attempt_count == 1 # Executor forced max_retries to 0 + assert failing_node.attempt_count == 3 # Custom nodes keep their max_retries # =========================================================================== @@ -1007,11 +1007,20 @@ async def test_internal_node_no_client_output(): @pytest.mark.asyncio async def test_mixed_node_graph(runtime): - """function -> event_loop -> function end-to-end.""" + """Simple node -> event_loop -> simple node end-to-end.""" - # Function 1: write leads to memory - def load_leads(**kwargs): - return ["lead_A", "lead_B", "lead_C"] + class LoadLeadsNode(NodeProtocol): + async def execute(self, ctx: NodeContext) -> NodeResult: + leads = ["lead_A", "lead_B", "lead_C"] + ctx.memory.write("leads", leads) + return NodeResult(success=True, output={"leads": leads}) + + class FormatOutputNode(NodeProtocol): + async def execute(self, ctx: NodeContext) -> NodeResult: + summary = ctx.input_data.get("summary", ctx.memory.read("summary") or "no summary") + report = f"Report: {summary}" + ctx.memory.write("report", report) + return NodeResult(success=True, output={"report": report}) # Event loop: process leads, produce summary el_scripts = [ @@ -1028,18 +1037,12 @@ async def test_mixed_node_graph(runtime): ] el_llm = ScriptableMockLLMProvider(el_scripts) - # Function 2: format final output - def format_output(**kwargs): - summary = kwargs.get("summary", "no summary") - return f"Report: {summary}" - # Node specs load_spec = NodeSpec( id="load", name="Load Leads", description="Load lead data", - node_type="function", - function="load_leads", + node_type="event_loop", output_keys=["leads"], ) process_spec = NodeSpec( @@ -1047,17 
+1050,13 @@ async def test_mixed_node_graph(runtime): name="Process Leads", description="Process leads with LLM", node_type="event_loop", - # input_keys left empty: EventLoopNode._check_pause() reads "pause_requested" - # from memory, and a restrictive scope would block it. Data flows via input_data. output_keys=["summary"], ) format_spec = NodeSpec( id="format", name="Format Output", description="Format final report", - node_type="function", - function="format_output", - # input_keys left empty for same scoping reason with FunctionNode + node_type="event_loop", output_keys=["report"], ) @@ -1078,9 +1077,9 @@ async def test_mixed_node_graph(runtime): goal = Goal(id="test_goal", name="Pipeline Test", description="test full pipeline") executor = GraphExecutor(runtime=runtime, llm=el_llm) - executor.register_function("load", load_leads) + executor.register_node("load", LoadLeadsNode()) executor.register_node("process", EventLoopNode(config=LoopConfig(max_iterations=5))) - executor.register_function("format", format_output) + executor.register_node("format", FormatOutputNode()) result = await executor.execute(graph, goal, {}) diff --git a/core/tests/test_event_loop_wiring.py b/core/tests/test_event_loop_wiring.py index 2acb8f78..4432bd5b 100644 --- a/core/tests/test_event_loop_wiring.py +++ b/core/tests/test_event_loop_wiring.py @@ -65,7 +65,7 @@ def test_client_facing_defaults_false(): id="n1", name="Node 1", description="test", - node_type="llm_generate", + node_type="event_loop", ) assert spec.client_facing is False @@ -143,7 +143,7 @@ def test_registered_event_loop_returns_impl(runtime): @pytest.mark.asyncio async def test_event_loop_max_retries_forced_zero(runtime): - """An event_loop node with max_retries=3 should only execute once (no retry).""" + """Custom NodeProtocol impls with node_type=event_loop keep their max_retries.""" node_spec = NodeSpec( id="el_fail", name="Failing Event Loop", @@ -171,9 +171,9 @@ async def test_event_loop_max_retries_forced_zero(runtime): result = await executor.execute(graph, goal, {}) - # Event loop nodes get max_retries overridden to 0, meaning execute once then fail + # Custom nodes (not EventLoopNode instances) keep their max_retries assert not result.success - assert failing_node.attempt_count == 1 + assert failing_node.attempt_count == 3 @pytest.mark.asyncio @@ -246,21 +246,21 @@ async def test_event_loop_max_retries_positive_logs_warning(runtime, caplog): with caplog.at_level(logging.WARNING): await executor.execute(graph, goal, {}) - assert "Overriding to 0" in caplog.text - assert "el_warn" in caplog.text + # Custom nodes (not EventLoopNode instances) don't get override warning + assert "Overriding to 0" not in caplog.text # --- Existing node types unaffected --- def test_existing_node_types_unchanged(): - """All pre-existing node types must still be in VALID_NODE_TYPES with defaults preserved.""" - expected = {"llm_tool_use", "llm_generate", "router", "function", "human_input"} - assert expected.issubset(GraphExecutor.VALID_NODE_TYPES) + """Only event_loop is a valid node type.""" + expected = {"event_loop"} + assert expected == GraphExecutor.VALID_NODE_TYPES - # Default node_type is still llm_tool_use + # Default node_type is event_loop spec = NodeSpec(id="x", name="X", description="x") - assert spec.node_type == "llm_tool_use" + assert spec.node_type == "event_loop" # Default max_retries is still 3 assert spec.max_retries == 3 diff --git a/core/tests/test_execution_quality.py b/core/tests/test_execution_quality.py index 6657757c..27bd5add 
100644 --- a/core/tests/test_execution_quality.py +++ b/core/tests/test_execution_quality.py @@ -106,7 +106,7 @@ class TestExecutionQuality: id="node1", name="Always Succeeds", description="Never fails", - node_type="function", + node_type="event_loop", output_keys=["result"], ), ], @@ -151,6 +151,7 @@ class TestExecutionQuality: ) # Create graph with flaky node (fails 2 times before succeeding) + # (actual impl from registry is FlakyNode) graph = GraphSpec( id="test-graph", goal_id=goal.id, @@ -159,7 +160,7 @@ class TestExecutionQuality: id="flaky", name="Flaky Node", description="Fails then succeeds", - node_type="function", + node_type="event_loop", output_keys=["result"], max_retries=3, # Allow retries ), @@ -206,6 +207,7 @@ class TestExecutionQuality: ) # Create graph with always-failing node + # (actual impl from registry is AlwaysFailsNode) graph = GraphSpec( id="test-graph", goal_id=goal.id, @@ -214,7 +216,7 @@ class TestExecutionQuality: id="fails", name="Always Fails", description="Never succeeds", - node_type="function", + node_type="event_loop", output_keys=["result"], max_retries=2, # Will retry twice then fail ), @@ -261,6 +263,7 @@ class TestExecutionQuality: ) # Create graph with multiple flaky nodes + # (actual impls from registry are FlakyNode instances) graph = GraphSpec( id="test-graph", goal_id=goal.id, @@ -269,7 +272,7 @@ class TestExecutionQuality: id="flaky1", name="Flaky Node 1", description="Fails once", - node_type="function", + node_type="event_loop", output_keys=["result1"], max_retries=3, ), @@ -277,7 +280,7 @@ class TestExecutionQuality: id="flaky2", name="Flaky Node 2", description="Fails twice", - node_type="function", + node_type="event_loop", input_keys=["result1"], output_keys=["result2"], max_retries=3, @@ -286,7 +289,7 @@ class TestExecutionQuality: id="success", name="Success Node", description="Always succeeds", - node_type="function", + node_type="event_loop", input_keys=["result2"], output_keys=["final"], ), diff --git a/core/tests/test_execution_stream.py b/core/tests/test_execution_stream.py index 8db30d18..32072e7a 100644 --- a/core/tests/test_execution_stream.py +++ b/core/tests/test_execution_stream.py @@ -62,7 +62,7 @@ async def test_execution_stream_retention(tmp_path): id="hello", name="Hello", description="Return a result", - node_type="llm_generate", + node_type="event_loop", input_keys=["user_name"], output_keys=["result"], system_prompt='Return JSON: {"result": "ok"}', @@ -149,7 +149,7 @@ async def test_shared_session_reuses_directory_and_memory(tmp_path): id="hello", name="Hello", description="Return a result", - node_type="llm_generate", + node_type="event_loop", input_keys=["user_name"], output_keys=["result"], system_prompt='Return JSON: {"result": "ok"}', diff --git a/core/tests/test_executor_feedback_edges.py b/core/tests/test_executor_feedback_edges.py index e19daf94..2139bc91 100644 --- a/core/tests/test_executor_feedback_edges.py +++ b/core/tests/test_executor_feedback_edges.py @@ -81,7 +81,7 @@ def goal(): def test_max_node_visits_default(): """NodeSpec.max_node_visits should default to 1.""" - spec = NodeSpec(id="n", name="N", description="test", node_type="function", output_keys=["out"]) + spec = NodeSpec(id="n", name="N", description="test", node_type="event_loop", output_keys=["out"]) assert spec.max_node_visits == 1 @@ -101,7 +101,7 @@ async def test_visit_limit_skips_node(runtime, goal): id="a", name="A", description="entry with visit limit", - node_type="function", + node_type="event_loop", output_keys=["a_out"], 
         max_node_visits=1,
     )
@@ -109,7 +109,7 @@ async def test_visit_limit_skips_node(runtime, goal):
         id="b",
         name="B",
         description="middle node",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["b_out"],
         max_node_visits=0,  # unlimited — let max_steps guard
     )
@@ -159,7 +159,7 @@ async def test_visit_limit_allows_multiple(runtime, goal):
         id="a",
         name="A",
         description="entry allows two visits",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["a_out"],
         max_node_visits=2,
     )
@@ -167,7 +167,7 @@ async def test_visit_limit_allows_multiple(runtime, goal):
         id="b",
         name="B",
         description="middle node",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["b_out"],
         max_node_visits=0,  # unlimited
     )
@@ -215,7 +215,7 @@ async def test_visit_limit_zero_unlimited(runtime, goal):
         id="a",
         name="A",
         description="unlimited visits",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["a_out"],
         max_node_visits=0,
     )
@@ -223,7 +223,7 @@ async def test_visit_limit_zero_unlimited(runtime, goal):
         id="b",
         name="B",
         description="middle node",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["b_out"],
         max_node_visits=0,
     )
@@ -274,7 +274,7 @@ async def test_conditional_feedback_edge(runtime, goal):
         id="director",
         name="Director",
         description="plans work",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["plan"],
         max_node_visits=2,
     )
@@ -282,7 +282,7 @@ async def test_conditional_feedback_edge(runtime, goal):
         id="writer",
         name="Writer",
         description="writes draft",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["draft", "needs_revision"],
         max_node_visits=2,
     )
@@ -290,7 +290,7 @@ async def test_conditional_feedback_edge(runtime, goal):
         id="output",
         name="Output",
         description="final output",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["final"],
     )
@@ -370,7 +370,7 @@ async def test_conditional_feedback_false(runtime, goal):
         id="director",
         name="Director",
         description="plans work",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["plan"],
         max_node_visits=2,
     )
@@ -378,14 +378,14 @@ async def test_conditional_feedback_false(runtime, goal):
         id="writer",
         name="Writer",
         description="writes draft",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["draft", "needs_revision"],
     )
     output_node = NodeSpec(
         id="output",
         name="Output",
         description="final output",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["final"],
     )
@@ -458,14 +458,14 @@ async def test_visit_counts_in_result(runtime, goal):
         id="a",
         name="A",
         description="entry",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["a_out"],
     )
     node_b = NodeSpec(
         id="b",
         name="B",
         description="terminal",
-        node_type="function",
+        node_type="event_loop",
         input_keys=["a_out"],
         output_keys=["b_out"],
     )
@@ -509,21 +509,21 @@ async def test_conditional_priority_prevents_fanout(runtime, goal):
         id="writer",
         name="Writer",
         description="produces output",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["draft", "needs_revision"],
     )
     output_node = NodeSpec(
         id="output",
         name="Output",
         description="forward target",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["final"],
     )
     director = NodeSpec(
         id="director",
         name="Director",
         description="feedback target",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["plan"],
         max_node_visits=2,
     )
diff --git a/core/tests/test_executor_max_retries.py b/core/tests/test_executor_max_retries.py
index 62b6df84..b6e41891 100644
--- a/core/tests/test_executor_max_retries.py
+++ b/core/tests/test_executor_max_retries.py
@@ -79,7 +79,7 @@ async def test_executor_respects_custom_max_retries_high(runtime):
         name="Flaky Node",
         description="A node that fails multiple times before succeeding",
         max_retries=10,  # Should allow 10 retries
-        node_type="function",
+        node_type="event_loop",
         output_keys=["result"],
     )
@@ -123,7 +123,7 @@ async def test_executor_respects_custom_max_retries_low(runtime):
         name="Fragile Node",
         description="A node with low retry tolerance",
         max_retries=2,  # max_retries=N means N total attempts allowed
-        node_type="function",
+        node_type="event_loop",
         output_keys=["result"],
     )
@@ -166,7 +166,7 @@ async def test_executor_respects_default_max_retries(runtime):
         name="Default Node",
         description="A node using default retry settings",
         # max_retries not specified, should default to 3
-        node_type="function",
+        node_type="event_loop",
         output_keys=["result"],
     )
@@ -211,7 +211,7 @@ async def test_executor_max_retries_two_succeeds_on_second(runtime):
         name="Two Retry Node",
         description="A node with two attempts allowed",
         max_retries=2,  # max_retries=N means N total attempts allowed
-        node_type="function",
+        node_type="event_loop",
         output_keys=["result"],
     )
@@ -253,7 +253,7 @@ async def test_executor_different_nodes_different_max_retries(runtime):
         name="Node 1",
         description="First node in multi-node test",
         max_retries=2,
-        node_type="function",
+        node_type="event_loop",
         output_keys=["result1"],
     )
@@ -262,7 +262,7 @@ async def test_executor_different_nodes_different_max_retries(runtime):
         name="Node 2",
         description="Second node in multi-node test",
         max_retries=5,
-        node_type="function",
+        node_type="event_loop",
         input_keys=["result1"],
         output_keys=["result2"],
     )
diff --git a/core/tests/test_fanout.py b/core/tests/test_fanout.py
index 92c53588..5060b653 100644
--- a/core/tests/test_fanout.py
+++ b/core/tests/test_fanout.py
@@ -116,7 +116,7 @@ def _make_fanout_graph(
     source = NodeSpec(
         id="source",
         name="Source",
         description="entry",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["data"],
     )
@@ -164,10 +164,10 @@ async def test_fanout_triggers_on_multiple_success_edges(runtime, goal):
     """Fan-out should activate when a node has >1 ON_SUCCESS outgoing edges."""
     b1 = NodeSpec(
-        id="b1", name="B1", description="branch 1", node_type="function", output_keys=["b1_out"]
+        id="b1", name="B1", description="branch 1", node_type="event_loop", output_keys=["b1_out"]
     )
     b2 = NodeSpec(
-        id="b2", name="B2", description="branch 2", node_type="function", output_keys=["b2_out"]
+        id="b2", name="B2", description="branch 2", node_type="event_loop", output_keys=["b2_out"]
     )

     graph = _make_fanout_graph([b1, b2])
@@ -195,10 +195,10 @@ async def test_branches_execute_concurrently(runtime, goal):
     """All fan-out branches should be launched via asyncio.gather (concurrent)."""
     order = []
     b1 = NodeSpec(
-        id="b1", name="B1", description="branch 1", node_type="function", output_keys=["b1_done"]
+        id="b1", name="B1", description="branch 1", node_type="event_loop", output_keys=["b1_done"]
     )
     b2 = NodeSpec(
-        id="b2", name="B2", description="branch 2", node_type="function", output_keys=["b2_done"]
+        id="b2", name="B2", description="branch 2", node_type="event_loop", output_keys=["b2_done"]
     )

     graph = _make_fanout_graph([b1, b2])
@@ -223,13 +223,13 @@ async def test_branches_execute_concurrently(runtime, goal):
 async def test_convergence_at_fan_in_node(runtime, goal):
     """After fan-out branches complete, execution should continue at convergence node."""
     b1 = NodeSpec(
-        id="b1", name="B1", description="branch 1", node_type="function", output_keys=["b1_out"]
+        id="b1", name="B1", description="branch 1", node_type="event_loop", output_keys=["b1_out"]
     )
     b2 = NodeSpec(
-        id="b2", name="B2", description="branch 2", node_type="function", output_keys=["b2_out"]
+        id="b2", name="B2", description="branch 2", node_type="event_loop", output_keys=["b2_out"]
     )
     merge = NodeSpec(
-        id="merge", name="Merge", description="fan-in", node_type="function", output_keys=["merged"]
+        id="merge", name="Merge", description="fan-in", node_type="event_loop", output_keys=["merged"]
     )

     graph = _make_fanout_graph([b1, b2], fan_in_node=merge)
@@ -255,13 +255,13 @@ async def test_fail_all_strategy_raises_on_branch_failure(runtime, goal):
     """fail_all should raise RuntimeError if any branch fails."""
     b1 = NodeSpec(
-        id="b1", name="B1", description="ok branch", node_type="function", output_keys=["b1_out"]
+        id="b1", name="B1", description="ok branch", node_type="event_loop", output_keys=["b1_out"]
     )
     b2 = NodeSpec(
         id="b2",
         name="B2",
         description="bad branch",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["b2_out"],
         max_retries=1,
     )
@@ -290,13 +290,13 @@ async def test_continue_others_strategy_allows_partial_success(runtime, goal):
     """continue_others should let successful branches complete even if one fails."""
     b1 = NodeSpec(
-        id="b1", name="B1", description="ok", node_type="function", output_keys=["b1_out"]
+        id="b1", name="B1", description="ok", node_type="event_loop", output_keys=["b1_out"]
     )
     b2 = NodeSpec(
         id="b2",
         name="B2",
         description="fail",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["b2_out"],
         max_retries=1,
     )
@@ -325,13 +325,13 @@ async def test_wait_all_strategy_collects_all_results(runtime, goal):
     """wait_all should wait for all branches before proceeding."""
     b1 = NodeSpec(
-        id="b1", name="B1", description="ok", node_type="function", output_keys=["b1_out"]
+        id="b1", name="B1", description="ok", node_type="event_loop", output_keys=["b1_out"]
     )
     b2 = NodeSpec(
         id="b2",
         name="B2",
         description="fail",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["b2_out"],
         max_retries=1,
     )
@@ -365,12 +365,12 @@ async def test_per_branch_retry(runtime, goal):
     b1 = NodeSpec(
         id="b1",
         name="B1",
         description="flaky",
-        node_type="function",
+        node_type="event_loop",
         output_keys=["b1_out"],
         max_retries=5,
     )
     b2 = NodeSpec(
-        id="b2", name="B2", description="solid", node_type="function", output_keys=["b2_out"]
+        id="b2", name="B2", description="solid", node_type="event_loop", output_keys=["b2_out"]
     )

     graph = _make_fanout_graph([b1, b2])
@@ -394,13 +394,13 @@ async def test_single_edge_no_parallel_overhead(runtime, goal):
     """A single outgoing edge should follow normal sequential path, not fan-out."""
     n1 = NodeSpec(
-        id="n1", name="N1", description="entry", node_type="function", output_keys=["out1"]
+        id="n1", name="N1", description="entry", node_type="event_loop", output_keys=["out1"]
     )
     n2 = NodeSpec(
         id="n2",
         name="N2",
         description="next",
-        node_type="function",
+        node_type="event_loop",
         input_keys=["out1"],
         output_keys=["out2"],
     )
@@ -432,8 +432,8 @@

 def test_detect_fan_out_nodes():
"""GraphSpec.detect_fan_out_nodes should identify fan-out topology.""" - b1 = NodeSpec(id="b1", name="B1", description="b", node_type="function", output_keys=["x"]) - b2 = NodeSpec(id="b2", name="B2", description="b", node_type="function", output_keys=["y"]) + b1 = NodeSpec(id="b1", name="B1", description="b", node_type="event_loop", output_keys=["x"]) + b2 = NodeSpec(id="b2", name="B2", description="b", node_type="event_loop", output_keys=["y"]) graph = _make_fanout_graph([b1, b2]) fan_outs = graph.detect_fan_out_nodes() @@ -447,10 +447,10 @@ def test_detect_fan_out_nodes(): def test_detect_fan_in_nodes(): """GraphSpec.detect_fan_in_nodes should identify convergence topology.""" - b1 = NodeSpec(id="b1", name="B1", description="b", node_type="function", output_keys=["x"]) - b2 = NodeSpec(id="b2", name="B2", description="b", node_type="function", output_keys=["y"]) + b1 = NodeSpec(id="b1", name="B1", description="b", node_type="event_loop", output_keys=["x"]) + b2 = NodeSpec(id="b2", name="B2", description="b", node_type="event_loop", output_keys=["y"]) merge = NodeSpec( - id="merge", name="Merge", description="m", node_type="function", output_keys=["z"] + id="merge", name="Merge", description="m", node_type="event_loop", output_keys=["z"] ) graph = _make_fanout_graph([b1, b2], fan_in_node=merge) @@ -467,10 +467,10 @@ def test_detect_fan_in_nodes(): async def test_parallel_disabled_uses_sequential(runtime, goal): """When enable_parallel_execution=False, multi-edge should follow first match only.""" b1 = NodeSpec( - id="b1", name="B1", description="b1", node_type="function", output_keys=["b1_out"] + id="b1", name="B1", description="b1", node_type="event_loop", output_keys=["b1_out"] ) b2 = NodeSpec( - id="b2", name="B2", description="b2", node_type="function", output_keys=["b2_out"] + id="b2", name="B2", description="b2", node_type="event_loop", output_keys=["b2_out"] ) graph = _make_fanout_graph([b1, b2]) diff --git a/core/tests/test_flexible_executor.py b/core/tests/test_flexible_executor.py deleted file mode 100644 index ddd904a7..00000000 --- a/core/tests/test_flexible_executor.py +++ /dev/null @@ -1,442 +0,0 @@ -""" -Tests for the Worker-Judge flexible execution pattern. 
- -Tests cover: -- Plan and PlanStep data structures -- Code sandbox security -- HybridJudge rule evaluation -- WorkerNode action dispatch -- FlexibleGraphExecutor end-to-end -""" - -import asyncio - -import pytest - -from framework.graph.code_sandbox import ( - CodeSandbox, - safe_eval, - safe_exec, -) -from framework.graph.goal import Goal, SuccessCriterion -from framework.graph.judge import HybridJudge, create_default_judge -from framework.graph.plan import ( - ActionSpec, - ActionType, - EvaluationRule, - ExecutionStatus, - Judgment, - JudgmentAction, - Plan, - PlanExecutionResult, - PlanStep, - StepStatus, -) - - -class TestPlanDataStructures: - """Tests for Plan and PlanStep.""" - - def test_plan_step_creation(self): - """Test creating a PlanStep.""" - action = ActionSpec( - action_type=ActionType.LLM_CALL, - prompt="Hello, world!", - ) - step = PlanStep( - id="step_1", - description="Say hello", - action=action, - expected_outputs=["greeting"], - ) - - assert step.id == "step_1" - assert step.status == StepStatus.PENDING - assert step.action.action_type == ActionType.LLM_CALL - - def test_plan_step_is_ready(self): - """Test PlanStep.is_ready() with dependencies.""" - step1 = PlanStep( - id="step_1", - description="First step", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=[], - ) - step2 = PlanStep( - id="step_2", - description="Second step", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=["step_1"], - ) - - # Step 1 is ready (no deps) - assert step1.is_ready(set()) is True - - # Step 2 is not ready (dep not met) - assert step2.is_ready(set()) is False - - # Step 2 is ready after step 1 completes - assert step2.is_ready({"step_1"}) is True - - def test_plan_get_ready_steps(self): - """Test Plan.get_ready_steps().""" - plan = Plan( - id="test_plan", - goal_id="goal_1", - description="Test plan", - steps=[ - PlanStep( - id="step_1", - description="First", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=[], - ), - PlanStep( - id="step_2", - description="Second", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=["step_1"], - ), - ], - ) - - ready = plan.get_ready_steps() - assert len(ready) == 1 - assert ready[0].id == "step_1" - - def test_plan_is_complete(self): - """Test Plan.is_complete().""" - plan = Plan( - id="test_plan", - goal_id="goal_1", - description="Test plan", - steps=[ - PlanStep( - id="step_1", - description="First", - action=ActionSpec(action_type=ActionType.FUNCTION), - status=StepStatus.COMPLETED, - ), - ], - ) - - assert plan.is_complete() is True - - def test_plan_to_feedback_context(self): - """Test Plan.to_feedback_context().""" - plan = Plan( - id="test_plan", - goal_id="goal_1", - description="Test plan", - steps=[ - PlanStep( - id="step_1", - description="Completed step", - action=ActionSpec(action_type=ActionType.FUNCTION), - status=StepStatus.COMPLETED, - result={"data": "value"}, - ), - PlanStep( - id="step_2", - description="Failed step", - action=ActionSpec(action_type=ActionType.FUNCTION), - status=StepStatus.FAILED, - error="Something went wrong", - attempts=3, - ), - ], - ) - - context = plan.to_feedback_context() - assert context["plan_id"] == "test_plan" - assert len(context["completed_steps"]) == 1 - assert len(context["failed_steps"]) == 1 - assert context["failed_steps"][0]["error"] == "Something went wrong" - - -class TestCodeSandbox: - """Tests for code sandbox security.""" - - def test_simple_execution(self): - """Test simple code execution.""" - result = 
safe_exec("x = 1 + 2\nresult = x * 3") - assert result.success is True - assert result.variables.get("x") == 3 - assert result.result == 9 - - def test_input_injection(self): - """Test passing inputs to sandbox.""" - result = safe_exec( - "result = x + y", - inputs={"x": 10, "y": 20}, - ) - assert result.success is True - assert result.result == 30 - - def test_blocked_import(self): - """Test that dangerous imports are blocked.""" - result = safe_exec("import os") - assert result.success is False - assert "blocked" in result.error.lower() or "import" in result.error.lower() - - def test_blocked_private_access(self): - """Test that private attribute access is blocked.""" - result = safe_exec("x = [].__class__.__bases__") - assert result.success is False - - def test_blocked_exec_eval(self): - """Test that exec/eval are blocked.""" - result = safe_exec("exec('print(1)')") - assert result.success is False - - def test_safe_eval_expression(self): - """Test safe_eval for expressions.""" - result = safe_eval("x + y", inputs={"x": 5, "y": 3}) - assert result.success is True - assert result.result == 8 - - def test_allowed_modules(self): - """Test that allowed modules work.""" - sandbox = CodeSandbox() - # math is in ALLOWED_MODULES - result = sandbox.execute( - """ -import math -result = math.sqrt(16) -""", - inputs={}, - ) - # Note: imports are blocked by default in validation - # This test documents current behavior - assert result.success is False # imports blocked by validator - - -class TestHybridJudge: - """Tests for the HybridJudge.""" - - def test_rule_based_accept(self): - """Test rule-based accept judgment.""" - judge = HybridJudge() - judge.add_rule( - EvaluationRule( - id="success_check", - description="Accept on success flag", - condition="result.get('success') == True", - action=JudgmentAction.ACCEPT, - ) - ) - - step = PlanStep( - id="test_step", - description="Test", - action=ActionSpec(action_type=ActionType.FUNCTION), - ) - goal = Goal( - id="goal_1", - name="Test Goal", - description="A test goal", - success_criteria=[ - SuccessCriterion( - id="sc1", description="Complete task", metric="completion", target="100%" - ), - ], - ) - - # Use sync version for testing - judgment = asyncio.run(judge.evaluate(step, {"success": True}, goal)) - - assert judgment.action == JudgmentAction.ACCEPT - assert judgment.rule_matched == "success_check" - - def test_rule_based_retry(self): - """Test rule-based retry judgment.""" - judge = HybridJudge() - judge.add_rule( - EvaluationRule( - id="timeout_retry", - description="Retry on timeout", - condition="result.get('error_type') == 'timeout'", - action=JudgmentAction.RETRY, - feedback_template="Timeout occurred, please retry", - ) - ) - - step = PlanStep( - id="test_step", - description="Test", - action=ActionSpec(action_type=ActionType.FUNCTION), - ) - goal = Goal( - id="goal_1", - name="Test Goal", - description="A test goal", - success_criteria=[ - SuccessCriterion( - id="sc1", description="Complete task", metric="completion", target="100%" - ), - ], - ) - - judgment = asyncio.run(judge.evaluate(step, {"error_type": "timeout"}, goal)) - - assert judgment.action == JudgmentAction.RETRY - - def test_rule_priority(self): - """Test that higher priority rules are checked first.""" - judge = HybridJudge() - - # Lower priority - would match - judge.add_rule( - EvaluationRule( - id="low_priority", - description="Low priority accept", - condition="True", - action=JudgmentAction.ACCEPT, - priority=1, - ) - ) - - # Higher priority - should match first - 
judge.add_rule( - EvaluationRule( - id="high_priority", - description="High priority escalate", - condition="True", - action=JudgmentAction.ESCALATE, - priority=100, - ) - ) - - step = PlanStep( - id="test_step", - description="Test", - action=ActionSpec(action_type=ActionType.FUNCTION), - ) - goal = Goal( - id="goal_1", - name="Test Goal", - description="A test goal", - success_criteria=[ - SuccessCriterion( - id="sc1", description="Complete task", metric="completion", target="100%" - ), - ], - ) - - judgment = asyncio.run(judge.evaluate(step, {}, goal)) - - assert judgment.rule_matched == "high_priority" - assert judgment.action == JudgmentAction.ESCALATE - - def test_default_judge_rules(self): - """Test that create_default_judge includes useful rules.""" - judge = create_default_judge() - - # Should have rules for common cases - rule_ids = {r.id for r in judge.rules} - assert "explicit_success" in rule_ids - assert "transient_error_retry" in rule_ids - assert "security_escalate" in rule_ids - - -class TestJudgment: - """Tests for Judgment data structure.""" - - def test_judgment_creation(self): - """Test creating a Judgment.""" - judgment = Judgment( - action=JudgmentAction.ACCEPT, - reasoning="Step completed successfully", - confidence=0.95, - ) - - assert judgment.action == JudgmentAction.ACCEPT - assert judgment.confidence == 0.95 - assert judgment.llm_used is False - - def test_judgment_with_feedback(self): - """Test Judgment with feedback for retry/replan.""" - judgment = Judgment( - action=JudgmentAction.REPLAN, - reasoning="Missing required data", - feedback="Need to fetch user data first", - context={"missing": ["user_id", "email"]}, - ) - - assert judgment.action == JudgmentAction.REPLAN - assert judgment.feedback is not None - assert "user_id" in judgment.context["missing"] - - -class TestPlanExecutionResult: - """Tests for PlanExecutionResult.""" - - def test_completed_result(self): - """Test completed execution result.""" - result = PlanExecutionResult( - status=ExecutionStatus.COMPLETED, - results={"output": "success"}, - steps_executed=5, - total_tokens=1000, - ) - - assert result.status == ExecutionStatus.COMPLETED - assert result.steps_executed == 5 - - def test_needs_replan_result(self): - """Test needs_replan execution result.""" - result = PlanExecutionResult( - status=ExecutionStatus.NEEDS_REPLAN, - feedback="Step 3 failed: missing data", - feedback_context={ - "completed_steps": ["step_1", "step_2"], - "failed_step": "step_3", - }, - completed_steps=["step_1", "step_2"], - ) - - assert result.status == ExecutionStatus.NEEDS_REPLAN - assert result.feedback is not None - assert len(result.completed_steps) == 2 - - -# Integration tests would require mocking Runtime and LLM -class TestFlexibleExecutorIntegration: - """Integration tests for FlexibleGraphExecutor.""" - - def test_executor_creation(self, tmp_path): - """Test creating a FlexibleGraphExecutor.""" - from framework.graph.flexible_executor import FlexibleGraphExecutor - from framework.runtime.core import Runtime - - runtime = Runtime(storage_path=tmp_path / "runtime") - executor = FlexibleGraphExecutor(runtime=runtime) - - assert executor.runtime == runtime - assert executor.judge is not None - assert executor.worker is not None - - def test_executor_with_custom_judge(self, tmp_path): - """Test executor with custom judge.""" - from framework.graph.flexible_executor import FlexibleGraphExecutor - from framework.runtime.core import Runtime - - runtime = Runtime(storage_path=tmp_path / "runtime") - custom_judge = 
HybridJudge() - custom_judge.add_rule( - EvaluationRule( - id="custom_rule", - description="Custom rule", - condition="True", - action=JudgmentAction.ACCEPT, - ) - ) - - executor = FlexibleGraphExecutor(runtime=runtime, judge=custom_judge) - - assert len(executor.judge.rules) == 1 - assert executor.judge.rules[0].id == "custom_rule" - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/core/tests/test_graph_executor.py b/core/tests/test_graph_executor.py index 4aadacce..79d888bf 100644 --- a/core/tests/test_graph_executor.py +++ b/core/tests/test_graph_executor.py @@ -49,7 +49,7 @@ async def test_executor_single_node_success(): id="n1", name="node1", description="test node", - node_type="llm_generate", + node_type="event_loop", input_keys=[], output_keys=["result"], max_retries=0, @@ -104,7 +104,7 @@ async def test_executor_single_node_failure(): id="n1", name="node1", description="failing node", - node_type="llm_generate", + node_type="event_loop", input_keys=[], output_keys=["result"], max_retries=0, @@ -157,78 +157,6 @@ class FakeEventBus: @pytest.mark.asyncio -async def test_executor_emits_node_events(): - """Executor should emit NODE_LOOP_STARTED/COMPLETED for each non-event_loop node.""" - runtime = DummyRuntime() - event_bus = FakeEventBus() - - graph = GraphSpec( - id="graph-ev", - goal_id="g-ev", - nodes=[ - NodeSpec( - id="n1", - name="first", - description="first node", - node_type="llm_generate", - input_keys=[], - output_keys=["result"], - max_retries=0, - ), - NodeSpec( - id="n2", - name="second", - description="second node", - node_type="llm_generate", - input_keys=["result"], - output_keys=["result"], - max_retries=0, - ), - ], - edges=[ - EdgeSpec( - id="e1", - source="n1", - target="n2", - condition=EdgeCondition.ON_SUCCESS, - ), - ], - entry_node="n1", - terminal_nodes=["n2"], - ) - - executor = GraphExecutor( - runtime=runtime, - node_registry={ - "n1": SuccessNode(), - "n2": SuccessNode(), - }, - event_bus=event_bus, - stream_id="test-stream", - ) - - goal = Goal(id="g-ev", name="event-test", description="test events") - result = await executor.execute(graph=graph, goal=goal) - - assert result.success is True - assert result.path == ["n1", "n2"] - - # Should have 5 events: started/completed for n1, edge_traversed, then started/completed for n2 - assert len(event_bus.events) == 5 - assert event_bus.events[0] == ("started", {"stream_id": "test-stream", "node_id": "n1"}) - assert event_bus.events[1] == ( - "completed", - {"stream_id": "test-stream", "node_id": "n1", "iterations": 1}, - ) - assert event_bus.events[2] == ( - "edge_traversed", - {"stream_id": "test-stream", "source_node": "n1", "target_node": "n2"}, - ) - assert event_bus.events[3] == ("started", {"stream_id": "test-stream", "node_id": "n2"}) - assert event_bus.events[4] == ( - "completed", - {"stream_id": "test-stream", "node_id": "n2", "iterations": 1}, - ) # ---- Fake event_loop node (registered, so executor won't emit for it) ---- @@ -292,7 +220,7 @@ async def test_executor_no_events_without_event_bus(): id="n1", name="node1", description="test node", - node_type="llm_generate", + node_type="event_loop", input_keys=[], output_keys=["result"], max_retries=0, diff --git a/core/tests/test_node_json_extraction.py b/core/tests/test_node_json_extraction.py deleted file mode 100644 index 36c43fd8..00000000 --- a/core/tests/test_node_json_extraction.py +++ /dev/null @@ -1,117 +0,0 @@ -"""Tests for LLMNode JSON extraction logic. 
- -Run with: - cd core - pytest tests/test_node_json_extraction.py -v -""" - -import pytest - -from framework.graph.node import LLMNode - - -class TestJsonExtraction: - """Test _extract_json JSON extraction without LLM calls.""" - - @pytest.fixture - def node(self): - """Create an LLMNode instance for testing.""" - return LLMNode() - - def test_clean_json(self, node): - """Test parsing clean JSON directly.""" - result = node._extract_json('{"key": "value"}', ["key"]) - assert result == {"key": "value"} - - def test_json_with_whitespace(self, node): - """Test parsing JSON with surrounding whitespace.""" - result = node._extract_json(' {"key": "value"} \n', ["key"]) - assert result == {"key": "value"} - - def test_markdown_code_block_at_start(self, node): - """Test extracting JSON from markdown code block at start.""" - input_text = '```json\n{"key": "value"}\n```' - result = node._extract_json(input_text, ["key"]) - assert result == {"key": "value"} - - def test_markdown_code_block_without_json_label(self, node): - """Test extracting JSON from markdown code block without 'json' label.""" - input_text = '```\n{"key": "value"}\n```' - result = node._extract_json(input_text, ["key"]) - assert result == {"key": "value"} - - def test_prose_around_markdown_block(self, node): - """Test extracting JSON when prose surrounds the markdown block.""" - input_text = 'Here is the result:\n```json\n{"key": "value"}\n```\nHope this helps!' - result = node._extract_json(input_text, ["key"]) - assert result == {"key": "value"} - - def test_json_embedded_in_prose(self, node): - """Test extracting JSON embedded in prose text.""" - input_text = 'The answer is {"key": "value"} as requested.' - result = node._extract_json(input_text, ["key"]) - assert result == {"key": "value"} - - def test_nested_json(self, node): - """Test parsing nested JSON objects.""" - input_text = '{"outer": {"inner": "value"}}' - result = node._extract_json(input_text, ["outer"]) - assert result == {"outer": {"inner": "value"}} - - def test_deeply_nested_json(self, node): - """Test parsing deeply nested JSON objects.""" - input_text = '{"a": {"b": {"c": {"d": "deep"}}}}' - result = node._extract_json(input_text, ["a"]) - assert result == {"a": {"b": {"c": {"d": "deep"}}}} - - def test_json_with_array(self, node): - """Test parsing JSON with array values.""" - input_text = '{"items": [1, 2, 3]}' - result = node._extract_json(input_text, ["items"]) - assert result == {"items": [1, 2, 3]} - - def test_json_with_string_containing_braces(self, node): - """Test parsing JSON where string values contain braces.""" - input_text = '{"code": "function() { return 1; }"}' - result = node._extract_json(input_text, ["code"]) - assert result == {"code": "function() { return 1; }"} - - def test_json_with_escaped_quotes(self, node): - """Test parsing JSON with escaped quotes in strings.""" - input_text = '{"message": "He said \\"hello\\""}' - result = node._extract_json(input_text, ["message"]) - assert result == {"message": 'He said "hello"'} - - def test_multiple_json_objects_takes_first(self, node): - """Test that when multiple JSON objects exist, first is taken.""" - input_text = '{"first": 1} and then {"second": 2}' - result = node._extract_json(input_text, ["first"]) - assert result == {"first": 1} - - def test_json_with_boolean_and_null(self, node): - """Test parsing JSON with boolean and null values.""" - input_text = '{"active": true, "deleted": false, "data": null}' - result = node._extract_json(input_text, ["active", "deleted", "data"]) - assert 
result == {"active": True, "deleted": False, "data": None} - - def test_json_with_numbers(self, node): - """Test parsing JSON with integer and float values.""" - input_text = '{"count": 42, "price": 19.99}' - result = node._extract_json(input_text, ["count", "price"]) - assert result == {"count": 42, "price": 19.99} - - def test_invalid_json_raises_error(self, node, monkeypatch): - """Test that completely invalid JSON raises an error when no LLM fallback available.""" - # Remove API keys so LLM fallback is not attempted - monkeypatch.delenv("CEREBRAS_API_KEY", raising=False) - monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) - with pytest.raises(ValueError, match="Cannot parse JSON"): - node._extract_json("This is not JSON at all", ["key"]) - - def test_empty_string_raises_error(self, node, monkeypatch): - """Test that empty string raises an error when no LLM fallback available.""" - # Remove API keys so LLM fallback is not attempted - monkeypatch.delenv("CEREBRAS_API_KEY", raising=False) - monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) - with pytest.raises(ValueError, match="Cannot parse JSON"): - node._extract_json("", ["key"]) diff --git a/core/tests/test_on_failure_edges.py b/core/tests/test_on_failure_edges.py index 8a31f40e..5ce1ff0c 100644 --- a/core/tests/test_on_failure_edges.py +++ b/core/tests/test_on_failure_edges.py @@ -95,7 +95,7 @@ async def test_on_failure_edge_followed_after_max_retries(runtime, goal): id="failing", name="Failing Node", description="Always fails", - node_type="function", + node_type="event_loop", output_keys=[], max_retries=1, ), @@ -103,7 +103,7 @@ async def test_on_failure_edge_followed_after_max_retries(runtime, goal): id="handler", name="Failure Handler", description="Handles failures", - node_type="function", + node_type="event_loop", output_keys=["handled", "recovery"], ), ] @@ -156,7 +156,7 @@ async def test_no_on_failure_edge_still_terminates(runtime, goal): id="failing", name="Failing Node", description="Always fails", - node_type="function", + node_type="event_loop", output_keys=[], max_retries=1, ), @@ -193,21 +193,21 @@ async def test_on_failure_edge_not_followed_on_success(runtime, goal): id="working", name="Working Node", description="Always succeeds", - node_type="function", + node_type="event_loop", output_keys=["result"], ), NodeSpec( id="handler", name="Failure Handler", description="Should not be reached", - node_type="function", + node_type="event_loop", output_keys=["handled"], ), NodeSpec( id="next", name="Next Node", description="Normal successor", - node_type="function", + node_type="event_loop", output_keys=["done"], ), ] @@ -261,7 +261,7 @@ async def test_on_failure_edge_with_zero_retries(runtime, goal): id="fragile", name="Fragile Node", description="Fails with no retries", - node_type="function", + node_type="event_loop", output_keys=[], max_retries=0, ), @@ -269,7 +269,7 @@ async def test_on_failure_edge_with_zero_retries(runtime, goal): id="handler", name="Failure Handler", description="Handles failures", - node_type="function", + node_type="event_loop", output_keys=["handled", "recovery"], ), ] @@ -317,7 +317,7 @@ async def test_on_failure_handler_appears_in_path(runtime, goal): id="failing", name="Failing Node", description="Always fails", - node_type="function", + node_type="event_loop", output_keys=[], max_retries=1, ), @@ -325,7 +325,7 @@ async def test_on_failure_handler_appears_in_path(runtime, goal): id="handler", name="Failure Handler", description="Handles failures", - node_type="function", + 
node_type="event_loop", output_keys=["handled", "recovery"], ), ] diff --git a/core/tests/test_plan.py b/core/tests/test_plan.py deleted file mode 100644 index 3867fa1b..00000000 --- a/core/tests/test_plan.py +++ /dev/null @@ -1,592 +0,0 @@ -"""Tests for plan.py - Plan enums and Pydantic models.""" - -import json - -import pytest - -from framework.graph.plan import ( - ActionSpec, - ActionType, - ApprovalDecision, - ExecutionStatus, - JudgmentAction, - Plan, - PlanStep, - StepStatus, -) - - -class TestActionTypeEnum: - """Tests for ActionType enum values.""" - - def test_action_type_values_exist(self): - """All 5 ActionType values exist.""" - assert ActionType.LLM_CALL.value == "llm_call" - assert ActionType.TOOL_USE.value == "tool_use" - assert ActionType.SUB_GRAPH.value == "sub_graph" - assert ActionType.FUNCTION.value == "function" - assert ActionType.CODE_EXECUTION.value == "code_execution" - - def test_action_type_count(self): - """ActionType has exactly 5 members.""" - assert len(ActionType) == 5 - - def test_action_type_string_enum(self): - """ActionType is a string enum.""" - assert isinstance(ActionType.LLM_CALL, str) - assert ActionType.LLM_CALL == "llm_call" - - -class TestStepStatusEnum: - """Tests for StepStatus enum values.""" - - def test_step_status_values_exist(self): - """All 7 StepStatus values exist.""" - assert StepStatus.PENDING.value == "pending" - assert StepStatus.AWAITING_APPROVAL.value == "awaiting_approval" - assert StepStatus.IN_PROGRESS.value == "in_progress" - assert StepStatus.COMPLETED.value == "completed" - assert StepStatus.FAILED.value == "failed" - assert StepStatus.SKIPPED.value == "skipped" - assert StepStatus.REJECTED.value == "rejected" - - def test_step_status_count(self): - """StepStatus has exactly 7 members.""" - assert len(StepStatus) == 7 - - def test_step_status_transition_pending_to_in_progress(self): - """Status can change from PENDING to IN_PROGRESS.""" - step = PlanStep( - id="step_1", - description="Test step", - action=ActionSpec(action_type=ActionType.FUNCTION), - status=StepStatus.PENDING, - ) - step.status = StepStatus.IN_PROGRESS - assert step.status == StepStatus.IN_PROGRESS - - def test_step_status_transition_in_progress_to_completed(self): - """Status can change from IN_PROGRESS to COMPLETED.""" - step = PlanStep( - id="step_1", - description="Test step", - action=ActionSpec(action_type=ActionType.FUNCTION), - status=StepStatus.IN_PROGRESS, - ) - step.status = StepStatus.COMPLETED - assert step.status == StepStatus.COMPLETED - - def test_step_status_transition_in_progress_to_failed(self): - """Status can change from IN_PROGRESS to FAILED.""" - step = PlanStep( - id="step_1", - description="Test step", - action=ActionSpec(action_type=ActionType.FUNCTION), - status=StepStatus.IN_PROGRESS, - ) - step.status = StepStatus.FAILED - assert step.status == StepStatus.FAILED - - -class TestApprovalDecisionEnum: - """Tests for ApprovalDecision enum values.""" - - def test_approval_decision_values_exist(self): - """All 4 ApprovalDecision values exist.""" - assert ApprovalDecision.APPROVE.value == "approve" - assert ApprovalDecision.REJECT.value == "reject" - assert ApprovalDecision.MODIFY.value == "modify" - assert ApprovalDecision.ABORT.value == "abort" - - def test_approval_decision_count(self): - """ApprovalDecision has exactly 4 members.""" - assert len(ApprovalDecision) == 4 - - -class TestJudgmentActionEnum: - """Tests for JudgmentAction enum values.""" - - def test_judgment_action_values_exist(self): - """All 4 JudgmentAction values 
exist.""" - assert JudgmentAction.ACCEPT.value == "accept" - assert JudgmentAction.RETRY.value == "retry" - assert JudgmentAction.REPLAN.value == "replan" - assert JudgmentAction.ESCALATE.value == "escalate" - - def test_judgment_action_count(self): - """JudgmentAction has exactly 4 members.""" - assert len(JudgmentAction) == 4 - - -class TestExecutionStatusEnum: - """Tests for ExecutionStatus enum values.""" - - def test_execution_status_values_exist(self): - """All 7 ExecutionStatus values exist.""" - assert ExecutionStatus.COMPLETED.value == "completed" - assert ExecutionStatus.AWAITING_APPROVAL.value == "awaiting_approval" - assert ExecutionStatus.NEEDS_REPLAN.value == "needs_replan" - assert ExecutionStatus.NEEDS_ESCALATION.value == "needs_escalation" - assert ExecutionStatus.REJECTED.value == "rejected" - assert ExecutionStatus.ABORTED.value == "aborted" - assert ExecutionStatus.FAILED.value == "failed" - - def test_execution_status_count(self): - """ExecutionStatus has exactly 7 members.""" - assert len(ExecutionStatus) == 7 - - -class TestPlanStepIsReady: - """Tests for PlanStep.is_ready() method.""" - - def test_plan_step_is_ready_no_deps(self): - """Step with no dependencies is ready when PENDING.""" - step = PlanStep( - id="step_1", - description="Test step", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=[], - status=StepStatus.PENDING, - ) - assert step.is_ready(set()) is True - - def test_plan_step_is_ready_deps_met(self): - """Step is ready when all dependencies are completed.""" - step = PlanStep( - id="step_2", - description="Second step", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=["step_1"], - status=StepStatus.PENDING, - ) - assert step.is_ready({"step_1"}) is True - - def test_plan_step_not_ready_deps_missing(self): - """Step is not ready when dependencies are incomplete.""" - step = PlanStep( - id="step_2", - description="Second step", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=["step_1", "step_3"], - status=StepStatus.PENDING, - ) - # Only step_1 completed, step_3 still pending - assert step.is_ready({"step_1"}) is False - - def test_plan_step_not_ready_wrong_status(self): - """Step is not ready if status is not PENDING.""" - step = PlanStep( - id="step_1", - description="Test step", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=[], - status=StepStatus.IN_PROGRESS, - ) - assert step.is_ready(set()) is False - - def test_plan_step_not_ready_completed_status(self): - """Completed step is not ready to execute again.""" - step = PlanStep( - id="step_1", - description="Test step", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=[], - status=StepStatus.COMPLETED, - ) - assert step.is_ready(set()) is False - - def test_plan_step_is_ready_multiple_deps_all_met(self): - """Step with multiple dependencies is ready when all are met.""" - step = PlanStep( - id="step_4", - description="Fourth step", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=["step_1", "step_2", "step_3"], - status=StepStatus.PENDING, - ) - assert step.is_ready({"step_1", "step_2", "step_3"}) is True - - -class TestPlanFromJson: - """Tests for Plan.from_json() method.""" - - def test_plan_from_json_string(self): - """Parse Plan from JSON string.""" - json_str = json.dumps( - { - "id": "plan_1", - "goal_id": "goal_1", - "description": "Test plan", - "steps": [ - { - "id": "step_1", - "description": "First step", - "action": { - "action_type": "function", - "function_name": 
"do_something", - }, - } - ], - } - ) - - plan = Plan.from_json(json_str) - - assert plan.id == "plan_1" - assert plan.goal_id == "goal_1" - assert len(plan.steps) == 1 - assert plan.steps[0].id == "step_1" - - def test_plan_from_json_dict(self): - """Parse Plan from dict directly.""" - data = { - "id": "plan_1", - "goal_id": "goal_1", - "description": "Test plan", - "steps": [ - { - "id": "step_1", - "description": "First step", - "action": { - "action_type": "function", - }, - } - ], - } - - plan = Plan.from_json(data) - - assert plan.id == "plan_1" - assert plan.goal_id == "goal_1" - - def test_plan_from_json_nested_plan_key(self): - """Handle {"plan": {...}} wrapper from export_graph().""" - data = { - "plan": { - "id": "plan_1", - "goal_id": "goal_1", - "description": "Test plan", - "steps": [], - } - } - - plan = Plan.from_json(data) - - assert plan.id == "plan_1" - - def test_plan_from_json_action_type_conversion(self): - """String action_type is converted to ActionType enum.""" - data = { - "id": "plan_1", - "goal_id": "goal_1", - "description": "Test plan", - "steps": [ - { - "id": "step_1", - "description": "LLM step", - "action": { - "action_type": "llm_call", - "prompt": "Hello", - }, - } - ], - } - - plan = Plan.from_json(data) - - assert plan.steps[0].action.action_type == ActionType.LLM_CALL - - def test_plan_from_json_all_action_types(self): - """All action types are correctly converted.""" - action_types = ["llm_call", "tool_use", "sub_graph", "function", "code_execution"] - - for action_type in action_types: - data = { - "id": "plan", - "goal_id": "goal", - "description": "Test", - "steps": [ - { - "id": "step", - "description": "Step", - "action": {"action_type": action_type}, - } - ], - } - plan = Plan.from_json(data) - assert plan.steps[0].action.action_type.value == action_type - - def test_from_json_invalid_action_type(self): - """Unknown action_type raises ValueError.""" - data = { - "id": "plan_1", - "goal_id": "goal_1", - "description": "Test plan", - "steps": [ - { - "id": "step_1", - "description": "Invalid step", - "action": { - "action_type": "invalid_type", - }, - } - ], - } - - with pytest.raises(ValueError): - Plan.from_json(data) - - def test_from_json_malformed_json_string(self): - """Invalid JSON syntax raises parse error.""" - invalid_json = "{ invalid json }" - - with pytest.raises(json.JSONDecodeError): - Plan.from_json(invalid_json) - - def test_from_json_missing_step_id(self): - """Step without 'id' raises validation error.""" - data = { - "id": "plan_1", - "goal_id": "goal_1", - "description": "Test plan", - "steps": [ - { - "description": "Step without ID", - "action": {"action_type": "function"}, - } - ], - } - - with pytest.raises(KeyError): - Plan.from_json(data) - - def test_from_json_wrong_type_for_steps(self): - """Non-list steps value raises error.""" - data = { - "id": "plan_1", - "goal_id": "goal_1", - "description": "Test plan", - "steps": "not a list", - } - - with pytest.raises(AttributeError): - Plan.from_json(data) - - def test_from_json_empty_data(self): - """Empty dict creates plan with defaults.""" - plan = Plan.from_json({}) - - assert plan.id == "plan" - assert plan.goal_id == "" - assert plan.steps == [] - - -class TestPlanMethods: - """Tests for Plan instance methods.""" - - @pytest.fixture - def sample_plan(self): - """Create a sample plan with multiple steps.""" - return Plan( - id="test_plan", - goal_id="goal_1", - description="Test plan", - steps=[ - PlanStep( - id="step_1", - description="First step", - 
action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=[], - status=StepStatus.COMPLETED, - result={"data": "result1"}, - ), - PlanStep( - id="step_2", - description="Second step", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=["step_1"], - status=StepStatus.PENDING, - ), - PlanStep( - id="step_3", - description="Third step", - action=ActionSpec(action_type=ActionType.FUNCTION), - dependencies=["step_1"], - status=StepStatus.FAILED, - error="Something went wrong", - attempts=3, - ), - ], - ) - - def test_plan_get_step(self, sample_plan): - """Find step by ID.""" - step = sample_plan.get_step("step_2") - - assert step is not None - assert step.id == "step_2" - assert step.description == "Second step" - - def test_plan_get_step_not_found(self, sample_plan): - """Returns None for missing step ID.""" - step = sample_plan.get_step("nonexistent") - - assert step is None - - def test_plan_get_ready_steps(self, sample_plan): - """Filter steps ready to execute.""" - ready = sample_plan.get_ready_steps() - - assert len(ready) == 1 - assert ready[0].id == "step_2" - - def test_plan_get_completed_steps(self, sample_plan): - """Filter completed steps.""" - completed = sample_plan.get_completed_steps() - - assert len(completed) == 1 - assert completed[0].id == "step_1" - - def test_plan_is_complete_false(self, sample_plan): - """Plan is not complete when steps are pending/failed.""" - assert sample_plan.is_complete() is False - - def test_plan_is_complete_true(self): - """Plan is complete when all steps are completed.""" - plan = Plan( - id="test_plan", - goal_id="goal_1", - description="Test plan", - steps=[ - PlanStep( - id="step_1", - description="First step", - action=ActionSpec(action_type=ActionType.FUNCTION), - status=StepStatus.COMPLETED, - ), - PlanStep( - id="step_2", - description="Second step", - action=ActionSpec(action_type=ActionType.FUNCTION), - status=StepStatus.COMPLETED, - ), - ], - ) - assert plan.is_complete() is True - - def test_plan_is_complete_empty(self): - """Empty plan is considered complete.""" - plan = Plan( - id="empty_plan", - goal_id="goal_1", - description="Empty plan", - steps=[], - ) - assert plan.is_complete() is True - - def test_plan_to_feedback_context(self, sample_plan): - """Serializes context for replanning.""" - context = sample_plan.to_feedback_context() - - assert context["plan_id"] == "test_plan" - assert context["revision"] == 1 - assert len(context["completed_steps"]) == 1 - assert context["completed_steps"][0]["id"] == "step_1" - assert len(context["failed_steps"]) == 1 - assert context["failed_steps"][0]["id"] == "step_3" - assert context["failed_steps"][0]["error"] == "Something went wrong" - - -class TestPlanRoundTrip: - """Tests for Plan serialization round-trip.""" - - def test_plan_round_trip_model_dump(self): - """from_json(plan.model_dump()) preserves data.""" - original = Plan( - id="plan_1", - goal_id="goal_1", - description="Test plan", - steps=[ - PlanStep( - id="step_1", - description="First step", - action=ActionSpec( - action_type=ActionType.LLM_CALL, - prompt="Hello world", - ), - dependencies=[], - expected_outputs=["greeting"], - ), - ], - context={"key": "value"}, - revision=2, - ) - - # Round-trip through dict - data = original.model_dump() - restored = Plan.from_json(data) - - assert restored.id == original.id - assert restored.goal_id == original.goal_id - assert restored.description == original.description - assert restored.context == original.context - assert restored.revision == 
original.revision - assert len(restored.steps) == len(original.steps) - assert restored.steps[0].id == original.steps[0].id - assert restored.steps[0].action.action_type == original.steps[0].action.action_type - - def test_plan_round_trip_json_string(self): - """from_json(plan.model_dump_json()) preserves data.""" - original = Plan( - id="plan_1", - goal_id="goal_1", - description="Test plan", - steps=[ - PlanStep( - id="step_1", - description="First step", - action=ActionSpec( - action_type=ActionType.TOOL_USE, - tool_name="my_tool", - tool_args={"arg1": "value1"}, - ), - dependencies=[], - ), - ], - ) - - # Round-trip through JSON string - json_str = original.model_dump_json() - restored = Plan.from_json(json_str) - - assert restored.id == original.id - assert len(restored.steps) == 1 - assert restored.steps[0].action.tool_name == "my_tool" - - def test_plan_step_serialization(self): - """PlanStep serializes and deserializes correctly.""" - step = PlanStep( - id="step_1", - description="Test step", - action=ActionSpec( - action_type=ActionType.CODE_EXECUTION, - code="print('hello')", - language="python", - ), - inputs={"input1": "value1"}, - expected_outputs=["output1", "output2"], - dependencies=["dep1", "dep2"], - requires_approval=True, - approval_message="Please approve", - ) - - # Serialize and deserialize - data = step.model_dump() - - assert data["id"] == "step_1" - assert data["action"]["action_type"] == "code_execution" - assert data["action"]["code"] == "print('hello')" - assert data["inputs"] == {"input1": "value1"} - assert data["expected_outputs"] == ["output1", "output2"] - assert data["dependencies"] == ["dep1", "dep2"] - assert data["requires_approval"] is True diff --git a/core/tests/test_plan_dependency_resolution.py b/core/tests/test_plan_dependency_resolution.py deleted file mode 100644 index 5300612f..00000000 --- a/core/tests/test_plan_dependency_resolution.py +++ /dev/null @@ -1,384 +0,0 @@ -""" -Tests for Plan dependency resolution with failed steps. - -These tests verify that plan execution correctly handles failed dependencies -instead of hanging indefinitely. 
-""" - -import pytest - -from framework.graph.plan import ( - ActionSpec, - ActionType, - Plan, - PlanStep, - StepStatus, -) - - -class TestStepStatusTerminal: - """Tests for StepStatus.is_terminal() method.""" - - def test_completed_is_terminal(self): - """COMPLETED status should be terminal.""" - assert StepStatus.COMPLETED.is_terminal() is True - - def test_failed_is_terminal(self): - """FAILED status should be terminal.""" - assert StepStatus.FAILED.is_terminal() is True - - def test_skipped_is_terminal(self): - """SKIPPED status should be terminal.""" - assert StepStatus.SKIPPED.is_terminal() is True - - def test_rejected_is_terminal(self): - """REJECTED status should be terminal.""" - assert StepStatus.REJECTED.is_terminal() is True - - def test_pending_is_not_terminal(self): - """PENDING status should not be terminal.""" - assert StepStatus.PENDING.is_terminal() is False - - def test_in_progress_is_not_terminal(self): - """IN_PROGRESS status should not be terminal.""" - assert StepStatus.IN_PROGRESS.is_terminal() is False - - def test_awaiting_approval_is_not_terminal(self): - """AWAITING_APPROVAL status should not be terminal.""" - assert StepStatus.AWAITING_APPROVAL.is_terminal() is False - - def test_completed_is_successful(self): - """Only COMPLETED should be successful.""" - assert StepStatus.COMPLETED.is_successful() is True - assert StepStatus.FAILED.is_successful() is False - assert StepStatus.SKIPPED.is_successful() is False - - -class TestPlanStepIsReady: - """Tests for PlanStep.is_ready() with terminal states.""" - - def _make_step(self, id: str, deps: list[str] = None, status: StepStatus = StepStatus.PENDING): - """Helper to create a step.""" - return PlanStep( - id=id, - description=f"Step {id}", - action=ActionSpec(action_type=ActionType.FUNCTION, function_name="test"), - dependencies=deps or [], - status=status, - ) - - def test_step_ready_when_no_dependencies(self): - """Step with no dependencies should be ready.""" - step = self._make_step("step1") - assert step.is_ready(set()) is True - - def test_step_ready_when_dependency_completed(self): - """Step should be ready when dependency is completed.""" - step = self._make_step("step2", deps=["step1"]) - assert step.is_ready({"step1"}) is True - - def test_step_ready_when_dependency_failed(self): - """Step should be ready when dependency failed (terminal state).""" - step = self._make_step("step2", deps=["step1"]) - # step1 is in terminal_step_ids because it failed - assert step.is_ready({"step1"}) is True - - def test_step_not_ready_when_dependency_pending(self): - """Step should not be ready when dependency is still pending.""" - step = self._make_step("step2", deps=["step1"]) - assert step.is_ready(set()) is False - - def test_step_not_ready_when_already_completed(self): - """Completed step should not be ready.""" - step = self._make_step("step1", status=StepStatus.COMPLETED) - assert step.is_ready(set()) is False - - def test_step_not_ready_when_in_progress(self): - """In-progress step should not be ready.""" - step = self._make_step("step1", status=StepStatus.IN_PROGRESS) - assert step.is_ready(set()) is False - - def test_step_ready_with_multiple_dependencies_all_terminal(self): - """Step should be ready when all dependencies are terminal.""" - step = self._make_step("step3", deps=["step1", "step2"]) - assert step.is_ready({"step1", "step2"}) is True - - def test_step_not_ready_with_partial_dependencies(self): - """Step should not be ready when only some dependencies are terminal.""" - step = 
self._make_step("step3", deps=["step1", "step2"]) - assert step.is_ready({"step1"}) is False - - -class TestPlanGetReadySteps: - """Tests for Plan.get_ready_steps() with failed dependencies.""" - - def _make_plan(self, steps: list[PlanStep]) -> Plan: - """Helper to create a plan.""" - return Plan( - id="test_plan", - goal_id="test_goal", - description="Test plan", - steps=steps, - ) - - def _make_step(self, id: str, deps: list[str] = None, status: StepStatus = StepStatus.PENDING): - """Helper to create a step.""" - return PlanStep( - id=id, - description=f"Step {id}", - action=ActionSpec(action_type=ActionType.FUNCTION, function_name="test"), - dependencies=deps or [], - status=status, - ) - - def test_ready_steps_with_no_dependencies(self): - """Steps with no dependencies should be ready.""" - plan = self._make_plan( - [ - self._make_step("step1"), - self._make_step("step2"), - ] - ) - ready = plan.get_ready_steps() - assert len(ready) == 2 - assert {s.id for s in ready} == {"step1", "step2"} - - def test_ready_steps_with_completed_dependency(self): - """Dependent step should be ready when dependency is completed.""" - plan = self._make_plan( - [ - self._make_step("step1", status=StepStatus.COMPLETED), - self._make_step("step2", deps=["step1"]), - ] - ) - ready = plan.get_ready_steps() - assert len(ready) == 1 - assert ready[0].id == "step2" - - def test_ready_steps_with_failed_dependency(self): - """Dependent step should be ready when dependency failed.""" - plan = self._make_plan( - [ - self._make_step("step1", status=StepStatus.FAILED), - self._make_step("step2", deps=["step1"]), - ] - ) - ready = plan.get_ready_steps() - assert len(ready) == 1 - assert ready[0].id == "step2" - - def test_ready_steps_with_skipped_dependency(self): - """Dependent step should be ready when dependency was skipped.""" - plan = self._make_plan( - [ - self._make_step("step1", status=StepStatus.SKIPPED), - self._make_step("step2", deps=["step1"]), - ] - ) - ready = plan.get_ready_steps() - assert len(ready) == 1 - assert ready[0].id == "step2" - - def test_ready_steps_with_rejected_dependency(self): - """Dependent step should be ready when dependency was rejected.""" - plan = self._make_plan( - [ - self._make_step("step1", status=StepStatus.REJECTED), - self._make_step("step2", deps=["step1"]), - ] - ) - ready = plan.get_ready_steps() - assert len(ready) == 1 - assert ready[0].id == "step2" - - def test_no_ready_steps_when_dependency_in_progress(self): - """Dependent step should not be ready when dependency is in progress.""" - plan = self._make_plan( - [ - self._make_step("step1", status=StepStatus.IN_PROGRESS), - self._make_step("step2", deps=["step1"]), - ] - ) - ready = plan.get_ready_steps() - assert len(ready) == 0 - - -class TestPlanCompletion: - """Tests for Plan completion status methods.""" - - def _make_plan(self, steps: list[PlanStep]) -> Plan: - """Helper to create a plan.""" - return Plan( - id="test_plan", - goal_id="test_goal", - description="Test plan", - steps=steps, - ) - - def _make_step(self, id: str, status: StepStatus = StepStatus.PENDING): - """Helper to create a step.""" - return PlanStep( - id=id, - description=f"Step {id}", - action=ActionSpec(action_type=ActionType.FUNCTION, function_name="test"), - status=status, - ) - - def test_is_complete_when_all_completed(self): - """Plan should be complete when all steps are completed.""" - plan = self._make_plan( - [ - self._make_step("step1", StepStatus.COMPLETED), - self._make_step("step2", StepStatus.COMPLETED), - ] - ) - assert 
plan.is_complete() is True - - def test_is_complete_when_all_terminal_mixed(self): - """Plan should be complete when all steps are in terminal states (mixed).""" - plan = self._make_plan( - [ - self._make_step("step1", StepStatus.COMPLETED), - self._make_step("step2", StepStatus.FAILED), - self._make_step("step3", StepStatus.SKIPPED), - ] - ) - assert plan.is_complete() is True - - def test_is_not_complete_when_pending(self): - """Plan should not be complete when steps are pending.""" - plan = self._make_plan( - [ - self._make_step("step1", StepStatus.COMPLETED), - self._make_step("step2", StepStatus.PENDING), - ] - ) - assert plan.is_complete() is False - - def test_is_not_complete_when_in_progress(self): - """Plan should not be complete when steps are in progress.""" - plan = self._make_plan( - [ - self._make_step("step1", StepStatus.COMPLETED), - self._make_step("step2", StepStatus.IN_PROGRESS), - ] - ) - assert plan.is_complete() is False - - def test_is_successful_when_all_completed(self): - """Plan should be successful only when all steps completed.""" - plan = self._make_plan( - [ - self._make_step("step1", StepStatus.COMPLETED), - self._make_step("step2", StepStatus.COMPLETED), - ] - ) - assert plan.is_successful() is True - - def test_is_not_successful_when_failed(self): - """Plan should not be successful when any step failed.""" - plan = self._make_plan( - [ - self._make_step("step1", StepStatus.COMPLETED), - self._make_step("step2", StepStatus.FAILED), - ] - ) - assert plan.is_successful() is False - - def test_has_failed_steps(self): - """has_failed_steps should detect failed steps.""" - plan = self._make_plan( - [ - self._make_step("step1", StepStatus.COMPLETED), - self._make_step("step2", StepStatus.FAILED), - ] - ) - assert plan.has_failed_steps() is True - - def test_has_no_failed_steps(self): - """has_failed_steps should return False when all succeeded.""" - plan = self._make_plan( - [ - self._make_step("step1", StepStatus.COMPLETED), - self._make_step("step2", StepStatus.COMPLETED), - ] - ) - assert plan.has_failed_steps() is False - - def test_get_failed_steps(self): - """get_failed_steps should return all failed/skipped/rejected steps.""" - plan = self._make_plan( - [ - self._make_step("step1", StepStatus.COMPLETED), - self._make_step("step2", StepStatus.FAILED), - self._make_step("step3", StepStatus.SKIPPED), - self._make_step("step4", StepStatus.REJECTED), - ] - ) - failed = plan.get_failed_steps() - assert len(failed) == 3 - assert {s.id for s in failed} == {"step2", "step3", "step4"} - - -class TestBugScenario: - """Test the specific bug scenario that was fixed.""" - - def _make_step(self, id: str, deps: list[str] = None, status: StepStatus = StepStatus.PENDING): - """Helper to create a step.""" - return PlanStep( - id=id, - description=f"Step {id}", - action=ActionSpec(action_type=ActionType.FUNCTION, function_name="test"), - dependencies=deps or [], - status=status, - ) - - def test_dependent_step_becomes_ready_after_dependency_fails(self): - """ - BUG SCENARIO: When step1 fails, step2 (which depends on step1) should - become ready, allowing the executor to handle it appropriately. - - Before fix: step2 would never become ready, causing infinite hang. - After fix: step2 becomes ready and executor can decide how to handle it. 
- """ - plan = Plan( - id="test_plan", - goal_id="test_goal", - description="Test plan with dependency", - steps=[ - self._make_step("step1", status=StepStatus.PENDING), - self._make_step("step2", deps=["step1"], status=StepStatus.PENDING), - ], - ) - - # Initially, only step1 is ready - ready = plan.get_ready_steps() - assert len(ready) == 1 - assert ready[0].id == "step1" - - # Simulate step1 failing - plan.steps[0].status = StepStatus.FAILED - - # Now step2 should be ready (dependency is in terminal state) - ready = plan.get_ready_steps() - assert len(ready) == 1 - assert ready[0].id == "step2" - - # Plan should not be complete yet (step2 is still pending) - assert plan.is_complete() is False - - # Simulate step2 also failing (or being skipped due to failed dependency) - plan.steps[1].status = StepStatus.SKIPPED - - # Now plan should be complete (all steps in terminal states) - assert plan.is_complete() is True - - # But not successful - assert plan.is_successful() is False - - # And should have failed steps - assert plan.has_failed_steps() is True - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/core/tests/test_pydantic_validation.py b/core/tests/test_pydantic_validation.py index aea3db87..c86b8a91 100644 --- a/core/tests/test_pydantic_validation.py +++ b/core/tests/test_pydantic_validation.py @@ -46,7 +46,7 @@ class TestNodeSpecOutputModel: id="test_node", name="Test Node", description="A test node", - node_type="llm_generate", + node_type="event_loop", output_model=SimpleOutput, ) @@ -400,7 +400,7 @@ class TestPydanticValidationIntegrationExtended: id="full_test", name="Full Validation Test", description="Tests all validation options", - node_type="llm_generate", + node_type="event_loop", output_keys=["category", "priority", "summary", "suggested_action"], output_model=TicketAnalysis, max_validation_retries=3, diff --git a/core/tests/test_runtime_logger.py b/core/tests/test_runtime_logger.py index 4495c08d..7ac3a91a 100644 --- a/core/tests/test_runtime_logger.py +++ b/core/tests/test_runtime_logger.py @@ -51,7 +51,7 @@ class TestRuntimeLogStore: detail2 = NodeDetail( node_id="node-2", node_name="Process Node", - node_type="function", + node_type="event_loop", success=True, total_steps=1, ) @@ -64,7 +64,7 @@ class TestRuntimeLogStore: assert len(loaded.nodes) == 2 assert loaded.nodes[0].node_id == "node-1" assert loaded.nodes[0].exit_status == "success" - assert loaded.nodes[1].node_type == "function" + assert loaded.nodes[1].node_type == "event_loop" @pytest.mark.asyncio async def test_append_and_load_tool_logs(self, tmp_path: Path): @@ -606,14 +606,14 @@ class TestRuntimeLogger: # Node 2: function rt_logger.log_step( node_id="node-2", - node_type="function", + node_type="event_loop", step_index=0, latency_ms=50, ) rt_logger.log_node_complete( node_id="node-2", node_name="Process", - node_type="function", + node_type="event_loop", success=True, total_steps=1, latency_ms=50, diff --git a/docs/articles/aden-vs-autogen.md b/docs/articles/aden-vs-autogen.md index fe354766..4d247926 100644 --- a/docs/articles/aden-vs-autogen.md +++ b/docs/articles/aden-vs-autogen.md @@ -82,9 +82,9 @@ learn from the corrections to improve accuracy. 
| Feature | AutoGen | Aden | |---------|---------|------| | Agent-to-agent | Natural language | Generated connections | -| Conversation history | Built-in | Via memory nodes | +| Conversation history | Built-in | Via shared memory | | Message passing | Sequential turns | Async/event-driven | -| Human interaction | Via UserProxyAgent | Native HITL nodes | +| Human interaction | Via UserProxyAgent | Client-facing nodes | **Verdict:** AutoGen is more natural for dialogue; Aden is more flexible for diverse patterns. diff --git a/docs/cleanup-plan.md b/docs/cleanup-plan.md new file mode 100644 index 00000000..8ded231f --- /dev/null +++ b/docs/cleanup-plan.md @@ -0,0 +1,161 @@ +# Phase 2: FunctionNode Removal + Dead Code Cleanup + +> Ref: [GitHub Issue #4753](https://github.com/adenhq/hive/issues/4753) + +## Context + +`FunctionNode` (`node_type="function"`) breaks three core agent principles: conversation continuity, cumulative tools, and user interruptibility. Phase 1 (soft deprecation warnings) is complete. This plan covers Phase 2 (hard removal) plus cleanup of other dead code discovered during scoping. + +**Total estimated removal: ~5,000+ lines** across production code, tests, docs, and examples. + +--- + +## Part 1: Remove `FunctionNode` class and `"function"` node type + +### 1.1 Core framework + +| File | What to remove/change | +|---|---| +| `core/framework/graph/node.py` | Delete `FunctionNode` class (~L1878-1985). Remove `function` field from `NodeSpec` (~L200). | +| `core/framework/graph/executor.py` | Remove `FunctionNode` import (~L24). Remove `"function"` from `VALID_NODE_TYPES` (~L1473). Remove `node_type == "function"` branch (~L1529-1533). Remove `register_function()` (~L1975-1977). Add migration error for graphs with `node_type="function"`. | +| `core/framework/builder/workflow.py` | Remove `node_type == "function"` validation block (~L258-260). | + +### 1.2 Agent Builder MCP server + +| File | What to change | +|---|---| +| `core/framework/mcp/agent_builder_server.py` | Remove `"function"` from `node_type` description in `add_node` (~L590) and `update_node` (~L841). Remove `node_type == "function"` simulation branch in `test_node` (~L2356-2357). | + +### 1.3 Examples & demos + +| File | Action | +|---|---| +| `core/examples/manual_agent.py` | Rewrite to use `event_loop` nodes | +| `core/demos/github_outreach_demo.py` | Convert `Sender` node from `function` to `event_loop` | +| `core/examples/mcp_integration_example.py` | Rewrite to use `event_loop` nodes | + +### 1.4 Docs & skills + +| File | Action | +|---|---| +| `.claude/skills/hive-create/SKILL.md` | Remove `"function"` from node type table (~L495, L856) | +| `docs/developer-guide.md` | Remove `"function"` node type reference (~L613) | +| `core/MCP_SERVER_GUIDE.md` | Audit for `"function"` references | +| `docs/why-conditional-edge-priority.md` | Remove or repurpose (entire doc framed around function nodes) | +| `docs/environment-setup.md` | Remove "function" from node types list (~L216) | +| `docs/i18n/*.md` | Update BUILD diagrams in 7 i18n files (ja, ko, pt, hi, es, ru, zh-CN) removing "Function" | +| `core/framework/runtime/runtime_log_schemas.py` | Remove `"function"` from node_type comment (~L40) | + +--- + +## Part 2: Remove deprecated `LLMNode` + `llm_tool_use` / `llm_generate` + +Already soft-deprecated with `DeprecationWarning`. No template agent uses them. Only `mcp_integration_example.py` references them. 
+
+| File | What to remove/change |
+|---|---|
+| `core/framework/graph/node.py` | Delete `LLMNode` class (~L660-1689, ~1000 lines). Largest single removal. |
+| `core/framework/graph/executor.py` | Remove the `LLMNode` import. Remove `"llm_tool_use"`/`"llm_generate"` from `VALID_NODE_TYPES`. Remove the `DEPRECATED_NODE_TYPES` dict. Remove their branches in `_get_node_implementation` (~L1507-1523). Update the `human_input` branch to use `EventLoopNode` instead of `LLMNode`. Add a migration error for deprecated types. |
+| `core/framework/mcp/agent_builder_server.py` | Remove `llm_tool_use`/`llm_generate` validation warnings and branches (~L668-683, L922-937) |
+
+---
+
+## Part 3: Rewrite tests using `function` nodes as fixtures
+
+These tests use `node_type="function"` as convenient scaffolding but actually exercise graph execution features (retries, fan-out, feedback edges, etc.). They all need rewriting.
+
+| Test file | What it tests |
+|---|---|
+| `core/tests/test_on_failure_edges.py` | On-failure edge routing (~10 function nodes) |
+| `core/tests/test_executor_feedback_edges.py` | Max node visits, feedback loops (~20+ function nodes) |
+| `core/tests/test_executor_max_retries.py` | Retry behavior (~7 function nodes) |
+| `core/tests/test_fanout.py` | Fan-out/fan-in parallel execution (~20+ function nodes) |
+| `core/tests/test_execution_quality.py` | Retry + quality scoring (~8 function nodes) |
+| `core/tests/test_conditional_edge_direct_key.py` | Conditional edge evaluation (~8 function nodes) |
+| `core/tests/test_event_loop_integration.py` | Mixed node graph test (~2 function nodes) |
+| `core/tests/test_runtime_logger.py` | Runtime log schema (~2 references) |
+| `tools/tests/tools/test_runtime_logs_tool.py` | Log tool output (~2 references) |
+
+**Strategy:** Create a `MockNode(NodeProtocol)` test helper that wraps a callable, giving tests the same convenience as `FunctionNode` without shipping it in production. Tests swap `node_type="function"` for a neutral `node_type="event_loop"` and register a `MockNode` in the executor's `node_registry`. This minimizes rewrite effort; a sketch of the helper follows.
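+
+A minimal sketch of the helper (names are assumptions; verify the `NodeProtocol` method signature against `core/framework/graph/node.py`):
+
+```python
+# tests/helpers.py -- test-only stand-in for the removed FunctionNode
+from typing import Any, Callable
+
+
+class MockNode:
+    """Wraps a plain callable so graph-execution tests keep a deterministic node."""
+
+    def __init__(self, fn: Callable[[dict[str, Any]], dict[str, Any]]):
+        self.fn = fn
+
+    async def execute(self, inputs: dict[str, Any]) -> dict[str, Any]:
+        # No LLM involved: just run the wrapped callable on the node's inputs.
+        return self.fn(inputs)
+
+
+# Usage in a rewritten test (illustrative registry API):
+# executor.node_registry["double"] = MockNode(lambda i: {"value": i["value"] * 2})
+```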
+
+---
+
+## Part 4: Items NOT recommended for removal
+
+| Item | Reason to keep |
+|---|---|
+| `RouterNode` | Architecturally sound (deterministic routing); it just lacks template examples |
+| `human_input` node type | Valid HITL pattern, but switch its implementation from `LLMNode` to `EventLoopNode` |
+| `register_function` in `tool_registry.py` | For **tool** registration — a completely different concept from function nodes |
+
+---
+
+## Part 5: Remove the Planner-Worker subsystem (~3,900 lines dead code)
+
+The entire Planner-Worker-Judge pattern has **zero external consumers**. No template agent, example, demo, or runner references it. It is only consumed by:
+
+- Its own internal files (self-referential imports)
+- The agent-builder MCP server (exposes tools for it)
+- Its own dedicated tests
+
+### 5.1 Delete these files entirely
+
+| File | Lines | What |
+|---|---|---|
+| `core/framework/graph/flexible_executor.py` | 552 | `FlexibleGraphExecutor` — Worker-Judge orchestrator |
+| `core/framework/graph/worker_node.py` | 620 | `WorkerNode` — plan step dispatcher |
+| `core/framework/graph/plan.py` | 513 | `Plan`, `PlanStep`, `ActionType`, `ActionSpec` data structures |
+| `core/framework/graph/judge.py` | 406 | `HybridJudge` — step result evaluator |
+| `core/framework/graph/code_sandbox.py` | 413 | `CodeSandbox` — sandboxed code execution |
+| `core/tests/test_flexible_executor.py` | 442 | FlexibleGraphExecutor tests |
+| `core/tests/test_plan.py` | 592 | Plan data structure tests |
+| `core/tests/test_plan_dependency_resolution.py` | 384 | Plan dependency resolution tests |
+
+### 5.2 Clean up exports
+
+`core/framework/graph/__init__.py` — Remove all planner-worker exports: `FlexibleGraphExecutor`, `ExecutorConfig`, `WorkerNode`, `StepExecutionResult`, `HybridJudge`, `create_default_judge`, `CodeSandbox`, `safe_eval`, `safe_exec`, `Plan`, `PlanStep`, `ActionType`, `ActionSpec`, and all related symbols.
+
+### 5.3 Remove MCP tools from agent-builder server
+
+`core/framework/mcp/agent_builder_server.py` — Remove these 7 MCP tools:
+
+| MCP tool | Description |
+|---|---|
+| `create_plan` | Creates a plan with steps |
+| `validate_plan` | Validates plan structure |
+| `simulate_plan_execution` | Dry-run simulation |
+| `load_exported_plan` | Loads a plan from JSON |
+| `add_evaluation_rule` | Adds a HybridJudge rule |
+| `list_evaluation_rules` | Lists evaluation rules |
+| `remove_evaluation_rule` | Removes an evaluation rule |
+
+Also remove:
+
+- The `from framework.graph.plan import Plan` import (~L39, L3731)
+- The `_evaluation_rules` global list (~L2528)
+- `"evaluation_rules"` from export/session data (~L1859)
+- The `load_plan_from_json()` helper function (~L3721-3733)
+
+---
+
+## Execution order
+
+1. **Create the `MockNode` test helper** — unblocks all test rewrites
+2. **Rewrite tests** that use function nodes as fixtures (Part 3)
+3. **Remove the `FunctionNode` class + all references** (Part 1)
+4. **Remove the `LLMNode` class + deprecated types** (Part 2)
+5. **Delete the Planner-Worker subsystem files** (Part 5.1)
+6. **Clean up `__init__.py` exports** (Part 5.2)
+7. **Remove the plan/evaluation MCP tools** from the agent-builder server (Part 5.3)
+8. **Update examples/demos/docs/skills** (Parts 1.3, 1.4)
+9. **Run the full test suite** to verify
+
+---
+
+## Verification
+
+1. `pytest core/tests/` — all tests pass
+2. `pytest tools/tests/` — runtime log tests pass
+3. Load any template agent JSON — no errors
+4. Attempt to load a graph with `node_type="function"` — clear `RuntimeError` with migration guidance (sketched below)
+5. Attempt to load a graph with `node_type="llm_tool_use"` — same `RuntimeError` treatment
+6. Agent builder MCP: `add_node` with `node_type="function"` — rejected with a helpful message
+7. Plan/evaluation MCP tools no longer appear in the tool list
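+
+The migration error in checks 4-6 might look like the following (a sketch only; the real guard belongs next to `VALID_NODE_TYPES` in `executor.py`, and the names here are placeholders):
+
+```python
+# executor.py -- sketch of the hard-removal guard
+REMOVED_NODE_TYPES = {"function", "llm_generate", "llm_tool_use"}
+
+
+def check_node_type(node_type: str) -> None:
+    """Reject graphs built against removed node types, with migration guidance."""
+    if node_type in REMOVED_NODE_TYPES:
+        raise RuntimeError(
+            f"node_type '{node_type}' was removed (see issue #4753). "
+            "Migrate this node to 'event_loop'; details in docs/cleanup-plan.md."
+        )
+```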
diff --git a/docs/developer-guide.md b/docs/developer-guide.md
index 848fd438..4c60187d 100644
--- a/docs/developer-guide.md
+++ b/docs/developer-guide.md
@@ -278,7 +278,7 @@ claude> /hive-test
 2. **Design the Workflow**
    - The skill guides you through defining nodes
-   - Each node is a unit of work (LLM call, function, router)
+   - Each node is a unit of work (an event_loop LLM session)
    - Edges define how execution flows
 3. **Generate the Agent**
@@ -314,7 +314,7 @@ If you prefer to build agents manually:
   {
     "node_id": "analyze",
     "name": "Analyze Ticket",
-    "node_type": "llm_generate",
+    "node_type": "event_loop",
    "system_prompt": "Analyze this support ticket...",
     "input_keys": ["ticket_content"],
     "output_keys": ["category", "priority"]
@@ -610,7 +610,7 @@ def my_custom_tool(param1: str, param2: int) -> Dict[str, Any]:
 "nodes": [
   {
     "node_id": "use_tool",
-    "node_type": "function",
+    "node_type": "event_loop",
     "tools": ["my_custom_tool"],
     ...
   }
diff --git a/docs/environment-setup.md b/docs/environment-setup.md
index 9bdb5b8b..41812ddf 100644
--- a/docs/environment-setup.md
+++ b/docs/environment-setup.md
@@ -213,7 +213,7 @@ Follow the prompts to:
 1. Understand the agent architecture and file structure
 2. Define the agent's goal, success criteria, and constraints
-3. Learn node types (LLM, tool-use, router, function)
+3. Learn the event_loop node type and how to configure it
 4. Discover and validate available tools before use
 This step establishes the core concepts and rules needed before building an agent.
diff --git a/docs/i18n/es.md b/docs/i18n/es.md
index 7c280e10..87f0cdf8 100644
--- a/docs/i18n/es.md
+++ b/docs/i18n/es.md
@@ -119,7 +119,7 @@ Los frameworks de agentes tradicionales requieren que diseñes manualmente flujo
 ```mermaid
 flowchart LR
     subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
+        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
         NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
         EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
     end
diff --git a/docs/i18n/hi.md b/docs/i18n/hi.md
index 2f538819..c875d9fc 100644
--- a/docs/i18n/hi.md
+++ b/docs/i18n/hi.md
@@ -128,7 +128,7 @@ PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
 ```mermaid
 flowchart LR
     subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
+        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
         NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
         EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
     end
diff --git a/docs/i18n/ja.md b/docs/i18n/ja.md
index 41d75032..264fa605 100644
--- a/docs/i18n/ja.md
+++ b/docs/i18n/ja.md
@@ -121,7 +121,7 @@ PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
 ```mermaid
 flowchart LR
     subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
+        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
         NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
         EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
     end
diff --git a/docs/i18n/ko.md b/docs/i18n/ko.md
index 400f51bb..3f1bf7b8 100644
--- a/docs/i18n/ko.md
+++ b/docs/i18n/ko.md
@@ -120,7 +120,7 @@ PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
 ```mermaid
 flowchart LR
     subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
+        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
         NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
         EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
     end
diff --git a/docs/i18n/pt.md b/docs/i18n/pt.md
index 0dbc768e..9f679bfa 100644
--- a/docs/i18n/pt.md
+++ b/docs/i18n/pt.md
@@ -121,7 +121,7 @@ Frameworks de agentes tradicionais exigem que você projete manualmente fluxos d
 ```mermaid
 flowchart LR
     subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
+        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
         NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
         EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
     end
diff --git a/docs/i18n/ru.md b/docs/i18n/ru.md
index 3dd8ba3d..0663a260 100644
--- a/docs/i18n/ru.md
+++ b/docs/i18n/ru.md
@@ -121,7 +121,7 @@ PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
 ```mermaid
 flowchart LR
     subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
+        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
         NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
         EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
     end
diff --git a/docs/i18n/zh-CN.md b/docs/i18n/zh-CN.md
index a749ed96..ce9ccf13 100644
--- a/docs/i18n/zh-CN.md
+++ b/docs/i18n/zh-CN.md
@@ -121,7 +121,7 @@ PYTHONPATH=exports uv run python -m your_agent_name run --input '{...}'
 ```mermaid
 flowchart LR
     subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
+        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>Event Loop"]
         NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
         EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
     end
diff --git a/docs/key_concepts/graph.md b/docs/key_concepts/graph.md
index a49b0f40..c5249cb1 100644
--- a/docs/key_concepts/graph.md
+++ b/docs/key_concepts/graph.md
@@ -10,17 +10,15 @@ Edges can loop back, creating feedback cycles where an agent retries a step or t
 ## Nodes
 
-A node is a unit of work. Each node reads inputs from shared memory, does something, and writes outputs back. There are a handful of node types, each suited to a different kind of work:
+A node is a unit of work. Each node reads inputs from shared memory, does something, and writes outputs back.
 
-**`event_loop`** — The workhorse. This is a multi-turn LLM loop: the model reasons about the current state, calls tools, observes results, and keeps going until it has produced the required outputs. Most of the interesting agent behavior happens in these nodes. They handle long-running tasks, manage their own context window, and can recover from crashes mid-conversation.
+**`event_loop`** — This is the only node type in Hive. It's a multi-turn LLM loop where the model reasons about the current state, calls tools, observes results, and keeps going until it has produced the required outputs. All agent behavior happens in these nodes. They handle long-running tasks, manage their own context window, and can recover from crashes mid-conversation.
 
-**`function`** — A plain Python function. No LLM involved. Use these for anything deterministic: data transformation, API calls with known parameters, validation logic, or any step where you don't want a language model making judgment calls.
-
-**`router`** — A decision point that directs execution down different paths. Can be rule-based ("if confidence is high, go left; otherwise, go right") or LLM-powered ("given the goal and what we know so far, which path makes sense?").
-
-**`human_input`** — A pause point where the agent stops and asks a human for input before continuing. See [Human-in-the-Loop](#human-in-the-loop) below.
-
-There are also simpler LLM node types (`llm_tool_use` for a single LLM call with tools, `llm_generate` for pure text generation) for steps that don't need the full event loop.
+Event loop nodes are highly configurable:
+- **Tools** — Give the node access to specific capabilities (web search, API calls, database queries, etc.)
+- **Client-facing** — Set `client_facing=True` to make the node interact directly with humans (see [Human-in-the-Loop](#human-in-the-loop))
+- **Custom logic** — Implement the `NodeProtocol` interface to wrap deterministic functions or any custom behavior
+- **Judge** — Configure evaluation criteria to control when the node accepts its output vs. retries
 
 ### Self-Correction Within a Node
@@ -57,11 +55,11 @@ Data flows through the graph in a natural way: input arrives at the start, each
 ## Human-in-the-Loop
 
-Human-in-the-loop (HITL) nodes are where the agent pauses and asks a person for input. This isn't a blunt "stop everything" — the framework supports structured questions: open-ended text, multiple choice, yes/no approvals, and multi-field forms.
+Human-in-the-loop (HITL) is enabled by setting `client_facing=True` on an event loop node. These nodes pause and ask a person for input. This isn't a blunt "stop everything" — the framework supports structured questions: open-ended text, multiple choice, yes/no approvals, and multi-field forms.
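+
+For illustration, a client-facing approval step might be declared like this (a sketch built on the `WorkflowBuilder` API; the node name, prompt, and keys are invented, and passing `client_facing` as a builder keyword mirrors the agent-builder `add_node` parameter):
+
+```python
+# Sketch: a client-facing event_loop node (names are illustrative)
+from framework.builder.workflow import WorkflowBuilder  # assumed import path
+
+builder = WorkflowBuilder()
+builder.add_node(
+    node_id="approve-outreach",
+    name="Approve Outreach",
+    node_type="event_loop",      # the only node type
+    client_facing=True,          # pause here and talk to the human
+    system_prompt="Show {draft_message} to the user and ask for approval or edits.",
+    input_keys=["draft_message"],
+    output_keys=["approved_message"],
+)
+```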
 
-When the agent hits a HITL node, it saves its entire state and presents the questions. The session can sit paused for minutes, hours, or days. When the human responds, execution picks up exactly where it left off.
+When the agent hits a client-facing node, it saves its entire state and presents the output or questions directly to the user. The session can sit paused for minutes, hours, or days. When the human responds, execution picks up exactly where it left off.
 
-This is what makes Hive agents supervisable in production. You place HITL nodes at critical decision points — before sending a message, before making a purchase, before any action that's hard to undo. The agent handles the routine work autonomously; humans weigh in on the decisions that matter. And every time a human provides input, that decision becomes data the [evolution](./evolution.md) process can learn from.
+This is what makes Hive agents supervisable in production. You place client-facing nodes at critical decision points — before sending a message, before making a purchase, before any action that's hard to undo. The agent handles the routine work autonomously; humans weigh in on the decisions that matter. And every time a human provides input, that decision becomes data the [evolution](./evolution.md) process can learn from.
 
 ## The Shape of an Agent
diff --git a/docs/roadmap.md b/docs/roadmap.md
index ae13eea5..15fc44e6 100644
--- a/docs/roadmap.md
+++ b/docs/roadmap.md
@@ -241,7 +241,7 @@ classDef done fill:#9e9e9e,color:#fff,stroke:#757575
   - [ ] Migrate from monolithic run storage
 - [ ] **Context Building & Conversation Loop**
   - [ ] Implement `Message.stream(sessionID)`
-  - [ ] Update `LLMNode.execute()` for full context building
+  - [ ] Update `EventLoopNode.execute()` for full context building
   - [ ] Implement `Message.toModelMessages()` conversion
 - [ ] **Proactive Compaction**
   - [ ] Implement proactive overflow detection
diff --git a/docs/why-conditional-edge-priority.md b/docs/why-conditional-edge-priority.md
deleted file mode 100644
index a664fe0b..00000000
--- a/docs/why-conditional-edge-priority.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# Why Conditional Edges Need Priority (Function Nodes)
-
-## The problem
-
-Function nodes return everything they computed. They don't pick one output key — they return all of them.
-
-```python
-def score_lead(inputs):
-    score = compute_score(inputs["profile"])
-    return {
-        "score": score,
-        "is_high_value": score > 80,
-        "needs_enrichment": score > 50 and not inputs["profile"].get("company"),
-    }
-```
-
-Lead comes in: score 92, no company on file. Output: `{"score": 92, "is_high_value": True, "needs_enrichment": True}`.
-
-Two conditional edges leaving this node:
-
-```
-Edge A: needs_enrichment == True → enrichment node
-Edge B: is_high_value == True → outreach node
-```
-
-Both are true. Without priority, the graph either fans out to both (wrong — you'd email someone while still enriching their data) or picks one randomly (wrong — non-deterministic).
-
-## Priority fixes it
-
-```
-Edge A: needs_enrichment == True   priority=2   (higher = checked first)
-Edge B: is_high_value == True      priority=1
-Edge C: is_high_value == False     priority=0
-```
-
-Executor keeps only the highest-priority matching group. A wins. Lead gets enriched first, loops back, gets re-scored — now `needs_enrichment` is false, B wins, outreach happens.
-
-## Why event loop nodes don't need this
-
-The LLM understands "if/else." You tell it in the prompt: "if needs enrichment, set `needs_enrichment`. Otherwise if high value, set `approved`." It picks one. Only one conditional edge matches.
-
-A function just returns a dict. It doesn't do "otherwise." Priority is the "otherwise" for function nodes.
diff --git a/tools/src/aden_tools/credentials/base.py b/tools/src/aden_tools/credentials/base.py
index 377a5689..c7d4548a 100644
--- a/tools/src/aden_tools/credentials/base.py
+++ b/tools/src/aden_tools/credentials/base.py
@@ -29,7 +29,7 @@ class CredentialSpec:
     """Tool names that require this credential (e.g., ['web_search'])"""
     node_types: list[str] = field(default_factory=list)
-    """Node types that require this credential (e.g., ['llm_generate', 'llm_tool_use'])"""
+    """Node types that require this credential (e.g., ['event_loop'])"""
     required: bool = True
     """Whether this credential is required (vs optional)"""
@@ -321,7 +321,7 @@ class CredentialManager:
         Get list of missing credentials for the given node types.
         Args:
-            node_types: List of node types to check (e.g., ['llm_generate', 'llm_tool_use'])
+            node_types: List of node types to check (e.g., ['event_loop'])
         Returns:
             List of (credential_name, spec) tuples for missing credentials
@@ -357,7 +357,7 @@ class CredentialManager:
         Example:
             creds = CredentialManager()
-            creds.validate_for_node_types(["llm_generate", "llm_tool_use"])
+            creds.validate_for_node_types(["event_loop"])
             # Raises CredentialError if ANTHROPIC_API_KEY is not set
         """
         missing = self.get_missing_for_node_types(node_types)
diff --git a/tools/src/aden_tools/credentials/llm.py b/tools/src/aden_tools/credentials/llm.py
index 267eb4c5..ea8c8b84 100644
--- a/tools/src/aden_tools/credentials/llm.py
+++ b/tools/src/aden_tools/credentials/llm.py
@@ -10,7 +10,7 @@ LLM_CREDENTIALS = {
     "anthropic": CredentialSpec(
         env_var="ANTHROPIC_API_KEY",
         tools=[],
-        node_types=["llm_generate", "llm_tool_use"],
+        node_types=["event_loop"],
         required=False,  # Not required - agents can use other providers via LiteLLM
         startup_required=False,  # MCP server doesn't need LLM credentials
         help_url="https://console.anthropic.com/settings/keys",
diff --git a/tools/tests/test_credentials.py b/tools/tests/test_credentials.py
index 76bcd7ca..2b66d264 100644
--- a/tools/tests/test_credentials.py
+++ b/tools/tests/test_credentials.py
@@ -278,7 +278,7 @@ class TestCredentialSpec:
         spec = CredentialSpec(
             env_var="API_KEY",
             tools=["tool_a", "tool_b"],
-            node_types=["llm_generate"],
+            node_types=["event_loop"],
             required=False,
             startup_required=True,
             help_url="https://example.com",
         )
         assert spec.env_var == "API_KEY"
         assert spec.tools == ["tool_a", "tool_b"]
-        assert spec.node_types == ["llm_generate"]
+        assert spec.node_types == ["event_loop"]
         assert spec.required is False
         assert spec.startup_required is True
         assert spec.help_url == "https://example.com"
@@ -315,8 +315,7 @@ class TestCredentialSpecs:
         spec = CREDENTIAL_SPECS["anthropic"]
         assert spec.env_var == "ANTHROPIC_API_KEY"
         assert spec.tools == []
-        assert "llm_generate" in spec.node_types
-        assert "llm_tool_use" in spec.node_types
+        assert "event_loop" in spec.node_types
         assert spec.required is False
         assert spec.startup_required is False
         assert "anthropic.com" in spec.help_url
@@ -399,7 +398,7 @@ class TestNodeTypeValidation:
         creds = CredentialStoreAdapter.with_env_storage()
         # Should not raise
-        creds.validate_for_node_types(["llm_generate", "llm_tool_use"])
+        creds.validate_for_node_types(["event_loop"])
 class TestStartupValidation:
diff --git a/tools/tests/tools/test_runtime_logs_tool.py b/tools/tests/tools/test_runtime_logs_tool.py
index 18aaed27..8d4687b3 100644
--- a/tools/tests/tools/test_runtime_logs_tool.py
+++ b/tools/tests/tools/test_runtime_logs_tool.py
@@ -68,7 +68,7 @@ def runtime_logs_dir(tmp_path: Path) -> Path:
             {
                 "node_id": "node-2",
                 "node_name": "Format",
-                "node_type": "function",
+                "node_type": "event_loop",
                 "success": True,
                 "total_steps": 1,
                 "tokens_used": 0,
@@ -112,7 +112,7 @@ def runtime_logs_dir(tmp_path: Path) -> Path:
             {
                 "node_id": "node-2",
-                "node_type": "function",
+                "node_type": "event_loop",
                 "step_index": 0,
                 "llm_text": "",
                 "tool_calls": [],