# --- Repository listing metadata (non-source residue from web viewer) ---
# File: hive/core/framework/graph/node.py
# Last modified: 2026-04-02 12:35:16 -07:00
# 647 lines, 22 KiB, Python
"""
Node Protocol - The building block of agent graphs.
A Node is a unit of work that:
1. Receives context (goal, shared memory, input)
2. Makes decisions (using LLM, tools, or logic)
3. Produces results (output, state changes)
4. Records everything to the Runtime
Nodes are composable and reusable. The same node can appear
in different graphs for different goals.
Protocol:
Every node must implement the NodeProtocol interface.
The framework provides NodeContext with everything the node needs.
"""
import asyncio
import json
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any
from pydantic import BaseModel, Field
from framework.llm.provider import LLMProvider, Tool
from framework.runtime.core import Runtime
logger = logging.getLogger(__name__)
def find_json_object(text: str) -> str | None:
"""Find the first valid JSON object in text using balanced brace matching.
This handles nested objects correctly, unlike simple regex like r'\\{[^{}]*\\}'.
"""
start = text.find("{")
if start == -1:
return None
end = text.rfind("}")
if end == -1 or end < start:
return None
# Fast path: try json.loads directly (C extension, handles 1MB in ~14ms)
try:
candidate = text[start : end + 1]
json.loads(candidate)
return candidate
except json.JSONDecodeError:
pass
# Fall back to existing brace matching
depth = 0
in_string = False
escape_next = False
for i, char in enumerate(text[start:], start):
if escape_next:
escape_next = False
continue
if char == "\\" and in_string:
escape_next = True
continue
if char == '"' and not escape_next:
in_string = not in_string
continue
if in_string:
continue
if char == "{":
depth += 1
elif char == "}":
depth -= 1
if depth == 0:
return text[start : i + 1]
return None
class NodeSpec(BaseModel):
    """
    Specification for a node in the graph.

    This is the declarative definition of a node - what it does,
    what it needs, and what it produces. The actual implementation
    is separate (NodeProtocol).

    Example:
        NodeSpec(
            id="calculator",
            name="Calculator Node",
            description="Performs mathematical calculations",
            node_type="event_loop",
            input_keys=["expression"],
            output_keys=["result"],
            tools=["calculate", "math_function"],
            system_prompt="You are a calculator..."
        )
    """

    # Identity — `id` is the graph-level handle; `name`/`description` are
    # human-readable labels.
    id: str
    name: str
    description: str

    # Node behavior type
    node_type: str = Field(
        default="event_loop",
        description="Type: 'event_loop' (recommended), 'gcu' (browser automation).",
    )

    # Data flow
    input_keys: list[str] = Field(
        default_factory=list, description="Keys this node reads from shared memory or input"
    )
    output_keys: list[str] = Field(
        default_factory=list, description="Keys this node writes to shared memory or output"
    )
    nullable_output_keys: list[str] = Field(
        default_factory=list,
        description="Output keys that can be None without triggering validation errors",
    )

    # Optional schemas for validation and cleansing
    input_schema: dict[str, dict] = Field(
        default_factory=dict,
        description=(
            "Optional schema for input validation. "
            "Format: {key: {type: 'string', required: True, description: '...'}}"
        ),
    )
    output_schema: dict[str, dict] = Field(
        default_factory=dict,
        description=(
            "Optional schema for output validation. "
            "Format: {key: {type: 'dict', required: True, description: '...'}}"
        ),
    )

    # For LLM nodes
    system_prompt: str | None = Field(default=None, description="System prompt for LLM nodes")
    tools: list[str] = Field(default_factory=list, description="Tool names this node can use")
    model: str | None = Field(
        default=None, description="Specific model to use (defaults to graph default)"
    )

    # For subagent delegation
    sub_agents: list[str] = Field(
        default_factory=list,
        description="Node IDs that can be invoked as subagents from this node",
    )

    # For function nodes
    function: str | None = Field(
        default=None, description="Function name or path for function nodes"
    )

    # For router nodes
    routes: dict[str, str] = Field(
        default_factory=dict, description="Condition -> target_node_id mapping for routers"
    )

    # Retry behavior
    max_retries: int = Field(default=3)
    retry_on: list[str] = Field(default_factory=list, description="Error types to retry on")

    # Visit limits (for feedback/callback edges)
    max_node_visits: int = Field(
        default=0,
        description=(
            "Max times this node executes in one graph run. "
            "0 = unlimited (default, required for forever-alive agents). "
            "Set >1 for one-shot agents with feedback loops."
        ),
    )

    # Pydantic model for output validation.
    # NOTE: storing a class object here is why model_config sets
    # arbitrary_types_allowed below.
    output_model: type[BaseModel] | None = Field(
        default=None,
        description=(
            "Optional Pydantic model class for validating and parsing LLM output. "
            "When set, the LLM response will be validated against this model."
        ),
    )
    max_validation_retries: int = Field(
        default=2,
        description="Maximum retries when Pydantic validation fails (with feedback to LLM)",
    )

    # Client-facing behavior
    client_facing: bool = Field(
        default=False,
        description="If True, this node streams output to the end user and can request input.",
    )

    # Phase completion criteria for conversation-aware judge (Level 2)
    success_criteria: str | None = Field(
        default=None,
        description=(
            "Natural-language criteria for phase completion. When set, the "
            "implicit judge upgrades to Level 2: after output keys are satisfied, "
            "a fast LLM evaluates whether the conversation meets these criteria."
        ),
    )

    # Opt out of judge evaluation entirely (no feedback injected, loop continues normally)
    skip_judge: bool = Field(
        default=False,
        description=(
            "When True, the implicit judge is bypassed entirely — no feedback is "
            "injected and the loop continues naturally. Intended for conversational "
            "nodes (e.g., the queen) that should never receive tool-use pressure."
        ),
    )

    # extra="allow" keeps unknown spec keys instead of rejecting them;
    # arbitrary_types_allowed permits the `type[BaseModel]` field above.
    model_config = {"extra": "allow", "arbitrary_types_allowed": True}
class DataBufferWriteError(Exception):
    """Signals that a value failed DataBuffer write validation."""
@dataclass
class DataBuffer:
"""
Shared data buffer between nodes in a graph execution.
Nodes read and write to the data buffer using typed keys.
The buffer is scoped to a single run.
For parallel execution, use write_async() which provides per-key locking
to prevent race conditions when multiple nodes write concurrently.
"""
_data: dict[str, Any] = field(default_factory=dict)
_allowed_read: set[str] = field(default_factory=set)
_allowed_write: set[str] = field(default_factory=set)
# Locks for thread-safe parallel execution
_lock: asyncio.Lock | None = field(default=None, repr=False)
_key_locks: dict[str, asyncio.Lock] = field(default_factory=dict, repr=False)
def __post_init__(self) -> None:
"""Initialize the main lock if not provided."""
if self._lock is None:
self._lock = asyncio.Lock()
def read(self, key: str) -> Any:
"""Read a value from the data buffer."""
if self._allowed_read and key not in self._allowed_read:
raise PermissionError(f"Node not allowed to read key: {key}")
return self._data.get(key)
def write(self, key: str, value: Any, validate: bool = True) -> None:
"""
Write a value to the data buffer.
Args:
key: The buffer key to write to
value: The value to write
validate: If True, check for suspicious content (default True)
Raises:
PermissionError: If node doesn't have write permission
DataBufferWriteError: If value appears to be hallucinated content
"""
if self._allowed_write and key not in self._allowed_write:
raise PermissionError(f"Node not allowed to write key: {key}")
if validate and isinstance(value, str):
# Check for obviously hallucinated content
if len(value) > 5000:
# Long strings that look like code are suspicious
if self._contains_code_indicators(value):
logger.warning(
f"⚠ Suspicious write to key '{key}': appears to be code "
f"({len(value)} chars). Consider using validate=False if intended."
)
raise DataBufferWriteError(
f"Rejected suspicious content for key '{key}': "
f"appears to be hallucinated code ({len(value)} chars). "
"If this is intentional, use validate=False."
)
self._data[key] = value
async def write_async(self, key: str, value: Any, validate: bool = True) -> None:
"""
Thread-safe async write with per-key locking.
Use this method when multiple nodes may write concurrently during
parallel execution. Each key has its own lock to minimize contention.
Args:
key: The buffer key to write to
value: The value to write
validate: If True, check for suspicious content (default True)
Raises:
PermissionError: If node doesn't have write permission
DataBufferWriteError: If value appears to be hallucinated content
"""
# Check permissions first (no lock needed)
if self._allowed_write and key not in self._allowed_write:
raise PermissionError(f"Node not allowed to write key: {key}")
# Ensure key has a lock (double-checked locking pattern)
if key not in self._key_locks:
async with self._lock:
if key not in self._key_locks:
self._key_locks[key] = asyncio.Lock()
# Acquire per-key lock and write
async with self._key_locks[key]:
if validate and isinstance(value, str):
if len(value) > 5000:
if self._contains_code_indicators(value):
logger.warning(
f"⚠ Suspicious write to key '{key}': appears to be code "
f"({len(value)} chars). Consider using validate=False if intended."
)
raise DataBufferWriteError(
f"Rejected suspicious content for key '{key}': "
f"appears to be hallucinated code ({len(value)} chars). "
"If this is intentional, use validate=False."
)
self._data[key] = value
def _contains_code_indicators(self, value: str) -> bool:
"""
Check for code patterns in a string using sampling for efficiency.
For strings under 10KB, checks the entire content.
For longer strings, samples at strategic positions to balance
performance with detection accuracy.
Args:
value: The string to check for code indicators
Returns:
True if code indicators are found, False otherwise
"""
code_indicators = [
# Python
"```python",
"def ",
"class ",
"import ",
"async def ",
"from ",
# JavaScript/TypeScript
"function ",
"const ",
"let ",
"=> {",
"require(",
"export ",
# SQL
"SELECT ",
"INSERT ",
"UPDATE ",
"DELETE ",
"DROP ",
# HTML/Script injection
"<script",
"<?php",
"<%",
]
# For strings under 10KB, check the entire content
if len(value) < 10000:
return any(indicator in value for indicator in code_indicators)
# For longer strings, sample at strategic positions
sample_positions = [
0, # Start
len(value) // 4, # 25%
len(value) // 2, # 50%
3 * len(value) // 4, # 75%
max(0, len(value) - 2000), # Near end
]
for pos in sample_positions:
chunk = value[pos : pos + 2000]
if any(indicator in chunk for indicator in code_indicators):
return True
return False
def read_all(self) -> dict[str, Any]:
"""Read all accessible data."""
if self._allowed_read:
return {k: v for k, v in self._data.items() if k in self._allowed_read}
return dict(self._data)
def with_permissions(
self,
read_keys: list[str],
write_keys: list[str],
) -> "DataBuffer":
"""Create a view with restricted permissions for a specific node.
The scoped view shares the same underlying data and locks,
enabling thread-safe parallel execution across scoped views.
"""
return DataBuffer(
_data=self._data,
_allowed_read=set(read_keys) if read_keys else set(),
_allowed_write=set(write_keys) if write_keys else set(),
_lock=self._lock, # Share lock for thread safety
_key_locks=self._key_locks, # Share key locks
)
@dataclass
class NodeContext:
    """
    Everything a node needs to execute.

    This is passed to every node and provides:
    - Access to the runtime (for decision logging)
    - Access to the data buffer (for state)
    - Access to LLM (for generation)
    - Access to tools (for actions)
    - The goal context (for guidance)
    """

    # Core runtime
    runtime: Runtime

    # Node identity
    node_id: str
    node_spec: NodeSpec

    # State
    buffer: DataBuffer
    input_data: dict[str, Any] = field(default_factory=dict)

    # LLM access (if applicable)
    llm: LLMProvider | None = None
    available_tools: list[Tool] = field(default_factory=list)

    # Goal context
    goal_context: str = ""
    goal: Any = None  # Goal object for LLM-powered routers

    # LLM configuration
    max_tokens: int = 4096  # Maximum tokens for LLM responses

    # Execution metadata
    attempt: int = 1
    max_attempts: int = 3

    # Runtime logging (optional)
    runtime_logger: Any = None  # RuntimeLogger | None — uses Any to avoid import

    # Pause control (optional) - asyncio.Event for pause requests
    pause_event: Any = None  # asyncio.Event | None

    # Continuous conversation mode
    continuous_mode: bool = False  # True when graph has conversation_mode="continuous"
    inherited_conversation: Any = None  # NodeConversation | None (from prior node)
    cumulative_output_keys: list[str] = field(default_factory=list)  # All output keys from path

    # Connected accounts prompt (injected from runner)
    accounts_prompt: str = ""

    # Resume context — Layer 1 (identity) and Layer 2 (narrative) for
    # rebuilding the full system prompt when restoring from conversation store.
    identity_prompt: str = ""
    narrative: str = ""

    # Event-triggered execution (no interactive user attached)
    event_triggered: bool = False

    # Execution ID (from StreamRuntimeAdapter)
    execution_id: str = ""
    run_id: str = ""

    @property
    def effective_run_id(self) -> str | None:
        """Normalized run_id: returns run_id if truthy, otherwise None.

        The field defaults to ``""``; callers should use this property
        instead of ``self.run_id or None`` to avoid silently falling
        back to session-scoped storage.
        """
        return self.run_id or None

    # NOTE: dataclass fields may legally follow a method; declaration order
    # (not position relative to methods) determines __init__ parameter order.

    # Stream identity — the ExecutionStream this node runs within.
    # Falls back to node_id when not set (legacy / standalone executor).
    stream_id: str = ""

    # Subagent mode
    is_subagent_mode: bool = False  # True when running as a subagent (prevents nested delegation)
    report_callback: Any = None  # async (message: str, data: dict | None) -> None
    node_registry: dict[str, "NodeSpec"] = field(default_factory=dict)  # For subagent lookup

    # Full tool catalog (unfiltered) — used by _execute_subagent to resolve
    # subagent tools that aren't in the parent node's filtered available_tools.
    all_tools: list[Tool] = field(default_factory=list)

    # Shared reference to the executor's node_registry — used by subagent
    # escalation (_EscalationReceiver) to register temporary receivers that
    # the inject_input() routing chain can find.
    shared_node_registry: dict[str, Any] = field(default_factory=dict)

    # Dynamic tool provider — when set, EventLoopNode rebuilds the tool
    # list from this callback at the start of each iteration. Used by
    # the queen to switch between building-mode and running-mode tools.
    dynamic_tools_provider: Any = None  # Callable[[], list[Tool]] | None

    # Dynamic prompt provider — when set, EventLoopNode checks each
    # iteration and updates the system prompt if it changed. Used by
    # the queen to switch between phase-specific prompts (building /
    # staging / running) without restarting the conversation.
    dynamic_prompt_provider: Any = None  # Callable[[], str] | None

    # Skill system prompts — injected by the skill discovery pipeline
    skills_catalog_prompt: str = ""  # Available skills XML catalog
    protocols_prompt: str = ""  # Default skill operational protocols
    skill_dirs: list[str] = field(default_factory=list)  # Skill base dirs for resource access

    # DS-12: batch auto-detection nudge appended to system prompt when input looks like a batch
    default_skill_batch_nudge: str | None = None

    # DS-13: token usage ratio at which to inject a context preservation warning
    default_skill_warn_ratio: float | None = None

    # Per-iteration metadata provider — when set, EventLoopNode merges
    # the returned dict into node_loop_iteration event data. Used by
    # the queen to record the current phase per iteration.
    iteration_metadata_provider: Any = None  # Callable[[], dict] | None
@dataclass
class NodeResult:
"""
The output of a node execution.
Contains:
- Success/failure status
- Output data
- State changes made
- Route decision (for routers)
"""
success: bool
output: dict[str, Any] = field(default_factory=dict)
error: str | None = None
# For routing decisions
next_node: str | None = None
route_reason: str | None = None
# Metadata
tokens_used: int = 0
latency_ms: int = 0
# Pydantic validation errors (if any)
validation_errors: list[str] = field(default_factory=list)
# Continuous conversation mode: return conversation for threading to next node
conversation: Any = None # NodeConversation | None
def to_summary(self, node_spec: Any = None) -> str:
"""
Generate a human-readable summary of this node's execution and output.
This is like toString() - it describes what the node produced in its current state.
"""
if not self.success:
return f"❌ Failed: {self.error}"
if not self.output:
return "✓ Completed (no output)"
parts = [f"✓ Completed with {len(self.output)} outputs:"]
for key, value in list(self.output.items())[:5]: # Limit to 5 keys
value_str = str(value)[:100]
if len(str(value)) > 100:
value_str += "..."
parts.append(f"{key}: {value_str}")
return "\n".join(parts)
class NodeProtocol(ABC):
"""
The interface all nodes must implement.
To create a node:
1. Subclass NodeProtocol
2. Implement execute()
3. Register with the executor
Example:
class CalculatorNode(NodeProtocol):
async def execute(self, ctx: NodeContext) -> NodeResult:
expression = ctx.input_data.get("expression")
# Record decision
decision_id = ctx.runtime.decide(
intent="Calculate expression",
options=[...],
chosen="evaluate",
reasoning="Direct evaluation"
)
# Do the work
result = eval(expression)
# Record outcome
ctx.runtime.record_outcome(decision_id, success=True, result=result)
return NodeResult(success=True, output={"result": result})
"""
    @abstractmethod
    async def execute(self, ctx: NodeContext) -> NodeResult:
        """
        Execute this node's logic.

        Subclasses implement the node's unit of work here: read inputs
        from ``ctx``, perform the work, and return the outcome.

        Args:
            ctx: NodeContext with everything needed

        Returns:
            NodeResult with output and status
        """
        pass
def validate_input(self, ctx: NodeContext) -> list[str]:
"""
Validate that required inputs are present.
Override to add custom validation.
Returns:
List of validation error messages (empty if valid)
"""
errors = []
for key in ctx.node_spec.input_keys:
if key not in ctx.input_data and ctx.buffer.read(key) is None:
errors.append(f"Missing required input: {key}")
return errors