"""
|
|
Execution Stream - Manages concurrent executions for a single entry point.
|
|
|
|
Each stream has:
|
|
- Its own StreamRuntime for decision tracking
|
|
- Access to shared state (read/write based on isolation)
|
|
- Connection to the outcome aggregator
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import time
|
|
import uuid
|
|
from collections import OrderedDict
|
|
from collections.abc import Callable
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
from framework.host.event_bus import EventBus
|
|
from framework.host.shared_state import IsolationLevel, SharedBufferManager
|
|
from framework.host.stream_runtime import StreamDecisionTracker, StreamRuntimeAdapter
|
|
from framework.orchestrator.checkpoint_config import CheckpointConfig
|
|
from framework.orchestrator.orchestrator import ExecutionResult, Orchestrator
|
|
|
|
if TYPE_CHECKING:
|
|
from framework.host.event_bus import AgentEvent
|
|
from framework.host.outcome_aggregator import OutcomeAggregator
|
|
from framework.llm.provider import LLMProvider, Tool
|
|
from framework.orchestrator.edge import GraphSpec
|
|
from framework.orchestrator.goal import Goal
|
|
from framework.storage.concurrent import ConcurrentStorage
|
|
from framework.storage.session_store import SessionStore
|
|
|
|
|
|
class ExecutionAlreadyRunningError(RuntimeError):
    """Raised when attempting to start an execution on a stream that already has one running."""

    def __init__(self, stream_id: str, active_ids: list[str]):
        self.stream_id = stream_id
        self.active_ids = active_ids
        super().__init__(
            f"Stream '{stream_id}' already has an active execution: {active_ids}. "
            "Concurrent executions on the same stream are not allowed."
        )

logger = logging.getLogger(__name__)

class GraphScopedEventBus(EventBus):
    """Proxy that stamps ``graph_id`` on every published event.

    The ``GraphExecutor`` and ``EventLoopNode`` emit events via the
    convenience methods on ``EventBus`` (e.g. ``emit_llm_text_delta``).
    Rather than threading ``graph_id`` through every one of those 20+
    methods, this subclass overrides ``publish()`` to stamp the id
    before forwarding to the real bus.

    Because the ``emit_*`` methods are *inherited* from ``EventBus``,
    ``self.publish()`` inside them resolves to this class's override —
    unlike a ``__getattr__``-based proxy where the delegated bound
    methods would call ``EventBus.publish`` directly, bypassing the
    stamp entirely.
    """

    def __init__(self, bus: "EventBus", graph_id: str) -> None:
        # Intentionally skip super().__init__() — we delegate all state
        # (subscriptions, history, semaphore, etc.) to the real bus.
        self._real_bus = bus
        self._scope_graph_id = graph_id
        self.last_activity_time: float = time.monotonic()

    async def publish(self, event: "AgentEvent") -> None:  # type: ignore[override]
        event.graph_id = self._scope_graph_id
        self.last_activity_time = time.monotonic()
        await self._real_bus.publish(event)

    # --- Delegate state-reading methods to the real bus ---
    # These access internal state (_subscriptions, _event_history, etc.)
    # that only exists on the real bus.

    def subscribe(self, *args: Any, **kwargs: Any) -> str:
        return self._real_bus.subscribe(*args, **kwargs)

    def unsubscribe(self, subscription_id: str) -> bool:
        return self._real_bus.unsubscribe(subscription_id)

    def get_history(self, *args: Any, **kwargs: Any) -> list:
        return self._real_bus.get_history(*args, **kwargs)

    def get_stats(self) -> dict:
        return self._real_bus.get_stats()

    async def wait_for(self, *args: Any, **kwargs: Any) -> Any:
        return await self._real_bus.wait_for(*args, **kwargs)

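# Illustrative sketch (assumption: the real EventBus exposes emit_* helpers
# such as ``emit_llm_text_delta``, per the docstring above). Because those
# helpers are inherited and call ``self.publish()``, wrapping the bus scopes
# every one of them:
#
#     scoped = GraphScopedEventBus(real_bus, graph_id="graph-1")
#     await scoped.emit_llm_text_delta(...)  # published AgentEvent carries graph_id="graph-1"
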
@dataclass
class EntryPointSpec:
    """Specification for an entry point."""

    id: str
    name: str
    entry_node: str  # Node ID to start from
    trigger_type: str  # "webhook", "api", "timer", "event", "manual"
    trigger_config: dict[str, Any] = field(default_factory=dict)
    isolation_level: str = "shared"  # "isolated" | "shared" | "synchronized"
    priority: int = 0
    max_concurrent: int = 10  # Max concurrent executions for this entry point
    max_resurrections: int = 3  # Auto-restart on non-fatal failure (0 to disable)

    def get_isolation_level(self) -> IsolationLevel:
        """Convert string isolation level to enum."""
        return IsolationLevel(self.isolation_level)

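# Example (sketch, with hypothetical node ID and config values):
#
#     spec = EntryPointSpec(
#         id="webhook",
#         name="Webhook intake",
#         entry_node="intake",
#         trigger_type="webhook",
#         isolation_level="isolated",
#         max_concurrent=1,
#     )
#     spec.get_isolation_level()  # IsolationLevel("isolated")
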
@dataclass
class ExecutionContext:
    """Context for a single execution."""

    id: str
    correlation_id: str
    stream_id: str
    entry_point: str
    input_data: dict[str, Any]
    isolation_level: IsolationLevel
    session_state: dict[str, Any] | None = None  # For resuming from pause
    run_id: str | None = None  # Unique ID per trigger() invocation
    started_at: datetime = field(default_factory=datetime.now)
    completed_at: datetime | None = None
    status: str = "pending"  # pending, running, completed, failed, paused, cancelled

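# Status lifecycle, as set by ExecutionManager below: "pending" -> "running" ->
# one of "completed" | "failed" | "paused" | "cancelled".
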
class ExecutionManager:
    """
    Manages concurrent executions for a single entry point.

    Each stream:
    - Has its own StreamRuntime for thread-safe decision tracking
    - Creates Orchestrator instances per execution
    - Manages execution lifecycle with proper isolation

    Example:
        stream = ExecutionManager(
            stream_id="webhook",
            entry_spec=webhook_entry,
            graph=graph_spec,
            goal=goal,
            state_manager=shared_state,
            storage=concurrent_storage,
            outcome_aggregator=aggregator,
            event_bus=event_bus,
            llm=llm_provider,
        )

        await stream.start()

        # Trigger execution
        exec_id = await stream.execute({"ticket_id": "123"})

        # Wait for result
        result = await stream.wait_for_completion(exec_id)
    """

    def __init__(
        self,
        stream_id: str,
        entry_spec: EntryPointSpec,
        graph: "GraphSpec",
        goal: "Goal",
        state_manager: SharedBufferManager,
        storage: "ConcurrentStorage",
        outcome_aggregator: "OutcomeAggregator | None" = None,
        event_bus: "EventBus | None" = None,
        llm: "LLMProvider | None" = None,
        tools: list["Tool"] | None = None,
        tool_executor: Callable | None = None,
        result_retention_max: int | None = 1000,
        result_retention_ttl_seconds: float | None = None,
        runtime_log_store: Any = None,
        session_store: "SessionStore | None" = None,
        checkpoint_config: CheckpointConfig | None = None,
        graph_id: str | None = None,
        accounts_prompt: str = "",
        accounts_data: list[dict] | None = None,
        tool_provider_map: dict[str, str] | None = None,
        skills_catalog_prompt: str = "",
        protocols_prompt: str = "",
        skill_dirs: list[str] | None = None,
        context_warn_ratio: float | None = None,
        batch_init_nudge: str | None = None,
        dynamic_memory_provider_factory: Callable[[str], Callable[[], str] | None] | None = None,
    ):
        """
        Initialize execution stream.

        Args:
            stream_id: Unique identifier for this stream
            entry_spec: Entry point specification
            graph: Graph specification for this agent
            goal: Goal driving execution
            state_manager: Shared state manager
            storage: Concurrent storage backend
            outcome_aggregator: For cross-stream evaluation
            event_bus: Optional event bus for publishing events
            llm: LLM provider for nodes
            tools: Available tools
            tool_executor: Function to execute tools
            result_retention_max: Max completed results to retain (None for unbounded)
            result_retention_ttl_seconds: Optional TTL for retained results
            runtime_log_store: Optional RuntimeLogStore for per-execution logging
            session_store: Optional SessionStore for unified session storage
            checkpoint_config: Optional checkpoint configuration for resumable sessions
            graph_id: Optional graph identifier for multi-graph sessions
            accounts_prompt: Connected accounts block for system prompt injection
            accounts_data: Raw account data for per-node prompt generation
            tool_provider_map: Tool name to provider name mapping for account routing
            skills_catalog_prompt: Available skills catalog for system prompt
            protocols_prompt: Default skill operational protocols for system prompt
            skill_dirs: Skill base directories for Tier 3 resource access
            context_warn_ratio: Token usage ratio to trigger DS-13 preservation warning
            batch_init_nudge: System prompt nudge for DS-12 batch auto-detection
            dynamic_memory_provider_factory: Maps an execution ID to an optional
                dynamic memory provider callable for that execution
        """
        self.stream_id = stream_id
        self.entry_spec = entry_spec
        self.graph = graph
        self.goal = goal
        self.graph_id = graph_id
        self._state_manager = state_manager
        self._storage = storage
        self._outcome_aggregator = outcome_aggregator
        self._event_bus = event_bus
        self._llm = llm
        self._tools = tools or []
        self._tool_executor = tool_executor
        self._result_retention_max = result_retention_max
        self._result_retention_ttl_seconds = result_retention_ttl_seconds
        self._runtime_log_store = runtime_log_store
        self._checkpoint_config = checkpoint_config
        self._session_store = session_store
        self._accounts_prompt = accounts_prompt
        self._accounts_data = accounts_data
        self._tool_provider_map = tool_provider_map
        self._skills_catalog_prompt = skills_catalog_prompt
        self._protocols_prompt = protocols_prompt
        self._skill_dirs: list[str] = skill_dirs or []
        self._context_warn_ratio: float | None = context_warn_ratio
        self._batch_init_nudge: str | None = batch_init_nudge
        self._dynamic_memory_provider_factory = dynamic_memory_provider_factory

        _es_logger = logging.getLogger(__name__)
        if protocols_prompt:
            _es_logger.info(
                "ExecutionStream[%s] received protocols_prompt (%d chars)",
                stream_id,
                len(protocols_prompt),
            )
        else:
            _es_logger.warning(
                "ExecutionStream[%s] received EMPTY protocols_prompt",
                stream_id,
            )

        # Create stream-scoped runtime
        self._runtime = StreamDecisionTracker(
            stream_id=stream_id,
            storage=storage,
        )

        # Execution tracking
        self._active_executions: dict[str, ExecutionContext] = {}
        self._execution_tasks: dict[str, asyncio.Task] = {}
        self._active_executors: dict[str, Orchestrator] = {}
        self._cancel_reasons: dict[str, str] = {}
        self._execution_results: OrderedDict[str, ExecutionResult] = OrderedDict()
        self._execution_result_times: dict[str, float] = {}
        self._completion_events: dict[str, asyncio.Event] = {}

        # Concurrency control
        self._semaphore = asyncio.Semaphore(entry_spec.max_concurrent)
        self._lock = asyncio.Lock()

        # Graph-scoped event bus (stamps graph_id on published events).
        # When a bus is provided, always wrap it in GraphScopedEventBus so
        # we can track last_activity_time.
        if self._event_bus:
            self._scoped_event_bus = GraphScopedEventBus(self._event_bus, self.graph_id or "")
        else:
            self._scoped_event_bus = None

        # State
        self._running = False

    async def start(self) -> None:
        """Start the execution stream."""
        if self._running:
            return

        self._running = True
        logger.info(f"ExecutionStream '{self.stream_id}' started")

        # Emit stream started event
        if self._scoped_event_bus:
            from framework.host.event_bus import AgentEvent, EventType

            await self._scoped_event_bus.publish(
                AgentEvent(
                    type=EventType.STREAM_STARTED,
                    stream_id=self.stream_id,
                    data={"entry_point": self.entry_spec.id},
                )
            )

    @property
    def active_execution_ids(self) -> list[str]:
        """Return IDs of all currently active executions."""
        return list(self._active_executions.keys())

    @property
    def agent_idle_seconds(self) -> float:
        """Seconds since the last agent activity (LLM call, tool call, node transition).

        Returns ``float('inf')`` if no event bus is attached. When there are
        no active executions, it also returns ``float('inf')`` (nothing to be
        idle *about*).
        """
        if not self._active_executions:
            return float("inf")
        bus = self._scoped_event_bus
        if isinstance(bus, GraphScopedEventBus):
            return time.monotonic() - bus.last_activity_time
        return float("inf")

    @property
    def is_awaiting_input(self) -> bool:
        """True when an active execution is blocked waiting for client input."""
        if not self._active_executors:
            return False
        for executor in self._active_executors.values():
            for node in executor.node_registry.values():
                if getattr(node, "_awaiting_input", False):
                    return True
        return False

    def get_waiting_nodes(self) -> list[dict[str, str]]:
        """Return nodes currently blocked waiting for client input.

        Each entry is ``{"node_id": ..., "execution_id": ...}``.
        """
        waiting: list[dict[str, str]] = []
        for exec_id, executor in self._active_executors.items():
            for node_id, node in executor.node_registry.items():
                if getattr(node, "_awaiting_input", False):
                    waiting.append({"node_id": node_id, "execution_id": exec_id})
        return waiting

    def get_injectable_nodes(self) -> list[dict[str, str]]:
        """Return nodes that support message injection (have ``inject_event``).

        Each entry is ``{"node_id": ..., "execution_id": ...}``.
        The currently executing node is placed first so that
        ``inject_message`` targets the active node, not a stale one.
        """
        injectable: list[dict[str, str]] = []
        current_first: list[dict[str, str]] = []
        for exec_id, executor in self._active_executors.items():
            current = getattr(executor, "current_node_id", None)
            for node_id, node in executor.node_registry.items():
                if hasattr(node, "inject_event"):
                    entry = {"node_id": node_id, "execution_id": exec_id}
                    if node_id == current:
                        current_first.append(entry)
                    else:
                        injectable.append(entry)
        return current_first + injectable

    def _record_execution_result(self, execution_id: str, result: ExecutionResult) -> None:
        """Record a completed execution result with retention pruning."""
        self._execution_results[execution_id] = result
        self._execution_results.move_to_end(execution_id)
        self._execution_result_times[execution_id] = time.time()
        self._prune_execution_results()

    def _prune_execution_results(self) -> None:
        """Prune completed results based on TTL and max retention."""
        if self._result_retention_ttl_seconds is not None:
            cutoff = time.time() - self._result_retention_ttl_seconds
            for exec_id, recorded_at in list(self._execution_result_times.items()):
                if recorded_at < cutoff:
                    self._execution_result_times.pop(exec_id, None)
                    self._execution_results.pop(exec_id, None)

        if self._result_retention_max is not None:
            while len(self._execution_results) > self._result_retention_max:
                old_exec_id, _ = self._execution_results.popitem(last=False)
                self._execution_result_times.pop(old_exec_id, None)

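    # Retention sketch (hypothetical names ``mgr``/``res_*``): with
    # result_retention_max=2 and no TTL, recording results for executions
    # a, b, c in that order evicts "a", since the OrderedDict acts as an
    # LRU keyed by insertion/update order:
    #
    #     mgr._record_execution_result("a", res_a)
    #     mgr._record_execution_result("b", res_b)
    #     mgr._record_execution_result("c", res_c)  # "a" is pruned here
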
    async def stop(self) -> None:
        """Stop the execution stream and cancel active executions."""
        if not self._running:
            return

        self._running = False

        # Cancel all active executions
        tasks_to_wait = []
        for task in self._execution_tasks.values():
            if not task.done():
                task.cancel()
                tasks_to_wait.append(task)

        if tasks_to_wait:
            # Wait briefly — don't block indefinitely if tasks are stuck
            # in long-running operations (LLM calls, tool executions).
            _, pending = await asyncio.wait(tasks_to_wait, timeout=5.0)
            if pending:
                logger.warning(
                    "%d execution task(s) did not finish within 5s after cancellation",
                    len(pending),
                )

        self._execution_tasks.clear()
        self._active_executions.clear()

        logger.info(f"ExecutionStream '{self.stream_id}' stopped")

        # Emit stream stopped event
        if self._scoped_event_bus:
            from framework.host.event_bus import AgentEvent, EventType

            await self._scoped_event_bus.publish(
                AgentEvent(
                    type=EventType.STREAM_STOPPED,
                    stream_id=self.stream_id,
                )
            )

    async def inject_input(
        self,
        node_id: str,
        content: str,
        *,
        is_client_input: bool = False,
        image_content: list[dict[str, Any]] | None = None,
    ) -> bool:
        """Inject user input into a running client-facing EventLoopNode.

        Searches active executors for a node matching ``node_id`` and calls
        its ``inject_event()`` method to unblock ``_await_user_input()``.

        Returns True if input was delivered, False otherwise.
        """
        for executor in self._active_executors.values():
            node = executor.node_registry.get(node_id)
            if node is not None and hasattr(node, "inject_event"):
                await node.inject_event(content, is_client_input=is_client_input, image_content=image_content)
                return True
        return False

    async def inject_trigger(
        self,
        node_id: str,
        trigger: Any,
    ) -> bool:
        """Inject a trigger event into a running queen EventLoopNode.

        Searches active executors for a node matching ``node_id`` and calls
        its ``inject_trigger()`` method to wake the queen.

        Args:
            node_id: The queen EventLoopNode ID.
            trigger: A ``TriggerEvent`` instance (typed as Any to avoid
                circular imports with graph layer).

        Returns True if the trigger was delivered, False otherwise.
        """
        for executor in self._active_executors.values():
            node = executor.node_registry.get(node_id)
            if node is not None and hasattr(node, "inject_trigger"):
                await node.inject_trigger(trigger)
                return True
        return False

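    # Injection sketch (hypothetical node ID): deliver a user reply to a
    # waiting node, falling back gracefully when the node is gone or does
    # not support injection:
    #
    #     delivered = await stream.inject_input(
    #         "client_chat", "yes, proceed", is_client_input=True
    #     )
    #     if not delivered:
    #         ...  # node finished, or it never had inject_event
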
    async def execute(
        self,
        input_data: dict[str, Any],
        correlation_id: str | None = None,
        session_state: dict[str, Any] | None = None,
        run_id: str | None = None,
    ) -> str:
        """
        Queue an execution and return its ID.

        Non-blocking - the execution runs in the background.

        Args:
            input_data: Input data for this execution
            correlation_id: Optional ID to correlate related executions
            session_state: Optional session state to resume from (with paused_at, memory)
            run_id: Unique ID for this trigger invocation (for run dividers)

        Returns:
            Execution ID for tracking
        """
        if not self._running:
            raise RuntimeError(f"ExecutionStream '{self.stream_id}' is not running")

        # Only one execution may run on a stream at a time — concurrent
        # executions corrupt shared session state. Cancel any running
        # execution before starting the new one. The cancelled execution
        # writes its state to disk before cleanup, and the new execution
        # runs in the same session directory (via resume_session_id).
        active = self.active_execution_ids
        for eid in active:
            logger.info(
                "Cancelling running execution %s on stream '%s' before starting new one",
                eid,
                self.stream_id,
            )
            executor = self._active_executors.get(eid)
            if executor:
                for node in executor.node_registry.values():
                    if hasattr(node, "signal_shutdown"):
                        node.signal_shutdown()
                    if hasattr(node, "cancel_current_turn"):
                        node.cancel_current_turn()
            await self.cancel_execution(eid, reason="Restarted with new execution")

        # When resuming, reuse the original session ID so the execution
        # continues in the same session directory instead of creating a new one.
        resume_session_id = session_state.get("resume_session_id") if session_state else None

        if resume_session_id:
            execution_id = resume_session_id
        elif self._session_store:
            execution_id = self._session_store.generate_session_id()
        else:
            # Fallback to old format if SessionStore not available (shouldn't happen)
            import warnings

            warnings.warn(
                "SessionStore not available, using deprecated exec_* ID format. "
                "Please ensure AgentRuntime is properly initialized.",
                DeprecationWarning,
                stacklevel=2,
            )
            execution_id = f"exec_{self.stream_id}_{uuid.uuid4().hex[:8]}"

        if correlation_id is None:
            correlation_id = execution_id

        # Create execution context
        effective_run_id = None
        if session_state:
            existing_run_id = session_state.get("run_id")
            if isinstance(existing_run_id, str) and existing_run_id:
                effective_run_id = existing_run_id
        if effective_run_id is None:
            effective_run_id = run_id

        ctx = ExecutionContext(
            id=execution_id,
            correlation_id=correlation_id,
            stream_id=self.stream_id,
            entry_point=self.entry_spec.id,
            input_data=input_data,
            isolation_level=self.entry_spec.get_isolation_level(),
            session_state=session_state,
            run_id=effective_run_id,
        )

        async with self._lock:
            self._active_executions[execution_id] = ctx
            self._completion_events[execution_id] = asyncio.Event()

        # Start execution task
        task = asyncio.create_task(self._run_execution(ctx))
        self._execution_tasks[execution_id] = task

        logger.debug(f"Queued execution {execution_id} for stream {self.stream_id}")
        return execution_id

    # Errors that indicate resurrection won't help — the same error will recur.
    # Includes both configuration/environment errors and deterministic node
    # failures where the conversation/state hasn't changed.
    _FATAL_ERROR_PATTERNS: tuple[str, ...] = (
        # Configuration / environment
        "credential",
        "authentication",
        "unauthorized",
        "forbidden",
        "api key",
        "import error",
        "module not found",
        "no module named",
        "permission denied",
        "invalid api",
        "configuration error",
        # Deterministic node failures — resurrecting at the same node with
        # the same conversation produces the same result.
        "node stalled",
        "ghost empty stream",
        "max iterations",
    )

    @classmethod
    def _is_fatal_error(cls, error: str | None) -> bool:
        """Return True if the error is fatal (no point resurrecting)."""
        if not error:
            return False
        error_lower = error.lower()
        return any(pat in error_lower for pat in cls._FATAL_ERROR_PATTERNS)

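    # Matching sketch: patterns are plain case-insensitive substrings, so
    #     _is_fatal_error("HTTP 401 Unauthorized")  # True ("unauthorized" matches)
    #     _is_fatal_error("rate limit exceeded")    # False (transient; resurrect)
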
    async def _run_execution(self, ctx: ExecutionContext) -> None:
        """Run a single execution within the stream.

        Supports automatic resurrection: when the execution fails with a
        non-fatal error, it restarts from the failed node up to
        ``entry_spec.max_resurrections`` times (default 3).
        """
        execution_id = ctx.id

        # When sharing a session with another entry point (resume_session_id),
        # skip writing initial/final session state — the primary execution
        # owns the state.json and _write_progress() keeps memory up-to-date.
        _is_shared_session = bool(ctx.session_state and ctx.session_state.get("resume_session_id"))

        max_resurrections = self.entry_spec.max_resurrections
        _resurrection_count = 0
        _current_session_state = ctx.session_state
        _current_input_data = ctx.input_data

        # Acquire semaphore to limit concurrency
        async with self._semaphore:
            ctx.status = "running"

            try:
                # Emit started event
                if self._scoped_event_bus:
                    await self._scoped_event_bus.emit_execution_started(
                        stream_id=self.stream_id,
                        execution_id=execution_id,
                        input_data=ctx.input_data,
                        correlation_id=ctx.correlation_id,
                        run_id=ctx.run_id,
                    )
                self._write_run_event(execution_id, ctx.run_id, "run_started")

                # Create execution-scoped memory
                self._state_manager.create_buffer(
                    execution_id=execution_id,
                    stream_id=self.stream_id,
                    isolation=ctx.isolation_level,
                )

                # Create runtime adapter for this execution
                runtime_adapter = StreamRuntimeAdapter(self._runtime, execution_id)

                # Start run to set trace context (CRITICAL for observability)
                runtime_adapter.start_run(
                    goal_id=self.goal.id,
                    goal_description=self.goal.description,
                    input_data=ctx.input_data,
                )

                # Create per-execution runtime logger
                runtime_logger = None
                if self._runtime_log_store:
                    from framework.tracker.runtime_logger import RuntimeLogger

                    runtime_logger = RuntimeLogger(store=self._runtime_log_store, agent_id=self.graph.id)

                # Derive storage from session_store (graph-specific for secondary
                # graphs) so that all files — conversations, state, checkpoints,
                # data — land under the graph's own sessions/ directory, not the
                # primary worker's.
                if self._session_store:
                    exec_storage = self._session_store.sessions_dir / execution_id
                else:
                    exec_storage = self._storage.base_path / "sessions" / execution_id

                # Create modified graph with entry point
                # We need to override the entry_node to use our entry point
                modified_graph = self._create_modified_graph()

                # Write initial session state
                if not _is_shared_session:
                    await self._write_session_state(execution_id, ctx)

                # --- Resurrection loop ---
                # Each iteration creates a fresh executor. On non-fatal failure,
                # the executor's session_state (memory + resume_from) carries
                # forward so the next attempt resumes at the failed node.
                while True:
                    # Create executor for this execution.
                    executor = Orchestrator(
                        runtime=runtime_adapter,
                        llm=self._llm,
                        tools=self._tools,
                        tool_executor=self._tool_executor,
                        event_bus=self._scoped_event_bus,
                        stream_id=self.stream_id,
                        execution_id=execution_id,
                        run_id=ctx.run_id or "",
                        storage_path=exec_storage,
                        runtime_logger=runtime_logger,
                        loop_config=self.graph.loop_config,
                        accounts_prompt=self._accounts_prompt,
                        accounts_data=self._accounts_data,
                        tool_provider_map=self._tool_provider_map,
                        skills_catalog_prompt=self._skills_catalog_prompt,
                        protocols_prompt=self._protocols_prompt,
                        skill_dirs=self._skill_dirs,
                        context_warn_ratio=self._context_warn_ratio,
                        batch_init_nudge=self._batch_init_nudge,
                        dynamic_memory_provider=(
                            self._dynamic_memory_provider_factory(execution_id)
                            if self._dynamic_memory_provider_factory is not None
                            else None
                        ),
                    )
                    # Track executor so inject_input() can reach EventLoopNode instances
                    self._active_executors[execution_id] = executor

                    # Execute
                    result = await executor.execute(
                        graph=modified_graph,
                        goal=self.goal,
                        input_data=_current_input_data,
                        session_state=_current_session_state,
                        checkpoint_config=self._checkpoint_config,
                    )

                    # Clean up executor reference
                    self._active_executors.pop(execution_id, None)

                    # Check if resurrection is appropriate
                    if (
                        not result.success
                        and not result.paused_at
                        and _resurrection_count < max_resurrections
                        and result.session_state
                        and not self._is_fatal_error(result.error)
                    ):
                        _resurrection_count += 1
                        logger.warning(
                            "Execution %s failed (%s) — resurrecting (%d/%d) from node '%s'",
                            execution_id,
                            (result.error or "unknown")[:200],
                            _resurrection_count,
                            max_resurrections,
                            result.session_state.get("resume_from", "?"),
                        )

                        # Emit resurrection event
                        if self._scoped_event_bus:
                            from framework.host.event_bus import AgentEvent, EventType

                            await self._scoped_event_bus.publish(
                                AgentEvent(
                                    type=EventType.EXECUTION_RESURRECTED,
                                    stream_id=self.stream_id,
                                    execution_id=execution_id,
                                    data={
                                        "attempt": _resurrection_count,
                                        "max_resurrections": max_resurrections,
                                        "error": (result.error or "")[:500],
                                        "resume_from": result.session_state.get("resume_from"),
                                    },
                                )
                            )

                        # Resume from the failed node with preserved memory
                        _current_session_state = {
                            **result.session_state,
                            "resume_session_id": execution_id,
                        }
                        # On resurrection, input_data is already in memory —
                        # pass empty so we don't overwrite intermediate results.
                        _current_input_data = {}

                        # Brief cooldown before resurrection
                        await asyncio.sleep(2.0)
                        continue

                    break  # success, fatal failure, or resurrections exhausted

                # Store result with retention
                self._record_execution_result(execution_id, result)

                # End run to complete trace (for observability)
                runtime_adapter.end_run(
                    success=result.success,
                    narrative=f"Execution {'succeeded' if result.success else 'failed'}",
                    output_data=result.output,
                )

                # Update context
                ctx.completed_at = datetime.now()
                ctx.status = "completed" if result.success else "failed"
                if result.paused_at:
                    ctx.status = "paused"

                # Write final session state (skip for shared-session executions)
                if not _is_shared_session:
                    await self._write_session_state(execution_id, ctx, result=result)

                # Emit completion/failure/pause event
                if self._scoped_event_bus:
                    if result.success:
                        await self._scoped_event_bus.emit_execution_completed(
                            stream_id=self.stream_id,
                            execution_id=execution_id,
                            output=result.output,
                            correlation_id=ctx.correlation_id,
                            run_id=ctx.run_id,
                        )
                    elif result.paused_at:
                        # The executor returns paused_at on CancelledError but
                        # does NOT emit execution_paused itself — we must emit
                        # it here so the frontend can transition out of "running".
                        await self._scoped_event_bus.emit_execution_paused(
                            stream_id=self.stream_id,
                            node_id=result.paused_at,
                            reason=result.error or "Execution paused",
                            execution_id=execution_id,
                        )
                    else:
                        await self._scoped_event_bus.emit_execution_failed(
                            stream_id=self.stream_id,
                            execution_id=execution_id,
                            error=result.error or "Unknown error",
                            correlation_id=ctx.correlation_id,
                            run_id=ctx.run_id,
                        )

                # Write run event for historical restoration
                if result.success:
                    self._write_run_event(execution_id, ctx.run_id, "run_completed")
                elif result.paused_at:
                    self._write_run_event(execution_id, ctx.run_id, "run_paused")
                else:
                    self._write_run_event(
                        execution_id,
                        ctx.run_id,
                        "run_failed",
                        {"error": result.error or "Unknown error"},
                    )

                logger.debug(f"Execution {execution_id} completed: success={result.success}")

            except asyncio.CancelledError:
                # Execution was cancelled.
                # The executor catches CancelledError and returns a paused result,
                # but if cancellation happened before the executor started, we
                # won't have a result.
                logger.info(f"Execution {execution_id} cancelled")

                # Check if we have a result (executor completed and returned)
                try:
                    _ = result  # Check if result variable exists
                    has_result = True
                except NameError:
                    has_result = False
                    result = ExecutionResult(
                        success=False,
                        error="Execution cancelled",
                    )

                # Update context status based on result
                if has_result and result.paused_at:
                    ctx.status = "paused"
                    ctx.completed_at = datetime.now()
                else:
                    ctx.status = "cancelled"

                # Clean up executor reference
                self._active_executors.pop(execution_id, None)

                # Store result with retention
                self._record_execution_result(execution_id, result)

                # Write session state (skip for shared-session executions)
                if not _is_shared_session:
                    if has_result and result.paused_at:
                        await self._write_session_state(execution_id, ctx, result=result)
                    else:
                        await self._write_session_state(execution_id, ctx, error="Execution cancelled")

                # Emit SSE event so the frontend knows the execution stopped.
                # The executor does NOT emit on CancelledError, so there is no
                # risk of double-emitting.
                cancel_reason = self._cancel_reasons.pop(execution_id, "Execution cancelled")
                if self._scoped_event_bus:
                    if has_result and result.paused_at:
                        await self._scoped_event_bus.emit_execution_paused(
                            stream_id=self.stream_id,
                            node_id=result.paused_at,
                            reason=cancel_reason,
                            execution_id=execution_id,
                        )
                    else:
                        await self._scoped_event_bus.emit_execution_failed(
                            stream_id=self.stream_id,
                            execution_id=execution_id,
                            error=cancel_reason,
                            correlation_id=ctx.correlation_id,
                            run_id=ctx.run_id,
                        )

                self._write_run_event(execution_id, ctx.run_id, "run_cancelled")
                # Don't re-raise - we've handled it and saved state

            except Exception as e:
                ctx.status = "failed"
                logger.error(f"Execution {execution_id} failed: {e}")

                # Store error result with retention
                self._record_execution_result(
                    execution_id,
                    ExecutionResult(
                        success=False,
                        error=str(e),
                    ),
                )

                # Write error session state (skip for shared-session executions)
                if not _is_shared_session:
                    await self._write_session_state(execution_id, ctx, error=str(e))

                # End run with failure (for observability)
                try:
                    runtime_adapter.end_run(
                        success=False,
                        narrative=f"Execution failed: {str(e)}",
                        output_data={},
                    )
                except Exception:
                    pass  # Don't let end_run errors mask the original error

                # Emit failure event
                if self._scoped_event_bus:
                    await self._scoped_event_bus.emit_execution_failed(
                        stream_id=self.stream_id,
                        execution_id=execution_id,
                        error=str(e),
                        correlation_id=ctx.correlation_id,
                        run_id=ctx.run_id,
                    )
                self._write_run_event(execution_id, ctx.run_id, "run_failed", {"error": str(e)})

            finally:
                # Clean up state
                self._state_manager.cleanup_execution(execution_id)

                # Signal completion
                if execution_id in self._completion_events:
                    self._completion_events[execution_id].set()

                # Remove in-flight bookkeeping
                async with self._lock:
                    self._active_executions.pop(execution_id, None)
                    self._completion_events.pop(execution_id, None)
                    self._execution_tasks.pop(execution_id, None)

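    # Resurrection flow, summarized: execute -> non-fatal failure with
    # session_state present -> emit EXECUTION_RESURRECTED -> resume from the
    # failed node with preserved memory and empty input_data -> repeat up to
    # max_resurrections -> record the final result and emit
    # completed/paused/failed.
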
    def _write_run_event(
        self,
        execution_id: str,
        run_id: str | None,
        event: str,
        extra: dict[str, Any] | None = None,
    ) -> None:
        """Append a run lifecycle event to runs.jsonl for historical restoration."""
        if not self._session_store or not run_id:
            return
        import json as _json

        try:
            session_dir = self._session_store.get_session_path(execution_id)
        except ValueError:
            return
        runs_file = session_dir / "runs.jsonl"
        now = datetime.now()
        record = {
            "run_id": run_id,
            "event": event,
            "timestamp": now.isoformat(),
            "created_at": now.timestamp(),
        }
        if extra:
            record.update(extra)
        try:
            runs_file.parent.mkdir(parents=True, exist_ok=True)
            with open(runs_file, "a", encoding="utf-8") as f:
                f.write(_json.dumps(record) + "\n")
        except OSError:
            pass  # Non-critical — don't break execution

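    # Sample runs.jsonl line (keys from the record above; values illustrative):
    #     {"run_id": "run-123", "event": "run_started",
    #      "timestamp": "2025-01-01T12:00:00", "created_at": 1735732800.0}
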
    async def _write_session_state(
        self,
        execution_id: str,
        ctx: ExecutionContext,
        result: ExecutionResult | None = None,
        error: str | None = None,
    ) -> None:
        """
        Write state.json for a session.

        Args:
            execution_id: Session/execution ID
            ctx: Execution context
            result: Optional execution result (if completed)
            error: Optional error message (if failed)
        """
        # Only write if session_store is available
        if not self._session_store:
            return

        from framework.schemas.session_state import SessionState, SessionStatus

        try:
            # Determine status
            if result:
                if result.paused_at:
                    status = SessionStatus.PAUSED
                elif result.success:
                    status = SessionStatus.COMPLETED
                else:
                    status = SessionStatus.FAILED
            elif error:
                # Check if this is a cancellation
                if ctx.status == "cancelled" or "cancelled" in error.lower():
                    status = SessionStatus.CANCELLED
                else:
                    status = SessionStatus.FAILED
            else:
                status = SessionStatus.ACTIVE

            # Create SessionState
            if result:
                # Create from execution result
                state = SessionState.from_execution_result(
                    session_id=execution_id,
                    goal_id=self.goal.id,
                    result=result,
                    stream_id=self.stream_id,
                    correlation_id=ctx.correlation_id,
                    started_at=ctx.started_at.isoformat(),
                    input_data=ctx.input_data,
                    agent_id=self.graph.id,
                    entry_point=self.entry_spec.id,
                )
                state.current_run_id = ctx.run_id
            else:
                # Create initial state — when resuming, preserve the previous
                # execution's progress so crashes don't lose track of state.
                from framework.schemas.session_state import (
                    SessionProgress,
                    SessionTimestamps,
                )

                now = datetime.now().isoformat()
                ss = ctx.session_state or {}
                progress = SessionProgress(
                    current_node=ss.get("paused_at") or ss.get("resume_from"),
                    paused_at=ss.get("paused_at"),
                    resume_from=ss.get("paused_at") or ss.get("resume_from"),
                    path=ss.get("execution_path", []),
                    node_visit_counts=ss.get("node_visit_counts", {}),
                )
                state = SessionState(
                    session_id=execution_id,
                    stream_id=self.stream_id,
                    correlation_id=ctx.correlation_id,
                    goal_id=self.goal.id,
                    agent_id=self.graph.id,
                    entry_point=self.entry_spec.id,
                    status=status,
                    timestamps=SessionTimestamps(
                        started_at=ctx.started_at.isoformat(),
                        updated_at=now,
                    ),
                    progress=progress,
                    data_buffer=ss.get("data_buffer", ss.get("memory", {})),
                    input_data=ctx.input_data,
                    current_run_id=ctx.run_id,
                )

            # Handle error case
            if error:
                state.result.error = error

            # Stamp the owning process ID for cross-process stale detection
            state.pid = os.getpid()

            # Write state.json
            await self._session_store.write_state(execution_id, state)
            logger.debug(f"Wrote state.json for session {execution_id} (status={status})")

        except Exception as e:
            # Log but don't fail the execution
            logger.error(f"Failed to write state.json for {execution_id}: {e}")

    def _create_modified_graph(self) -> "GraphSpec":
        """Create a graph with the entry point overridden.

        Preserves the original graph's entry_points so that validation
        correctly considers ALL entry nodes reachable.
        Each stream only executes from its own entry_node, but the full
        graph must validate with all entry points accounted for.
        """
        from framework.orchestrator.edge import GraphSpec

        # Merge entry points: this stream's entry + original graph's primary
        # entry + any other entry points. This ensures all nodes are
        # reachable during validation even though this stream only starts
        # from self.entry_spec.entry_node.
        merged_entry_points = {
            "start": self.entry_spec.entry_node,
        }
        # Preserve the original graph's primary entry node
        if self.graph.entry_node:
            merged_entry_points["primary"] = self.graph.entry_node
        # Include any explicitly defined entry points from the graph
        merged_entry_points.update(self.graph.entry_points)

        return GraphSpec(
            id=self.graph.id,
            goal_id=self.graph.goal_id,
            version=self.graph.version,
            entry_node=self.entry_spec.entry_node,  # Use our entry point
            entry_points=merged_entry_points,
            terminal_nodes=self.graph.terminal_nodes,
            pause_nodes=self.graph.pause_nodes,
            nodes=self.graph.nodes,
            edges=self.graph.edges,
            default_model=self.graph.default_model,
            max_tokens=self.graph.max_tokens,
            max_steps=self.graph.max_steps,
            cleanup_llm_model=self.graph.cleanup_llm_model,
            loop_config=self.graph.loop_config,
            conversation_mode=self.graph.conversation_mode,
            identity_prompt=self.graph.identity_prompt,
        )

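    # Merge sketch (hypothetical node IDs): with entry_spec.entry_node="triage",
    # graph.entry_node="main", and graph.entry_points={"retry": "retry_node"},
    # validation sees {"start": "triage", "primary": "main", "retry": "retry_node"}
    # while this stream still starts only from "triage".
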
    async def wait_for_completion(
        self,
        execution_id: str,
        timeout: float | None = None,
    ) -> ExecutionResult | None:
        """
        Wait for an execution to complete.

        Args:
            execution_id: Execution to wait for
            timeout: Maximum time to wait (seconds)

        Returns:
            ExecutionResult or None if timeout
        """
        event = self._completion_events.get(execution_id)
        if event is None:
            # Execution not found or already cleaned up
            self._prune_execution_results()
            return self._execution_results.get(execution_id)

        try:
            if timeout:
                await asyncio.wait_for(event.wait(), timeout=timeout)
            else:
                await event.wait()

            self._prune_execution_results()
            return self._execution_results.get(execution_id)

        except TimeoutError:
            return None

    def get_result(self, execution_id: str) -> ExecutionResult | None:
        """Get result of a completed execution."""
        self._prune_execution_results()
        return self._execution_results.get(execution_id)

    def get_context(self, execution_id: str) -> ExecutionContext | None:
        """Get execution context."""
        return self._active_executions.get(execution_id)

    async def cancel_execution(self, execution_id: str, *, reason: str | None = None) -> bool:
        """
        Cancel a running execution.

        Args:
            execution_id: Execution to cancel
            reason: Human-readable reason for the cancellation (e.g.
                "Stopped by queen", "User requested pause"). If not
                provided, defaults to "Execution cancelled".

        Returns:
            True if cancelled, False if not found
        """
        task = self._execution_tasks.get(execution_id)
        if task and not task.done():
            # Store the reason so the CancelledError handler can use it
            # when emitting the pause/fail event.
            self._cancel_reasons[execution_id] = reason or "Execution cancelled"
            task.cancel()
            # Wait briefly for the task to finish. Don't block indefinitely —
            # the task may be stuck in a long LLM API call that doesn't
            # respond to cancellation quickly.
            done, _ = await asyncio.wait({task}, timeout=5.0)
            if not done:
                # Task didn't finish within timeout — clean up bookkeeping now
                # so the session doesn't think it still has running executions.
                # The task will continue winding down in the background and its
                # finally block will harmlessly pop already-removed keys.
                logger.warning(
                    "Execution %s did not finish within cancel timeout; force-cleaning bookkeeping",
                    execution_id,
                )
                async with self._lock:
                    self._active_executions.pop(execution_id, None)
                    self._execution_tasks.pop(execution_id, None)
                    self._active_executors.pop(execution_id, None)
            return True
        return False

    # === STATS AND MONITORING ===

    def get_active_count(self) -> int:
        """Get count of active executions."""
        return len([ctx for ctx in self._active_executions.values() if ctx.status == "running"])

    def get_stats(self) -> dict:
        """Get stream statistics."""
        statuses = {}
        for ctx in self._active_executions.values():
            statuses[ctx.status] = statuses.get(ctx.status, 0) + 1

        # Calculate available slots from running count instead of accessing private _value
        running_count = statuses.get("running", 0)
        available_slots = self.entry_spec.max_concurrent - running_count

        return {
            "stream_id": self.stream_id,
            "entry_point": self.entry_spec.id,
            "running": self._running,
            "total_executions": len(self._active_executions),
            "completed_executions": len(self._execution_results),
            "status_counts": statuses,
            "max_concurrent": self.entry_spec.max_concurrent,
            "available_slots": available_slots,
        }
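
    # get_stats() sketch (keys from the dict above; values illustrative):
    #     {"stream_id": "webhook", "entry_point": "webhook", "running": True,
    #      "total_executions": 1, "completed_executions": 4,
    #      "status_counts": {"running": 1}, "max_concurrent": 10,
    #      "available_slots": 9}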