"""
Agent Runtime - Top-level orchestrator for multi-entry-point agents.

Manages agent lifecycle and coordinates multiple execution streams while
preserving the goal-driven approach.
"""

import asyncio
import logging
import time
import uuid
from collections import OrderedDict
from collections.abc import Callable
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any

from framework.host.event_bus import EventBus
from framework.host.execution_manager import EntryPointSpec, ExecutionManager
from framework.host.outcome_aggregator import OutcomeAggregator
from framework.host.shared_state import SharedBufferManager
from framework.orchestrator.checkpoint_config import CheckpointConfig
from framework.orchestrator.orchestrator import ExecutionResult
from framework.storage.concurrent import ConcurrentStorage
from framework.storage.session_store import SessionStore
from framework.tracker.runtime_log_store import RuntimeLogStore

if TYPE_CHECKING:
    from framework.llm.provider import LLMProvider, Tool
    from framework.orchestrator.edge import GraphSpec
    from framework.orchestrator.goal import Goal
    from framework.pipeline.stage import PipelineStage
    from framework.skills.manager import SkillsManagerConfig

logger = logging.getLogger(__name__)


@dataclass
class AgentRuntimeConfig:
    """Configuration for AgentHost."""

    max_concurrent_executions: int = 100
    cache_ttl: float = 60.0
    batch_interval: float = 0.1
    max_history: int = 1000
    execution_result_max: int = 1000
    execution_result_ttl_seconds: float | None = None
    # Idempotency cache for trigger() deduplication
    idempotency_ttl_seconds: float = 300.0
    idempotency_max_keys: int = 10000
    # Webhook server config (only starts if webhook_routes is non-empty)
    webhook_host: str = "127.0.0.1"
    webhook_port: int = 8080
    webhook_routes: list[dict] = field(default_factory=list)
    # Each dict: {"source_id": str, "path": str, "methods": ["POST"], "secret": str|None}


@dataclass
class _GraphRegistration:
    """Tracks a loaded graph and its runtime resources."""

    graph: "GraphSpec"
    goal: "Goal"
    entry_points: dict[str, EntryPointSpec]
    streams: dict[str, ExecutionManager]  # ep_id -> stream (NOT namespaced)
    storage_subpath: str  # relative to session root, e.g. "graphs/email_agent"
    event_subscriptions: list[str] = field(default_factory=list)
    timer_tasks: list[asyncio.Task] = field(default_factory=list)
    timer_next_fire: dict[str, float] = field(default_factory=dict)


class AgentHost:
    """
    Top-level runtime that manages agent lifecycle and concurrent executions.

    Responsibilities:
    - Register and manage multiple entry points
    - Coordinate execution streams
    - Manage shared state across streams
    - Aggregate decisions/outcomes for goal evaluation
    - Handle lifecycle events (start, pause, shutdown)

    Example:
        # Create runtime
        runtime = AgentHost(
            graph=support_agent_graph,
            goal=support_agent_goal,
            storage_path=Path("./storage"),
            llm=llm_provider,
        )

        # Register entry points
        runtime.register_entry_point(EntryPointSpec(
            id="webhook",
            name="Zendesk Webhook",
            entry_node="process-webhook",
            trigger_type="webhook",
            isolation_level="shared",
        ))
        runtime.register_entry_point(EntryPointSpec(
            id="api",
            name="API Handler",
            entry_node="process-request",
            trigger_type="api",
            isolation_level="shared",
        ))

        # Start runtime
        await runtime.start()

        # Trigger executions (non-blocking)
        exec_1 = await runtime.trigger("webhook", {"ticket_id": "123"})
        exec_2 = await runtime.trigger("api", {"query": "help"})

        # Check goal progress
        progress = await runtime.get_goal_progress()
        print(f"Progress: {progress['overall_progress']:.1%}")

        # Stop runtime
        await runtime.stop()
    """

    def __init__(
        self,
        graph: "GraphSpec",
        goal: "Goal",
        storage_path: str | Path,
        llm: "LLMProvider | None" = None,
        tools: list["Tool"] | None = None,
        tool_executor: Callable | None = None,
        config: AgentRuntimeConfig | None = None,
        runtime_log_store: Any = None,
        checkpoint_config: CheckpointConfig | None = None,
        graph_id: str | None = None,
        accounts_prompt: str = "",
        accounts_data: list[dict] | None = None,
        tool_provider_map: dict[str, str] | None = None,
        event_bus: "EventBus | None" = None,
        skills_manager_config: "SkillsManagerConfig | None" = None,
        # Deprecated — pass skills_manager_config instead.
        skills_catalog_prompt: str = "",
        protocols_prompt: str = "",
        skill_dirs: list[str] | None = None,
        pipeline_stages: "list[PipelineStage] | None" = None,
    ):
        """
        Initialize agent runtime.

        Args:
            graph: Graph specification for this agent
            goal: Goal driving execution
            storage_path: Path for persistent storage
            llm: LLM provider for nodes
            tools: Available tools
            tool_executor: Function to execute tools
            config: Optional runtime configuration
            runtime_log_store: Optional RuntimeLogStore for per-execution logging
            checkpoint_config: Optional checkpoint configuration for resumable sessions
            graph_id: Optional identifier for the primary graph (defaults to "primary")
            accounts_prompt: Connected accounts block for system prompt injection
            accounts_data: Raw account data for per-node prompt generation
            tool_provider_map: Tool name to provider name mapping for account routing
            event_bus: Optional external EventBus. If provided, the runtime shares
                this bus instead of creating its own. Used by SessionManager to
                share a single bus between queen, worker, and judge.
            skills_manager_config: Skill configuration — the runtime owns discovery,
                loading, and prompt rendering internally.
            skills_catalog_prompt: Deprecated. Pre-rendered skills catalog for the
                system prompt.
            protocols_prompt: Deprecated. Pre-rendered skill operational protocols
                for the system prompt.
            skill_dirs: Skill base directories for Tier 3 resource access
            pipeline_stages: Optional pre-built middleware stages run before every
                trigger() dispatch; when omitted, stages load from config.
        """
        from framework.pipeline.runner import PipelineRunner
        from framework.skills.manager import SkillsManager

        self.graph = graph
        self.goal = goal
        self._config = config or AgentRuntimeConfig()
        self._runtime_log_store = runtime_log_store
        self._checkpoint_config = checkpoint_config
        self.accounts_prompt = accounts_prompt

        # Pipeline middleware: runs before every trigger() dispatch.
        # Accepts either pre-built stage objects or loads from config.
        if pipeline_stages:
            self._pipeline = PipelineRunner(pipeline_stages)
        else:
            self._pipeline = self._load_pipeline_from_config()

        # --- Skill lifecycle: runtime owns the SkillsManager ---
        if skills_manager_config is not None:
            # New path: config-driven, runtime handles loading
            self._skills_manager = SkillsManager(skills_manager_config)
            self._skills_manager.load()
        elif skills_catalog_prompt or protocols_prompt:
            # Legacy path: caller passed pre-rendered strings
            import warnings

            warnings.warn(
                "Passing pre-rendered skills_catalog_prompt/protocols_prompt "
                "is deprecated. Pass skills_manager_config instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            self._skills_manager = SkillsManager.from_precomputed(skills_catalog_prompt, protocols_prompt)
        else:
            # Bare constructor: auto-load defaults
            self._skills_manager = SkillsManager()
            self._skills_manager.load()

        self.skill_dirs: list[str] = self._skills_manager.allowlisted_dirs
        self.context_warn_ratio: float | None = self._skills_manager.context_warn_ratio
        self.batch_init_nudge: str | None = self._skills_manager.batch_init_nudge

        # Primary graph identity
        self._graph_id: str = graph_id or "primary"
        # Multi-graph state
        self._graphs: dict[str, _GraphRegistration] = {}
        self._active_graph_id: str = self._graph_id
        # User presence tracking (monotonic timestamp of last inject_input)
        self._last_user_input_time: float = 0.0

        # Initialize storage
        storage_path_obj = Path(storage_path) if isinstance(storage_path, str) else storage_path
        self._storage = ConcurrentStorage(
            base_path=storage_path_obj,
            cache_ttl=self._config.cache_ttl,
            batch_interval=self._config.batch_interval,
        )
        # Initialize SessionStore for unified sessions (always enabled)
        self._session_store = SessionStore(storage_path_obj)

        # Initialize shared components
        self._state_manager = SharedBufferManager()
        self._event_bus = event_bus or EventBus(max_history=self._config.max_history)
        self._outcome_aggregator = OutcomeAggregator(goal, self._event_bus)

        # LLM and tools
        self._llm = llm
        self._tools = tools or []
        self._tool_executor = tool_executor
        self._accounts_prompt = accounts_prompt
        self._dynamic_memory_provider_factory: Callable[[str], Callable[[], str] | None] | None = None
        self._accounts_data = accounts_data
        self._tool_provider_map = tool_provider_map

        # Entry points and streams (primary graph)
        self._entry_points: dict[str, EntryPointSpec] = {}
        self._streams: dict[str, ExecutionManager] = {}
        # Webhook server (created on start if webhook_routes configured)
        self._webhook_server: Any = None
        # Event-driven entry point subscriptions (primary graph)
        self._event_subscriptions: list[str] = []
        # Timer tasks for scheduled entry points (primary graph)
        self._timer_tasks: list[asyncio.Task] = []
        # Next fire time for each timer entry point (ep_id -> monotonic timestamp)
        self._timer_next_fire: dict[str, float] = {}
        # Idempotency cache for trigger() deduplication
        self._idempotency_keys: OrderedDict[str, str] = OrderedDict()
        self._idempotency_times: dict[str, float] = {}
        # State
        self._running = False
        self._timers_paused = False
        self._lock = asyncio.Lock()
        # Optional greeting shown to user on TUI load (set by AgentRunner)
        self.intro_message: str = ""

    # ------------------------------------------------------------------
    # Skill prompt accessors (read by ExecutionManager constructors)
    # ------------------------------------------------------------------
    @property
    def skills_catalog_prompt(self) -> str:
        return self._skills_manager.skills_catalog_prompt

    @property
    def protocols_prompt(self) -> str:
        return self._skills_manager.protocols_prompt

    def register_entry_point(self, spec: EntryPointSpec) -> None:
        """
        Register a named entry point for the agent.

        Args:
            spec: Entry point specification

        Raises:
            ValueError: If entry point ID already registered
            RuntimeError: If runtime is already running
        """
        if self._running:
            raise RuntimeError("Cannot register entry points while runtime is running")
        if spec.id in self._entry_points:
            raise ValueError(f"Entry point '{spec.id}' already registered")
        # Validate entry node exists in graph
        if self.graph.get_node(spec.entry_node) is None:
            raise ValueError(f"Entry node '{spec.entry_node}' not found in graph")
        self._entry_points[spec.id] = spec
        logger.info(f"Registered entry point: {spec.id} -> {spec.entry_node}")

    def unregister_entry_point(self, entry_point_id: str) -> bool:
        """
        Unregister an entry point.

        Args:
            entry_point_id: Entry point to remove

        Returns:
            True if removed, False if not found

        Raises:
            RuntimeError: If runtime is running
        """
        if self._running:
            raise RuntimeError("Cannot unregister entry points while runtime is running")
        if entry_point_id in self._entry_points:
            del self._entry_points[entry_point_id]
            return True
        return False

    async def start(self) -> None:
        """Start the agent runtime and all registered entry points."""
        if self._running:
            return
        async with self._lock:
            # Start storage
            await self._storage.start()

            # Initialize pipeline stages FIRST -- they inject LLM, tools,
            # credentials, and skills into the host before streams are created.
            await self._pipeline.initialize_all()
            self._apply_pipeline_results()

            # Create streams for each entry point (uses pipeline results)
            for ep_id, spec in self._entry_points.items():
                stream = ExecutionManager(
                    stream_id=ep_id,
                    entry_spec=spec,
                    graph=self.graph,
                    goal=self.goal,
                    state_manager=self._state_manager,
                    storage=self._storage,
                    outcome_aggregator=self._outcome_aggregator,
                    event_bus=self._event_bus,
                    llm=self._llm,
                    tools=self._tools,
                    tool_executor=self._tool_executor,
                    result_retention_max=self._config.execution_result_max,
                    result_retention_ttl_seconds=self._config.execution_result_ttl_seconds,
                    runtime_log_store=self._runtime_log_store,
                    session_store=self._session_store,
                    checkpoint_config=self._checkpoint_config,
                    graph_id=self._graph_id,
                    accounts_prompt=self._accounts_prompt,
                    accounts_data=self._accounts_data,
                    tool_provider_map=self._tool_provider_map,
                    skills_catalog_prompt=self.skills_catalog_prompt,
                    protocols_prompt=self.protocols_prompt,
                    skill_dirs=self.skill_dirs,
                    context_warn_ratio=self.context_warn_ratio,
                    batch_init_nudge=self.batch_init_nudge,
                    dynamic_memory_provider_factory=self._dynamic_memory_provider_factory,
                )
                await stream.start()
                self._streams[ep_id] = stream

            # Start webhook server if routes are configured
            if self._config.webhook_routes:
                from framework.host.webhook_server import (
                    WebhookRoute,
                    WebhookServer,
                    WebhookServerConfig,
                )

                wh_config = WebhookServerConfig(
                    host=self._config.webhook_host,
                    port=self._config.webhook_port,
                )
                self._webhook_server = WebhookServer(self._event_bus, wh_config)
                for rc in self._config.webhook_routes:
                    route = WebhookRoute(
                        source_id=rc["source_id"],
                        path=rc["path"],
                        methods=rc.get("methods", ["POST"]),
                        secret=rc.get("secret"),
                    )
                    self._webhook_server.add_route(route)
                await self._webhook_server.start()

            # Subscribe event-driven entry points to EventBus
            from framework.host.event_bus import EventType as _ET

            for ep_id, spec in self._entry_points.items():
                if spec.trigger_type != "event":
                    continue
                tc = spec.trigger_config
                event_types = [_ET(et) for et in tc.get("event_types", [])]
                if not event_types:
                    logger.warning(
                        f"Entry point '{ep_id}' has trigger_type='event' but no event_types in trigger_config"
                    )
                    continue
                # Capture ep_id and config in closure
                exclude_own = tc.get("exclude_own_graph", False)

                def _make_handler(entry_point_id: str, _exclude_own: bool):
                    # One persistent session per isolated entry point, reused
                    # across events so conversations continue.
                    _persistent_session_id: str | None = None

                    async def _on_event(event):
                        nonlocal _persistent_session_id
                        if not self._running or entry_point_id not in self._streams:
                            return
                        # Skip events originating from this graph's own
                        # executions (e.g. guardian should not fire on
                        # queen failures — only secondary graphs).
                        if _exclude_own and event.graph_id == self._graph_id:
                            return
                        ep_spec = self._entry_points.get(entry_point_id)
                        is_isolated = ep_spec and ep_spec.isolation_level == "isolated"
                        if is_isolated:
                            if _persistent_session_id:
                                session_state = {"resume_session_id": _persistent_session_id}
                            else:
                                session_state = None
                        else:
                            # Run in the same session as the primary entry
                            # point so memory (e.g. user-defined rules) is
                            # shared and logs land in one session directory.
                            session_state = self._get_primary_session_state(exclude_entry_point=entry_point_id)
                        exec_id = await self.trigger(
                            entry_point_id,
                            {"event": event.to_dict()},
                            session_state=session_state,
                        )
                        # NOTE(review): exec_id is stored as a session id for
                        # later resume — confirm execution ids double as
                        # session ids in ExecutionManager.
                        if not _persistent_session_id and is_isolated:
                            _persistent_session_id = exec_id

                    return _on_event

                sub_id = self._event_bus.subscribe(
                    event_types=event_types,
                    handler=_make_handler(ep_id, exclude_own),
                    filter_stream=tc.get("filter_stream"),
                    filter_node=tc.get("filter_node"),
                    filter_graph=tc.get("filter_graph"),
                )
                self._event_subscriptions.append(sub_id)

            # Start timer-driven entry points
            # NOTE(review): timer tasks are created before self._running is set
            # True below; a "run_immediately" task scheduled early could observe
            # _running == False in its while-loop and exit — confirm ordering.
            await self._start_timers()

            # Start skill hot-reload watcher (no-op if watchfiles not installed)
            await self._skills_manager.start_watching()

            self._running = True
            self._timers_paused = False
            n_stages = len(self._pipeline.stages)
            logger.info(
                "AgentHost started with %d streams, %d pipeline stages",
                len(self._streams),
                n_stages,
            )

    async def _start_timers(self) -> None:
        """Start timer-driven entry points (extracted from start())."""
        for ep_id, spec in self._entry_points.items():
            if spec.trigger_type != "timer":
                continue
            tc = spec.trigger_config
            cron_expr = tc.get("cron")
            _raw_interval = tc.get("interval_minutes")
            interval = float(_raw_interval) if _raw_interval is not None else None
            run_immediately = tc.get("run_immediately", False)
            if cron_expr:
                # Cron expression mode — takes priority over interval_minutes
                try:
                    from croniter import croniter
                except ImportError as e:
                    raise RuntimeError(
                        "croniter is required for cron-based entry points. Install it with: uv pip install croniter"
                    ) from e
                try:
                    if not croniter.is_valid(cron_expr):
                        raise ValueError(f"Invalid cron expression: {cron_expr}")
                except ValueError as e:
                    logger.warning(
                        "Entry point '%s' has invalid cron config: %s",
                        ep_id,
                        e,
                    )
                    continue

                def _make_cron_timer(
                    entry_point_id: str,
                    expr: str,
                    immediate: bool,
                    idle_timeout: float = 300,
                ):
                    # Factory closure binds the loop variables per entry point
                    # (avoids Python's late-binding closure pitfall).
                    async def _cron_loop():
                        from croniter import croniter

                        # Isolated entry points reuse one session across ticks.
                        _persistent_session_id: str | None = None
                        if not immediate:
                            cron = croniter(expr, datetime.now())
                            next_dt = cron.get_next(datetime)
                            sleep_secs = (next_dt - datetime.now()).total_seconds()
                            self._timer_next_fire[entry_point_id] = time.monotonic() + sleep_secs
                            await asyncio.sleep(max(0, sleep_secs))
                        while self._running:
                            # Calculate next fire time upfront (used by skip paths too)
                            cron = croniter(expr, datetime.now())
                            next_dt = cron.get_next(datetime)
                            sleep_secs = (next_dt - datetime.now()).total_seconds()
                            # Gate: skip tick if timers are explicitly paused
                            if self._timers_paused:
                                logger.debug(
                                    "Cron '%s': paused, skipping tick",
                                    entry_point_id,
                                )
                                self._timer_next_fire[entry_point_id] = time.monotonic() + sleep_secs
                                await asyncio.sleep(max(0, sleep_secs))
                                continue
                            # Gate: skip tick if ANY stream is actively working.
                            # If the execution is idle (no LLM/tool activity
                            # beyond idle_timeout) let the timer proceed —
                            # execute() will cancel the stale execution.
                            _any_active = False
                            _min_idle = float("inf")
                            for _s in self._streams.values():
                                if _s.active_execution_ids:
                                    _any_active = True
                                    _idle = _s.agent_idle_seconds
                                    if _idle < _min_idle:
                                        _min_idle = _idle
                            logger.info(
                                "Cron '%s': gate — active=%s, idle=%.1fs, timeout=%ds",
                                entry_point_id,
                                _any_active,
                                _min_idle,
                                idle_timeout,
                            )
                            if _any_active and _min_idle < idle_timeout:
                                logger.info(
                                    "Cron '%s': agent actively working, skipping tick",
                                    entry_point_id,
                                )
                                self._timer_next_fire[entry_point_id] = time.monotonic() + sleep_secs
                                await asyncio.sleep(max(0, sleep_secs))
                                continue
                            # Firing now: clear the advertised next-fire time.
                            self._timer_next_fire.pop(entry_point_id, None)
                            try:
                                ep_spec = self._entry_points.get(entry_point_id)
                                is_isolated = ep_spec and ep_spec.isolation_level == "isolated"
                                if is_isolated:
                                    if _persistent_session_id:
                                        session_state = {"resume_session_id": _persistent_session_id}
                                    else:
                                        session_state = None
                                else:
                                    session_state = self._get_primary_session_state(exclude_entry_point=entry_point_id)
                                    # Gate: skip tick if no active session
                                    # (shared timers piggyback on the primary
                                    # session; isolated ones may start fresh).
                                    if session_state is None:
                                        logger.debug(
                                            "Cron '%s': no active session, skipping",
                                            entry_point_id,
                                        )
                                        self._timer_next_fire[entry_point_id] = time.monotonic() + sleep_secs
                                        await asyncio.sleep(max(0, sleep_secs))
                                        continue
                                exec_id = await self.trigger(
                                    entry_point_id,
                                    {
                                        "event": {
                                            "source": "timer",
                                            "reason": "scheduled",
                                        }
                                    },
                                    session_state=session_state,
                                )
                                if not _persistent_session_id and is_isolated:
                                    _persistent_session_id = exec_id
                                logger.info(
                                    "Cron fired for entry point '%s' (expr: %s)",
                                    entry_point_id,
                                    expr,
                                )
                            except Exception:
                                logger.error(
                                    "Cron trigger failed for '%s'",
                                    entry_point_id,
                                    exc_info=True,
                                )
                            # Calculate next fire from now
                            cron = croniter(expr, datetime.now())
                            next_dt = cron.get_next(datetime)
                            sleep_secs = (next_dt - datetime.now()).total_seconds()
                            self._timer_next_fire[entry_point_id] = time.monotonic() + sleep_secs
                            await asyncio.sleep(max(0, sleep_secs))

                    return _cron_loop

                task = asyncio.create_task(
                    _make_cron_timer(
                        ep_id,
                        cron_expr,
                        run_immediately,
                        idle_timeout=float(tc.get("idle_timeout_seconds", 300)),
                    )()
                )
                self._timer_tasks.append(task)
                logger.info(
                    "Started cron timer for entry point '%s' with expression '%s'%s",
                    ep_id,
                    cron_expr,
                    " (immediate first run)" if run_immediately else "",
                )
            elif interval and interval > 0:
                # Fixed interval mode (original behavior)
                def _make_timer(
                    entry_point_id: str,
                    mins: float,
                    immediate: bool,
                    idle_timeout: float = 300,
                ):
                    async def _timer_loop():
                        interval_secs = mins * 60
                        # Isolated entry points reuse one session across ticks.
                        _persistent_session_id: str | None = None
                        if not immediate:
                            self._timer_next_fire[entry_point_id] = time.monotonic() + interval_secs
                            await asyncio.sleep(interval_secs)
                        while self._running:
                            # Gate: skip tick if timers are explicitly paused
                            if self._timers_paused:
                                logger.debug(
                                    "Timer '%s': paused, skipping tick",
                                    entry_point_id,
                                )
                                self._timer_next_fire[entry_point_id] = time.monotonic() + interval_secs
                                await asyncio.sleep(interval_secs)
                                continue
                            # Gate: skip tick if ANY stream is actively working.
                            _any_active = False
                            _min_idle = float("inf")
                            for _s in self._streams.values():
                                if _s.active_execution_ids:
                                    _any_active = True
                                    _idle = _s.agent_idle_seconds
                                    if _idle < _min_idle:
                                        _min_idle = _idle
                            logger.info(
                                "Timer '%s': gate — active=%s, idle=%.1fs, timeout=%ds",
                                entry_point_id,
                                _any_active,
                                _min_idle,
                                idle_timeout,
                            )
                            if _any_active and _min_idle < idle_timeout:
                                logger.info(
                                    "Timer '%s': agent actively working, skipping tick",
                                    entry_point_id,
                                )
                                self._timer_next_fire[entry_point_id] = time.monotonic() + interval_secs
                                await asyncio.sleep(interval_secs)
                                continue
                            # Firing now: clear the advertised next-fire time.
                            self._timer_next_fire.pop(entry_point_id, None)
                            try:
                                ep_spec = self._entry_points.get(entry_point_id)
                                is_isolated = ep_spec and ep_spec.isolation_level == "isolated"
                                if is_isolated:
                                    if _persistent_session_id:
                                        session_state = {"resume_session_id": _persistent_session_id}
                                    else:
                                        session_state = None
                                else:
                                    session_state = self._get_primary_session_state(exclude_entry_point=entry_point_id)
                                    # Gate: skip tick if no active session
                                    # (shared timers piggyback on the primary
                                    # session; isolated ones may start fresh).
                                    if session_state is None:
                                        logger.debug(
                                            "Timer '%s': no active session, skipping",
                                            entry_point_id,
                                        )
                                        self._timer_next_fire[entry_point_id] = time.monotonic() + interval_secs
                                        await asyncio.sleep(interval_secs)
                                        continue
                                exec_id = await self.trigger(
                                    entry_point_id,
                                    {
                                        "event": {
                                            "source": "timer",
                                            "reason": "scheduled",
                                        }
                                    },
                                    session_state=session_state,
                                )
                                if not _persistent_session_id and is_isolated:
                                    _persistent_session_id = exec_id
                                logger.info(
                                    "Timer fired for entry point '%s' (next in %s min)",
                                    entry_point_id,
                                    mins,
                                )
                            except Exception:
                                logger.error(
                                    "Timer trigger failed for '%s'",
                                    entry_point_id,
                                    exc_info=True,
                                )
                            self._timer_next_fire[entry_point_id] = time.monotonic() + interval_secs
                            await asyncio.sleep(interval_secs)

                    return _timer_loop

                task = asyncio.create_task(
                    _make_timer(
                        ep_id,
                        interval,
                        run_immediately,
                        idle_timeout=float(tc.get("idle_timeout_seconds", 300)),
                    )()
                )
                self._timer_tasks.append(task)
                logger.info(
                    "Started timer for entry point '%s' every %s min%s",
                    ep_id,
                    interval,
                    " (immediate first run)" if run_immediately else "",
                )
            else:
                logger.warning(
                    "Entry point '%s' has trigger_type='timer' "
                    "but no 'cron' or valid 'interval_minutes' in trigger_config",
                    ep_id,
                )

        # Register primary graph
        self._graphs[self._graph_id] = _GraphRegistration(
            graph=self.graph,
            goal=self.goal,
            entry_points=dict(self._entry_points),
            streams=dict(self._streams),
            storage_subpath="",
            event_subscriptions=list(self._event_subscriptions),
            timer_tasks=list(self._timer_tasks),
            timer_next_fire=self._timer_next_fire,
        )

    async def stop(self) -> None:
        """Stop the agent runtime and all streams."""
        if not self._running:
            return
        async with self._lock:
            # Stop secondary graphs first
            secondary_ids = [gid for gid in self._graphs if gid != self._graph_id]
            for gid in secondary_ids:
                await self._teardown_graph(gid)
            # Cancel primary timer tasks
            for task in self._timer_tasks:
                task.cancel()
            self._timer_tasks.clear()
            # Unsubscribe primary event-driven entry points
            for sub_id in self._event_subscriptions:
                self._event_bus.unsubscribe(sub_id)
            self._event_subscriptions.clear()
            # Stop webhook server
            if self._webhook_server:
                await self._webhook_server.stop()
                self._webhook_server = None
            # Stop all primary streams
            for stream in self._streams.values():
                await stream.stop()
            self._streams.clear()
            self._graphs.clear()
            # Stop skill hot-reload watcher
            await self._skills_manager.stop_watching()
            # Stop storage
            await self._storage.stop()
            self._running = False
            logger.info("AgentHost stopped")

    def pause_timers(self) -> None:
        """Pause all timer-driven entry points.

        Timers will skip their ticks until ``resume_timers()`` is called.
        """
        self._timers_paused = True
        logger.info("Timers paused")

    def resume_timers(self) -> None:
        """Resume timer-driven entry points after a pause."""
        self._timers_paused = False
        logger.info("Timers resumed")

    def _resolve_stream(
        self,
        entry_point_id: str,
        graph_id: str | None = None,
    ) -> ExecutionManager | None:
        """Find the stream for an entry point, searching the active graph first.

        Lookup order:
        1. If *graph_id* is given, search that graph only.
        2. Otherwise search the active graph (``active_graph_id``).
        3. Fall back to the primary graph's streams (``self._streams``).
        """
        if graph_id:
            reg = self._graphs.get(graph_id)
            return reg.streams.get(entry_point_id) if reg else None
        # Active graph
        target = self._active_graph_id
        if target != self._graph_id:
            reg = self._graphs.get(target)
            if reg:
                stream = reg.streams.get(entry_point_id)
                if stream is not None:
                    return stream
        # Primary graph (also stored in self._streams)
        return self._streams.get(entry_point_id)

    def _apply_pipeline_results(self) -> None:
        """Read typed attributes from pipeline stages after initialization."""
        for stage in self._pipeline.stages:
            name = stage.__class__.__name__
            if stage.tool_registry is not None:
                tools = list(stage.tool_registry.get_tools().values())
                if tools:
                    self._tools = tools
                    self._tool_executor = stage.tool_registry.get_executor()
                    logger.info("Pipeline: %d tools from %s", len(tools), name)
            # LLM only fills in if the caller did not supply one explicitly.
            if stage.llm is not None and self._llm is None:
                self._llm = stage.llm
                logger.info("Pipeline: LLM from %s", name)
            if stage.accounts_prompt:
                self._accounts_prompt = stage.accounts_prompt
                self._accounts_data = stage.accounts_data
                self._tool_provider_map = stage.tool_provider_map
            if stage.skills_manager is not None:
                self._skills_manager = stage.skills_manager

    @staticmethod
    def _load_pipeline_from_config():
        """Build pipeline from ``~/.hive/configuration.json`` ``pipeline`` key.

        Returns an empty pipeline if no config is set.
        """
        from framework.config import get_hive_config
        from framework.pipeline.registry import build_pipeline_from_config
        from framework.pipeline.runner import PipelineRunner

        config = get_hive_config()
        stages_config = config.get("pipeline", {}).get("stages", [])
        if not stages_config:
            return PipelineRunner([])
        return build_pipeline_from_config(stages_config)

    async def _reload_pipeline(self) -> None:
        """Hot-reload pipeline from config. Atomic swap."""
        new_pipeline = self._load_pipeline_from_config()
        await new_pipeline.initialize_all()
        self._pipeline = new_pipeline
        logger.info(
            "Pipeline reloaded: %d stages",
            len(new_pipeline.stages),
        )

    def _prune_idempotency_keys(self) -> None:
        """Prune expired idempotency keys based on TTL and max size."""
        ttl = self._config.idempotency_ttl_seconds
        if ttl > 0:
            cutoff = time.time() - ttl
            # Iterate a snapshot since we mutate the dict while scanning.
            for key, recorded_at in list(self._idempotency_times.items()):
                if recorded_at < cutoff:
                    self._idempotency_times.pop(key, None)
                    self._idempotency_keys.pop(key, None)
        max_keys = self._config.idempotency_max_keys
        if max_keys > 0:
            # OrderedDict preserves insertion order: evict oldest first.
            while len(self._idempotency_keys) > max_keys:
                old_key, _ = self._idempotency_keys.popitem(last=False)
                self._idempotency_times.pop(old_key, None)

    async def trigger(
        self,
        entry_point_id: str,
        input_data: dict[str, Any],
        correlation_id: str | None = None,
        session_state: dict[str, Any] | None = None,
        idempotency_key: str | None = None,
        graph_id: str | None = None,
    ) -> str:
        """
        Trigger execution at a specific entry point.

        Non-blocking - returns immediately with execution ID.

        Args:
            entry_point_id: Which entry point to trigger
            input_data: Input data for the execution
            correlation_id: Optional ID to correlate related executions
            session_state: Optional session state to resume from (with paused_at, memory)
            idempotency_key: Optional key for deduplication. If a trigger with
                the same key was already processed within the TTL window, the
                cached execution_id is returned instead of starting a new
                execution. Useful for webhook providers that retry on timeout.
            graph_id: Graph to trigger on. ``None`` uses the active graph first,
                then falls back to the primary graph.

        Returns:
            Execution ID for tracking

        Raises:
            ValueError: If entry point not found
            RuntimeError: If runtime not running
        """
        if not self._running:
            raise RuntimeError("AgentHost is not running")
        # Idempotency check: return cached execution_id for duplicate keys.
        if idempotency_key is not None:
            self._prune_idempotency_keys()
            cached = self._idempotency_keys.get(idempotency_key)
            if cached is not None:
                logger.debug(
                    "Idempotent trigger: key '%s' already seen, returning %s",
                    idempotency_key,
                    cached,
                )
                return cached
        # Run pipeline middleware (rate limiting, validation, cost guards, ...)
        # Raises PipelineRejectedError if any stage rejects.
        if self._pipeline.stages:
            from framework.pipeline.stage import PipelineContext

            pipeline_ctx = PipelineContext(
                entry_point_id=entry_point_id,
                input_data=input_data,
                correlation_id=correlation_id,
                session_state=session_state,
            )
            pipeline_ctx = await self._pipeline.run(pipeline_ctx)
            # Stages may have transformed the input_data.
            input_data = pipeline_ctx.input_data
        stream = self._resolve_stream(entry_point_id, graph_id)
        if stream is None:
            raise ValueError(f"Entry point '{entry_point_id}' not found")
        run_id = uuid.uuid4().hex[:12]
        exec_id = await stream.execute(input_data, correlation_id, session_state, run_id=run_id)
        # Cache after execute() so the value is always a real execution_id
        # that callers can use for tracking.
        if idempotency_key is not None:
            self._idempotency_keys[idempotency_key] = exec_id
            self._idempotency_times[idempotency_key] = time.time()
        return exec_id

    async def trigger_and_wait(
        self,
        entry_point_id: str,
        input_data: dict[str, Any],
        timeout: float | None = None,
        session_state: dict[str, Any] | None = None,
        idempotency_key: str | None = None,
    ) -> ExecutionResult | None:
        """
        Trigger execution and wait for completion.

        Args:
            entry_point_id: Which entry point to trigger
            input_data: Input data for the execution
            timeout: Maximum time to wait (seconds)
            session_state: Optional session state to resume from (with paused_at, memory)
            idempotency_key: Optional key for deduplication (see trigger() for details).
Returns: ExecutionResult or None if timeout """ exec_id = await self.trigger( entry_point_id, input_data, session_state=session_state, idempotency_key=idempotency_key, ) stream = self._resolve_stream(entry_point_id) if stream is None: raise ValueError(f"Entry point '{entry_point_id}' not found") return await stream.wait_for_completion(exec_id, timeout) # === MULTI-GRAPH MANAGEMENT === async def add_graph( self, graph_id: str, graph: "GraphSpec", goal: "Goal", entry_points: dict[str, EntryPointSpec], storage_subpath: str | None = None, ) -> None: """Load a secondary graph into this runtime session. Creates execution streams for the graph's entry points, sets up event/timer triggers, and registers the graph. Shares the same EventBus, state.json, and data directory as the primary graph. Can be called while the runtime is running. Args: graph_id: Unique identifier for the graph graph: Graph specification goal: Goal driving this graph's execution entry_points: Entry point specs (ep_id -> spec) storage_subpath: Relative path under session root for this graph's conversations/checkpoints. Defaults to ``"graphs/{graph_id}"``. Raises: ValueError: If graph_id already registered or entry node missing """ if graph_id in self._graphs: raise ValueError(f"Graph '{graph_id}' already registered") subpath = storage_subpath or f"graphs/{graph_id}" # Validate entry nodes exist in graph for _ep_id, spec in entry_points.items(): if graph.get_node(spec.entry_node) is None: raise ValueError(f"Entry node '{spec.entry_node}' not found in graph '{graph_id}'") # Secondary graphs get their own SessionStore AND RuntimeLogStore # so their sessions and logs don't pollute the worker's directories. 
        # Per-graph storage lives under the session root (e.g. "graphs/email_agent"),
        # with its own session store and runtime-log store.
        graph_base = self._session_store.base_path / subpath
        graph_session_store = SessionStore(graph_base)
        graph_log_store = RuntimeLogStore(graph_base / "runtime_logs")

        # Create streams for each entry point
        streams: dict[str, ExecutionManager] = {}
        for ep_id, spec in entry_points.items():
            stream = ExecutionManager(
                # Stream IDs are namespaced "<graph_id>::<ep_id>"; the streams
                # dict itself is keyed by the bare ep_id (see _GraphRegistration).
                stream_id=f"{graph_id}::{ep_id}",
                entry_spec=spec,
                graph=graph,
                goal=goal,
                state_manager=self._state_manager,
                storage=self._storage,
                outcome_aggregator=self._outcome_aggregator,
                event_bus=self._event_bus,
                llm=self._llm,
                tools=self._tools,
                tool_executor=self._tool_executor,
                result_retention_max=self._config.execution_result_max,
                result_retention_ttl_seconds=self._config.execution_result_ttl_seconds,
                runtime_log_store=graph_log_store,
                session_store=graph_session_store,
                checkpoint_config=self._checkpoint_config,
                graph_id=graph_id,
                accounts_prompt=self._accounts_prompt,
                accounts_data=self._accounts_data,
                tool_provider_map=self._tool_provider_map,
                skills_catalog_prompt=self.skills_catalog_prompt,
                protocols_prompt=self.protocols_prompt,
                skill_dirs=self.skill_dirs,
            )
            # Only start immediately when the runtime is already running;
            # otherwise streams start later with the runtime itself.
            if self._running:
                await stream.start()
            streams[ep_id] = stream

        # Set up event-driven subscriptions
        from framework.host.event_bus import EventType as _ET

        event_subs: list[str] = []
        for ep_id, spec in entry_points.items():
            if spec.trigger_type != "event":
                continue
            tc = spec.trigger_config
            event_types = [_ET(et) for et in tc.get("event_types", [])]
            if not event_types:
                # An "event" trigger without event_types can never fire.
                logger.warning(
                    "Entry point '%s::%s' has trigger_type='event' but no event_types in trigger_config",
                    graph_id,
                    ep_id,
                )
                continue
            namespaced_ep = f"{graph_id}::{ep_id}"
            exclude_own = tc.get("exclude_own_graph", False)

            # Factory so each subscription's handler captures its own
            # entry point / graph (avoids late-binding of loop variables).
            def _make_handler(entry_point_id: str, gid: str, _exclude_own: bool):
                # One persistent session per isolated entry point, reused
                # across events so continuous conversations work.
                _persistent_session_id: str | None = None

                async def _on_event(event):
                    nonlocal _persistent_session_id
                    # Ignore events after shutdown or graph removal.
                    if not self._running or gid not in self._graphs:
                        return
                    # Skip events from this graph's own executions
                    if _exclude_own and event.graph_id == gid:
                        return
                    reg = self._graphs[gid]
                    # Strip the "<graph_id>::" prefix to get the local ep id.
                    local_ep = entry_point_id.split("::", 1)[-1]
                    stream = reg.streams.get(local_ep)
                    if stream is None:
                        return
                    ep_spec = reg.entry_points.get(local_ep)
                    is_isolated = ep_spec and ep_spec.isolation_level == "isolated"
                    if is_isolated:
                        # Resume the isolated session if one exists; otherwise
                        # let the stream create a fresh one.
                        if _persistent_session_id:
                            session_state = {"resume_session_id": _persistent_session_id}
                        else:
                            session_state = None
                    else:
                        # Shared entry points join the primary user session.
                        session_state = self._get_primary_session_state(
                            local_ep,
                            source_graph_id=gid,
                        )
                    exec_id = await stream.execute(
                        {"event": event.to_dict()},
                        session_state=session_state,
                    )
                    # First isolated execution's id becomes the persistent
                    # session id for all subsequent events.
                    if not _persistent_session_id and is_isolated:
                        _persistent_session_id = exec_id

                return _on_event

            sub_id = self._event_bus.subscribe(
                event_types=event_types,
                handler=_make_handler(namespaced_ep, graph_id, exclude_own),
                filter_stream=tc.get("filter_stream"),
                filter_node=tc.get("filter_node"),
                filter_graph=tc.get("filter_graph"),
            )
            event_subs.append(sub_id)

        # Set up timer-driven entry points
        timer_tasks: list[asyncio.Task] = []
        timer_next_fire: dict[str, float] = {}
        for ep_id, spec in entry_points.items():
            if spec.trigger_type != "timer":
                continue
            tc = spec.trigger_config
            _raw_interval = tc.get("interval_minutes")
            interval = float(_raw_interval) if _raw_interval is not None else None
            run_immediately = tc.get("run_immediately", False)
            # NOTE: timers are only created when the runtime is already
            # running at registration time (mirrors stream.start() above).
            if interval and interval > 0 and self._running:
                logger.info(
                    "Creating timer for '%s::%s': interval=%s min, immediate=%s, loop=%s",
                    graph_id,
                    ep_id,
                    interval,
                    run_immediately,
                    id(asyncio.get_event_loop()),
                )

                # Factory binds loop variables per timer (same late-binding
                # concern as _make_handler above).
                def _make_timer(
                    gid: str,
                    local_ep: str,
                    mins: float,
                    immediate: bool,
                    idle_timeout: float = 300,
                ):
                    async def _timer_loop():
                        interval_secs = mins * 60
                        # For isolated entry points, reuse ONE session across
                        # all timer ticks so conversation_mode="continuous"
                        # actually works and we don't create N sessions.
                        _persistent_session_id: str | None = None
                        logger.info(
                            "Timer loop started for '%s::%s' (sleep %ss)",
                            gid,
                            local_ep,
                            interval_secs,
                        )
                        if not immediate:
                            # Publish the next fire time (monotonic clock) so
                            # get_timer_next_fire_in() can report a countdown.
                            timer_next_fire[local_ep] = time.monotonic() + interval_secs
                            await asyncio.sleep(interval_secs)
                        while self._running and gid in self._graphs:
                            # Gate: skip tick if timers are explicitly paused
                            if self._timers_paused:
                                logger.debug(
                                    "Timer '%s::%s': paused, skipping tick",
                                    gid,
                                    local_ep,
                                )
                                timer_next_fire[local_ep] = time.monotonic() + interval_secs
                                await asyncio.sleep(interval_secs)
                                continue
                            # Gate: skip tick if ANY stream in this graph is actively working.
                            _reg = self._graphs.get(gid)
                            _any_active = False
                            _min_idle = float("inf")
                            if _reg:
                                for _sid, _s in _reg.streams.items():
                                    if _s.active_execution_ids:
                                        _any_active = True
                                        _idle = _s.agent_idle_seconds
                                        if _idle < _min_idle:
                                            _min_idle = _idle
                            logger.info(
                                "Timer '%s::%s': gate — active=%s, idle=%.1fs, timeout=%ds",
                                gid,
                                local_ep,
                                _any_active,
                                _min_idle,
                                idle_timeout,
                            )
                            # Fire anyway if the busy stream has been idle
                            # longer than idle_timeout (likely stuck/waiting).
                            if _any_active and _min_idle < idle_timeout:
                                logger.info(
                                    "Timer '%s::%s': agent actively working, skipping tick",
                                    gid,
                                    local_ep,
                                )
                                timer_next_fire[local_ep] = time.monotonic() + interval_secs
                                await asyncio.sleep(interval_secs)
                                continue
                            logger.info("Timer firing for '%s::%s'", gid, local_ep)
                            # Clear the countdown while the tick executes.
                            timer_next_fire.pop(local_ep, None)
                            try:
                                reg = self._graphs.get(gid)
                                if not reg:
                                    logger.warning("Timer: no reg for '%s', stopping", gid)
                                    break
                                stream = reg.streams.get(local_ep)
                                if not stream:
                                    logger.warning("Timer: no stream '%s' in '%s', stopping", local_ep, gid)
                                    break
                                # Isolated entry points get their own session;
                                # shared ones join the primary session.
                                ep_spec = reg.entry_points.get(local_ep)
                                if ep_spec and ep_spec.isolation_level == "isolated":
                                    if _persistent_session_id:
                                        session_state = {"resume_session_id": _persistent_session_id}
                                    else:
                                        session_state = None
                                else:
                                    session_state = self._get_primary_session_state(local_ep, source_graph_id=gid)
                                # Gate: skip tick if no active session
                                # NOTE(review): for an isolated entry point the
                                # FIRST tick also has session_state=None (no
                                # persistent session yet), so this gate appears
                                # to skip it forever — confirm this is intended
                                # and not meant to apply to shared EPs only.
                                if session_state is None:
                                    logger.debug(
                                        "Timer '%s::%s': no active session, skipping",
                                        gid,
                                        local_ep,
                                    )
                                    timer_next_fire[local_ep] = time.monotonic() + interval_secs
                                    await asyncio.sleep(interval_secs)
                                    continue
                                exec_id = await stream.execute(
                                    {"event": {"source": "timer", "reason": "scheduled"}},
                                    session_state=session_state,
                                )
                                # Remember session ID for reuse on next tick
                                if not _persistent_session_id and ep_spec and ep_spec.isolation_level == "isolated":
                                    _persistent_session_id = exec_id
                            except Exception:
                                # Keep the timer alive across failed ticks.
                                logger.error(
                                    "Timer trigger failed for '%s::%s'",
                                    gid,
                                    local_ep,
                                    exc_info=True,
                                )
                            timer_next_fire[local_ep] = time.monotonic() + interval_secs
                            await asyncio.sleep(interval_secs)
                        logger.info("Timer loop exited for '%s::%s'", gid, local_ep)

                    return _timer_loop

                task = asyncio.create_task(
                    _make_timer(
                        graph_id,
                        ep_id,
                        interval,
                        run_immediately,
                        idle_timeout=float(tc.get("idle_timeout_seconds", 300)),
                    )()
                )
                timer_tasks.append(task)
                logger.info("Timer task created for '%s::%s': %s", graph_id, ep_id, task)

        # Register everything so remove_graph/_teardown_graph can undo it.
        self._graphs[graph_id] = _GraphRegistration(
            graph=graph,
            goal=goal,
            entry_points=entry_points,
            streams=streams,
            storage_subpath=subpath,
            event_subscriptions=event_subs,
            timer_tasks=timer_tasks,
            timer_next_fire=timer_next_fire,
        )
        logger.info(
            "Added graph '%s' with %d entry points (%d streams)",
            graph_id,
            len(entry_points),
            len(streams),
        )

    async def remove_graph(self, graph_id: str) -> None:
        """Remove a secondary graph from this runtime session.

        Stops all streams, cancels timers, unsubscribes events, and
        removes the registration. Cannot remove the primary graph.

        Args:
            graph_id: Graph to remove

        Raises:
            ValueError: If graph_id is the primary graph or not found
        """
        if graph_id == self._graph_id:
            raise ValueError("Cannot remove the primary graph")
        if graph_id not in self._graphs:
            raise ValueError(f"Graph '{graph_id}' not found")
        await self._teardown_graph(graph_id)
        logger.info("Removed graph '%s'", graph_id)

    async def _teardown_graph(self, graph_id: str) -> None:
        """Internal: stop and clean up all resources for a graph.

        Safe to call for an already-removed graph (no-op).
        """
        reg = self._graphs.pop(graph_id, None)
        if reg is None:
            return
        # Cancel timers
        # NOTE(review): cancelled timer tasks are not awaited here, so a
        # tick in flight may still be finishing after teardown returns —
        # confirm callers tolerate that.
        for task in reg.timer_tasks:
            task.cancel()
        # Unsubscribe events
        for sub_id in reg.event_subscriptions:
            self._event_bus.unsubscribe(sub_id)
        # Stop streams
        for stream in reg.streams.values():
            await stream.stop()
        # Reset active graph if it was the removed one
        if self._active_graph_id == graph_id:
            self._active_graph_id = self._graph_id

    def list_graphs(self) -> list[str]:
        """Return all registered graph IDs (primary first)."""
        result: list[str] = []
        if self._graph_id in self._graphs:
            result.append(self._graph_id)
        for gid in self._graphs:
            if gid != self._graph_id:
                result.append(gid)
        return result

    @property
    def graph_id(self) -> str:
        """The primary graph's ID."""
        return self._graph_id

    @property
    def colony_id(self) -> str:
        """Colony compatibility — returns the primary graph ID."""
        return self._graph_id

    def list_workers(self) -> list[str]:
        """Colony compatibility — returns registered graph IDs."""
        return self.list_graphs()

    def get_worker_registration(self, graph_id: str):
        """Colony compatibility — returns self for the matching graph."""
        if graph_id in self._graphs:
            return self
        return None

    @property
    def streams(self) -> dict:
        """Colony compatibility — returns _streams dict."""
        return self._streams

    @property
    def active_graph_id(self) -> str:
        """The currently focused graph (for TUI routing)."""
        return self._active_graph_id

    @active_graph_id.setter
    def active_graph_id(self, value: str) -> None:
        # Only registered graphs may become the active (focused) graph.
        if value not in self._graphs:
            raise ValueError(f"Graph '{value}' not registered")
        self._active_graph_id = value

    def get_active_graph(self) -> "GraphSpec":
        """Return the GraphSpec for the currently active graph.

        Falls back to the primary graph if the active id is unregistered.
        """
        if self._active_graph_id == self._graph_id:
            return self.graph
        reg = self._graphs.get(self._active_graph_id)
        if reg is not None:
            return reg.graph
        return self.graph

    @property
    def user_idle_seconds(self) -> float:
        """Seconds since the user last provided input.

        Returns ``float('inf')`` if no input has been received yet.
        """
        # 0.0 is the sentinel for "never received input" (set in inject_input).
        if self._last_user_input_time == 0.0:
            return float("inf")
        return time.monotonic() - self._last_user_input_time

    @property
    def agent_idle_seconds(self) -> float:
        """Seconds since any stream last had activity (LLM call, tool call, etc.).

        Returns the *minimum* idle time across all streams with active
        executions. Returns ``float('inf')`` if nothing is running.
        """
        min_idle = float("inf")
        for reg in self._graphs.values():
            for stream in reg.streams.values():
                # presumably stream.agent_idle_seconds is inf for idle
                # streams so iterating all of them is equivalent — verify.
                idle = stream.agent_idle_seconds
                if idle < min_idle:
                    min_idle = idle
        return min_idle

    def get_graph_registration(self, graph_id: str) -> _GraphRegistration | None:
        """Get the registration for a specific graph (or None)."""
        return self._graphs.get(graph_id)

    def cancel_all_tasks(self, loop: asyncio.AbstractEventLoop) -> bool:
        """Cancel all running execution tasks across all graphs.

        Schedules the cancellation on *loop* (the agent event loop) so that
        ``_execution_tasks`` is only read from the thread that owns it,
        avoiding cross-thread dict access.

        Safe to call from any thread (e.g. the Textual UI thread). Blocks
        the caller for up to 5 seconds waiting for the result. For async
        callers, use :meth:`cancel_all_tasks_async` instead.
        """
        future = asyncio.run_coroutine_threadsafe(self.cancel_all_tasks_async(), loop)
        try:
            return future.result(timeout=5)
        except Exception:
            # Timeout or loop failure — report "nothing cancelled".
            logger.warning("cancel_all_tasks: timed out or failed")
            return False

    async def cancel_all_tasks_async(self) -> bool:
        """Cancel all running execution tasks (runs on the agent loop).

        Iterates ``_execution_tasks`` and calls ``task.cancel()`` directly.
        Must be awaited on the agent event loop so dict access is thread-safe.

        Returns True if at least one task was cancelled.
        """
        cancelled = False
        for gid in self.list_graphs():
            reg = self.get_graph_registration(gid)
            if reg:
                for stream in reg.streams.values():
                    # Copy values() — cancellation may mutate the dict.
                    for task in list(stream._execution_tasks.values()):
                        if task and not task.done():
                            task.cancel()
                            cancelled = True
        return cancelled

    async def stop_all_workers(self) -> bool:
        """Alias for ``cancel_all_tasks_async`` used by queen-lifecycle tools.

        Queen tools (``stop_worker``, ``switch_to_reviewing``, etc.) call
        ``runtime.stop_all_workers()`` which is the :class:`ColonyRuntime`
        idiom. In the current architecture the session's runtime is an
        :class:`AgentHost`, which stops workers by cancelling their
        execution tasks. This alias bridges the two interfaces.
        """
        return await self.cancel_all_tasks_async()

    def _get_primary_session_state(
        self,
        exclude_entry_point: str,
        *,
        source_graph_id: str | None = None,
    ) -> dict[str, Any] | None:
        """Build session_state so an async entry point runs in the primary session.

        Looks for an active execution from another stream (the "primary"
        session, e.g. the user-facing intake loop) and returns a
        ``session_state`` dict containing:

        - ``resume_session_id``: reuse the same session directory
        - ``data_buffer``: only the keys that the async entry node declares
          as inputs (e.g. ``rules``, ``max_emails``). Stale outputs from
          previous runs (``emails``, ``actions_taken``, …) are excluded so
          each trigger starts fresh.

        The data buffer is read from the primary session's ``state.json``
        which is kept up-to-date by ``GraphExecutor._write_progress()`` at
        every node transition.

        Searches across ALL graphs' streams (primary + secondary) so
        event-driven entry points on secondary graphs can share the primary
        session.

        Args:
            exclude_entry_point: Entry point ID to skip (the one being triggered)
            source_graph_id: Graph the exclude_entry_point belongs to (for
                resolving the entry node spec). Defaults to primary graph.

        Returns ``None`` if no primary session is active (the webhook
        execution will just create its own session).
        """
        import json as _json

        # Determine which data buffer keys the async entry node needs.
        allowed_keys: set[str] | None = None
        # Look up the entry node from the correct graph
        src_graph_id = source_graph_id or self._graph_id
        src_reg = self._graphs.get(src_graph_id)
        ep_spec = (
            src_reg.entry_points.get(exclude_entry_point)
            if src_reg
            else self._entry_points.get(exclude_entry_point)
        )
        if ep_spec:
            graph = src_reg.graph if src_reg else self.graph
            entry_node = graph.get_node(ep_spec.entry_node)
            if entry_node and entry_node.input_keys:
                allowed_keys = set(entry_node.input_keys)
        # Search primary graph's streams for an active session.
        # Skip isolated streams — they have their own session directories
        # and must never be used as a shared session.
        all_streams: list[tuple[str, ExecutionManager]] = []
        for _gid, reg in self._graphs.items():
            for ep_id, stream in reg.streams.items():
                # Skip isolated entry points — they run in their own namespace
                ep_spec = reg.entry_points.get(ep_id)
                if ep_spec and getattr(ep_spec, "isolation_level", "shared") == "isolated":
                    continue
                all_streams.append((ep_id, stream))
        for ep_id, stream in all_streams:
            if ep_id == exclude_entry_point:
                continue
            for exec_id in stream.active_execution_ids:
                # Session directories are named by execution id.
                state_path = self._storage.base_path / "sessions" / exec_id / "state.json"
                try:
                    if state_path.exists():
                        data = _json.loads(state_path.read_text(encoding="utf-8"))
                        # "memory" is the legacy key for the data buffer.
                        full_buffer = data.get("data_buffer", data.get("memory", {}))
                        if not full_buffer:
                            continue
                        # Filter to only input keys so stale outputs
                        # from previous triggers don't leak through.
                        if allowed_keys is not None:
                            buffer_data = {k: v for k, v in full_buffer.items() if k in allowed_keys}
                        else:
                            buffer_data = full_buffer
                        if buffer_data:
                            return {
                                "resume_session_id": exec_id,
                                "data_buffer": buffer_data,
                            }
                except Exception:
                    # Best-effort: unreadable state just means this
                    # candidate session is skipped.
                    logger.debug(
                        "Could not read state.json for %s: skipping",
                        exec_id,
                        exc_info=True,
                    )
        return None

    async def inject_input(
        self,
        node_id: str,
        content: str,
        graph_id: str | None = None,
        *,
        is_client_input: bool = False,
        image_content: list[dict[str, Any]] | None = None,
    ) -> bool:
        """Inject user input into a running client-facing node.

        Routes input to the EventLoopNode identified by ``node_id``.
        Searches the specified graph (or active graph) first, then all
        others.

        Args:
            node_id: The node currently waiting for input
            content: The user's input text
            graph_id: Optional graph to search first (defaults to active graph)
            is_client_input: True when the message originates from a real
                human user (e.g. /chat endpoint), False for external events.
            image_content: Optional list of image content blocks (OpenAI
                image_url format) to include alongside the text.

        Returns:
            True if input was delivered, False if no matching node found
        """
        # Track user presence
        self._last_user_input_time = time.monotonic()
        # Search target graph first
        target = graph_id or self._active_graph_id
        if target in self._graphs:
            for stream in self._graphs[target].streams.values():
                if await stream.inject_input(
                    node_id, content, is_client_input=is_client_input, image_content=image_content
                ):
                    return True
        # Then search all other graphs
        for gid, reg in self._graphs.items():
            if gid == target:
                continue
            for stream in reg.streams.values():
                if await stream.inject_input(
                    node_id, content, is_client_input=is_client_input, image_content=image_content
                ):
                    return True
        return False

    async def get_goal_progress(self) -> dict[str, Any]:
        """
        Evaluate goal progress across all streams.

        Returns:
            Progress report including overall progress, criteria status,
            constraint violations, and metrics.
        """
        return await self._outcome_aggregator.evaluate_goal_progress()

    async def cancel_execution(
        self,
        entry_point_id: str,
        execution_id: str,
        graph_id: str | None = None,
    ) -> bool:
        """
        Cancel a running execution.

        Args:
            entry_point_id: Stream containing the execution
            execution_id: Execution to cancel
            graph_id: Graph to search (defaults to active graph)

        Returns:
            True if cancelled, False if not found
        """
        stream = self._resolve_stream(entry_point_id, graph_id)
        if stream is None:
            return False
        return await stream.cancel_execution(execution_id)

    # === QUERY OPERATIONS ===

    def get_entry_points(self, graph_id: str | None = None) -> list[EntryPointSpec]:
        """Get entry points for a graph.

        Args:
            graph_id: Graph to query. ``None`` (default) uses the currently
                active graph (``active_graph_id``).

        Returns:
            List of EntryPointSpec for the requested graph. Falls back to
            the primary graph if the graph_id is not found.
        """
        gid = graph_id or self._active_graph_id
        if gid == self._graph_id:
            return list(self._entry_points.values())
        reg = self._graphs.get(gid)
        if reg is not None:
            return list(reg.entry_points.values())
        # Fallback: primary graph
        return list(self._entry_points.values())

    def get_timer_next_fire_in(self, entry_point_id: str) -> float | None:
        """Return seconds until the next timer fire for *entry_point_id*.

        Checks the primary graph's ``_timer_next_fire`` dict as well as all
        registered secondary graphs. Returns ``None`` when no fire time is
        recorded (e.g. the timer is currently executing or the entry point
        is not a timer).
        """
        mono = self._timer_next_fire.get(entry_point_id)
        if mono is not None:
            # Stored values are monotonic-clock deadlines; clamp at 0.
            return max(0.0, mono - time.monotonic())
        for reg in self._graphs.values():
            mono = reg.timer_next_fire.get(entry_point_id)
            if mono is not None:
                return max(0.0, mono - time.monotonic())
        return None

    def get_stream(self, entry_point_id: str) -> ExecutionManager | None:
        """Get a specific execution stream."""
        return self._streams.get(entry_point_id)

    def find_awaiting_node(self) -> tuple[str | None, str | None]:
        """Find a node that is currently awaiting user input.

        Searches all graphs and their streams for any active executor whose
        node has ``_awaiting_input`` set to ``True``.

        Returns:
            (node_id, graph_id) if found, else (None, None).
        """
        for graph_id, reg in self._graphs.items():
            for stream in reg.streams.values():
                for executor in stream._active_executors.values():
                    for node_id, node in executor.node_registry.items():
                        if getattr(node, "_awaiting_input", False):
                            # Skip escalation receivers — those are handled
                            # by the queen via inject_message(), not
                            # by the user directly.
                            if ":escalation:" in node_id:
                                continue
                            return node_id, graph_id
        return None, None

    def get_execution_result(
        self,
        entry_point_id: str,
        execution_id: str,
        graph_id: str | None = None,
    ) -> ExecutionResult | None:
        """Get result of a completed execution."""
        stream = self._resolve_stream(entry_point_id, graph_id)
        if stream:
            return stream.get_result(execution_id)
        return None

    # === EVENT SUBSCRIPTIONS ===

    def subscribe_to_events(
        self,
        event_types: list,
        handler: Callable,
        filter_stream: str | None = None,
        filter_graph: str | None = None,
    ) -> str:
        """
        Subscribe to agent events.

        Args:
            event_types: Types of events to receive
            handler: Async function to call when event occurs
            filter_stream: Only receive events from this stream
            filter_graph: Only receive events from this graph

        Returns:
            Subscription ID (use to unsubscribe)
        """
        return self._event_bus.subscribe(
            event_types=event_types,
            handler=handler,
            filter_stream=filter_stream,
            filter_graph=filter_graph,
        )

    def unsubscribe_from_events(self, subscription_id: str) -> bool:
        """Unsubscribe from events."""
        return self._event_bus.unsubscribe(subscription_id)

    # === STATS AND MONITORING ===

    def get_stats(self) -> dict:
        """Get comprehensive runtime statistics.

        Aggregates per-stream stats plus component-level stats from the
        outcome aggregator, event bus, and state manager.
        """
        stream_stats = {}
        for ep_id, stream in self._streams.items():
            stream_stats[ep_id] = stream.get_stats()
        return {
            "running": self._running,
            "entry_points": len(self._entry_points),
            "streams": stream_stats,
            "goal_id": self.goal.id,
            "outcome_aggregator": self._outcome_aggregator.get_stats(),
            "event_bus": self._event_bus.get_stats(),
            "state_manager": self._state_manager.get_stats(),
        }

    def get_active_streams(self) -> list[dict[str, Any]]:
        """Return metadata for every stream that has active executions.

        Each dict contains: ``graph_id``, ``stream_id``, ``entry_point_id``,
        ``active_execution_ids``, ``is_awaiting_input``, ``waiting_nodes``.
        """
        result: list[dict[str, Any]] = []
        for graph_id, reg in self._graphs.items():
            for ep_id, stream in reg.streams.items():
                active = stream.active_execution_ids
                if not active:
                    continue
                result.append(
                    {
                        "graph_id": graph_id,
                        "stream_id": stream.stream_id,
                        "entry_point_id": ep_id,
                        "active_execution_ids": active,
                        "is_awaiting_input": stream.is_awaiting_input,
                        "waiting_nodes": stream.get_waiting_nodes(),
                    }
                )
        return result

    def get_waiting_nodes(self) -> list[dict[str, Any]]:
        """Return all nodes currently blocked waiting for client input.

        Each dict contains: ``graph_id``, ``stream_id``, ``node_id``,
        ``execution_id``.
        """
        result: list[dict[str, Any]] = []
        for graph_id, reg in self._graphs.items():
            for _ep_id, stream in reg.streams.items():
                for waiting in stream.get_waiting_nodes():
                    result.append(
                        {
                            "graph_id": graph_id,
                            "stream_id": stream.stream_id,
                            **waiting,
                        }
                    )
        return result

    # === PROPERTIES ===

    @property
    def state_manager(self) -> SharedBufferManager:
        """Access the shared state manager."""
        return self._state_manager

    @property
    def event_bus(self) -> EventBus:
        """Access the event bus."""
        return self._event_bus

    @property
    def outcome_aggregator(self) -> OutcomeAggregator:
        """Access the outcome aggregator."""
        return self._outcome_aggregator

    @property
    def webhook_server(self) -> Any:
        """Access the webhook server (None if no webhook entry points)."""
        return self._webhook_server

    @property
    def timers_paused(self) -> bool:
        """True when timer-driven entry points are paused (e.g. by stop_worker)."""
        return self._timers_paused

    @property
    def is_running(self) -> bool:
        """Check if runtime is running."""
        return self._running

    # === CONVENIENCE FACTORY ===