"""Worker — a single autonomous AgentLoop clone in a colony. Two modes: **Ephemeral (default)**: runs a single AgentLoop execution with a task, emits a `SUBAGENT_REPORT` event on termination (success, partial, or failed), and terminates. Used for parallel fan-out from the overseer. **Persistent (``persistent=True``)**: runs an initial AgentLoop execution (usually idle, no task) and then loops forever, receiving user chat via ``inject(message)`` and pumping each message into the already-running agent loop via ``inject_event``. Used for the colony's long-running client-facing overseer. """ from __future__ import annotations import asyncio import logging import time from dataclasses import dataclass, field from enum import StrEnum from pathlib import Path from typing import Any logger = logging.getLogger(__name__) class WorkerStatus(StrEnum): PENDING = "pending" RUNNING = "running" COMPLETED = "completed" FAILED = "failed" STOPPED = "stopped" @dataclass class WorkerResult: output: dict[str, Any] = field(default_factory=dict) error: str | None = None tokens_used: int = 0 duration_seconds: float = 0.0 # New: structured report fields. Populated by report_to_parent tool or # synthesised from AgentResult on termination. status: str = "success" # "success" | "partial" | "failed" | "timeout" | "stopped" summary: str = "" data: dict[str, Any] = field(default_factory=dict) @dataclass class WorkerInfo: id: str task: str status: WorkerStatus started_at: float = 0.0 result: WorkerResult | None = None class Worker: """A single autonomous clone in a colony. Ephemeral mode (default): - PENDING → RUNNING → COMPLETED/FAILED/STOPPED, one shot, terminates. Persistent mode (``persistent=True``, used by the overseer): - PENDING → RUNNING (never transitions out by itself). - Receives user chat via ``inject(message)``. - Each injected message is pumped into the running AgentLoop via ``inject_event``, triggering another turn. """ def __init__( self, worker_id: str, task: str, agent_loop: Any, context: Any, event_bus: Any = None, colony_id: str = "", persistent: bool = False, storage_path: Path | None = None, ): self.id = worker_id self.task = task self.status = WorkerStatus.PENDING self._agent_loop = agent_loop self._context = context self._event_bus = event_bus self._colony_id = colony_id self._persistent = persistent # Canonical on-disk home for this worker (conversations, events, # result.json, data). Required when seed_conversation() is used — # we deliberately do NOT fall back to CWD, which previously caused # conversation parts to leak into the process working directory. self._storage_path: Path | None = Path(storage_path) if storage_path is not None else None self._task_handle: asyncio.Task | None = None self._started_at: float = 0.0 self._result: WorkerResult | None = None self._input_queue: asyncio.Queue[str | None] = asyncio.Queue() # Set by AgentLoop when the worker's LLM calls ``report_to_parent``. # Takes precedence over the synthesised report from AgentResult. self._explicit_report: dict[str, Any] | None = None # Back-reference so AgentLoop's report_to_parent handler can call # record_explicit_report on the owning Worker. The agent_loop's # _owner_worker attribute is set here during construction. if agent_loop is not None: agent_loop._owner_worker = self @property def info(self) -> WorkerInfo: return WorkerInfo( id=self.id, task=self.task, status=self.status, started_at=self._started_at, result=self._result, ) @property def is_active(self) -> bool: return self.status in (WorkerStatus.PENDING, WorkerStatus.RUNNING) @property def is_persistent(self) -> bool: return self._persistent @property def agent_loop(self) -> Any: """The wrapped AgentLoop. Used by the SessionManager chat path.""" return self._agent_loop # ------------------------------------------------------------------ # Lifecycle # ------------------------------------------------------------------ async def run(self) -> WorkerResult: """Entry point for the worker's background task. Ephemeral workers run ``AgentLoop.execute`` once and terminate, emitting a ``SUBAGENT_REPORT`` event. Persistent workers run the initial execute then loop forever processing injected user messages. """ self.status = WorkerStatus.RUNNING self._started_at = time.monotonic() try: result = await self._agent_loop.execute(self._context) duration = time.monotonic() - self._started_at if result.success: self.status = WorkerStatus.COMPLETED self._result = self._build_result(result, duration, default_status="success") else: self.status = WorkerStatus.FAILED self._result = self._build_result(result, duration, default_status="failed") await self._emit_terminal_events(result) if self._persistent: # Persistent worker: keep the loop alive, pump injected # messages forever. Status stays RUNNING; info reflects # current progress. self.status = WorkerStatus.RUNNING await self._persistent_input_loop() return self._result # type: ignore[return-value] except asyncio.CancelledError: self.status = WorkerStatus.STOPPED duration = time.monotonic() - self._started_at self._result = WorkerResult( error="Worker stopped by queen", duration_seconds=duration, status="stopped", summary="Worker was cancelled before completion.", ) await self._emit_terminal_events(None, force_status="stopped") return self._result except Exception as exc: self.status = WorkerStatus.FAILED duration = time.monotonic() - self._started_at self._result = WorkerResult( error=str(exc), duration_seconds=duration, status="failed", summary=f"Worker crashed: {exc}", ) logger.error("Worker %s failed: %s", self.id, exc, exc_info=True) await self._emit_terminal_events(None, force_status="failed") return self._result async def _persistent_input_loop(self) -> None: """Pump injected messages into the running AgentLoop forever. Each ``inject(msg)`` call puts a string on ``_input_queue``. This loop awaits it and calls ``agent_loop.inject_event(msg)`` which wakes the loop's pending user-input gate. """ while True: msg = await self._input_queue.get() if msg is None: # Sentinel: shutdown return try: await self._agent_loop.inject_event(msg, is_client_input=True) except Exception: logger.exception( "Overseer %s: inject_event failed for injected message", self.id, ) # ------------------------------------------------------------------ # Reporting # ------------------------------------------------------------------ def record_explicit_report( self, status: str, summary: str, data: dict[str, Any] | None = None, ) -> None: """Called by AgentLoop when the worker's LLM invokes ``report_to_parent``. Stores the report so that when ``run()`` reaches the termination block, the explicit report wins over a synthesised one. """ self._explicit_report = { "status": status, "summary": summary, "data": data or {}, } def _build_result( self, agent_result: Any, duration: float, default_status: str, ) -> WorkerResult: """Construct a WorkerResult from AgentResult + optional explicit report.""" explicit = self._explicit_report if explicit is not None: return WorkerResult( output=dict(agent_result.output or {}), error=agent_result.error, tokens_used=getattr(agent_result, "tokens_used", 0), duration_seconds=duration, status=explicit["status"], summary=explicit["summary"], data=explicit["data"], ) # Synthesise a minimal report from AgentResult if agent_result.success: summary = f"Completed task '{self.task[:80]}' with {len(agent_result.output or {})} outputs." data = dict(agent_result.output or {}) else: summary = f"Task '{self.task[:80]}' failed: {agent_result.error or 'unknown'}" data = {} return WorkerResult( output=dict(agent_result.output or {}), error=agent_result.error, tokens_used=getattr(agent_result, "tokens_used", 0), duration_seconds=duration, status=default_status, summary=summary, data=data, ) async def _emit_terminal_events( self, agent_result: Any, force_status: str | None = None, ) -> None: """Emit EXECUTION_COMPLETED/FAILED AND SUBAGENT_REPORT on termination. Both events are published so that consumers that listen for either shape keep working. The SUBAGENT_REPORT carries the structured summary the overseer actually cares about. """ if self._event_bus is None: return from framework.host.event_bus import AgentEvent, EventType # EXECUTION_COMPLETED / EXECUTION_FAILED (backwards-compat) if agent_result is not None: lifecycle_type = EventType.EXECUTION_COMPLETED if agent_result.success else EventType.EXECUTION_FAILED await self._event_bus.publish( AgentEvent( type=lifecycle_type, stream_id=self._context.stream_id or self.id, node_id=self.id, execution_id=self._context.execution_id or self.id, data={ "worker_id": self.id, "colony_id": self._colony_id, "task": self.task, "success": agent_result.success, "error": agent_result.error, "output_keys": (list(agent_result.output.keys()) if agent_result.output else []), }, ) ) # SUBAGENT_REPORT — the structured channel the overseer awaits result = self._result if result is None: return await self._event_bus.publish( AgentEvent( type=EventType.SUBAGENT_REPORT, stream_id=self._context.stream_id or self.id, node_id=self.id, execution_id=self._context.execution_id or self.id, data={ "worker_id": self.id, "colony_id": self._colony_id, "task": self.task, "status": force_status or result.status, "summary": result.summary, "data": result.data, "error": result.error, "duration_seconds": result.duration_seconds, "tokens_used": result.tokens_used, }, ) ) # ------------------------------------------------------------------ # External control # ------------------------------------------------------------------ async def start_background(self) -> None: """Spawn the worker's run() as an asyncio background task.""" self._task_handle = asyncio.create_task(self.run(), name=f"worker:{self.id}") # Surface any exception that escapes run(); without this callback # a crash here only becomes visible when stop() eventually awaits # the handle (and is silently lost if stop() is never called). self._task_handle.add_done_callback(self._on_task_done) def _on_task_done(self, task: asyncio.Task) -> None: if task.cancelled(): return exc = task.exception() if exc is not None: logger.error( "Worker '%s' background task crashed: %s", self.id, exc, exc_info=exc, ) async def stop(self) -> None: """Cancel the worker's background task, if any.""" if self._persistent: # Signal the input loop to exit cleanly first await self._input_queue.put(None) if self._task_handle and not self._task_handle.done(): self._task_handle.cancel() try: await self._task_handle except asyncio.CancelledError: pass async def inject(self, message: str) -> None: """Pump a user message into the worker. For ephemeral workers this is rarely used (they don't take follow-up input). For persistent overseers this is the chat injection path. """ await self._input_queue.put(message) async def seed_conversation(self, messages: list[dict[str, Any]]) -> None: """Pre-populate the worker's ConversationStore before starting. Used when forking a queen DM into a colony: the DM's prior conversation becomes the colony overseer's starting point so the overseer resumes mid-thought instead of greeting the user fresh. ``messages`` is a list of dicts matching the ConversationStore's part format: ``{seq, role, content, tool_calls, tool_use_id, created_at, phase}``. The caller is responsible for rewriting ``agent_id`` to match the new worker, and for numbering ``seq`` monotonically from 0. Must be called BEFORE ``start_background``. """ if self.status != WorkerStatus.PENDING: raise RuntimeError( f"seed_conversation must be called before start_background (worker {self.id} is {self.status})" ) # Write parts directly to the worker's on-disk conversation store # so that the AgentLoop's FileConversationStore picks them up when # NodeConversation loads from disk. We require an explicit # storage_path — falling back to CWD previously caused part files # to leak into the process working directory. if self._storage_path is None: raise RuntimeError( f"seed_conversation requires storage_path to be set on " f"Worker {self.id}; construct Worker with storage_path=..." ) parts_dir = self._storage_path / "conversations" / "parts" parts_dir.mkdir(parents=True, exist_ok=True) import json for i, msg in enumerate(messages): msg = dict(msg) # copy msg.setdefault("seq", i) msg.setdefault("agent_id", self.id) part_file = parts_dir / f"{msg['seq']:010d}.json" part_file.write_text(json.dumps(msg), encoding="utf-8") logger.info( "Worker %s: seeded %d messages into %s", self.id, len(messages), parts_dir, )